In [1]:
import sys
import re
import contractions
from collections import Counter
import pandas as pd
import numpy as np
import nltk
from bs4 import BeautifulSoup
import unicodedata
from nltk.corpus import stopwords
nltk_stopwords = set(list(stopwords.words('english'))) -  set(['nor', 'no', 'not', 'nothing', 'neither', 'never', 'none', "up", "down","latency", "slot", "standby", "left", "right", "ise"])
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()


In [3]:
df = pd.read_csv('data/case_study_data.csv')

In [4]:
df.head()

Unnamed: 0,complaint_id,product_group,text
0,2815595,bank_service,On XX/XX/2017 my check # XXXX was debited from...
1,2217937,bank_service,I opened a Bank of the the West account. The a...
2,2657456,bank_service,wells fargo in nj opened a business account wi...
3,1414106,bank_service,A hold was placed on my saving account ( XXXX ...
4,1999158,bank_service,Dear CFPB : I need to send a major concern/com...


In [239]:
def text_preprocess(text_list, verbose = True):
    
    clean_text_list = []
    nrows = len(text_list)
    i = 0
    for text in text_list:
        text = str(text)
        #text = ' '.join(text.split())
        text = text.lower()
        text = re.sub(r'wells fargo', 'wells_fargo', text)
        
        text = re.sub(r'x{1,9}\/x{1,9}\/x{1,9}', 'xxxx', text)
        #text = re.sub(r'xx/xx/xx', 'xx', text)
     
        #text = re.sub(r'xx/xx/xxxx', 'xx', text)
        text = re.sub(r'x{1,9}\/x{1,9}\/\d{1,4}', 'xxxx', text)
        text = re.sub(r' \{\$\d{1,9}.\d{1,9}\}', ' xxxx', text)
       
        # Remove extra whitespace
        text = re.sub(' +', ' ', text)
        text = re.sub('\n', ' ', text)
        
        #Remove URLs
        text = re.sub('(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*', '', text, flags=re.MULTILINE)

        #Remove emails
        text = re.sub('\S*@\S*\s?',' ',text)

        #Remove time formats Ex:  05:09:09
        text = re.sub(r'\s\d{1,2}:\d{1,2}:\d{1,2}', ' ', text) 

       
        #Remove date formats Ex:  05/09 ,   05/09/2014 05-09-2014 05-Jan-2014
        text = re.sub(r'\s\d{1,4}/\d{1,4}/\d{1,4}', ' ', text)
        text = re.sub(r'\s\d{1,4}-\d{1,4}-\d{1,4}', ' ', text)
        text = re.sub(r'\s\d{1,2}/\d{1,2}', ' ', text)
        #text = re.sub(r'\d{1,4}-\w{1,4}-\d{1,4}', ' ', text)
        #Remove time formats Ex:  05:09
        text = re.sub(r'\s\d{1,2}:\d{1,2}', ' ', text)  

        #Remove date formats Ex:  05-09
        text = re.sub(r'\s\d{1,2}-\d{1,2}', ' ', text)

        #Remove punctuation
        text = re.sub(r'[^\w.&\s]', '', text)
        #Remove multiple instance of the repeating characters -=/_ Ex: Replace observation==== with observation
        regex  = re.compile('(\=\=+)|(\/\/+)|(\_\_+)|(\-\-+)') 
        text = re.sub(regex, "", text)
        
        text = re.sub(r'\d{1,4}', 'xxxx', text)
        
        

        #Remove repeating words fan fan tay = fan tray
        text = re.sub(r'\b(\w+)( \1\b)+', r'\1', text)
        
        #Remove special characters except /_%-
        #text = re.sub("[-]","", text)
        text = re.sub("[^a-zA-Z0-9?/_%\n-]"," ", text)
        
        text = '' if text =='none' else text.strip()
        clean_text_list.append(text)
        if verbose == 1:
            if (i % 50000 == 0):  #Print status every 100 iterations
                print("Clean up in Progress: " + str(50000 * i/float(nrows))[:4] + "%.   " \
                              + str(i) + " notes of " + str(nrows) + " cleaned up")
        i += 1

        
    
    if verbose:
        print("Clean up in Progress: " + str(50000 * nrows/float(nrows))[:4] + "%.   " \
                              + str(nrows) + " notes of " + str(nrows) + " cleaned up")
   
    return clean_text_list

def feature_preprocess(text_list, verbose = True):
    text_list = text_preprocess(text_list,  verbose) 
    vocab_counter =  Counter(' '.join([cs for cs in text_list]).split())
    print('\nLength of all Vocab: '  +str(len(vocab_counter)))  
        
    return text_list

def nltk_pre_process(text_list, stem = True, verbose = True):
    
    def strip_html_tags(text):
        soup = BeautifulSoup(text, "html.parser")
        [s.extract() for s in soup(['iframe', 'script'])]
        stripped_text = soup.get_text()
        stripped_text = re.sub(r'[\r|\n|\r\n]+', '\n', stripped_text)
        return stripped_text

    def remove_accented_chars(text):
        text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        return text

    def expand_contractions(text):
        return contractions.fix(text)
    
    clean_documents = []  
    i = 1
    nrows = len(text_list)
    for document in text_list:
        # strip HTML
        #document = strip_html_tags(document)
        
        # remove extra newlines (often might be present in really noisy text)
        document = document.translate(document.maketrans("\n\t\r", "   "))
        # remove accented characters
        document = remove_accented_chars(document)
        # expand contractions    
        document = expand_contractions(document)
        
        document = ' '.join([word for word in document.split() if word not in (nltk_stopwords) and word!=''])
        if stem:
            #document = ' '.join([stemmer.stem(word) for word in document.split()])
            document = ' '.join([lemmatizer.lemmatize(word) for word in document.split()])

        # remove special characters and\or digits
        # insert spaces between special characters to isolate them    
        #special_char_pattern = re.compile(r'([{.(-)!}])')
        #document = special_char_pattern.sub(" \\1 ", document)

        # remove extra whitespace
        document = re.sub(' +', ' ', document)
        document = document.strip()
        clean_documents.append(document)
        if verbose and i%50000==0:
            print("\rFinal clean up in Progres:  " + str(i) + " documents of " + str(nrows) + " cleaned up")
        i += 1
    
    return clean_documents

In [235]:
clean_summary = feature_preprocess(df['text'].tolist(), verbose = 1)
df['clean_text'] = clean_summary
                                  

Clean up in Progress: 0.0%.   0 notes of 268359 cleaned up
Clean up in Progress: 9315%.   50000 notes of 268359 cleaned up
Clean up in Progress: 1863%.   100000 notes of 268359 cleaned up
Clean up in Progress: 2794%.   150000 notes of 268359 cleaned up
Clean up in Progress: 3726%.   200000 notes of 268359 cleaned up
Clean up in Progress: 4657%.   250000 notes of 268359 cleaned up
Clean up in Progress: 5000%.   268359 notes of 268359 cleaned up

Length of all Vocab: 117229


In [238]:
df.head()

Unnamed: 0,complaint_id,product_group,text,clean_text
0,2815595,bank_service,On XX/XX/2017 my check # XXXX was debited from...,on xxxx my check xxxx was debited from my che...
1,2217937,bank_service,I opened a Bank of the the West account. The a...,i opened a bank of the west account the accou...
2,2657456,bank_service,wells fargo in nj opened a business account wi...,wells_fargo in nj opened a business account wi...
3,1414106,bank_service,A hold was placed on my saving account ( XXXX ...,a hold was placed on my saving account xxxx ...
4,1999158,bank_service,Dear CFPB : I need to send a major concern/com...,dear cfpb i need to send a major concerncompl...


In [240]:
prep_text = nltk_pre_process(df.clean_text.tolist())
df['prep_text'] = prep_text

Final clean up in Progres:  50000 documents of 268359 cleaned up
Final clean up in Progres:  100000 documents of 268359 cleaned up
Final clean up in Progres:  150000 documents of 268359 cleaned up
Final clean up in Progres:  200000 documents of 268359 cleaned up
Final clean up in Progres:  250000 documents of 268359 cleaned up


In [241]:
df.head()

Unnamed: 0,complaint_id,product_group,text,clean_text,prep_text
0,2815595,bank_service,On XX/XX/2017 my check # XXXX was debited from...,on xxxx my check xxxx was debited from my che...,xxxx check xxxx debited checking account check...
1,2217937,bank_service,I opened a Bank of the the West account. The a...,i opened a bank of the west account the accou...,opened bank west account account came promotio...
2,2657456,bank_service,wells fargo in nj opened a business account wi...,wells_fargo in nj opened a business account wi...,wells_fargo nj opened business account without...
3,1414106,bank_service,A hold was placed on my saving account ( XXXX ...,a hold was placed on my saving account xxxx ...,hold placed saving account xxxx institution sa...
4,1999158,bank_service,Dear CFPB : I need to send a major concern/com...,dear cfpb i need to send a major concerncompl...,dear cfpb need send major concerncomplaint fal...


In [242]:
groups = df.groupby(['product_group']).groups
for group in groups:
    print(group, ': ', df.loc[df.product_group == group].iloc[0].prep_text)
    print('='*100)

bank_service :  xxxx check xxxx debited checking account check not cashed xxxx requested copy endorsement fund credited bank america not give satisfactory answer since contractor breach contract needed proof payment court
credit_card :  account discover card since xxxx paid agreed monthly ever since however decided not grant credit longer available credit limit xxxx refuse allow use account angry say least
credit_reporting :  disputed three inaccurate xxxx day late payment credit report directly experian submitted documentation certified mail xxxx xxxx experian including record payment due date xxxx month xxxx along printout bank statement verifying date fund withdrawn account payment vehicle loan xxxx noted inconsistency xxxx reporting bureau account noted inconsistent accounting actual payment amount listed account displayed credit report asked amount paid date paid updated report explained credit union loan got bought new credit union not applying payment correctly requested look ac

In [243]:
df.to_csv('data/preprocessed_case_study_data.csv', index = False)