In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import scattertext as st
from scattertext import CorpusFromPandas, produce_scattertext_explorer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
from gensim import matutils, models

import re
import string
import spacy
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag, pos_tag_sents
from nltk.chunk import ne_chunk
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.decomposition import NMF
from sklearn.preprocessing import normalize

%config InlineBackend.figure_formats = ['svg']
%matplotlib inline

sns.set(context = 'paper', style='white', font_scale = 0.8)

In [3]:
df = pd.read_csv('complaints-2021-12-05.csv')

In [4]:
df.head()

Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Company,State,ZIP code,Tags,Consumer consent provided?,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID
0,03/25/19,Checking or savings account,Checking account,Managing an account,Funds not handled or disbursed as instructed,,,JPMORGAN CHASE & CO.,IL,60610,,,Fax,03/27/19,Closed with explanation,Yes,,3190068
1,07/12/19,Debt collection,Other debt,Attempts to collect debt not owed,Debt was paid,,Company disputes the facts presented in the co...,"NRA Group, LLC",IL,60628,,Consent not provided,Web,07/12/19,Closed with explanation,Yes,,3304855
2,11/12/21,Debt collection,Medical debt,Attempts to collect debt not owed,Debt is not yours,,,"Kinum, Inc., Indianapolis, IN Branch",IL,60620,Older American,,Phone,11/12/21,Closed with explanation,Yes,,4903716
3,08/30/19,Debt collection,Credit card debt,Attempts to collect debt not owed,Debt is not yours,"In XXXX, I started receiving mail and phone ca...",,Resurgent Capital Services L.P.,IL,62205,,Consent provided,Web,08/30/19,Closed with explanation,Yes,,3359230
4,06/21/19,Credit card or prepaid card,General-purpose credit card or charge card,"Other features, terms, or problems",Other problem,In XX/XX/2019 I received a phone call from XXX...,Company has responded to the consumer and the ...,SYNCHRONY FINANCIAL,IL,60643,,Consent provided,Web,06/21/19,Closed with explanation,Yes,,3282179


### EDA

In [5]:
df = df.rename(columns=str.lower)

In [6]:
df.columns = df.columns.str.replace(' ','_')

In [7]:
df.columns

Index(['date_received', 'product', 'sub-product', 'issue', 'sub-issue',
       'consumer_complaint_narrative', 'company_public_response', 'company',
       'state', 'zip_code', 'tags', 'consumer_consent_provided?',
       'submitted_via', 'date_sent_to_company', 'company_response_to_consumer',
       'timely_response?', 'consumer_disputed?', 'complaint_id'],
      dtype='object')

In [8]:
df.consumer_complaint_narrative.isna().sum()

8276

In [9]:
df.shape

(16122, 18)

In [10]:
df.dropna(subset=['consumer_complaint_narrative'], inplace=True)

In [11]:
df.shape

(7846, 18)

In [12]:
df.consumer_complaint_narrative.isna().sum()

0

In [13]:
# Keep only the first row for each complaint_id value (drop_duplicates defaults to keep='first').
df.drop_duplicates('complaint_id', inplace=True)

In [14]:
df.reset_index(drop=True, inplace=True)

In [15]:
# Count rows that repeat an earlier row across all of the listed columns.
dedup_columns = [
    'date_received', 'product', 'sub-product', 'issue', 'sub-issue',
    'consumer_complaint_narrative', 'company_public_response', 'company',
    'state', 'zip_code', 'tags', 'consumer_consent_provided?',
    'submitted_via', 'date_sent_to_company', 'company_response_to_consumer',
    'timely_response?', 'consumer_disputed?', 'complaint_id',
]
is_repeat = df.duplicated(subset=dedup_columns, keep='first')
is_repeat.value_counts()

False    7846
dtype: int64

In [16]:
df.sample(5)

Unnamed: 0,date_received,product,sub-product,issue,sub-issue,consumer_complaint_narrative,company_public_response,company,state,zip_code,tags,consumer_consent_provided?,submitted_via,date_sent_to_company,company_response_to_consumer,timely_response?,consumer_disputed?,complaint_id
1545,09/06/21,Debt collection,Mortgage debt,Took or threatened to take negative or legal a...,Threatened or suggested your credit would be d...,Fay services account # XXXX began foreclosure ...,,"Fay Servicing, LLC",IL,60449,,Consent provided,Web,09/22/21,Closed with explanation,Yes,,4695921
1561,02/18/19,Student loan,Private student loan,Dealing with your lender or servicer,Problem with customer service,I have been trying to get in touch Conduent ( ...,Company has responded to the consumer and the ...,ACS Education Services,IL,60659,,Consent provided,Web,02/26/19,Closed with explanation,Yes,,3155138
2310,03/28/19,Vehicle loan or lease,Loan,Problems at the end of the loan or lease,Problem related to refinancing,I over pay my XXXX off pay for XXXX .toXXXX X...,,"Westlake Services, LLC",IL,60085,Servicemember,Consent provided,Web,03/28/19,Closed with explanation,Yes,,3194094
663,02/18/20,Credit card or prepaid card,General-purpose credit card or charge card,"Advertising and marketing, including promotion...",Didn't receive advertised or promotional terms,I received a ''Premium Rewards '' from BMO on ...,Company has responded to the consumer and the ...,BMO HARRIS BANK NATIONAL ASSOCIATION,IL,61820,,Consent provided,Web,02/18/20,Closed with non-monetary relief,Yes,,3534853
1779,12/28/18,Checking or savings account,Checking account,Managing an account,Deposits and withdrawals,i had an account with US Bank which i opened i...,Company has responded to the consumer and the ...,U.S. BANCORP,IL,60453,,Consent provided,Web,12/28/18,Closed with explanation,Yes,,3111418


In [17]:
# Remove these five columns from df in place.
columns_to_drop = [
    'state',
    'consumer_consent_provided?',
    'submitted_via',
    'date_sent_to_company',
    'complaint_id',
]
df.drop(columns=columns_to_drop, inplace=True)

In [18]:
df.reset_index(drop=True, inplace=True)

In [19]:
df.loc[5, 'consumer_complaint_narrative']

'I opened a Checking account in Chase Bank on XX/XX/2019. I opened it online and there was an offer that if I make direct deposit of {$500.00} in first 60 days of account opening, then I will get bonus of {$200.00} within 10 days. While opening the account online, it says " COUPON APPLIED AUTOMATICALLY \'\' so i don\'t need to apply any coupon or visit the branch for the same. My first direct deposit was made on XX/XX/2019 ( {$2000.00} approx ). After that second direct deposit was made on XX/XX/2019 ( Almost same amount ). But I didn\'t get my bonus. I called customer care on XX/XX/2019 to enquire my bonus, they said that I did not applied coupon so I will not get the bonus. I explained them that the website is saying that coupon is automatically applied. Then they asked me to visit branch. I visited the branch on XX/XX/2019 and meet the relationship officer " XXXX XXXX \'\'. She said that as I opened account online so branch has nothing to do in it and sent an email to branch manager

### Data Acquisition

In [20]:
corpus_check = df.consumer_complaint_narrative

In [21]:
# Total word count across all narratives. str.split() with no argument splits on
# any run of whitespace, so repeated spaces are not counted as empty "words" and
# newlines/tabs also separate words (split(' ') did neither).
sum(len(d.split()) for d in corpus_check) > 100000 # check for data acquisition requirement

True

In [22]:
df.to_csv('complaints.csv', index=False)

### Text Preprocessing

In [23]:
# Text-cleaning helpers: remove numbers, remove punctuation + lower-case, trim trailing whitespace.
def non_num(x):
    """Return ``x`` with every run of digits replaced by a single space."""
    return re.sub(r'\d+', ' ', x)


def punc_lower(x):
    """Return ``x`` lower-cased with every ``string.punctuation`` character deleted."""
    punctuation_class = '[%s]' % re.escape(string.punctuation)
    lowered = x.lower()
    return re.sub(punctuation_class, '', lowered)


def remove_ws(x):
    """Return ``x`` with trailing whitespace stripped (leading/internal whitespace is kept)."""
    return x.rstrip()

# Run the three cleaning steps on each narrative, in the same order as before:
# digits -> punctuation/lower-case -> trailing whitespace.
df['complaints'] = df['consumer_complaint_narrative'].map(
    lambda narrative: remove_ws(punc_lower(non_num(narrative)))
)

In [23]:
df['complaints'][29]

'i transferred two fees on my loan a year ago i paid extra each month to target that balance by xxxx extra each month i noticed a balance transfer fee on my invoice and contacted the company to find out why i was being assessed am interest fee and a balance transfer fee i was told that the promotion had needed and i still owed almost the entire transfer that i did when i opened my credit card i explained to xxxx that cant be possible when i have never paid the monthly payment i always paid more like xxxx more each month she told me the extra money each month goes to the interest on the current balance i told her how is that fair to the customer and how the heck do i pay the balance transfer off if i as the consumer can not pay down the balance nor can i see the breakdown when i make a payment she was rude did not want to understand how their practices are misleading to the consumer and not fair i asked to speak with her supervisor which she transferred the call to xxxx who was more ple

In [24]:
df['consumer_complaint_narrative'][29]

'I transferred two fees on my loan a year ago. I paid extra each month to target that balance by XXXX extra each month. I noticed a balance transfer fee on my invoice and contacted the company to find out why I was being assessed am Interest fee and a balance transfer fee. I was told that the promotion had needed and I still owed almost the entire transfer that I did when I opened my credit card!! I explained to XXXX that cant be possible when I have never paid the monthly payment I always paid more. Like XXXX more each month. She told me the extra money each month goes to The interest on the current balance. I told her how is that fair to the customer and how the heck do I pay the balance transfer off if I as the consumer can not pay down the balance nor can I see the breakdown when I make a payment!!! She was rude did not want to understand how their practices are misleading to the consumer and not fair!! I asked to speak with her Supervisor, which she transferred the call to XXXX wh

### Tokenization

In [25]:
def tokenize_text(row):
    """Tokenize the ``'complaints'`` entry of ``row`` with NLTK's ``word_tokenize``.

    Parameters
    ----------
    row
        Any object that supports ``row['complaints']`` and yields text there
        (presumably a DataFrame row - not called by the visible cells).

    Returns
    -------
    The value returned by ``word_tokenize`` for that text.
    """
    return word_tokenize(row['complaints'])

In [26]:
# Replace each cleaned string in 'complaints' with its word_tokenize() output.
# NOTE: the column is overwritten in place, so re-running this cell on its own
# would feed the previous output (not text) back into word_tokenize; re-run the
# preprocessing cell first.
df['complaints'] = df['complaints'].apply(word_tokenize)

In [27]:
df.loc[29,'complaints']

['i',
 'transferred',
 'two',
 'fees',
 'on',
 'my',
 'loan',
 'a',
 'year',
 'ago',
 'i',
 'paid',
 'extra',
 'each',
 'month',
 'to',
 'target',
 'that',
 'balance',
 'by',
 'xxxx',
 'extra',
 'each',
 'month',
 'i',
 'noticed',
 'a',
 'balance',
 'transfer',
 'fee',
 'on',
 'my',
 'invoice',
 'and',
 'contacted',
 'the',
 'company',
 'to',
 'find',
 'out',
 'why',
 'i',
 'was',
 'being',
 'assessed',
 'am',
 'interest',
 'fee',
 'and',
 'a',
 'balance',
 'transfer',
 'fee',
 'i',
 'was',
 'told',
 'that',
 'the',
 'promotion',
 'had',
 'needed',
 'and',
 'i',
 'still',
 'owed',
 'almost',
 'the',
 'entire',
 'transfer',
 'that',
 'i',
 'did',
 'when',
 'i',
 'opened',
 'my',
 'credit',
 'card',
 'i',
 'explained',
 'to',
 'xxxx',
 'that',
 'cant',
 'be',
 'possible',
 'when',
 'i',
 'have',
 'never',
 'paid',
 'the',
 'monthly',
 'payment',
 'i',
 'always',
 'paid',
 'more',
 'like',
 'xxxx',
 'more',
 'each',
 'month',
 'she',
 'told',
 'me',
 'the',
 'extra',
 'money',
 'each',
 '

### Remove non-English Words

In [28]:
# Vocabulary of English words from the NLTK "words" corpus.
# A dedicated name is used instead of `words` because the stop-word cells below
# bind `words` to a stop-word list; sharing one name meant that re-running cells
# out of order could filter against the wrong collection.
english_vocab = set(nltk.corpus.words.words())

# Keep only the tokens that appear in the English vocabulary.
df['complaints'] = df['complaints'].apply(lambda x: [item for item in x if item in english_vocab])

### Remove Stop Words

In [107]:
# Stop words to remove from the tokenized complaints: common English function
# words and contractions, generic verbs (tell/ask/call/...), redaction
# placeholders ('x' ... 'xxxxxxxx'), number words, some company-name tokens,
# and time units.
# NOTE(review): 'other' and 'day' each appear twice, and 'jp morgan chase' is a
# multi-word entry, so it presumably can never equal a single token - confirm
# whether these entries are intended.
words = ['i', 'im', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", 
         "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 
         'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their',
         'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 
         'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 
         'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'other', 'another', 'and', 'but','if', 'or', 
         'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between',
         'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 
         'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 
         'where', 'why','how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 
         'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will',
         'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 
         'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn',
         "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn',
         "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', 
         "won't", 'wouldn', "wouldn't", 'times', 'keep', 'keeping', 'kept', 'tell', 'told', 'telling', 'ask',
         'asked', 'feel', 'go', 'went', 'going', 'write', 'wrote', 'writing', 'say', 'saying', 'said', 'call',
        'called', 'calling', 'x', 'xx', 'xxx', 'xxxx', 'xxxxx', 'xxxxxx', 'xxxxxxx', 'xxxxxxxx', 'dont', 
         'never', 'need', 'really', 'try', 'tried', 'trying', 'ill', 'would', 'upon', 'via', 'however', 
         'within', 'either', 'neither', 'chase', 'jp morgan chase', 'want', 'like', 'one', 'two', 'three', 
         'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten', 'nothing', 'everything', 'anyway', 'suppose', 
        'supposed', 'day', 'discover', 'citi', 'wells', 'fargo', 'bmo', 'hsbc', 
         'equifax', 'pnc', 'td', 'ally', 'able', 'didnt', 'day', 'month', 'year']

In [91]:
# Distinct values of the 'company' column, then the same names lower-cased.
# NOTE(review): these are full company names as they appear in the data (often
# several words, with punctuation); the complaint tokens are single words, so
# exact-match filtering presumably never hits these entries - confirm intent.
company_list = df['company'].unique().tolist()
company_list2 = [company_name.lower() for company_name in company_list]

In [92]:
# Extend the stop-word list with the lower-cased company names.
# dict.fromkeys() removes repeated entries while preserving first-seen order, so
# re-running this cell no longer grows the list with another copy of the company
# names each time. Membership in `words` is unchanged.
words = list(dict.fromkeys(words + company_list2))
words

['i',
 'im',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'other',
 'another',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 '

In [93]:
# Drop stop words from every token list. The stop-word list is converted to a
# set once, so each membership test is a hash lookup instead of a linear scan of
# the whole list for every token; the filtered result is identical.
stop_word_set = set(words)
df['complaints'] = df['complaints'].apply(lambda x: [item for item in x if item not in stop_word_set])

In [94]:
df['complaints'][507]

['applied',
 'loan',
 'modification',
 'due',
 'illness',
 'problem',
 'freedom',
 'update',
 'lender',
 'find',
 'phone',
 'make',
 'approval',
 'freedom',
 'mortgage',
 'law',
 'send',
 'pending',
 'modification',
 'update',
 'first',
 'class',
 'mail',
 'every',
 'agent',
 'speak',
 'freedom',
 'mortgage',
 'law',
 'last',
 'notice',
 'receive',
 'mail',
 'state',
 'title',
 'search',
 'intentionally',
 'fail',
 'struggle',
 'trial',
 'period',
 'date',
 'payment',
 'new',
 'loan',
 'make',
 'complaint',
 'freedom',
 'bad',
 'start',
 'send',
 'loss',
 'mitigation',
 'start',
 'due',
 'today',
 'freedom',
 'law',
 'loan',
 'number']

In [95]:
df['complaints'][2549]

['complaint',
 'bank',
 'response',
 'address',
 'actual',
 'complaint',
 'previous',
 'issue',
 'ago',
 'bank',
 'new',
 'platform',
 'issue',
 'currently',
 'product',
 'defect',
 'banking',
 'platform',
 'deposit',
 'business',
 'account',
 'detail',
 'batch',
 'see',
 'attach',
 'example',
 'several',
 'miss',
 'report',
 'back',
 'spend',
 'long',
 'could',
 'figure',
 'tool',
 'issue',
 'extremely',
 'significant',
 'lead',
 'business',
 'inaccurate',
 'accounting',
 'billing',
 'incorrectly',
 'speak',
 'woman',
 'confirm',
 'product',
 'defect',
 'work',
 'digital',
 'team',
 'get',
 'problem',
 'fix',
 'phone',
 'leave',
 'executive',
 'office',
 'several',
 'response',
 'bank',
 'misrepresent',
 'accurate',
 'business',
 'bank',
 'account',
 'end']

### Stemming

In [96]:
#def stem_text(text):
#    doc_stemmed = []
#    stemmer = LancasterStemmer()
    
#    for word in text:
#        doc_stemmed.append(stemmer.stem(word))
        
#    return doc_stemmed

In [97]:
# df['complaints'] = df['complaints'].apply(stem_text)

Since stemming can make words hard to interpret, this step is skipped for now.

### Parts of Speech

In [98]:
# Tag every token of every complaint with a part-of-speech label; the result is
# one list of (token, tag) pairs per row.
# NOTE(review): at this point the token lists have already had stop words
# removed, so the tagger sees no function words for context (the sample output
# below tags 'receive' as JJ). Consider tagging before filtering - TODO confirm.
df['complaints_tagged'] = pos_tag_sents(df['complaints'].tolist())

In [99]:
# Inspect the (token, tag) pairs for one complaint (row label 1).
df['complaints_tagged'][1]

[('receive', 'JJ'),
 ('phone', 'NN'),
 ('handle', 'NN'),
 ('case', 'NN'),
 ('hung', 'NN'),
 ('didnt', 'NN'),
 ('anything', 'NN'),
 ('depth', 'JJ'),
 ('anything', 'NN'),
 ('receive', 'JJ'),
 ('letter', 'NN'),
 ('today', 'NN'),
 ('speak', 'VB'),
 ('card', 'NN'),
 ('true', 'JJ'),
 ('lie', 'JJ'),
 ('letter', 'NN'),
 ('send', 'VBP'),
 ('letter', 'NN'),
 ('basically', 'RB'),
 ('lie', 'JJ'),
 ('inform', 'NN'),
 ('take', 'VB'),
 ('case', 'NN'),
 ('hung', 'NN'),
 ('else', 'RB')]

In [100]:
nltk.help.upenn_tagset()

$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
'': closing quotation mark
    ' ''
(: opening parenthesis
    ( [ {
): closing parenthesis
    ) ] }
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ! ?
:: colon or ellipsis
    : ; ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
EX: existential there
    there
FW: foreign word
    gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous
    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte
    terram fiche oui corporis ...
IN: preposition or

### Lemmatization

In [101]:
# NOTE(review): this module-level lemmatizer is not referenced by any visible
# cell; lemmatize_all() below constructs its own WordNetLemmatizer.
wordNetLemmatizer = WordNetLemmatizer()

In [102]:
# Convert POS tags to WordNet-friendly part-of-speech constants, then lemmatize.
def lemmatize_all(tagged_text):
    """Lemmatize a POS-tagged token list with NLTK's WordNet lemmatizer.

    Parameters
    ----------
    tagged_text : iterable of (word, tag) pairs
        Tokens paired with their POS tag strings.

    Returns
    -------
    list
        One entry per input pair, in the same order. Tokens whose tag starts
        with ``NN``, ``VB`` or ``JJ`` are lemmatized as noun, verb or adjective
        respectively; all other tokens are passed through unchanged.
    """
    lemmatizer = WordNetLemmatizer()

    def _wordnet_pos(tag):
        # Return the WordNet POS constant for a tag, or None if it is not handled.
        if tag.startswith("NN"):
            return wordnet.NOUN
        if tag.startswith('VB'):
            return wordnet.VERB
        if tag.startswith('JJ'):
            return wordnet.ADJ
        return None

    lemmas = []
    for token, tag in tagged_text:
        pos = _wordnet_pos(tag)
        lemmas.append(token if pos is None else lemmatizer.lemmatize(token, pos))
    return lemmas

In [103]:
df['complaints'] = df['complaints_tagged'].apply(lemmatize_all)

In [104]:
df.sample(5)

Unnamed: 0,date_received,product,sub-product,issue,sub-issue,consumer_complaint_narrative,company_public_response,company,zip_code,tags,company_response_to_consumer,timely_response?,consumer_disputed?,complaints,complaints_tagged
1496,06/17/20,Credit card or prepaid card,General-purpose credit card or charge card,"Other features, terms, or problems",Other problem,I am a long-term citi card holder ( gold card ...,Company has responded to the consumer and the ...,"CITIBANK, N.A.",60657,Older American,Closed with non-monetary relief,Yes,,"[card, holder, gold, card, end, end, rope, pho...","[(card, NN), (holder, NN), (gold, NN), (card, ..."
5680,04/11/19,Mortgage,FHA mortgage,Trouble during payment process,,Since XX/XX/XXXX I have tried to dispute this ...,Company has responded to the consumer and the ...,Freedom Mortgage Company,60060,,Closed with explanation,Yes,,"[since, dispute, credit, bureau, course, accur...","[(since, IN), (dispute, NN), (credit, NN), (bu..."
5048,08/19/20,Debt collection,Medical debt,False statements or representation,Attempted to collect wrong amount,I received a Delinquency Notification dated XX...,,"Harris & Harris, Ltd.",60030,,Closed with explanation,Yes,,"[receive, delinquency, notification, detail, b...","[(receive, JJ), (delinquency, NN), (notificati..."
2506,09/18/19,Mortgage,Conventional home mortgage,Struggling to pay mortgage,,To whom It may concern I am late 3 payment so ...,,NATIONSTAR MORTGAGE,60804,,Closed with explanation,Yes,,"[may, concern, late, payment, far, unable, use...","[(may, MD), (concern, NN), (late, JJ), (paymen..."
463,06/08/21,Checking or savings account,Checking account,Managing an account,Deposits and withdrawals,I opened a personal checking account with Axos...,,"AXOS FINANCIAL, INC.",60031,,Closed with explanation,Yes,,"[personal, account, bank, time, registered, so...","[(personal, JJ), (account, NN), (bank, NN), (t..."


In [105]:
df['complaints'][29]

['transfer',
 'loan',
 'year',
 'ago',
 'extra',
 'month',
 'target',
 'balance',
 'extra',
 'month',
 'balance',
 'transfer',
 'fee',
 'invoice',
 'company',
 'find',
 'assessed',
 'interest',
 'fee',
 'balance',
 'transfer',
 'fee',
 'promotion',
 'still',
 'almost',
 'entire',
 'transfer',
 'credit',
 'card',
 'cant',
 'possible',
 'monthly',
 'payment',
 'always',
 'month',
 'extra',
 'money',
 'month',
 'interest',
 'current',
 'balance',
 'fair',
 'customer',
 'heck',
 'pay',
 'balance',
 'transfer',
 'consumer',
 'pay',
 'balance',
 'see',
 'breakdown',
 'make',
 'payment',
 'rude',
 'understand',
 'mislead',
 'consumer',
 'fair',
 'speak',
 'supervisor',
 'transfer',
 'pleasant',
 'agree',
 'waive',
 'ass',
 'investigate',
 'process',
 'ensure',
 'mislead',
 'take',
 'advantage']

In [106]:
# Persist the processed frame without the index.
# NOTE: 'complaints' and 'complaints_tagged' hold Python lists, which CSV stores
# as their string representation; they need parsing (e.g. ast.literal_eval) to
# become lists again when the file is read back.
df.to_csv('complaints_nlp.csv', index=False)