# DATA

In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
df = pd.read_csv("Consumer_Complaints (1).csv")
df.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Company,State,ZIP code,Tags,Consumer consent provided?,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID
0,07/17/2019,"Credit reporting, credit repair services, or o...",Credit reporting,Incorrect information on your report,Account status incorrect,,,ALLY FINANCIAL INC.,TX,75035,,,Web,07/17/2019,In progress,Yes,,3309495
1,07/17/2019,Credit card or prepaid card,General-purpose prepaid card,Trouble using the card,Problem using the card to withdraw money from ...,,,SQUARE INC,KS,674XX,,,Web,07/17/2019,In progress,Yes,,3310031
2,07/17/2019,Debt collection,Other debt,Took or threatened to take negative or legal a...,Threatened or suggested your credit would be d...,,,"Diversified Consultants, Inc.",FL,,,,Web,07/17/2019,In progress,Yes,,3309687
3,07/17/2019,Mortgage,VA mortgage,Trouble during payment process,,,,"FLAGSTAR BANK, FSB",VA,22554,Servicemember,,Web,07/17/2019,In progress,Yes,,3308925
4,07/17/2019,Debt collection,Other debt,Attempts to collect debt not owed,Debt is not yours,,Company believes it acted appropriately as aut...,BYL Collection Services,TN,370XX,Servicemember,,Web,07/17/2019,Closed with explanation,Yes,,3308914


In [3]:
df1 = df[["Product", "Consumer complaint narrative"]]
df1["Product"] = df1["Product"].str.lower()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


# Preprocessing

Removing Null Values

In [4]:
df1.isnull().sum()

Product                              0
Consumer complaint narrative    938347
dtype: int64

In [5]:
df1 = df1.dropna()
df1.isnull().sum()

Product                         0
Consumer complaint narrative    0
dtype: int64

Cleaning the classes

In [6]:
df1["Product"].value_counts()

credit reporting, credit repair services, or other personal consumer reports    97830
debt collection                                                                 89379
mortgage                                                                        54109
credit reporting                                                                31588
credit card or prepaid card                                                     22815
student loan                                                                    22408
credit card                                                                     18838
bank account or service                                                         14885
checking or savings account                                                     13669
consumer loan                                                                    9473
vehicle loan or lease                                                            6083
money transfer, virtual currency, or money service    

In [7]:
df1 = df1.replace(to_replace =["credit card", "prepaid card"],  
                            value ="credit card or prepaid card")
df1 = df1.replace(to_replace ="credit reporting",
                            value ="credit reporting, credit repair services, or other personal consumer reports")
df1 = df1.replace(to_replace ="virtual currency",
                            value ="money transfer, virtual currency, or money service")
df1 = df1.replace(to_replace ="money transfers",
                            value ="money transfer, virtual currency, or money service")
df1 = df1.replace(to_replace =["student loan", "consumer loan", "vehicle loan or lease", "payday loan, title loan, or personal loan", "payday loan"],  
                            value ="loans")
df1 = df1.replace(to_replace ="checking or savings account",  
                            value ="bank account or service")

In [8]:
df1 = df1[df1.Product != 'money transfer, virtual currency, or money service']
df1 = df1[df1.Product != 'other financial service']

In [9]:
df1["Product"].value_counts()

credit reporting, credit repair services, or other personal consumer reports    129418
debt collection                                                                  89379
mortgage                                                                         54109
loans                                                                            44409
credit card or prepaid card                                                      43103
bank account or service                                                          28554
Name: Product, dtype: int64

In [10]:
# Encoding classes
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df1["Product"] = le.fit_transform(df1["Product"])
df1.head()

Unnamed: 0,Product,Consumer complaint narrative
43037,2,!!!!!! This is not a Duplicate!!!! I have cont...
44968,2,{$16000.00}. This is not my debt.
47834,3,When they call they dont answer the call and t...
48834,3,I provided account information which showed de...
49002,3,There are 2 charges on my credit report from a...


Selecting 100 Rows

In [11]:
dfs = df1.sample(n = 1000, random_state = 42, replace=False)
dfs.head()

Unnamed: 0,Product,Consumer complaint narrative
620922,4,I have Parents Plus loans that we 're taken ou...
541267,5,Our VA loan was sold to Loancare within 30 day...
768022,3,I had a XXXX credit card I closed the account ...
626746,0,PNC charged XXXX-XXXX dollar late fees on a eq...
831296,1,I am filing this complaint about American Expr...


In [13]:
dfs["Product"].value_counts()

2    341
3    228
5    134
1    111
4    103
0     83
Name: Product, dtype: int64

In [14]:
corpus = dfs["Consumer complaint narrative"].tolist()

Tokenizing, Lemmatizing and Removing Stopwords and Punctuations

In [15]:
from nltk.corpus import stopwords
from string import punctuation
stuff_to_be_removed = list(stopwords.words("english"))+list(punctuation)

In [16]:
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer 
lemmatizer = WordNetLemmatizer()
import nltk
nltk.download('words')
words = set(nltk.corpus.words.words())

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\Devyani\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


In [17]:
final_corpus= []
for i in range(len(corpus)):
    text = word_tokenize(corpus[i].lower())
    text = [t for t in text if len(t) > 2]
    text = [lemmatizer.lemmatize(y) for y in text if y not in stuff_to_be_removed]
    sent = " ".join(text)
    sent = " ".join(w for w in nltk.wordpunct_tokenize(sent) \
         if w.lower() in words or not w.isalpha())
    
    final_corpus.append(sent)

In [18]:
new_df = pd.DataFrame(final_corpus)
dfs = dfs.reset_index(drop=True)
new_df["Product"] = dfs["Product"]
new_df.columns = ["Consumer complaint narrative", "Product"]
new_df.head()

Unnamed: 0,Consumer complaint narrative,Product
0,parent plus loan ' re taken loan 8 . 25 repaym...,4
1,loan sold within day / / issue / / date statem...,5
2,credit card closed account lost job come find ...,3
3,- dollar late fee equity line credit bought na...,0
4,filing complaint express policy raising intere...,1


TF - IDF Value with N-Grams

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(ngram_range=(2,3))

In [20]:
vector= tfidf.fit_transform(new_df["Consumer complaint narrative"])

In [21]:
vector

<1000x114407 sparse matrix of type '<class 'numpy.float64'>'
	with 134679 stored elements in Compressed Sparse Row format>

In [22]:
vector.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

Data Train - Test Split

In [23]:
X = vector.toarray()
y = new_df["Product"]

In [24]:
from sklearn.model_selection import train_test_split as tts
X_train,X_test,y_train,y_test=tts(X,y,test_size=0.3,random_state=40)

# Multinomial Naive Bayes

In [25]:
from sklearn.naive_bayes import MultinomialNB

mnb = MultinomialNB()
mnb.fit(X_train,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [26]:
y_pred=mnb.predict(X_test)

In [27]:
mnb.score(X_test, y_test)

0.38

Precision and Recall

In [28]:
from sklearn.metrics import precision_recall_fscore_support as score
precision, recall, fscore, support = score(y_test, y_pred)

print('precision: {}'.format(precision))
print('recall: {}'.format(recall))

precision: [0.   0.   0.38 0.   0.   0.  ]
recall: [0. 0. 1. 0. 0. 0.]


  'precision', 'predicted', average, warn_for)
