In [None]:
!wget https://statso.io/wp-content/uploads/2022/11/complaints.zip

--2023-04-12 05:17:12--  https://statso.io/wp-content/uploads/2022/11/complaints.zip
Resolving statso.io (statso.io)... 192.0.78.24, 192.0.78.25
Connecting to statso.io (statso.io)|192.0.78.24|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 433524260 (413M) [application/zip]
Saving to: ‘complaints.zip’


2023-04-12 05:17:17 (98.6 MB/s) - ‘complaints.zip’ saved [433524260/433524260]



In [None]:
!unzip /content/complaints.zip

Archive:  /content/complaints.zip
   creating: complaints/
  inflating: complaints/consumercomplaints.csv  
  inflating: __MACOSX/complaints/._consumercomplaints.csv  


In [29]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
import nltk
import re
from nltk.corpus import stopwords
import string

data = pd.read_csv("/content/complaints/consumercomplaints.csv")

In [None]:
data.shape

(3101969, 7)

In [None]:
data.head(5)

Unnamed: 0.1,Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative
0,0,2022-11-11,Mortgage,Conventional home mortgage,Trouble during payment process,,
1,1,2022-11-23,"Credit reporting, credit repair services, or o...",Credit reporting,Improper use of your report,Reporting company used your report improperly,
2,2,2022-11-16,Mortgage,VA mortgage,Trouble during payment process,,
3,3,2022-11-15,Checking or savings account,Checking account,Managing an account,Fee problem,"Hi, I have been banking with Wells Fargo for o..."
4,4,2022-11-07,Mortgage,Other type of mortgage,Trouble during payment process,,


In [None]:
# Get memory usage of the DataFrame
memory_usage = data.memory_usage(deep=True).sum() / (1024 ** 2)
print(f"Memory usage of the DataFrame: {memory_usage:.2f} MB")

Memory usage of the DataFrame: 2478.12 MB


In [None]:
data.memory_usage(deep=True)

Index                                  128
Unnamed: 0                        24815752
Date received                    207831923
Product                          314547609
Sub-product                      223362665
Issue                            297696487
Sub-issue                        250998757
Consumer complaint narrative    1279244102
dtype: int64

In [None]:
data.memory_usage()

Index                                128
Unnamed: 0                      24815752
Date received                   24815752
Product                         24815752
Sub-product                     24815752
Issue                           24815752
Sub-issue                       24815752
Consumer complaint narrative    24815752
dtype: int64

In [None]:
24815752/3101969 
#Rows / Memory consumed by product column => Memory consumed by each product record is 8 byte.

8.0

In [None]:
nltk.download('stopwords')
stemmer = nltk.SnowballStemmer("english")
stopword=set(stopwords.words('english'))

def clean(text):
    text = str(text).lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = [word for word in text.split(' ') if word not in stopword]
    text=" ".join(text)
    text = [stemmer.stem(word) for word in text.split(' ')]
    text=" ".join(text)
    return text
    
data["Consumer complaint narrative"] = data["Consumer complaint narrative"].apply(clean)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [12]:
data = data[["Consumer complaint narrative", "Product"]]
x = np.array(data["Consumer complaint narrative"])
y = np.array(data["Product"])

In [13]:
# Get memory usage of the DataFrame
memory_usage = data.memory_usage(deep=True).sum() / (1024 ** 2)
print(f"Memory usage of the DataFrame: {memory_usage:.2f} MB")

Memory usage of the DataFrame: 1095.37 MB


In [19]:
import plotly.express as px
import plotly.graph_objects as go


In [20]:
# Types of News Categories
categories = data["Product"].value_counts()
label = categories.index
counts = categories.values
figure = px.bar(data, x=label, 
                y = counts, 
            title="Types of Products")
figure.show()

In [17]:
cv = CountVectorizer()
X = cv.fit_transform(x)
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.33, 
                                                    random_state=42)

In [22]:
sgdmodel = SGDClassifier()
sgdmodel.fit(X_train,y_train)

In [72]:
X_train.shape, y_test.shape

((2078319, 228665), (1023650,))

In [76]:
y_test_pred = sgdmodel.predict(X_test)

In [81]:
import pandas as pd
pd.Series(y_test).value_counts()


Credit reporting, credit repair services, or other personal consumer reports    472608
Debt collection                                                                 149375
Mortgage                                                                        120431
Credit card or prepaid card                                                      54646
Checking or savings account                                                      46773
Credit reporting                                                                 46408
Credit card                                                                      29346
Bank account or service                                                          28623
Student loan                                                                     23466
Money transfer, virtual currency, or money service                               15791
Vehicle loan or lease                                                            12321
Consumer Loan                              

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score


In [82]:
accuracy_score(y_test, y_test_pred)

0.572985883847018

In [83]:
cm=confusion_matrix(y_test, y_test_pred)

In [85]:
cm.sum(axis=1) #Actuals of each class

array([ 28623,  46773,  10487,  29346,  54646,  46408, 472608, 149375,
        15791,   1726, 120431,    333,   1841,   8212,   1255,  23466,
        12321,      8])

In [86]:
cm.sum(axis=0) #Prediction for each class

array([  2513,  16890,    741,   1498,  26216,   2418, 859828,  57251,
         7487,    670,  30477,    691,    167,   2365,    324,   9950,
         4154,     10])

In [92]:
cm.diagonal()

array([  1186,  12095,    321,    782,  17931,    551, 462267,  45373,
         5744,     48,  27671,      2,     49,   1310,    165,   8377,
         2665,      0])

In [93]:
cm.diagonal()/cm.sum(axis=1) # Recall of each class

array([0.04143521, 0.25858936, 0.03060933, 0.02664758, 0.32813015,
       0.01187295, 0.97811929, 0.3037523 , 0.3637515 , 0.02780997,
       0.22976642, 0.00600601, 0.02661597, 0.15952265, 0.1314741 ,
       0.35698457, 0.21629738, 0.        ])

In [94]:
cm.diagonal()/cm.sum(axis=0) # Precision of each class

array([0.47194588, 0.7161042 , 0.43319838, 0.52202937, 0.68397162,
       0.22787428, 0.53762729, 0.79252764, 0.76719647, 0.07164179,
       0.90793057, 0.00289436, 0.29341317, 0.55391121, 0.50925926,
       0.84190955, 0.64155031, 0.        ])

In [62]:
temp = data.sample(3)#["Consumer complaint narrative"]#
temp

Unnamed: 0.1,Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative
581700,581700,2019-08-22,"Credit reporting, credit repair services, or o...",Credit reporting,Incorrect information on your report,Account information incorrect,
1658486,1658486,2017-10-17,Credit card or prepaid card,General-purpose credit card or charge card,Closing your account,Company closed your account,I had a personal CC through MBNA since XXXX of...
273118,273118,2022-03-18,Student loan,Federal student loan servicing,Dealing with your lender or servicer,Received bad information about your loan,This past fall I received multiple emails from...


In [66]:
row=2

In [67]:
complaint = temp.iloc[row,]['Consumer complaint narrative']
complaint



In [69]:
data1 = cv.transform([complaint]).toarray()
output = sgdmodel.predict(data1)
output

array(['Student loan'], dtype='<U76')