In [None]:
from __future__ import absolute_import , division , print_function
import itertools 
import os 
import numpy as np 
import pandas as pd 
import tensorflow as tf 
from sklearn.preprocessing import LabelBinarizer , LabelEncoder
from sklearn.metrics import confusion_matrix
from tensorflow import keras 
from keras.models import Sequential 
from keras.layers import Dense , Activation , Dropout 
from keras.preprocessing import text , sequence 
from keras import utils 

# Record the installed TensorFlow version so the run environment is visible in the output.
tf_version = tf.__version__
print("You have tensorflow version : ", tf_version)

You have tensorflow version :  2.12.0


In [None]:
# Load the raw consumer-complaints export (path is relative to the working directory).
complaints_csv = r"Consumer_Complaints.csv"
df = pd.read_csv(complaints_csv)
# Preview the first rows to check which columns were parsed.
df.head()

Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer Complaint,Company Public Response,Company,State,ZIP code,Tags,Consumer consent provided?,Submitted via,Date Sent to Company,Company Response to Consumer,Timely response?,Consumer disputed?,Complaint ID,Unnamed: 18
0,03-12-2014,Mortgage,Other mortgage,"Loan modification,collection,foreclosure",,,,M&T BANK CORPORATION,MI,48382,,,Referral,03/17/2014,Closed with explanation,Yes,No,759217.0,
1,10-01-2016,Credit reporting,,Incorrect information on credit report,Account status,I have outdated information on my credit repor...,Company has responded to the consumer and the ...,"TRANSUNION INTERMEDIATE HOLDINGS, INC.",AL,352XX,,Consent provided,Web,10-05-2016,Closed with explanation,Yes,No,2141773.0,
2,10/17/2016,Consumer Loan,Vehicle loan,Managing the loan or lease,,I purchased a new car on XXXX XXXX. The car de...,,"CITIZENS FINANCIAL GROUP, INC.",PA,177XX,Older American,Consent provided,Web,10/20/2016,Closed with explanation,Yes,No,2163100.0,
3,06-08-2014,Credit card,,Bankruptcy,,,,AMERICAN EXPRESS COMPANY,ID,83854,Older American,,Web,06-10-2014,Closed with explanation,Yes,Yes,885638.0,
4,09/13/2014,Debt collection,Credit card,Communication tactics,Frequent or repeated calls,,,"CITIBANK, N.A.",VA,23233,,,Web,09/13/2014,Closed with explanation,Yes,Yes,1027760.0,


In [None]:
# Keep only the free-text complaint and its product label, then drop every row
# whose complaint text is missing.
columns = ["Consumer Complaint", "Product"]
df = df.loc[:, columns].dropna(subset=["Consumer Complaint"])
df.head()

Unnamed: 0,Consumer Complaint,Product
1,I have outdated information on my credit repor...,Credit reporting
2,I purchased a new car on XXXX XXXX. The car de...,Consumer Loan
7,An account on my credit report has a mistaken ...,Credit reporting
12,This company refuses to provide me verificatio...,Debt collection
16,This complaint is in regards to Square Two Fin...,Debt collection


In [None]:
# Class balance check: number of complaints per product, most frequent first.
product_counts = df["Product"].value_counts()
product_counts

Debt collection                                                                 393
Mortgage                                                                        280
Credit reporting                                                                279
Credit card                                                                     197
Bank account or service                                                         136
Student loan                                                                     94
Consumer Loan                                                                    81
Payday loan                                                                      21
Money transfers                                                                  14
Prepaid card                                                                     11
Credit reporting, credit repair services, or other personal consumer reports      6
Other financial service                                                     

In [None]:
# Split the data into a training part (first 80% of the rows) and a test part
# (remaining 20%). The split is positional: rows keep their current order and
# are not shuffled.
train_size = int(len(df) * 0.8)
train_rows, test_rows = df.iloc[:train_size], df.iloc[train_size:]

train_complaints = train_rows["Consumer Complaint"]
train_product = train_rows["Product"]

test_complaints = test_rows["Consumer Complaint"]
test_product = test_rows["Product"]

In [None]:
# Turn each complaint into a fixed-width bag-of-words vector the model can read.
# num_words=1000 caps the vocabulary by word frequency; tokenisation is word-level.
tokenize = text.Tokenizer(num_words=1000, char_level=False)
# Learn the vocabulary from the training complaints only.
tokenize.fit_on_texts(train_complaints)
# One row per complaint, one column per word index; "binary" (the default mode)
# marks whether the word occurs in the complaint.
x_train, x_test = (
    tokenize.texts_to_matrix(batch, mode="binary")
    for batch in (train_complaints, test_complaints)
)

In [None]:
# Map each product name to an integer class id the network can consume.
# The encoder is fitted on the training labels only, so every label in the
# test set must also occur in the training set.
encoder = LabelEncoder()
y_train = encoder.fit_transform(train_product)
y_test = encoder.transform(test_product)

In [None]:
"""if we feed this encoded data directlty to the neural networks maybe it will be interpreted as ordinal or countinious values 
so we need to represented as on-ot value for the network can interpreted it as categorical values """
# One-hot encoding: each label becomes a vector of zeros with a single 1 at the
# position given by its encoded class id.
num_classes = y_train.max() + 1  # class ids start at 0, so largest id + 1 = number of classes
y_train = utils.to_categorical(y_train, num_classes=num_classes)
y_test = utils.to_categorical(y_test, num_classes=num_classes)
print("x_train", x_train)
print("y_train", y_train)

x_train [[0. 0. 0. ... 0. 0. 0.]
 [0. 1. 1. ... 0. 0. 0.]
 [0. 1. 1. ... 0. 0. 0.]
 ...
 [0. 1. 1. ... 0. 0. 0.]
 [0. 1. 1. ... 0. 0. 0.]
 [0. 1. 1. ... 0. 0. 0.]]
y_train [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]]


In [None]:
# Build a small feed-forward classifier over the bag-of-words vectors.

# Seed Python, NumPy and TensorFlow so weight initialisation, dropout and the
# training that follows are repeatable across "Restart & Run All".
utils.set_random_seed(42)

model = Sequential()

# First hidden layer. The input width is read from x_train so it always matches
# the width of the tokenizer's matrix instead of repeating a hard-coded 1000.
model.add(Dense(512, input_shape=(x_train.shape[1],)))
model.add(Activation('relu'))
model.add(Dropout(0.5))  # randomly zero half of the activations during training

# Second hidden layer. No input_shape here: only the first layer of a
# Sequential model declares the input; later layers infer theirs.
model.add(Dense(32))
model.add(Activation('relu'))

# Output layer: one unit per product class, softmax to get class probabilities.
model.add(Dense(num_classes))
model.add(Activation('softmax'))

# categorical_crossentropy matches the one-hot encoded targets.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Training settings: mini-batches of 32 samples, 5 full passes (epochs) over the training data.
batch_size = 32
epochs = 5
# validation_split=0.1 sets aside 10% of the training samples; they are not
# trained on and are used to report validation metrics after every epoch.
history = model.fit(
    x_train,
    y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.1,
    verbose=1,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [20]:
# Evaluate on the held-out test set. The model was compiled with a single
# 'accuracy' metric, so the result is [loss, accuracy].
score = model.evaluate(
    x_test,
    y_test,
    batch_size=batch_size,
    verbose=1,
)
print(score)

[0.8455938100814819, 0.7565789222717285]


In [31]:
# Show the first few test complaints next to their true and predicted products.
text_labels = encoder.classes_

# Preview at most 10 samples, but never index past the end of a smaller test set.
n_preview = min(10, len(x_test))

# One batched forward pass instead of a separate predict() call per sample;
# verbose=0 keeps Keras progress bars out of the printed comparison.
probabilities = model.predict(x_test[:n_preview], verbose=0)
predicted_ids = np.argmax(probabilities, axis=1)  # most probable class per row

for i in range(n_preview):
    predicted_label = text_labels[predicted_ids[i]]
    print(test_complaints.iloc[i][:50] + "...")
    print("actual product: ", test_product.iloc[i])
    print("predicted product: ", predicted_label)

I noticed on my latest credit report a collection ...
actual product:  Debt collection
predicted product:  Debt collection
I 'm filing a complaint because of the lack of hel...
actual product:  Mortgage
predicted product:  Mortgage
This is my second letter to the CFPB regarding the...
actual product:  Mortgage
predicted product:  Mortgage
I tried opening an Account with XXXX ... ..subject...
actual product:  Bank account or service
predicted product:  Debt collection
I have lost my Discover card and copy of my social...
actual product:  Credit card
predicted product:  Credit card
In early XX/XX/2016, I tried logging into my Chase...
actual product:  Credit card
predicted product:  Credit card
My Mortgage company affected my credit negatively....
actual product:  Mortgage
predicted product:  Mortgage
There are multiple reasons in the list above that ...
actual product:  Debt collection
predicted product:  Debt collection
Our home of 12 years is in foreclosure. We were pa...
actual produ