[View in Colaboratory](https://colab.research.google.com/github/dhanaji/Machine-Learning-with-Python/blob/master/Text_20Classification_20Keras.ipynb)

In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

In [2]:
import itertools
import os

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.metrics import confusion_matrix

from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.preprocessing import text, sequence
from keras import utils

# This code was tested with TensorFlow v1.4
tf_version = tf.__version__
print("You have TensorFlow version", tf_version)

You have TensorFlow version 1.8.0


Using TensorFlow backend.


In [0]:
# Download the consumer-complaints CSV export into the working directory.
# `urlretrieve` lives in `urllib.request` on Python 3 but directly in `urllib`
# on Python 2, so import it from whichever location is available.
try:
    from urllib.request import urlretrieve  # Python 3
except ImportError:
    from urllib import urlretrieve  # Python 2

urlretrieve('https://data.consumerfinance.gov/api/views/s6ew-h6mp/rows.csv?accessType=DOWNLOAD',
            'Consumer_Complaints.csv')

In [3]:
# Read the downloaded CSV (decoded as latin-1) and preview the first rows.
csv_path = 'Consumer_Complaints.csv'
df = pd.read_csv(csv_path, encoding='latin-1')
df.head()

Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Company,State,ZIP code,Tags,Consumer consent provided?,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID
0,03/12/2014,Mortgage,Other mortgage,"Loan modification,collection,foreclosure",,,,M&T BANK CORPORATION,MI,48382,,,Referral,03/17/2014,Closed with explanation,Yes,No,759217
1,10/01/2016,Credit reporting,,Incorrect information on credit report,Account status,I have outdated information on my credit repor...,Company has responded to the consumer and the ...,"TRANSUNION INTERMEDIATE HOLDINGS, INC.",AL,352XX,,Consent provided,Web,10/05/2016,Closed with explanation,Yes,No,2141773
2,10/17/2016,Consumer Loan,Vehicle loan,Managing the loan or lease,,I purchased a new car on XXXX XXXX. The car de...,,"CITIZENS FINANCIAL GROUP, INC.",PA,177XX,Older American,Consent provided,Web,10/20/2016,Closed with explanation,Yes,No,2163100
3,06/08/2014,Credit card,,Bankruptcy,,,,AMERICAN EXPRESS COMPANY,ID,83854,Older American,,Web,06/10/2014,Closed with explanation,Yes,Yes,885638
4,09/13/2014,Debt collection,Credit card,Communication tactics,Frequent or repeated calls,,,"CITIBANK, N.A.",VA,23233,,,Web,09/13/2014,Closed with explanation,Yes,Yes,1027760


In [0]:
# Keep only the complaint text and its product label, then drop every row
# whose narrative is missing.
col = ['Consumer complaint narrative', 'Product']
df = df.loc[:, col]
has_narrative = df['Consumer complaint narrative'].notnull()
df = df[has_narrative]
df.head()

Unnamed: 0,Consumer complaint narrative,Product
1,I have outdated information on my credit repor...,Credit reporting
2,I purchased a new car on XXXX XXXX. The car de...,Consumer Loan
7,An account on my credit report has a mistaken ...,Credit reporting
12,This company refuses to provide me verificatio...,Debt collection
16,This complaint is in regards to Square Two Fin...,Debt collection


In [0]:
# Count the missing values remaining in each column.
df.isnull().sum(axis=0)

Consumer complaint narrative    0
Product                         0
dtype: int64

In [0]:
# Number of complaints per product category (the class distribution).
product_counts = df['Product'].value_counts()
product_counts

Debt collection                                                                 2575
Mortgage                                                                        2121
Credit reporting                                                                2049
Credit card                                                                     1237
Bank account or service                                                          950
Student loan                                                                     770
Consumer Loan                                                                    607
Payday loan                                                                      111
Money transfers                                                                  104
Prepaid card                                                                      90
Other financial service                                                           26
Virtual currency                                                 

In [0]:
# Split sizes: 80% of the rows go to training, the remainder to testing.
train_size = int(len(df) * .8)
test_size = len(df) - train_size
print("Train size: %d" % train_size)
print("Test size: %d" % test_size)

Train size: 8514
Test size: 2129


In [0]:
# Positional split: the first `train_size` rows form the training set and the
# remaining rows form the test set. NOTE: rows are taken in file order, with no shuffling.
narratives = df['Consumer complaint narrative']
products = df['Product']

train_narrative, test_narrative = narratives[:train_size], narratives[train_size:]
train_product, test_product = products[:train_size], products[train_size:]

In [0]:
# Word-level (not character-level) tokenizer with its vocabulary capped at `max_words`.
max_words = 1000
tokenize = text.Tokenizer(char_level=False, num_words=max_words)

In [0]:
# Learn the vocabulary from the training narratives only, then encode both
# splits as fixed-width matrices using that same vocabulary.
tokenize.fit_on_texts(train_narrative)
x_train, x_test = (
    tokenize.texts_to_matrix(narrative_split)
    for narrative_split in (train_narrative, test_narrative)
)

In [0]:
# Map each product name to an integer class index with sklearn's LabelEncoder.
# The mapping is learned from the training labels and reused for the test labels.
encoder = LabelEncoder()
y_train = encoder.fit_transform(train_product)
y_test = encoder.transform(test_product)

In [0]:
# One-hot encode the integer class indices. The number of classes is derived
# from the largest index present in the training labels.
num_classes = y_train.max() + 1
y_train, y_test = (
    utils.to_categorical(label_indices, num_classes)
    for label_indices in (y_train, y_test)
)

In [0]:
# Inspect the dimensions of our training and test data (this is helpful to debug)
for array_name, array in (('x_train', x_train), ('x_test', x_test),
                          ('y_train', y_train), ('y_test', y_test)):
    print(array_name + ' shape:', array.shape)

x_train shape: (8514, 1000)
x_test shape: (2129, 1000)
y_train shape: (8514, 13)
y_test shape: (2129, 13)


In [0]:
# Training hyperparameters: mini-batch size and number of epochs.
batch_size, epochs = 32, 5

In [0]:
# Build the model: one 512-unit ReLU hidden layer with 50% dropout, followed by
# a softmax output layer with one unit per class.
model = Sequential([
    Dense(512, input_shape=(max_words,)),
    Activation('relu'),
    Dropout(0.5),
    Dense(num_classes),
    Activation('softmax'),
])

# Labels are one-hot vectors, hence categorical cross-entropy.
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [0]:
# Train the model; 10% of the training samples are held out for validation.
fit_options = dict(batch_size=batch_size,
                   epochs=epochs,
                   verbose=1,
                   validation_split=0.1)
history = model.fit(x_train, y_train, **fit_options)

Train on 7662 samples, validate on 852 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [0]:
# Evaluate the trained model on the held-out test set.
# `score` holds the loss followed by the accuracy metric requested at compile time.
score = model.evaluate(x_test, y_test, verbose=1, batch_size=batch_size)
test_loss, test_accuracy = score
print('Test score:', test_loss)
print('Test accuracy:', test_accuracy)

Test score: 0.7458917609734846
Test accuracy: 0.789572569393339


In [0]:
# Here's how to generate predictions on individual examples
text_labels = encoder.classes_
num_examples = 10

# Score all preview rows with one batched call instead of calling
# model.predict once per row, then pick the highest-probability class per row.
# Slicing also keeps this safe when the test set has fewer than `num_examples` rows.
predictions = model.predict(x_test[:num_examples])
predicted_indices = np.argmax(predictions, axis=1)

for i, predicted_index in enumerate(predicted_indices):
    predicted_label = text_labels[predicted_index]
    print(test_narrative.iloc[i][:50], "...")
    print('Actual label:' + test_product.iloc[i])
    print("Predicted label: " + predicted_label + "\n")

I received an employment offer from XXXX XXXX cabl ...
Actual label:Credit reporting
Predicted label: Credit reporting

citi bank offered me for a citi XXXX XXXX XXXX XXX ...
Actual label:Credit card
Predicted label: Credit card

This company made inquiries to my credit report. I ...
Actual label:Consumer Loan
Predicted label: Credit reporting

I was put on an account with XXXX college. I was s ...
Actual label:Debt collection
Predicted label: Debt collection

I can not get my mortgage company to send me a bil ...
Actual label:Mortgage
Predicted label: Mortgage

I started receiving calls from Amsher XX/XX/2016 i ...
Actual label:Debt collection
Predicted label: Debt collection

Equifax is ignoring my numerous requests to invest ...
Actual label:Credit reporting
Predicted label: Credit reporting

The personal identities of my wife and I, which we ...
Actual label:Debt collection
Predicted label: Bank account or service

The company XXXX wanted to hire me to purchase pro ...
Actual label