In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import random
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from sklearn import preprocessing
from keras.preprocessing.text import Tokenizer
from keras import models
from keras import layers
from keras import optimizers

Using Theano backend.


### Load the Data
The objective is to build a deep learning classification model with the Keras library: a neural network that assigns each consumer complaint to a product category. After training, I will evaluate the network's performance.

In [15]:
# Load the consumer complaints CSV into a DataFrame.
# `on_bad_lines='warn'` is the current spelling of `error_bad_lines=False`
# (deprecated in pandas 1.3, removed in 2.0): malformed rows are dropped with a
# warning instead of aborting the load. The Python parsing engine is kept.
data = pd.read_csv('data/complaints.csv', on_bad_lines='warn', engine="python")

In [16]:
# Preview the first rows (up to 50) of the loaded complaints
data.head(50)

Unnamed: 0,Product,Consumer complaint narrative
0,Student loan,In XX/XX/XXXX I filled out the Fedlaon applica...
1,Student loan,I am being contacted by a debt collector for p...
2,Student loan,I cosigned XXXX student loans at SallieMae for...
3,Student loan,Navient has sytematically and illegally failed...
4,Student loan,My wife became eligible for XXXX Loan Forgiven...
5,Student loan,"Hello, I am a XXXX resident who has multiple X..."
6,Student loan,My account was sold to them and expect me to p...
7,Student loan,"On XX/XX/XXXX, I was sued in XXXX County, XXXX..."
8,Student loan,"To Whom It May Concern, I applied for a privat..."
9,Student loan,My loans through XXXX were deferred as I am a ...


In [17]:
# (rows, columns) of the loaded DataFrame
data.shape

(23795, 2)

In [18]:
# Column dtypes, non-null counts, and memory usage
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23795 entries, 0 to 23794
Data columns (total 2 columns):
 #   Column                        Non-Null Count  Dtype 
---  ------                        --------------  ----- 
 0   Product                       23795 non-null  object
 1   Consumer complaint narrative  23795 non-null  object
dtypes: object(2)
memory usage: 371.9+ KB


# Count missing values in each column
data.isna().sum()

In [20]:
# Number of complaints per product category (shows the class balance)
data['Product'].value_counts()

Student loan               11404
Credit card                 9540
Bank account or service     2851
Name: Product, dtype: int64

### Preprocessing
Before we build our neural network, we need to do several preprocessing steps. First, we will create word vector counts (a bag of words type representation) of our complaints text. Next, we will change 
the category labels to integers. Finally, we will perform our usual train-test split before building and training our neural network using Keras. 

### One-hot encoding of the complaints
Our first step again is to transform our textual data into a numerical representation. As we saw in some of our previous lessons on NLP, there are many ways to do this. Here, we'll use the Tokenizer() class from the preprocessing.text sub-module of the Keras package.

As with our previous work using NLTK, this will transform our text complaints into numerical vectors. Note that the tokenizer can produce two different encodings: `texts_to_sequences()` returns lists of word indices in which word order is preserved, whereas `texts_to_matrix(..., mode='binary')` returns a one-hot (bag of words) matrix that records only whether each word occurs. In the code below, we'll only keep the 2,000 most common words and use the one-hot encoding.

In [23]:
# Source text: one free-form narrative per complaint
complaints = data['Consumer complaint narrative']

# Build the vocabulary over every complaint; only the 2,000 most common
# words are used when the text is encoded
tokenizer = Tokenizer(num_words=2000)
tokenizer.fit_on_texts(complaints)

# Word -> integer index mapping learned during fitting; it covers every
# unique token, not just the top 2,000, and is kept for decoding later
word_index = tokenizer.word_index

# Encode each complaint as a list of word indices
sequences = tokenizer.texts_to_sequences(complaints)

# Encode each complaint as a binary word-presence row (returned as a numpy array)
one_hot_results = tokenizer.texts_to_matrix(complaints, mode='binary')

# Report what was produced
print(f'sequences type: {type(sequences)}')
print(f'one_hot_results type: {type(one_hot_results)}')
print(f'Found {len(word_index)} unique tokens.')
print(f'Dimensions of our coded results: {one_hot_results.shape}')

sequences type: <class 'list'>
one_hot_results type: <class 'numpy.ndarray'>
Found 28752 unique tokens.
Dimensions of our coded results: (23795, 2000)


In [25]:
# Invert the tokenizer vocabulary so integer indices map back to their words
reverse_index = {index: word for word, index in word_index.items()}
reverse_index

{1: 'i',
 2: 'the',
 3: 'xxxx',
 4: 'to',
 5: 'and',
 6: 'my',
 7: 'a',
 8: 'that',
 9: 'of',
 10: 'was',
 11: 'in',
 12: 'they',
 13: 'for',
 14: 'on',
 15: 'have',
 16: 'not',
 17: 'me',
 18: 'this',
 19: 'is',
 20: 'with',
 21: 'it',
 22: 'xx',
 23: 'account',
 24: 'credit',
 25: '00',
 26: 'had',
 27: 'loan',
 28: 'from',
 29: 'payment',
 30: 'be',
 31: 'as',
 32: 'would',
 33: 'card',
 34: 'them',
 35: 'been',
 36: 'at',
 37: 'but',
 38: 'loans',
 39: 'payments',
 40: 'an',
 41: 'told',
 42: 'no',
 43: 'bank',
 44: 'by',
 45: 'or',
 46: 'when',
 47: 'did',
 48: 'are',
 49: 'am',
 50: 'do',
 51: "n't",
 52: 'pay',
 53: 'interest',
 54: 'time',
 55: 'has',
 56: 'which',
 57: 'were',
 58: 'so',
 59: 'all',
 60: 'their',
 61: 'called',
 62: 'navient',
 63: 'because',
 64: 'if',
 65: 'out',
 66: 'after',
 67: 'received',
 68: 'we',
 69: 'can',
 70: "''",
 71: 'any',
 72: 'you',
 73: 'could',
 74: "'s",
 75: 'due',
 76: 'never',
 77: 'student',
 78: 'up',
 79: 'about',
 80: 'will',
 81:

In [28]:
# Pick one complaint and compare its raw text with what the tokenizer retained.
comment_idx_to_preview = 19

# `sequences` is a plain list (positional), so read the complaint with .iloc to
# get the same position regardless of how the Series happens to be indexed.
print('Original complaint text:')
print(complaints.iloc[comment_idx_to_preview])
print('\n\n')

# The reverse_index cell above must have been run for this cell to execute successfully.
# Fall back to '?' for any index missing from the mapping so join() never receives None.
decoded_review = ' '.join(reverse_index.get(i, '?') for i in sequences[comment_idx_to_preview])
print('Decoded review from Tokenizer:')
print(decoded_review)

Original complaint text:
I have already filed several complaints about AES/PHEAA. I was notified by a XXXX XXXX let @ XXXX, who pretended to be from your office, he said he was from CFPB. I found out this morning he is n't from your office, but is actually works at XXXX. 

This has wasted weeks of my time. They AES/PHEAA confirmed and admitted ( see attached transcript of XXXX, conversation at XXXX ( XXXX ) with XXXX that proves they verified the loans are not mine ) the student loans they had XXXX, and collected on, and reported negate credit reporting in my name are in fact, not mine. 
They conclued their investigation on XXXX admitting they made a mistake and have my name on soneone elses loans. I these XXXX loans total {$10000.00}, original amount. My XXXX loans I got was total {$3500.00}. We proved by providing AES/PHEAA, this with my original promissary notes I located recently, the XXXX of my college provided AES/PHEAA with their original shoeinf amounts of my XXXX loans which s

In [31]:
### Convert the Products to Numerical Categories

In [30]:
product = data['Product']

# Map each product name to an integer class id
le = preprocessing.LabelEncoder()
le.fit(product)
print('Original class labels:')
print(list(le.classes_))
print('\n')
product_cat = le.transform(product)

# To retrieve the original descriptive labels later, pass valid class ids, e.g.:
# list(le.inverse_transform([0, 1, 2]))

print('New product labels:')
print(product_cat)
print('\n')

# Each row will be all zeros except for the column of that observation's category.
# The column count is derived from the fitted encoder so the message cannot go stale.
print(f'One hot labels; {len(le.classes_)} binary columns, one for each of the categories.')
product_onehot = to_categorical(product_cat)
print(product_onehot)
print('\n')

print('One hot labels shape:')
print(np.shape(product_onehot))

Original class labels:
['Bank account or service', 'Credit card', 'Student loan']


New product labels:
[2 2 2 ... 0 0 0]


One hot labels; 7 binary columns, one for each of the categories.
[[0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 ...
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]]


One hot labels shape:
(23795, 3)


In [37]:
# Shape of the one-hot label matrix: (number of complaints, number of classes)
product_onehot.shape

(23795, 3)

In [38]:
# Display the one-hot label matrix
product_onehot

array([[0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       ...,
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.]], dtype=float32)

### Train-test split

In [32]:
from sklearn.model_selection import train_test_split 

In [35]:
# Integer class ids produced by the LabelEncoder
product_cat

array([2, 2, 2, ..., 0, 0, 0])

In [None]:
# Features: the binary word-presence matrix built by the tokenizer (one row per complaint).
# Labels: the one-hot encoded product categories, derived from the same rows of `data`
# and therefore aligned with the features.
X = one_hot_results
y = product_onehot

In [None]:
# Hold out 33% of the complaints for testing. Fixing random_state makes the shuffle,
# and therefore every downstream result, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)