# 1 Processing

In [19]:
import  pandas as pd
import logging
import glob
from sklearn.model_selection import train_test_split
# Widen pandas' column display to 500 characters so the long complaint
# narratives are readable in DataFrame previews.
pd.options.display.max_colwidth = 500

# Raise the root logger's threshold to WARNING for the rest of the notebook.
logger = logging.getLogger()
logger.setLevel(logging.WARNING)

In [20]:
# Split training and test
# Load the raw complaints and keep only the rows that have both a narrative
# (the model input) and an issue (the model target).
data = pd.read_csv('consumer_complaints.csv')
pData = data.dropna(subset=["Consumer complaint narrative", "Issue"])

# Hold out 12% of the usable rows. The seed is fixed so that the split --
# and with it every artifact saved later in the notebook and the held-out
# set used for the final demo -- is identical after a kernel restart.
traindf, testdf = train_test_split(pData,
                                   test_size=.12,
                                   random_state=42)

# print data sizes
print(f'Train: {traindf.shape[0]:,} rows {traindf.shape[1]:,} columns')
print(f'Test: {testdf.shape[0]:,} rows {testdf.shape[1]:,} columns')
traindf.head(3)

Train: 290,028 rows 18 columns
Test: 39,550 rows 18 columns


Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Company,State,ZIP code,Tags,Consumer consent provided?,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID
1118279,02/13/2018,"Credit reporting, credit repair services, or other personal consumer reports",Credit reporting,Unable to get your credit report or credit score,Other problem getting your report or credit score,I AM UNABLE TO OBATIN MY EQUIFAX CREDIT FILE OR SCORE. \nPREVIOUSLY I WAS ABLE TO PULL MY REPORT UP UNTIL XX/XX/XXXX THEN ON XX/XX/XXXX I TRYED REQUESTING MY FILE TROUGH XXXX AND I WAS ADVISED BY BOTH XXXX AND EQUIFAX THAT MY FILE IS MISSING MY SOCIAL SECURITY NUMBER AND FOR THAT REASON I AM UNABLE TO OBTAIN MY REPORT! \n\nI SPOKE TO THEM SEVERAL TIMES. \nEQUIFAX HAS BEEN GIVING ME THE RUNAROUND THE HAVE BEEN RUDE UNPROFESSIONAL AND UNCOMFORTABLE. I WAS ADVISED BY GLOBAL EQUIFAX ON XX/XX/XXX...,,"EQUIFAX, INC.",FL,331XX,,Consent provided,Web,02/13/2018,Closed with explanation,Yes,,2813030
435082,06/23/2015,Bank account or service,Checking account,"Making/receiving payments, sending money",,hello i have filed a complain about wells fargo with cfpb complain # XXXX and explained that even though a let them know that reason i was canceling my debt card with them was that XXXX rent a car was attempting to charge my card something i dont owe and canceled the card befor they actuly charged and still wells fargo paid those fradulant chargesthey did not do anything and know this company continues to charge my card as of XX/XX/XXXX XXXXwith no reason please help me my account is about t...,Company chooses not to provide a public response,WELLS FARGO & COMPANY,GA,300XX,,Consent provided,Web,06/23/2015,Closed with monetary relief,Yes,No,1434805
1103519,08/04/2015,Debt collection,I do not know,Communication tactics,Frequent or repeated calls,They say they are from Fitzgerald and associates XXXX. They will not tell me what it about just that i owe money. They refuse to create paper trail or provide information. They will not provide me anything in writing. They threatened to take my car away. They called me on my cell phone at XXXX on XXXX. They called me at work on XXXX and i asked them not to call me here. They called again on XXXX i asked them not to call me again.,Company believes it acted appropriately as authorized by contract or law,"Fitzgerald Goldman & Associates, Inc.",WI,531XX,,Consent provided,Web,08/04/2015,Closed with explanation,No,Yes,1501032


In [21]:
# Pull the model input (narrative) and target (issue) out of the training
# frame as plain Python lists. Both are converted the same way so the two
# text processors receive the same kind of object; a pandas Series here would
# still carry the shuffled row index left over from the train/test split.
trainRawNarrative = traindf['Consumer complaint narrative'].tolist()
trainIssueRaw = traindf['Issue'].tolist()

In [22]:
%reload_ext autoreload
%autoreload 2
from ktext.preprocess import processor

In [23]:
%%time
# Clean, tokenize, and apply padding / truncating such that each document
#  length = 144 (padding_maxlen). Also, retain only the top 8,000 words in
#  the vocabulary (keep_n) and set the remaining words to 1, which will
#  become the common index for rare words.
body_pp = processor(keep_n=8000, padding_maxlen=144)
# Fit the vocabulary on the training narratives and vectorize them in one pass.
train_body_vecs = body_pp.fit_transform(trainRawNarrative)



Wall time: 5min


In [24]:
# Text processor for the target side (the complaint "Issue" strings).
# Sequences are padded at the end ('post') to a fixed length of 12.
# NOTE(review): append_indicators=True presumably adds start/end-of-sequence
# markers for the decoder -- confirm against the ktext documentation.
issue_pp = processor(
    append_indicators=True,
    keep_n=9000,
    padding_maxlen=12,
    padding='post',
)

# Fit the vocabulary on the training issues and vectorize them.
train_issue_vecs = issue_pp.fit_transform(trainIssueRaw)



In [25]:
import dill as dpickle
import numpy as np

# Save the preprocessors: each fitted text processor is serialized with dill
# to its own file so it can be reloaded without re-fitting.
for pp_path, pp_obj in (('body_pp.dpkl', body_pp),
                        ('issue_pp.dpkl', issue_pp)):
    with open(pp_path, 'wb') as f:
        dpickle.dump(pp_obj, f)

# Save the processed data: the vectorized issues and narratives as .npy files.
for vec_path, vecs in (('train_issue_vecs.npy', train_issue_vecs),
                       ('train_body_vecs.npy', train_body_vecs)):
    np.save(vec_path, vecs)

In [26]:
from seq2seq_utils import load_decoder_inputs, load_encoder_inputs, load_text_processor

In [27]:
# Reload the vectorized training data saved earlier in the notebook.
# load_encoder_inputs returns the encoder matrix together with the document
# length that is later used as the encoder Input shape.
encoder_input_data, doc_length = load_encoder_inputs('train_body_vecs.npy')
# load_decoder_inputs returns a decoder input / decoder target pair.
# NOTE(review): the two are presumably the issue sequence offset by one step
# for teacher forcing (the saved vectors are padded to 12 while both arrays
# are reported with 11 columns) -- confirm in seq2seq_utils.
decoder_input_data, decoder_target_data = load_decoder_inputs('train_issue_vecs.npy')

Shape of encoder input: (290028, 144)
Shape of decoder input: (290028, 11)
Shape of decoder target: (290028, 11)


In [28]:
# Reload the fitted text processors together with their vocabulary sizes;
# the sizes are used to dimension the embedding and output layers.
num_encoder_tokens, body_pp = load_text_processor('body_pp.dpkl')
# Bind the reloaded issue processor to `issue_pp`, the name the inference
# cell reads, so the notebook also works when resumed from the saved
# artifacts without re-running the fitting cells.
num_decoder_tokens, issue_pp = load_text_processor('issue_pp.dpkl')
# Backward-compatible alias for the name this cell used to define.
title_pp = issue_pp

Size of vocabulary for body_pp.dpkl: 8,002
Size of vocabulary for issue_pp.dpkl: 244


In [29]:
%matplotlib inline
from keras.models import Model
from keras.layers import Input, LSTM, GRU, Dense, Embedding, Bidirectional, BatchNormalization
from keras import optimizers

In [30]:
# Arbitrarily chosen latent dimension; the same value is used for the size of
# both word embeddings and for the number of hidden units in both GRUs.
latent_dim = 300

##### Define Model Architecture ######

########################
#### Encoder Model ####
# Fixed-length input: one row of `doc_length` token ids per complaint narrative.
encoder_inputs = Input(shape=(doc_length,), name='Encoder-Input')

# Word embedding for the encoder (the complaint narrative).
# mask_zero=False: no masking is applied, so every position of the padded
# sequence is fed to the GRU.
x = Embedding(num_encoder_tokens, latent_dim, name='Body-Word-Embedding', mask_zero=False)(encoder_inputs)
x = BatchNormalization(name='Encoder-Batchnorm-1')(x)

# Intermediate GRU layer (optional)
#x = GRU(latent_dim, name='Encoder-Intermediate-GRU', return_sequences=True)(x)
#x = BatchNormalization(name='Encoder-Batchnorm-2')(x)

# We do not need the encoder output, just the hidden state that is returned
# alongside it because return_state=True.
_, state_h = GRU(latent_dim, return_state=True, name='Encoder-Last-GRU')(x)

# Encapsulate the encoder as a separate model so we can just
#  encode without decoding if we want to.
encoder_model = Model(inputs=encoder_inputs, outputs=state_h, name='Encoder-Model')

# Encoder hidden state as a tensor inside the full seq2seq graph.
seq2seq_encoder_out = encoder_model(encoder_inputs)

########################
#### Decoder Model ####
# Variable-length input (shape=(None,)) holding the decoder token ids.
decoder_inputs = Input(shape=(None,), name='Decoder-Input')  # for teacher forcing

# Word embedding for the decoder (the complaint Issue text).
dec_emb = Embedding(num_decoder_tokens, latent_dim, name='Decoder-Word-Embedding', mask_zero=False)(decoder_inputs)
dec_bn = BatchNormalization(name='Decoder-Batchnorm-1')(dec_emb)

# Set up the decoder, using the encoder's hidden state (`seq2seq_encoder_out`)
# as its initial state. return_sequences=True yields one output per time
# step; the returned state is discarded here.
decoder_gru = GRU(latent_dim, return_state=True, return_sequences=True, name='Decoder-GRU')
decoder_gru_output, _ = decoder_gru(dec_bn, initial_state=seq2seq_encoder_out)
x = BatchNormalization(name='Decoder-Batchnorm-2')(decoder_gru_output)

# Dense layer for prediction: a softmax over the decoder vocabulary applied
# to each time step of the decoder output.
decoder_dense = Dense(num_decoder_tokens, activation='softmax', name='Final-Output-Dense')
decoder_outputs = decoder_dense(x)

########################
#### Seq2Seq Model ####

# The full model maps [encoder tokens, decoder tokens] to the per-step
# softmax output of the decoder.
#seq2seq_decoder_out = decoder_model([decoder_inputs, seq2seq_encoder_out])
seq2seq_Model = Model([encoder_inputs, decoder_inputs], decoder_outputs)


# The sparse categorical cross-entropy loss takes integer class ids as
# targets, so the targets do not have to be one-hot encoded.
seq2seq_Model.compile(optimizer=optimizers.Nadam(lr=0.001), loss='sparse_categorical_crossentropy')

In [31]:
# viz_model_architecture is imported for the optional graph rendering below,
# which is currently disabled.
from seq2seq_utils import viz_model_architecture
# Print the layer-by-layer summary (output shapes, parameter counts, connections).
seq2seq_Model.summary()
#viz_model_architecture(seq2seq_Model)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Decoder-Input (InputLayer)      (None, None)         0                                            
__________________________________________________________________________________________________
Decoder-Word-Embedding (Embeddi (None, None, 300)    73200       Decoder-Input[0][0]              
__________________________________________________________________________________________________
Encoder-Input (InputLayer)      (None, 144)          0                                            
__________________________________________________________________________________________________
Decoder-Batchnorm-1 (BatchNorma (None, None, 300)    1200        Decoder-Word-Embedding[0][0]     
__________________________________________________________________________________________________
Encoder-Mo

In [None]:
from keras.callbacks import CSVLogger, ModelCheckpoint

# Run configuration.
script_name_base = 'tutorial_seq2seq'
batch_size = 1200
epochs = 7

# Output locations derived from the run name. The checkpoint name keeps the
# literal {epoch}/{val_loss} placeholders for Keras to fill in on each save.
log_path = '{:}.log'.format(script_name_base)
checkpoint_path = '{:}.epoch{{epoch:02d}}-val{{val_loss:.5f}}.hdf5'.format(script_name_base)

# Callbacks: a CSV log of the training history and model checkpoints
# (save_best_only=True).
csv_logger = CSVLogger(log_path)
model_checkpoint = ModelCheckpoint(checkpoint_path, save_best_only=True)

# The model takes [encoder tokens, decoder tokens]; the targets get a trailing
# axis of size 1 added before being passed to the sparse cross-entropy loss.
train_inputs = [encoder_input_data, decoder_input_data]
train_targets = np.expand_dims(decoder_target_data, -1)

# 12% of the training samples are held out for validation.
history = seq2seq_Model.fit(
    train_inputs,
    train_targets,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.12,
    callbacks=[csv_logger, model_checkpoint],
)

Train on 255224 samples, validate on 34804 samples
Epoch 1/7


  '. They will not be included '


Epoch 2/7
Epoch 3/7




In [None]:
# Save the model in its current state, i.e. as it is when this cell runs.
# NOTE(review): this is separate from the per-epoch files written by the
# ModelCheckpoint callback (save_best_only=True) during training, so the file
# saved here is not necessarily the checkpoint with the best validation loss.
seq2seq_Model.save('seq2seq_model_tutorial.h5')

In [None]:
from seq2seq_utils import Seq2Seq_Inference
# Wrap the trained model together with the two text processors for inference.
seq2seq_inf = Seq2Seq_Inference(encoder_preprocessor=body_pp,
                                 decoder_preprocessor=issue_pp,
                                 seq2seq_model=seq2seq_Model)
# Demo predictions with n=50 on the held-out test split.
# NOTE(review): testdf is the raw DataFrame from the train/test split; confirm
# that demo_model_predictions reads the column names present in it (e.g.
# "Consumer complaint narrative" and "Issue").
seq2seq_inf.demo_model_predictions(n=50, issue_df=testdf)