# 1 Processing

In [19]:
import  pandas as pd
import logging
import glob
from sklearn.model_selection import train_test_split
# Root logger: emit WARNING and above only, so library INFO/DEBUG messages
# do not flood the cell outputs.
logger = logging.getLogger()
logger.setLevel(logging.WARNING)
# Allow up to 500 characters per DataFrame cell so the long complaint
# narratives stay readable in previews.
pd.set_option('display.max_colwidth', 500)

In [20]:
# Load the raw complaints and keep only the rows that have both a narrative
# (the model input) and an issue label (the model target).
data = pd.read_csv('consumer_complaints.csv')
pData = data.dropna(subset=["Consumer complaint narrative", "Issue"])

# Hold out 12% of the usable rows; `testdf` is used for the prediction demo at
# the end of the notebook. The fixed seed makes the same rows land in train and
# test on every Restart & Run All, so the saved vectors and the demo rows are
# comparable between runs.
traindf, testdf = train_test_split(pData,
                                   test_size=.12,
                                   random_state=42)
# print data sizes
print(f'Train: {traindf.shape[0]:,} rows {traindf.shape[1]:,} columns')
print(f'Test: {testdf.shape[0]:,} rows {testdf.shape[1]:,} columns')
traindf.head(3)

Train: 290,028 rows 18 columns
Test: 39,550 rows 18 columns


Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Company,State,ZIP code,Tags,Consumer consent provided?,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID
1118279,02/13/2018,"Credit reporting, credit repair services, or other personal consumer reports",Credit reporting,Unable to get your credit report or credit score,Other problem getting your report or credit score,I AM UNABLE TO OBATIN MY EQUIFAX CREDIT FILE OR SCORE. \nPREVIOUSLY I WAS ABLE TO PULL MY REPORT UP UNTIL XX/XX/XXXX THEN ON XX/XX/XXXX I TRYED REQUESTING MY FILE TROUGH XXXX AND I WAS ADVISED BY BOTH XXXX AND EQUIFAX THAT MY FILE IS MISSING MY SOCIAL SECURITY NUMBER AND FOR THAT REASON I AM UNABLE TO OBTAIN MY REPORT! \n\nI SPOKE TO THEM SEVERAL TIMES. \nEQUIFAX HAS BEEN GIVING ME THE RUNAROUND THE HAVE BEEN RUDE UNPROFESSIONAL AND UNCOMFORTABLE. I WAS ADVISED BY GLOBAL EQUIFAX ON XX/XX/XXX...,,"EQUIFAX, INC.",FL,331XX,,Consent provided,Web,02/13/2018,Closed with explanation,Yes,,2813030
435082,06/23/2015,Bank account or service,Checking account,"Making/receiving payments, sending money",,hello i have filed a complain about wells fargo with cfpb complain # XXXX and explained that even though a let them know that reason i was canceling my debt card with them was that XXXX rent a car was attempting to charge my card something i dont owe and canceled the card befor they actuly charged and still wells fargo paid those fradulant chargesthey did not do anything and know this company continues to charge my card as of XX/XX/XXXX XXXXwith no reason please help me my account is about t...,Company chooses not to provide a public response,WELLS FARGO & COMPANY,GA,300XX,,Consent provided,Web,06/23/2015,Closed with monetary relief,Yes,No,1434805
1103519,08/04/2015,Debt collection,I do not know,Communication tactics,Frequent or repeated calls,They say they are from Fitzgerald and associates XXXX. They will not tell me what it about just that i owe money. They refuse to create paper trail or provide information. They will not provide me anything in writing. They threatened to take my car away. They called me on my cell phone at XXXX on XXXX. They called me at work on XXXX and i asked them not to call me here. They called again on XXXX i asked them not to call me again.,Company believes it acted appropriately as authorized by contract or law,"Fitzgerald Goldman & Associates, Inc.",WI,531XX,,Consent provided,Web,08/04/2015,Closed with explanation,No,Yes,1501032


In [21]:
# Plain Python lists of strings for the text preprocessors. Both columns are
# converted the same way so narrative i and issue i stay positionally paired
# and neither carries the shuffled DataFrame index along.
trainRawNarrative = traindf['Consumer complaint narrative'].tolist()
trainIssueRaw = traindf['Issue'].tolist()

In [22]:
%reload_ext autoreload
%autoreload 2
from ktext.preprocess import processor

In [23]:
%%time
# Clean, tokenize, and apply padding / truncating such that each document
#  length = BODY_MAX_LEN (144 tokens).
#  Also, retain only the top BODY_VOCAB_SIZE (8,000) words in the vocabulary and
#  set the remaining words to 1 which will become common index for rare words
BODY_VOCAB_SIZE = 8000
BODY_MAX_LEN = 144
body_pp = processor(keep_n=BODY_VOCAB_SIZE, padding_maxlen=BODY_MAX_LEN)
train_body_vecs = body_pp.fit_transform(trainRawNarrative)



Wall time: 5min


In [24]:
# Preprocessor for the target text (the complaint "Issue" label): vocabulary
#  capped at 9,000 words, sequences padded at the end ('post') to 12 tokens.
#  append_indicators=True presumably adds start/end markers around each label
#  -- confirm against the ktext documentation.
issue_pp = processor(
    append_indicators=True,
    keep_n=9000,
    padding_maxlen=12,
    padding='post',
)

# Fit on the training issue labels and vectorize them in the same call
train_issue_vecs = issue_pp.fit_transform(trainIssueRaw)



In [25]:
import dill as dpickle
import numpy as np

# Persist both fitted preprocessors with dill (body first, then issue)
for pp_path, pp_obj in (('body_pp.dpkl', body_pp), ('issue_pp.dpkl', issue_pp)):
    with open(pp_path, 'wb') as pp_file:
        dpickle.dump(pp_obj, pp_file)

# Persist the vectorized training data (issue vectors first, then body vectors)
for vec_path, vec_array in (('train_issue_vecs.npy', train_issue_vecs),
                            ('train_body_vecs.npy', train_body_vecs)):
    np.save(vec_path, vec_array)

In [26]:
from seq2seq_utils import load_decoder_inputs, load_encoder_inputs, load_text_processor

In [27]:
# Encoder side: the saved narrative vectors plus the per-document length that
# later fixes the encoder Input shape.
encoder_bundle = load_encoder_inputs('train_body_vecs.npy')
encoder_input_data, doc_length = encoder_bundle

# Decoder side: input and target arrays built from the saved issue vectors.
# NOTE(review): the printed shapes are one token shorter than the 12-token
# padded issue vectors -- presumably the one-step teacher-forcing shift;
# confirm in seq2seq_utils.
decoder_bundle = load_decoder_inputs('train_issue_vecs.npy')
decoder_input_data, decoder_target_data = decoder_bundle

Shape of encoder input: (290028, 144)
Shape of decoder input: (290028, 11)
Shape of decoder target: (290028, 11)


In [28]:
# Reload the fitted preprocessors from disk; each call also returns the
# vocabulary size that is used as the input dimension of the matching
# Embedding layer.
num_encoder_tokens, body_pp = load_text_processor('body_pp.dpkl')
# Bind the decoder preprocessor to `issue_pp`, the name the inference cell
# passes as `decoder_preprocessor`, so the notebook also works when it is
# resumed from the saved artifacts without re-fitting the preprocessors.
num_decoder_tokens, issue_pp = load_text_processor('issue_pp.dpkl')
# Backward-compatible alias for the name this cell previously defined.
title_pp = issue_pp

Size of vocabulary for body_pp.dpkl: 8,002
Size of vocabulary for issue_pp.dpkl: 244


In [29]:
%matplotlib inline
from keras.models import Model
from keras.layers import Input, LSTM, GRU, Dense, Embedding, Bidirectional, BatchNormalization
from keras import optimizers

In [30]:
# Arbitrarily chosen latent dimension; the same value is used for the embedding
# size and for the number of GRU hidden units in both encoder and decoder.
latent_dim = 300

##### Define Model Architecture ######

########################
#### Encoder Model ####
# Fixed-length input: one row of `doc_length` token ids per complaint narrative.
encoder_inputs = Input(shape=(doc_length,), name='Encoder-Input')

# Word embedding for the encoder (the complaint narrative / "body").
# mask_zero=False: index 0 is not masked (presumably the padding index --
# confirm against the preprocessor).
x = Embedding(num_encoder_tokens, latent_dim, name='Body-Word-Embedding', mask_zero=False)(encoder_inputs)
x = BatchNormalization(name='Encoder-Batchnorm-1')(x)

# Intermediate GRU layer (optional, currently disabled)
#x = GRU(latent_dim, name='Encoder-Intermediate-GRU', return_sequences=True)(x)
#x = BatchNormalization(name='Encoder-Batchnorm-2')(x)

# We do not need the encoder output, just the final hidden state.
_, state_h = GRU(latent_dim, return_state=True, name='Encoder-Last-GRU')(x)

# Encapsulate the encoder as a separate model so we can just
#  encode without decoding if we want to.
encoder_model = Model(inputs=encoder_inputs, outputs=state_h, name='Encoder-Model')

# Encoder state as seen from inside the combined seq2seq graph.
seq2seq_encoder_out = encoder_model(encoder_inputs)

########################
#### Decoder Model ####
# Variable-length input (shape=(None,)) holding the target-side token ids.
decoder_inputs = Input(shape=(None,), name='Decoder-Input')  # for teacher forcing

# Word embedding for the decoder (the complaint "Issue" text)
dec_emb = Embedding(num_decoder_tokens, latent_dim, name='Decoder-Word-Embedding', mask_zero=False)(decoder_inputs)
dec_bn = BatchNormalization(name='Decoder-Batchnorm-1')(dec_emb)

# Set up the decoder GRU, using the encoder's final hidden state
#  (`seq2seq_encoder_out`) as its initial state. return_sequences=True yields
#  one output per decoder time step; the returned state is discarded here.
decoder_gru = GRU(latent_dim, return_state=True, return_sequences=True, name='Decoder-GRU')
decoder_gru_output, _ = decoder_gru(dec_bn, initial_state=seq2seq_encoder_out)
x = BatchNormalization(name='Decoder-Batchnorm-2')(decoder_gru_output)

# Dense softmax layer: a distribution over the decoder vocabulary per time step
decoder_dense = Dense(num_decoder_tokens, activation='softmax', name='Final-Output-Dense')
decoder_outputs = decoder_dense(x)

########################
#### Seq2Seq Model ####

# Leftover from an earlier variant that used a separate decoder model:
#seq2seq_decoder_out = decoder_model([decoder_inputs, seq2seq_encoder_out])
# Combined model: (narrative tokens, issue tokens) -> per-step token probabilities
seq2seq_Model = Model([encoder_inputs, decoder_inputs], decoder_outputs)


# sparse_categorical_crossentropy takes integer token ids as targets, so the
# targets do not need to be one-hot encoded.
# NOTE(review): `lr` is the older Keras spelling; newer releases call this
# argument `learning_rate` -- confirm against the installed Keras version.
seq2seq_Model.compile(optimizer=optimizers.Nadam(lr=0.001), loss='sparse_categorical_crossentropy')

In [31]:
from seq2seq_utils import viz_model_architecture
# Print the layer-by-layer summary (output shapes, parameter counts, wiring)
# of the combined encoder/decoder model.
seq2seq_Model.summary()
# Optional graph rendering through the helper imported above (left disabled).
#viz_model_architecture(seq2seq_Model)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Decoder-Input (InputLayer)      (None, None)         0                                            
__________________________________________________________________________________________________
Decoder-Word-Embedding (Embeddi (None, None, 300)    73200       Decoder-Input[0][0]              
__________________________________________________________________________________________________
Encoder-Input (InputLayer)      (None, 144)          0                                            
__________________________________________________________________________________________________
Decoder-Batchnorm-1 (BatchNorma (None, None, 300)    1200        Decoder-Word-Embedding[0][0]     
__________________________________________________________________________________________________
Encoder-Mo

In [32]:
from keras.callbacks import CSVLogger, ModelCheckpoint

# Every artifact written during training shares this filename prefix.
script_name_base = 'tutorial_seq2seq'

# CSVLogger writes the per-epoch training log to tutorial_seq2seq.log
csv_logger = CSVLogger('{:}.log'.format(script_name_base))

# The doubled braces survive .format() as {epoch:02d} / {val_loss:.5f}
# placeholders that Keras fills in when it writes a checkpoint. With
# save_best_only=True a file is only written when the monitored metric improves.
checkpoint_path = '{:}.epoch{{epoch:02d}}-val{{val_loss:.5f}}.hdf5'.format(script_name_base)
model_checkpoint = ModelCheckpoint(checkpoint_path, save_best_only=True)

batch_size = 1200
epochs = 7

# The integer targets get a trailing axis of size 1 before being passed to fit.
train_inputs = [encoder_input_data, decoder_input_data]
train_targets = np.expand_dims(decoder_target_data, -1)
history = seq2seq_Model.fit(
    train_inputs,
    train_targets,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.12,
    callbacks=[csv_logger, model_checkpoint],
)

Train on 255224 samples, validate on 34804 samples
Epoch 1/7


  '. They will not be included '


Epoch 2/7
Epoch 3/7


Epoch 4/7


Epoch 5/7


Epoch 6/7


Epoch 7/7




In [33]:
# Save the trained seq2seq model to 'seq2seq_model_tutorial.h5'
seq2seq_Model.save('seq2seq_model_tutorial.h5')

  '. They will not be included '


In [34]:
from seq2seq_utils import Seq2Seq_Inference
# Bundle the trained model with both text preprocessors for inference
inference_kwargs = dict(
    encoder_preprocessor=body_pp,
    decoder_preprocessor=issue_pp,
    seq2seq_model=seq2seq_Model,
)
seq2seq_inf = Seq2Seq_Inference(**inference_kwargs)

# Demo on the held-out split: n=50 examples drawn from `testdf`
seq2seq_inf.demo_model_predictions(n=50, issue_df=testdf)




2332585
Issue Body:
 XX/XX/2017. 

XXXX XXXX XXXX, FL XXXX Equifax XXXX XXXX XXXX MA XXXX XXXX Credit Reporting : Please be advised this is my XXXX written request. The unverified items listed below remain on my credit report in violation of the Federal Law. You are required under FCRA to have a copy of the original creators documentation on file to verify that this information is mine is correct. In the results of your fist investigation, you stated in writing that you " verified '' that these items are being " reported correctly ''? Who verified these accounts? 

You have not provided me a copy of any original documentation required under Section XXXX ( a ) ( XXXX ) ( A XXXX & XXXX ( a ) ( XXXX ) ( A ) ( a consumer contract with my signature on it ) and Section XXXX ( XXXX ) ( A ) of the FCRA - You are required to " ... promptly DELETE all information which can not be verified '' The law is very clear as to the civil liability and remedy available to me for " negligent noncomplian


Original Title:
 Struggling to pay mortgage

****** Machine Generated Title (Prediction) ******:
 struggling to pay mortgage



2956804
Issue Body:
 Never my addresses : XXXX XXXX XXXX XXXX XXXX, GA XXXX XXXX XXXX XXXX XXXX XXXX XXXX, GA XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX, GA XXXX 

Original Title:
 Incorrect information on your report

****** Machine Generated Title (Prediction) ******:
 incorrect information on your report



2892532
Issue Body:
 Received letter from Capital One saying my account was closed due to inactivity.

Account was closed onXX/XX/XXXXI received the letter on XX/XX/XXXX The letter is dated on XX/XX/XXXX.

By Capital Bank closing my account they have harmed by credit score and availability to get credit. Since my available credit has gone down it now looks like I am not a good candidate for the best available interest rate to members with A-/B+ status.


Abusive on their part. Now they want me to apply for a new credit card with my credit limit being 


Original Title:
 Other transaction problem

****** Machine Generated Title (Prediction) ******:
 fraud or scam



1409280
Issue Body:
 Purchased a computer through XXXX. This purchase was financed by applying for the XXXX store card. It was approved at {$2200.00}. Computer cost was only {$1600.00}. XXXX ( XXXX ) days later, MyFICO reported a 100 % account balance increase. Received the shipment from the XXXX seller approximatly XXXX days later. Have not received the necessary information to setup payment account ( online ) from issuing company ( XXXX XXXX ). Contacted XXXX XXXX ( issuer of credit terms ), and they could not help without having an account number. XXXX does not provide this information ( good security, but, negatively impacts credit terms ). When contacting XXXX XXXX, automated voice line stated that there was a balance of {$1500.00}, with {$600.00} of credit to use. This is not reflective of what was reported to Equifax ( per FICO 's complaint process to Equifax ). Thi


Original Title:
 Disclosure verification of debt

****** Machine Generated Title (Prediction) ******:
 disclosure verification of debt



1539228
Issue Body:
 I have had a XXXX XXXX account since XXXX. My account has always been current - as a XXXX studentXXXX and professional. My current balance is {$19000.00}. 

I was laid off from my job in XXXX, XXXX. I contacted XXXX regarding my job status in XXXX, XXXX and let you know that I would be living off my bi-monthly unemployment benefits for the unforeseeable future. In the meantime, I would continue my fervent job hunt. At that time, I set up a monthly payment plan of {$97.00} for three months in which no interest, late fees or over -the- credit- line fees would be applied. I also agreed to close my account for the time being. I asked what would happen after three months if I had not found a job yet and the representative assured me that we would continue the agreement- if need be. 

Three months later, after having made my monthly p


Original Title:
 Managing the loan or lease

****** Machine Generated Title (Prediction) ******:
 charged fees or interest i didn t expect



2338446
Issue Body:
 The IRS agreed to take the tax lien off of my credit file in a direct conversation with them. It should not have been placed as a lien. I disputed this with the bureaus. XXXX removed it but Trans Union did not. 

Original Title:
 Incorrect information on credit report

****** Machine Generated Title (Prediction) ******:
 incorrect information on your report



2771191
Issue Body:
 I have several inquiries that are appearing on my report that are not mine. XXXX XXXX # XXXX, XXXX # XXXX, XXXX, XXXX XXXX. Would you please have these inquiries removed that are a result of attempted fraud. I have fraud alerts that are in place on my report that will not allow new accounts to be opened without my consent. 

Original Title:
 Improper use of your report

****** Machine Generated Title (Prediction) ******:
 incorrect information on y


1867861
Issue Body:
 Honda Finance Services has an Easy Pay service which automatically withdraws a monthly payment. They withdrew this monthly payment twice. This is apparently a routine problem with them, and is affecting XXXX people today. There is no way to contact them to report a problem or resolve it, other than a toll free number that is not working today, and at the best of times does not allow direct contact with a customer representative. 

Original Title:
 Managing the loan or lease

****** Machine Generated Title (Prediction) ******:
 managing the loan or lease



2883112
Issue Body:
 My account was reported as past due, however, I was approved for a forbearance after speaking with lender. The same is not shown on my credit report. I am requesting that this correction be made to my report as it is affecting my positive payment history and creditworthiness. 

Towards that end, I am also befuddled as to how this account is being managed. I have not received any corresponden


Original Title:
 Loan servicing, payments, escrow account

****** Machine Generated Title (Prediction) ******:
 loan modification collection foreclosure



2140793
Issue Body:
 my credit is set up under XXXX XXXX for a purpose. The landlord/ management of this mobile home park in XXXX illegally checked my credit. How do I know this? the mobile home park sent me XXXX and XXXX XXXX as there is their right. However, I signed the lease as XXXX. XXXX XXXX only Experian, XXXX XXXX, and XXXX know anything about XXXX XXXX the lease signature is attached 

Original Title:
 Improper use of my credit report

****** Machine Generated Title (Prediction) ******:
 incorrect information on credit report



2482750
Issue Body:
 failure to acknowledge a RFI within  XXXX  business days. 

Original Title:
 Trouble during payment process

****** Machine Generated Title (Prediction) ******:
 loan servicing payments escrow account



2554128
Issue Body:
 Due to financial hardship I was in a status called fo


Original Title:
 Improper use of your report

****** Machine Generated Title (Prediction) ******:
 improper use of your report



1304531
Issue Body:
 I could no longer pay these enormous charges so I hired a company in FL to take over for me. Either they did nothing or the Pay Day loan company will not accept their terms. I get several letters a week and they have threatened to take civil action against me. I am over XXXX and I get a XXXX check. 

Original Title:
 Taking/threatening an illegal action

****** Machine Generated Title (Prediction) ******:
 taking threatening an illegal action



2109318
Issue Body:
 I have received several emails over the past few years regarding an " account '' with Wells Fargo, that I know nothing about. They seem to to be coming from different senders within the bank. I was able to retrieve one of the emails sent recently ( see attached ). The email was sent to : XXXX. 

Original Title:
 Account opening, closing, or management

****** Machine Generat