# 1 Processing

In [1]:
import  pandas as pd
import logging
import glob
from sklearn.model_selection import train_test_split
# Let pandas show up to 500 characters per cell so long complaint narratives stay readable.
pd.set_option('display.max_colwidth', 500)
# getLogger() without a name returns the root logger; raising its level to WARNING
# keeps lower-severity library messages out of the cell output.
logger = logging.getLogger()
logger.setLevel(logging.WARNING)

In [2]:
# Split training and test
# Fixed seed so the 90/10 split (and everything fitted or trained on it downstream)
# is identical after every Restart Kernel -> Run All.
RANDOM_SEED = 42

data = pd.read_csv('consumer_complaints.csv')
# Keep only complaints that have a narrative and both label columns filled in.
pData = data.dropna(subset=["Consumer complaint narrative", "Issue", "Sub-issue"])
traindf, testdf = train_test_split(pData,
                                   test_size=.10,
                                   random_state=RANDOM_SEED)
# print data sizes
print(f'Train: {traindf.shape[0]:,} rows {traindf.shape[1]:,} columns')
print(f'Test: {testdf.shape[0]:,} rows {testdf.shape[1]:,} columns')
traindf.head(3)

Train: 200,970 rows 18 columns
Test: 22,331 rows 18 columns


Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Company,State,ZIP code,Tags,Consumer consent provided?,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID
1050036,08/08/2016,Debt collection,Medical,Cont'd attempts collect debt not owed,Debt is not mine,Provided no proof of debt during disputes and I have continuously disputed the debt with a credit reporting agency without any information on what the debt is from being provided by the organization.,,"Io, Inc.",VA,,Servicemember,Consent provided,Web,08/08/2016,Closed,Yes,No,2050841
1081960,12/29/2015,Student loan,Non-federal student loan,Dealing with my lender or servicer,Trouble with how payments are handled,"My Private Wells Fargo loan is literally theft. I have been unable to have a balance budget with a {$250.00} monthly payment along with all my other debts, bills, and living expenses. My city 's living expenses are going up, making it difficult on a bi-weekly basis to have money to eat. Also, making difficult to save money to pursue my passion of music, as it was reason for me to pursue a college degree and required this loan. Spoke with the Loans Consolidation department and the Loan Modifi...",Company chooses not to provide a public response,WELLS FARGO & COMPANY,TX,787XX,,Consent provided,Web,12/29/2015,Closed with explanation,Yes,No,1719881
1074571,03/16/2018,Student loan,Private student loan,Dealing with your lender or servicer,Need information about your loan balance or loan terms,"I admit that I owe something but the loan amount itself should not be more than {$10000.00} then whatever interest it would have accrued over time. \n\nAlso, the student loan company has taken my tax refund on a couple of occasions. One time it was nearly {$8000.00} The contracts they sent me if they are legitimate amount to less than {$10000.00}, nowhere near the {$30000.00} they say I owe. \n\nKeep in mind ACT student loans must have sold the rights to consolidate and take over my student ...",,"Navient Solutions, LLC.",MS,,Servicemember,Consent provided,Web,03/16/2018,Closed with explanation,Yes,,2845553


In [3]:
# Model input: the free-text complaint narratives of the training rows, as a plain Python list.
trainRawNarrative = traindf['Consumer complaint narrative'].tolist()
# Model target: the `Issue` label of the same rows.
# NOTE(review): unlike the narratives this stays a pandas Series (carrying the shuffled
# index from the split) — confirm the text processor accepts a Series, or add .tolist().
trainIssueRaw = traindf['Issue']

In [4]:
%reload_ext autoreload
%autoreload 2
from ktext.preprocess import processor

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [5]:
%%time
# Clean, tokenize, and pad / truncate the narratives so that each document length = 144
#  (padding_maxlen below). Also retain only the top 8,000 words in the vocabulary (keep_n)
#  and set the remaining words to 1, which becomes the common index for rare words.
body_pp = processor(keep_n=8000, padding_maxlen=144)
train_body_vecs = body_pp.fit_transform(trainRawNarrative)



Wall time: 2min 46s


In [6]:
# Separate text processor for the target side (the `Issue` labels): vocabulary capped at
# 4,500 words (keep_n) and sequence length set by padding_maxlen=12 with padding='post'.
# NOTE(review): append_indicators=True presumably adds start/end markers that the decoder
# relies on — confirm against the ktext documentation.
issue_pp = processor(append_indicators=True, keep_n=4500, 
                     padding_maxlen=12, padding ='post')

# Fit the processor on the training issue labels and vectorize them.
train_issue_vecs = issue_pp.fit_transform(trainIssueRaw)



In [7]:
import dill as dpickle
import numpy as np

# Persist both fitted preprocessors so they can be reloaded without refitting
for pkl_path, fitted_pp in (('body_pp.dpkl', body_pp), ('issue_pp.dpkl', issue_pp)):
    with open(pkl_path, 'wb') as pkl_file:
        dpickle.dump(fitted_pp, pkl_file)

# Persist the vectorized training data (issue targets first, then narrative bodies)
for npy_path, vecs in (('train_issue_vecs.npy', train_issue_vecs),
                       ('train_body_vecs.npy', train_body_vecs)):
    np.save(npy_path, vecs)

In [8]:
from seq2seq_utils import load_decoder_inputs, load_encoder_inputs, load_text_processor

In [9]:
# Reload the saved training vectors through the seq2seq_utils helpers.
# doc_length is used further down as the fixed input length of the encoder's Input layer.
encoder_input_data, doc_length = load_encoder_inputs('train_body_vecs.npy')
# NOTE(review): the printed decoder shapes are one step shorter than padding_maxlen (11 vs 12) —
# presumably input and target are the same sequences offset by one token for teacher forcing; confirm.
decoder_input_data, decoder_target_data = load_decoder_inputs('train_issue_vecs.npy')

Shape of encoder input: (200970, 144)
Shape of decoder input: (200970, 11)
Shape of decoder target: (200970, 11)


In [10]:
# Reload the fitted text processors together with their vocabulary sizes.
num_encoder_tokens, body_pp = load_text_processor('body_pp.dpkl')
# Bind the decoder-side processor to `issue_pp`, the name the inference cell passes as
# decoder_preprocessor, so a kernel started from the saved artefacts does not hit a NameError.
num_decoder_tokens, issue_pp = load_text_processor('issue_pp.dpkl')
# Backward-compatible alias for any code that still refers to the previous name.
title_pp = issue_pp

Size of vocabulary for body_pp.dpkl: 8,002
Size of vocabulary for issue_pp.dpkl: 115


In [11]:
%matplotlib inline
from keras.models import Model
from keras.layers import Input, LSTM, GRU, Dense, Embedding, Bidirectional, BatchNormalization
from keras import optimizers

In [12]:
# Arbitrarily chosen latent dimension, shared by the embeddings and the GRU hidden units
latent_dim = 300

##### Define Model Architecture ######

########################
#### Encoder Model ####
# One fixed-length sequence of token ids per complaint narrative.
encoder_inputs = Input(shape=(doc_length,), name='Encoder-Input')

# Word embedding for encoder (complaint narrative body)
# mask_zero=False: padding positions are not masked out for the layers that follow.
x = Embedding(num_encoder_tokens, latent_dim, name='Body-Word-Embedding', mask_zero=False)(encoder_inputs)
x = BatchNormalization(name='Encoder-Batchnorm-1')(x)

# Intermediate GRU layer (optional)
#x = GRU(latent_dim, name='Encoder-Intermediate-GRU', return_sequences=True)(x)
#x = BatchNormalization(name='Encoder-Batchnorm-2')(x)

# We do not need the `encoder_output`, just the hidden state.
_, state_h = GRU(latent_dim, return_state=True, name='Encoder-Last-GRU')(x)

# Encapsulate the encoder as a separate entity so we can just
#  encode without decoding if we want to.
encoder_model = Model(inputs=encoder_inputs, outputs=state_h, name='Encoder-Model')

seq2seq_encoder_out = encoder_model(encoder_inputs)

########################
#### Decoder Model ####
decoder_inputs = Input(shape=(None,), name='Decoder-Input')  # for teacher forcing

# Word embedding for decoder (issue labels)
dec_emb = Embedding(num_decoder_tokens, latent_dim, name='Decoder-Word-Embedding', mask_zero=False)(decoder_inputs)
dec_bn = BatchNormalization(name='Decoder-Batchnorm-1')(dec_emb)

# Set up the decoder, using the encoder's hidden state (`seq2seq_encoder_out`) as initial state.
decoder_gru = GRU(latent_dim, return_state=True, return_sequences=True, name='Decoder-GRU')
decoder_gru_output, _ = decoder_gru(dec_bn, initial_state=seq2seq_encoder_out)
x = BatchNormalization(name='Decoder-Batchnorm-2')(decoder_gru_output)

# Dense layer for prediction: softmax over the decoder vocabulary for each output step
decoder_dense = Dense(num_decoder_tokens, activation='softmax', name='Final-Output-Dense')
decoder_outputs = decoder_dense(x)

########################
#### Seq2Seq Model ####

#seq2seq_decoder_out = decoder_model([decoder_inputs, seq2seq_encoder_out])
seq2seq_Model = Model([encoder_inputs, decoder_inputs], decoder_outputs)


# Sparse loss variant: the targets are integer token ids rather than one-hot vectors.
seq2seq_Model.compile(optimizer=optimizers.Nadam(lr=0.001), loss='sparse_categorical_crossentropy')

In [13]:
from seq2seq_utils import viz_model_architecture
# Print the layer-by-layer summary (output shapes, parameter counts, connections) of the combined model.
seq2seq_Model.summary()
#viz_model_architecture(seq2seq_Model)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Decoder-Input (InputLayer)      (None, None)         0                                            
__________________________________________________________________________________________________
Decoder-Word-Embedding (Embeddi (None, None, 300)    34500       Decoder-Input[0][0]              
__________________________________________________________________________________________________
Encoder-Input (InputLayer)      (None, 144)          0                                            
__________________________________________________________________________________________________
Decoder-Batchnorm-1 (BatchNorma (None, None, 300)    1200        Decoder-Word-Embedding[0][0]     
__________________________________________________________________________________________________
Encoder-Mo

In [14]:
from keras.callbacks import CSVLogger, ModelCheckpoint

# File-name stem shared by the training log and the checkpoint files
script_name_base = 'tutorial_seq2seq'

# Per-epoch metrics are appended to '<stem>.log'
csv_logger = CSVLogger(f'{script_name_base}.log')

# The doubled braces leave {epoch:02d} / {val_loss:.5f} in the path for Keras to fill in;
# save_best_only=True restricts saving to the best-scoring epochs
checkpoint_path = f'{script_name_base}.epoch{{epoch:02d}}-val{{val_loss:.5f}}.hdf5'
model_checkpoint = ModelCheckpoint(checkpoint_path, save_best_only=True)

batch_size = 1200
epochs = 7

# Encoder and decoder inputs are fed together; the targets get a trailing axis of size 1
fit_inputs = [encoder_input_data, decoder_input_data]
fit_targets = np.expand_dims(decoder_target_data, -1)
history = seq2seq_Model.fit(fit_inputs, fit_targets,
                            batch_size=batch_size,
                            epochs=epochs,
                            validation_split=0.12,
                            callbacks=[csv_logger, model_checkpoint])

Train on 176853 samples, validate on 24117 samples
Epoch 1/7


  '. They will not be included '


Epoch 2/7
Epoch 3/7
Epoch 4/7


Epoch 5/7
Epoch 6/7


Epoch 7/7


In [15]:
# Save the trained seq2seq model to an HDF5 file so it can be reloaded without retraining
seq2seq_Model.save('seq2seq_model_tutorial.h5')

  '. They will not be included '


In [17]:
from seq2seq_utils import Seq2Seq_Inference
# Wrap the trained model and both text processors in the inference helper from seq2seq_utils.
seq2seq_inf = Seq2Seq_Inference(encoder_preprocessor=body_pp,
                                 decoder_preprocessor=issue_pp,
                                 seq2seq_model=seq2seq_Model)
# Print original vs. machine-generated issue labels for complaints from the held-out test frame.
# NOTE(review): n=50 is presumably the number of examples shown — confirm in seq2seq_utils.
seq2seq_inf.demo_model_predictions(n=50, issue_df=testdf)




2653189
Issue Body:
 I 've never received anything from this collector regarding this debt or right to verify or dispute it just appeared on my Credit Report out of nowhere i sent a letter to the address on my credit report address no response there is no phone number attached to the collections or the original debtor to verify and it dropped my score and I would never let XXXX which the alleged amount of the debt ruin my credit. 

Original Title:
 Written notification about debt

****** Machine Generated Title (Prediction) ******:
 disclosure verification of debt



2327414
Issue Body:
 My consolidated PLUS loans are now serviced by NAVIENT. I am in the IBR program and make a {$420.00} payment every month. Due to the high interest rate on this loan, there is no possible way I can ever pay any of the principal balance on this loan and no possible way I can ever pay down this debt. In XXXX they capitalized over {$3500.00} in interest. How will I ever get out from under this debt if t


Original Title:
 Incorrect information on credit report

****** Machine Generated Title (Prediction) ******:
 incorrect information on credit report



2345105
Issue Body:
 Good Afternoon, I am a US military service member, and a responsible citizen with a clean and well standing credit record. 
I was contacted by " FIRST FINANCIAL ASSET MANAGEMENT '' ( XXXX ) a Debt Collection agency. They contacted me to collect on a Fraudulent account, that was opened in my name in 2005, they threatened to blemish my credit profile and even attempted to " SETTLE '' the account. they have illegally purchased my personal information and identity from another debt collector or illegitimate source. 

Original Title:
 Cont'd attempts collect debt not owed

****** Machine Generated Title (Prediction) ******:
 cont d attempts collect debt not owed



2510753
Issue Body:
 XXXX   XXXX   XXXX  posted a check for {$1800.00} to  Ally  Bank last week via express mail and it has still not been received by the Ba


Original Title:
 Incorrect information on your report

****** Machine Generated Title (Prediction) ******:
 incorrect information on your report



2669332
Issue Body:
 ON XXXX PHEAA/AES determined they made a mistake and the loans they have are in fact not mine. SEE attached transcript of live chat conversation with AES XXXX, and screenshot of them confirming they were not my original lenders. 
I have told them I paid my loans off years ago. AES/PHEAA confirmed they were not my original loan lenders, and never had my loans. But they refuse to : 1 ) return my payments/money they collected XX/XX/XXXX-XX/XX/XXXX 2 ). NOTIFY Direct Loans/USDEPT OF ED default resolution/immed cr. recovery collection agency that these are not my loans so they can return my {$8000.00} they seized from me. 
3. ) REFUSE TO NOTIFY/UPDATE NEGAATIVE credit reporting on me. 
4 ) REFUSE TO remove my name off these loans. 
5 ) UPDATE NSLD WEBSITE I am a female XXXX XXXX vet, and they have my money, and continue to 


Original Title:
 Improper use of your report

****** Machine Generated Title (Prediction) ******:
 improper use of your report



2783825
Issue Body:
 I am enrolled in the Public Service Loan Forgiveness Program ( PSLF ). I have been enrolled in the program and working for a qualified employer since XX/XX/XXXX. Despite submitting appropriate records including Income Based Repayment forms, annual certification forms for the PSLF program, and being enrolled in the direct debit program, XXXX XXXX XXXX still has not calculated my payments correctly. My loan was transferred to XXXX from XXXX in XX/XX/XXXX. The payments I made with XXXX are not being counted and all the qualifying payments I have made with XXXX XXXX are not being counted correctly. I have spoken with this company 9 times since the spring of XX/XX/XXXX and this situation is still not corrected. My employer has spoken with them twice and given them my correct employment start date, but my qualified payments are still not corr


Original Title:
 Incorrect information on credit report

****** Machine Generated Title (Prediction) ******:
 credit reporting company s investigation



2826437
Issue Body:
 The incident occurred on XX/XX/2018. 

I was contacted by what was believed to be an employer, but resulted in fraudulent activity ; the bank was aware of my concerns during this interaction with the supposed employer. The bank is refusing to repair damages done to my account despite the recorded calls, text messages, emails, and documents. 

Before depositing a check received from this employer I verified with the bank listed on the check that the funds were available and the check was not forged, they assured me the check was real and the funds were available. I also expressed concerns with this bank about their member possibly committing fraud. 
I deposited this check into my bank account and called my bank to express concerns that the check may be bad despite what I was told by the employer 's bank, and waite


Original Title:
 Trouble using your card

****** Machine Generated Title (Prediction) ******:
 fees or interest



1644856
Issue Body:
 I attempted to dispute some information on Equifax report. On my first attempt, I filled out the appropriate fields and verified my information. I was about to submit my request for the first dispute when the system kicked me out and said there was an error. Again, I went back to Equifax and upon correctly validating my information I was told that " The information you provided does not match our records so we are unable to process your request online. '' Equifax then asked me to fax in my dispute with a copy of my driver 's license. I tried once more, even asnwering the identity verification questions again and was given the same response. Now, when I attempt to initiate a dispute with Equifax 's credit reporting, I am no longer directed to the validation questions but am instead immediately asked to fax in my dispute as my information did not match 

 Unable to get your credit report or credit score

****** Machine Generated Title (Prediction) ******:
 communication tactics



2003066
Issue Body:
 Upon checking my credit report, I noticed an account by the name of rise reporting to XXXX concerning me. I called and spoke with a XXXX agent with my concerns and inform them this was n't my account. The agent replied to me that the account was in good standards and as I told him, I do n't care it was n't my account and to remove it. I 've been a victim of Identity theft and I 've filed a police report case number XXXX Officer XXXX with the XXXX, Alabama XXXX Police Department. number XXXX. I 've also sent a Certified Letter with a returned receipt that I have to XXXX XXXX and Rise also the other credit reporting agent concerning this Rise Account along with a police report and Identity Theft notarized affidavit. 

Original Title:
 Cont'd attempts collect debt not owed

****** Machine Generated Title (Prediction) ******:
 incorrect infor