In [1]:
import fastai
from fastai import *
from fastai.text import * 
import pandas as pd
import numpy as np
from functools import partial
import io
import os
import spacy

In [2]:
# read in data

# https://www.consumerfinance.gov/data-research/hmda/

base_path = '/home/datawrestler/data/financial'
fname = 'financial.csv'
full_path = os.path.join(base_path, fname)

df = pd.read_csv(full_path, low_memory=False)
# shuffle the inputs
df = df.sample(n=df.shape[0])

In [3]:
consumer_complaints = df.loc[df['Consumer complaint narrative'].notnull()]

In [4]:
# get a sense for the totla number of possible complaint issues
consumer_complaints.groupby('Product')['Complaint ID'].nunique()

Product
Bank account or service                                                         14885
Checking or savings account                                                     11609
Consumer Loan                                                                    9474
Credit card                                                                     18838
Credit card or prepaid card                                                     19356
Credit reporting                                                                31588
Credit reporting, credit repair services, or other personal consumer reports    84426
Debt collection                                                                 82260
Money transfer, virtual currency, or money service                               5004
Money transfers                                                                  1497
Mortgage                                                                        51282
Other financial service                       

In [5]:
# lets focus on bank account, credit card, student loan, and money transfers

cats = ['Bank account or service', 'Credit card', 'Student loan' ,
       'Money transfers']

keep_cols = ['Product', 'Complaint ID', 'Consumer complaint narrative']

traindata = consumer_complaints.loc[consumer_complaints['Product'].isin(cats), keep_cols]

In [None]:
from sklearn.model_selection import train_test_split

# keep very small portion for LM validation
traindata, valdata = train_test_split(traindata, test_size=0.1)

In [None]:
trn_text = traindata['Consumer complaint narrative'].tolist()
trn_labels = traindata['Product'].tolist()

val_texts = valdata['Consumer complaint narrative'].tolist()
val_labels = valdata['Product'].tolist()

In [None]:
# read in spacy

# load up spacy
nlp = spacy.load('en_core_web_lg', disable=['ner', 'pos'])

In [None]:
def tokenizer(text):
    return [token.text for token in nlp.tokenizer(text)]

trn_tokens = [tokenizer(text) for text in trn_text]
val_tokens = [tokenizer(text) for text in val_texts]

In [None]:
from collections import Counter

# build vocab
vocab = Counter(p for o in trn_tokens for p in o)

In [None]:
# set vocab freq limits
max_vocab = 60000
min_freq = 2

In [None]:
itos = [o for o,c in vocab.most_common(max_vocab) if c>min_freq]
itos.insert(0, '_pad_')
itos.insert(0, '_unk_')

In [None]:
# create reverse string to index
import collections

stoi = collections.defaultdict(lambda:0, {v:k for k,v in enumerate(itos)})
len(itos)

In [None]:
# convert tokens to ids
trn_lm = np.array([[stoi[o] for o in p] for p in trn_tokens])
val_lm = np.array([[stoi[o] for o in p] for p in val_tokens])

In [6]:
from sklearn.model_selection import train_test_split

# split data into training and validation set
df_trn, df_val = train_test_split(traindata, test_size = 0.4, random_state = 12)

In [7]:
df_trn.head(1)

Unnamed: 0,Product,Complaint ID,Consumer complaint narrative
795203,Bank account or service,1435578,"On XX/XX/2015, I made a mobile deposit of {$50..."


In [8]:
# Language model data
data_lm = TextLMDataBunch.from_df(train_df=df_trn, valid_df=df_val, path = "")

# Classifier model data
data_class = TextClasDataBunch.from_df(path="", train_df=df_trn, valid_df=df_val, 
                                       vocab=data_lm.train_ds.vocab, bs=32, 
                                      label_cols='Product', 
                                      text_cols='Consumer complaint narrative')

In [9]:
help(TextClasDataBunch)

Help on class TextClasDataBunch in module fastai.text.data:

class TextClasDataBunch(TextDataBunch)
 |  Create a `TextDataBunch` suitable for training an RNN classifier.
 |  
 |  Method resolution order:
 |      TextClasDataBunch
 |      TextDataBunch
 |      fastai.basic_data.DataBunch
 |      builtins.object
 |  
 |  Class methods defined here:
 |  
 |  create(train_ds, valid_ds, test_ds=None, path:Union[pathlib.Path, str]='.', bs=64, pad_idx=1, pad_first=True, no_check:bool=False, **kwargs) -> fastai.basic_data.DataBunch from builtins.type
 |      Function that transform the `datasets` in a `DataBunch` for classification.
 |  
 |  ----------------------------------------------------------------------
 |  Methods inherited from TextDataBunch:
 |  
 |  save(self, cache_name:Union[pathlib.Path, str]='tmp')
 |      Save the `DataBunch` in `self.path/cache_name` folder.
 |  
 |  ----------------------------------------------------------------------
 |  Class methods inherited from TextDa

In [10]:
import torch
# specify device type
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# we want the model to move to our GPU
device

device(type='cuda')

In [11]:
# fine tune pre trained model
learn = language_model_learner(data_lm, pretrained_model=URLs.WT103, drop_mult=0.7)

In [12]:
learn.model = learn.model.to(device)

In [13]:
# train the learner object with learning rate = 1e-2
learn.fit_one_cycle(10, 1e-2)

epoch,train_loss,valid_loss,accuracy
1,4.362731,2.892414,0.400162
2,3.886526,1.547345,0.404343
3,3.114805,1.655118,0.512033
4,2.369766,0.625688,0.540158
5,1.775460,0.166468,0.991254
6,1.351418,0.075207,0.993162
7,1.038464,0.054112,0.993506
8,0.824384,0.045929,0.994237
9,0.706353,0.046701,0.994115
10,0.583548,0.045468,0.994257


In [14]:
# https://software.intel.com/en-us/articles/transfer-learning-in-natural-language-processing
# save our encoder
learn.save_encoder('ft_enc')

In [15]:
help(text_classifier_learner)

Help on function text_classifier_learner in module fastai.text.learner:

text_classifier_learner(data:fastai.basic_data.DataBunch, bptt:int=70, emb_sz:int=400, nh:int=1150, nl:int=3, pad_token:int=1, drop_mult:float=1.0, qrnn:bool=False, max_len:int=1400, lin_ftrs:Collection[int]=None, ps:Collection[float]=None, pretrained_model:str=None, **kwargs) -> 'TextClassifierLearner'
    Create a RNN classifier from `data`.



In [16]:
# train classifier
learn_class = text_classifier_learner(data_class, drop_mult=0.7,
                                     bptt=70)
learn_class.load_encoder('ft_enc')

In [None]:
# move to GPU
learn_class.model = learn_class.model.to(device)

In [None]:
learn_class.fit_one_cycle(10, 1e-2)

epoch,train_loss,valid_loss,accuracy
1,1.198741,1.250138,0.352343
2,1.191631,6.856156,0.334692
3,1.175679,44.809757,0.339127
4,1.193155,1.498763,0.349028
5,1.186574,5.420327,0.273363
6,1.180240,1.195099,0.372234
7,1.180224,1.193797,0.368336
8,1.178905,1.182137,0.368426


In [None]:
# get predictions
preds, targets = learn_class.get_preds()

predictions = np.argmax(preds, axis = 1)
pd.crosstab(predictions, targets)

In [None]:
preds