In [None]:
import pandas as pd
import numpy as np 
from nltk.corpus import stopwords # get stopwords to remove
import re # regular expression
from gensim.models import doc2vec # for word embeddings
from gensim.utils import simple_preprocess # to tokenize automatically
from sklearn.model_selection import train_test_split, KFold # for test-train split & cross validation
from sklearn.preprocessing import MultiLabelBinarizer # to convert to a format that can do multi-label classification
import random
import keras # for nn
import tensorflow as tf # for nn & new loss
import keras.backend.tensorflow_backend as tfb # for nn $ new loss

# Ensure reproducibility: seed Python's and NumPy's global RNGs with one value
seed = 561
random.seed(seed)
np.random.seed(seed)

# English stopword list from NLTK
stop = stopwords.words('english')

In [None]:
# Constants
# Size of each doc2vec vector: passed to Doc2Vec as vector_size and reused as
# the input_dim of the first network layer.
vec_dim = 100 # how big the embeddings are

# Multi-label classification

Classify the reasons (violations) behind each docket_num (document).

# Read in data

In [None]:
# Load the two inputs. Both frames are later matched on their docket_num column.
raw_text = pd.read_csv('./data/clean_mea_text.csv') # this holds the raw text (column 'text')
reasons = pd.read_csv('./data/mea_reasons_filtered.csv') # these are our target classifications (column 'reason')

In [None]:
# First rows of the text data, followed by its (rows, columns) shape
print(raw_text.head(), raw_text.shape, sep='\n')

In [None]:
# First rows of the label data, followed by its (rows, columns) shape
print(reasons.head(), reasons.shape, sep='\n')

In [None]:
# There is a bit of data mismatch, so filter both dfs for text that appears in both
bothdocs = set(raw_text.docket_num.values).intersection(reasons.docket_num.values)
# .copy() makes the filtered frames independent of the originals, so the
# column assignment further down does not write into a slice (this is what
# triggers pandas' SettingWithCopyWarning).
raw_text = raw_text[raw_text.docket_num.isin(bothdocs)].copy()
reasons = reasons[reasons.docket_num.isin(bothdocs)].copy()
print(raw_text.shape)
print(reasons.shape)

# Also need to convert datatypes to prevent type mismatch
# NOTE(review): astype(str) turns missing values into the literal string
# 'nan' -- confirm that is acceptable for documents with no text.
raw_text['text'] = raw_text['text'].astype(str)

In [None]:
# Labels considered irrelevant/unhelpful as classification targets
toremove = [
    'Unknown',
    'Other',
    'Did not appear for the Hearing',
    'Other failure to disclose',
    'Unknown failure to disclose',
    'Allowed falsification and misrepresentation of loans',
    'Circumvented the requirements that a branch manager of a licensed mortgage broker have at least three years experience',
    "Did not verify or make a reasonable effort to verify the borrower's information",
    'Employed simultaneously by more than one affiliated mortgage banker or mortgage broker',
    'Engaged in fraud',
    'Failure to disclose charges',
    'Violated NC Securities Act',
    'Withdrew appeal',
]
# Drop every row whose reason is on the removal list
is_removed = reasons.reason.isin(toremove)
reasons = reasons[~is_removed]

In [None]:
# Multi-label setup: one row per docket_num with the set of its reasons,
# then one 0/1 indicator column per distinct reason.
reasonsls = (
    reasons
    .groupby('docket_num')['reason']
    .apply(set)
    .reset_index(name='reason')
)

mlb = MultiLabelBinarizer()
indicator_matrix = mlb.fit_transform(reasonsls.reason.values)
# Columns are named after the fitted classes, in the binarizer's order
classesbin = pd.DataFrame(indicator_matrix, columns=mlb.classes_)

# Side-by-side join; both frames carry the default 0..n-1 index
reasonsls = pd.concat([reasonsls, classesbin], axis=1)

In [None]:
# Put inputs (text) and outputs (labels) in one frame for easier handling.
# No key is given, so pandas joins on the columns the two frames share.
merged = pd.merge(raw_text, reasonsls)

# Preprocessing

In [None]:
def tokenize(df, tokens_only=False):
    """Tokenize the 'text' column of a dataframe with gensim's simple_preprocess.

    Params:
        df - dataframe with column 'text' to be tokenized
        tokens_only - to train the doc2vec model, each document of the
            training corpus needs a tag/number associated with it.
            tokens_only=True means don't associate anything and return the
            token lists as produced by df['text'].apply; otherwise return a
            list of TaggedDocument objects.
    """
    def to_tokens(text):
        # deacc=True strips accents; max_len=20 keeps long words that may
        # still be important
        return simple_preprocess(text, deacc=True, max_len=20)

    tokens = df['text'].apply(to_tokens)
    if tokens_only:
        return tokens
    # For training data, add tags -- the tag is just the document's position
    return [doc2vec.TaggedDocument(words, [tag]) for tag, words in enumerate(tokens)]

corpus = tokenize(merged)

# Vectorize

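We use gensim's Doc2Vec (paragraph vectors; with the default settings this is the distributed-memory variant, the document-level analogue of Continuous Bag of Words, CBOW) to create the document embeddings used in our ML models.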

In [None]:
# Build the Doc2Vec model.
#   min_count=1 because we're not sure if relevant words occur multiple times
#   seed=seed ties the model's RNG to the notebook-wide seed
#   workers=1 because multi-threaded training is not repeatable run to run
#   (thread scheduling changes the order of updates); this trades training
#   speed for the reproducibility this notebook aims for.
wordmodel = doc2vec.Doc2Vec(vector_size=vec_dim, min_count=1, epochs=40,
                            seed=seed, workers=1)
wordmodel.build_vocab(corpus)
# Train
wordmodel.train(corpus, total_examples=wordmodel.corpus_count, epochs=wordmodel.epochs)

## Assess CBOW model

In [None]:
# Sanity check: re-infer a vector for every training document and see where
# the document itself lands among the trained vectors ranked by similarity.
ranks = []         # position of each document in its own similarity ranking
second_ranks = []  # second entry of each similarity ranking
docvecs = []       # inferred vector per document, in corpus order

n_trained = len(wordmodel.docvecs)
for doc_id, tagged_doc in enumerate(corpus):
    inferred_vector = wordmodel.infer_vector(tagged_doc.words)
    sims = wordmodel.docvecs.most_similar([inferred_vector], topn=n_trained)

    ranked_ids = [docid for docid, _ in sims]
    rank = ranked_ids.index(doc_id)

    ranks.append(rank)
    second_ranks.append(sims[1])
    docvecs.append(inferred_vector)

In [None]:
import collections

# Tally how often each rank occurs. A rank of 0 means the document's own tag
# came first in the similarity list for its inferred vector.
counter = collections.Counter(ranks)
print(counter)

## Add doc embeddings

In [None]:
# One row per document, one column per embedding dimension; appended
# column-wise (aligned on the index) to the right of the existing columns.
docvec_frame = pd.DataFrame(docvecs)
merged = pd.concat([merged, docvec_frame], axis=1)

# Train & Test Neural Network model

In [None]:
# Hyperparameters for the neural network and its cross-validation
class_dim = len(mlb.classes_) # number of distinct classes (= width of the output layer)
epochs = 50 # passed to model.fit
batch_size = 10 # passed to model.fit
k = 5 # number of folds for cv
pweights = [15, 10, 5, 4, 3, 2, 1]  # candidate multipliers for positive targets, needs to be tuned
thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6] # thresholds for 1/0, needs to be tuned

# Global read by weighted_binary_crossentropy; the cross-validation loop
# overwrites it with each candidate from pweights.
pos_weight = pweights[0] # starting weight

## New loss

In [None]:
# From https://stackoverflow.com/questions/42158866/neural-network-for-multi-label-classification-with-large-number-of-classes-outpu/47313183#47313183
def weighted_binary_crossentropy(target, output):
    """
    Binary crossentropy between a target tensor and an output tensor in
    which positive targets are up-weighted.

    The multiplier is not a parameter: it is taken from the module-level
    variable `pos_weight` when this function is called.

    `output` is treated as probabilities: it is clipped away from 0 and 1,
    mapped back to logits, and handed to
    tf.nn.weighted_cross_entropy_with_logits. The result is averaged over
    the last axis.

    Combination of the following functions:
    * keras.losses.binary_crossentropy
    * keras.backend.tensorflow_backend.binary_crossentropy
    * tf.nn.weighted_cross_entropy_with_logits
    """
    # Clip so the log-odds below never sees exactly 0 or 1
    eps = tfb._to_tensor(tfb.epsilon(), output.dtype.base_dtype)
    probs = tf.clip_by_value(output, eps, 1 - eps)
    # Inverse sigmoid: probabilities -> logits
    logits = tf.log(probs / (1 - probs))
    # Weighted loss per output unit, then mean over the last axis
    per_unit_loss = tf.nn.weighted_cross_entropy_with_logits(
        targets=target,
        logits=logits,
        pos_weight=pos_weight,
    )
    return tf.reduce_mean(per_unit_loss, axis=-1)

## Train & test

In [None]:
# Split into x and y by column position
# NOTE(review): the offset 7 presumably counts the non-label columns that
# precede the label indicators in `merged` -- confirm if the input schema changes.
label_start = 7
label_stop = label_start + class_dim
y = merged.iloc[:, label_start:label_stop]  # the class_dim label columns
x = merged.iloc[:, label_stop:]             # everything after the labels

In [None]:
# Cross validation
# random_state pins the shuffle, so every kf.split(x) call below yields the
# same folds and each pos_weight is scored on identical train/test splits.
kf = KFold(n_splits=k, shuffle=True, random_state=seed)

cv_scores = [] # k fold accuracies per pos_weight, in pweights order; note to self: change into np array
for weight in pweights:
    # weighted_binary_crossentropy reads the global pos_weight, so update it
    # before the model for this weight is built and compiled.
    pos_weight = weight
    for train_index, test_index in kf.split(x):
        X_train, X_test = x.iloc[train_index], x.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Build model
        model = keras.models.Sequential() 
        model.add(keras.layers.Dense(128, input_dim=vec_dim, activation='relu'))
        model.add(keras.layers.Dense(64, activation='relu'))
        model.add(keras.layers.Dense(class_dim, activation='sigmoid')) # sigmoid so that the probability of one class is independent from the probability of another
        # 'binary_accuracy' is named explicitly: with a custom loss and more
        # than one output unit, Keras resolves the generic 'accuracy' to
        # categorical (argmax) accuracy, which is not meaningful for
        # multi-label targets.
        model.compile(loss=weighted_binary_crossentropy, 
                  optimizer='adam', 
                  metrics=['binary_accuracy'])
        # Fit model
        model.fit(X_train, 
                     y_train,
                     epochs=epochs,
                     batch_size=batch_size)
        
        # Evaluate: returns the loss followed by the compiled metric
        loss, acc = model.evaluate(X_test, y_test)
#         # Predict
#         y_pred = model.predict(X_test)
#         y_pred[y_pred>=0.5] = 1
#         y_pred[y_pred<0.5] = 0

        cv_scores.append(acc)
print(cv_scores)

In [None]:
# Mean CV accuracy per pos_weight. cv_scores holds k consecutive fold scores
# for each entry of pweights, so the i-th weight owns the slice [i*k, (i+1)*k).
# The stride must be k (not a hard-coded 5) to stay correct if k changes.
temp = np.array(cv_scores)
for i in range(len(pweights)):
    print('For pos_weight={:d}, accuracy={:.4f}'.format(pweights[i], temp[i*k:(i+1)*k].mean()))