In [1]:
import pandas as pd
import numpy as np 
from nltk.corpus import stopwords # get stopwords to remove
import re # regular expression
from gensim.models import doc2vec, Word2Vec # for word embeddings
from gensim.utils import simple_preprocess # to tokenize automatically
from sklearn.model_selection import train_test_split, KFold # for test-train split & cross validation
from sklearn.preprocessing import MultiLabelBinarizer # to convert to a format that can do multi-label classification
import random
import keras # for nn
import tensorflow as tf # for nn & new loss
import keras.backend.tensorflow_backend as tfb # for nn $ new loss
from sklearn.metrics import precision_score, accuracy_score, roc_auc_score, f1_score # for metrics
import collections

stop = stopwords.words('english')
# Ensure reproducibility
seed = 561
np.random.seed(seed)
random.seed(seed)
tf.set_random_seed(seed)

unable to import 'smart_open.gcs', disabling that module
Using TensorFlow backend.


In [2]:
# Constants
vec_dim = 100 # how big the word embeddings are

# Multi-label classification

Classify the reasons (violations) behind each docket_num (document).

# Read in data

In [3]:
raw_text = pd.read_csv('./data/clean_mea_text.csv') # this holds the raw text
reasons = pd.read_csv('./data/mea_reasons_filtered.csv') # these are our target classifications

In [4]:
print(raw_text.head())
print(raw_text.shape)

         date docket_num                                               text  \
0  2009-11-18     09_160  STATE OF NORTH CAROLINA\nWAKE COUNTY\nIN A MAT...   
1  2009-11-18     09_164  STATE OF NORTH CAROLINA\nWAKE COUNTY\nIN A MAT...   
2  2009-10-16    09_142B  OAH File No. 10 COB 2895\nSTATE OF NORTH CAROL...   
3  2009-09-09     09_081  STATE OF NORTH CAROLINA\nWAKE COUNTY\nIN A MAT...   
4  2009-08-24     09_070  STATE OF NORTH CAROLINA\nWAKE COUNTY\nIN A MAT...   

   year  month  day  
0  2009     11   18  
1  2009     11   18  
2  2009     10   16  
3  2009      9    9  
4  2009      8   24  
(177, 6)


In [5]:
print(reasons.head())
print(reasons.shape)

  docket_num        date                                        reason
0     09_160  11/18/2009                    Conspiracy to commit fraud
1     09_164  11/18/2009                    Conspiracy to commit fraud
2    09_142B  10/16/2009                     Allowed unlawful activity
3     09_081    9/9/2009  Falsification and misrepresentation of loans
4     09_070   8/24/2009                       Retained borrower funds
(375, 3)


In [6]:
# There is a bit of data mismatch, so filter both dfs for text that appears in both
bothdocs = set(raw_text.docket_num.values).intersection(reasons.docket_num.values)
raw_text = raw_text[raw_text.docket_num.isin(bothdocs)]
reasons = reasons[reasons.docket_num.isin(bothdocs)]
print(raw_text.shape)
print(reasons.shape)

# Also need to convert datatypes to prevent type mismatch
raw_text['text'] = raw_text['text'].astype(str)

(169, 6)
(359, 3)


In [7]:
# Remove irrelevant/unhelpful labels
toremove = ['Unknown', 
           'Other', 
           'Did not appear for the Hearing', 
           'Other failure to disclose', 
           'Unknown failure to disclose', 
           'Allowed falsification and misrepresentation of loans',
           'Circumvented the requirements that a branch manager of a licensed mortgage broker have at least three years experience',
           "Did not verify or make a reasonable effort to verify the borrower's information",
           'Employed simultaneously by more than one affiliated mortgage banker or mortgage broker',
           'Engaged in fraud',
           'Failure to disclose charges',
           'Violated NC Securities Act',
           'Withdrew appeal',
           'Unsatisfactory credit']
reasons = reasons[~reasons.reason.isin(toremove)]

In [8]:
# Since we want to do multi-label classification, binarize outputs
# First, need to aggregate reason by docket_num
reasonsls = reasons.groupby('docket_num')['reason'].apply(set).reset_index(name='reason')

mlb = MultiLabelBinarizer()
classesbin = mlb.fit_transform(reasonsls.reason.values)
classesbin = pd.DataFrame(classesbin)
classesbin.columns = mlb.classes_

reasonsls = pd.concat([reasonsls, classesbin], axis=1)

In [9]:
# Let's combine the input and output datasets for easier handling
merged = raw_text.merge(reasonsls)

In [10]:
print(merged.shape)

(153, 25)


# Preprocessing

In [11]:
# Params:
#    df - dataframe with column 'text' to be tokenized
#    tokens_only - to train the doc2vec model, we’ll need to 
#        associate a tag/number with each document of the training corpus. 
#        tokens_only=True means don't associate anything
def tokenize(df, tokens_only=False):
    tokens = df['text'].apply(lambda x: simple_preprocess(x, deacc=True, max_len=20)) # max_len=20 just in case there are important words 15 chars long)
    if tokens_only:
        return tokens
    else:
        # For training data, add tags -- notice it is just an index number
        return [doc2vec.TaggedDocument(doc, [i]) for i, doc in enumerate(tokens)]

# Vectorize

We use the Continuous Bag of Words (CBOW) model to create our word embeddings to be used in our ML models.

In [12]:
# Remove extra space
merged['text'] = merged['text'].apply(lambda x: re.sub(r'\s+', ' ', x))
# Tokenize
corpus = tokenize(merged)

docmodel = doc2vec.Doc2Vec(vector_size=vec_dim,min_count=1, epochs=40) # min_count=1 because we're not sure if relevant words occur multiple times
docmodel.build_vocab(corpus)
# Train
docmodel.train(corpus, total_examples=docmodel.corpus_count, epochs=docmodel.epochs)

In [13]:
# Let's also use word2vec and averaging to compare to doc2vec
# Perhaps in this setting, the context of words isn't important
textls = tokenize(merged, tokens_only=True)
wordmodel = Word2Vec(textls, min_count=1)
w2v = dict(zip(wordmodel.wv.index2word, wordmodel.wv.syn0))

  """


In [14]:
# Params:
#    w2v - dictionary of words to vectors
#    text - list of tokenized words to convert to vector
# Returns a vector resulting from the average of all present words in text
def average_vectors(w2v, text):
    num_count = collections.Counter(text) # words and their counts
    return np.mean([w2v[word]*count for word, count in num_count.items()], axis=0)

In [15]:
docvecs_avg = [average_vectors(w2v, corpus[i].words) for i in range(len(corpus))]

## Assess Doc2Vec CBOW model

In [16]:
ranks = []
second_ranks = []
docvecs = []
for doc_id in range(len(corpus)):
    inferred_vector = docmodel.infer_vector(corpus[doc_id].words)
    sims = docmodel.docvecs.most_similar([inferred_vector], topn=len(docmodel.docvecs))
    rank = [docid for docid, sim in sims].index(doc_id)
    ranks.append(rank)

    second_ranks.append(sims[1])
    
    docvecs.append(inferred_vector)

In [17]:
counter = collections.Counter(ranks)
print(counter)

Counter({0: 147, 1: 6})


## Add doc embeddings

In [18]:
merged = pd.concat([merged, pd.DataFrame(docvecs)], axis=1)
merged = pd.concat([merged, pd.DataFrame(docvecs_avg)], axis=1)

# Train & Test Neural Network model

In [19]:
class_dim = len(mlb.classes_) # number of distinct classes
epochs = 25
batch_size = 10
k_folds = 5 # number of folds for cv
pweights = [15, 10, 5, 4, 3, 2, 1]  # multiplier for positive targets, needs to be tuned
num_metrics = 5 # number of metrics -- manually set

# inputs to loss
pos_weight = pweights[0] # starting weight

## New loss

In [20]:
# From https://stackoverflow.com/questions/42158866/neural-network-for-multi-label-classification-with-large-number-of-classes-outpu/47313183#47313183
def weighted_binary_crossentropy(target, output):
    """
    Weighted binary crossentropy between an output tensor 
    and a target tensor. POS_WEIGHT is used as a multiplier 
    for the positive targets.

    Combination of the following functions:
    * keras.losses.binary_crossentropy
    * keras.backend.tensorflow_backend.binary_crossentropy
    * tf.nn.weighted_cross_entropy_with_logits
    """
    # transform back to logits
    _epsilon = tfb._to_tensor(tfb.epsilon(), output.dtype.base_dtype)
    output = tf.clip_by_value(output, _epsilon, 1 - _epsilon)
    output = tf.log(output / (1 - output))
    # compute weighted loss
    loss = tf.nn.weighted_cross_entropy_with_logits(targets=target,
                                                    logits=output,
                                                    pos_weight=pos_weight)
    return tf.reduce_mean(loss, axis=-1)

## New metric

In [21]:
# Function that returns the hamming score
# correctly predicted / number of labels
# Effectively acts as an accuracy metric in multilabel classification
def hamming_score(y_true, y_pred):
    return (y_pred == y_true).mean()

In [22]:
# Function that given predicted probabilities 
# and the true labels,
# calculates a bunch of metrics and returns them
def calc_metrics(y_true, y_prob):
    y_pred = np.copy(y_prob) # classes
    y_pred[y_pred>=0.5] = 1
    y_pred[y_pred<0.5] = 0

    # Metrics
    # average='micro' because we care a little more about global statistics
    ham_score = hamming_score(y_true, y_pred) # accuracy
    emr = accuracy_score(y_true, y_pred) # exact match ratio
    f1 = f1_score(y_true, y_pred, average='micro') # f1 -- care about false positives and false negatives
    prec = precision_score(y_true, y_pred, average='micro') # tp / (tp + fp) # care about false positives slightly more let's look at precision instead of both
    auc = roc_auc_score(y_true, y_prob, average='micro') # for ease, pretend the threshold should be the same for all classes
    metrics = [ham_score, emr, f1, prec, auc]
    
    return metrics

## Train & test

In [23]:
# Function that runs k_folds cross validation
# Note that y, kf, pos_weight, whatever parameters is set outside of this function
# Returns the mean metrics
def run_cv(x, verbose=True):
    cv_scores = np.empty((k_folds, num_metrics))
    j = 0
    for train_index, test_index in kf.split(x):
        if verbose:
            print('{:d}th CV run'.format(j))
        X_train, X_test = x.iloc[train_index], x.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        # Build model
        model = keras.models.Sequential() 
        model.add(keras.layers.Dense(128, input_dim=vec_dim, activation='relu'))
        model.add(keras.layers.Dense(64, activation='relu'))
        model.add(keras.layers.Dense(class_dim, activation='sigmoid')) # sigmoid so that the probability of one class is independent from the probability of another
        model.compile(loss=weighted_binary_crossentropy, 
                  optimizer='adam')
        # Fit model
        model.fit(X_train, 
                     y_train,
                     epochs=epochs,
                     batch_size=batch_size,
                     verbose=0)
        # Evaluate
        y_prob = model.predict(X_test) # probabilities
        metrics = calc_metrics(y_test.values, y_prob)
        
        if verbose:
            print('Accuracy: {:.5f}, EMR: {:.5f}, F1: {:.5f}, Precision: {:.5f}, AUC: {:.5f}'.format(*metrics))
            
        for k in range(num_metrics):
            cv_scores[j, k] = metrics[k]
            
        j = j + 1
        
        # To prevent slower performance
        tfb.clear_session()
    return cv_scores.mean(axis=0) # return mean

In [24]:
# FIRST: nn model with doc2vec
# Split into x and y
x_doc = merged.iloc[:, 7+class_dim:7+class_dim+vec_dim]
y = merged.iloc[:, 7:7+class_dim]

# Cross validation
kf = KFold(n_splits=k_folds, shuffle=True)

cv_scores_doc = np.empty((len(pweights), num_metrics))
for i in range(len(pweights)):
    pos_weight = pweights[i]
    cv_scores_doc[i] = run_cv(x_doc)

0th CV run
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
Accuracy: 0.89606, EMR: 0.19355, F1: 0.53968, Precision: 0.54839, AUC: 0.81734
1th CV run
Accuracy: 0.89247, EMR: 0.09677, F1: 0.57143, Precision: 0.52632, AUC: 0.86352
2th CV run
Accuracy: 0.88889, EMR: 0.16129, F1: 0.55072, Precision: 0.46914, AUC: 0.84133
3th CV run
Accuracy: 0.87593, EMR: 0.13333, F1: 0.51095, Precision: 0.44872, AUC: 0.86042
4th CV run
Accuracy: 0.90556, EMR: 0.20000, F1: 0.52336, Precision: 0.48276, AUC: 0.86109
0th CV run
Accuracy: 0.90323, EMR: 0.25806, F1: 0.53448, Precision: 0.52542, AUC: 0.83927
1th CV run
Accuracy: 0.88889, EMR: 0.16129, F1: 0.49180, Precision: 0.51724, AUC: 0.80870
2th CV run
Accuracy: 0.90143, EMR: 0.19355, F1: 0.53782, Precision: 0.51613, AUC: 0.86847
3th CV run
Accuracy: 0.88889, EMR: 0.06667, F1: 0.49153, Precision: 0.43939, AUC: 0.84769
4th CV run
Accuracy: 0.90185, EMR: 0.26667, F1: 0.55462, Precision: 0.

In [25]:
# SECOND: nn model with averaged word vecs
# We do this separately because let's just pick a positive weight and stick to it
x_word = merged.iloc[:, 7+class_dim+vec_dim:7+class_dim+vec_dim*2]

pos_weight = 2
# Now, let's test the model with average vectors
cv_scores_word = run_cv(x_word)

0th CV run
Accuracy: 0.89247, EMR: 0.06452, F1: 0.34783, Precision: 0.45714, AUC: 0.76230
1th CV run
Accuracy: 0.89247, EMR: 0.22581, F1: 0.38776, Precision: 0.44186, AUC: 0.76501
2th CV run
Accuracy: 0.89964, EMR: 0.19355, F1: 0.44000, Precision: 0.53659, AUC: 0.79216
3th CV run
Accuracy: 0.88148, EMR: 0.13333, F1: 0.34694, Precision: 0.42500, AUC: 0.76445
4th CV run
Accuracy: 0.88519, EMR: 0.10000, F1: 0.35417, Precision: 0.53125, AUC: 0.78191


# Results

In [26]:
print('NN Model with Doc2Vec Results:')
print(cv_scores_doc) # each index corresponds to a pos_weight
print()
print('NN Model with Averaged Word2Vec Results:')
print(cv_scores_word)

NN Model with Doc2Vec Results:
[[0.89178017 0.15698925 0.53922983 0.49506305 0.84874182]
 [0.89685783 0.18924731 0.52204969 0.51749476 0.83278892]
 [0.89990442 0.19032258 0.51915022 0.53150695 0.85217396]
 [0.91062127 0.18946237 0.54812635 0.59821351 0.85651671]
 [0.91267622 0.20989247 0.54728108 0.60914704 0.83299265]
 [0.91434886 0.2283871  0.54805841 0.62833712 0.85408454]
 [0.9130227  0.20946237 0.49545712 0.65719687 0.85114295]]

NN Model with Averaged Word2Vec Results:
[0.8902509  0.14344086 0.37533733 0.47836774 0.77316566]
