In [1]:
import pandas as pd
import numpy as np 
from sklearn.model_selection import train_test_split, KFold # for test-train split & cross validation
import random
from keras.models import Sequential # for nn
from keras.layers import Dense # for nn
import tensorflow as tf # for nn & new loss
import keras.backend.tensorflow_backend as tfb # for nn $ new loss
import preprocessing

# Seed every random number generator this notebook touches (NumPy, Python's
# `random` module, and TensorFlow) so that runs are as repeatable as possible.
# The neural network will still show some run-to-run variation; see
# https://stackoverflow.com/questions/32419510/how-to-get-reproducible-results-in-keras/52897216#52897216
# Full reproducibility was deliberately not pursued because of concerns about training time.
seed = 561
np.random.seed(seed)
random.seed(seed)
tf.set_random_seed(seed)

Using TensorFlow backend.
unable to import 'smart_open.gcs', disabling that module


# Multi-label classification

Classify the reasons (violations) behind each docket_num (document).

# Read in data

In [2]:
# Load the dataset through the `preprocessing` module's loader; the call also
# prints how many unique classes it found.
merged = preprocessing.load_mea()

18 unique classes found


In [3]:
# Preview the first rows of the freshly loaded frame.
merged.head()

Unnamed: 0,date,docket_num,text,year,month,day,reason,Allowed unlawful activity,Concealed material facts that were likely to influence a mortgagor to take a mortgage loan,Conspiracy to commit fraud,...,Failure to file a correcting amendment,Falsification and misrepresentation of loans,Has outstanding tax liens,Identity issues,Impermissible net-branching,Lacks a surety bond,Mistreatment of employees,Retained borrower funds,Uncooperative with OCOB,Unlicensed activity
0,2009-11-18,09_160,STATE OF NORTH CAROLINA WAKE COUNTY IN A MATTE...,2009,11,18,{Conspiracy to commit fraud},0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,2009-11-18,09_164,STATE OF NORTH CAROLINA WAKE COUNTY IN A MATTE...,2009,11,18,{Conspiracy to commit fraud},0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,2009-10-16,09_142B,OAH File No. 10 COB 2895 STATE OF NORTH CAROLI...,2009,10,16,{Allowed unlawful activity},1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2009-09-09,09_081,STATE OF NORTH CAROLINA WAKE COUNTY IN A MATTE...,2009,9,9,{Falsification and misrepresentation of loans},0,0,0,...,0,1,0,0,0,0,0,0,0,0
4,2009-08-24,09_070,STATE OF NORTH CAROLINA WAKE COUNTY IN A MATTE...,2009,8,24,"{Retained borrower funds, Failure to file a co...",0,0,0,...,1,0,0,0,0,0,0,1,0,0


# Preprocessing

## Vectorize

We use the Continuous Bag of Words (CBOW) model to create the embeddings used by our ML models. Two representations are built for each document: a document vector and an averaged word vector.

In [4]:
# Constants
vec_dim = 100 # dimensionality of each embedding vector

# Vectorize the 'text' column twice with preprocessing.create_tokens:
# once in 'doc' mode and once in 'word' mode. Later cells refer to these as
# the Doc2Vec vectors and the averaged Word2Vec vectors respectively.
docvecs = preprocessing.create_tokens(merged, 'text', vec_dim, 'doc')
docvecs_avg = preprocessing.create_tokens(merged, 'text', vec_dim, 'word')

## Add doc and averaged word embeddings

In [5]:
# Append the embedding columns to the right of the existing frame: first the
# document vectors, then the averaged word vectors. Both sets of new columns
# get the same integer labels, so later cells select them by position.
# NOTE: this cell is not idempotent -- re-running it appends the columns again.
for embedding in (docvecs, docvecs_avg):
    merged = pd.concat([merged, pd.DataFrame(embedding)], axis=1)

In [6]:
# Preview the frame again now that the embedding columns have been appended.
merged.head()

Unnamed: 0,date,docket_num,text,year,month,day,reason,Allowed unlawful activity,Concealed material facts that were likely to influence a mortgagor to take a mortgage loan,Conspiracy to commit fraud,...,90,91,92,93,94,95,96,97,98,99
0,2009-11-18,09_160,STATE OF NORTH CAROLINA WAKE COUNTY IN A MATTE...,2009,11,18,{Conspiracy to commit fraud},0,0,1,...,-0.468148,1.170645,-1.334777,-0.61804,-0.008057,0.00574,-0.125155,0.200595,0.82962,-1.181881
1,2009-11-18,09_164,STATE OF NORTH CAROLINA WAKE COUNTY IN A MATTE...,2009,11,18,{Conspiracy to commit fraud},0,0,1,...,-0.323016,1.187945,-1.545925,-0.73954,-0.154218,0.175886,-0.212002,0.149133,0.798784,-1.074611
2,2009-10-16,09_142B,OAH File No. 10 COB 2895 STATE OF NORTH CAROLI...,2009,10,16,{Allowed unlawful activity},1,0,0,...,-0.230604,1.270992,-1.49249,-0.330789,0.137562,-0.281108,0.041319,-0.31254,1.296625,-1.273013
3,2009-09-09,09_081,STATE OF NORTH CAROLINA WAKE COUNTY IN A MATTE...,2009,9,9,{Falsification and misrepresentation of loans},0,0,0,...,-0.237671,0.980513,-1.329747,-0.090856,0.007664,-0.065834,0.009553,-0.366076,1.170629,-1.149983
4,2009-08-24,09_070,STATE OF NORTH CAROLINA WAKE COUNTY IN A MATTE...,2009,8,24,"{Retained borrower funds, Failure to file a co...",0,0,0,...,-0.019425,1.129432,-1.73433,-0.629524,0.079354,-0.057556,-0.009283,-0.324705,0.991261,-1.375346


# Train & Test Neural Network model

In [7]:
# Hyper-parameters and sizes shared by the cells below.
class_dim = 18 # number of distinct classes (width of the label block and of the output layer)
epochs = 25 # passed to model.fit for every fold
batch_size = 10 # passed to model.fit for every fold
k_folds = 5 # number of folds for cv
pweights = [15, 10, 5, 4, 3, 2, 1]  # candidate multipliers for positive targets, needs to be tuned
num_metrics = 6 # number of metrics -- manually set; must match what preprocessing.calc_metrics returns

# inputs to loss
# pos_weight is read from notebook scope by weighted_binary_crossentropy and
# is reassigned before each cross-validation sweep.
pos_weight = pweights[0] # starting weight

## New loss

In [8]:
# From https://stackoverflow.com/questions/42158866/neural-network-for-multi-label-classification-with-large-number-of-classes-outpu/47313183#47313183
def weighted_binary_crossentropy(target, output):
    """
    Binary crossentropy in which positive targets are up-weighted.

    `output` holds probabilities and `target` the matching labels. The
    notebook-level POS_WEIGHT is read when this function is called and is
    used as the multiplier for the positive targets.

    Built from the same pieces as:
    * keras.losses.binary_crossentropy
    * keras.backend.tensorflow_backend.binary_crossentropy
    * tf.nn.weighted_cross_entropy_with_logits

    Returns the loss averaged over the last axis.
    """
    # Keep probabilities strictly inside (0, 1) so the logit below stays finite.
    eps = tfb._to_tensor(tfb.epsilon(), output.dtype.base_dtype)
    clipped = tf.clip_by_value(output, eps, 1 - eps)
    # Transform probabilities back to logits.
    logits = tf.log(clipped / (1 - clipped))
    # Weighted loss per element, then mean over the last axis.
    weighted = tf.nn.weighted_cross_entropy_with_logits(
        targets=target,
        logits=logits,
        pos_weight=pos_weight,
    )
    return tf.reduce_mean(weighted, axis=-1)

## Train & test

In [9]:
# Function that runs k_folds cross validation
def run_cv(x, verbose=True):
    """
    Cross-validate the feed-forward network on the feature frame `x`.

    Several inputs are read from notebook scope rather than passed in:
    `y` (label frame, indexed with the same row positions as `x`), `kf`
    (the fold splitter), `pos_weight` (consumed by the loss), and the
    constants `class_dim`, `epochs`, `batch_size`, `k_folds`, `num_metrics`.

    Parameters
    ----------
    x : pandas.DataFrame
        One row per document, one column per feature. The input layer is
        sized from `x.shape[1]`, so any feature width works.
    verbose : bool, default True
        When true, print the fold number and that fold's metrics.

    Returns
    -------
    numpy.ndarray
        Length-`num_metrics` vector holding the mean of each metric over the
        folds, in the order printed below: Accuracy, EMR, F1, Precision,
        Recall, AUC.
    """
    cv_scores = np.empty((k_folds, num_metrics))
    # Size the input layer from the data actually passed in instead of
    # relying on a global that may not match `x`.
    n_features = x.shape[1]

    for fold, (train_index, test_index) in enumerate(kf.split(x)):
        if verbose:
            print('{:d}th CV run'.format(fold))
        X_train, X_test = x.iloc[train_index], x.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Build a fresh model for every fold so no weights leak across folds
        model = Sequential()
        model.add(Dense(128, input_dim=n_features, activation='relu'))
        model.add(Dense(64, activation='relu'))
        # sigmoid so that the probability of one class is independent from
        # the probability of another
        model.add(Dense(class_dim, activation='sigmoid'))
        model.compile(loss=weighted_binary_crossentropy,
                      optimizer='adam')
        # Fit model
        model.fit(X_train,
                  y_train,
                  epochs=epochs,
                  batch_size=batch_size,
                  verbose=0)
        # Evaluate on the held-out fold
        y_prob = model.predict(X_test) # probabilities
        metrics = preprocessing.calc_metrics(y_test.values, y_prob)

        if verbose:
            print('Accuracy: {:.5f}, EMR: {:.5f}, F1: {:.5f}, Precision: {:.5f}, Recall: {:.5f}, AUC: {:.5f}'.format(*metrics))

        # Store this fold's metrics as one row
        cv_scores[fold] = metrics[:num_metrics]

        # To prevent slower performance
        tfb.clear_session()
    return cv_scores.mean(axis=0) # return mean

In [10]:
# List the column labels to confirm the positional offsets used when slicing
# the label and embedding columns below.
merged.columns.values # making sure we index correctly

array(['date', 'docket_num', 'text', 'year', 'month', 'day', 'reason',
       'Allowed unlawful activity',
       'Concealed material facts that were likely to influence a mortgagor to take a mortgage loan',
       'Conspiracy to commit fraud', 'Convicted of misdemeanor or felony',
       'Does not meet the financial responsibility requirements',
       'Excess fees', 'Failure to disclose civil or financial issue',
       'Failure to disclose criminal conviction',
       'Failure to file a correcting amendment',
       'Falsification and misrepresentation of loans',
       'Has outstanding tax liens', 'Identity issues',
       'Impermissible net-branching', 'Lacks a surety bond',
       'Mistreatment of employees', 'Retained borrower funds',
       'Uncooperative with OCOB', 'Unlicensed activity', 0, 1, 2, 3, 4, 5,
       6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
       24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
       41, 42, 43, 44, 

In [11]:
# FIRST: nn model with doc2vec
# Split into x and y. Column layout of `merged`: 7 metadata columns
# ('date' ... 'reason'), then class_dim label columns, then the embeddings.
label_start = 7
doc_start = label_start + class_dim
x_doc = merged.iloc[:, doc_start:doc_start + vec_dim]
y = merged.iloc[:, label_start:doc_start]

# Cross validation
# A fixed random_state makes every kf.split() call produce the same folds.
# Without it, each call reshuffles, so every pos_weight (and any later run
# that reuses `kf`) would be scored on a different partition of the data.
kf = KFold(n_splits=k_folds, shuffle=True, random_state=seed)

cv_scores_doc = np.empty((len(pweights), num_metrics))
# The loop variable deliberately rebinds the notebook-level `pos_weight`,
# which the loss function reads when each model is compiled.
for i, pos_weight in enumerate(pweights):
    cv_scores_doc[i] = run_cv(x_doc)

0th CV run
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
Accuracy: 0.88889, EMR: 0.12903, F1: 0.50000, Precision: 0.51667, Recall: 0.48438, AUC: 0.82120
1th CV run
Accuracy: 0.88889, EMR: 0.03226, F1: 0.55072, Precision: 0.51351, Recall: 0.59375, AUC: 0.86902
2th CV run
Accuracy: 0.88710, EMR: 0.12903, F1: 0.54015, Precision: 0.46250, Recall: 0.64912, AUC: 0.84820
3th CV run
Accuracy: 0.87593, EMR: 0.06667, F1: 0.50370, Precision: 0.44737, Recall: 0.57627, AUC: 0.87276
4th CV run
Accuracy: 0.90741, EMR: 0.13333, F1: 0.50980, Precision: 0.49057, Recall: 0.53061, AUC: 0.86205
0th CV run
Accuracy: 0.88889, EMR: 0.16129, F1: 0.48333, Precision: 0.46032, Recall: 0.50877, AUC: 0.84417
1th CV run
Accuracy: 0.88889, EMR: 0.19355, F1: 0.47458, Precision: 0.51852, Recall: 0.43750, AUC: 0.82311
2th CV run
Accuracy: 0.89606, EMR: 0.09677, F1: 0.50847, Precision: 0.49180, Recall: 0.52632, AUC: 0.87879
3th CV run
Accuracy: 0.8

In [12]:
# SECOND: nn model with averaged word vecs
# Run separately from the sweep above: here a single positive weight is
# picked and kept fixed.
# The averaged word vectors sit right after the metadata, label and
# doc-vector columns.
word_start = 7 + class_dim + vec_dim
x_word = merged.iloc[:, word_start:word_start + vec_dim]

# Fixed weight for the positive targets in this run
pos_weight = 2
# Now, let's test the model with average vectors
cv_scores_word = run_cv(x_word)

0th CV run
Accuracy: 0.90143, EMR: 0.06452, F1: 0.44444, Precision: 0.52381, Recall: 0.38596, AUC: 0.75456
1th CV run
Accuracy: 0.89427, EMR: 0.22581, F1: 0.40404, Precision: 0.45455, Recall: 0.36364, AUC: 0.76140
2th CV run
Accuracy: 0.90143, EMR: 0.16129, F1: 0.45545, Precision: 0.54762, Recall: 0.38983, AUC: 0.78156
3th CV run
Accuracy: 0.88519, EMR: 0.10000, F1: 0.40385, Precision: 0.45652, Recall: 0.36207, AUC: 0.77254
4th CV run
Accuracy: 0.88148, EMR: 0.10000, F1: 0.31915, Precision: 0.50000, Recall: 0.23438, AUC: 0.77715


# Results

In [13]:
# Row i of the Doc2Vec table corresponds to pweights[i].
# Columns follow the order printed per fold: Accuracy, EMR, F1, Precision, Recall, AUC.
print('NN Model with Doc2Vec Results:', cv_scores_doc, '', sep='\n')
print('NN Model with Averaged Word2Vec Results:', cv_scores_word, sep='\n')

# Snapshot of a previous run, kept for reference:
# NN Model with Doc2Vec Results:
# [[0.88964158 0.09806452 0.52087565 0.48612293 0.56682625 0.85464479]
#  [0.88810036 0.16365591 0.48146893 0.47770042 0.49042719 0.82914383]
#  [0.90217443 0.20408602 0.51930979 0.54426892 0.4972809  0.84976277]
#  [0.90232975 0.20258065 0.51469291 0.54434181 0.48915173 0.85457016]
#  [0.9090681  0.22322581 0.52910774 0.58932107 0.4802068  0.84226945]
#  [0.91178017 0.26107527 0.52288796 0.61870296 0.45380535 0.86215186]
#  [0.91046595 0.17698925 0.46233917 0.63970711 0.36384145 0.84023481]]
# Note: the best pos_weight sometimes changes between runs due to randomness.
# Generally, a weight of 1 or 2 seems to work well.

# NN Model with Averaged Word2Vec Results:
# [0.89275986 0.13032258 0.4053851  0.49649915 0.34717515 0.76944033]

NN Model with Doc2Vec Results:
[[0.88964158 0.09806452 0.52087565 0.48612293 0.56682625 0.85464479]
 [0.88810036 0.16365591 0.48146893 0.47770042 0.49042719 0.82914383]
 [0.90217443 0.20408602 0.51930979 0.54426892 0.4972809  0.84976277]
 [0.90232975 0.20258065 0.51469291 0.54434181 0.48915173 0.85457016]
 [0.9090681  0.22322581 0.52910774 0.58932107 0.4802068  0.84226945]
 [0.91178017 0.26107527 0.52288796 0.61870296 0.45380535 0.86215186]
 [0.91046595 0.17698925 0.46233917 0.63970711 0.36384145 0.84023481]]

NN Model with Averaged Word2Vec Results:
[0.89275986 0.13032258 0.4053851  0.49649915 0.34717515 0.76944033]


In [14]:
# Row 5 of cv_scores_doc is the sweep entry for pweights[5] == 2, the same
# positive weight used for the averaged word-vector run, so the difference
# compares the two embeddings at equal weight (positive = Doc2Vec is higher).
doc_scores_at_weight_2 = cv_scores_doc[5]
doc_scores_at_weight_2 - cv_scores_word
# Snapshot of a previous run:
# array([0.01902031, 0.13075269, 0.11750286, 0.12220381, 0.1066302 ,
#        0.09271153])

array([0.01902031, 0.13075269, 0.11750286, 0.12220381, 0.1066302 ,
       0.09271153])