In [1]:
import pandas as pd
import numpy as np 
from sklearn.model_selection import train_test_split, KFold # for test-train split & cross validation
import random
from keras.models import Sequential # for nn
from keras.layers import Dense # for nn
import tensorflow as tf # for nn & new loss
import keras.backend.tensorflow_backend as tfb # for nn & new loss
import preprocessing # project-local helpers (data loading, embeddings, metrics)

# Ensure reproducibility: seed NumPy, the stdlib `random` module and TensorFlow.
# The NN itself will still contain some randomness; see
# https://stackoverflow.com/questions/32419510/how-to-get-reproducible-results-in-keras/52897216#52897216
# Full reproducibility was deliberately not pursued because of concerns about training time.
# NOTE(review): confirm whether the TensorFlow seed set here survives the
# tfb.clear_session() calls made during cross validation.
seed = 561
np.random.seed(seed)
random.seed(seed)
tf.set_random_seed(seed)

Using TensorFlow backend.
unable to import 'smart_open.gcs', disabling that module


# Multi-label classification

Classify the reasons (violations) behind each docket_num (document).

# Read in data

In [2]:
# Load the dataset through the project-local `preprocessing` module.
# NOTE(review): the preview of `merged` shows one 0/1 column per violation
# class next to the raw text and date fields -- presumably load_mea builds
# those indicator columns; confirm in preprocessing.load_mea.
merged = preprocessing.load_mea()

18 unique classes found


In [3]:
merged.head()

Unnamed: 0,date,docket_num,text,year,month,day,reason,Allowed unlawful activity,Concealed material facts that were likely to influence a mortgagor to take a mortgage loan,Conspiracy to commit fraud,...,Failure to file a correcting amendment,Falsification and misrepresentation of loans,Has outstanding tax liens,Identity issues,Impermissible net-branching,Lacks a surety bond,Mistreatment of employees,Retained borrower funds,Uncooperative with OCOB,Unlicensed activity
0,2009-11-18,09_160,STATE OF NORTH CAROLINA WAKE COUNTY IN A MATTE...,2009,11,18,{Conspiracy to commit fraud},0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,2009-11-18,09_164,STATE OF NORTH CAROLINA WAKE COUNTY IN A MATTE...,2009,11,18,{Conspiracy to commit fraud},0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,2009-10-16,09_142B,OAH File No. 10 COB 2895 STATE OF NORTH CAROLI...,2009,10,16,{Allowed unlawful activity},1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2009-09-09,09_081,STATE OF NORTH CAROLINA WAKE COUNTY IN A MATTE...,2009,9,9,{Falsification and misrepresentation of loans},0,0,0,...,0,1,0,0,0,0,0,0,0,0
4,2009-08-24,09_070,STATE OF NORTH CAROLINA WAKE COUNTY IN A MATTE...,2009,8,24,"{Retained borrower funds, Does not meet the fi...",0,0,0,...,1,0,0,0,0,0,0,1,0,0


# Preprocessing

## Vectorize

We use the Continuous Bag of Words (CBOW) model to create our word embeddings to be used in our ML models.

In [4]:
# Constants
vec_dim = 100 # how big the word embeddings are; also the width of each feature slice taken from `merged` later

# Vectorize the 'text' column twice, once per embedding mode.
# NOTE(review): 'doc' presumably yields one doc2vec vector per document and
# 'word' the per-document average of word vectors (that is how the two result
# sets are labelled later in this notebook) -- confirm in
# preprocessing.create_tokens.
docvecs = preprocessing.create_tokens(merged, 'text', vec_dim, 'doc')
docvecs_avg = preprocessing.create_tokens(merged, 'text', vec_dim, 'word')

## Add doc and averaged word embeddings

In [5]:
# Append the two embedding matrices to `merged` as extra columns, in a fixed
# order: document vectors first, averaged word vectors second. Later cells
# pick these columns out by position, so the order matters.
# NOTE(review): pd.concat(axis=1) aligns on the index -- this assumes
# `merged` carries a default 0..n-1 index; confirm.
for embedding in (docvecs, docvecs_avg):
    merged = pd.concat([merged, pd.DataFrame(embedding)], axis=1)

In [6]:
merged.head()

Unnamed: 0,date,docket_num,text,year,month,day,reason,Allowed unlawful activity,Concealed material facts that were likely to influence a mortgagor to take a mortgage loan,Conspiracy to commit fraud,...,90,91,92,93,94,95,96,97,98,99
0,2009-11-18,09_160,STATE OF NORTH CAROLINA WAKE COUNTY IN A MATTE...,2009,11,18,{Conspiracy to commit fraud},0,0,1,...,0.275608,-0.264421,-0.479553,1.379651,1.124461,-0.262941,-0.744955,1.49883,-0.061461,0.132894
1,2009-11-18,09_164,STATE OF NORTH CAROLINA WAKE COUNTY IN A MATTE...,2009,11,18,{Conspiracy to commit fraud},0,0,1,...,0.3553,-0.432066,-0.583241,1.566475,1.138384,-0.386443,-0.651767,1.249204,-0.115527,0.104318
2,2009-10-16,09_142B,OAH File No. 10 COB 2895 STATE OF NORTH CAROLI...,2009,10,16,{Allowed unlawful activity},1,0,0,...,0.534877,-0.507094,-0.076668,1.660304,1.273912,-0.391001,-1.285176,1.901642,0.224143,0.07091
3,2009-09-09,09_081,STATE OF NORTH CAROLINA WAKE COUNTY IN A MATTE...,2009,9,9,{Falsification and misrepresentation of loans},0,0,0,...,0.371945,-0.368662,-0.025115,1.517,1.141049,-0.467491,-1.232735,1.504742,0.217517,-0.088777
4,2009-08-24,09_070,STATE OF NORTH CAROLINA WAKE COUNTY IN A MATTE...,2009,8,24,"{Retained borrower funds, Does not meet the fi...",0,0,0,...,0.330605,-0.453157,-0.356843,1.784191,1.269599,-0.642188,-0.975672,1.925823,0.256681,0.352196


# Train & Test Neural Network model

In [7]:
class_dim = 18 # number of distinct classes; width of the label block and of the network's output layer
epochs = 25 # training epochs per fold
batch_size = 10 # mini-batch size passed to model.fit
k_folds = 5 # number of folds for cv
pweights = [15, 10, 5, 4, 3, 2, 1]  # candidate multipliers for positive targets, needs to be tuned; swept one value at a time
num_metrics = 6 # number of metrics -- manually set; preprocessing.calc_metrics must return at least this many values

# inputs to loss
pos_weight = pweights[0] # starting weight; read as a global by weighted_binary_crossentropy and reassigned during the sweeps

## New loss

In [8]:
# From https://stackoverflow.com/questions/42158866/neural-network-for-multi-label-classification-with-large-number-of-classes-outpu/47313183#47313183
def weighted_binary_crossentropy(target, output):
    """
    Binary crossentropy in which positive targets are up-weighted.

    The multiplier is NOT a parameter: it is taken from the module-level
    variable ``pos_weight`` at the moment this function is called.

    Built from pieces of:
    * keras.losses.binary_crossentropy
    * keras.backend.tensorflow_backend.binary_crossentropy
    * tf.nn.weighted_cross_entropy_with_logits

    Args:
        target: tensor of ground-truth labels.
        output: tensor of predicted probabilities (not logits); it is
            converted back to logits internally.

    Returns:
        The weighted crossentropy averaged over the last axis.
    """
    # Keep probabilities strictly inside (0, 1) so the logit below is finite.
    eps = tfb._to_tensor(tfb.epsilon(), output.dtype.base_dtype)
    probs = tf.clip_by_value(output, eps, 1 - eps)
    # Undo the sigmoid: logit(p) = log(p / (1 - p)).
    logits = tf.log(probs / (1 - probs))
    weighted = tf.nn.weighted_cross_entropy_with_logits(
        targets=target, logits=logits, pos_weight=pos_weight)
    return tf.reduce_mean(weighted, axis=-1)

## Train & test

In [9]:
def run_cv(x, verbose=True):
    """
    Run k-fold cross validation of the feed-forward classifier on `x`.

    For each fold a fresh Sequential model (128 -> 64 -> class_dim, sigmoid
    output) is compiled with `weighted_binary_crossentropy`, trained on the
    training rows and scored on the held-out rows with
    `preprocessing.calc_metrics`.

    Relies on names set outside this function: `y` (labels, row-aligned with
    `x`), `kf` (the fold splitter), `k_folds`, `num_metrics`, `class_dim`,
    `epochs`, `batch_size`, and -- through the loss function -- `pos_weight`.

    Args:
        x: DataFrame of input features, one row per document. The network's
            input width is taken from its number of columns.
        verbose: when True, print the fold number and that fold's metrics.

    Returns:
        1-D array of length `num_metrics`: each metric averaged over the
        folds, in the order the verbose line labels them (Accuracy, EMR, F1,
        Precision, Recall, AUC).
    """
    cv_scores = np.empty((k_folds, num_metrics))
    # Size the input layer from the data actually passed in, so the function
    # does not silently depend on the global embedding size.
    n_features = x.shape[1]

    for j, (train_index, test_index) in enumerate(kf.split(x)):
        if verbose:
            print('{:d}th CV run'.format(j))
        X_train, X_test = x.iloc[train_index], x.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Build model
        model = Sequential()
        model.add(Dense(128, input_dim=n_features, activation='relu'))
        model.add(Dense(64, activation='relu'))
        # sigmoid so that the probability of one class is independent from
        # the probability of another
        model.add(Dense(class_dim, activation='sigmoid'))
        model.compile(loss=weighted_binary_crossentropy,
                      optimizer='adam')

        # Fit model
        model.fit(X_train,
                  y_train,
                  epochs=epochs,
                  batch_size=batch_size,
                  verbose=0)

        # Evaluate on the held-out fold
        y_prob = model.predict(X_test) # probabilities
        metrics = preprocessing.calc_metrics(y_test.values, y_prob)

        if verbose:
            print('Accuracy: {:.5f}, EMR: {:.5f}, F1: {:.5f}, Precision: {:.5f}, Recall: {:.5f}, AUC: {:.5f}'.format(*metrics))

        # Store the first num_metrics values as this fold's row.
        cv_scores[j, :] = metrics[:num_metrics]

        # To prevent slower performance
        tfb.clear_session()

    return cv_scores.mean(axis=0) # mean over folds

In [10]:
# List every column label to verify the positional layout that the iloc
# slicing further down relies on: 7 metadata columns ('date' ... 'reason'),
# then the class indicator columns, then the embedding columns.
merged.columns.values # making sure we index correctly

array(['date', 'docket_num', 'text', 'year', 'month', 'day', 'reason',
       'Allowed unlawful activity',
       'Concealed material facts that were likely to influence a mortgagor to take a mortgage loan',
       'Conspiracy to commit fraud', 'Convicted of misdemeanor or felony',
       'Does not meet the financial responsibility requirements',
       'Excess fees', 'Failure to disclose civil or financial issue',
       'Failure to disclose criminal conviction',
       'Failure to file a correcting amendment',
       'Falsification and misrepresentation of loans',
       'Has outstanding tax liens', 'Identity issues',
       'Impermissible net-branching', 'Lacks a surety bond',
       'Mistreatment of employees', 'Retained borrower funds',
       'Uncooperative with OCOB', 'Unlicensed activity', 0, 1, 2, 3, 4, 5,
       6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
       24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
       41, 42, 43, 44, 

In [11]:
# FIRST: nn model with doc2vec
# Split into x and y by column position: 7 metadata columns come first, then
# class_dim label columns, then vec_dim doc2vec columns.
x_doc = merged.iloc[:, 7+class_dim:7+class_dim+vec_dim]
y = merged.iloc[:, 7:7+class_dim]

# Cross validation
# random_state is fixed so that every kf.split(...) call yields the same
# shuffled folds. Without it each call reshuffles, and the pos_weight settings
# (and any later sweep that reuses `kf`) would be scored on different
# train/test partitions, making their metrics not directly comparable.
kf = KFold(n_splits=k_folds, shuffle=True, random_state=seed)

# Row i holds the fold-averaged metrics obtained with pweights[i].
cv_scores_doc = np.empty((len(pweights), num_metrics))
for i, pos_weight in enumerate(pweights):
    # `pos_weight` is bound at module level on purpose: the loss function
    # reads it as a global rather than taking it as an argument.
    cv_scores_doc[i] = run_cv(x_doc)

0th CV run
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
Accuracy: 0.87634, EMR: 0.12903, F1: 0.46512, Precision: 0.46154, Recall: 0.46875, AUC: 0.81734
1th CV run
Accuracy: 0.89068, EMR: 0.09677, F1: 0.57343, Precision: 0.51899, Recall: 0.64062, AUC: 0.84464
2th CV run
Accuracy: 0.87814, EMR: 0.03226, F1: 0.54054, Precision: 0.43956, Recall: 0.70175, AUC: 0.84823
3th CV run
Accuracy: 0.86667, EMR: 0.13333, F1: 0.47826, Precision: 0.41772, Recall: 0.55932, AUC: 0.86529
4th CV run
Accuracy: 0.88704, EMR: 0.13333, F1: 0.45045, Precision: 0.40323, Recall: 0.51020, AUC: 0.82131
0th CV run
Accuracy: 0.89785, EMR: 0.19355, F1: 0.50435, Precision: 0.50000, Recall: 0.50877, AUC: 0.85440
1th CV run
Accuracy: 0.87814, EMR: 0.12903, F1: 0.42373, Precision: 0.46296, Recall: 0.39062, AUC: 0.78508
2th CV run
Accuracy: 0.91935, EMR: 0.19355, F1: 0.62810, Precision: 0.59375, Recall: 0.66667, AUC: 0.88322
3th CV run
Accuracy: 0.8

In [12]:
# SECOND: nn model with averaged word vecs
# These features are the vec_dim columns that sit directly after the doc2vec
# block (7 metadata columns + class_dim labels + vec_dim doc2vec columns).
word_start = 7 + class_dim + vec_dim
x_word = merged.iloc[:, word_start:word_start + vec_dim]

# One row of fold-averaged metrics per candidate positive-class weight.
cv_scores_word = np.empty((len(pweights), num_metrics))
for i, pos_weight in enumerate(pweights):
    cv_scores_word[i] = run_cv(x_word)

0th CV run
Accuracy: 0.58781, EMR: 0.00000, F1: 0.28571, Precision: 0.17358, Recall: 0.80702, AUC: 0.77302
1th CV run
Accuracy: 0.62186, EMR: 0.00000, F1: 0.28956, Precision: 0.17769, Recall: 0.78182, AUC: 0.77134
2th CV run
Accuracy: 0.62724, EMR: 0.00000, F1: 0.32468, Precision: 0.20080, Recall: 0.84746, AUC: 0.82691
3th CV run
Accuracy: 0.61481, EMR: 0.00000, F1: 0.28276, Precision: 0.17672, Recall: 0.70690, AUC: 0.77275
4th CV run
Accuracy: 0.63704, EMR: 0.00000, F1: 0.32414, Precision: 0.20796, Recall: 0.73438, AUC: 0.76858
0th CV run
Accuracy: 0.67742, EMR: 0.00000, F1: 0.31818, Precision: 0.20192, Recall: 0.75000, AUC: 0.78198
1th CV run
Accuracy: 0.70251, EMR: 0.00000, F1: 0.30252, Precision: 0.21429, Recall: 0.51429, AUC: 0.68785
2th CV run
Accuracy: 0.68817, EMR: 0.00000, F1: 0.32031, Precision: 0.21026, Recall: 0.67213, AUC: 0.77191
3th CV run
Accuracy: 0.67222, EMR: 0.00000, F1: 0.31128, Precision: 0.19608, Recall: 0.75472, AUC: 0.79594
4th CV run
Accuracy: 0.70926, EMR: 0.

# Results

In [13]:
# Each row corresponds to one entry of `pweights` (same order); the columns
# are the metrics in the order run_cv labels them: Accuracy, EMR, F1,
# Precision, Recall, AUC.
print('NN Model with Doc2Vec Results:', cv_scores_doc, sep='\n')
print()
print('NN Model with Averaged Word2Vec Results:', cv_scores_word, sep='\n')
# Results recorded from an earlier run, kept for reference:
# NN Model with Doc2Vec Results:
# [[0.879773   0.10494624 0.50155894 0.44820671 0.5761311  0.83936206]
#  [0.89017921 0.16322581 0.49135212 0.48410245 0.50210161 0.81762053]
#  [0.90571087 0.20989247 0.53465521 0.56036797 0.51218062 0.84337619]
#  [0.90482676 0.18946237 0.52019936 0.56127781 0.48756225 0.8506447 ]
#  [0.90458781 0.22258065 0.50435731 0.56802486 0.45695305 0.83132127]
#  [0.90887694 0.21505376 0.51452628 0.60228571 0.45501253 0.85063308]
#  [0.91261649 0.22258065 0.49901649 0.64323037 0.41102582 0.84548081]]

# NN Model with Averaged Word2Vec Results:
# [[0.61775388 0.         0.30136969 0.18735256 0.77551298 0.78251768]
#  [0.68991637 0.         0.31797042 0.20798699 0.68917016 0.76741365]
#  [0.83449223 0.05204301 0.3948469  0.32585425 0.5214668  0.76999237]
#  [0.86376344 0.09741935 0.4339559  0.38915575 0.49155723 0.78224896]
#  [0.87651135 0.1172043  0.38412252 0.41435396 0.36626242 0.76079917]
#  [0.88992832 0.13053763 0.38094089 0.49044219 0.3134499  0.75706932]
#  [0.89364397 0.07849462 0.15710175 0.59102564 0.09574298 0.74318556]]

NN Model with Doc2Vec Results:
[[0.879773   0.10494624 0.50155894 0.44820671 0.5761311  0.83936206]
 [0.89017921 0.16322581 0.49135212 0.48410245 0.50210161 0.81762053]
 [0.90571087 0.20989247 0.53465521 0.56036797 0.51218062 0.84337619]
 [0.90482676 0.18946237 0.52019936 0.56127781 0.48756225 0.8506447 ]
 [0.90458781 0.22258065 0.50435731 0.56802486 0.45695305 0.83132127]
 [0.90887694 0.21505376 0.51452628 0.60228571 0.45501253 0.85063308]
 [0.91261649 0.22258065 0.49901649 0.64323037 0.41102582 0.84548081]]

NN Model with Averaged Word2Vec Results:
[[0.61775388 0.         0.30136969 0.18735256 0.77551298 0.78251768]
 [0.68991637 0.         0.31797042 0.20798699 0.68917016 0.76741365]
 [0.83449223 0.05204301 0.3948469  0.32585425 0.5214668  0.76999237]
 [0.86376344 0.09741935 0.4339559  0.38915575 0.49155723 0.78224896]
 [0.87651135 0.1172043  0.38412252 0.41435396 0.36626242 0.76079917]
 [0.88992832 0.13053763 0.38094089 0.49044219 0.3134499  0.75706932]
 [0.89364397 0.07849462 0.157

In [14]:
# Element-wise gap between the two feature sets (doc2vec minus averaged
# word vectors). Rows follow `pweights`, columns follow the metric order;
# a positive entry means doc2vec scored higher on that metric.
cv_scores_doc - cv_scores_word # how much do they differ?
# Values recorded from an earlier run, kept for reference:
# array([[ 0.26201912,  0.10494624,  0.20018925,  0.26085415, -0.19938188,
#          0.05684438],
#        [ 0.20026284,  0.16322581,  0.1733817 ,  0.27611546, -0.18706856,
#          0.05020689],
#        [ 0.07121864,  0.15784946,  0.1398083 ,  0.23451372, -0.00928619,
#          0.07338382],
#        [ 0.04106332,  0.09204301,  0.08624346,  0.17212206, -0.00399498,
#          0.06839574],
#        [ 0.02807646,  0.10537634,  0.1202348 ,  0.15367091,  0.09069063,
#          0.07052211],
#        [ 0.01894863,  0.08451613,  0.13358539,  0.11184352,  0.14156264,
#          0.09356376],
#        [ 0.01897252,  0.14408602,  0.34191474,  0.05220473,  0.31528284,
#          0.10229525]])

array([[ 0.26201912,  0.10494624,  0.20018925,  0.26085415, -0.19938188,
         0.05684438],
       [ 0.20026284,  0.16322581,  0.1733817 ,  0.27611546, -0.18706856,
         0.05020689],
       [ 0.07121864,  0.15784946,  0.1398083 ,  0.23451372, -0.00928619,
         0.07338382],
       [ 0.04106332,  0.09204301,  0.08624346,  0.17212206, -0.00399498,
         0.06839574],
       [ 0.02807646,  0.10537634,  0.1202348 ,  0.15367091,  0.09069063,
         0.07052211],
       [ 0.01894863,  0.08451613,  0.13358539,  0.11184352,  0.14156264,
         0.09356376],
       [ 0.01897252,  0.14408602,  0.34191474,  0.05220473,  0.31528284,
         0.10229525]])