In [5]:
from skmultilearn.adapt import MLTSVM
import pandas as pd
import numpy as np 
from nltk.corpus import stopwords # get stopwords to remove
import re # regular expression
from gensim.models import doc2vec, Word2Vec # for word embeddings
from gensim.utils import simple_preprocess # to tokenize automatically
from sklearn.model_selection import train_test_split, KFold # for test-train split & cross validation
from sklearn.preprocessing import MultiLabelBinarizer # to convert to a format that can do multi-label classification
import random
from sklearn.metrics import precision_score, accuracy_score, roc_auc_score, f1_score, recall_score # for metrics
import scipy.sparse as sp
import collections

stop = stopwords.words('english')
# Ensure reproducibility
seed = 561
np.random.seed(seed)
random.seed(seed)

In [6]:
# Constants
vec_dim = 100 # how big the word embeddings are

In [7]:
raw_text = pd.read_csv('./data/clean_mea_text.csv') # this holds the raw text
reasons = pd.read_csv('./data/mea_reasons_filtered.csv') # these are our target classifications

In [8]:
# There is a bit of data mismatch, so filter both dfs for text that appears in both
bothdocs = set(raw_text.docket_num.values).intersection(reasons.docket_num.values)
raw_text = raw_text[raw_text.docket_num.isin(bothdocs)]
reasons = reasons[reasons.docket_num.isin(bothdocs)]

# Also need to convert datatypes to prevent type mismatch
raw_text['text'] = raw_text['text'].astype(str)

# Remove irrelevant/unhelpful labels
toremove = ['Unknown', 
           'Other', 
           'Did not appear for the Hearing', 
           'Other failure to disclose', 
           'Unknown failure to disclose', 
           'Allowed falsification and misrepresentation of loans',
           'Circumvented the requirements that a branch manager of a licensed mortgage broker have at least three years experience',
           "Did not verify or make a reasonable effort to verify the borrower's information",
           'Employed simultaneously by more than one affiliated mortgage banker or mortgage broker',
           'Engaged in fraud',
           'Failure to disclose charges',
           'Violated NC Securities Act',
           'Withdrew appeal',
           'Unsatisfactory credit']
reasons = reasons[~reasons.reason.isin(toremove)]

# Since we want to do multi-label classification, binarize outputs
# First, need to aggregate reason by docket_num
reasonsls = reasons.groupby('docket_num')['reason'].apply(set).reset_index(name='reason')

mlb = MultiLabelBinarizer()
classesbin = mlb.fit_transform(reasonsls.reason.values)
classesbin = pd.DataFrame(classesbin)
classesbin.columns = mlb.classes_

reasonsls = pd.concat([reasonsls, classesbin], axis=1)

# Let's combine the input and output datasets for easier handling
merged = raw_text.merge(reasonsls)

# Params:
#    df - dataframe with column 'text' to be tokenized
#    tokens_only - to train the doc2vec model, we’ll need to 
#        associate a tag/number with each document of the training corpus. 
#        tokens_only=True means don't associate anything
def tokenize(df, tokens_only=False):
    tokens = df['text'].apply(lambda x: simple_preprocess(x, deacc=True, max_len=20)) # max_len=20 just in case there are important words 15 chars long)
    if tokens_only:
        return tokens
    else:
        # For training data, add tags -- notice it is just an index number
        return [doc2vec.TaggedDocument(doc, [i]) for i, doc in enumerate(tokens)]
    
# Remove extra space
merged['text'] = merged['text'].apply(lambda x: re.sub(r'\s+', ' ', x))
# Tokenize
corpus = tokenize(merged)

docmodel = doc2vec.Doc2Vec(vector_size=vec_dim,min_count=1, epochs=40) # min_count=1 because we're not sure if relevant words occur multiple times
docmodel.build_vocab(corpus)
# Train
docmodel.train(corpus, total_examples=docmodel.corpus_count, epochs=docmodel.epochs)

# Let's also use word2vec and averaging to compare to doc2vec
# Perhaps in this setting, the context of words isn't important
textls = tokenize(merged, tokens_only=True)
wordmodel = Word2Vec(textls, min_count=1)
w2v = dict(zip(wordmodel.wv.index2word, wordmodel.wv.syn0))

# Params:
#    w2v - dictionary of words to vectors
#    text - list of tokenized words to convert to vector
# Returns a vector resulting from the average of all present words in text
def average_vectors(w2v, text):
    num_count = collections.Counter(text) # words and their counts
    return np.mean([w2v[word]*count for word, count in num_count.items()], axis=0)

docvecs_avg = [average_vectors(w2v, corpus[i].words) for i in range(len(corpus))]

docvecs = []
for doc_id in range(len(corpus)):
    docvecs.append(docmodel.infer_vector(corpus[doc_id].words))

merged = pd.concat([merged, pd.DataFrame(docvecs)], axis=1)
merged = pd.concat([merged, pd.DataFrame(docvecs_avg)], axis=1)



In [12]:
class_dim = len(mlb.classes_) # number of distinct classes
k_folds = 5 # number of folds for cv
num_metrics = 5 # number of metrics -- manually set

In [13]:
# Function that returns the hamming score
# correctly predicted / number of labels
# Effectively acts as an accuracy metric in multilabel classification
def hamming_score(y_true, y_pred):
    return (y_pred == y_true).mean()
# Function that given predicted probabilities 
# and the true labels,
# calculates a bunch of metrics and returns them
def calc_metrics(y_true, y_prob):
    y_pred = np.copy(y_prob) # classes
    y_pred[y_pred>=0.5] = 1
    y_pred[y_pred<0.5] = 0

    # Metrics
    # average='micro' because we care a little more about global statistics
    # If adding/removing metrics, change num_metrics
    ham_score = hamming_score(y_true, y_pred) # accuracy
    emr = accuracy_score(y_true, y_pred) # exact match ratio
    f1 = f1_score(y_true, y_pred, average='micro') # f1 -- care about false positives and false negatives
    prec = precision_score(y_true, y_pred, average='micro') # tp / (tp + fp) # care about false positives slightly more let's look at precision instead of both
    rec = recall_score(y_true, y_pred, average='micro') # just for interpretation
#     auc = roc_auc_score(y_true, y_prob, average='micro') # for ease, pretend the threshold should be the same for all classes
    metrics = [ham_score, emr, f1, prec, rec]
    
    return metrics

In [14]:
# Split into x and y
x_doc = merged.iloc[:, 7+class_dim:7+class_dim+vec_dim]
x = x_doc
y = merged.iloc[:, 7:7+class_dim]

# Cross validation
kf = KFold(n_splits=k_folds, shuffle=True)

cks = [2**i for i in range(-5, 5, 2)]
cv_scores = np.empty((len(cks), k_folds, num_metrics))
for i in range(len(cks)):
    classifier = MLTSVM(c_k = 2**-1)
    j=0
    for train_index, test_index in kf.split(x):
        X_train, X_test = x.iloc[train_index], x.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        # train
        classifier.fit(sp.csr_matrix(X_train), sp.csr_matrix(y_train))

        # predict
        y_pred = classifier.predict(sp.csr_matrix(X_test))

        cv_scores[i,j] = calc_metrics(y_test.values, y_pred)
        
        j = j + 1

In [15]:
cv_scores.mean(axis=1)

array([[0.88597372, 0.0655914 , 0.27654482, 0.41387406, 0.21471502],
       [0.8842055 , 0.04623656, 0.26663926, 0.41365079, 0.2009891 ],
       [0.88778973, 0.05225806, 0.27487328, 0.44064794, 0.20064724],
       [0.89039427, 0.03935484, 0.29464256, 0.47600511, 0.21756717],
       [0.88778973, 0.08473118, 0.27709926, 0.43374332, 0.20410431]])

In [41]:
[2**i for i in range(-5, 5, 2)]

[0.03125, 0.125, 0.5, 2, 8]

In [47]:
from sklearn.model_selection import GridSearchCV

parameters = {'c_k': [0.125, 0.5, 2, 8],
             'sor_omega': [0.125, 0.5, 2, 8],
             'lambda_param': [0.125, 0.5, 2, 8]}
score = ['accuracy', 'f1_micro']

clf = GridSearchCV(MLTSVM(), parameters, scoring=score, verbose=2, refit='accuracy')
clf.fit(sp.csr_matrix(x), sp.csr_matrix(y))

print(clf.best_params_, clf.best_score_)

Fitting 3 folds for each of 64 candidates, totalling 192 fits
[CV] c_k=0.125, lambda_param=0.125, sor_omega=0.125 ..................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ... c_k=0.125, lambda_param=0.125, sor_omega=0.125, total=   4.5s
[CV] c_k=0.125, lambda_param=0.125, sor_omega=0.125 ..................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    4.4s remaining:    0.0s


[CV] ... c_k=0.125, lambda_param=0.125, sor_omega=0.125, total=   4.7s
[CV] c_k=0.125, lambda_param=0.125, sor_omega=0.125 ..................
[CV] ... c_k=0.125, lambda_param=0.125, sor_omega=0.125, total=   5.0s
[CV] c_k=0.125, lambda_param=0.125, sor_omega=0.5 ....................
[CV] ..... c_k=0.125, lambda_param=0.125, sor_omega=0.5, total=   2.4s
[CV] c_k=0.125, lambda_param=0.125, sor_omega=0.5 ....................
[CV] ..... c_k=0.125, lambda_param=0.125, sor_omega=0.5, total=   2.7s
[CV] c_k=0.125, lambda_param=0.125, sor_omega=0.5 ....................
[CV] ..... c_k=0.125, lambda_param=0.125, sor_omega=0.5, total=   2.0s
[CV] c_k=0.125, lambda_param=0.125, sor_omega=2 ......................
[CV] ....... c_k=0.125, lambda_param=0.125, sor_omega=2, total=   0.6s
[CV] c_k=0.125, lambda_param=0.125, sor_omega=2 ......................
[CV] ....... c_k=0.125, lambda_param=0.125, sor_omega=2, total=   0.7s
[CV] c_k=0.125, lambda_param=0.125, sor_omega=2 ......................
[CV] .

[CV] ......... c_k=0.5, lambda_param=0.125, sor_omega=8, total=   4.8s
[CV] c_k=0.5, lambda_param=0.5, sor_omega=0.125 ......................
[CV] ....... c_k=0.5, lambda_param=0.5, sor_omega=0.125, total=   3.4s
[CV] c_k=0.5, lambda_param=0.5, sor_omega=0.125 ......................
[CV] ....... c_k=0.5, lambda_param=0.5, sor_omega=0.125, total=   4.9s
[CV] c_k=0.5, lambda_param=0.5, sor_omega=0.125 ......................
[CV] ....... c_k=0.5, lambda_param=0.5, sor_omega=0.125, total=   4.0s
[CV] c_k=0.5, lambda_param=0.5, sor_omega=0.5 ........................
[CV] ......... c_k=0.5, lambda_param=0.5, sor_omega=0.5, total=   2.6s
[CV] c_k=0.5, lambda_param=0.5, sor_omega=0.5 ........................
[CV] ......... c_k=0.5, lambda_param=0.5, sor_omega=0.5, total=   3.1s
[CV] c_k=0.5, lambda_param=0.5, sor_omega=0.5 ........................
[CV] ......... c_k=0.5, lambda_param=0.5, sor_omega=0.5, total=   2.8s
[CV] c_k=0.5, lambda_param=0.5, sor_omega=2 ..........................
[CV] .

[CV] ............. c_k=2, lambda_param=0.5, sor_omega=8, total=   3.4s
[CV] c_k=2, lambda_param=0.5, sor_omega=8 ............................
[CV] ............. c_k=2, lambda_param=0.5, sor_omega=8, total=   4.4s
[CV] c_k=2, lambda_param=0.5, sor_omega=8 ............................
[CV] ............. c_k=2, lambda_param=0.5, sor_omega=8, total=   4.7s
[CV] c_k=2, lambda_param=2, sor_omega=0.125 ..........................
[CV] ........... c_k=2, lambda_param=2, sor_omega=0.125, total=   5.8s
[CV] c_k=2, lambda_param=2, sor_omega=0.125 ..........................
[CV] ........... c_k=2, lambda_param=2, sor_omega=0.125, total=   5.0s
[CV] c_k=2, lambda_param=2, sor_omega=0.125 ..........................
[CV] ........... c_k=2, lambda_param=2, sor_omega=0.125, total=   2.1s
[CV] c_k=2, lambda_param=2, sor_omega=0.5 ............................
[CV] ............. c_k=2, lambda_param=2, sor_omega=0.5, total=   5.6s
[CV] c_k=2, lambda_param=2, sor_omega=0.5 ............................
[CV] .

[CV] ............... c_k=8, lambda_param=2, sor_omega=2, total=   0.8s
[CV] c_k=8, lambda_param=2, sor_omega=2 ..............................
[CV] ............... c_k=8, lambda_param=2, sor_omega=2, total=   0.8s
[CV] c_k=8, lambda_param=2, sor_omega=8 ..............................
[CV] ............... c_k=8, lambda_param=2, sor_omega=8, total=   2.8s
[CV] c_k=8, lambda_param=2, sor_omega=8 ..............................
[CV] ............... c_k=8, lambda_param=2, sor_omega=8, total=   4.4s
[CV] c_k=8, lambda_param=2, sor_omega=8 ..............................
[CV] ............... c_k=8, lambda_param=2, sor_omega=8, total=   3.8s
[CV] c_k=8, lambda_param=8, sor_omega=0.125 ..........................
[CV] ........... c_k=8, lambda_param=8, sor_omega=0.125, total=   4.7s
[CV] c_k=8, lambda_param=8, sor_omega=0.125 ..........................
[CV] ........... c_k=8, lambda_param=8, sor_omega=0.125, total=   5.3s
[CV] c_k=8, lambda_param=8, sor_omega=0.125 ..........................
[CV] .

[Parallel(n_jobs=1)]: Done 192 out of 192 | elapsed:  8.9min finished


{'c_k': 0.125, 'lambda_param': 0.125, 'sor_omega': 0.125} 0.06535947712418301


In [50]:
clf.cv_results_

{'mean_fit_time': array([4.71457219, 2.35800703, 0.64668496, 3.75467118, 3.08272092,
        1.97631256, 0.60852957, 3.04036919, 3.90469201, 2.33689642,
        0.63787913, 1.94586531, 2.83294765, 2.81635404, 0.6756947 ,
        1.53584313, 3.42042716, 2.56263924, 0.63637368, 4.18992996,
        4.0929935 , 2.83873987, 0.91284116, 5.78615975, 2.63501891,
        3.33234437, 0.65781943, 2.96773346, 1.34747211, 3.15996448,
        0.797671  , 2.09860802, 3.31138603, 5.59758329, 1.01649968,
        4.38129711, 3.24144332, 3.92332045, 0.76243758, 4.18695847,
        4.27929648, 4.69374291, 0.85287213, 4.81919996, 1.68457556,
        3.92695816, 0.89014983, 3.82778001, 3.34848507, 4.13687468,
        0.90438588, 3.95463475, 3.12391416, 4.00532079, 0.90812206,
        3.98531866, 3.24196704, 3.60734717, 0.79084977, 3.67233745,
        3.87566551, 3.40501936, 0.75645264, 3.9236211 ]),
 'std_fit_time': array([0.18939443, 0.27142367, 0.0189499 , 0.59362432, 0.79905541,
        0.282765  , 0.011

In [48]:
classifier = MLTSVM(c_k = 0.125, sor_omega=0.125, lambda_param=0.125)

final_scores = np.empty((k_folds, num_metrics))
i=0
for train_index, test_index in kf.split(x):
        X_train, X_test = x.iloc[train_index], x.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        # train
        classifier.fit(sp.csr_matrix(X_train), sp.csr_matrix(y_train))

        # predict
        y_pred = classifier.predict(sp.csr_matrix(X_test))

        final_scores[i] = calc_metrics(y_test.values, y_pred)
        i=i+1

In [49]:
final_scores.mean(axis=0) # classifier = MLTSVM(c_k = 0.125, sor_omega=0.125, lambda_param=0.125)
# array([0.88431302, 0.09827957, 0.47336469, 0.46543815, 0.48707925])

array([0.88431302, 0.09827957, 0.47336469, 0.46543815, 0.48707925])

In [27]:
final_scores.mean(axis=0) # classifier = MLTSVM(c_k = 2**-1, sor_omega=0.125, lambda_param=8)
# array([0.86919952, 0.07784946, 0.37157766, 0.40305336, 0.37045251])

array([0.86919952, 0.07784946, 0.37157766, 0.40305336, 0.37045251])