In [1]:
from skmultilearn.adapt import MLTSVM
import pandas as pd
import numpy as np 
from sklearn.model_selection import train_test_split, GridSearchCV, KFold # for test-train split & cross validation
import random
import scipy.sparse as sp
import preprocessing

# Ensure reproducibility
seed = 561
np.random.seed(seed)
random.seed(seed)

unable to import 'smart_open.gcs', disabling that module


In [2]:
# Load data
merged = preprocessing.load_mea()

# Constants
vec_dim = 100 # how big the word embeddings are
# Vectorize
docvecs = preprocessing.create_tokens(merged, 'text', vec_dim, 'doc')
docvecs_avg = preprocessing.create_tokens(merged, 'text', vec_dim, 'word')

18 unique classes found


In [3]:
# Add doc embeddings
merged = pd.concat([merged, pd.DataFrame(docvecs)], axis=1)
merged = pd.concat([merged, pd.DataFrame(docvecs_avg)], axis=1)

In [4]:
class_dim = 18 # number of distinct classes
k_folds = 5 # number of folds for cv
num_metrics = 6 # number of metrics -- manually set

# Train & Test MLTSVM

## CV to find best params

In [5]:
# Split into x and y
x_doc = merged.iloc[:, 7+class_dim:7+class_dim+vec_dim]
x_word = merged.iloc[:, 7+class_dim+vec_dim:7+class_dim+vec_dim*2]
y = merged.iloc[:, 7:7+class_dim]

# Using grid search with cv to find best params
parameters = {'c_k': [0.0625, 0.125, 2],
             'sor_omega': [0.0625, 0.125, 2],
             'lambda_param': [0.0625, 0.125, 2]}
score = ['accuracy', 'f1_micro'] # note that accuracy here means exact match ratio

In [6]:
# First with the document model
clf_doc = GridSearchCV(MLTSVM(), parameters, scoring=score, verbose=0, refit='accuracy', cv=5)
clf_doc.fit(sp.csr_matrix(x_doc), sp.csr_matrix(y))

print(clf_doc.best_params_, clf_doc.best_score_)



{'c_k': 0.0625, 'lambda_param': 2, 'sor_omega': 2} 0.09803921568627451


In [7]:
# Second with the averaged word vector model
clf_word = GridSearchCV(MLTSVM(), parameters, scoring=score, verbose=0, refit='accuracy', cv=5)
clf_word.fit(sp.csr_matrix(x_word), sp.csr_matrix(y))

print(clf_word.best_params_, clf_word.best_score_)

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


{'c_k': 0.125, 'lambda_param': 2, 'sor_omega': 0.125} 0.11764705882352941


## Final model

In [8]:
def run_cv(x, c_k, sor_omega, lambda_param):
    final_scores = np.empty((k_folds, num_metrics))
    i=0
    for train_index, test_index in kf.split(x):
        classifier = MLTSVM(c_k=c_k, sor_omega=sor_omega, lambda_param=lambda_param)
        
        X_train, X_test = x.iloc[train_index], x.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        # train
        classifier.fit(sp.csr_matrix(X_train), sp.csr_matrix(y_train))

        # predict
        y_pred = classifier.predict(sp.csr_matrix(X_test))

        final_scores[i] = preprocessing.calc_metrics(y_test.values, y_pred)
        i=i+1
    return final_scores.mean(axis=0)

In [9]:
kf = KFold(n_splits=k_folds, shuffle=True)

cv_scores_doc = run_cv(x_doc, clf_doc.best_params_['c_k'], clf_doc.best_params_['lambda_param'], clf_doc.best_params_['sor_omega'])
cv_scores_word = run_cv(x_word, clf_word.best_params_['c_k'], clf_word.best_params_['lambda_param'], clf_word.best_params_['sor_omega'])

# Results

In [10]:
print('MLTSVM Model with Doc2Vec Results:')
print(cv_scores_doc) # note that AUC here is meaningless since SVM is not probabilistic
print()
print('MLTSVM Model with Averaged Word2Vec Results:')
print(cv_scores_word) 

MLTSVM Model with Doc2Vec Results:
[0.89001195 0.1172043  0.35531397 0.47171576 0.29046041 0.62590493]

MLTSVM Model with Averaged Word2Vec Results:
[0.8179092  0.01333333 0.25681993 0.22966048 0.29770652 0.58905486]
