In [1]:
import pandas as pd
import numpy as np

import re, yaml

from scipy.stats import randint

#import distributed.joblib

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.feature_extraction.text import strip_tags, strip_accents_ascii, TfidfVectorizer, CountVectorizer
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn.feature_selection import SelectKBest
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV, ShuffleSplit
from sklearn.metrics import precision_recall_curve, confusion_matrix, roc_auc_score
#from sklearn.externals.joblib import parallel_backend

#from xgboost import XGBClassifier

from gensim.models import Word2Vec

In [2]:
# set random seed
np.random.seed(1)

In [3]:
# load the data
df = pd.read_csv("data/train.csv", encoding = "utf-8")

In [4]:
# set up W2V transformer
class W2VTransformer(TransformerMixin, BaseEstimator):

    def __init__(self, size = 100, **kwargs):
        self.gensim_model = None
        self.size = size
        self.gensim_params = kwargs

    def fit(self, X, y=None):
        self.gensim_model = Word2Vec(sentences = X, size = self.size, **self.gensim_params)
        return self

    @property
    def base_vector(self):
        return np.zeros(self.gensim_model.vector_size)
    
    def get_vector_word(self, word):
        try:
            return self.gensim_model[word]
        except KeyError:
            return self.base_vector

    def get_vector_sentence(self, sentence):
        if sentence:
            vectors = np.array([self.get_vector_word(w) for w in sentence])
            return vectors.mean(axis = 0)
        else:
            return self.base_vector
        
    def transform(self, X):
        return np.vstack([self.get_vector_sentence(s) for s in X])

In [5]:
# train / test split
xdata = df.comment_text
ydata = df.toxic
xdata_train, xdata_test, ydata_train, ydata_test = train_test_split(xdata, ydata, test_size = 0.2, random_state = 1)

In [6]:
# set up pipeline
def tokenize(doc):
    doc = strip_tags(doc.lower())
    doc = re.compile(r"\s\s+").sub(" ", doc)
    words = re.compile(r"(?u)\b\w\w+\b").findall(doc)
    words = [w for w in words if w not in ENGLISH_STOP_WORDS]
    return words

pipeline = Pipeline(steps = [
    ('features', FeatureUnion(transformer_list = [
        ('w2v', Pipeline(steps = [
            ('token', FunctionTransformer(func = lambda X: X.apply(tokenize), validate = False)),
            ('w2v', W2VTransformer())            
        ])),
        ('tfidf', TfidfVectorizer(min_df = 3, max_df = 0.5)),
        ('kbest', Pipeline(steps = [
            ('cv', CountVectorizer(min_df = 3, max_df = 0.5)),
            ('kbest', SelectKBest())
        ])),
    ])),
    ('scaler', StandardScaler(with_mean=False)),
    ('model', LogisticRegression(class_weight = "balanced"))
])

In [7]:
# hyperparameter tuning
param_grid = {
    'features__w2v__w2v__size': (np.arange(1, 10) * 100).tolist(),
    'features__tfidf__max_features': (np.arange(1, 10) * 100).tolist(),
    'features__kbest__kbest__k': (np.arange(1, 8) * 250).tolist(),
    'model__penalty': ['l1', 'l2'],
    'model__C': [0.01, 0.1, 1]
}

try:
    with open('model_param.yaml', 'r') as f:
        param_optimal = yaml.load(f)
except IOError:
    param_optimal = {}

    # create tuner
    ss = ShuffleSplit(n_splits = 5, train_size = 0.25, random_state = 1)
    tuner = RandomizedSearchCV(pipeline, param_grid, scoring = 'roc_auc', cv = ss, verbose = 1, refit = False, 
                               random_state = 1, n_iter = 20)
    
#     # use tuner to determine optimal params
#     # NOTE: need to replace localhost with cluster IP
#     with parallel_backend('dask.distributed', scheduler_host='localhost:8786', 
#                           scatter=[xdata_train, ydata_train]):
#         tuner.fit(xdata_train, ydata_train)
#     print('Best params: %s' % (str(tuner.best_params_)))
#     print('Best params score: %s' % (str(tuner.best_score_)))

#     # save best params
#     param_optimal.update(tuner.best_params_, model_name)

#     with open('model_param.yaml', 'w') as f:
#         yaml.dump(param_optimal, f)



In [8]:
# build model with optimal param
param_optimal = {
    'features__w2v__w2v__size': 500,
    'features__tfidf__max_features': 500,
    'features__kbest__kbest__k': 2000,
    'model__penalty': 'l2',
    'model__C': 0.01
}
pipeline.set_params(**param_optimal)
model = pipeline.fit(xdata_train, ydata_train)



In [9]:
# make predictions for our test set
ydata_test_pred = model.predict_proba(xdata_test)[:,1]



In [10]:
# determine cutoff balancing precision/recall
precision, recall, threshold = precision_recall_curve(ydata_test, ydata_test_pred)
pos_threshold = np.min(threshold[precision[1:] > recall[:-1]])
print('Positive threshold: %s' % str(pos_threshold))
print('Confusion matrix:')
print(confusion_matrix(ydata_test, (ydata_test_pred >= pos_threshold).astype(int)))
print('AUC: %s' % roc_auc_score(ydata_test, ydata_test_pred))

Positive threshold: 0.818817638904037
Confusion matrix:
[[28072   740]
 [  741  2362]]
AUC: 0.9627166282140919
