In [1]:
import pandas as pd
import numpy as np

import re, yaml

from itertools import product

from distributed import Client

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import TransformerMixin, BaseEstimator, clone
from sklearn.feature_extraction.text import strip_tags, strip_accents_ascii, TfidfVectorizer, CountVectorizer
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn.feature_selection import SelectKBest
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.model_selection import train_test_split, ShuffleSplit, ParameterGrid
from sklearn.metrics import precision_recall_curve, confusion_matrix, roc_auc_score
from sklearn.utils.validation import check_is_fitted

from sklearn.linear_model import LogisticRegression
#from xgboost import XGBClassifier

from dask_ml.model_selection import RandomizedSearchCV as RandomizedSearchCVBase

from gensim.models import Word2Vec

In [2]:
# set random seed
np.random.seed(1)

In [3]:
# load the data
df_train = pd.read_csv("data/train.csv", encoding = "utf-8")
df_test = pd.read_csv("data/train.csv", encoding = "utf-8")
df_train= df_train.head(10000)
df_test= df_test.head(1000)
yvar = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [4]:
# correlation matrix
df_train[yvar].corr()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
toxic,1.0,0.308017,0.664821,0.163685,0.642155,0.262158
severe_toxic,0.308017,1.0,0.4014,0.133694,0.364642,0.166033
obscene,0.664821,0.4014,1.0,0.134702,0.714533,0.292142
threat,0.163685,0.133694,0.134702,1.0,0.131717,0.147546
insult,0.642155,0.364642,0.714533,0.131717,1.0,0.353182
identity_hate,0.262158,0.166033,0.292142,0.147546,0.353182,1.0


In [5]:
df_train[yvar].apply(np.mean, axis = 0)

toxic            0.0971
severe_toxic     0.0101
obscene          0.0527
threat           0.0033
insult           0.0494
identity_hate    0.0084
dtype: float64

In [6]:
# set up W2V transformer
class W2VTransformer(TransformerMixin, BaseEstimator):

    def __init__(self, size = 100, **kwargs):
        self.gensim_model = None
        self.size = size
        self.gensim_params = kwargs

    @staticmethod
    def tokenize(doc):
        doc = strip_tags(doc.lower())
        doc = re.compile(r"\s\s+").sub(" ", doc)
        words = re.compile(r"(?u)\b\w\w+\b").findall(doc)
        words = [w for w in words if w not in ENGLISH_STOP_WORDS]
        return words
    
    @property
    def base_vector(self):
        return np.zeros(self.gensim_model.vector_size)
    
    def get_vector_word(self, word):
        try:
            return self.gensim_model[word]
        except KeyError:
            return self.base_vector

    def get_vector_sentence(self, sentence):
        if sentence:
            vectors = np.array([self.get_vector_word(w) for w in sentence])
            return vectors.mean(axis = 0)
        else:
            return self.base_vector

    def fit(self, X, y=None):
        sentences = X.apply(self.tokenize)
        self.gensim_model = Word2Vec(sentences = sentences, size = self.size, **self.gensim_params)
        return self
        
    def transform(self, X):
        return np.vstack([self.get_vector_sentence(s) for s in X])

In [7]:
# train / test split
xdata = df_train.comment_text
ydata = df_train[yvar]
xdata_train, xdata_eval, ydata_train, ydata_eval = train_test_split(xdata, ydata, test_size = 0.2, random_state = 1)

In [8]:
# set up pipeline
def basic_stats(docs):
    nwords = docs.apply(lambda x: len(x.split()))
    nchar = docs.apply(len)
    ncap = docs.apply(lambda x: len(re.compile(r"[A-Z]").findall(x)))
    ncap_perc = ncap / nchar
    nexcl = docs.apply(lambda x: len(re.compile(r"!").findall(x)))
    nquest = docs.apply(lambda x: len(re.compile(r"\?").findall(x)))
    nsymb = docs.apply(lambda x: len(re.compile(r"&|@|#|\$|%|\*|\^").findall(x)))
    nsmile = docs.apply(lambda x: len(re.compile(r"((?::|;|=)(?:-)?(?:\)|D|P))").findall(x)))
    return pd.DataFrame(data = dict(
        nwords = nwords, nchar = nchar, ncap = ncap, ncap_perc = ncap_perc,
        nexcl = nexcl, nquest = nquest, nsymb = nsymb, nsmile = nsmile
    ))

pipeline = Pipeline(steps = [
    ('features', FeatureUnion(transformer_list = [
        ('w2v', W2VTransformer()),
        ('tfidf', TfidfVectorizer(min_df = 5, max_df = 0.5)),
        ('kbest', Pipeline(steps = [
            ('cv', CountVectorizer(min_df = 5, max_df = 0.5)),
            ('kbest', SelectKBest())
        ])),
        ('stats', FunctionTransformer(func = basic_stats, validate = False))
    ])),
    #('model', XGBClassifier(seed = 1))
    ('model', LogisticRegression())
])

In [9]:
# create the parameter grid
pg1 = [
    {'features__w2v__size': [500], 'features__tfidf__max_features': [1000], 'features__kbest__kbest__k': [1000]},
    {'features__w2v__size': [1000], 'features__tfidf__max_features': [2000], 'features__kbest__kbest__k': [2000]},
    {'features__w2v__size': [1000], 'features__tfidf__max_features': [3000], 'features__kbest__kbest__k': [3000]},
]
pg2 = {
    'model__n_estimators': [250, 500, 1000, 2000],
    'model__learning_rate': [0.01, 0.02, 0.05, 0.1, 0.2, 0.3],
    'model__max_depth': [2, 4, 6, 8, 10],
    'model__min_child_weight': [5],
    'model__subsample': [0.8],
    'model__colsample_bytree': [0.8]
}

def merge_param_grids(x, y):
    z = list(product(list(x), list(y)))
    z = [dict(**i[0], **i[1]) for i in z]
    listize = lambda x: dict(zip(x.keys(), [[i] for i in x.values()]))
    z = [listize(i) for i in z]
    return ParameterGrid(z)

pg = merge_param_grids(ParameterGrid(pg1), ParameterGrid(pg2))

In [10]:
# for non-multimetric, don't require refit = True for best_params_ / best_score_
class RandomizedSearchCV(RandomizedSearchCVBase):

    # For multiple metric evaluation, refit is a string denoting the scorer that should be 
    # used to find the best parameters for refitting the estimator 
    @property
    def scorer_key(self):
        return self.refit if self.multimetric_ else 'score'
    
    @property
    def best_index(self):
        check_is_fitted(self, 'cv_results_')
        return np.flatnonzero(self.cv_results_['rank_test_{}'.format(self.scorer_key)] == 1)[0]

    @property
    def best_params_(self):
        return self.cv_results_['params'][self.best_index]

    @property
    def best_score_(self):
        return self.cv_results_['mean_test_{}'.format(self.scorer_key)][self.best_index]

In [11]:
# hyperparameter tuning
try:
    with open('model_param.yaml', 'r') as f:
        param_optimal = yaml.load(f)
except IOError:
    param_optimal = {}

    # create tuner
    client = Client()
    ss = ShuffleSplit(n_splits = 5, train_size = 0.8, random_state = 1)
    tuner = RandomizedSearchCV(pipeline, pg, scheduler = client, scoring = 'roc_auc', 
                               cv = ss, refit = False, return_train_score = False, random_state = 1, 
                               n_iter = 20)
    
#     # use tuner to determine optimal params
#     %time tuner.fit(xdata_train, ydata_train)
#     print('Best params: %s' % (str(tuner.best_params_)))
#     print('Best params score: %s' % (str(tuner.best_score_)))

#     # save best params
#     param_optimal = tuner.best_params_
#     with open('model_param.yaml', 'w') as f:
#         yaml.dump(param_optimal, f)



In [12]:
# build model with optimal param and generate predictions on eval set
param_optimal = {
    'features__w2v__size': 100,
    'features__tfidf__max_features': 100,
    'features__kbest__kbest__k': 100,
    'model__C': 0.01,
    'model__penalty': 'l2'
#     'model__n_estimators': 100,
#     'model__learning_rate': 0.3,
#     'model__max_depth': 6,
#     'model__min_child_weight': 5,
#     'model__subsample': 0.8,
#     'model__colsample_bytree': 0.8
}
pipeline.set_params(**param_optimal)

models = client.map(lambda y: clone(pipeline).fit(xdata_train, ydata_train[y]), yvar)
ydata_eval_pred = client.map(lambda m: pd.Series(m.predict_proba(xdata_eval)[:,1]), models)

ydata_eval_pred = client.gather(ydata_eval_pred)
ydata_eval_pred = pd.concat(ydata_eval_pred, axis = 1)
ydata_eval_pred.columns = yvar

In [14]:
# calculate performance on test set
auc = [roc_auc_score(ydata_eval[y], ydata_eval_pred[y]) for y in yvar]
print('Model AUCs: %s' % auc)
print('Avg AUC: %s' % np.mean(auc))

Model AUCs: [0.8894434442270059, 0.7396385521090292, 0.8482838364167479, 0.7448743718592965, 0.8415090659891328, 0.7163947163947164]
Avg AUC: 0.7966906644993214


In [15]:
xdata_test = df_test.comment_text

models_final = client.map(lambda y: clone(pipeline).fit(xdata, ydata[y]), yvar)
ydata_test_pred = client.map(lambda m: pd.Series(m.predict_proba(xdata_test)[:,1]), models_final)

ydata_test_pred = client.gather(ydata_test_pred)
ydata_test_pred = pd.concat(ydata_test_pred, axis = 1)
ydata_test_pred.columns = yvar
ydata_test_pred['id'] = df_test.id

ydata_test_pred.to_csv('submission.csv', index = False)