In [1]:
import pandas as pd
import numpy as np
import yaml, re, copy

from google.cloud import storage
from io import BytesIO

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.models.phrases import Phrases, Phraser

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import TransformerMixin, BaseEstimator, clone
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.feature_extraction.text import strip_tags
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

import distributed
from dask_ml.model_selection import GridSearchCV as GridSearchCVBase

In [2]:
# load the data
client_gcs = storage.Client()
bucket = client_gcs.get_bucket('djr-data')

def gcs_to_df(f):
    blob = bucket.blob(f)
    buf = BytesIO()
    blob.download_to_file(buf)
    buf.seek(0)
    return pd.read_csv(buf, encoding = "utf-8")
 
df_train = gcs_to_df("kaggle-jigsaw/train.csv").head(10000)
df_test = gcs_to_df("kaggle-jigsaw/test.csv").head(10000)
yvar = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [3]:
# initialize client for interacting with dask
# DASK_SCHEDULER_ADDRESS env variable specifies scheduler ip
client_dask = distributed.Client()

In [4]:
# correlation matrix
df_train[yvar].corr()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
toxic,1.0,0.308017,0.664821,0.163685,0.642155,0.262158
severe_toxic,0.308017,1.0,0.4014,0.133694,0.364642,0.166033
obscene,0.664821,0.4014,1.0,0.134702,0.714533,0.292142
threat,0.163685,0.133694,0.134702,1.0,0.131717,0.147546
insult,0.642155,0.364642,0.714533,0.131717,1.0,0.353182
identity_hate,0.262158,0.166033,0.292142,0.147546,0.353182,1.0


In [5]:
df_train[yvar].apply(np.mean, axis = 0)

toxic            0.0971
severe_toxic     0.0101
obscene          0.0527
threat           0.0033
insult           0.0494
identity_hate    0.0084
dtype: float64

In [6]:
# train/test split
xdata = df_train.comment_text
ydata = df_train[yvar]
xdata_train, xdata_eval, ydata_train, ydata_eval = train_test_split(xdata, ydata, test_size = 0.2, random_state = 1)

In [7]:
# return words from corpus
# TODO: also try r"([\w][\w']*\w)"
def tokenize(doc, token=r"(?u)\b\w\w+\b"):
    doc = strip_tags(doc.lower())
    doc = re.compile(r"\s\s+").sub(" ", doc)
    words = re.compile(token).findall(doc)
    return words

# remove stop words
def remove_stop_words(x, stop_words=ENGLISH_STOP_WORDS):
    return [i for i in x if i not in stop_words]

In [8]:
# returning basic stats about docs
def basic_stats(docs):
    nwords = docs.apply(lambda x: len(x.split()))
    nchar = docs.apply(len)
    ncap = docs.apply(lambda x: len(re.compile(r"[A-Z]").findall(x)))
    ncap_perc = ncap / nchar
    nexcl = docs.apply(lambda x: len(re.compile(r"!").findall(x)))
    nsmile = docs.apply(lambda x: len(re.compile(r"((?::|;|=)(?:-)?(?:\)|D|P))").findall(x)))
    return pd.DataFrame(data = dict(
        nwords_log = np.log(nwords),
        ncap_perc = ncap_perc,
        nexcl = np.clip(nexcl, 0, 5),
        has_smile = (nsmile > 0).astype(int)
    ))

In [9]:
# wrapper for gensim Phraser
COMMON_TERMS = ["of", "with", "without", "and", "or", "the", "a"]
class PhraseTransformer(TransformerMixin, BaseEstimator):

    def __init__(self, common_terms=COMMON_TERMS):
        self.phraser = None
        self.common_terms = common_terms

    def fit(self, X, y=None):
        phrases = Phrases(X, common_terms=self.common_terms)
        self.phraser = Phraser(phrases)
        return self

    def transform(self, X):
        return X.apply(lambda x: self.phraser[x])

In [10]:
# for making tagged documents
# NOTE: can't use FunctionTransformer since TransformerMixin doesn't pass y to fit_transform anymore
class MakeTaggedDocuments(BaseEstimator):

    def fit(self, X, y):
        return self

    def transform(self, X, y=None):
        if y is not None:
            yvar = list(y.columns)
            tags = y.apply(lambda row: [i for i,j in zip(yvar, row) if j == 1], axis=1)
            return [TaggedDocument(words=w, tags=t) for w,t in zip(X, tags)]
        else:
            return [TaggedDocument(words=w, tags=[]) for w in X]

    def fit_transform(self, X, y):
        return self.transform(X, y)

In [11]:
# wrapper for gensim Doc2Vec
class D2VEstimator(TransformerMixin, BaseEstimator):

    def __init__(self, min_count=10, alpha=0.025, vector_size=200, epochs=20):
        self.min_count = min_count
        self.alpha = alpha
        self.vector_size = vector_size
        self.epochs = epochs
        self.model = Doc2Vec(seed=1, hs=1, negative=0, dbow_words=0, dm=0, min_alpha=0.0001,
                             min_count=self.min_count, alpha=self.alpha, vector_size=self.vector_size, epochs=self.epochs)

    def get_tags(self, doc):
        vec = self.model.infer_vector(doc.words, self.model.alpha, self.model.min_alpha, self.model.epochs)
        return dict(self.model.docvecs.most_similar([vec]))

    def fit(self, X, y=None):
        self.model.build_vocab(X)
        self.model.train(X, epochs=self.model.epochs, total_examples=self.model.corpus_count)
        self.model.delete_temporary_training_data()
        return self

    def transform(self, X):
        pred = [self.get_tags(d) for d in X]
        pred = pd.DataFrame.from_records(data=pred)
        return pred

In [12]:
# create pipeline
pipeline = Pipeline(steps=[
    ('fe', FeatureUnion(transformer_list=[
        ('d2v', Pipeline(steps=[
            ('tk', FunctionTransformer(func=lambda x: x.apply(tokenize), validate=False)),
            ('ph', PhraseTransformer()),
            ('sw', FunctionTransformer(func=lambda x: x.apply(remove_stop_words), validate=False)),
            ('doc', MakeTaggedDocuments()),
            ('d2v', D2VEstimator())
        ])),
        ('bs', FunctionTransformer(func=basic_stats, validate=False))
    ])),
    ('ovr', OneVsRestClassifier(LogisticRegression(class_weight="balanced")))
])

In [13]:
# for non-multimetric, don't require refit = True for best_params_ / best_score_
class GridSearchCV(GridSearchCVBase):

    # For multiple metric evaluation, refit is a string denoting the scorer that should be 
    # used to find the best parameters for refitting the estimator 
    @property
    def scorer_key(self):
        return self.refit if self.multimetric_ else 'score'
    
    @property
    def best_index(self):
        check_is_fitted(self, 'cv_results_')
        return np.flatnonzero(self.cv_results_['rank_test_{}'.format(self.scorer_key)] == 1)[0]

    @property
    def best_params_(self):
        return self.cv_results_['params'][self.best_index]

    @property
    def best_score_(self):
        return self.cv_results_['mean_test_{}'.format(self.scorer_key)][self.best_index]

In [14]:
# hyperparameter tuning
param_grid = {
    'fe__d2v__d2v__min_count': [10, 25],
    'fe__d2v__d2v__alpha': [0.025, 0.05],
    'fe__d2v__d2v__epochs': [10, 20, 30],
    'fe__d2v__d2v__vector_size': [200, 300]
}
            
try:
    with open('model_param_d2v.yaml', 'r') as f:
        param_optimal = yaml.load(f)
except IOError:
    param_optimal = {}

    # create tuner
    tuner = GridSearchCV(pipeline, param_grid, scheduler=client_dask, scoring='roc_auc', 
                         cv=3, refit=False, return_train_score=False)
    
    # determine optimal hyperparameters
    tuner.fit(xdata_train, ydata_train)
    print('Best params: %s' % (str(tuner.best_params_)))
    print('Best params score: %s' % (str(tuner.best_score_)))
    
    # save best params
    with open('model_param_d2v.yaml', 'w') as f:
        yaml.dump(param_optimal, f)

In [15]:
# build model with optimal param
pipeline.set_params(**param_optimal)
pipeline.fit(xdata_train, ydata_train)

Pipeline(memory=None,
     steps=[('fe', FeatureUnion(n_jobs=1,
       transformer_list=[('d2v', Pipeline(memory=None,
     steps=[('tk', FunctionTransformer(accept_sparse=False,
          func=<function <lambda> at 0x7f500c97bc80>, inv_kw_args=None,
          inverse_func=None, kw_args=None, pass_y='deprecated',
          va...=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False),
          n_jobs=1))])

In [16]:
# apply to eval set
ydata_eval_pred = pipeline.predict_proba(xdata_eval)

In [17]:
# calculate auc on eval set
auc = [roc_auc_score(ydata_eval[y], ydata_eval_pred[:,i]) for i,y in enumerate(yvar)]
print('Model AUCs: %s' % auc)
print('Avg AUC: %s' % np.mean(auc))

Model AUCs: [0.9171005870841488, 0.9484245700589584, 0.9399900600454398, 0.8828643216080402, 0.9281275901858704, 0.9536998948763655]
Avg AUC: 0.9283678373098039


In [18]:
# genrate final model
pipeline_final = clone(pipeline)
pipeline_final.set_params(**param_optimal)
pipeline_final.fit(xdata, ydata)

Pipeline(memory=None,
     steps=[('fe', FeatureUnion(n_jobs=1,
       transformer_list=[('d2v', Pipeline(memory=None,
     steps=[('tk', FunctionTransformer(accept_sparse=False,
          func=<function <lambda> at 0x7f500c97bc80>, inv_kw_args=None,
          inverse_func=None, kw_args=None, pass_y='deprecated',
          va...=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False),
          n_jobs=1))])

In [20]:
# generate output
xdata_test = df_test.comment_text
ydata_test_pred = pipeline.predict_proba(xdata_test)
ydata_test_pred = pd.DataFrame(data=ydata_test_pred, columns=yvar)
ydata_test_pred['id'] = df_test.id
ydata_test_pred.to_csv('submission.csv', index=False)