In [6]:
import sys
sys.path.append('..')

from __future__ import division, print_function 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.metrics import log_loss
from sklearn.linear_model import LogisticRegression
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict 
%matplotlib inline
%load_ext autoreload
%autoreload 2

from evaluation import cross_validate_multilabel, multilabel_results

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
import re, string
# Characters that become standalone tokens: ASCII punctuation plus a few typographic symbols.
# string.punctuation is escaped before being placed inside the character class. Unescaped, its
# `\]` sequence is parsed as an escaped closing bracket, which silently leaves the backslash
# itself out of the class so it is never split off as a token.
re_tok = re.compile('([{}“”¨«»®´·º½¾¿¡§£₤‘’])'.format(re.escape(string.punctuation)))


def tokenize(s):
    """Split ``s`` into tokens, treating every punctuation character as its own token.

    Each character matched by ``re_tok`` is padded with spaces, then the string is split on
    whitespace.

    Args:
        s: text to tokenize.

    Returns:
        list of token strings; an empty list for empty or whitespace-only input.
    """
    return re_tok.sub(r' \1 ', s).split()

In [3]:
# Label columns, in the order used for the target matrix.
toxic_classes = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Training data: raw comment text plus one label column per class.
# na_filter=False stops pandas from converting empty / "NA"-like cells to NaN.
train_df = pd.read_csv('../data/train_new.csv', na_filter=False)
X_text_train = train_df['comment_text'].values
Y_train = train_df[toxic_classes].values
del train_df  # only the extracted arrays are needed from here on

# Test data: comment text and the ids that accompany the predictions.
test_df = pd.read_csv('../data/test_new.csv', na_filter=False)
X_text_test = test_df['comment_text'].values
id_test = test_df['id']
del test_df

In [4]:
%%time
# Word uni- and bi-gram TF-IDF features built with the custom `tokenize` function.
# The on/off options are passed as real booleans rather than 1: the meaning is the same on
# older scikit-learn, while newer releases validate parameter types and reject integers here.
vec = TfidfVectorizer(
    ngram_range=(1, 2),
    tokenizer=tokenize,
    min_df=3,             # ignore terms that occur in fewer than 3 comments
    max_df=0.9,           # ignore terms that occur in more than 90% of comments
    strip_accents='unicode',
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,    # scale term frequency as 1 + log(tf)
)

# Learn the vocabulary and IDF weights from the training comments only, then apply them to test.
X_train = vec.fit_transform(X_text_train)
X_test = vec.transform(X_text_test)

In [7]:
# Logistic regression solved in its dual formulation. scikit-learn only implements the dual
# for the liblinear solver, so the solver is requested explicitly: relying on the library default
# breaks on releases where the default is 'lbfgs', which raises an error for dual=True.
lr = LogisticRegression(C=4, dual=True, solver='liblinear')

In [8]:
%%time
# Cross-validate `lr` on the TF-IDF features using the project helper imported from `evaluation`,
# passing cv=10 and scoring='neg_log_loss'.
# NOTE(review): presumably the helper scores each label column of Y_train separately — confirm
# against evaluation.cross_validate_multilabel.
cv_scores = cross_validate_multilabel(lr, X_train, Y_train, cv=10, scoring='neg_log_loss')

  " = {}.".format(self.n_jobs))


CPU times: user 9min 32s, sys: 12.4 s, total: 9min 45s
Wall time: 9min 58s


In [12]:
# Display the cross-validation scores with the project helper from `evaluation`, labelling the
# single result row 'cv'. In the recorded output this renders one column per class plus `all`.
multilabel_results(cv_scores, toxic_classes, ['cv'])

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate,all
cv,-0.10602,-0.026379,-0.06158,-0.009354,-0.074062,-0.025592,-0.050498


In [9]:
# Wrap `lr` in a one-vs-rest classifier and train it on the full training set.
# `fit` returns the fitted estimator, so construction and training are chained.
models = OneVsRestClassifier(lr).fit(X_train, Y_train)

# Predicted probabilities for the test comments.
Y_test_prob = models.predict_proba(X_test)

In [10]:
# Put the predicted probabilities in a frame with one named column per class,
# place the test ids alongside them, and write the result as the submission CSV.
prob_frame = pd.DataFrame(Y_test_prob, columns=toxic_classes)
df_submit = pd.concat([id_test, prob_frame], axis=1)
df_submit.to_csv('../results/m002.csv', index=False)

In [13]:
# Submit the predictions file from the shell.
# NOTE(review): `kg` is presumably the kaggle-cli client — confirm it is installed and logged in.
!kg submit ../results/m002.csv

0.067


There appear to be some discrepancies between the new "private" and "public" test sets, leading to overfitting; however, this has been noted across the board in several threads on the forum. Suggested explanations are a different underlying class distribution in the hold-out set, or inconsistent pre-processing.