In [91]:
import re
import numpy as np
import pandas as pd
from scipy import sparse
from datetime import datetime

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split

from sklearn.metrics import log_loss, confusion_matrix

from sklearn.feature_selection import SelectKBest, chi2, f_classif, mutual_info_classif

In [3]:
# Label columns to predict; one binary model is trained for each of them.
CLASSES = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

In [4]:
# Read the training and test CSV files from the input directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../input/train.csv', '../input/test.csv')
)

In [5]:
# The test data contains one NaN, so we have to replace it with something
# (a single space) before the text can be processed.
test = test.fillna(' ')

In [6]:
# One or more consecutive characters that are not lowercase ASCII letters.
_NON_LETTERS = re.compile(r'[^a-z]+')


def normalize(text):
    """Reduce a comment to lowercase ASCII words separated by single spaces.

    The text is lowercased, every run of characters other than ``a``-``z``
    (whitespace, digits, punctuation, underscores, non-ASCII letters) is
    replaced by one space, and leading/trailing spaces are removed.

    Parameters
    ----------
    text : str or None or float
        The raw comment. ``None`` and float NaN (how pandas represents a
        missing cell) are treated as an empty comment.

    Returns
    -------
    str
        The normalized text; ``''`` when nothing but non-letters was given.
    """
    # Missing values have no .lower(); map them to the empty string instead
    # of raising. NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    # A single raw-string pattern replaces the former chain of substitutions
    # (newline/tab, \W, [^a-z], \s+): once everything outside a-z becomes a
    # space, collapsing runs in the same pass yields the identical result.
    return _NON_LETTERS.sub(' ', text.lower()).strip()

In [7]:
# Add a 'normalized' column holding the cleaned comment text to both frames.
for frame in (train, test):
    frame['normalized'] = frame['comment_text'].map(normalize)

In [81]:
# Settings shared by both vectorizers: no cap on vocabulary size, and the
# same max_df / min_df document-frequency cut-offs.
tfidf_shared = dict(max_features=None, max_df=0.1, min_df=10)

# Word-level features: unigrams only.
vect_words = TfidfVectorizer(analyzer='word', ngram_range=(1, 1), **tfidf_shared)

# Character-level features: n-grams of length 1 through 5.
vect_chars = TfidfVectorizer(analyzer='char', ngram_range=(1, 5), **tfidf_shared)

In [82]:
# Fit each vectorizer on the normalized training comments and vectorize them.
# The two fits are independent of one another.
train_vec_chars = vect_chars.fit_transform(train['normalized'])
train_vec_words = vect_words.fit_transform(train['normalized'])

# Stack the two feature blocks side by side (word columns first, character
# columns second) into a single sparse matrix.
train_vec = sparse.hstack((train_vec_words, train_vec_chars))

In [83]:
# Display the stacked matrix dimensions: (rows, word columns + character columns).
train_vec.shape

(95851, 170559)

In [84]:
# Hold out 1% of the training rows for validation. The seed is fixed so the
# split, and every metric computed from it, is the same after a kernel
# restart; without it each run evaluates on a different random subset.
SPLIT_SEED = 42
X_train, X_val, y_train, y_val = train_test_split(
    train_vec,
    train[CLASSES],
    test_size=0.01,
    random_state=SPLIT_SEED,
)

In [85]:
%%time
# One feature selector and one classifier per label, keyed by label name.
models = {}
feature_selector = {}
for toxicity in CLASSES:
    labels = y_train[toxicity]

    # Keep the 20000 features with the highest chi-squared score for this label.
    selector = SelectKBest(chi2, k=20000)
    X_train_selected = selector.fit_transform(X_train, labels)

    # Other estimators that can be swapped in here:
    #   RandomForestClassifier(n_estimators=40, max_depth=None, bootstrap=True,
    #                          n_jobs=-1, verbose=1, warm_start=False,
    #                          class_weight=None)
    #   SVC(C=1.0, kernel="rbf", degree=3, gamma="auto", coef0=0.0,
    #       probability=True, class_weight=None, verbose=True)
    clf = LogisticRegression(C=3.0, class_weight=None)
    clf.fit(X_train_selected, labels)

    feature_selector[toxicity] = selector
    models[toxicity] = clf
    print("Model for %s trained" % toxicity, flush=True)

Model for toxic trained
Model for severe_toxic trained
Model for obscene trained
Model for threat trained
Model for insult trained
Model for identity_hate trained
CPU times: user 1min 29s, sys: 900 ms, total: 1min 30s
Wall time: 50.9 s


In [86]:
# Evaluate every per-label model on the held-out rows: log-loss of the
# predicted positive-class probability plus a confusion matrix of the hard
# predictions, followed by the log-loss averaged over all labels.
predictions = pd.DataFrame()
loss = 0
for toxicity in CLASSES:
    # Select the features once and reuse the result for both
    # predict_proba and predict.
    X_val_selected = feature_selector[toxicity].transform(X_val)
    predictions[toxicity] = models[toxicity].predict_proba(X_val_selected)[:, 1]
    print(toxicity)
    # labels=[0, 1] keeps both metrics well-defined (and the matrix 2x2) when
    # the small validation split contains no positive example of a rare label.
    ll = log_loss(y_val[toxicity], predictions[toxicity], labels=[0, 1])
    loss = loss + ll
    # Built-in round() works whether log_loss returns a NumPy scalar or a float.
    print(round(ll, 2))
    print(confusion_matrix(y_val[toxicity],
                           models[toxicity].predict(X_val_selected),
                           labels=[0, 1]))
# Average over however many labels are configured instead of a literal 6.
print('mean log-loss: %s' % str(loss / len(CLASSES)))

toxic
0.1
[[859   7]
 [ 28  65]]
severe_toxic
0.04
[[941   5]
 [  9   4]]
obscene
0.05
[[904   5]
 [ 12  38]]
threat
0.01
[[955   0]
 [  4   0]]
insult
0.06
[[905   4]
 [ 17  33]]
identity_hate
0.02
[[950   1]
 [  6   2]]
mean log-loss: 0.0466514011199


# Predictions on the test set

In [87]:
# Vectorize the normalized test comments with the already-fitted vectorizers.
Xtest_words = vect_words.transform(test.normalized)
Xtest_chars = vect_chars.transform(test.normalized)

# Stack in the same column order as the training matrix (words, then chars).
# hstack returns COO by default; asking for CSR here builds the row-sliceable
# format once, rather than leaving a COO matrix to be converted again each
# time a per-label feature selector transforms it.
Xtest = sparse.hstack([Xtest_words, Xtest_chars], format='csr')

In [89]:
# Start the submission frame from the test ids, then append one column per
# label holding the predicted probability of the positive class.
predictions = test['id'].to_frame()
for toxicity in CLASSES:
    Xtest_selected = feature_selector[toxicity].transform(Xtest)
    positive_proba = models[toxicity].predict_proba(Xtest_selected)[:, 1]
    predictions[toxicity] = positive_proba

In [92]:
# Write the submission to a file whose name starts with the current timestamp
# (year, month, day, hour, minute); the index is not written.
timestamp = datetime.now().strftime('%Y%m%d%H%M')
predictions.to_csv(timestamp + '_submission.csv', index=False)