In [1]:
import datetime
import re

import numpy as np
import pandas as pd
from scipy import sparse

In [50]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split

from sklearn.metrics import log_loss, confusion_matrix

from sklearn.feature_selection import SelectKBest, chi2, f_classif, mutual_info_classif

In [3]:
# Target label columns; one binary classifier is trained per entry.
CLASSES = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

In [4]:
# Read the training and test CSV files from the sibling `input` directory.
train = pd.read_csv(filepath_or_buffer='../input/train.csv')
test = pd.read_csv(filepath_or_buffer='../input/test.csv')

In [5]:
# The test data contains a NaN (one, per the original note). Replace missing
# values with a single space so the string-based normalization applied later
# does not fail on a non-string value.
test.fillna(value=' ', inplace=True)

In [13]:
def normalize(text):
    """Reduce a comment to lowercase ASCII letters separated by single spaces.

    The text is lowercased, every run of characters outside ``a-z`` (digits,
    punctuation, underscores, whitespace, non-ASCII characters) is replaced
    by one space, and leading/trailing spaces are stripped.

    Parameters
    ----------
    text : str
        Raw comment text.

    Returns
    -------
    str
        The normalized text; an empty string when ``text`` contains no
        ASCII letters.
    """
    # A single pass over runs of non-letters is enough: newlines, tabs, other
    # whitespace and every non-word character are all outside `a-z`, so
    # separate substitutions for them (and a later whitespace collapse) would
    # yield the same result. This also avoids the invalid escape sequences
    # ('\W', '\s') that non-raw pattern strings would contain.
    return re.sub(r'[^a-z]+', ' ', text.lower()).strip()

In [14]:
# Add a `normalized` column holding the cleaned comment text to both frames.
for frame in (train, test):
    frame['normalized'] = frame['comment_text'].map(normalize)

In [45]:
# Two TF-IDF views of the same text: word unigrams and character 1- to 5-grams.
# Neither vocabulary is capped (max_features=None) or frequency-filtered
# (max_df=1.0, min_df=1).
vect_words = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 1),
    max_features=None,
    max_df=1.0,
    min_df=1,
)
vect_chars = TfidfVectorizer(
    analyzer='char',
    ngram_range=(1, 5),
    max_features=None,
    max_df=1.0,
    min_df=1,
)

In [46]:
# Fit each vectorizer on the normalized training text and build its matrix.
train_vec_words = vect_words.fit_transform(train['normalized'])
train_vec_chars = vect_chars.fit_transform(train['normalized'])

# Place the word and character feature columns side by side in one sparse matrix.
train_vec = sparse.hstack((train_vec_words, train_vec_chars))

In [47]:
# Hold out 33% of the rows for validation. A fixed random_state keeps the
# split, and therefore the validation metrics computed from it, the same on
# every rerun of the notebook.
X_train, X_val, y_train, y_val = train_test_split(
    train_vec, train[CLASSES], test_size=0.33, random_state=42
)

In [None]:
%%time
models = {}
feature_selector = {}
for toxicity in CLASSES:
    feature_selector[toxicity] = SelectKBest(mutual_info_classif, k=1000).fit(X_train, y_train[toxicity])
    m = LogisticRegression(C=3.0, class_weight=None)  
    #m = RandomForestClassifier(n_estimators=40, 
    #                           max_depth=None, 
    #                           bootstrap=True,
    #                           n_jobs=-1,
    #                           verbose=1, 
    #                           warm_start=False, 
    #                           class_weight=None)
    #m = SVC(C=1.0, 
    #        kernel="rbf", 
    #        degree=3, 
    #        gamma="auto", 
    #        coef0=0.0, 
    #        probability=True,
    #        class_weight=None, 
    #        verbose=True)
    m.fit(feature_selector[toxicity].transform(X_train), y_train[toxicity])
    models[toxicity] = m
    print("Model for %s trained" % toxicity, flush=True)

Model for toxic trained
Model for severe_toxic trained
Model for obscene trained


In [None]:
# Evaluate every per-label model on the held-out validation split: print the
# log-loss of the predicted positive-class probability and the confusion
# matrix of the hard predictions, then the mean log-loss across all labels.
predictions = pd.DataFrame()
loss = 0
for toxicity in CLASSES:
    # Select the validation columns once and reuse them for both predictions.
    X_val_selected = feature_selector[toxicity].transform(X_val)
    predictions[toxicity] = models[toxicity].predict_proba(X_val_selected)[:, 1]
    print(toxicity)
    ll = log_loss(y_val[toxicity], predictions[toxicity])
    loss = loss + ll
    # Built-in round() works for both a plain Python float and a NumPy scalar.
    print(round(ll, 2))
    print(confusion_matrix(y_val[toxicity], models[toxicity].predict(X_val_selected)))
# Average over the actual number of labels instead of a hard-coded count.
print('mean log-loss: %s' % str(loss / len(CLASSES)))

# predictions on the test set

In [None]:
# Vectorize the normalized test text with the already-fitted vectorizers
# (transform only), then stack the word and character columns side by side.
Xtest_words = vect_words.transform(test['normalized'])
Xtest_chars = vect_chars.transform(test['normalized'])
Xtest = sparse.hstack((Xtest_words, Xtest_chars))

In [None]:
# Build the submission frame: the test ids plus one probability column per label.
predictions = pd.DataFrame(test.id)
for toxicity in CLASSES:
    # Each model was fitted on the columns chosen by its own selector, so the
    # test matrix must be reduced with that same selector before predicting;
    # passing the full Xtest would not match the model's feature count.
    Xtest_selected = feature_selector[toxicity].transform(Xtest)
    predictions[toxicity] = models[toxicity].predict_proba(Xtest_selected)[:, 1]

In [None]:
# Write the submission to a file prefixed with the current local time
# (YYYYmmddHHMM) so successive runs do not overwrite each other.
# Requires `import datetime` in the notebook's import cell.
submission_path = datetime.datetime.now().strftime('%Y%m%d%H%M') + '_submission.csv'
predictions.to_csv(submission_path, index=False)