In [1]:
import re
import numpy as np
import pandas as pd
from scipy import sparse
from datetime import datetime

import matplotlib.pyplot as plt
%matplotlib inline

from wordcloud import WordCloud

In [33]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split

from sklearn.metrics import log_loss, confusion_matrix

from sklearn.feature_selection import SelectKBest, chi2, f_classif, mutual_info_classif, SelectPercentile

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [3]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [4]:
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')

In [5]:
classes = [x for x in train.columns if x not in ['id', 'comment_text']]

In [6]:
test.fillna(' ', inplace=True)
train.fillna(' ', inplace=True)

In [7]:
def clean_text(col):
    col = col.str.lower()
    col = col.replace(r'\n', ' ').replace(r'\t', ' ')
    col = col.replace(r'[^a-z\s]', '', regex=True)
    col = col.replace(r'\s+', ' ', regex=True)
    col = col.replace(r"([a-z]+?)\1+", r"\1\1", regex=True) # removes any repetitions of letters more than twice
    col = col.replace(r"\b(\w+)(\s)(\1\2?)+", r"\1", regex=True) # removes any repetitions of words more than once
    col = col.str.strip()
    return col

In [8]:
train['comment_text_clean'] = clean_text(train['comment_text'])
test['comment_text_clean'] = clean_text(test['comment_text'])

In [9]:
X_train_df, X_val_df, y_train, y_val = train_test_split(train['comment_text_clean'], 
                                                        train[classes], 
                                                        test_size=0.3)

In [40]:
class vectorizer():
    def __init__(self, analyzer='both'):
        self.analyzer = analyzer
        self.vect_words = TfidfVectorizer(max_features=None, 
                                         analyzer='word', 
                                         ngram_range=(1, 3), 
                                         max_df=0.1, 
                                         min_df=3,
                                         stop_words='english',
                                         use_idf=True)
        self.vect_chars = TfidfVectorizer(max_features=None, 
                                         analyzer='char', 
                                         ngram_range=(1, 5), 
                                         max_df=1., 
                                         min_df=1,
                                         use_idf=True)
        
    def fit(self, col):
        if self.analyzer == 'both':
            self.vect_words.fit(col)
            self.vect_chars.fit(col)
        elif self.analyzer == 'word':
            self.vect_words.fit(col)
        elif self.analyzer == 'char':
            self.vect_chars.fit(col)
        else:
            print('no valid analyzer chosen')
        
    def transform(self, col):
        if self.analyzer == 'both':
            vec = sparse.hstack([self.vect_words.transform(col), 
                                 self.vect_chars.transform(col)])
        elif self.analyzer == 'word':
            vec = self.vect_words.transform(col)
        elif self.analyzer == 'char':
            vec = self.vect_chars.transform(col)
        else:
            raise Exception('no valid analyzer chosen')
            
        return vec

In [52]:
vect = vectorizer(analyzer='word')

In [63]:
vect.fit(train['comment_text_clean'].append(test['comment_text_clean']))

In [64]:
X_train = vect.transform(X_train_df)
X_val = vect.transform(X_val_df)

In [65]:
X_train.shape

(67095, 607218)

In [66]:
# the docker container is too small to run the gridsearch
%env JOBLIB_TEMP_FOLDER=/tmp

env: JOBLIB_TEMP_FOLDER=/tmp


In [67]:
%%time
models = {}
feature_selector = {}
for toxicity in classes:
    selec = SelectPercentile(chi2, 
                             percentile=20)
    clf = LogisticRegression(C=5.0, class_weight=None, n_jobs=1)  
    #clf = GradientBoostingClassifier()
    pipe = Pipeline(steps=[('selec', selec), 
                           ('clf', clf)])
    parameters = {'selec__percentile':[1, 3, 6, 8], 
                  'clf__C':[1, 5, 7, 10, 15]}
    est = GridSearchCV(pipe, 
                       parameters, 
                       scoring='neg_log_loss', 
                       n_jobs=-1,
                       cv=3, 
                       verbose=1)
    est.fit(X_train, y_train[toxicity])
    models[toxicity] = est
    print(est.best_params_)
    print(est.best_score_)
    print("Model for %s trained" % toxicity, flush=True)

Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  4.1min finished


{'clf__C': 10, 'selec__percentile': 3}
-0.12636351563
Model for toxic trained
Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  4.3min finished


{'clf__C': 5, 'selec__percentile': 1}
-0.0317368187626
Model for severe_toxic trained
Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  4.0min finished


{'clf__C': 15, 'selec__percentile': 1}
-0.0713910449415
Model for obscene trained
Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  4.0min finished


{'clf__C': 10, 'selec__percentile': 8}
-0.0129166544542
Model for threat trained
Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  4.0min finished


{'clf__C': 7, 'selec__percentile': 3}
-0.0886363386131
Model for insult trained
Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  4.0min finished


{'clf__C': 7, 'selec__percentile': 1}
-0.0284944258322
Model for identity_hate trained
CPU times: user 1min 27s, sys: 2.82 s, total: 1min 30s
Wall time: 25min 28s


In [68]:
predictions = pd.DataFrame()
predictions_tr = pd.DataFrame()
loss = 0
loss_tr = 0
for toxicity in classes:
    predictions[toxicity] = models[toxicity].best_estimator_.predict_proba(X_val)[:, 1]
    predictions_tr[toxicity] = models[toxicity].best_estimator_.predict_proba(X_train)[:, 1]
    print(toxicity)
    ll = log_loss(y_val[toxicity], predictions[toxicity])
    ll_tr = log_loss(y_train[toxicity], predictions_tr[toxicity])
    print('test log-loss: %s' % str(ll.round(3)))
    print('train log-loss: %s' % str(ll_tr.round(3)))
    loss = loss + ll
    loss_tr = loss_tr + ll_tr
    print('test confusion matrix')
    print(confusion_matrix(y_val[toxicity], models[toxicity].best_estimator_.predict(X_val)))
print('test mean log-loss: %s' % str(loss/6.))
print('train mean log-loss: %s' % str(loss_tr/6.))

toxic
test log-loss: 0.124
train log-loss: 0.062
test confusion matrix
[[25844   157]
 [ 1131  1624]]
severe_toxic
test log-loss: 0.03
train log-loss: 0.016
test confusion matrix
[[28429    50]
 [  245    32]]
obscene
test log-loss: 0.064
train log-loss: 0.037
test confusion matrix
[[27147    86]
 [  556   967]]
threat
test log-loss: 0.013
train log-loss: 0.003
test confusion matrix
[[28657     7]
 [   82    10]]
insult
test log-loss: 0.084
train log-loss: 0.042
test confusion matrix
[[27207   156]
 [  727   666]]
identity_hate
test log-loss: 0.03
train log-loss: 0.011
test confusion matrix
[[28476    15]
 [  229    36]]
test mean log-loss: 0.0576875404096
train mean log-loss: 0.0285301237115


In [None]:
df = pd.DataFrame(X_val_df.reset_index(drop=True))
df['label'] = y_val['toxic'].reset_index(drop=True)
df['pred'] = predictions['toxic']

In [None]:
def generate_cloud(col):
    text = '.. '.join(list(col))
    wordcloud = WordCloud(max_font_size=40).generate(text)
    plt.figure(figsize=(15,7))
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.show()

In [None]:
generate_cloud(df[(df.label==1) & (df.pred>0.5)].comment_text_clean)

In [None]:
df[df.comment_text_clean.str.contains('hate u')]

In [None]:
df[(df.label==0) & (df.pred>0.5)].comment_text_clean.iloc[107]

In [None]:
train[train.comment_text.str.contains('I promise you')]['comment_text'].iloc[0]

# predictions on the test set

In [None]:
Xtest = vect.transform(test['comment_text_clean'])

In [None]:
predictions = pd.DataFrame(test.id)
for toxicity in classes:
    predictions[toxicity] = models[toxicity].predict_proba(feature_selector[toxicity].transform(Xtest))[:, 1]

In [None]:
predictions.to_csv(datetime.now().strftime('%Y%m%d%H%M')+'_submission.csv', index=False)