In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
# Read the training CSV, using its `id` column as the row index.
dataset = pd.read_csv('data/train.csv', index_col='id')

In [3]:
# Keep every column after the first one; these are used as the target columns.
# NOTE(review): assumes position 0 holds `comment_text` — confirm against the CSV.
original_columns = dataset.columns[1:]
original_columns

Index(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
       'identity_hate'],
      dtype='object')

In [4]:
# Features are the raw comment strings; targets are the label columns.
X = dataset['comment_text']
y = dataset[original_columns]

# Hold out 20% of the rows for evaluation; the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)

# Sanity check: print the shape of each of the four pieces.
for part in (X_train, X_test, y_train, y_test):
    print(part.shape)

(127656,)
(31915,)
(127656, 6)
(31915, 6)


In [5]:
# Work on a separate copy of the label list so it can be narrowed later
# without touching `original_columns`.
our_columns = original_columns.copy()
our_columns

Index(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
       'identity_hate'],
      dtype='object')

# Vectorize to get a bag of words

In [6]:
# define a vectorizer here

from nltk import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
# Word-level unigrams and bigrams, tokenised with NLTK's `word_tokenize`.
# max_df=0.5 drops n-grams that occur in more than half of the documents.
vectorizer = CountVectorizer(
    analyzer="word",
    tokenizer=word_tokenize,
    ngram_range=(1, 2),
    max_df=0.5,
)

In [7]:
%%time
# Learn the vocabulary from the training comments only and encode them as a
# document-term count matrix in the same pass.
vect_train = vectorizer.fit_transform(X_train)

CPU times: user 1min 57s, sys: 696 ms, total: 1min 58s
Wall time: 1min 58s


##### Peek at the most common tokens

In [8]:
# `get_feature_names` was removed from newer scikit-learn releases in favour
# of `get_feature_names_out`; use whichever one this installation provides.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# Summing the document-term matrix over rows gives the total count of each
# n-gram across the training set. The sum comes back 2-D (1 x n_features),
# so flatten it before labelling it with the feature names.
token_totals = pd.Series(
    np.asarray(vect_train.sum(axis=0)).ravel(), index=feature_names, name=0)

# Show the 20 most frequent n-grams.
token_totals.sort_values(ascending=False)[:20]

''       195335
of       179591
you      172937
is       144467
that     128526
``       124835
it       118395
in       116092
!         86829
for       82139
this      77983
not       77438
)         72695
on        71852
(         68374
be        66634
:         66611
'' ''     65075
as        62085
have      59286
Name: 0, dtype: int64

In [9]:
# (rows = training comments, columns = n-gram features kept by the vectorizer)
vect_train.shape

(127656, 2057546)

## Fit a classifier for each output column and make some predictions

In [10]:
# define a classifier here

from sklearn.svm import LinearSVC
# Seed the solver so repeated runs of the notebook give the same model;
# the same seed value is used for the train/test split.
classifier = LinearSVC(random_state=42)

In [11]:
%%time

# The encoded test set does not depend on the label being fitted, so build it
# once here instead of re-tokenising X_test on every pass through the loop.
vect_test = vectorizer.transform(X_test)

predictions_simple = []  # one Series of predicted labels per target column
scores = []              # accuracy on the test split, in the same order
for n, col in enumerate(our_columns):
    print('{}/{}: {}'.format(n + 1, len(our_columns), col))

    print('let\'s fit a classifier', end='... ')
    # The single `classifier` object is refit from scratch for each column;
    # its predictions are taken right away, before the next refit.
    classifier.fit(vect_train, y_train[col])
    print('done!')

    print('let\'s predict', end='... ')
    # Index the predictions like y_test so they can be compared row by row.
    y_pred = pd.Series(classifier.predict(vect_test), index=y_test.index)
    predictions_simple.append(y_pred)

    # Plain accuracy: fraction of test rows where prediction equals the label.
    score = (y_pred == y_test[col]).mean()
    scores.append(score)
    print('done! score: {:.3f}'.format(score))

1/6: toxic
let's fit a classifier... done!
let's predict... done! score: 0.958
2/6: severe_toxic
let's fit a classifier... done!
let's predict... done! score: 0.989
3/6: obscene
let's fit a classifier... done!
let's predict... done! score: 0.978
4/6: threat
let's fit a classifier... done!
let's predict... done! score: 0.997
5/6: insult
let's fit a classifier... done!
let's predict... done! score: 0.970
6/6: identity_hate
let's fit a classifier... done!
let's predict... done! score: 0.991
CPU times: user 5min 42s, sys: 228 ms, total: 5min 43s
Wall time: 5min 43s


In [12]:
# Put the per-column predictions side by side, one column per label.
# The names come from `our_columns` because that is the list the predictions
# were generated from; labelling with a different list would mislabel or
# fail as soon as the two lists differ.
predf_simple = pd.concat(predictions_simple, axis=1, keys=list(our_columns))

In [13]:
from sklearn.metrics import log_loss

def mean_log_loss(predf, y_true=None):
    """Return the unweighted mean of the column-wise log losses.

    Parameters
    ----------
    predf : pandas.DataFrame
        Predictions, one column per label. Each value is passed to
        ``log_loss`` as the predicted score for the positive class.
    y_true : pandas.DataFrame, optional
        True labels; must contain every column of ``predf`` with rows in the
        same order. Defaults to the notebook-level ``y_test``.

    Returns
    -------
    float
        Mean of ``log_loss`` over the columns of ``predf``.
    """
    if y_true is None:
        y_true = y_test
    # Pair prediction and truth columns by name rather than by position, so a
    # reordered or reduced set of prediction columns is still scored against
    # the right labels.
    # labels=[0, 1] prevents log_loss from raising when the true labels of a
    # column happen to contain only one class.
    return np.mean([
        log_loss(y_true[col], predf[col], labels=[0, 1])
        for col in predf.columns
    ])

In [14]:
# Baseline: score the raw class predictions as if they were probabilities.
mean_log_loss(predf_simple)

0.67241932783619651

In [15]:
# Map 0s and 1s to global cutoff values: every prediction is pulled into
# [cutoff, 1 - cutoff], so a wrong answer is no longer scored as a fully
# confident one.
cutoff = 0.04
# Vectorised clip instead of an element-wise Python lambda; cast to float
# first so the fractional bounds can be stored.
predf_cut_004 = predf_simple.astype(float).clip(lower=cutoff, upper=1 - cutoff)
mean_log_loss(predf_cut_004)

0.10269355271931069

In [16]:
# Map 0s and 1s to column-wise cutoff values: each column is clipped to
# [1 - accuracy, accuracy] using that column's own test accuracy.
# NOTE(review): the cutoffs come from accuracy measured on the same test
# split that is scored below, so this figure is presumably optimistic —
# consider deriving them from a separate validation split.
cutoffs = [1 - score for score in scores]

# Attach the column labels so the bounds are aligned to columns by name.
column_floor = pd.Series(cutoffs, index=predf_simple.columns)
predf_score_cut = predf_simple.astype(float).clip(
    lower=column_floor, upper=1 - column_floor, axis=1)
mean_log_loss(predf_score_cut)

0.090913877200452894