In [1]:
from nltk import word_tokenize
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split

In [2]:
dataset = pd.read_csv('data/train.csv').set_index('id')

In [3]:
documents = dataset.comment_text
documents.head()

id
0000997932d777bf    Explanation\nWhy the edits made under my usern...
000103f0d9cfb60f    D'aww! He matches this background colour I'm s...
000113f07ec002fd    Hey man, I'm really not trying to edit war. It...
0001b41b1c6bb37e    "\nMore\nI can't make any real suggestions on ...
0001d958c54c6e35    You, sir, are my hero. Any chance you remember...
Name: comment_text, dtype: object

## new bag

In [4]:
original_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

X, y = dataset.comment_text, dataset[original_columns]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

for s in X_train, X_test, y_train, y_test:
    print(s.shape)

(127656,)
(31915,)
(127656, 6)
(31915, 6)


In [5]:
our_columns = original_columns[:1]
our_columns

['toxic']

# ad-hoc begins

In [6]:
col = our_columns[0]

### vectorize

In [7]:
%%time
vectorizer = CountVectorizer(
    analyzer="word", stop_words='english', ngram_range=(1, 2), max_features=100000)
vect_train = vectorizer.fit_transform(X_train)

CPU times: user 22.3 s, sys: 404 ms, total: 22.7 s
Wall time: 22.7 s


##### peek at most common tokens

In [8]:
pd.DataFrame(vect_train.sum(axis=0).T, index=vectorizer.get_feature_names())\
    [0].sort_values(ascending=False)

article                46321
wikipedia              38945
page                   37060
talk                   29794
like                   22456
just                   22316
don                    18924
think                  16047
know                   15195
people                 14870
edit                   14639
articles               13532
use                    13361
time                   12787
did                    11929
user                   11325
thanks                 11135
talk page              10784
make                   10523
good                   10321
ve                      9834
information             9736
does                    9708
want                    9301
deletion                9166
way                     9091
sources                 9011
image                   8830
wp                      8735
pages                   8605
                       ...  
probly                     5
subject topic              5
subject edit               5
subject covere

In [9]:
vect_train.shape

(127656, 100000)

### fit classifier

In [10]:
from sklearn.svm import LinearSVC
# from sklearn.naive_bayes import GaussianNB

In [11]:
%%time
classifier = LinearSVC()
classifier.fit(vect_train, y_train[col])

CPU times: user 6.24 s, sys: 4 ms, total: 6.25 s
Wall time: 6.24 s


### predict

In [12]:
%%time
vect_test = vectorizer.transform(X_test)
y_pred = pd.Series(classifier.predict(vect_test), y_test.index)
score = classifier.score(vect_test, y_test[col])
print(score)

0.952968823437
CPU times: user 2.64 s, sys: 0 ns, total: 2.64 s
Wall time: 2.64 s


### log losses: simple, cutoff 4%, cutoff from score

In [13]:
ll_simple = log_loss(y_test[col], y_pred)
print(ll_simple)

cutoff = 0.04
y_pred_cut004 = y_pred.apply(lambda x: min(1 - cutoff, max(cutoff, x)))
ll_cut004 = log_loss(y_test[col], y_pred_cut004)
print(ll_cut004)

cutoff = 1 - score
y_pred_score_cut = y_pred.apply(lambda x: min(1 - cutoff, max(cutoff, x)))
ll_score_cut = log_loss(y_test[col], y_pred_score_cut)
print(ll_score_cut)

1.62441580146
0.190289605341
0.189679152518
