In [5]:
from nltk import word_tokenize
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split

In [6]:
dataset = pd.read_csv('data/train.csv').set_index('id')

In [7]:
documents = dataset.comment_text
documents.head()

id
0000997932d777bf    Explanation\nWhy the edits made under my usern...
000103f0d9cfb60f    D'aww! He matches this background colour I'm s...
000113f07ec002fd    Hey man, I'm really not trying to edit war. It...
0001b41b1c6bb37e    "\nMore\nI can't make any real suggestions on ...
0001d958c54c6e35    You, sir, are my hero. Any chance you remember...
Name: comment_text, dtype: object

## new bag

In [8]:
original_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

X, y = dataset.comment_text, dataset[original_columns]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

for s in X_train, X_test, y_train, y_test:
    print(s.shape)

(127656,)
(31915,)
(127656, 6)
(31915, 6)


In [9]:
our_columns = original_columns[:1]
our_columns

['toxic']

# ad-hoc begins

In [10]:
col = our_columns[0]

### vectorize

In [11]:
%%time
vectorizer = CountVectorizer(
    analyzer="word", tokenizer=word_tokenize, ngram_range=(1, 2), max_df=0.5)
vect_train = vectorizer.fit_transform(X_train)

CPU times: user 1min 58s, sys: 660 ms, total: 1min 59s
Wall time: 1min 59s


##### peek at most common tokens

In [12]:
pd.DataFrame(vect_train.sum(axis=0).T, index=vectorizer.get_feature_names())[0].sort_values(ascending=False)

''                    195335
of                    179591
you                   172937
is                    144467
that                  128526
``                    124835
it                    118395
in                    116092
!                      86829
for                    82139
this                   77983
not                    77438
)                      72695
on                     71852
(                      68374
be                     66634
:                      66611
'' ''                  65075
as                     62085
have                   59286
are                    58744
?                      57038
's                     53433
`` ''                  51975
your                   50398
do                     50079
with                   47878
if                     46888
! !                    46711
n't                    45756
                       ...  
immidiely or               1
immigartion                1
immigartion ,              1
stand uncorrec

In [13]:
vect_train.shape

(127656, 2057546)

### PCA

In [19]:
from sklearn.decomposition import TruncatedSVD

In [24]:
svd = TruncatedSVD(n_components=10000)

In [25]:
svd.fit(vect_train)

MemoryError: 

In [23]:
print(svd.explained_variance_ratio_)

[ 0.62530592  0.02063627]


### fit classifier

In [10]:
from sklearn.svm import LinearSVC

In [11]:
%%time
classifier = LinearSVC()
classifier.fit(vect_train, y_train[col])

CPU times: user 28.8 s, sys: 80 ms, total: 28.9 s
Wall time: 28.9 s


### predict

In [12]:
%%time
vect_test = vectorizer.transform(X_test)
y_pred = pd.Series(classifier.predict(vect_test), y_test.index)
score = classifier.score(vect_test, y_test[col])
print(score)

0.958138806204
CPU times: user 27.7 s, sys: 0 ns, total: 27.7 s
Wall time: 27.7 s


### log losses: simple, cutoff 4%, cutoff from score

In [13]:
ll_simple = log_loss(y_test[col], y_pred)
print(ll_simple)

cutoff = 0.04
y_pred_cut004 = y_pred.apply(lambda x: min(1 - cutoff, max(cutoff, x)))
ll_cut004 = log_loss(y_test[col], y_pred_cut004)
print(ll_cut004)

cutoff = 1 - score
y_pred_score_cut = y_pred.apply(lambda x: min(1 - cutoff, max(cutoff, x)))
ll_score_cut = log_loss(y_test[col], y_pred_score_cut)
print(ll_score_cut)

1.44584661343
0.173859121807
0.173814672309
