In [88]:
import pandas as pd

In [89]:
# Read the training CSV and key every row by its `id` column.
dataset = (
    pd.read_csv('data/train.csv')
    .set_index('id')
)

In [90]:
# Keep the comment text in its own Series and preview the first rows.
documents = dataset['comment_text']
documents.head()

id
0000997932d777bf    Explanation\nWhy the edits made under my usern...
000103f0d9cfb60f    D'aww! He matches this background colour I'm s...
000113f07ec002fd    Hey man, I'm really not trying to edit war. It...
0001b41b1c6bb37e    "\nMore\nI can't make any real suggestions on ...
0001d958c54c6e35    You, sir, are my hero. Any chance you remember...
Name: comment_text, dtype: object

## New bag-of-words model

In [91]:
from sklearn.model_selection import train_test_split

In [92]:
# Label columns to predict; one classifier target per entry.
predict_columns = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

# Features are the comment strings; targets are the label columns.
X = dataset['comment_text']
y = dataset[predict_columns]

# Hold out 20% of the rows; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)

# Print the shape of every split as a quick sanity check.
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

(127656,)
(31915,)
(127656, 6)
(31915, 6)


In [103]:
# Display the held-out label frame (one row per comment id, one column per label).
y_test

Unnamed: 0_level_0,toxic,severe_toxic,obscene,threat,insult,identity_hate
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
7ca72b5b9c688e9e,0,0,0,0,0,0
c03f72fd8f8bf54f,0,0,0,0,0,0
9e5b8e8fc1ff2e84,0,0,0,0,0,0
5332799e706665a6,0,0,0,0,0,0
dfa7d8f0b4366680,0,0,0,0,0,0
64479b84de1d00c1,0,0,0,0,0,0
0e3561a3ab12ebee,0,0,0,0,0,0
b393676802817dac,0,0,0,0,0,0
b5632fa10019dbdc,0,0,0,0,0,0
82d99700af45e2a8,0,0,0,0,0,0


In [93]:
from nltk import word_tokenize
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier


In [94]:
# Default target label; the training, scoring and prediction loops below
# reassign `our_column` on every iteration.
our_column = 'toxic'

In [95]:
%%time

clfs = []
for column_name in predict_columns[:1]:
    our_column = column_name
    print('doing {}'.format(our_column))
    
    clf = Pipeline([
        ('vectorizer', CountVectorizer(analyzer="word",
                                       tokenizer=word_tokenize,
                                       max_df=0.5) ),
        ('classifier', RandomForestClassifier())
    ])
    clf.fit(X_train, y_train[our_column])
    clfs.append(clf)

doing toxic
CPU times: user 2min 57s, sys: 128 ms, total: 2min 57s
Wall time: 2min 57s


In [96]:
%%time

scores = []
for column_name, clf in zip(predict_columns, clfs):
    our_column = column_name
    print('doing {}'.format(our_column))
    
    score = clf.score(X_test, y_test[our_column])
    print('{} score: {}'.format(our_column, score))
    scores.append(score)

doing toxic
toxic score: 0.934576218079273
CPU times: user 25.2 s, sys: 4 ms, total: 25.2 s
Wall time: 25.2 s


In [97]:
   
# Start from a copy of the true labels and attach each pipeline's
# predictions next to them as `predicted_<label>` columns.
predict_df = y_test.copy()

for our_column, clf in zip(predict_columns, clfs):
    print('doing {}'.format(our_column))

    predict_test = clf.predict(X_test)
    target_column = 'predicted_' + our_column
    predict_df[target_column] = predict_test

doing toxic


In [98]:
# Display the true labels side by side with the predicted columns.
predict_df

Unnamed: 0_level_0,toxic,severe_toxic,obscene,threat,insult,identity_hate,predicted_toxic
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
7ca72b5b9c688e9e,0,0,0,0,0,0,0
c03f72fd8f8bf54f,0,0,0,0,0,0,0
9e5b8e8fc1ff2e84,0,0,0,0,0,0,0
5332799e706665a6,0,0,0,0,0,0,0
dfa7d8f0b4366680,0,0,0,0,0,0,0
64479b84de1d00c1,0,0,0,0,0,0,0
0e3561a3ab12ebee,0,0,0,0,0,0,0
b393676802817dac,0,0,0,0,0,0,0
b5632fa10019dbdc,0,0,0,0,0,0,0
82d99700af45e2a8,0,0,0,0,0,0,0


## Apply cutoffs and save

### All cutoffs 4%

Unnamed: 0_level_0,toxic,severe_toxic,obscene,threat,insult,identity_hate,predicted_toxic
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
7ca72b5b9c688e9e,0,0,0,0,0,0,0.04
c03f72fd8f8bf54f,0,0,0,0,0,0,0.04
9e5b8e8fc1ff2e84,0,0,0,0,0,0,0.04
5332799e706665a6,0,0,0,0,0,0,0.04
dfa7d8f0b4366680,0,0,0,0,0,0,0.04
64479b84de1d00c1,0,0,0,0,0,0,0.04
0e3561a3ab12ebee,0,0,0,0,0,0,0.04
b393676802817dac,0,0,0,0,0,0,0.04
b5632fa10019dbdc,0,0,0,0,0,0,0.04
82d99700af45e2a8,0,0,0,0,0,0,0.04


In [102]:
# NOTE(review): no visible cell above this one assigns `tempdf`; its first visible
# assignment is in the "Cutoffs based on score" section below. Confirm the cell that
# built the flat-4% frame still exists, otherwise a top-to-bottom run fails here.
tempdf.to_csv('partial_results/predict_df4.csv')

### Cutoffs based on score

In [43]:
# Use each label's (1 - score) as its cutoff value.
cutoffs = [1 - score for score in scores]
predicted_columns = ['predicted_' + c for c in predict_columns]

# Work on a copy so `predict_df` keeps the raw predictions.
tempdf = predict_df.copy()
for pc, cutoff in zip(predicted_columns, cutoffs):
    # A predicted 1 becomes (1 - cutoff); every other value becomes cutoff.
    is_positive = tempdf[pc].eq(1)
    tempdf[pc] = is_positive.map({True: 1 - cutoff, False: cutoff})
tempdf

Unnamed: 0_level_0,toxic,severe_toxic,obscene,threat,insult,identity_hate,predicted_toxic,predicted_severe_toxic,predicted_obscene,predicted_threat,predicted_insult,predicted_identity_hate
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
7ca72b5b9c688e9e,0,0,0,0,0,0,0.043271,0.011123,0.023563,0.002883,0.036754,0.010215
c03f72fd8f8bf54f,0,0,0,0,0,0,0.043271,0.011123,0.023563,0.002883,0.036754,0.010215
9e5b8e8fc1ff2e84,0,0,0,0,0,0,0.043271,0.011123,0.023563,0.002883,0.036754,0.010215
5332799e706665a6,0,0,0,0,0,0,0.043271,0.011123,0.023563,0.002883,0.036754,0.010215
dfa7d8f0b4366680,0,0,0,0,0,0,0.043271,0.011123,0.023563,0.002883,0.036754,0.010215
64479b84de1d00c1,0,0,0,0,0,0,0.043271,0.011123,0.023563,0.002883,0.036754,0.010215
0e3561a3ab12ebee,0,0,0,0,0,0,0.043271,0.011123,0.023563,0.002883,0.036754,0.010215
b393676802817dac,0,0,0,0,0,0,0.043271,0.011123,0.023563,0.002883,0.036754,0.010215
b5632fa10019dbdc,0,0,0,0,0,0,0.043271,0.011123,0.023563,0.002883,0.036754,0.010215
82d99700af45e2a8,0,0,0,0,0,0,0.043271,0.011123,0.023563,0.002883,0.036754,0.010215


In [44]:
# Write the current `tempdf` (index included) to a CSV under partial_results/.
tempdf.to_csv('partial_results/predict_df3.csv')

## Predict for true test set

In [45]:
# Read the test CSV, key it by `id`, and preview the first rows.
test_dataset = (
    pd.read_csv('data/test.csv')
    .set_index('id')
)
test_dataset.head()

Unnamed: 0_level_0,comment_text
id,Unnamed: 1_level_1
00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
0000247867823ef7,== From RfC == \n\n The title is fine as it is...
00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
00017563c3f7919a,":If you have a look back at the source, the in..."
00017695ad8997eb,I don't anonymously edit articles at all.


In [49]:
# Copy of the test frame that will collect one `predicted_*` column per label.
true_predict_df = test_dataset.copy()
# Comment text that is fed to the fitted pipelines.
true_X_test = test_dataset.comment_text

In [50]:
# Run every fitted pipeline over the test comments and store its output
# in a `predicted_<label>` column, then show the resulting frame.
for our_column, clf in zip(predict_columns, clfs):
    print('doing {}'.format(our_column))

    predict_test = clf.predict(true_X_test)
    target_column = 'predicted_' + our_column
    true_predict_df[target_column] = predict_test
true_predict_df

doing toxic
doing severe_toxic
doing obscene
doing threat
doing insult
doing identity_hate


Unnamed: 0_level_0,comment_text,predicted_toxic,predicted_severe_toxic,predicted_obscene,predicted_threat,predicted_insult,predicted_identity_hate
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...,1,0,1,0,1,0
0000247867823ef7,== From RfC == \n\n The title is fine as it is...,0,0,0,0,0,0
00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap...",0,0,0,0,0,0
00017563c3f7919a,":If you have a look back at the source, the in...",0,0,0,0,0,0
00017695ad8997eb,I don't anonymously edit articles at all.,0,0,0,0,0,0
0001ea8717f6de06,Thank you for understanding. I think very high...,0,0,0,0,0,0
00024115d4cbde0f,Please do not add nonsense to Wikipedia. Such ...,0,0,0,0,0,0
000247e83dcc1211,:Dear god this site is horrible.,0,0,0,0,0,0
00025358d4737918,""" \n Only a fool can believe in such numbers. ...",0,0,0,0,0,0
00026d1092fe71cc,== Double Redirects == \n\n When fixing double...,0,0,0,0,0,0


### Apply cutoffs

In [52]:
# Per-label values pasted in as literals, listed in `predict_columns` order.
# NOTE(review): presumably accuracy scores from an earlier scoring run — confirm
# they still correspond to the pipelines currently held in `clfs`.
pasted_scores = [
    0.9567288109039637,
    0.9888767037443209,
    0.9764374118752938,
    0.9971173429421902,
    0.963246122512925,
    0.9897853673821088,
]
cutoffs = [1 - pasted for pasted in pasted_scores]
predicted_columns = ['predicted_' + c for c in predict_columns]

# Work on a copy so `true_predict_df` keeps the raw predictions.
tempdf = true_predict_df.copy()
for pc, cutoff in zip(predicted_columns, cutoffs):
    # A predicted 1 becomes (1 - cutoff); every other value becomes cutoff.
    is_positive = tempdf[pc].eq(1)
    tempdf[pc] = is_positive.map({True: 1 - cutoff, False: cutoff})
tempdf

Unnamed: 0_level_0,comment_text,predicted_toxic,predicted_severe_toxic,predicted_obscene,predicted_threat,predicted_insult,predicted_identity_hate
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...,0.956729,0.011123,0.976437,0.002883,0.963246,0.010215
0000247867823ef7,== From RfC == \n\n The title is fine as it is...,0.043271,0.011123,0.023563,0.002883,0.036754,0.010215
00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap...",0.043271,0.011123,0.023563,0.002883,0.036754,0.010215
00017563c3f7919a,":If you have a look back at the source, the in...",0.043271,0.011123,0.023563,0.002883,0.036754,0.010215
00017695ad8997eb,I don't anonymously edit articles at all.,0.043271,0.011123,0.023563,0.002883,0.036754,0.010215
0001ea8717f6de06,Thank you for understanding. I think very high...,0.043271,0.011123,0.023563,0.002883,0.036754,0.010215
00024115d4cbde0f,Please do not add nonsense to Wikipedia. Such ...,0.043271,0.011123,0.023563,0.002883,0.036754,0.010215
000247e83dcc1211,:Dear god this site is horrible.,0.043271,0.011123,0.023563,0.002883,0.036754,0.010215
00025358d4737918,""" \n Only a fool can believe in such numbers. ...",0.043271,0.011123,0.023563,0.002883,0.036754,0.010215
00026d1092fe71cc,== Double Redirects == \n\n When fixing double...,0.043271,0.011123,0.023563,0.002883,0.036754,0.010215


### Save file

In [None]:
# Remove the raw text so only the prediction columns remain.
# Rebinding instead of inplace=True, together with errors='ignore', keeps this
# cell safe to re-run: a second execution no longer raises KeyError.
tempdf = tempdf.drop('comment_text', axis=1, errors='ignore')

In [56]:
# Replace the `predicted_*` headers with the plain label names (same six names,
# same order as `predict_columns`), then preview the frame.
tempdf.columns = list(predict_columns)
tempdf.head()

Unnamed: 0_level_0,toxic,severe_toxic,obscene,threat,insult,identity_hate
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
00001cee341fdb12,0.956729,0.011123,0.976437,0.002883,0.963246,0.010215
0000247867823ef7,0.043271,0.011123,0.023563,0.002883,0.036754,0.010215
00013b17ad220c46,0.043271,0.011123,0.023563,0.002883,0.036754,0.010215
00017563c3f7919a,0.043271,0.011123,0.023563,0.002883,0.036754,0.010215
00017695ad8997eb,0.043271,0.011123,0.023563,0.002883,0.036754,0.010215


In [57]:
# Write the renamed prediction frame (index included) to attempt2.csv.
tempdf.to_csv('attempt2.csv')

# Scratch: inspecting the fitted pipeline

In [78]:
# Peek at the first 100 vocabulary entries learned by the first pipeline's vectorizer.
# `get_feature_names` was removed in scikit-learn 1.2, so prefer
# `get_feature_names_out` when it exists and fall back on older releases.
vectorizer = clfs[0].steps[0][1]
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
list(feature_names[:100])

['!',
 '#',
 '$',
 '%',
 '&',
 "'",
 "''",
 "''.",
 "''17:08",
 "''2010",
 "''according",
 "''addendum:6",
 "''after",
 "''and",
 "''april",
 "''armies",
 "''artist",
 "''attrition",
 "''bananas",
 "''blindly",
 "''bold",
 "''bonked",
 "''but",
 "''cheers",
 "''clyde",
 "''cofini",
 "''cookie",
 "''countries",
 "''covert",
 "''did",
 "''discover",
 "''edits",
 "''encyclopedia",
 "''everyone",
 "''further",
 "''g",
 "''grumpyness",
 "''hands",
 "''heated",
 "''hello",
 "''here",
 "''hope",
 "''i",
 "''if",
 "''in",
 "''israel",
 "''it",
 "''italic",
 "''jeffire",
 "''jon",
 "''journalism",
 "''justin",
 "''keep",
 "''kentucky",
 "''km",
 "''l'aerophile",
 "''lemming",
 "''macdonald",
 "''maintenance",
 "''many",
 "''maryland",
 "''masonic",
 "''matthew",
 "''most",
 "''motor",
 "''mperator",
 "''n.b",
 "''nope..",
 "''on",
 "''policy",
 "''prohibited",
 "''really",
 "''reflection",
 "''run",
 "''s",
 "''senegalensis",
 "''seriously",
 "''she",
 "''some",
 "''someone",
 "''southern",
 "'

In [73]:
# Shorthand handles: the first fitted pipeline and its vectorizer step.
pipe0 = clfs[0]
v0 = pipe0.named_steps['vectorizer']

In [76]:
# Show the name of the pipeline's second step.
second_step = pipe0.steps[1]
second_step[0]

'classifier'

In [77]:
# Vectorize the training comments with the already-fitted vectorizer and
# display the resulting sparse document-term matrix.
v0.transform(X_train)

<127656x224083 sparse matrix of type '<class 'numpy.int64'>'
	with 5675030 stored elements in Compressed Sparse Row format>