In [1]:
import numpy as np
import pandas as pd
from sklearn import *
from textblob import TextBlob



In [2]:
zpolarity = {0:'zero',1:'one',2:'two',3:'three',4:'four',5:'five',6:'six',7:'seven',8:'eight',9:'nine',10:'ten'}
zsign = {-1:'negative',  0.: 'neutral', 1:'positive'}


In [3]:
train = pd.read_csv('Data/train.csv')
test = pd.read_csv('Data/test.csv')

In [4]:
coly = [c for c in train.columns if c not in ['id','comment_text']]
y = train[coly]
tid = test['id'].values

In [5]:
train['polarity'] = train['comment_text'].map(lambda x: int(TextBlob(x).sentiment.polarity * 10))
test['polarity'] = test['comment_text'].map(lambda x: int(TextBlob(x).sentiment.polarity * 10))

In [6]:
train.to_csv('Data/train_blob.csv', index =False)
test.to_csv('Data/test_blob.csv',index=False)

In [7]:
train.head(20)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,polarity
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,1
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,2
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,1
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,2
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,0
5,00025465d4725e87,"""\n\nCongratulations from me as well, use the ...",0,0,0,0,0,0,0
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0,0
7,00031b1e95af7921,Your vandalism to the Matt Shirvington article...,0,0,0,0,0,0,0
8,00037261f536c51d,Sorry if the word 'nonsense' was offensive to ...,0,0,0,0,0,0,-1
9,00040093b2687caa,alignment on this subject and which are contra...,0,0,0,0,0,0,-1


In [8]:
train['comment_text'] = train.apply(lambda r: str(r['comment_text']) + ' polarity' +  zsign[np.sign(r['polarity'])] + zpolarity[np.abs(r['polarity'])], axis=1)

In [9]:
test['comment_text'] = test.apply(lambda r: str(r['comment_text']) + 
                                  ' polarity' +  zsign[np.sign(r['polarity'])] + 
                                  zpolarity[np.abs(r['polarity'])], axis=1)

In [10]:
train.comment_text[4]

"You, sir, are my hero. Any chance you remember what page that's on? polarityneutralzero"

In [11]:
test.comment_text[5]

'Thank you for understanding. I think very highly of you and would not revert without discussion. polaritypositivetwo'

In [12]:
df = pd.concat([train['comment_text'], test['comment_text']], axis=0)
df = df.fillna("unknown")
nrow = train.shape[0]

In [13]:
tfidf = feature_extraction.text.TfidfVectorizer(stop_words='english', max_features=800000)
data = tfidf.fit_transform(df)

In [14]:
model = ensemble.ExtraTreesClassifier(n_jobs=-1, random_state=3)
model.fit(data[:nrow], y)
print(1- model.score(data[:nrow], y))


0.00085855199253


In [15]:
sub2 = model.predict_proba(data[nrow:])
sub2

[array([[ 0.3,  0.7],
        [ 1. ,  0. ],
        [ 1. ,  0. ],
        ..., 
        [ 1. ,  0. ],
        [ 0.9,  0.1],
        [ 0.1,  0.9]]), array([[ 0.9,  0.1],
        [ 1. ,  0. ],
        [ 1. ,  0. ],
        ..., 
        [ 1. ,  0. ],
        [ 1. ,  0. ],
        [ 1. ,  0. ]]), array([[ 0.4,  0.6],
        [ 1. ,  0. ],
        [ 1. ,  0. ],
        ..., 
        [ 1. ,  0. ],
        [ 1. ,  0. ],
        [ 0.4,  0.6]]), array([[ 1.,  0.],
        [ 1.,  0.],
        [ 1.,  0.],
        ..., 
        [ 1.,  0.],
        [ 1.,  0.],
        [ 1.,  0.]]), array([[ 0.3,  0.7],
        [ 1. ,  0. ],
        [ 1. ,  0. ],
        ..., 
        [ 1. ,  0. ],
        [ 1. ,  0. ],
        [ 0.6,  0.4]]), array([[ 0.8,  0.2],
        [ 1. ,  0. ],
        [ 1. ,  0. ],
        ..., 
        [ 1. ,  0. ],
        [ 1. ,  0. ],
        [ 1. ,  0. ]])]

In [16]:
sub2 = pd.DataFrame([[c[1] for c in sub2[row]] for row in range(len(sub2))]).T
sub2.columns = coly
sub2['id'] = tid

In [17]:
sub2.head(20)

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate,id
0,0.7,0.1,0.6,0.0,0.7,0.2,00001cee341fdb12
1,0.0,0.0,0.0,0.0,0.0,0.0,0000247867823ef7
2,0.0,0.0,0.0,0.0,0.0,0.0,00013b17ad220c46
3,0.0,0.0,0.0,0.0,0.0,0.0,00017563c3f7919a
4,0.0,0.0,0.0,0.0,0.0,0.0,00017695ad8997eb
5,0.0,0.0,0.0,0.0,0.0,0.0,0001ea8717f6de06
6,0.0,0.0,0.0,0.0,0.0,0.0,00024115d4cbde0f
7,0.1,0.0,0.1,0.1,0.1,0.0,000247e83dcc1211
8,0.3,0.0,0.0,0.1,0.2,0.0,00025358d4737918
9,0.0,0.0,0.0,0.0,0.0,0.0,00026d1092fe71cc


In [18]:
for c in coly:
    sub2[c] = sub2[c].clip(0+1e12, 1-1e12)

In [19]:
sub2.head(20)

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate,id
0,0.7,0.1,0.6,0.0,0.7,0.2,00001cee341fdb12
1,0.0,0.0,0.0,0.0,0.0,0.0,0000247867823ef7
2,0.0,0.0,0.0,0.0,0.0,0.0,00013b17ad220c46
3,0.0,0.0,0.0,0.0,0.0,0.0,00017563c3f7919a
4,0.0,0.0,0.0,0.0,0.0,0.0,00017695ad8997eb
5,0.0,0.0,0.0,0.0,0.0,0.0,0001ea8717f6de06
6,0.0,0.0,0.0,0.0,0.0,0.0,00024115d4cbde0f
7,0.1,0.0,0.1,0.1,0.1,0.0,000247e83dcc1211
8,0.3,0.0,0.0,0.1,0.2,0.0,00025358d4737918
9,0.0,0.0,0.0,0.0,0.0,0.0,00026d1092fe71cc


In [20]:
sub1 = pd.read_csv('output/lr_lstm_lrbin_lrcha_textblob.csv')

In [21]:
#blend 1
sub2.columns = [x+'_' if x not in ['id'] else x for x in sub2.columns]
blend = pd.merge(sub1, sub2, how='left', on='id')


In [22]:
blend.head(20)

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate,toxic_,severe_toxic_,obscene_,threat_,insult_,identity_hate_
0,00001cee341fdb12,0.999997,0.463398,0.999985,0.085391,0.987129,0.662095,0.7,0.1,0.6,0.0,0.7,0.2
1,0000247867823ef7,0.003032,0.0016,0.001934,0.000329,0.003284,0.001224,0.0,0.0,0.0,0.0,0.0,0.0
2,00013b17ad220c46,0.011101,0.002395,0.005801,0.000363,0.007277,0.001253,0.0,0.0,0.0,0.0,0.0,0.0
3,00017563c3f7919a,0.002232,0.00213,0.001103,0.000261,0.001068,0.00015,0.0,0.0,0.0,0.0,0.0,0.0
4,00017695ad8997eb,0.019301,0.001439,0.00145,0.000607,0.008245,0.000567,0.0,0.0,0.0,0.0,0.0,0.0
5,0001ea8717f6de06,0.008125,0.000387,0.002695,0.000612,0.010922,0.000605,0.0,0.0,0.0,0.0,0.0,0.0
6,00024115d4cbde0f,0.004121,0.000313,0.00242,6.3e-05,0.001828,0.000673,0.0,0.0,0.0,0.0,0.0,0.0
7,000247e83dcc1211,0.674505,0.001754,0.134225,0.006019,0.192767,0.00413,0.1,0.0,0.1,0.1,0.1,0.0
8,00025358d4737918,0.224124,0.000475,0.101502,0.001972,0.079174,0.00226,0.3,0.0,0.0,0.1,0.2,0.0
9,00026d1092fe71cc,0.001761,0.000447,0.001861,0.000221,0.002393,0.00048,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
for c in coly:
    blend[c] = blend[c] * 0.8 + blend[c+'_'] * 0.2
    blend[c] = blend[c].clip(0+1e12, 1-1e12)
blend = blend[sub1.columns]
blend.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.939998,0.390718,0.919988,0.068313,0.929703,0.569676
1,0000247867823ef7,0.002426,0.00128,0.001547,0.000263,0.002627,0.000979
2,00013b17ad220c46,0.008881,0.001916,0.004641,0.000291,0.005822,0.001002
3,00017563c3f7919a,0.001786,0.001704,0.000883,0.000209,0.000854,0.00012
4,00017695ad8997eb,0.015441,0.001151,0.00116,0.000485,0.006596,0.000453


In [24]:
#blend 2
sub2 = blend[:]
sub2.columns = [x+'_' if x not in ['id'] else x for x in sub2.columns]
blend = pd.merge(sub1, sub2, how='left', on='id')
for c in coly:
    blend[c] = np.sqrt(blend[c] * blend[c+'_'])
    blend[c] = blend[c].clip(0+1e12, 1-1e12)
blend = blend[sub1.columns]
blend.head(20)

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.969533,0.425509,0.959153,0.076376,0.957986,0.614149
1,0000247867823ef7,0.002712,0.001431,0.001729,0.000295,0.002937,0.001094
2,00013b17ad220c46,0.009929,0.002142,0.005189,0.000325,0.006509,0.00112
3,00017563c3f7919a,0.001997,0.001905,0.000987,0.000234,0.000955,0.000134
4,00017695ad8997eb,0.017264,0.001287,0.001297,0.000543,0.007375,0.000507
5,0001ea8717f6de06,0.007267,0.000346,0.00241,0.000547,0.009769,0.000541
6,00024115d4cbde0f,0.003686,0.00028,0.002164,5.7e-05,0.001635,0.000602
7,000247e83dcc1211,0.614374,0.001569,0.130758,0.012222,0.183256,0.003694
8,00025358d4737918,0.231587,0.000425,0.090786,0.006523,0.090453,0.002022
9,00026d1092fe71cc,0.001575,0.000399,0.001665,0.000198,0.00214,0.000429


In [25]:
blend.to_csv('submission/ex.csv', index=False)