In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.cross_validation import cross_val_score
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.decomposition import NMF, TruncatedSVD, LatentDirichletAllocation

from keras.models import Sequential
from keras.layers import Dense, Dropout

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


## Data Analysis

In [3]:
# Load the labelled training set from the working directory; later cells use
# its 'comment_text' column as model input and its label columns as targets.
df_train = pd.read_csv('train.csv')

## Training

In [29]:
# Text featuriser: TF-IDF over 1- to 3-grams, lower-cased, with English stop
# words removed. Terms seen in fewer than 10 documents or in more than half of
# all documents are discarded, and the vocabulary is capped at 80000 entries.
p = Pipeline([
    ('tfidf', TfidfVectorizer(
        lowercase=True,
        stop_words='english',
        ngram_range=(1, 3),
        min_df=10,
        max_df=0.5,
        max_features=80000,
    )),
    # Optional dimensionality reduction, currently disabled:
    # ('dim-reduc', TruncatedSVD(n_components=256))
])

In [30]:
# Fit the TF-IDF pipeline on every training comment and keep the resulting
# document-term matrix as the model input.
X_train = p.fit_transform(df_train['comment_text'])

# Select the targets by column name rather than by position, so that column i
# of y_train is guaranteed to be the i-th label used by the per-label training
# loop, regardless of how the CSV columns happen to be ordered.
y_train = df_train[['toxic', 'severe_toxic', 'obscene',
                    'threat', 'insult', 'identity_hate']].values

In [31]:
# Sanity check: features and labels must have the same number of rows.
# Formatting into one string first makes the output identical on Python 2
# and Python 3 (the bare `print a, b` statement is a SyntaxError on Python 3).
print('{0} {1}'.format(X_train.shape, y_train.shape))

(159571, 65665) (159571, 6)


### Scikit-learn: one binary classifier per label

In [32]:
# Use the maintained implementation: sklearn.cross_validation is deprecated
# and absent from newer scikit-learn releases.
from sklearn.model_selection import cross_val_score

class_names = ['toxic', 'severe_toxic', 'obscene',
               'threat', 'insult', 'identity_hate']

clfs = {}      # label name -> pipeline fitted on the full training set
roc_aucs = []  # mean cross-validated ROC AUC per label, in class_names order

# Train one independent binary classifier per label.
# Assumes column i of y_train holds the label class_names[i].
# NOTE(review): X_train was vectorised on the whole training set before the
# cross-validation split, so folds share vocabulary/IDF statistics and the
# reported scores may be slightly optimistic.
for i, class_name in enumerate(class_names):
    # loss='log' gives logistic regression, which is what makes predict_proba
    # available later (newer scikit-learn releases spell it 'log_loss').
    # random_state pins the SGD shuffling so scores are repeatable across runs.
    p_ = Pipeline(steps=[('clf', SGDClassifier(loss='log',
                                               n_jobs=4,
                                               max_iter=5,
                                               random_state=0))])

    # Mean ROC AUC over 3 cross-validation folds for this label.
    score = np.mean(cross_val_score(p_,
                                    X_train,
                                    y_train[:, i],
                                    cv=3,
                                    scoring='roc_auc'))

    # Parenthesised single-argument print works on both Python 2 and 3.
    print('"{0}" classifier has {1:.3f} roc_auc'.format(class_name, score))

    # cross_val_score fits clones of p_, so p_ itself is still unfitted here;
    # fit it on all training rows for use at prediction time.
    p_.fit(X_train, y_train[:, i])
    clfs[class_name] = p_
    roc_aucs.append(score)
    

"toxic" classifier has 0.956 roc_auc
"severe_toxic" classifier has 0.984 roc_auc
"obscene" classifier has 0.980 roc_auc
"threat" classifier has 0.974 roc_auc
"insult" classifier has 0.969 roc_auc
"identity_hate" classifier has 0.966 roc_auc


In [33]:
# Average the per-label cross-validated ROC AUC scores into one summary value.
np.mean(roc_aucs)

0.9714445837070133

## Test

In [35]:
# Load the test comments and vectorise them with the pipeline `p`.
# transform() (not fit_transform) reuses what `p` learned when it was fitted,
# so the test features line up with the training features.
df_test = pd.read_csv('test.csv')
X_test = p.transform(df_test['comment_text'])

In [36]:
# Start from the sample submission file and overwrite one column per label
# with that label's predicted probabilities.
submission_df = pd.read_csv('sample_submission.csv')

for class_name in clfs:
    model = clfs[class_name]
    # predict_proba returns one column per class; column 1 is kept as the
    # probability written to the submission.
    positive_proba = model.predict_proba(X_test)[:, 1]
    submission_df[class_name] = positive_proba
    

In [37]:
# Preview the first rows to eyeball the predicted probabilities before saving.
submission_df.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.750307,0.041616,0.59171,0.006073,0.467185,0.028495
1,0000247867823ef7,0.05461,0.008343,0.032223,0.00398,0.032362,0.008575
2,00013b17ad220c46,0.031342,0.0069,0.020768,0.003526,0.020999,0.007206
3,00017563c3f7919a,0.036287,0.007284,0.02375,0.003625,0.023833,0.007089
4,00017695ad8997eb,0.08001,0.007515,0.036924,0.003798,0.035283,0.007561


In [38]:
# Write the predictions to disk; index=False keeps the DataFrame's row index
# out of the file so only the submission columns are saved.
submission_df.to_csv('submission_sklearn_multi_sgd.csv',
                     index=False)