In [219]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.cross_validation import cross_val_score
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.decomposition import NMF, TruncatedSVD, LatentDirichletAllocation
from Stemmer import Stemmer
from nltk import SnowballStemmer
from unidecode import unidecode
import re

from keras.models import Sequential
from keras.layers import Dense, Dropout

## Data Analysis

In [231]:
df_train = pd.read_csv('train.csv', encoding='utf8')

In [232]:
def parse_df(df):
    match_numbers = re.compile(r'[0-9]+', flags=re.IGNORECASE)
    df.comment_text = df.comment_text.str.replace(match_numbers, 'NUM')
    stemmer = SnowballStemmer('english')

    df.comment_text = df.comment_text.map(lambda r: map(lambda w: stemmer.stem(w), r.split(' ')))\
                        .map(lambda r: ' '.join(r))

    return df

In [235]:
df_train = parse_df(df_train)

In [236]:
df_train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,explanation\nwhi the edit made under my userna...,0,0,0,0,0,0
1,000103f0d9cfb60f,d'aww! he match this background colour i'm see...,0,0,0,0,0,0
2,000113f07ec002fd,"hey man, i'm realli not tri to edit war. it ju...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nmore\ni can't make ani real suggest on impr...",0,0,0,0,0,0
4,0001d958c54c6e35,"you, sir, are my hero. ani chanc you rememb wh...",0,0,0,0,0,0


## Training

In [None]:
p = Pipeline(steps=[('tfidf', TfidfVectorizer(ngram_range=(1, 2),
                                              stop_words='english',
                                              max_features=60000,
                                              min_df=10,
                                              max_df=0.7,
                                              lowercase=True)),
                    # ('dim-reduc', TruncatedSVD(n_components=256))
                   ])

In [None]:
X_train = p.fit_transform(df_train['comment_text'])
y_train = df_train.iloc[:,2:].values

In [None]:
print X_train.shape, y_train.shape

### Sklearn multi model

In [None]:
class_names = ['toxic', 'severe_toxic', 'obscene',
               'threat', 'insult', 'identity_hate']

clfs = {}
roc_aucs = []

for i, class_name in enumerate(class_names):
    clf = SGDClassifier(loss='modified_huber',
                        n_jobs=6,
                        alpha=0.0001,
                        max_iter=15)
    
    score =  np.mean(cross_val_score(clf,
                                     X_train,
                                     y_train[:,i],
                                     cv=3,
                                     scoring='roc_auc'))
    
    print '"{0}" classifier has {1:.3f} roc_auc'.format(class_names[i], score)

    clf.fit(X_train, y_train[:, i])
    clfs[class_names[i]] = clf
    roc_aucs.append(score)
    

In [182]:
np.mean(roc_aucs)

0.976764195167458

## Test

In [183]:
df_test = pd.read_csv('test.csv')
X_test = p.transform(df_test['comment_text'])

In [184]:
submission_df = pd.read_csv('sample_submission.csv')

for class_name, model in clfs.items():
    submission_df[class_name] = model.predict_proba(X_test)[:,1]
    

In [185]:
submission_df.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,1.0,0.224218,1.0,0.054105,0.773524,0.186411
1,0000247867823ef7,0.0,0.0,0.0,0.0,0.0,0.0
2,00013b17ad220c46,0.0,0.0,0.0,0.0,0.0,0.0
3,00017563c3f7919a,0.0,0.0,0.0,0.0,0.0,0.0
4,00017695ad8997eb,0.05214,0.0,0.0,0.0,0.019391,0.0


In [186]:
submission_df.to_csv('submission_sklearn_multi_sgd.csv',
                     index=False)