In [1]:
import re

import pandas as pd
import numpy as np
from scipy import sparse

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.cross_validation import cross_val_score
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.decomposition import NMF, TruncatedSVD, LatentDirichletAllocation
from Stemmer import Stemmer
from nltk import SnowballStemmer
from unidecode import unidecode

from keras.models import Sequential
from keras.layers import Dense, Dropout

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


## Data Analysis

In [2]:
# Read the train and test CSV files (UTF-8) into their own DataFrames.
df_train, df_test = (pd.read_csv(csv_name, encoding='utf8')
                     for csv_name in ('train.csv', 'test.csv'))


In [None]:
def parse_df(df):
    """Normalise and stem the ``comment_text`` column of ``df``.

    Each comment goes through three steps:

    1. every run of ASCII digits is replaced by the token ``NUM``;
    2. each newline, comma, semicolon, hyphen, underscore and double-quote
       character is replaced by a single space;
    3. the text is split on single spaces, every piece is passed through
       the English ``SnowballStemmer``, and the pieces are re-joined with
       single spaces.

    The column is overwritten on the frame that was passed in, and that
    same frame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``comment_text`` column holding strings.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with ``comment_text`` replaced by the cleaned text.
    """
    match_numbers = re.compile(r'[0-9]+')
    # The hyphen is escaped on purpose: an unescaped ``;-_`` inside a
    # character class is a *range* that covers every uppercase letter (and
    # therefore the ``NUM`` token inserted just before), not three literal
    # characters.
    match_separators = re.compile(r'[\n,;\-_"]')
    stemmer = SnowballStemmer('english')

    def clean(comment):
        # Order matters: digits become NUM first, then separators are blanked.
        text = match_numbers.sub('NUM', comment)
        text = match_separators.sub(' ', text)
        # Split on a single space (not on arbitrary whitespace) so the
        # spacing of the cleaned text is kept when the pieces are re-joined.
        return ' '.join(stemmer.stem(word) for word in text.split(' '))

    # The regexes are applied through ``re`` rather than ``Series.str.replace``
    # because whether that method treats its pattern as a regular expression
    # depends on the installed pandas version.
    df.comment_text = df.comment_text.map(clean)

    return df

In [None]:
# Preview the first five rows of the training frame.
df_train.head(n=5)

## Training

In [3]:
# Single-step pipeline: word 1- to 3-gram TF-IDF features, lowercased, with
# English stop words removed, terms limited by document frequency
# (min_df / max_df) and the vocabulary capped at max_features entries.
p = Pipeline([
    ('tfidf', TfidfVectorizer(lowercase=True,
                              stop_words='english',
                              ngram_range=(1, 3),
                              min_df=3,
                              max_df=0.8,
                              max_features=150000)),
])

In [None]:
# Fit the pipeline on the train and test comments together, then vectorise
# the training comments only.
all_comments = pd.concat([df_train['comment_text'], df_test['comment_text']])
p.fit(all_comments)

X_train = p.transform(df_train['comment_text'])

# Every column from the third one onwards is taken as a target column.
y_train = df_train.iloc[:, 2:].values

In [None]:
# Sanity check: show the feature-matrix and target-matrix shapes side by side.
# Written as a single formatted string so the output is identical whether
# `print` is a statement (Python 2) or a function (Python 3).
print('{0} {1}'.format(X_train.shape, y_train.shape))

In [None]:
# Project the TF-IDF matrix onto 256 components with truncated SVD.
# random_state is fixed so that re-running the notebook reproduces the same
# components (the estimator's default solver is randomized).
svd = TruncatedSVD(n_components=256, random_state=42)
X_train_k = svd.fit_transform(X_train)

In [None]:
# Append the dense SVD components to the TF-IDF features column-wise.
# X_train comes from the TF-IDF pipeline (a scipy sparse matrix), which
# np.hstack cannot stack with a dense array; scipy.sparse.hstack can, and it
# keeps the combined matrix sparse. The result is stored so it can be reused.
X_train_combined = sparse.hstack([X_train, sparse.csr_matrix(X_train_k)],
                                 format='csr')
X_train_combined

In [None]:
# Dimensions of the SVD-reduced training matrix.
np.shape(X_train_k)

### Sklearn multi model

In [None]:
# Train one independent binary classifier per label.
# Column i of y_train is used as the target for class_names[i].
class_names = ['toxic', 'severe_toxic', 'obscene',
               'threat', 'insult', 'identity_hate']

clfs = {}      # label name -> classifier fitted on the full training set
roc_aucs = []  # mean 3-fold cross-validated ROC AUC, in class_names order

for i, class_name in enumerate(class_names):
    # random_state is fixed so that scores and fitted models are
    # reproducible across runs.
    clf = SGDClassifier(loss='modified_huber',
                        n_jobs=6,
                        alpha=0.0001,
                        max_iter=15,
                        random_state=42)

    # cross_val_score returns one score per fold; report their mean.
    score = np.mean(cross_val_score(clf,
                                    X_train,
                                    y_train[:, i],
                                    cv=3,
                                    scoring='roc_auc'))

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('"{0}" classifier has {1:.3f} roc_auc'.format(class_name, score))

    # Fit on all the training data for use at prediction time.
    clf.fit(X_train, y_train[:, i])
    clfs[class_name] = clf
    roc_aucs.append(score)

In [None]:
# Average of the per-label cross-validated ROC AUC scores.
np.asarray(roc_aucs).mean()

## Test

In [None]:
# Vectorise the test comments with the already-fitted pipeline.
X_test = p.transform(df_test.comment_text)

In [None]:
# Start from the sample submission file and overwrite one column per fitted
# classifier with the second column of its predict_proba output.
submission_df = pd.read_csv('sample_submission.csv')

for class_name, model in clfs.items():
    class_probabilities = model.predict_proba(X_test)
    submission_df[class_name] = class_probabilities[:, 1]
    

In [None]:
# Preview the first five rows of the submission frame.
submission_df.head(n=5)

In [None]:
# Write the predictions to disk, leaving out the DataFrame index.
submission_df.to_csv('submission_sklearn_multi_sgd.csv', index=False)