In [1]:
import sys
sys.path.append('..')

from __future__ import division, print_function 
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.multiclass import OneVsRestClassifier
from sklearn.feature_selection import chi2, mutual_info_classif, SelectKBest
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import binarize 
from sklearn.metrics import log_loss
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict 
%matplotlib inline
%load_ext autoreload
%autoreload 2

from evaluation import cross_validate_multilabel, multilabel_results

In [2]:
# The six binary label columns; this order fixes the column order of Y_train.
toxic_classes = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

# na_filter=False turns off NaN detection, so every comment stays a plain string.
train_df = pd.read_csv('../data/train.csv', na_filter=False)
X_text_train = train_df['comment_text'].values
Y_train = train_df[toxic_classes].values
del train_df  # only the extracted arrays are needed from here on

test_df = pd.read_csv('../data/test.csv', na_filter=False)
X_text_test = test_df['comment_text'].values
id_test = test_df['id']  # kept as a Series (not .values), so its index and name are retained
del test_df

In [3]:
# Bag-of-words term counts; min_df=10 drops terms that occur in fewer than
# 10 training comments.
cvec = CountVectorizer(min_df=10)

# The vocabulary is learned from the training text only and then reused
# unchanged for the test text. CountVectorizer accepts `y` for API
# compatibility but does not use it.
X_train = cvec.fit_transform(raw_documents=X_text_train, y=Y_train)
X_test = cvec.transform(raw_documents=X_text_test)

In [22]:
# Pipeline handed to cross-validation: chi-squared selection of the 30
# best-scoring features, then a multinomial naive Bayes classifier.
# Keeping the selector inside the pipeline means it is re-fit whenever the
# pipeline is fit, rather than once on the whole training matrix.
cv_pipe = Pipeline(steps=[
    ('kbest', SelectKBest(score_func=chi2, k=30)),
    ('classifier', MultinomialNB()),
])

In [24]:
# cross_validate_multilabel and multilabel_results come from the local
# `evaluation` module; presumably they score the pipeline per label over
# 10 folds and tabulate the result - TODO confirm against that module.
# NOTE(review): the cell that fits `kbest` rebinds X_train to the selected
# columns, so what this cell scores depends on whether that cell has already
# been executed in the running kernel.
cv_scores = cross_validate_multilabel(
    cv_pipe,
    X_train,
    Y_train,
    cv=10,
    scoring='neg_log_loss',
)
multilabel_results(cv_scores, toxic_classes, ['cv'])

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate,all
cv,-0.261351,-0.068612,-0.131687,-0.033252,-0.163661,-0.058057,-0.119437


In [4]:
# Fit the chi-squared selector once on the full training matrix, then project
# both splits onto the same 30 selected columns.
# NOTE(review): X_train / X_test are rebound to the reduced matrices here, so
# the original count matrices are no longer reachable under these names.
kbest = SelectKBest(score_func=chi2, k=30).fit(X_train, Y_train)
X_train = kbest.transform(X_train)
X_test = kbest.transform(X_test)

In [5]:
# One-vs-rest wrapper: an independent MultinomialNB is trained for each
# label column of Y_train.
models = OneVsRestClassifier(estimator=MultinomialNB()).fit(X_train, Y_train)

# Probabilities for the test set; used below with one column per entry of
# toxic_classes.
Y_test_prob = models.predict_proba(X_test)

In [6]:
# Submission layout: the test ids followed by one probability column per label.
prob_frame = pd.DataFrame(Y_test_prob, columns=toxic_classes)
df_submit = pd.concat([id_test, prob_frame], axis=1)

# index=False keeps the row index out of the written file.
df_submit.to_csv('../results/m001.csv', index=False)