In [42]:
import sys
import os
import pandas as pd
import numpy as np
# joblib is used at the end of the notebook to persist the fitted model.
# Prefer the standalone package: `sklearn.externals.joblib` was deprecated in
# scikit-learn 0.21 and removed in 0.23, so the old import fails on current
# installs. The fallback keeps very old environments working.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

sys.path.insert(1, '..')

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [29]:
from sklearn.ensemble import GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multioutput import MultiOutputClassifier

In [3]:
# Directory prefix for the input CSV files. The trailing slash is required
# because file names are appended with plain string concatenation.
data_dir = "data/external/"

In [19]:
# Read the three input tables (train, test, sample submission) from data_dir,
# in that order.
train_df, test_df, sample_submission = [
    pd.read_csv(data_dir + csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
]

In [20]:
# Ids of the test rows; re-attached to the predictions later on.
test_ids = test_df['id'].values

# Every training column other than the id and the raw text is a label column.
target_cols = [col for col in train_df.columns if col not in ('id', 'comment_text')]
targets = train_df.loc[:, target_cols]

# Stack train and test comments (train rows first) so one vectorizer can be
# fitted on the combined vocabulary; missing comments become 'unknown'.
text_data = pd.concat(
    [train_df['comment_text'], test_df['comment_text']],
    axis=0,
).fillna('unknown')



In [9]:
# TF-IDF over unigrams and bigrams with English stop words removed, capped at
# 800k features.
tfidf = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    max_features=800000,
)
# Fit on, and transform, the combined train + test text in one pass.
tfidf_corpus = tfidf.fit_transform(text_data)


In [16]:
# The combined matrix has the training rows first, so slicing at the number of
# training rows separates the two sets again.
train_text, test_text = (
    tfidf_corpus[:len(train_df)],
    tfidf_corpus[len(train_df):],
)

In [30]:
# Baseline: one independent logistic regression per label column, fitted in
# parallel across all available cores (n_jobs=-1).
model = MultiOutputClassifier(
    estimator=LogisticRegression(),
    n_jobs=-1,
)
# Left as the final expression so the fitted estimator's repr is displayed.
model.fit(train_text, targets)

MultiOutputClassifier(estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
           n_jobs=-1)

In [32]:
# Class probabilities for the test rows. The result is indexed per label
# first, then per sample, then per class (see how it is unpacked below).
preds = model.predict_proba(X=test_text)

In [33]:
# `preds` holds one (n_samples, n_classes) array per label. Keep column 1 of
# each (the second class in the estimator's classes_ -- presumably the
# positive label for 0/1 targets; confirm) and stack them side by side so the
# frame has one row per test sample and one column per label.
# Vectorised replacement for the nested Python comprehension + transpose,
# which built every probability as an individual Python object first.
target_probs = pd.DataFrame(
    np.column_stack([label_probs[:, 1] for label_probs in preds])
)

In [34]:
# Name the probability columns after the labels. The column order matches the
# order of `target_cols`, which is also the column order of the targets the
# model was fitted on.
target_probs.columns = target_cols

In [35]:
# Preview the first five rows of predicted probabilities.
target_probs.head(n=5)

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0.989326,0.142172,0.980654,0.017183,0.884346,0.192551
1,0.024509,0.004974,0.013534,0.002414,0.016832,0.005378
2,0.041344,0.004992,0.017551,0.002179,0.020404,0.005435
3,0.009471,0.00318,0.006626,0.001678,0.007119,0.002366
4,0.045821,0.003321,0.015538,0.002203,0.019554,0.003133


In [36]:
# Attach the test-set ids to the predictions; rows are in test_df order, so
# the ids line up positionally.
target_probs = target_probs.assign(id=test_ids)

In [37]:
# Keep predicted probabilities strictly inside (0, 1) so a log-based metric
# never evaluates log(0).
# The original bounds were written as 0+1e12 and 1-1e12 (a typo for 1e-12),
# i.e. a lower bound of 1e12 and an upper bound of about -1e12, which never
# constrained the values to the unit interval.
prob_eps = 1e-12
target_probs[target_cols] = target_probs[target_cols].clip(
    lower=prob_eps, upper=1 - prob_eps
)

In [39]:
# Write the submission with 'id' as the first column followed by the label
# columns. 'id' was appended last to the frame, so writing the frame as-is
# would put it at the end of each row; the id-first layout matches how this
# notebook inspects the submission (presumably also the sample_submission
# layout -- confirm).
target_probs[['id'] + target_cols].to_csv('submission_baseline.csv', index=False)

In [41]:
# Sanity check: expect one row per test comment, and one column per label
# plus the 'id' column.
target_probs.shape

(153164, 7)

In [43]:
# Persist the fitted multi-output model to disk; the call returns the list of
# file names written, which the notebook displays.
joblib.dump(value=model, filename='logistic_baseline.pickle')

['logistic_baseline.pickle']

In [45]:
# Preview the predictions with 'id' leading, followed by the label columns.
target_probs.loc[:, ['id'] + target_cols].head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.989326,0.142172,0.980654,0.017183,0.884346,0.192551
1,0000247867823ef7,0.024509,0.004974,0.013534,0.002414,0.016832,0.005378
2,00013b17ad220c46,0.041344,0.004992,0.017551,0.002179,0.020404,0.005435
3,00017563c3f7919a,0.009471,0.00318,0.006626,0.001678,0.007119,0.002366
4,00017695ad8997eb,0.045821,0.003321,0.015538,0.002203,0.019554,0.003133
