In [26]:
import numpy as np
import pandas as pd
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split
import xgboost as xgb

In [5]:
classes = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [13]:
df_raw = pd.read_csv('train.csv')
df_nblr = pd.read_csv('train_nblr.csv', usecols=classes).rename(columns={c:'nblr_'+c for c in classes})
df_cnn = pd.read_csv('train_CNN_keras_epoch1.csv', usecols=classes).rename(columns={c:'cnn_'+c for c in classes})
df_rnn = pd.read_csv('train_RNN_GRU.csv', usecols=classes).rename(columns={c:'rnn_'+c for c in classes})
df = pd.concat([df_raw, df_nblr, df_cnn, df_rnn], axis=1)

In [19]:
X = df.drop(columns=['id', 'comment_text']+classes)
y = df[classes]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [21]:
xgb_params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'min_child_weight': 5
}

In [29]:
xgb_test = []
trees = {}
for c in classes:
    print c
    dtrain = xgb.DMatrix(X_train, label=y_train[[c]])
    dtest = xgb.DMatrix(X_test, label=y_test[[c]])
    
    bst = xgb.train(xgb_params, dtrain, 500, [(dtrain, 'train'), (dtest, 'test')],
                    early_stopping_rounds=50, verbose_eval=10)
    trees[c] = bst
    xgb_test.append(log_loss(y_test[[c]].values.transpose()[0], bst.predict(dtest)))

toxic
[0]	train-logloss:0.444346	test-logloss:0.445388
Multiple eval metrics have been passed: 'test-logloss' will be used for early stopping.

Will train until test-logloss hasn't improved in 50 rounds.
[10]	train-logloss:0.032916	test-logloss:0.037931
[20]	train-logloss:0.015112	test-logloss:0.022423
[30]	train-logloss:0.012471	test-logloss:0.021377
[40]	train-logloss:0.010845	test-logloss:0.021317
[50]	train-logloss:0.009736	test-logloss:0.021396
[60]	train-logloss:0.008664	test-logloss:0.021792
[70]	train-logloss:0.007692	test-logloss:0.021959
[80]	train-logloss:0.006728	test-logloss:0.02242
Stopping. Best iteration:
[37]	train-logloss:0.011286	test-logloss:0.02126

severe_toxic
[0]	train-logloss:0.439813	test-logloss:0.440253
Multiple eval metrics have been passed: 'test-logloss' will be used for early stopping.

Will train until test-logloss hasn't improved in 50 rounds.
[10]	train-logloss:0.021488	test-logloss:0.02409
[20]	train-logloss:0.004753	test-logloss:0.00849
[30]	train-l

In [27]:
np.mean(xgb_test)

0.012120738780624358

# Test Set

In [28]:
test_raw = pd.read_csv('test.csv')
test_nblr = pd.read_csv('nblr_submission.csv', usecols=classes).rename(columns={c:'nblr_'+c for c in classes})
test_cnn = pd.read_csv('conv_submission_keras_trainable_embedding_1epoch.csv', usecols=classes).rename(columns={c:'cnn_'+c for c in classes})
test_rnn = pd.read_csv('rnn_submission_keras_gru.csv', usecols=classes).rename(columns={c:'rnn_'+c for c in classes})
test = pd.concat([test_raw, test_nblr, test_cnn, test_rnn], axis=1)

In [31]:
X_pred = np.zeros((len(test_raw), len(classes)))
for i, c in enumerate(classes):
    bst = trees[c]
    dtest = xgb.DMatrix(test.drop(columns=['id', 'comment_text']))
    X_pred[:,i] = bst.predict(dtest)

In [33]:
df_combined = pd.concat([test_raw[['id']], pd.DataFrame(X_pred, columns = classes)], axis=1)
df_combined.to_csv('combined_0.csv', index=False)