In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.externals import joblib

In [2]:
train = pd.read_csv('../data/preprocessed/train.csv')
test = pd.read_csv('../data/preprocessed/test.csv')

In [3]:
features = ['count_sent',
            'count_word',
            'count_letters',
            'count_punctuations',
            'mean_word_len',
            'word_unique_percent',
            'uppercase_percent',
            'stopwords_perent']

target = ['toxic','severe_toxic','obscene','threat','insult','identity_hate']

In [4]:
def calc_avg_auc(Y,predictions):
    nb_classes = Y.shape[1]
    cols = Y.columns
    score = 0
    for cl in range(nb_classes):
        score += roc_auc_score(Y[cols[cl]],predictions[:,cl])
    score = score/nb_classes
    return score

In [5]:
X_train, X_test, Y_train, Y_test = train_test_split(train[features],train[target], random_state=997)
X = test[features]

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [6]:
logit = [None]*len(target)
for cl in range(len(target)):
    logit[cl] = LogisticRegression(multi_class = 'ovr', class_weight = 'balanced')
    logit[cl].fit(X_train,Y_train[list(Y_train.columns)[cl]])

In [7]:
predictions = np.zeros((X_test.shape[0],len(target)))
for cl in range(len(target)):
    predictions[:,cl] = logit[cl].predict(X_test)
score = calc_avg_auc(Y_test,predictions)
print("Mean AUC for a set of single-target logit models is " + str(score))

Mean AUC for a set of single-target logit models is 0.620174187236


In [8]:
submission_logit = np.zeros((X.shape[0],len(target)))
for cl in range(Y_train.shape[1]):
    submission_logit[:,cl] = logit[cl].predict(X)
submission_logit = pd.DataFrame(submission_logit, columns = target)
submission_logit['id'] = test['id']
submission_logit=submission_logit[['id']+target]
submission_logit.to_csv('../submissions/logit.csv', index = False)

# Tree-Based Models

## Random Forest (RF)

In [None]:
from sklearn.tree import *
from sklearn.ensemble import *

### One RF per class

In [9]:
rf = [None]*len(target)
for cl in range(Y_train.shape[1]):
    rf[cl] = RandomForestRegressor(n_estimators=130, max_features=None,random_state=997,min_samples_leaf=100)
    rf[cl].fit(X_train,Y_train[list(Y_train.columns)[cl]])

In [10]:
predictions = np.zeros((X_test.shape[0],len(target)))
for cl in range(len(target)):
    predictions[:,cl] = rf[cl].predict(X_test)
score = calc_avg_auc(Y_test,predictions)
print("Mean AUC for a set of single-target RF models is " + str(score))

Mean AUC for a set of single-target RF models is 0.770897555977


In [14]:
submission_rf = np.zeros((X.shape[0],len(target)))
for cl in range(Y_train.shape[1]):
    submission_rf[:,cl] = rf[cl].predict(X)
    joblib.dump(rf[cl],'../models/rf'+target[cl]+'.pkl')
submission_rf = pd.DataFrame(submission_rf, columns = target)
submission_rf['id'] = test['id']
submission_rf=submission_rf[['id']+target]
submission_rf.to_csv('../submissions/rf.csv', index = False)

### Single multi-output RF

In [12]:
rf_joint = RandomForestRegressor(n_estimators=130,random_state=997,min_samples_leaf=100)
rf_joint.fit(X_train,Y_train)
predictions = rf_joint.predict(X_test)
score = calc_avg_auc(Y_test,predictions)
print("Mean AUC for a single multi-target RF model is " + str(score))

Mean AUC for a single multi-target RF model is 0.777593733361


In [13]:
submission_rf_joint = rf_joint.predict(X)
submission_rf_joint = pd.DataFrame(submission_rf_joint, columns = target)
submission_rf_joint['id'] = test['id']
submission_rf_joint=submission_rf_joint[['id']+target]
submission_rf_joint.to_csv('../submissions/rf_joint.csv', index = False)
joblib.dump(rf_joint,'../models/rf_joint.pkl')

['../models/rf_joint.pkl']

## AdaBoost

In [16]:
from sklearn.ensemble import AdaBoostRegressor

In [48]:
ada = [None]*len(target)
base_estimator = DecisionTreeRegressor(min_samples_leaf=100)
for cl in range(Y_train.shape[1]):
    ada[cl] = AdaBoostRegressor(base_estimator=base_estimator,loss='linear')
    ada[cl].fit(X_train,Y_train[list(Y_train.columns)[cl]])

In [49]:
predictions = np.zeros((X_test.shape[0],len(target)))
for cl in range(len(target)):
    predictions[:,cl] = ada[cl].predict(X_test)
    joblib.dump(rf[cl],'../models/ada'+target[cl]+'.pkl')
score = calc_avg_auc(Y_test,predictions)
print("Mean AUC for a set of AdaBoost models is " + str(score))

Mean AUC for a set of AdaBoost models is 0.705682442077


In [19]:
submission_ada = np.zeros((X.shape[0],len(target)))
for cl in range(Y_train.shape[1]):
    submission_ada[:,cl] = ada[cl].predict(X)
submission_ada = pd.DataFrame(submission_ada, columns = target)
submission_ada['id'] = test['id']
submission_ada=submission_ada[['id']+target]
submission_ada.to_csv('../submissions/ada.csv', index = False)

## XGBoost

In [47]:
from xgboost import XGBClassifier

ImportError: No module named xgboost

In [None]:
xgboost = [None]*len(target)
for cl in range(Y_train.shape[1]):
    xgboost[cl] =XGBBoostClassifier()
    xgboost[cl].fit(X_train,Y_train[list(Y_train.columns)[cl]])