In [96]:
import sys
import os
import pandas as pd
import numpy as np
from sklearn.externals import joblib

sys.path.insert(1, '..')

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [97]:
from sklearn.ensemble import GradientBoostingClassifier, ExtraTreesClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

# Functions and constants

In [98]:
# Label columns that are modelled and scored; the order here is also the
# column order given to the prediction dataframes.
TARGETS = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

In [99]:
def transform_preds_to_df(preds, columns=None):
    ''' Turn per-target class probabilities into one dataframe column per target
    preds - sequence with one entry per target; each entry is a 2-D array-like
            (one row per sample) whose column 1 is kept (for 0/1 labels that
            is the probability of class 1)
    columns - names for the resulting columns, in the same order as preds;
              defaults to TARGETS

    Returns a dataframe with one row per sample and one column per target.
    Raises ValueError when preds and columns have different lengths.
    '''
    if columns is None:
        columns = TARGETS
    columns = list(columns)

    if len(preds) != len(columns):
        raise ValueError('Got predictions for {} targets but {} column names'.format(len(preds), len(columns)))

    # Slice column 1 of each target's array in one vectorised step instead of
    # walking every sample in a Python loop.
    target_probs = pd.DataFrame(
        {name: np.asarray(probs)[:, 1] for name, probs in zip(columns, preds)},
        columns=columns,
    )
    return target_probs

In [100]:
def multi_roc_auc(y_true, y_score, verbose=False):
    ''' Score each target in TARGETS with ROC AUC and average the results
    y_true - true targets, indexable by target name (e.g. a dataframe)
    y_score - predicted scores, indexable by target name (e.g. a dataframe)
    verbose - when truthy, print the averaged score

    Returns (mean of the per-target scores, dict of target name -> score)
    '''
    per_target = {
        name: roc_auc_score(y_true=y_true[name], y_score=y_score[name])
        for name in TARGETS
    }
    average = np.mean(list(per_target.values()))

    if verbose:
        print('Mean ROC AUC overall all targets: {}'.format(average))

    return average, per_target

In [101]:
def train_and_test_model(model, train_text, train_targets, test_text, test_targets):
    ''' Fit a model, then print its mean and per-target ROC AUC on both splits
    model - estimator providing fit and predict_proba
    train_text, test_text - feature inputs handed to the model
    train_targets, test_targets - true targets, indexable by the names in TARGETS

    Returns (train predictions dataframe, test predictions dataframe)
    '''
    model.fit(train_text, train_targets)

    # Raw predict_proba output for both splits, then one column per target
    raw_train, raw_test = model.predict_proba(train_text), model.predict_proba(test_text)
    preds_train_df, preds_test_df = transform_preds_to_df(raw_train), transform_preds_to_df(raw_test)

    # Mean and per-target ROC AUC for each split
    train_mean, train_by_target = multi_roc_auc(y_true=train_targets, y_score=preds_train_df[TARGETS])
    test_mean, test_by_target = multi_roc_auc(y_true=test_targets, y_score=preds_test_df[TARGETS])

    print('Train score: {} Test score: {}'.format(train_mean, test_mean))
    print('Individual train score: {} Individual test score: {}'.format(train_by_target, test_by_target))

    return preds_train_df, preds_test_df

# Load data

In [102]:
# Directory holding train.csv and test.csv, relative to the notebook's working
# directory. The trailing slash matters: file names are appended by plain
# string concatenation.
data_dir = '../data/external/'

In [103]:
# Read the labelled training set and the unlabelled test set from data_dir.
train_df, test_df = (
    pd.read_csv(data_dir + file_name) for file_name in ('train.csv', 'test.csv')
)

In [104]:
# Hold out 30% of the labelled rows as a validation set (fixed seed, so the
# split is repeatable).
# NOTE: train_df is rebound to the remaining 70%, so re-running this cell on
# its own splits the already-reduced frame again; re-run the load cell first.
train_df, val_df = train_test_split(
    train_df,
    test_size=0.3,
    random_state=42,
)

# Transform text to features

In [105]:
# Learn the TF-IDF vocabulary (English stop words dropped, unigrams and
# bigrams, max_features=800000) from the training comments only, then apply
# the fitted vectorizer unchanged to the validation and test comments.
tfidf = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    max_features=800000,
)
train_text = tfidf.fit_transform(train_df['comment_text'])
val_text, test_text = (
    tfidf.transform(frame['comment_text']) for frame in (val_df, test_df)
)

# Train model and score on train and validation set

In [106]:
# Baseline: a default LogisticRegression wrapped in MultiOutputClassifier so
# that all TARGETS columns are fitted from the same TF-IDF features.
model = MultiOutputClassifier(estimator=LogisticRegression(), n_jobs=-1)
model.fit(train_text, train_df.loc[:, TARGETS])

MultiOutputClassifier(estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
           n_jobs=-1)

In [107]:
# Class probabilities from the baseline model for the train, validation and
# test features, in that order.
preds_train, preds_val, preds_test = (
    model.predict_proba(features) for features in (train_text, val_text, test_text)
)

In [108]:
# Convert each split's raw predict_proba output into a dataframe with one
# column per target.
preds_train_df, preds_val_df, preds_test_df = map(
    transform_preds_to_df, (preds_train, preds_val, preds_test)
)

In [109]:
# Score the baseline on both splits. Each split gets its own names so the
# training scores are not overwritten by the validation scores.
mean_roc_score_train, roc_scores_train = multi_roc_auc(y_score=preds_train_df.loc[:,TARGETS], y_true=train_df.loc[:,TARGETS])
mean_roc_score_val, roc_scores_val = multi_roc_auc(y_score=preds_val_df.loc[:,TARGETS], y_true=val_df.loc[:,TARGETS])

# Keep the original names bound to the validation results, as they were
# after this cell ran before.
mean_roc_score, roc_scores = mean_roc_score_val, roc_scores_val

# Report the scores; without this the cell computes them but shows nothing.
print('Train score: {} Validation score: {}'.format(mean_roc_score_train, mean_roc_score_val))
print('Individual train score: {} Individual validation score: {}'.format(roc_scores_train, roc_scores_val))

# Train multiple models and score

In [92]:
# Candidate models to compare, keyed by a short name.
# The forest gets a fixed seed so its scores are repeatable between runs.
models = {
          'LogisticRegression': MultiOutputClassifier(LogisticRegression(), n_jobs=-1),
          'RandomForest':  MultiOutputClassifier(RandomForestClassifier(n_estimators=10, random_state=42), n_jobs=-1)
          }

# (train predictions, validation predictions) for each candidate, by name.
model_preds = {}

# The loop variable is deliberately not called `model`, and the results are
# not unpacked into `preds_train_df` / `preds_test_df`: those names hold the
# baseline model and its predictions, which the submission and model-dump
# cells below rely on. Rebinding them here would make those cells use the
# last candidate's validation predictions and the last candidate model.
for model_name, candidate in models.items():
    print('\n Training and testing: {}'.format(candidate))
    model_preds[model_name] = train_and_test_model(model=candidate, train_text=train_text,
                                                   train_targets=train_df.loc[:,TARGETS],
                                                   test_text=val_text,
                                                   test_targets=val_df.loc[:,TARGETS])

Training and testing: MultiOutputClassifier(estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
           n_jobs=-1)
Train score: 0.9934007237257818 Test score: 0.9756044351143994
Individual train score: {'toxic': 0.9934007237257818, 'severe_toxic': 0.9934007237257818, 'obscene': 0.9934007237257818, 'threat': 0.9934007237257818, 'insult': 0.9934007237257818, 'identity_hate': 0.9934007237257818} Individual test score: {'toxic': 0.9756044351143994, 'severe_toxic': 0.9756044351143994, 'obscene': 0.9756044351143994, 'threat': 0.9756044351143994, 'insult': 0.9756044351143994, 'identity_hate': 0.9756044351143994}
Training and testing: MultiOutputClassifier(estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, ma

# Output predictions for submission

In [80]:
# pd.concat(axis=1) aligns on the index and pads mismatches with NaN, so a
# predictions frame from a different split would silently produce a broken
# submission. Fail loudly instead.
assert len(preds_test_df) == len(test_df), (
    'preds_test_df has {} rows but test_df has {}; '
    'were the test-set predictions overwritten?'.format(len(preds_test_df), len(test_df))
)

# Pair each test id with its predicted probabilities by position.
output_df = pd.concat(
    [test_df['id'].reset_index(drop=True), preds_test_df.reset_index(drop=True)],
    axis=1,
)
output_df.head()

In [81]:
# index=False: by default to_csv also writes the row index as an extra,
# unnamed first column, which is not part of the id + target columns.
output_df.to_csv('../data/processed/submission_baseline.csv', index=False)

In [43]:
# Persist the logistic-regression model by looking it up by name rather than
# through `model`, which may have been rebound by the comparison loop above
# and would then save a different estimator under this file name.
joblib.dump(models['LogisticRegression'], 'logistic_baseline.pickle')

['logistic_baseline.pickle']