In [1]:
import sys
import os
import pandas as pd
import numpy as np
from sklearn.externals import joblib

sys.path.insert(1, '..')

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
from sklearn.ensemble import GradientBoostingClassifier, ExtraTreesClassifier, RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from lightgbm import LGBMClassifier

In [None]:
from scipy.sparse import hstack

# Functions and constants

In [3]:
TARGETS = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [4]:
def transform_preds_to_df(preds):
    target_probs = pd.DataFrame([[c[1] for c in preds[row]] for row in range(len(preds))]).T
    target_probs.columns = TARGETS
    return target_probs

In [5]:
def multi_roc_auc(y_true, y_score, verbose=False):
    ''' Compute roc auc for each target and then average them
    y_true - dataframe of true targets
    y_score - dataframe of predicted target
    '''
    roc_scores = dict()
    for target in TARGETS:
        roc_score = roc_auc_score(y_true=y_true[target], y_score=y_score[target])
        roc_scores[target] = roc_score

    mean_roc_score = np.mean(list(roc_scores.values()))
    
    if verbose: 
        print('Mean ROC AUC overall all targets: {}'.format(mean_roc_score))
    
    return mean_roc_score, roc_scores

In [6]:
def train_and_test_model(model, train_text, train_targets, test_text, test_targets):
    
    # Train model
    model.fit(train_text, train_targets)
    
    # Predict
    preds_train = model.predict_proba(train_text)
    preds_test = model.predict_proba(test_text)
    
    # Transform predictions to dataframes
    preds_train_df = transform_preds_to_df(preds_train)
    preds_test_df = transform_preds_to_df(preds_test)
    
    # Compute metrics
    mean_roc_score_train, roc_scores_train = multi_roc_auc(y_score=preds_train_df.loc[:,TARGETS], y_true=train_targets)
    mean_roc_score_test, roc_scores_test = multi_roc_auc(y_score=preds_test_df.loc[:,TARGETS], y_true=test_targets)
    print('Train score: {} Test score: {}'.format(mean_roc_score_train, mean_roc_score_test))
    print('Individual train score: {} Individual test score: {}'.format(roc_scores_train, roc_scores_test))
    
    return preds_train_df, preds_test_df

# Load data

In [7]:
data_dir = '../data/external/'

In [8]:
train_df = pd.read_csv(data_dir + 'train.csv').fillna(' ')
test_df = pd.read_csv(data_dir + 'test.csv').fillna(' ')

In [9]:
val_df = train_df.head(1000)

In [39]:
# Create a validation set
train_df, val_df = train_test_split(train_df, test_size=0.3, random_state=42)

# Transform text to features

In [10]:
all_text = pd.concat([train_df.comment_text, test_df.comment_text])

In [11]:
tfidf = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    stop_words='english',
    ngram_range=(1, 1),
    max_features=10000)

tfidf.fit(all_text)

train_text = tfidf.transform(train_df['comment_text'])
val_text = tfidf.transform(val_df['comment_text'])
test_text = tfidf.transform(test_df['comment_text'])

In [12]:
char_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='char',
    stop_words='english',
    ngram_range=(2, 6),
    max_features=50000)
char_vectorizer.fit(all_text)
train_char_features = char_vectorizer.transform(train_df['comment_text'])
test_char_features = char_vectorizer.transform(test_df['comment_text'])

In [15]:
train_features = hstack([train_char_features, train_text])
test_features = hstack([test_char_features, test_text])

In [17]:
train_features

<159571x60000 sparse matrix of type '<class 'numpy.float64'>'
	with 162480170 stored elements in COOrdinate format>

# Train model and score on train and validation set

In [12]:
model = MultiOutputClassifier(LogisticRegression(), n_jobs=-1)
model.fit(tra, train_df[TARGETS])

MultiOutputClassifier(estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
           n_jobs=-1)

In [27]:
NB = MultiOutputClassifier(MultinomialNB(), n_jobs=-1)
NB.fit(X=train_text, y=train_df[TARGETS])

MultiOutputClassifier(estimator=MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True),
           n_jobs=-1)

In [13]:
preds_train = model.predict_proba(train_text)
preds_val = model.predict_proba(val_text)
preds_test = model.predict_proba(test_text)

In [14]:
preds_train_df = transform_preds_to_df(preds_train)
preds_val_df = transform_preds_to_df(preds_val)
preds_test_df = transform_preds_to_df(preds_test)

In [23]:
fname = '../data/processed/logistic_regression.csv'
preds_test_df.merge(test_df.id.to_frame(), left_index=True, right_index=True).to_csv(fname, index=False)

In [18]:
mean_roc_score, roc_scores = multi_roc_auc(y_score=preds_train_df.loc[:,TARGETS], y_true=train_df.loc[:,TARGETS])
mean_roc_score, roc_scores = multi_roc_auc(y_score=preds_val_df.loc[:,TARGETS], y_true=val_df.loc[:,TARGETS])

# Train multiple models and score

In [19]:
models = {
          'LogisticRegression': MultiOutputClassifier(LogisticRegression(), n_jobs=-1),
          'NaiveBayes':  MultiOutputClassifier(MultinomialNB(), n_jobs=-1),
          'lgb': MultiOutputClassifier(LGBMClassifier(), n_jobs=-1)
          }

for model_name, model in models.items():
    print('\n Training and testing: {}'.format(model))
    preds_train_df, preds_test_df = train_and_test_model(model=model, train_text=train_features, 
                                                         train_targets=train_df.loc[:,TARGETS],
                                                         test_text=train_features, 
                                                         test_targets=train_df.loc[:,TARGETS])


 Training and testing: MultiOutputClassifier(estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
           n_jobs=-1)


KeyboardInterrupt: 

In [67]:
output = []
for model in models.values():
    print(model)
    preds = model.predict_proba(test_text)
    preds_test_df = transform_preds_to_df(preds)
    output.append(preds_test_df)

MultiOutputClassifier(estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
           n_jobs=-1)
MultiOutputClassifier(estimator=MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True),
           n_jobs=-1)
MultiOutputClassifier(estimator=LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        learning_rate=0.1, max_depth=-1, min_child_samples=20,
        min_child_weight=0.001, min_split_gain=0.0, n_estimators=100,
        n_jobs=-1, num_leaves=31, objective=None, random_state=None,
        reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
        subsample_for_bin=200000, subsample_freq=1),
           n_jobs=-1)


In [68]:
df = pd.DataFrame()
for c in ['toxic','severe_toxic','obscene','threat','insult','identity_hate']:
    df[c] = (0.5*output[0][c]) + (0.5*output[2][c])

In [69]:
fname = '../data/processed/logistic_regression_lgb_ensemble.csv'
df.merge(test_df.id.to_frame(), left_index=True, right_index=True).to_csv(fname, index=False)

In [71]:
preds_test_df.head()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0.996395,0.291054,0.981759,0.04545,0.949492,0.159204
1,0.026924,0.00182,0.010981,0.000453,0.015556,0.002333
2,0.032826,0.000956,0.009129,0.000321,0.011835,0.002333
3,0.01192,0.00051,0.004267,0.000474,0.003957,0.000624
4,0.038521,0.000778,0.005599,0.000968,0.012025,0.001191


In [70]:
df.head()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0.993771,0.228539,0.982597,0.033233,0.936887,0.172757
1,0.022444,0.002986,0.012084,0.001366,0.014276,0.003817
2,0.037424,0.002999,0.014147,0.001341,0.016589,0.004061
3,0.009789,0.001765,0.004981,0.001205,0.005006,0.001562
4,0.041527,0.002282,0.010455,0.001611,0.015552,0.002338


# Output predictions for submission

In [80]:
output_df = pd.concat([test_df['id'], preds_test_df], axis=1)
output_df.head()

In [81]:
output_df.to_csv('../data/processed/submission_baseline.csv')

In [43]:
joblib.dump(model, 'logistic_baseline.pickle')

['logistic_baseline.pickle']