In [59]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, ParameterSampler
from sklearn.metrics import accuracy_score, log_loss, precision_score, recall_score

from scipy.stats.distributions import uniform

In [24]:
LABEL_COLUMNS = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [25]:
train_data = pd.read_csv('../dataset/train.csv')

In [26]:
test_data = pd.read_csv('../dataset/test.csv')

In [27]:
documents = pd.concat([train_data.comment_text.astype(str), 
                       test_data.comment_text.astype(str)], axis=0).reset_index(drop=True)

In [28]:
vec = TfidfVectorizer(ngram_range=(1,2), min_df=3, max_df=0.9, strip_accents='unicode', use_idf=1,
               smooth_idf=1, sublinear_tf=1, max_features=50000)
vec.fit(documents)
x_train = vec.transform(train_data.comment_text.astype(str))
x_test = vec.transform(test_data.comment_text.astype(str))

In [29]:
x_train_copy = x_train.copy()

In [30]:
def pr(x, y_i, y):
    
    p = x[y==y_i].sum(0)
    
    return (p+1) / ((y==y_i).sum()+1)

In [31]:
def get_mdl(x, y, c=4, dual=True):
    y = y.values
    r = np.log(pr(x, 1,y) / pr(x, 0,y))
    
    m = LogisticRegression(C=c, dual=dual)
    
    x_nb = x.multiply(r)
    
    return m.fit(x_nb, y), r

In [80]:
metrics = ['log_loss', 'accuracy', 'recall', 'precision']

def average(scores):
    length = len(scores) if len(scores) > 0 else 1
    
    return float(sum(scores)) / length

def get_label(probs):
    probs[probs >= .5 ] = 1
    probs[probs < .5] = 0
    
    return probs


def run_kfold(x, y):
    k_fold = KFold(n_splits=5, random_state=1234)
    
    fold_scores = {label_column: {metric_name: [] for metric_name in metrics}
                   for label_column in LABEL_COLUMNS}
    best_log_loss = {label_column: 1 for label_column in LABEL_COLUMNS}
    best_params = {label_column: None for label_column in LABEL_COLUMNS}
    
    param_grid = {'c': uniform(1.3, 2.4)}
    param_sampler = ParameterSampler(param_grid, n_iter=10)
    
    for i, label_column in enumerate(LABEL_COLUMNS):
        for params in param_sampler:
            for fold_i, (train_index, test_index) in enumerate(k_fold.split(x)):
    #             print('Fitting models on fold: {}'.format(fold_i + 1))

                x_train_split, y_train_split = x[train_index, :], y[label_column].iloc[train_index]
                x_test_split, y_test_split = x[test_index, :], y[label_column].iloc[test_index]

                m,r = get_mdl(x_train_split, y_train_split, c=params['c'])

                preds = m.predict_proba(x_test_split.multiply(r))[:,1]

                log_loss_score = log_loss(y_test_split, preds)

                pred_labels = get_label(preds)

                accuracy = accuracy_score(y_test_split, pred_labels)
                recall = recall_score(y_test_split, pred_labels)
                precision = precision_score(y_test_split, pred_labels)

                fold_scores[label_column]['log_loss'].append(log_loss_score)
                fold_scores[label_column]['accuracy'].append(accuracy)
                fold_scores[label_column]['recall'].append(recall)
                fold_scores[label_column]['precision'].append(precision)

            fold_avgs = {label_column: {score_name: average(fold_scores[label_column][score_name]) 
                         for score_name in metrics} for label_column in LABEL_COLUMNS}

    #         for label_column, scores in fold_avgs.items():
    #             print('-----------------------------------------')
    #             print(label_column)

#             for score_name, score_average in fold_avgs[label_column].items():
#                 print('{} average: {}'.format(score_name, score_average))
#             print('-----------------------------------------\n')

            total_avg_log_loss = average(list(map(lambda scores: scores['log_loss'], map(lambda column_name: fold_avgs[column_name], 
                                                                            LABEL_COLUMNS))))
            
            if fold_avgs[label_column]['log_loss'] < best_log_loss[label_column]:
                best_log_loss[label_column] = fold_avgs[label_column]['log_loss']
                best_params[label_column] = params
                
#             print('Average log loss on all labels: {}'.format(total_avg_log_loss))
    return best_log_loss, best_params

In [81]:
log_losses, c = run_kfold(x_train_copy, train_data)

In [82]:
avg_log_loss = average(list(map(lambda column_name: log_losses[column_name], LABEL_COLUMNS)))

In [83]:
avg_log_loss

0.05078621964141904

In [84]:
param

{'identity_hate': {'c': 1.7223574910194115},
 'insult': {'c': 1.5465714849717698},
 'obscene': {'c': 2.3246163376589211},
 'severe_toxic': {'c': 1.3421517716983811},
 'threat': {'c': 2.1944784949912961},
 'toxic': {'c': 1.8996061003708009}}

In [None]:
submission = pd.concat([test_data.id.to_frame(), pd.DataFrame(preds, columns = LABEL_COLUMNS, dtype=float)], axis=1)
submission.to_csv('logistic_submission.csv', index=False)