In [23]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV, KFold
from sklearn.metrics import accuracy_score, log_loss, precision_score, recall_score

In [24]:
# Target column names; the notebook fits and scores one model per entry.
LABEL_COLUMNS = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

In [25]:
# Load the training split from CSV (path is relative to the working directory);
# later cells read its `comment_text` column.
train_data = pd.read_csv('../dataset/train.csv')

In [26]:
# Load the test split from CSV (path is relative to the working directory);
# later cells read its `comment_text` and `id` columns.
test_data = pd.read_csv('../dataset/test.csv')

In [27]:
# Stack the train and test comments (cast to str) into one Series with a
# fresh 0..n-1 index; this combined corpus is what the vectorizer is fitted on.
documents = pd.concat(
    [frame.comment_text.astype(str) for frame in (train_data, test_data)],
    axis=0,
    ignore_index=True,
)

In [28]:
# TF-IDF over unigrams and bigrams, capped at 50,000 features.
# The on/off options are passed as real booleans: scikit-learn releases that
# validate constructor parameters do not accept the integer 1 for
# use_idf / smooth_idf / sublinear_tf.
vec = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=3,
    max_df=0.9,
    strip_accents='unicode',
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
    max_features=50000,
)

# Vocabulary and IDF weights are learned from the combined train + test text
# (no labels involved); each split is then projected into that feature space.
vec.fit(documents)
x_train = vec.transform(train_data.comment_text.astype(str))
x_test = vec.transform(test_data.comment_text.astype(str))

In [29]:
# Keep a separate copy of the vectorized training matrix.
x_train_copy = x_train.copy()

In [30]:
def pr(x, y_i, y):
    """Smoothed per-feature average of ``x`` over the rows labelled ``y_i``.

    The columns of ``x`` are summed across the rows where ``y == y_i`` and
    divided by the number of such rows, with one added to both the sums and
    the row count.
    """
    class_rows = (y == y_i)
    feature_totals = x[class_rows].sum(0)
    return (feature_totals + 1) / (class_rows.sum() + 1)

In [31]:
def get_mdl(x, y, c=4, dual=True):
    """Fit a logistic regression on log-count-ratio-scaled features.

    Parameters
    ----------
    x : feature matrix exposing ``.multiply`` and boolean row indexing
        (the notebook passes the sparse TF-IDF matrix).
    y : labels for one target, row-aligned with ``x``; a pandas Series or
        any array-like of 0/1 values.
    c : inverse regularisation strength passed to ``LogisticRegression``.
    dual : whether to solve the dual formulation.

    Returns
    -------
    (model, r) : the fitted ``LogisticRegression`` and the per-feature log
        ratio ``r``. Data to be scored must be scaled with the same ``r``
        (``x_new.multiply(r)``) before calling the model.
    """
    # np.asarray accepts a Series (same result as ``.values``) as well as
    # plain arrays and lists.
    y = np.asarray(y)

    # Per-feature log ratio of the smoothed averages for class 1 vs class 0.
    r = np.log(pr(x, 1, y) / pr(x, 0, y))

    # The dual formulation is only implemented by the liblinear solver, so it
    # is selected explicitly instead of relying on the library default, which
    # is no longer liblinear in current scikit-learn releases.
    m = LogisticRegression(C=c, dual=dual, solver='liblinear')

    x_nb = x.multiply(r)

    return m.fit(x_nb, y), r

In [None]:
def average(scores):
    """Return the arithmetic mean of ``scores`` as a float."""
    total = float(sum(scores))
    return total / len(scores)

def get_label(probs):
    """Threshold probabilities at 0.5 and return hard 0/1 labels.

    Values ``>= 0.5`` map to 1 and everything else to 0. The result is a new
    array with the dtype of the input; ``probs`` itself is left unmodified so
    the caller can keep using the raw probabilities.
    """
    import numpy as np

    probs = np.asarray(probs)
    return (probs >= .5).astype(probs.dtype)

def run_kfold(x, y, n_splits=5, random_state=1234):
    """Cross-validate the per-label models and report fold-averaged metrics.

    For every fold and every column in ``LABEL_COLUMNS`` a model is fitted
    with ``get_mdl`` on the training rows and scored on the held-out rows.

    Parameters
    ----------
    x : feature matrix supporting ``x[rows, :]`` and ``.multiply``
        (the notebook passes the sparse TF-IDF matrix).
    y : pandas DataFrame with one 0/1 column per entry of ``LABEL_COLUMNS``,
        row-aligned with ``x``.
    n_splits : number of cross-validation folds.
    random_state : seed for shuffling rows into folds.

    Returns
    -------
    dict mapping each label to a dict with the keys ``'log_loss'``,
    ``'accuracy'``, ``'recall'`` and ``'precision'``, each holding the mean
    of that metric over the folds. The same numbers are printed.
    """
    metric_names = ('log_loss', 'accuracy', 'recall', 'precision')

    # A seed only has an effect together with shuffle=True; current
    # scikit-learn raises if random_state is set while shuffle is False.
    k_fold = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    fold_scores = {label_column: {name: [] for name in metric_names}
                   for label_column in LABEL_COLUMNS}

    # Split the matrix that was passed in rather than a notebook global.
    for fold_i, (train_index, test_index) in enumerate(k_fold.split(x)):
        print('Fitting models on fold: {}'.format(fold_i + 1))

        # The feature slices are the same for every label, so take them once.
        x_train_split = x[train_index, :]
        x_test_split = x[test_index, :]

        for label_column in LABEL_COLUMNS:
            y_train_split = y[label_column].iloc[train_index]
            y_test_split = y[label_column].iloc[test_index]

            m, r = get_mdl(x_train_split, y_train_split)

            preds = m.predict_proba(x_test_split.multiply(r))[:, 1]

            # Log loss needs the raw probabilities, so it is computed before
            # thresholding. Passing the label set keeps it defined even when
            # a held-out fold happens to contain a single class.
            log_loss_score = log_loss(y_test_split, preds, labels=[0, 1])

            pred_labels = get_label(preds)

            fold_scores[label_column]['log_loss'].append(log_loss_score)
            fold_scores[label_column]['accuracy'].append(
                accuracy_score(y_test_split, pred_labels))
            fold_scores[label_column]['recall'].append(
                recall_score(y_test_split, pred_labels))
            fold_scores[label_column]['precision'].append(
                precision_score(y_test_split, pred_labels))

    fold_avgs = {label_column: {name: average(fold_scores[label_column][name])
                                for name in metric_names}
                 for label_column in LABEL_COLUMNS}

    for label_column, score in fold_avgs.items():
        print('{} average:'.format(label_column))
        print('Log loss: {}\nAccuracy: {}\nRecall: {}\nPrecision: {}'.format(
            score['log_loss'], score['accuracy'], score['recall'], score['precision']
        ))
        print('-------------------------------------------------')

    return fold_avgs

 
                 

Fitting models on fold: 1
Fitting models on fold: 2
Fitting models on fold: 3
Fitting models on fold: 4
Fitting models on fold: 5


In [None]:
# Build the test-set prediction matrix here: the only other `preds` in the
# notebook is a local variable inside run_kfold, so nothing with that name
# exists at notebook scope on a freshly restarted kernel.
# One model per label is fitted on the full training set and used to score
# the test set. Assumes train_data has one column per LABEL_COLUMNS entry --
# TODO confirm against the CSV header.
preds = np.zeros((len(test_data), len(LABEL_COLUMNS)))
for i, label_column in enumerate(LABEL_COLUMNS):
    m, r = get_mdl(x_train, train_data[label_column])
    # Column 1 of predict_proba is the probability of the positive class.
    preds[:, i] = m.predict_proba(x_test.multiply(r))[:, 1]

# One row per test id, followed by one probability column per label.
submission = pd.concat([test_data.id.to_frame(), pd.DataFrame(preds, columns = LABEL_COLUMNS, dtype=float)], axis=1)
submission.to_csv('logistic_submission.csv', index=False)