In [31]:
import pandas as pd
import numpy as np
from sklearn.metrics import log_loss
from sklearn import linear_model
from sklearn.model_selection import KFold
from keras.models import load_model
from sklearn import preprocessing
from scipy.optimize import minimize

In [2]:
def metric(y_true, y_pred):
    """Return the log loss averaged over the columns of two 2-D arrays.

    Each column is scored independently with ``log_loss`` and the per-column
    losses are then averaged with equal weight.

    Parameters
    ----------
    y_true : 2-D array
        Ground-truth labels, indexed ``[row, column]``.
    y_pred : 2-D array
        Predicted scores with exactly the same shape as ``y_true``.

    Returns
    -------
    float
        Mean of the per-column log losses.

    Raises
    ------
    AssertionError
        If the two arrays do not have the same shape.
    """
    assert y_true.shape == y_pred.shape
    n_columns = y_true.shape[1]
    per_column = [
        log_loss(y_true[:, col], y_pred[:, col])
        for col in range(n_columns)
    ]
    return np.array(per_column).mean()

In [137]:
# Per-model prediction files. validation_names[k] and test_names[k] belong to
# the same base model and are read as a pair by the loading loop.
validation_names = [
    'val1.csv',
    'val2.csv',
    '415_val.csv',
    '250_val.csv',
    'p100_val.csv',
    'p100_val2.csv',
    '200_gru_val.csv',
    '2801_val.csv',
    'final_val.csv',
    'final2_val.csv',
]
test_names = [
    'test1.csv',
    'test2.csv',
    '415_test.csv',
    '250_test.csv',
    'p100_test.csv',
    'p100_test2.csv',
    '200_gru_test.csv',
    '2801_test.csv',
    'final_test.csv',
    'final2_test.csv',
]

# Label columns; the position in this list is the column index used into `y`.
list_classes = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]

# Ground-truth label matrix: one row per training row, one column per label.
ground_truth = pd.read_csv("../train.csv")
y = ground_truth[list_classes].values

In [138]:
# Read every base model's prediction files. train[k] and test[k] hold the
# frames loaded from validation_names[k] and test_names[k] respectively.
train, test = [], []
for val_path, test_path in zip(validation_names, test_names):
    train.append(pd.read_csv(val_path))
    test.append(pd.read_csv(test_path))

In [149]:
# Linear-regression stacker (superseded by the weight-minimisation routine
# that follows; its val_preds / test_preds are overwritten there).
# For every label, the base models' scores form the feature matrix (one column
# per model) and an ordinary least-squares model is fitted against the truth.
val_preds = {}
test_preds = {}
for i, category in enumerate(list_classes):
    print("Working on model: {:s}".format(category))
    X_train = np.column_stack([mdl[category].values for mdl in train])
    X_test = np.column_stack([mdl[category].values for mdl in test])
    clf = linear_model.LinearRegression(n_jobs=-1)
    clf.fit(X_train, y[:, i])
    # No hyperparameters are tuned, so the model is fitted on every row and
    # the "validation" prediction below is an in-sample prediction.
    valpred = clf.predict(X_train)
    testpred = clf.predict(X_test)
    test_preds[category] = testpred
    val_preds[category] = valpred

Working on model: toxic
Working on model: severe_toxic
Working on model: obscene
Working on model: threat
Working on model: insult
Working on model: identity_hate


In [223]:
def function_metric(x, label, train_index):
    """Log loss of a weighted blend of the base models on a subset of rows.

    This is the objective handed to ``scipy.optimize.minimize``. The raw
    weights are unconstrained; they are mapped onto non-negative weights that
    sum to one before blending.

    Parameters
    ----------
    x : sequence of float
        Raw weights, one per frame in the module-level ``train`` list.
        Negative entries are treated as zero and the remainder is rescaled to
        sum to one.
    label : int
        Position of the label in ``list_classes`` (and column index in ``y``).
    train_index : array-like of int
        Row positions on which the blend is evaluated.

    Returns
    -------
    float
        Log loss of the blended prediction against ``y[train_index, label]``.
        ``inf`` is returned when no weight is positive (or the weights are
        NaN): such a vector cannot be normalised, and an infinite loss lets
        the optimiser discard it instead of failing on NaN predictions.
    """
    weights = np.clip(np.asarray(x, dtype=float), 0, None)
    total = weights.sum()
    if not total > 0:
        # Covers both an all-non-positive vector (total == 0) and NaN weights.
        return np.inf
    weights = weights / total

    category = list_classes[label]
    blended = np.zeros(len(train_index))
    for weight, model_preds in zip(weights, train):
        blended += weight * model_preds[category].values[train_index]
    return log_loss(y[train_index, label], blended)

In [228]:
# Weighted-average stacker.
# For every label and every fold, blend weights are found by minimising
# function_metric on the fold's training rows with Nelder-Mead. The weights
# are then applied to the held-out rows (giving out-of-fold predictions in
# val_preds) and to the test rows (averaged over the folds in test_preds).
num_folds = 3
num_models = len(validation_names)
kf = KFold(n_splits=num_folds)

# Start from equal weights; a float array, since the optimiser works in floats.
x0 = np.ones(num_models)
val_preds = {}
test_preds = {}
for i, category in enumerate(list_classes):
    print("Working on model: {:s}".format(category))
    # One column per base model, built here so this cell does not rely on
    # variables left behind by any other cell.
    val_scores = np.column_stack([mdl[category].values for mdl in train])
    test_scores = np.column_stack([mdl[category].values for mdl in test])
    valpred = np.zeros(y.shape[0])
    testpred = np.zeros(test_scores.shape[0])
    # KFold only looks at the number of rows, so splitting on y is sufficient.
    for j, (train_index, test_index) in enumerate(kf.split(y)):
        print("Fitting fold {:d}".format(j))
        # `xatol` is the documented Nelder-Mead option; `xtol` is the
        # deprecated spelling of the same tolerance.
        res = minimize(function_metric, x0, args=(i, train_index), method='nelder-mead', options={'xatol': 1e-3, 'disp': True})
        # Apply the same mapping as function_metric: clip at zero, sum to one.
        coeffs = np.clip(res.x, 0, None)
        x_res = coeffs / coeffs.sum()

        valpred[test_index] = val_scores[test_index].dot(x_res)
        testpred += test_scores.dot(x_res)
    # Each fold contributed one full test prediction; average them.
    testpred = testpred / num_folds
    test_preds[category] = testpred
    val_preds[category] = valpred

Working on model: toxic
Fitting fold 0
Optimization terminated successfully.
         Current function value: 0.083133
         Iterations: 350
         Function evaluations: 507
Fitting fold 1
Optimization terminated successfully.
         Current function value: 0.084477
         Iterations: 463
         Function evaluations: 666
Fitting fold 2
Optimization terminated successfully.
         Current function value: 0.085127
         Iterations: 495
         Function evaluations: 708
Working on model: severe_toxic
Fitting fold 0
Optimization terminated successfully.
         Current function value: 0.022179
         Iterations: 479
         Function evaluations: 685
Fitting fold 1
Optimization terminated successfully.
         Current function value: 0.021861
         Iterations: 499
         Function evaluations: 739
Fitting fold 2
Optimization terminated successfully.
         Current function value: 0.021698
         Iterations: 503
         Function evaluations: 720
Working on mode

In [225]:
# Assemble the per-label prediction vectors into matrices whose columns follow
# list_classes, then score the validation matrix (clipped to [0, 1]) against y.
valpred = np.stack([val_preds[name] for name in list_classes], axis=1)
testpred = np.stack([test_preds[name] for name in list_classes], axis=1)
metric(y, np.clip(valpred, 0, 1))

0.038969271214682986

In [182]:
# Fill the label columns of the sample submission with the blended test
# predictions (clipped to [0, 1]) and write the result to submission.csv.
sample_submission = pd.read_csv("../sample_submission.csv")
clipped_test_predictions = np.clip(testpred, 0, 1)
sample_submission[list_classes] = clipped_test_predictions
sample_submission.to_csv("submission.csv", index=False)