In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory
# Any results you write to the current directory are saved as output.

In [2]:
# Read the pre-cleaned train and test tables from the working directory.
train = pd.read_csv('./train_cleaned.csv')
test = pd.read_csv('./test_cleaned.csv')

# Names of the six label columns; `y` is the matching label matrix taken
# from the training table (one column per name, in this order).
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y = train[list_classes].values

# Comment text as arrays. Missing comments are replaced with the placeholder
# string "CVxTz" so every entry is a string.
X_train, X_test = (
    frame['comment_text'].fillna("CVxTz").values for frame in (train, test)
)

In [3]:
def metric(y_true, y_pred):
    """Return the mean of the per-column log losses.

    Both arguments are 2-D arrays of identical shape. Column ``k`` of
    ``y_true`` is scored against column ``k`` of ``y_pred`` with
    ``log_loss``, and the per-column values are averaged.

    Raises ``AssertionError`` when the two shapes differ.
    """
    assert y_true.shape == y_pred.shape
    n_cols = y_true.shape[1]
    per_column = [log_loss(y_true[:, k], y_pred[:, k]) for k in range(n_cols)]
    return np.array(per_column).mean()

In [5]:
# Character-level TF-IDF over 1- to 4-grams, capped at 50,000 features and
# ignoring n-grams that occur in fewer than two documents.
vect = TfidfVectorizer(
    analyzer='char',
    ngram_range=(1, 4),
    max_features=50000,
    min_df=2,
)
print("Vectorizing")
# The vocabulary and IDF weights are learned from the training comments only;
# the test comments are then encoded with that fitted vectorizer.
vect.fit(X_train)
X_test_dm = vect.transform(X_test)

Vectorizing


In [7]:
# Fold assignments: the 'split' column holds the fold id of each training row.
# Rows are matched to X_train by position, so both must have the same length.
mysplits = pd.read_csv('./splits.csv')
if len(mysplits) != len(X_train):
    raise ValueError(
        "splits.csv has {:d} rows but the training set has {:d} rows".format(
            len(mysplits), len(X_train)))

# Fold ids in order of first appearance in the file.
fold_ids = mysplits['split'].unique()
num_folds = fold_ids.shape[0]

# Vectorize the training comments once. The fitted TF-IDF transform encodes
# each document independently of the others, so taking rows of this matrix
# yields the same features as transforming each fold's subset separately,
# without repeating the n-gram extraction for every fold.
X_all_dm = vect.transform(X_train)
split_values = mysplits['split'].to_numpy()

X_train_dm = {}
X_val_dm = {}
# Pre-processing for fitting, memory intensive: every fold keeps its own copy
# of its training rows and of its validation rows.
for j in fold_ids:
    print("Vectoring fold {:d}".format(j))
    # Positional row numbers, so the selection does not depend on the index
    # labels of `mysplits`.
    test_index = np.flatnonzero(split_values == j)
    train_index = np.flatnonzero(split_values != j)
    X_train_dm[j] = X_all_dm[train_index]
    X_val_dm[j] = X_all_dm[test_index]

# The per-fold matrices are copies, so the full matrix is no longer needed.
del X_all_dm

Vectoring fold 2
Vectoring fold 4
Vectoring fold 5
Vectoring fold 9
Vectoring fold 8
Vectoring fold 7
Vectoring fold 6
Vectoring fold 3
Vectoring fold 0
Vectoring fold 1


In [9]:
# Per-label results, keyed by label name:
#   val_preds  - prediction for every training row, made by the model of the
#                fold in which that row was held out
#   test_preds - test predictions averaged over the per-fold models
val_preds, test_preds = {}, {}
fold_ids = mysplits['split'].unique()

for col, category in enumerate(list_classes):
    print("Working on model: {:s}".format(category))
    oof_pred = np.zeros(y.shape[0])
    test_sum = np.zeros(test.shape[0])

    for fold in fold_ids:
        val_rows = mysplits.index[mysplits['split'] == fold]
        fit_rows = mysplits.index[mysplits['split'] != fold]

        # One model per (label, fold), fitted on the rows outside the fold.
        model = LogisticRegression(C=9.0, n_jobs=-1)
        model.fit(X_train_dm[fold], y[fit_rows, col])

        # Keep the second predict_proba column (the class listed second in
        # the model's classes_; presumably label 1 for 0/1 targets).
        oof_pred[val_rows] = model.predict_proba(X_val_dm[fold])[:, 1]
        test_sum += model.predict_proba(X_test_dm)[:, 1]

    # Sum over folds first, then divide once by the number of folds.
    test_preds[category] = test_sum / num_folds
    val_preds[category] = oof_pred

Working on model: toxic
Working on model: severe_toxic
Working on model: obscene
Working on model: threat
Working on model: insult
Working on model: identity_hate


In [11]:
# Assemble the per-label vectors into 2-D matrices with one column per label,
# in the order given by list_classes.
valpred = np.stack([val_preds[name] for name in list_classes], axis=1)
testpred = np.stack([test_preds[name] for name in list_classes], axis=1)

# Mean column-wise log loss of the validation predictions; as the last
# expression of the cell its value is displayed.
metric(y, valpred)

0.050380818262615416

In [12]:
# Use the sample submission file as the template and overwrite its label
# columns with the test predictions, bounded to the interval [0, 1].
sample_submission = pd.read_csv("./sample_submission.csv")
sample_submission[list_classes] = np.clip(testpred, 0, 1)
sample_submission.to_csv("submission.csv", index=False)

In [13]:
# Reload ./train.csv (not the cleaned file used for fitting) and replace its
# label columns with the validation predictions before saving.
# NOTE(review): assumes train.csv has the same rows, in the same order, as
# train_cleaned.csv - confirm.
validation_frame = pd.read_csv("./train.csv")
validation_frame[list_classes] = valpred
validation_frame.to_csv("validation_predictions.csv", index=False)