In [29]:
import pandas as pd

In [30]:
train_data = pd.read_csv('../dataset/preprocessed_train.csv').astype(str)

In [31]:
test_data = pd.read_csv('../dataset/preprocessed_test.csv').astype(str)

In [32]:
documents = pd.concat([train_data.comment_text, test_data.comment_text], axis=0).reset_index(drop=True)

In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [34]:
vec = TfidfVectorizer(ngram_range=(1,2), min_df=3, max_df=0.9, strip_accents='unicode', use_idf=1,
               smooth_idf=1, sublinear_tf=1 )
vec.fit(documents)
x_train = vec.transform(train_data.comment_text)
x_test = vec.transform(test_data.comment_text)

In [41]:
x = x_train.copy()

In [42]:
from sklearn.linear_model import LogisticRegression
import numpy as np

In [43]:
def get_labels(df):
    labels = [df.toxic, df.severe_toxic, df.obscene,
              df.threat, df.insult, df.identity_hate]
    labels = list(map(lambda y: y.values, labels))
    labels = np.array(labels, dtype=np.int32).transpose()

    return labels


In [44]:
y_train = get_labels(train_data)

In [45]:
def pr(y_i, y):
    p = x[y==y_i].sum(0)
    return (p+1) / ((y==y_i).sum()+1)

In [46]:
def get_mdl(y):
    y = y.values
    r = np.log(pr(1,y) / pr(0,y))
    m = LogisticRegression(C=4, dual=True)
    x_nb = x.multiply(r)
    return m.fit(x_nb, y), r

In [47]:
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

preds = np.zeros((len(test_data), len(label_cols)))

for i, j in enumerate(label_cols):
    print('fit', j)
    m,r = get_mdl(train_data[j])
    preds[:,i] = m.predict_proba(x_test.multiply(r))[:,1]

fit toxic
fit severe_toxic
fit obscene
fit threat
fit insult
fit identity_hate


In [48]:
preds[preds.shape[0] - 1, :]

array([ 0.09584751,  0.01000274,  0.05295293,  0.00300462,  0.04936855,
        0.00881226])

In [49]:
test_data.id.shape

(153164,)

In [50]:
submission = pd.concat([test_data.id.to_frame(), pd.DataFrame(preds, columns = label_cols, dtype=float)], axis=1)
submission.to_csv('logistic_submission.csv', index=False)