In [1]:
import pandas as pd
from keras.preprocessing.text import text_to_word_sequence

Using TensorFlow backend.


In [2]:
def filter_documents(documents):
    """Return a list with one whitespace-normalised string per input document.

    Each document is tokenised with Keras' ``text_to_word_sequence`` (called
    with its default arguments) and the resulting tokens are re-joined with a
    single space.

    Parameters
    ----------
    documents : iterable of str
        Raw documents, e.g. a pandas Series of comments.

    Returns
    -------
    list of str
        Cleaned documents, in the same order as the input.
    """
    cleaned = []
    for document in documents:
        tokens = text_to_word_sequence(document)
        cleaned.append(' '.join(tokens))
    return cleaned

In [3]:
# Load the preprocessed training set; rows containing any missing value are discarded.
train_data = (
    pd.read_csv('../dataset/preprocessed_train.csv')
    .dropna()
)

In [4]:
# Load the preprocessed test set; every row is kept and missing values become ''.
test_data = (
    pd.read_csv('../dataset/preprocessed_test.csv')
    .fillna('')
)

In [5]:
# Stack the train and test comments into a single Series with a fresh 0..n-1 index.
documents = pd.concat(
    [train_data.comment_text, test_data.comment_text],
    axis=0,
    ignore_index=True,
)

In [6]:
# Normalise the combined train+test corpus with filter_documents.
# NOTE(review): parsed_documents is not referenced by any later cell visible
# here (the vectorizer is fit on parsed_train only) — confirm it is still needed,
# since this tokenises the whole corpus an extra time.
parsed_documents = filter_documents(documents)

In [7]:
# Normalise the train and test comments separately (train first, then test).
parsed_train, parsed_test = (
    filter_documents(frame.comment_text)
    for frame in (train_data, test_data)
)

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [9]:
# TF-IDF features over unigrams and bigrams. The vocabulary and IDF weights are
# learned from the training comments only; the test comments are merely
# transformed with the fitted vectorizer.
# The three flags are passed as real booleans: recent scikit-learn releases
# validate constructor parameters and reject ints for boolean options.
vec = TfidfVectorizer(ngram_range=(1, 2), min_df=3, max_df=0.9, strip_accents='unicode',
                      use_idf=True, smooth_idf=True, sublinear_tf=True)
# fit_transform gives the same matrix as fit() followed by transform() on the
# same data, without analysing the training corpus twice.
x_train = vec.fit_transform(parsed_train)
x_test = vec.transform(parsed_test)

In [10]:
# `x` is the module-level training feature matrix that pr() and get_mdl() read
# implicitly; it is a copy, so it is a separate object from x_train.
x = x_train.copy()

In [12]:
from sklearn.linear_model import LogisticRegression
import numpy as np

In [13]:
def get_labels(df):
    """Collect the six label columns of ``df`` into one int32 matrix.

    Parameters
    ----------
    df : pandas.DataFrame
        Must expose the columns ``toxic``, ``severe_toxic``, ``obscene``,
        ``threat``, ``insult`` and ``identity_hate``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(df), 6)`` and dtype ``int32``; columns follow
        the order listed above.
    """
    column_names = ('toxic', 'severe_toxic', 'obscene',
                    'threat', 'insult', 'identity_hate')
    # One row per label first, then transpose so that rows correspond to samples.
    per_label = [getattr(df, name).values for name in column_names]
    return np.array(per_label, dtype=np.int32).T


In [14]:
# Label matrix built from the six label columns of train_data.
# NOTE(review): y_train is not used by the later cells visible here — the
# training loop passes train_data[j] to get_mdl directly. Confirm it is needed.
y_train = get_labels(train_data)

In [15]:
def pr(y_i, y, features=None):
    """Smoothed per-feature average over the rows whose label equals ``y_i``.

    Computes ``(column sums of the selected rows + 1) / (number of selected
    rows + 1)``.

    Parameters
    ----------
    y_i : scalar
        Label value selecting the rows (the notebook uses 0 and 1).
    y : numpy.ndarray
        One label per row of the feature matrix.
    features : matrix-like, optional
        Feature matrix supporting boolean-mask row selection and ``.sum(0)``.
        Defaults to the notebook-level global ``x``, which keeps existing
        two-argument calls working while letting callers avoid hidden state.

    Returns
    -------
    The smoothed ratio with one entry per feature; its container type follows
    whatever ``features[mask].sum(0)`` returns.
    """
    if features is None:
        features = x
    # Build the mask once; it drives both the row selection and the row count.
    selected = y == y_i
    p = features[selected].sum(0)
    return (p + 1) / (selected.sum() + 1)

In [16]:
def get_mdl(y):
    """Fit a logistic regression for one binary label on ratio-scaled features.

    The global training matrix ``x`` is multiplied feature-wise by
    ``r = log(pr(1, y) / pr(0, y))`` before fitting (this looks like the
    NB-LR / NBSVM-style feature scaling — confirm against the intended method).

    Parameters
    ----------
    y : pandas.Series or array-like
        Binary labels, one per row of the global ``x``.

    Returns
    -------
    (model, r)
        The fitted ``LogisticRegression`` and the log-ratio ``r``; the same
        ``r`` must be applied to any matrix passed to the model at predict time.
    """
    # Accept a Series (what the notebook passes) as well as a plain array.
    y = np.asarray(y)
    r = np.log(pr(1, y) / pr(0, y))
    # dual=True is only supported by the liblinear solver. liblinear used to be
    # the default; newer scikit-learn defaults to lbfgs, which raises on
    # dual=True, so the solver is pinned explicitly.
    m = LogisticRegression(C=4, dual=True, solver='liblinear')
    x_nb = x.multiply(r)
    return m.fit(x_nb, y), r

In [17]:
# Train one model per label and store its positive-class probabilities for the
# test set in the matching column of `preds`.
label_cols = [
    'toxic', 'severe_toxic', 'obscene',
    'threat', 'insult', 'identity_hate',
]

preds = np.zeros((len(test_data), len(label_cols)))

for i, j in enumerate(label_cols):
    print('fit', j)
    m, r = get_mdl(train_data[j])
    # Scale the test features with the same ratio that was used for training.
    x_test_nb = x_test.multiply(r)
    positive_proba = m.predict_proba(x_test_nb)[:, 1]
    preds[:, i] = positive_proba

fit toxic
fit severe_toxic
fit obscene
fit threat
fit insult
fit identity_hate


In [18]:
# Sanity check: predicted probabilities for the last test row.
preds[-1, :]

array([  1.07140432e-03,   1.97232897e-04,   1.63605989e-03,
         8.83499328e-05,   1.78076485e-03,   2.61363061e-04])

In [19]:
# Sanity check: number of test ids that will appear in the submission.
test_data['id'].shape

(226998,)

In [20]:
# Put the id column next to the per-label probabilities and write the CSV.
# The two frames are joined column-wise, which aligns them on their row index.
id_frame = test_data.id.to_frame()
pred_frame = pd.DataFrame(preds, columns=label_cols, dtype=float)
submission = pd.concat([id_frame, pred_frame], axis=1)
submission.to_csv('logistic_submission.csv', index=False)