In [34]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import re, string

In [3]:
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')

In [4]:
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
train['none'] = 1-train[label_cols].max(axis=1)
train.describe()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate,none
count,159571.0,159571.0,159571.0,159571.0,159571.0,159571.0,159571.0
mean,0.095844,0.009996,0.052948,0.002996,0.049364,0.008805,0.898321
std,0.294379,0.099477,0.223931,0.05465,0.216627,0.09342,0.302226
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [5]:
re_tok = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')
def tokenize(s): return re_tok.sub(r' \1 ', s).split()

In [6]:
COMMENT = 'comment_text'
n = train.shape[0]
vec = TfidfVectorizer(ngram_range=(1,2), tokenizer=tokenize,
               min_df=3, max_df=0.9, strip_accents='unicode', use_idf=1,
               smooth_idf=1, sublinear_tf=1 )
trn_term_doc = vec.fit_transform(train[COMMENT])
count_vec = CountVectorizer(ngram_range=(1,2), tokenizer=tokenize,
                min_df=3, max_df=0.9, strip_accents='unicode')
trn_count = count_vec.fit_transform(train[COMMENT])

In [29]:
ind = np.where(train['toxic'] == 0)
trn_count[ind].sum(0)

(1, 426005)

In [31]:
def pr(x, y_i, y):
    p = x[y==y_i].sum(0)
    return (p+1) / ((y==y_i).sum()+1)

In [88]:
def get_model(y):
    y = y.values
    r = np.log(pr(trn_term_doc, 1, y) / pr(trn_term_doc, 0, y))
    m = LogisticRegression(C=4, dual=True)
    x_nb = trn_term_doc.multiply(r)
    return m.fit(x_nb, y)

In [89]:
models = []
for i, j in enumerate(label_cols):
    print('fit', j)
    model = get_model(train[j])
    models.append(model)

fit toxic
fit severe_toxic
fit obscene
fit threat
fit insult
fit identity_hate


In [106]:
s = "Whatever."
s_count = vec.transform([s])

[print(label_cols[i], models[i].predict_proba(s_count)[0][1]) for i in range(len(label_cols))]

toxic 0.846742683244
severe_toxic 0.00387434664072
obscene 0.120162241449
threat 0.00019864246473
insult 0.171090153144
identity_hate 0.00374191057119


[None, None, None, None, None, None]