In [15]:
import pandas as pd, numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [16]:
train = pd.read_csv('input/train.csv')
test = pd.read_csv('input/test.csv')
subm = pd.read_csv('input/sample_submission.csv')

In [17]:
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,22256635,"Nonsense? kiss off, geek. what I said is true...",1,0,0,0,0,0
1,27450690,"""\n\n Please do not vandalize pages, as you di...",0,0,0,0,0,0
2,54037174,"""\n\n """"Points of interest"""" \n\nI removed the...",0,0,0,0,0,0
3,77493077,Asking some his nationality is a Racial offenc...,0,0,0,0,0,0
4,79357270,The reader here is not going by my say so for ...,0,0,0,0,0,0


We'll create a list of all the labels to predict, and we'll also create a 'none' label so we can see how many comments have no labels. We can then summarize the dataset.

In [18]:
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
train['none'] = 1-train[label_cols].max(axis=1)
train.describe()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate,none
count,95851.0,95851.0,95851.0,95851.0,95851.0,95851.0,95851.0,95851.0
mean,499435900000.0,0.096368,0.010068,0.053301,0.003182,0.049713,0.008492,0.897862
std,289013600000.0,0.295097,0.099832,0.224635,0.05632,0.217352,0.091762,0.302831
min,22256640.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,247343700000.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,500129700000.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,750108800000.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,999988200000.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [19]:
len(train),len(test)

(95851, 226998)

Remove empty comments.

In [20]:
COMMENT = 'comment_text'
train[COMMENT].fillna("unknown", inplace=True)
test[COMMENT].fillna("unknown", inplace=True)

## Building the model

In [21]:
import re, string
re_tok = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')
def tokenize(s): return re_tok.sub(r' \1 ', s).split()

In [22]:
n = train.shape[0]
vec = TfidfVectorizer(ngram_range=(1,4), tokenizer=tokenize,
               min_df=5, max_df=0.95, strip_accents='unicode', use_idf=1,
               smooth_idf=1, sublinear_tf=1 )
trn_term_doc = vec.fit_transform(train[COMMENT])
test_term_doc = vec.transform(test[COMMENT])

This creates a sparse matrix with only a small number of non-zero elements (stored elements in the representation below).

In [23]:
trn_term_doc, test_term_doc

(<95851x407869 sparse matrix of type '<class 'numpy.float64'>'
 	with 14746287 stored elements in Compressed Sparse Row format>,
 <226998x407869 sparse matrix of type '<class 'numpy.float64'>'
 	with 37552267 stored elements in Compressed Sparse Row format>)

Here's the basic naive bayes feature equation:

In [24]:
def pr(y_i, y):
    p = x[y==y_i].sum(0)
    return (p+1) / ((y==y_i).sum()+1)

In [25]:
x = trn_term_doc
test_x = test_term_doc

Fit a model for one dependent at a time:

In [26]:
def get_mdl(y):
    y = y.values
    r = np.log(pr(1,y) / pr(0,y))
    m = LogisticRegression(C=4, dual=True)
    x_nb = x.multiply(r)
    return m.fit(x_nb, y), r

In [27]:
preds = np.zeros((len(test), len(label_cols)))

for i, j in enumerate(label_cols):
    print('fit', j)
    m,r = get_mdl(train[j])
    preds[:,i] = m.predict_proba(test_x.multiply(r))[:,1]

fit toxic
fit severe_toxic
fit obscene
fit threat
fit insult
fit identity_hate


In [28]:
submid = pd.DataFrame({'id': subm["id"]})
submission = pd.concat([submid, pd.DataFrame(preds, columns = label_cols)], axis=1)
submission.to_csv('nb-baseline.csv', index=False)