# Naïve Bayes

In [1]:
import pandas as pd, numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

## 1. Загрузка данных

In [2]:
# Read the training set, the test set and the sample submission, in that order.
train, test, subm = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./data/train.csv",
        "./data/test.csv",
        "./data/sample_submission.csv",
    )
)

**Посмотрим на данные:** одна строка = один комментарий + метки его принадлежности к каждому из классов.

In [3]:
# Preview the first five rows of the training frame.
train.head(n=5)

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,"Fine, I don't think blocking is a solution to ...",0,0,0,0,0,0
1,ter. v er.,1,0,1,0,1,0
2,.\n\nNeither am I. Pants suck. They should all...,1,0,1,0,0,0
3,"iYou're missing my point so badly, I can't tel...",0,0,0,0,0,0
4,photo \n\ni'll see what i can do to it. i've...,0,0,0,0,0,0


Выделим отдельную колонку `none`: она равна 1, если комментарий не относится ни к одному из классов, и 0, если относится хотя бы к одному.

In [15]:
# Names of the target columns, one binary model is trained per column later on.
label_cols = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]
# 1 minus the row-wise maximum over the label columns: with the 0/1 labels
# shown in the preview this is 1 only for rows where no label is set.
row_max = train[label_cols].max(axis=1)
train["none"] = 1 - row_max
train.head()

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,none
0,"Fine, I don't think blocking is a solution to ...",0,0,0,0,0,0,1
1,ter. v er.,1,0,1,0,1,0,0
2,.\n\nNeither am I. Pants suck. They should all...,1,0,1,0,0,0,0
3,"iYou're missing my point so badly, I can't tel...",0,0,0,0,0,0,1
4,photo \n\ni'll see what i can do to it. i've...,0,0,0,0,0,0,1


Заполняем пустые комментарии:

In [5]:
COMMENT = "comment_text"
# Assign the filled column back instead of calling fillna(inplace=True) on a
# column selection: the in-place form acts on an intermediate object, triggers
# a chained-assignment warning on recent pandas and leaves the frame unchanged
# under Copy-on-Write. Assigning back also keeps the cell idempotent.
train[COMMENT] = train[COMMENT].fillna("unknown")
test[COMMENT] = test[COMMENT].fillna("unknown")

## 2. Чистим текст

In [6]:
import re
import string

In [7]:
# Character class of every symbol that is split off as its own token.
# string.punctuation is passed through re.escape: interpolated raw, its `\]`
# pair is parsed as an escaped `]`, so the backslash itself never matched.
re_tok = re.compile(f'([{re.escape(string.punctuation)}“”¨«»®´·º½¾¿¡§£₤‘’])')


def tokenize(s):
    """Split text into tokens, treating each punctuation mark as a separate token.

    Every character matched by ``re_tok`` is surrounded with spaces, then the
    string is split on whitespace.

    Parameters
    ----------
    s : str
        Text to tokenize.

    Returns
    -------
    list of str
        Tokens in order of appearance; an empty list for empty or
        whitespace-only input.
    """
    return re_tok.sub(r' \1 ', s).split()

In [8]:
n = train.shape[0]
# TF-IDF over unigrams and bigrams produced by the notebook's own `tokenize`.
vec = TfidfVectorizer(ngram_range=(1, 2),
                      tokenizer=tokenize,
                      # token_pattern is ignored when a tokenizer is supplied;
                      # None silences the "parameter will not be used" warning.
                      token_pattern=None,
                      min_df=3,
                      max_df=0.9,
                      strip_accents="unicode",
                      # Real booleans instead of 1: scikit-learn releases with
                      # parameter validation reject integers for these flags.
                      use_idf=True,
                      smooth_idf=True,
                      sublinear_tf=True)
# The vocabulary and IDF weights are fitted on the training comments only and
# then reused unchanged to transform the test comments.
trn_term_doc = vec.fit_transform(train[COMMENT])
test_term_doc = vec.transform(test[COMMENT])

In [9]:
# Display both document-term matrices (shape and number of stored elements).
(trn_term_doc, test_term_doc)

(<22404x68177 sparse matrix of type '<class 'numpy.float64'>'
 	with 1574191 stored elements in Compressed Sparse Row format>,
 <12064x68177 sparse matrix of type '<class 'numpy.float64'>'
 	with 821804 stored elements in Compressed Sparse Row format>)

In [10]:
def pr(y_i, y, term_doc=None):
    """Smoothed per-feature sum over the rows whose label equals ``y_i``.

    Parameters
    ----------
    y_i : scalar
        Label value selecting the rows to aggregate.
    y : array-like
        Labels, one per row of the feature matrix; compared element-wise
        with ``y_i``.
    term_doc : matrix, optional
        Feature matrix to aggregate. When omitted, the notebook-level
        variable ``x`` is used, which must be defined before the call.

    Returns
    -------
    Column sums of the selected rows plus 1, divided by the number of
    selected rows plus 1.
    """
    if term_doc is None:
        # Backward-compatible fallback: the original relied on the global `x`,
        # which is only assigned in a later cell.
        term_doc = x
    mask = y == y_i
    feature_sum = term_doc[mask].sum(0)
    return (feature_sum + 1) / (mask.sum() + 1)

In [11]:
# Short aliases for the train/test document-term matrices; `x` is the name
# read by `pr` and `get_mdl`, `test_x` the one read by the prediction loop.
x, test_x = trn_term_doc, test_term_doc

In [12]:
def get_mdl(y):
    """Fit a logistic regression on features scaled by the log-count ratio.

    Parameters
    ----------
    y : object exposing ``.values``
        Labels for the rows of the notebook-level matrix ``x``; the caller
        passes one label column of ``train``.

    Returns
    -------
    tuple
        ``(m, r)``: the fitted ``LogisticRegression`` and the log-ratio ``r``
        that the features were multiplied by. The same ``r`` has to be applied
        to any matrix passed to the model for prediction.
    """
    y = y.values
    # Element-wise log of the ratio between the smoothed feature statistics
    # of the rows labelled 1 and the rows labelled 0.
    r = np.log(pr(1, y) / pr(0, y))
    # dual=True is only implemented by the liblinear solver. The solver is
    # pinned explicitly because the library default is now lbfgs, which raises
    # a ValueError for the dual formulation.
    m = LogisticRegression(C=4, dual=True, solver="liblinear")
    x_nb = x.multiply(r)
    return m.fit(x_nb, y), r

In [13]:
# One column of predicted probabilities per label, one row per test comment.
preds = np.zeros((len(test), len(label_cols)))

# Fit a separate model for every label column and store the second column of
# predict_proba for the test matrix, scaled by that model's own ratio.
for col_idx, label in enumerate(label_cols):
    print("Fitting logreg for type: {}".format(label.upper()))
    model, log_ratio = get_mdl(train[label])
    test_nb = test_x.multiply(log_ratio)
    preds[:, col_idx] = model.predict_proba(test_nb)[:, 1]
    print("Done!")

Fitting logreg for type: toxic
Done!
Fitting logreg for type: severe_toxic
Done!
Fitting logreg for type: obscene
Done!
Fitting logreg for type: threat
Done!
Fitting logreg for type: insult
Done!
Fitting logreg for type: identity_hate
Done!


In [14]:
# Predictions as a frame with one column per label, in label_cols order.
pred_frame = pd.DataFrame(preds, columns=label_cols)
# Ids are taken from the sample submission and placed as the first column.
submid = pd.DataFrame({"id": subm["id"]})
submission = pd.concat([submid, pred_frame], axis=1)
submission.to_csv("submission_NB.csv", index=False)