https://www.kaggle.com/jhoward/nb-svm-strong-linear-baseline

Apparently, some motivation is here: https://nlp.stanford.edu/pubs/sidaw12_simple_sentiment.pdf

In [1]:
SUBMISSION_FILE = 'nb_svm_sklearny_submission.csv'

### Make a sklearny classifier
https://www.kaggle.com/jhoward/nb-svm-strong-linear-baseline/notebook#261316

In [2]:
import pandas as pd
import numpy as np

from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_is_fitted
from sklearn.linear_model import LogisticRegression
from scipy import sparse

In [3]:
class NbSvmClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, C=1.0, dual=False, n_jobs=1):
        self.C = C
        self.dual = dual
        self.n_jobs = n_jobs

    def predict(self, x):
        # Verify that model has been fit
        check_is_fitted(self, ['_r', '_clf'])
        return self._clf.predict(x.multiply(self._r))

    def predict_proba(self, x):
        # Verify that model has been fit
        check_is_fitted(self, ['_r', '_clf'])
        return self._clf.predict_proba(x.multiply(self._r))

    def fit(self, x, y):
        # Check that X and y have correct shape
        y = y.values
        x, y = check_X_y(x, y, accept_sparse=True)

        def pr(x, y_i, y):
            p = x[y==y_i].sum(0)
            return (p+1) / ((y==y_i).sum()+1)

        self._r = sparse.csr_matrix(np.log(pr(x,1,y) / pr(x,0,y)))
        x_nb = x.multiply(self._r)
        self._clf = LogisticRegression(C=self.C, dual=self.dual, n_jobs=self.n_jobs).fit(x_nb, y)
        return self

### Other stuff

In [4]:
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')
sample_submission = pd.read_csv('../data/sample_submission.csv')

In [5]:
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [6]:
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
train['none'] = 1 - train[label_cols].max(axis=1)
train.describe()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate,none
count,159571.0,159571.0,159571.0,159571.0,159571.0,159571.0,159571.0
mean,0.095844,0.009996,0.052948,0.002996,0.049364,0.008805,0.898321
std,0.294379,0.099477,0.223931,0.05465,0.216627,0.09342,0.302226
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [7]:
len(train), len(test)

(159571, 153164)

In [8]:
# train['comment_text'].fillna('unknown', inplace=True)
# train['comment_text'].fillna('unknown', inplace=True)

### Vectorize to bag of words

In [9]:
import re, string

re_tok = re.compile('([{punc}“”¨«»®´·º½¾¿¡§£₤‘’])'.format(punc=string.punctuation))

def tokenize(s):
    return re_tok.sub(r' \1 ', s).split()

In [10]:
%%time
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

vec = TfidfVectorizer(
    ngram_range=(1, 2), 
    tokenizer=tokenize, 
    min_df=3, 
    max_df=0.9,
    strip_accents='unicode', 
    sublinear_tf=True)

trn_term_doc = vec.fit_transform(train['comment_text'])
test_term_doc = vec.transform(test['comment_text'])

CPU times: user 1min 11s, sys: 1.09 s, total: 1min 12s
Wall time: 1min 12s


In [11]:
trn_term_doc, test_term_doc

(<159571x426005 sparse matrix of type '<class 'numpy.float64'>'
 	with 17775104 stored elements in Compressed Sparse Row format>,
 <153164x426005 sparse matrix of type '<class 'numpy.float64'>'
 	with 14765755 stored elements in Compressed Sparse Row format>)

In [12]:
pd.DataFrame(trn_term_doc.sum(axis=0).T, index=vec.get_feature_names())[0]\
    .sort_values(ascending=False).head(10)

.      5313.388414
the    4114.770000
,      4088.575353
"      3887.009831
to     3396.389798
you    3359.781848
i      3347.954071
'      3163.683995
a      2900.832193
and    2765.370096
Name: 0, dtype: float64

### Fit models and predict

In [13]:
x = trn_term_doc
test_x = test_term_doc

In [14]:
%%time

preds = np.zeros((len(test), len(label_cols)))

for i, label in enumerate(label_cols):
    print('fit', label)
    model = NbSvmClassifier(C=4, dual=True)
    model.fit(x, train[label])
    preds[:, i] = model.predict_proba(test_x)[:, 1]

fit toxic
fit severe_toxic
fit obscene
fit threat
fit insult
fit identity_hate
CPU times: user 2min 29s, sys: 2.03 s, total: 2min 31s
Wall time: 2min 31s


In [15]:
submid = pd.DataFrame({'id': sample_submission["id"]})
submission = pd.concat([submid, pd.DataFrame(preds, columns=label_cols)], axis=1)
submission.head(100)

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.999988,0.106262,0.999987,0.002370,0.962578,0.094959
1,0000247867823ef7,0.002873,0.000604,0.001893,0.000100,0.002227,0.000342
2,00013b17ad220c46,0.011755,0.000864,0.005588,0.000102,0.003210,0.000297
3,00017563c3f7919a,0.000960,0.000224,0.001141,0.000171,0.001057,0.000297
4,00017695ad8997eb,0.009957,0.000485,0.002009,0.000131,0.002395,0.000351
5,0001ea8717f6de06,0.004428,0.000280,0.001964,0.000373,0.003182,0.000364
6,00024115d4cbde0f,0.000535,0.000156,0.000825,0.000102,0.000704,0.000411
7,000247e83dcc1211,0.187793,0.000366,0.003324,0.000132,0.007759,0.000342
8,00025358d4737918,0.009363,0.000135,0.002518,0.000095,0.003299,0.000394
9,00026d1092fe71cc,0.003037,0.000229,0.002203,0.000110,0.002957,0.000276


In [16]:
submission.to_csv(SUBMISSION_FILE, index=False)