# Notebook 2 - NB-SVM Implementation

In [1]:
%matplotlib inline
from matplotlib import pyplot as plt

import os

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from scipy.sparse import hstack
import spacy

In [2]:
# Directory holding the competition's train.csv / test.csv.
# Override with the JIGSAW_DATA_DIR environment variable so the notebook can
# run on machines where the data lives elsewhere; the default is unchanged.
BASEDIR = os.environ.get(
    'JIGSAW_DATA_DIR',
    '/data/datasets/kaggle/jigsaw-toxic-comment-classification-challenge')

In [3]:
# Read both competition splits from BASEDIR, then preview the training rows.
train, test = (
    pd.read_csv(os.path.join(BASEDIR, csv_name))
    for csv_name in ('train.csv', 'test.csv')
)
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,22256635,"Nonsense? kiss off, geek. what I said is true...",1,0,0,0,0,0
1,27450690,"""\n\n Please do not vandalize pages, as you di...",0,0,0,0,0,0
2,54037174,"""\n\n """"Points of interest"""" \n\nI removed the...",0,0,0,0,0,0
3,77493077,Asking some his nationality is a Racial offenc...,0,0,0,0,0,0
4,79357270,The reader here is not going by my say so for ...,0,0,0,0,0,0


In [4]:
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
# 'none' is one minus the row-wise maximum over the label columns, i.e. it
# flags rows where every label is 0 (assuming the labels are 0/1 indicators).
train['none'] = 1 - train[label_cols].max(axis='columns')
train.describe()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate,none
count,95851.0,95851.0,95851.0,95851.0,95851.0,95851.0,95851.0,95851.0
mean,499435900000.0,0.096368,0.010068,0.053301,0.003182,0.049713,0.008492,0.897862
std,289013600000.0,0.295097,0.099832,0.224635,0.05632,0.217352,0.091762,0.302831
min,22256640.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,247343700000.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,500129700000.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,750108800000.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,999988200000.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [5]:
# Number of rows in each split.
train.shape[0], test.shape[0]

(95851, 226998)

In [6]:
COMMENT = 'comment_text'
# Replace missing comments with a placeholder so later string operations
# (e.g. .lower()) never see NaN.
# The filled column is assigned back rather than calling
# fillna(inplace=True) on a column selection: that chained in-place form
# operates on a temporary object and does not update the frame when pandas
# Copy-on-Write is active.
train[COMMENT] = train[COMMENT].fillna("unknown")
test[COMMENT] = test[COMMENT].fillna("unknown")

In [7]:
# Load the spaCy English pipeline with the 'parser', 'ner' and 'textcat'
# components disabled; the preprocessing below only reads the token
# attributes `lemma_` and `is_alpha` from the resulting docs.
# NOTE(review): the 'en' shortcut is presumably a spaCy 2.x model link --
# confirm it resolves with the installed spaCy version.
nlp = spacy.load('en', disable=['parser', 'ner', 'textcat'])

In [8]:
def reduce_to_double_max(text):
    """Collapse exaggerated character repetitions in ``text``.

    Two passes are applied in order:
        1. A run of three or more identical word characters (``\\w``) is
           shortened to exactly two of that character.
        2. A run of two or more identical non-word characters (``\\W``) is
           shortened to a single one.

    Returns the normalised string.
    """
    import re

    word_run = re.compile(r'(\w)\1{2,}')
    nonword_run = re.compile(r'(\W)\1+')

    shortened = word_run.sub(r'\1\1', text)
    return nonword_run.sub(r'\1', shortened)

In [9]:
def preprocess_corpus(corpus):
    """Normalise and lemmatise every document in ``corpus``.

    Each text is lower-cased and passed through ``reduce_to_double_max``,
    then streamed through the module-level spaCy pipeline ``nlp``. For each
    resulting doc, the lemmas of tokens whose ``is_alpha`` flag is set are
    joined with single spaces.

    Returns a list with one processed string per input document, in input
    order.
    """
    # Lazy normalisation: texts are cleaned as nlp.pipe consumes them.
    normalized = map(lambda raw: reduce_to_double_max(raw.lower()), corpus)

    processed = []
    for doc in nlp.pipe(normalized, batch_size=1000, n_threads=12):
        lemmas = [token.lemma_ for token in doc if token.is_alpha]
        processed.append(' '.join(lemmas))
    return processed

In [10]:
fname_train_processed = '../data/processed/train.txt'

if os.path.isfile(fname_train_processed):
    # Cache hit: the file holds one preprocessed comment per line, written
    # in the order of the training rows. Every line is kept (including empty
    # ones, for comments with no alphabetic tokens) so the list stays aligned
    # with `train`. The explicit UTF-8 encoding makes the cache independent
    # of the platform's locale.
    with open(fname_train_processed, 'r', encoding='utf-8') as fin:
        train_processed = [line.strip() for line in fin]
else:
    train_processed = preprocess_corpus(train['comment_text'])

    # Create the cache directory on first run so the write below cannot fail
    # with FileNotFoundError after the expensive preprocessing has finished.
    os.makedirs(os.path.dirname(fname_train_processed), exist_ok=True)
    with open(fname_train_processed, 'w', encoding='utf-8') as fout:
        for doc in train_processed:
            fout.write('{}\n'.format(doc))

train['comment_text_processed'] = train_processed

In [11]:
fname_test_processed = '../data/processed/test.txt'

if os.path.isfile(fname_test_processed):
    # Cache hit: the file holds one preprocessed comment per line, written
    # in the order of the test rows. Every line is kept (including empty
    # ones, for comments with no alphabetic tokens) so the list stays aligned
    # with `test`. The explicit UTF-8 encoding makes the cache independent
    # of the platform's locale.
    with open(fname_test_processed, 'r', encoding='utf-8') as fin:
        test_processed = [line.strip() for line in fin]
else:
    test_processed = preprocess_corpus(test['comment_text'])

    # Create the cache directory on first run so the write below cannot fail
    # with FileNotFoundError after the expensive preprocessing has finished.
    os.makedirs(os.path.dirname(fname_test_processed), exist_ok=True)
    with open(fname_test_processed, 'w', encoding='utf-8') as fout:
        for doc in test_processed:
            fout.write('{}\n'.format(doc))

test['comment_text_processed'] = test_processed

In [12]:
# The target classes are exactly the label columns declared earlier; derive
# the list from `label_cols` instead of repeating the literal so the two
# cannot drift apart. A copy is taken so mutating one does not affect the other.
class_names = list(label_cols)

In [13]:
# Preprocessed comments of both splits stacked into one Series (train first);
# the word-level vectorizer below is fitted on this combined text.
all_text = pd.concat([frame['comment_text_processed'] for frame in (train, test)])

In [14]:
# Word-level TF-IDF over the preprocessed (lemmatised) text.
word_vect = TfidfVectorizer(
    # what counts as a feature: unigrams and bigrams of accent-stripped words
    analyzer='word',
    ngram_range=(1,2),
    strip_accents='unicode',
    # vocabulary pruning: document-frequency bounds plus a cap on vocabulary size
    min_df=10,
    max_df=0.75,
    max_features=100000,
    # term weighting
    binary=True,
    sublinear_tf=True,
    use_idf=True,
    smooth_idf=True)
word_vect.fit(all_text)

TfidfVectorizer(analyzer='word', binary=True, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.75, max_features=100000, min_df=10,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents='unicode', sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [22]:
# Project the preprocessed text of each split onto the fitted word vocabulary.
train_word_features, test_word_features = (
    word_vect.transform(frame['comment_text_processed'])
    for frame in (train, test)
)

In [27]:
# Character-level TF-IDF (1- to 5-grams) fitted on the raw, un-lemmatised
# comments of both splits.
char_vect = TfidfVectorizer(
    analyzer='char',
    ngram_range=(1,5),
    strip_accents='unicode',
    sublinear_tf=True,
    max_features=100000)
char_vect.fit(pd.concat([frame['comment_text'] for frame in (train, test)]))

TfidfVectorizer(analyzer='char', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=100000, min_df=1,
        ngram_range=(1, 5), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents='unicode', sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [28]:
# Project the raw comments of each split onto the fitted character n-gram vocabulary.
train_char_features, test_char_features = (
    char_vect.transform(frame['comment_text'])
    for frame in (train, test)
)

In [29]:
# Concatenate character and word features column-wise (char columns first).
# Request CSR explicitly: hstack otherwise returns a COO matrix, which does
# not support row indexing and would be converted again on every fit and
# cross-validation split.
train_features = hstack((train_char_features, train_word_features), format='csr')
test_features = hstack((test_char_features, test_word_features), format='csr')

In [115]:
# Adapted from AlexSanchez's code at https://www.kaggle.com/jhoward/nb-svm-strong-linear-baseline-eda-0-052-lb
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_is_fitted
from sklearn.linear_model import LogisticRegression
from scipy import sparse
class NbSvmClassifier(BaseEstimator, ClassifierMixin):
    """Binary NB-SVM style classifier.

    Features are scaled column-wise by a Naive Bayes log-count ratio computed
    from the training labels, and a logistic regression is then fitted on the
    scaled features. Labels are expected to be 0/1: the ratio is computed
    between the rows labelled 1 and the rows labelled 0.

    Parameters
    ----------
    C : float, default 1.0
        Forwarded to ``LogisticRegression(C=...)``.
    dual : bool, default False
        Forwarded to ``LogisticRegression(dual=...)``.
    n_jobs : int, default 1
        Forwarded to ``LogisticRegression(n_jobs=...)``.

    Attributes (set by ``fit``)
    ---------------------------
    _r : scipy.sparse.csr_matrix of shape (1, n_features)
        Log-count ratio used to scale every feature column.
    _clf : LogisticRegression
        Model fitted on the scaled features.
    classes_ : ndarray
        Class labels seen by the underlying model, exposed for scikit-learn
        utilities that look for this attribute on classifiers.
    """

    def __init__(self, C=1.0, dual=False, n_jobs=1):
        self.C = C
        self.dual = dual
        self.n_jobs = n_jobs

    def _scale(self, x):
        """Return ``x`` with every column multiplied by the fitted log-count ratio."""
        # Verify that model has been fit
        check_is_fitted(self, ['_r', '_clf'])
        # Accept dense arrays and any sparse format; `.multiply` below is the
        # element-wise sparse product with the (1, n_features) ratio row.
        return sparse.csr_matrix(x).multiply(self._r)

    def predict(self, x):
        """Predict class labels for the rows of ``x``."""
        return self._clf.predict(self._scale(x))

    def predict_proba(self, x):
        """Predict class probabilities for the rows of ``x`` (one column per class)."""
        return self._clf.predict_proba(self._scale(x))

    def fit(self, x, y):
        """Compute the log-count ratio from ``(x, y)`` and fit the logistic regression.

        ``x`` may be dense or sparse; ``y`` may be a pandas Series, a list or
        a NumPy array of 0/1 labels. Returns ``self``.
        """
        # np.asarray accepts Series, lists and arrays alike, so callers are
        # not required to pass a pandas object.
        x, y = check_X_y(x, np.asarray(y), accept_sparse=True)
        # Convert once to CSR so boolean row selection works for both classes.
        x = sparse.csr_matrix(x)

        def pr(y_i):
            # Smoothed per-feature mass for class y_i: column sums over the
            # rows of that class plus 1, divided by the class row count plus 1.
            in_class = y == y_i
            p = x[in_class, :].sum(0)
            return (p + 1) / (in_class.sum() + 1)

        self._r = sparse.csr_matrix(np.log(pr(1) / pr(0)))
        x_nb = x.multiply(self._r)
        self._clf = LogisticRegression(C=self.C, solver='sag', dual=self.dual, n_jobs=self.n_jobs).fit(x_nb, y)
        self.classes_ = self._clf.classes_
        return self

In [116]:
def evaluate_model(model, train_ft, y_true, cv=3, n_jobs=4, scoring='neg_log_loss'):
    """Return the mean cross-validated score of ``model``.

    Parameters
    ----------
    model : estimator passed to ``cross_val_score``.
    train_ft : feature matrix.
    y_true : target values.
    cv : cross-validation setting forwarded to ``cross_val_score`` (default 3).
    n_jobs : number of parallel jobs forwarded to ``cross_val_score`` (default 4).
    scoring : scorer name forwarded to ``cross_val_score``
        (default ``'neg_log_loss'``).

    Returns
    -------
    The mean of the per-fold scores. With the default ``'neg_log_loss'``
    scorer this is the *negated* log loss, so values closer to zero are better.
    """
    fold_scores = cross_val_score(model, train_ft, y_true, cv=cv, n_jobs=n_jobs, scoring=scoring)
    return np.mean(fold_scores)

In [117]:
# One independent binary model per target class.
# `losses` collects the per-class CV scores returned by evaluate_model (the
# negated log loss, hence the negative values printed); `preds` maps each
# class name to the predicted probability of the positive class on the test set.
losses, preds = [], {'id': test['id']}
for class_name in class_names:
    targets = train[class_name]
    model = NbSvmClassifier(C=4, dual=False)

    # Cross-validated estimate first ...
    loss = evaluate_model(model, train_features, targets)
    losses.append(loss)
    print('Avg. CV loss for class {}: {}'.format(class_name, loss))

    # ... then refit on the full training set and score the test set.
    preds[class_name] = model.fit(train_features, targets).predict_proba(test_features)[:, 1]

(63900, 200000) (63900,)
(63900, 200000) (63900,)
(63901, 200000) (63901,)
(63901, 200000) (63901,)
(63901, 200000) (63901,)
(63901, 200000) (63901,)




Avg. CV loss for class toxic: -0.10059793434871923
(95851, 200000) (95851,)
(95851, 200000) (95851,)




(63900, 200000) (63900,)
(63900, 200000) (63900,)
(63900, 200000) (63900,)
(63900, 200000) (63900,)
(63902, 200000) (63902,)
(63902, 200000) (63902,)
Avg. CV loss for class severe_toxic: -0.03118636050896448
(95851, 200000) (95851,)
(95851, 200000) (95851,)




(63900, 200000) (63900,)
(63900, 200000) (63900,)
(63901, 200000) (63901,)
(63901, 200000) (63901,)
(63901, 200000) (63901,)
(63901, 200000) (63901,)
Avg. CV loss for class obscene: -0.05851601719440596
(95851, 200000) (95851,)
(95851, 200000) (95851,)




(63900, 200000) (63900,)
(63900, 200000) (63900,)
(63900, 200000) (63900,)
(63900, 200000) (63900,)
(63902, 200000) (63902,)
(63902, 200000) (63902,)
Avg. CV loss for class threat: -0.012098497638420616
(95851, 200000) (95851,)
(95851, 200000) (95851,)




(63900, 200000) (63900,)
(63900, 200000) (63900,)
(63901, 200000) (63901,)
(63901, 200000) (63901,)
(63901, 200000) (63901,)
(63901, 200000) (63901,)
Avg. CV loss for class insult: -0.07934594923123112
(95851, 200000) (95851,)
(95851, 200000) (95851,)




(63900, 200000) (63900,)
(63900, 200000) (63900,)
(63901, 200000) (63901,)
(63901, 200000) (63901,)
(63901, 200000) (63901,)
(63901, 200000) (63901,)
Avg. CV loss for class identity_hate: -0.028611218202053923
(95851, 200000) (95851,)
(95851, 200000) (95851,)




In [119]:
# Mean of the per-class CV scores collected in `losses`.
print('Cumulative Avg. CV loss: {}'.format(np.asarray(losses).mean()))

Cumulative Avg. CV loss: -0.05172599618729923


## Submission

In [120]:
import time

# Fix the column order explicitly ('id' first, then the classes in
# `class_names` order) rather than relying on how the `preds` dict happens to
# be ordered.
submission = pd.DataFrame.from_dict(preds)[['id'] + class_names]

# Make sure the output directory exists before writing the timestamped file.
submission_dir = '../data/external'
os.makedirs(submission_dir, exist_ok=True)
submission_name = 'submission-{}.csv'.format(time.strftime('%Y%m%d_%H%M', time.localtime()))
submission.to_csv(os.path.join(submission_dir, submission_name), index=False)