Following the paper "Baselines and Bigrams: Simple, Good Sentiment and Topic Classification" (Wang & Manning, ACL 2012).
Specifically, trying the MNB and NBSVM models described there.
https://www.aclweb.org/anthology/P12-2018

In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, log_loss
from sklearn.model_selection import GridSearchCV, train_test_split
import spacy
import time
# Shared spaCy pipeline used by tokenize() below. It is loaded once at module
# level so every document reuses the same model instance.
# NOTE(review): 'en' is a shortcut model name; confirm it resolves in the
# installed spaCy version.
nlp = spacy.load('en')

In [3]:
# Label columns that get one binary classifier each further down.
classes = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Read the labelled comments and hold out 20% of the rows for testing.
# Features are kept as a one-column DataFrame (comment_text); the targets are
# every remaining column once 'id' and 'comment_text' are dropped.
# NOTE: no random_state is passed, so the split differs from run to run.
df = pd.read_csv('train.csv')
X_train, X_test, y_train, y_test = train_test_split(
    df[['comment_text']],
    df.drop(columns=['id', 'comment_text']),
    test_size=0.2,
)

In [4]:
# Clear the stop-word flag on "you" so that tokenize(), which drops tokens
# with is_stop set, keeps it as a feature.
# NOTE(review): presumably because second-person address is informative for
# toxicity -- confirm with the author.
nlp.vocab[u'you'].is_stop = False

In [5]:
def tokenize(s):
    """Split *s* into lemmatised tokens using the module-level spaCy pipeline.

    Stop words, whitespace tokens and punctuation are discarded. Every
    remaining token is replaced by its lemma, except when the lemma is the
    ``-PRON-`` placeholder, in which case the token's original text is kept.

    Args:
        s: The text to tokenise; it is coerced with ``unicode()`` first.

    Returns:
        A list of unicode strings, one per kept token, in document order.
    """
    kept = []
    for tok in nlp(unicode(s)):
        # Skip anything that carries no lexical signal.
        if tok.is_stop or tok.is_space or tok.is_punct:
            continue
        lemma = tok.lemma_
        # '-PRON-' is a placeholder lemma; fall back to the surface form.
        kept.append(tok.text if lemma == u'-PRON-' else lemma)
    return kept

In [6]:
# TF-IDF featuriser over unigrams and bigrams produced by tokenize().
tfidf_vec = TfidfVectorizer(
    strip_accents='unicode',  # fold accented characters before tokenising
    tokenizer=tokenize,       # spaCy-based tokenizer defined in this file
    ngram_range=(1, 2),       # unigrams + bigrams
    max_df=0.9,               # drop terms present in more than 90% of documents
    min_df=3,                 # drop terms present in fewer than 3 documents
    sublinear_tf=True,        # use 1 + log(tf) instead of raw term frequency
)

In [None]:
# Fit the vectoriser on the training comments only, then apply that same
# fitted vectoriser to the held-out comments so both matrices share one
# feature space.
X_train_tfidf = tfidf_vec.fit_transform(X_train.comment_text)
X_test_tfidf = tfidf_vec.transform(X_test.comment_text)

In [None]:
def get_r(X, y_pos, y_neg, alpha=1.):
    """Return the naive Bayes log-count ratio of the positive vs. negative class.

    Computes ``r = log((p / ||p||_1) / (q / ||q||_1))`` where
    ``p = alpha + sum(X[y_pos], axis=0)`` and
    ``q = alpha + sum(X[y_neg], axis=0)``.

    Args:
        X: Feature matrix of shape (n_samples, n_features). A dense array,
            an ``np.matrix`` or a scipy sparse matrix that supports row
            indexing all work.
        y_pos: Row selector for the positive class. Pass a boolean mask of
            length n_samples (or an array of row indices) -- NOT the label
            values themselves.
        y_neg: Row selector for the negative class, same conventions.
        alpha: Additive smoothing applied to every feature count. With
            ``alpha == 0`` a feature absent from one class yields inf/nan.

    Returns:
        A 1-D float ``numpy.ndarray`` of length n_features.
    """
    # np.asarray(...).ravel() flattens both the 1xN np.matrix that scipy
    # sparse ``sum`` returns and the plain 1-D result of a dense array.
    p = np.asarray(X[y_pos].sum(0), dtype=float).ravel() + alpha
    q = np.asarray(X[y_neg].sum(0), dtype=float).ravel() + alpha

    # Normalise by the L1 norm (a scalar). The builtin ``sum`` must not be
    # used here: on a 1xN matrix it returns the row itself, not its total.
    p /= np.abs(p).sum()
    q /= np.abs(q).sum()

    # Log of the ratio. ``np.log(p, q)`` would treat q as the *output buffer*
    # and return log(p) only.
    return np.log(p / q)

In [None]:
# Base logistic-regression estimator with default settings; it is the
# estimator handed to every GridSearchCV in this file.
lr = LogisticRegression()

In [None]:
# Hyper-parameter grid for the logistic-regression searches: every
# combination of the values listed for each key is tried.
search_params = dict(
    dual=[True, False],
    tol=[1e-3, 1e-4, 1e-5],
    C=[1., 4., 8.],
    class_weight=[None, 'balanced'],
)

In [None]:
# 10-fold grid search over search_params, scored by negative log loss, on the
# training features scaled column-wise by the log-count ratio r.
# NOTE(review): `r` and `c` are not assigned by any earlier cell; they are
# only set inside the per-class loops further down, so this cell relies on
# notebook state left over from running one of those loops first.
grid_search = GridSearchCV(lr, search_params, scoring='neg_log_loss', cv=10, verbose=2)
grid_search.fit(X_train_tfidf.multiply(r), y_train[c].values)

In [None]:
# Same 10-fold, negative-log-loss grid search as above, but over `saga_params`.
# NOTE(review): `saga_params` is not defined anywhere in the visible file, and
# `r` / `c` come from whichever per-class loop ran last -- confirm both
# before re-running this cell.
saga_search = GridSearchCV(lr, saga_params, scoring='neg_log_loss', cv=10, verbose=1)
saga_search.fit(X_train_tfidf.multiply(r), y_train[c].values)

In [None]:
# Report the saga search: best cross-validated score, log loss of the best
# estimator on the held-out split for class `c`, and the winning parameters.
# Single-argument print(...) emits the same output as the print statement.
print(saga_search.best_score_)
print(log_loss(
    y_test[c],
    saga_search.best_estimator_.predict_proba(X_test_tfidf.multiply(r))[:, 1]))
print(saga_search.best_params_)

In [None]:
# Train one NB-feature logistic regression per label column, grid-searching
# search_params with 10-fold CV, and record for every class:
#   best_lin_params[c] -- winning hyper-parameters
#   best_score[c]      -- best mean CV score (negative log loss)
#   test_score[c]      -- log loss of the best estimator on the held-out split
best_lin_params = {}
best_score = {}
test_score = {}
for c in classes:
    print('Training {}'.format(c))
    t = time.time()
    # Boolean row masks over the training set. Passing the label *values*
    # (arrays of 1s / 0s) would make get_r integer-index row 1 / row 0 over
    # and over instead of selecting the rows of each class.
    y_pos = (y_train[c] == 1).values
    y_neg = (y_train[c] == 0).values
    r = get_r(X_train_tfidf, y_pos, y_neg)
    clf = GridSearchCV(lr, search_params, scoring='neg_log_loss', cv=10)
    # Scale every feature column by its log-count ratio before fitting.
    clf.fit(X_train_tfidf.multiply(r), y_train[c].values)
    best_lin_params[c] = clf.best_params_
    best_score[c] = clf.best_score_
    # The test features must be scaled by the same r used for training.
    test_score[c] = log_loss(y_test[c].values, clf.best_estimator_.predict_proba(X_test_tfidf.multiply(r))[:, 1])
    print('Took {} seconds'.format(time.time() - t))

In [None]:
# Show the best hyper-parameters per class collected by the loop above.
best_lin_params

In [None]:
# Mean held-out log loss across all classes.
np.mean(test_score.values())

In [None]:
# Mean of the best cross-validation scores (negative log loss) across all classes.
np.mean(best_score.values())

In [None]:
# Hyper-parameter grid for the 'sag' solver. 'solver' and 'n_jobs' each hold a
# single candidate, so only 'tol' and 'C' are actually varied.
sag_params = dict(
    tol=[1e-3, 1e-4],
    C=[4., 6., 8.],
    solver=['sag'],
    n_jobs=[-1],
)

In [None]:
# Train one NB-feature logistic regression per label column with the 'sag'
# solver, grid-searching sag_params with 10-fold CV, and record per class:
#   best_sag_params[c] -- winning hyper-parameters
#   best_sag_score[c]  -- best mean CV score (negative log loss)
#   test_sag_score[c]  -- log loss of the best estimator on the held-out split
best_sag_params = {}
best_sag_score = {}
test_sag_score = {}
for c in classes:
    print('Training {}'.format(c))
    t = time.time()
    # Boolean row masks over the training set. Passing the label *values*
    # (arrays of 1s / 0s) would make get_r integer-index row 1 / row 0 over
    # and over instead of selecting the rows of each class.
    y_pos = (y_train[c] == 1).values
    y_neg = (y_train[c] == 0).values
    r = get_r(X_train_tfidf, y_pos, y_neg)
    # Search the sag grid defined in the cell above. This loop previously
    # reused search_params, which left sag_params unused.
    clf = GridSearchCV(lr, sag_params, scoring='neg_log_loss', cv=10)
    # Scale every feature column by its log-count ratio before fitting.
    clf.fit(X_train_tfidf.multiply(r), y_train[c].values)
    best_sag_params[c] = clf.best_params_
    best_sag_score[c] = clf.best_score_
    # The test features must be scaled by the same r used for training.
    test_sag_score[c] = log_loss(y_test[c].values, clf.best_estimator_.predict_proba(X_test_tfidf.multiply(r))[:, 1])
    print('Took {} seconds'.format(time.time() - t))

In [None]:
# Show the best hyper-parameters per class collected by the sag loop above.
best_sag_params

In [None]:
# Show the best cross-validation score (negative log loss) per class from the sag loop.
best_sag_score

In [None]:
# Mean of the best cross-validation scores (negative log loss) across all classes, sag loop.
np.mean(best_sag_score.values())

In [None]:
# Mean held-out log loss across all classes, sag loop.
np.mean(test_sag_score.values())

In [None]:
# Positive-class probabilities on the held-out set from the best estimator
# found by grid_search.
# NOTE(review): `r` here is whatever value the last executed cell left behind;
# it must be the same ratio (same class) that grid_search was fitted with --
# confirm before trusting the numbers.
pred_lr = grid_search.best_estimator_.predict_proba(X_test_tfidf.multiply(r))[:,1]

In [None]:
# Held-out log loss of pred_lr against the labels of class `c`.
# NOTE(review): `c` is notebook state; it should be the class pred_lr was computed for.
print 'Log loss NBLR: {}'.format(log_loss(y_test[c], pred_lr))

In [None]:
# Confusion matrix for class `c`, labelling a comment positive when its
# predicted probability is strictly greater than 0.5.
confusion_matrix(y_test[c], (pred_lr > 0.5).astype(int))