Following the Baselines and Bigrams: Simple, Good Sentiment and Topic Classification paper
Specifically trying MNB and NBSVM
https://www.aclweb.org/anthology/P12-2018

In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, log_loss
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
import spacy
nlp = spacy.load('en')

In [2]:
df = pd.read_csv('train.csv')
X_train, X_test, y_train, y_test = train_test_split(df[['comment_text']],
                                                    df.drop(columns=['id', 'comment_text']),
                                                    test_size=0.2)
classes = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [3]:
nlp.vocab[u'you'].is_stop = False

In [4]:
def tokenize(s):
    tokens = nlp(unicode(s))
    return [t.lemma_ if t.lemma_ != u'-PRON-' else t.text for t in tokens if not t.is_stop and not t.is_space and not t.is_punct]

In [5]:
tfidf_vec = TfidfVectorizer(strip_accents='unicode', tokenizer=tokenize, ngram_range=(1,2),
                            max_df=0.9, min_df=3, sublinear_tf=True)

In [6]:
X_train_tfidf = tfidf_vec.fit_transform(X_train.comment_text)
X_test_tfidf = tfidf_vec.transform(X_test.comment_text)

In [7]:
def get_r(X, y_pos, y_neg, alpha=1.):
    p = X[y_pos].sum(0) + alpha
    q = X[y_neg].sum(0) + alpha
    
    p /= sum(p)
    q /= sum(q)
    
    return np.log(p, q).A1

In [35]:
def get_b(y_pos, y_neg):
    return np.log(float(len(y_pos)) / len(y_neg))

In [39]:
def run_mnb(X, r, b):
    return r * X + b

In [10]:
def fit_nbsvm(X, y_pos, y_neg, r, sk_model):
    f = r * X
    return sk_model.fit(f, y)   # Skips reseting coefs b/c those are read only in SKlearn

In [11]:
lr = LogisticRegression()
svm = LinearSVC(C=0.1)

In [17]:
c = 'toxic'
y_pos = y_train[y_train[c] == 1][c].values
y_neg = y_train[y_train[c] == 0][c].values

In [22]:
r = get_r(X_train_tfidf, y_pos, y_neg)

In [36]:
b = get_b(y_pos, y_neg)

In [None]:
np.multiply(X_test_tfidf, r)

In [40]:
pred_mnb = run_mnb(X_test_tfidf, r, b)

ValueError: dimension mismatch

In [None]:
lr_model = fit_nbsvm(X_train_tfidf, y_pos, y_neg, r, lr)

In [None]:
pred_lr = lr_model.predict_proba(X_test_tfidf.multiply(r))[:,1]

In [None]:
svm_model = fit_nbsvm(X_train_tfidf, y_pos, y_neg, r, svm)
pred_svm = svm_model.predict_proba(X_test_tfidf.multiply(r))[:,1]

In [None]:
from sklearn.metrics import confusion_matrix, log_loss

In [None]:
print 'Log loss MNB: {}'.format(log_loss(y_test[c], pred_mnb))
print 'Log loss NBLR: {}'.format(log_loss(y_test[c], pred_lr))
print 'Log loss NBSVM: {}'.format(log_loss(y_test[c], pred_svm))

In [None]:
confusion_matrix(y_test[c], pred_mnb)

In [None]:
confusion_matrix(y_test[c], pred_lr)

In [None]:
confusion_matrix(y_test[c], pred_svm)

In [None]:
###############

In [None]:
import re
re_tok = re.compile('([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')
def tokenize_1(s):
    return re_tok.sub(r' \1 ', s).split()

In [None]:
tfidf_vec_1 = TfidfVectorizer(ngram_range=(1,2), tokenizer=tokenize,
               min_df=3, max_df=0.9, strip_accents='unicode', use_idf=1,
               smooth_idf=1, sublinear_tf=1)
trn_term_doc1 = tfidf_vec_1.fit_transform(X_train.comment_text)
trn_term_doc_test1 = tfidf_vec_1.transform(X_test.comment_text)

In [None]:
x = trn_term_doc1
test_x = trn_term_doc_test1

def pr(y_i, y):
    p = x[y==y_i].sum(0)
    return (p+1) / ((y==y_i).sum()+1)

def get_mdl(y):
    y = y.values
    r = np.log(pr(1,y) / pr(0,y))
    m = LogisticRegression(C=4, dual=True)
    x_nb = x.multiply(r)
    return m.fit(x_nb, y), r

label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
preds = np.zeros((len(X_test), len(label_cols)))

for i, j in enumerate(label_cols):
    print('fit', j)
    m, r = get_mdl(y_train[j])
    preds[:,i] = m.predict_proba(test_x.multiply(r))[:,1]

In [None]:
trn_term_doc1[y_train['toxic']==1].sum(0)

In [None]:
for i, l in enumerate(label_cols): 
    print log_loss(y_test[l], preds[:,i])

In [None]:
dtrain = xgb.DMatrix(trn_term_doc1, label=y_train[['toxic']])
dtest = xgb.DMatrix(trn_term_doc_test1, label=y_test[['toxic']])

In [None]:
xgb_params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss'
}
bst = xgb.train(xgb_params, dtrain, 400, [(dtrain, 'train'), (dtest, 'test')], verbose_eval=10)

In [None]:
from sklearn.metrics import confusion_matrix, log_loss
from sklearn.model_selection import train_test_split

In [None]:
preds = bst.predict(dtest)
tn, fp, fn, tp = confusion_matrix(y_test.toxic, [1 if p > 0.5 else 0 for p in preds]).ravel()
print confusion_matrix(y_test.toxic, [1 if p > 0.5 else 0 for p in preds])

In [None]:
print 'FP rate: {}'.format(float(fp) / (fp + tn))
print 'FN rate: {}'.format(float(fn) / (tp + fn))

In [None]:
import pickle

In [None]:
with open('tfidf.pkl', 'w') as f:
    pickle.dump(trn_term_doc, f)

In [None]:
def get_comment_examples(df, n=20):
    for i, comment in enumerate(df.sample(n).comment_text):
        print '\x1b[1;34mExample {}\x1b[0m'.format(i+1)
        print comment