In [1]:
import numpy as np
from collections import namedtuple

from sklearn.naive_bayes import BernoulliNB, MultinomialNB, ComplementNB

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from scipy.sparse import vstack, hstack

import gc; gc.enable()

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Params

# Lower and upper n-gram sizes; handed to CountVectorizer as ngram_range.
ngram_min, ngram_max = 1, 3

In [3]:
# Read Embedding Files

def read_embeddings(filename):
    """Load embedding vectors from a whitespace-delimited text file.

    Every non-blank line is split on whitespace. The first token is
    discarded (presumably a row/document label -- confirm against whatever
    produced the file) and the remaining tokens are parsed as float64.

    Blank or whitespace-only lines are skipped, so a trailing newline at the
    end of the file does not inject an empty row that would make the result
    ragged.

    Parameters
    ----------
    filename : str or path-like
        Path of the text file to read. It is decoded as UTF-8.

    Returns
    -------
    numpy.ndarray
        One row per non-blank line. When all rows have the same length this
        is a 2-D float64 array; for a file with no data lines it is an empty
        1-D array.

    Raises
    ------
    ValueError
        If a token after the first one cannot be parsed as a float.
    """
    rows = []
    # Explicit encoding so decoding does not depend on the platform default;
    # this matches how the notebook opens its other text data file.
    with open(filename, encoding='utf-8') as file:
        for line in file:
            tokens = line.split()
            if not tokens:
                # Blank line: nothing to parse, and keeping it would add a
                # zero-length row.
                continue
            rows.append(np.array(tokens[1:], dtype=np.float64))
    return np.array(rows)

# Load the train and test embedding matrices from their text files.
train_embedding_path, test_embedding_path = 'emb_train.txt', 'emb_test.txt'
X_train_embedding = read_embeddings(train_embedding_path)
X_test_embedding = read_embeddings(test_embedding_path)

gc.collect();

In [4]:
# Read Docs

# One record per corpus line: the token list, a one-element tag list holding
# the line number, the split name, and the sentiment label (None if unlabeled).
SentimentDocument = namedtuple('SentimentDocument', 'words tags split sentiment')

filename = 'alldata-3gram.txt'
alldocs = []

with open(filename, encoding='utf-8') as alldata:
    for line_no, line in enumerate(alldata):
        # The first token of every line is dropped (presumably a document
        # id -- confirm against the file producer).
        words = line.split()[1:]
        # The split is chosen per 25,000-line block: block 0 is 'train',
        # block 1 is 'test', blocks 2-3 are 'extra'. A file with more than
        # 100,000 lines raises IndexError on this lookup.
        split = ('train', 'test', 'extra', 'extra')[line_no // 25000]
        # The label is chosen per 12,500-line block: 1.0 then 0.0 inside
        # each of the first two 25,000-line blocks, None afterwards.
        sentiment = (1.0, 0.0, 1.0, 0.0, None, None, None, None)[line_no // 12500]
        alldocs.append(
            SentimentDocument(words=words, tags=[line_no], split=split, sentiment=sentiment)
        )

# Partition the labeled documents by split name; 'extra' documents stay only
# in alldocs.
train_docs = [record for record in alldocs if record.split == 'train']
test_docs = [record for record in alldocs if record.split == 'test']

gc.collect();

In [5]:
# Feature Extraction for Naive Bayes Model

def _passthrough(tokens):
    """Return the argument unchanged."""
    return tokens

# Each document is passed in as an already-split list of tokens, so both the
# preprocessor and the tokenizer hooks are identity functions.
# binary=True: per the scikit-learn docs, all non-zero counts are set to 1.
count_vect = CountVectorizer(
    preprocessor=_passthrough,
    tokenizer=_passthrough,
    ngram_range=(ngram_min, ngram_max),
    binary=True,
)

# Fit the vocabulary on the training documents only and build their
# document-term matrix, together with the matching label list.
train_token_lists = [doc.words for doc in train_docs]
X_train_NB = count_vect.fit_transform(train_token_lists)
y_train = [doc.sentiment for doc in train_docs]

gc.collect();

* Bernoulli NB: 
    * https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.BernoulliNB.html
* Multinomial NB:
    * https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html#sklearn.naive_bayes.MultinomialNB
* Complement NB: 
    * https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.ComplementNB.html#sklearn.naive_bayes.ComplementNB
* Log-Probability:
    * https://en.wikipedia.org/wiki/Log_probability
* Log-Odds:
    * https://wiki.lesswrong.com/wiki/Log_odds

In [6]:
# Probabilities / Odds

# Alternative estimators that can be swapped in on the line below:
# nb = BernoulliNB()
# nb = MultinomialNB()
nb = ComplementNB().fit(X_train_NB, y_train)

# Per-class table of feature log-probabilities exposed by the fitted model.
prob = nb.feature_log_prob_

# A difference of logs is the log of a ratio: log(p) - log(q) = log(p/q).
# Row 0 minus row 1 therefore yields one log-ratio per feature.
# NOTE(review): p and q are read from two different class rows, so q is not
# necessarily 1 - p; this looks like a between-class log-ratio rather than
# log(p/(1-p)) -- confirm the intended interpretation of "log odds".
# Presumably the rows follow nb.classes_ order (0.0 then 1.0) -- verify.
log_odds = np.subtract(prob[0], prob[1])

gc.collect();

In [7]:
# Weight Adjustments

# Scale every feature column of the document-term matrix by its log-odds
# weight. log_odds is the difference of two rows of feature_log_prob_, i.e.
# a 1-D array with one entry per feature, so a sparse multiply broadcasts it
# across all rows in a single call. This avoids a Python-level loop over
# every row and the vstack of thousands of one-row matrices.
# multiply() with a dense operand may return a non-CSR sparse format, hence
# the explicit tocsr().
X_train = X_train_NB.multiply(log_odds).tocsr()

# The test documents reuse the vocabulary fitted on the training set
# (transform, not fit_transform) and the same per-feature weights.
X_test_NB = count_vect.transform([x.words for x in test_docs])
X_test = X_test_NB.multiply(log_odds).tocsr()

gc.collect();

In [8]:
# Train / Predict / Score

# Append the dense embedding columns to the weighted n-gram features.
# NOTE: X_train / X_test are rebound to the stacked result, so re-running
# this cell without first re-running the cell that builds them appends the
# embedding columns a second time.
X_train = hstack([X_train, X_train_embedding])
X_test = hstack([X_test, X_test_embedding])

# Ground-truth labels for the test split, in the same order as test_docs.
y_test = [doc.sentiment for doc in test_docs]

print('TRAIN:')
clf = LogisticRegression(class_weight='balanced', n_jobs=-1)
clf.fit(X_train, y_train)
# score() returns a fraction; scale to a percentage for display.
train_accuracy = clf.score(X_train, y_train) * 100
print('Accuracy =', train_accuracy)

print()
print('TEST:')
test_accuracy = clf.score(X_test, y_test) * 100
print('Accuracy =', test_accuracy)

gc.collect();

TRAIN:
Accuracy = 100.0

TEST:
Accuracy = 93.43599999999999


In [9]:
from sklearn.model_selection import cross_val_score

In [11]:
# 5-fold cross-validated accuracy of the classifier on the training features.
# NOTE(review): X_train was already scaled by log_odds, which was fit on all
# of y_train, so the held-out folds are not fully unseen and these scores are
# likely optimistic compared with the test accuracy -- confirm whether the
# weighting should be refit inside each fold.
scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='accuracy')

mean_accuracy, std_accuracy = scores.mean(), scores.std()
print(mean_accuracy, "+/-", std_accuracy)

In [12]:
# Display the individual per-fold accuracy values returned by cross_val_score.
scores

array([0.947 , 0.9444, 0.9466, 0.9508, 0.9526])