# Sentiment Classification with Multinomial Naive Bayes

In [1]:
import os
import numpy as np
import math
from sklearn.model_selection import train_test_split

## Data and Features Extraction

In [2]:
data_path = 'datasets/data-tagged/'
classes = [0, 1]

In [3]:
def process_file(filepath):
    """Given a file, returns a list of tokens for that file"""
    x = []
    with open(filepath, 'r') as f:
        for l in f:
            # Filter lines which consist only of new line operator
            if l == '\n':
                continue
            
            token, pos_tagging = l.split('\t')
            x.append(token)
    return x

def preprocess_data(datapath, sentiment='POS'):
    idx = 0
    X = []
    y = []
    sentiment_value = 1 if sentiment == 'POS' else 0
    
    # For file in the folder
    current_path = datapath + sentiment
    for f in os.listdir(current_path):
        x = process_file(current_path + '/' + f)
        X.append(x)
        y.append(sentiment_value)

    return X, y

def get_dictionary(X):
    """Returns a dictionary which maps each token to its index in the feature space, along with a list of all features (tokens) in order"""
    idx = 0
    token_to_idx = {}
    features = []
    
    for x in X:
        for token in x:
            if token not in token_to_idx:
                token_to_idx[token] = idx
                idx += 1
                features.append(token)
    
    return token_to_idx, features

def featurize_data(X, features, token_to_idx):
    """Convert each sample from a list of tokens to a multinomial bag of words representation"""
    X_feat = []
    for x in X:
        x_feat = np.zeros((len(features)))
        for token in x:
            x_feat[token_to_idx[token]] += 1
        X_feat.append(x_feat)
    
    return X_feat

In [4]:
X_pos, y_pos = preprocess_data(data_path, 'POS')
X_neg, y_neg = preprocess_data(data_path, 'NEG')

In [5]:
# X = X_pos + X_neg
# y = y_pos + y_neg

In [6]:
# token_to_idx, features = get_dictionary(X)

In [7]:
# X = featurize_data(X, features, token_to_idx)

In [8]:
# assert(sum(X[0]) != 0)

## Multinomial Naive Bayes Model

In [9]:
class MultinomialNaiveBayes():
    def __init__(self, classes, num_feat):
        # Number of features the model uses
        self.num_feat = num_feat
        # List of the classes
        self.classes = classes
        # Dictionary mapping each class to the prior probability p(C=c)
        self.class_to_prior = {c: 0 for c in classes}
        # self.class_to_feature_to_cond_prob[c][x] is used to store the estimate of the conditional probability p(X=x|C=c)
        self.class_to_feature_to_cond_prob = {c: np.zeros((num_feat,)) for c in classes}
        
    def fit(self, X, y):
        y = np.array(y)
        X = np.array(X)
        # Computer priors
        for c in y:
            self.class_to_prior[c] += 1
        self.class_to_prior.update({c: self.class_to_prior[c] / len(y) for c in self.classes})
        
        # Compute estimate of the conditional probability p(X=x|C=c)
        for c in self.classes:
            X_c = X[y == c]
            features_frequencies = np.sum(X_c, axis=0)
            self.class_to_feature_to_cond_prob[c] = features_frequencies / sum(features_frequencies)
        
    def predict(self, X):
        return np.argmax(np.stack([self.compute_scores(X, c) for c in self.classes], axis=-1), axis=1)
    
    def compute_scores(self, X, c):
        return np.log(self.class_to_prior[c]) + np.matmul(X, np.log(self.smooth(self.class_to_feature_to_cond_prob[c])))
            
#     def compute_score(self, x, c):
#         # Compute score (unnormalized log probability) for given class
#         return np.log(self.class_to_prior[c]) + np.dot(x, np.log(self.smooth(self.class_to_feature_to_cond_prob[c])))
    
    def smooth(self, x, laplacian=False):
        x[x == 0.0] = 10 ** -5
        return x

## Test Model

In [10]:
X = X_pos + X_neg
y = y_pos + y_neg

In [11]:
token_to_idx, features = get_dictionary(X)

In [12]:
X = featurize_data(X, features, token_to_idx)

In [13]:
# X_train = X_pos[:900] + X_neg[:900]
# y_train = y_pos[:900] + y_neg[:900]

# X_test = X_pos[900:] + X_neg[900:]
# y_test = y_pos[900:] + y_neg[900:]

In [14]:
# X_train = featurize_data(X_train, features, token_to_idx)
# X_test = featurize_data(X_test, features, token_to_idx)

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:
%%time
model = MultinomialNaiveBayes(classes, len(features))

model.fit(X_train, y_train)

CPU times: user 777 ms, sys: 830 ms, total: 1.61 s
Wall time: 1.85 s


In [21]:
%%time
y_pred = model.predict(X_test)
n_correct = sum(1 for i, _ in enumerate(y_pred) if y_pred[i] == y_test[i])

print("{0:.2f}% of sentences are correctly classified".format(n_correct * 100 / len(X_test)))

80.75% of sentences are correctly classified
CPU times: user 166 ms, sys: 153 ms, total: 319 ms
Wall time: 305 ms


## Comparison with Sklearn

In [19]:
%%time
from sklearn.naive_bayes import MultinomialNB

n_correct = 0

clf = MultinomialNB(alpha=1.0)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
n_correct = sum(1 for i, _ in enumerate(y_pred) if y_pred[i] == y_test[i])

print("{0:.2f}% of sentences are correctly classified".format(n_correct * 100 / len(X_test)))

83.00% of sentences are correctly classified
CPU times: user 803 ms, sys: 484 ms, total: 1.29 s
Wall time: 1.2 s
