# Sentiment Classification with Multinomial Naive Bayes

In [23]:
import os
import numpy as np

## Data and Feature Extraction

In [24]:
# Root folder of the tagged dataset; the cells below read its 'POS' and 'NEG' sub-folders.
data_path = 'datasets/data-tagged/'

In [63]:
def process_file(filepath):
    """Read a tagged file and return its tokens in file order.

    Each data line is expected to look like ``token<TAB>pos_tag``; only the
    token (the text before the first tab) is kept and the tag is discarded.
    Lines without a tab that are empty or whitespace-only are treated as
    separators and skipped.

    Args:
        filepath: Path of the tab-separated file to read.

    Returns:
        A list with one token string per data line.
    """
    tokens = []
    with open(filepath, 'r') as f:
        for line in f:
            # partition never raises, unlike unpacking split('\t'), so lines
            # with extra tabs or with no tab at all cannot crash the loader.
            token, tab, _pos_tag = line.rstrip('\r\n').partition('\t')

            # Separator lines carry no tab; skip them when they are empty or
            # whitespace-only (e.g. trailing spaces left on a blank line).
            if not tab and not token.strip():
                continue

            tokens.append(token)
    return tokens

def preprocess_data(datapath, sentiment='POS'):
    """Load every file of one sentiment class as a list of tokens.

    Args:
        datapath: Directory containing one sub-folder per sentiment.
        sentiment: Name of the sub-folder to read. 'POS' is labelled 1; any
            other value is labelled 0.

    Returns:
        A tuple ``(X, y)``: ``X`` holds one token list per file and ``y`` the
        matching labels, both in sorted file-name order.
    """
    sentiment_value = 1 if sentiment == 'POS' else 0
    # os.path.join works whether or not datapath ends with a separator.
    current_path = os.path.join(datapath, sentiment)

    X = []
    y = []
    # os.listdir returns entries in arbitrary order; sorting keeps the sample
    # order (and anything derived from it) reproducible across runs.
    for name in sorted(os.listdir(current_path)):
        filepath = os.path.join(current_path, name)
        # Skip sub-directories, which cannot be opened as token files.
        if not os.path.isfile(filepath):
            continue
        X.append(process_file(filepath))
        y.append(sentiment_value)

    return X, y

def get_dictionary(X):
    """Build the vocabulary of a tokenized corpus.

    Tokens are numbered in order of first appearance while scanning the
    samples of ``X`` from first to last.

    Returns:
        A tuple ``(token_to_idx, features)`` where ``token_to_idx`` maps each
        distinct token to its feature index and ``features`` lists the tokens
        so that ``features[token_to_idx[t]] == t``.
    """
    index_of = {}
    vocabulary = []

    for sample in X:
        for tok in sample:
            if tok in index_of:
                continue
            # The next free index is always the current vocabulary size.
            index_of[tok] = len(vocabulary)
            vocabulary.append(tok)

    return index_of, vocabulary

def featurize_data(X, features, token_to_idx):
    """Convert each sample from a list of tokens to a bag-of-words count vector.

    Args:
        X: Iterable of samples, each a list of tokens.
        features: Vocabulary tokens in index order; only its length is used,
            to size the vectors.
        token_to_idx: Mapping from token to its position in the vector.

    Returns:
        A list with one 1-D float array per sample, where entry ``i`` is the
        number of occurrences of the token whose index is ``i``. Tokens that
        are missing from ``token_to_idx`` are ignored, so data that was not
        used to build the vocabulary can be featurized without a KeyError.
    """
    n_features = len(features)
    X_feat = []
    for x in X:
        x_feat = np.zeros(n_features)
        for token in x:
            idx = token_to_idx.get(token)
            # Out-of-vocabulary tokens have no column; skip them.
            if idx is not None:
                x_feat[idx] += 1
        X_feat.append(x_feat)

    return X_feat

In [64]:
# Load the token lists and labels of each sentiment folder (POS is labelled 1, NEG 0).
X_pos, y_pos = preprocess_data(data_path, 'POS')
X_neg, y_neg = preprocess_data(data_path, 'NEG')

In [65]:
# Concatenate both classes: positive samples first, then negative, with labels in the same order.
X = X_pos + X_neg
y = y_pos + y_neg

In [66]:
# Build the vocabulary over all samples: token -> feature index, plus the tokens in index order.
token_to_idx, features = get_dictionary(X)

In [67]:
# Turn every token list into a vector of per-token counts, one entry per vocabulary token.
X_feat = featurize_data(X, features, token_to_idx)

In [68]:
# Sanity check: the first sample's count vector must not be all zeros.
assert(sum(X_feat[0]) != 0)

## Multinomial Naive Bayes Model

In [72]:
class MultinomialNaiveBayes:
    """Placeholder for the multinomial naive Bayes classifier; no behaviour is implemented yet."""