# Sentiment Analysis using Naive Bayes

In [29]:
from nltk.tokenize import TweetTokenizer
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
import numpy as np

# Shared tokenizer and English stopword list used by the preprocessing step below.
tknzr, sws = TweetTokenizer(), stopwords.words('english')

## Preprocess and Split Data

In [30]:
# Containers filled in place by preprocess_data():
#   X                - one list of tokens per sentence
#   y                - one integer sentiment label per sentence
#   word_to_index    - token -> position of that token in words_vocabulary
#   words_vocabulary - distinct tokens in first-seen order
X, y = [], []
word_to_index, words_vocabulary = {}, []


def preprocess_data(path='data/amazon_cells_labelled.txt'):
    """Load labelled sentences, tokenize them and grow the shared vocabulary.

    Every non-blank line of the file must be a sentence and an integer
    sentiment label separated by a tab. The label is taken from the text
    after the LAST tab, so a sentence may itself contain tab characters.

    The function appends to the module-level ``X`` and ``y`` lists and
    registers unseen tokens in ``word_to_index`` / ``words_vocabulary``.

    Args:
        path: location of the tab-separated data file. Defaults to the
            file this notebook was written for.

    Returns:
        The module-level ``X`` (list of token lists) and ``y`` (list of
        integer labels) after the file has been appended to them.

    Raises:
        ValueError: if a non-blank line has no tab or a non-integer label.
    """
    # Removing stopwords does not improve accuracy but makes the model faster
    # because there are fewer words to process. A set gives O(1) membership
    # tests while holding exactly the same words as the `sws` list.
    # NOTE(review): tokens are compared case-sensitively, so a capitalised
    # stopword only matches if `sws` lists it with that casing - confirm.
    stop_words = set(sws)

    with open(path, 'r') as f:
        for l in f:
            # A blank (e.g. trailing) line carries no sample; skip it instead
            # of failing on the tuple unpacking below.
            if not l.strip():
                continue

            sentence, sentiment = l.rsplit('\t', 1)
            y.append(int(sentiment))

            words = [word for word in tknzr.tokenize(sentence) if word not in stop_words]
            X.append(words)

            for word in words:
                if word not in word_to_index:
                    # The next free index is the current vocabulary size; this
                    # keeps indices consistent even if the function is called
                    # again on an already populated vocabulary.
                    word_to_index[word] = len(words_vocabulary)
                    words_vocabulary.append(word)

    return X, y

def text_to_features(X, vocabulary=None):
    """Turn tokenized sentences into binary bag-of-words vectors.

    Args:
        X: iterable of sentences, each an iterable of hashable tokens
            (the lists of strings produced by ``preprocess_data``).
        vocabulary: ordered sequence of tokens defining the feature
            positions. Defaults to the module-level ``words_vocabulary``.

    Returns:
        A list with one vector per sentence. Entry ``j`` of a vector is 1
        when ``vocabulary[j]`` occurs in the sentence and 0 otherwise.
    """
    if vocabulary is None:
        vocabulary = words_vocabulary

    X_feat = []
    for x in X:
        # Build the set once per sentence so each vocabulary lookup is O(1)
        # instead of a scan over the token list.
        present = set(x)
        X_feat.append([1 if word in present else 0 for word in vocabulary])

    return X_feat
        
# Read the corpus, then replace each token list by its binary feature vector.
X, y = preprocess_data()
X = text_to_features(X)

# Hold out 20% of the sentences for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [31]:
# Report the size of each split.
n_train, n_test = len(X_train), len(X_test)
print("We have {} train sentences and {} test sentences".format(n_train, n_test))

We have 800 train sentences and 200 test sentences


In [32]:
# Labels are 0/1, so their sum is the number of positive sentences.
positive_share_train = 100 * sum(y_train) / len(y_train)
print("{0:.2f}% of the train sentences are positive".format(positive_share_train))

49.12% of the train sentences are positive


In [33]:
# Labels are 0/1, so their sum is the number of positive sentences.
positive_share_test = 100 * sum(y_test) / len(y_test)
print("{0:.2f}% of the test sentences are positive".format(positive_share_test))

53.50% of the test sentences are positive


In [34]:
# One entry per distinct token seen while preprocessing.
vocabulary_size = len(word_to_index)
print("The vocabulary size is {}".format(vocabulary_size))

The vocabulary size is 2128


## Implement Naive Bayes Model
Estimate the class priors and the per-word likelihoods from the training data; these are the probabilities needed for inference.

In [62]:
class NaiveBayes():
    """Naive Bayes classifier over binary (present/absent) features.

    Attributes:
        num_feat: number of features in every input vector.
        categories: the category labels, in the order given by the caller.
        categories_to_frequencies: category -> number of training samples
            seen with that category.
        categories_to_feature_to_frequencies: category -> feature index ->
            1 + number of training samples of that category in which the
            feature equals 1.
        categories_to_total_frequency: category -> sum of that category's
            smoothed feature counts (the likelihood denominator).
    """

    def __init__(self, categories, num_feat):
        """Create an unfitted model.

        Args:
            categories: iterable of hashable category labels.
            num_feat: length of the feature vectors passed to ``fit`` and
                ``predict``.
        """
        self.num_feat = num_feat
        self.categories = categories
        self.categories_to_frequencies = {k: 0 for k in categories}
        # Frequencies of each word for each category are initially set to 1 so
        # that no feature has zero probability when computing the likelihood.
        self.categories_to_feature_to_frequencies = {k: {k2: 1 for k2 in range(num_feat)} for k in categories}
        # Defined here so the attribute exists before fit(); every count starts
        # at 1, hence each total starts at num_feat.
        self.categories_to_total_frequency = {k: num_feat for k in categories}

    def fit(self, X_train, y_train):
        """Accumulate category and feature counts from labelled samples.

        Counts are added to the existing ones, so calling ``fit`` again
        continues training rather than starting over.

        Args:
            X_train: sequence of feature vectors with 0/1 entries.
            y_train: sequence of category labels, aligned with ``X_train``.

        Raises:
            ValueError: if ``X_train`` and ``y_train`` differ in length.
            KeyError: if a label is not one of ``categories``.
        """
        if len(X_train) != len(y_train):
            raise ValueError(
                "X_train and y_train must have the same length, got {} and {}".format(len(X_train), len(y_train))
            )

        for features, category in zip(X_train, y_train):
            self.categories_to_frequencies[category] += 1

            feature_counts = self.categories_to_feature_to_frequencies[category]
            for j, feature in enumerate(features):
                if feature == 1:
                    feature_counts[j] += 1

        self.categories_to_total_frequency = {
            category: sum(self.categories_to_feature_to_frequencies[category].values())
            for category in self.categories
        }

    def predict(self, features):
        """Return the category label with the highest score for ``features``.

        Scores are compared in log space so that long feature vectors do not
        underflow to 0.0 for every category. On ties the first category wins.
        """
        labels = list(self.categories)
        log_scores = [self.compute_category_log_score(category, features) for category in labels]
        # argmax yields a position; map it back to the label so the result is
        # also right when the categories are not exactly 0, 1, 2, ...
        return labels[int(np.argmax(log_scores))]

    def compute_category_log_score(self, category, features):
        """Return log(prior * likelihood) of ``features`` under ``category``.

        Returns ``-inf`` when the prior or any per-feature probability is 0.

        Raises:
            ValueError: if no training sample has been seen yet.
        """
        n_samples = sum(self.categories_to_frequencies.values())
        if n_samples == 0:
            raise ValueError("NaiveBayes must be fitted before scores can be computed")

        prior = self.categories_to_frequencies[category] / n_samples
        if prior == 0:
            return float("-inf")

        values = np.asarray(list(features), dtype=float)
        feature_counts = self.categories_to_feature_to_frequencies[category]
        total = self.categories_to_total_frequency[category]
        # NOTE(review): the denominator is the sum of the category's smoothed
        # feature counts, not the number of samples in the category as in the
        # usual Bernoulli estimate. Kept as is so results stay comparable with
        # earlier runs - confirm this is intended.
        prob_feature_pos = np.array([feature_counts[j] for j in range(len(values))], dtype=float) / total
        prob_feature_neg = 1 - prob_feature_pos
        probs = values * prob_feature_pos + (1 - values) * prob_feature_neg
        if np.any(probs <= 0):
            return float("-inf")

        return float(np.log(prior) + np.log(probs).sum())

    def compute_category_score(self, category, features):
        """Return prior * likelihood of ``features`` under ``category``.

        This may underflow to 0.0 for long feature vectors; use
        ``compute_category_log_score`` when comparing categories.
        """
        return float(np.exp(self.compute_category_log_score(category, features)))

## Test

In [63]:
# Train the hand-written classifier on the training split.
model = NaiveBayes([0, 1], len(words_vocabulary))
model.fit(X_train, y_train)

# Count the held-out sentences whose predicted label matches the true label.
n_correct = 0
for features, expected_label in zip(X_test, y_test):
    y_pred = model.predict(features)
    n_correct += 1 if y_pred == expected_label else 0

accuracy_pct = n_correct * 100 / len(X_test)
print("{0:.2f}% of sentences are correctly classified".format(accuracy_pct))

78.00% of sentences are correctly classified


## Compare to Sklearn

In [64]:
from sklearn.naive_bayes import BernoulliNB

In [59]:
# Reference implementation: scikit-learn's Bernoulli Naive Bayes with alpha=1.0.
clf = BernoulliNB(alpha=1.0)
clf.fit(X_train, y_train)

# Count the held-out sentences whose predicted label matches the true label.
n_correct = 0
for features, expected_label in zip(X_test, y_test):
    # predict() expects a batch, so wrap the single sample and unwrap the result.
    y_pred = clf.predict([features])[0]
    n_correct += 1 if y_pred == expected_label else 0

accuracy_pct = n_correct * 100 / len(X_test)
print("{0:.2f}% of sentences are correctly classified".format(accuracy_pct))

78.50% of sentences are correctly classified
