# Sentiment Analysis using Naive Bayes

In [10]:
from nltk.tokenize import TweetTokenizer
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
import numpy as np

# Shared tokenizer instance; used below to split each sentence into tokens.
tknzr = TweetTokenizer()
# English stopwords from NLTK; tokens found in this collection are dropped during preprocessing.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be available locally
# (e.g. via nltk.download('stopwords')) — confirm for fresh environments.
sws = stopwords.words('english')

## Preprocess and Split Data

In [2]:
def preprocess_data(path='data/amazon_cells_labelled.txt'):
    """Load labelled sentences from a file and convert them to token lists.

    Every non-blank line is expected to hold a sentence and an integer
    label separated by a tab. The sentence is tokenized with the
    module-level ``tknzr`` and tokens present in ``sws`` are removed.

    Args:
        path: Location of the labelled text file. Defaults to the Amazon
            reviews file used throughout this notebook.

    Returns:
        A ``(sentences, labels)`` tuple of two new lists in file order:
        ``sentences[i]`` is the list of kept tokens for line ``i`` and
        ``labels[i]`` is its integer label. Fresh lists are built on every
        call, so calling the function repeatedly never duplicates data.

    Raises:
        ValueError: If a non-blank line has no tab separator or its label
            is not an integer.
    """
    # Copy the stopwords into a set once so each membership test is a hash lookup.
    stop_words = set(sws)

    sentences = []
    labels = []
    with open(path, 'r') as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at the end of the file).
            if not line.strip():
                continue

            # Split on the LAST tab only, so a tab inside the sentence cannot break unpacking.
            sentence, sentiment = line.rsplit('\t', 1)
            labels.append(int(sentiment))

            tokens = tknzr.tokenize(sentence)
            # Removing stopwords does not improve accuracy, but it makes the model
            # faster because there are fewer words to process.
            sentences.append([token for token in tokens if token not in stop_words])

    return sentences, labels


X, y = preprocess_data()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [3]:
# Report how many sentences ended up in each split.
n_train_sentences, n_test_sentences = len(X_train), len(X_test)
print("We have {} train sentences and {} test sentences".format(n_train_sentences, n_test_sentences))

We have 800 train sentences and 200 test sentences


In [4]:
# Percentage of training labels that are positive (assumes 0/1 labels, so the sum counts the positives).
train_positive_pct = sum(y_train) * 100 / len(y_train)
print("{0:.2f}% of the train sentences are positive".format(train_positive_pct))

49.12% of the train sentences are positive


In [5]:
# Percentage of test labels that are positive (assumes 0/1 labels, so the sum counts the positives).
test_positive_pct = sum(y_test) * 100 / len(y_test)
print("{0:.2f}% of the test sentences are positive".format(test_positive_pct))

53.50% of the test sentences are positive


## Implement Naive Bayes Model
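Estimate from the training data the probabilities needed for inference: the prior $P(c)$ of each category and the likelihood $P(w \mid c)$ of each word given a category.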

In [24]:
class NaiveBayes():
    """Naive Bayes text classifier built from raw word counts.

    For every category the model stores how many training documents carry
    that label and how often each word occurs in those documents. A
    document is scored per category as ``prior * product(P(word | category))``
    and assigned to the category with the highest score.
    """

    def __init__(self, categories, unseen_word_probability=0.0001):
        """Create an untrained model.

        Args:
            categories: Sequence of the labels the model can predict. Labels
                seen in ``train`` must be members of this sequence.
            unseen_word_probability: Fixed probability used for a word that
                never occurred with a category during training. This is a
                constant fallback, not a normalised smoothing scheme.
        """
        self.categories = categories
        self.unseen_word_probability = unseen_word_probability
        # Number of training documents per category.
        self.categories_to_frequencies = {k: 0 for k in categories}
        # Per category: word -> number of occurrences in that category's documents.
        self.categories_to_words_to_frequencies = {k: {} for k in categories}

    def train(self, X_train, y_train):
        """Accumulate document and word counts from labelled documents.

        Counts are added to whatever the model already holds, so calling
        ``train`` again extends the existing statistics.

        Args:
            X_train: Sequence of documents, each an iterable of words.
            y_train: Labels indexable by position; ``y_train[i]`` is the
                label of ``X_train[i]``.

        Raises:
            KeyError: If a label is not one of ``categories``.
        """
        for i, words in enumerate(X_train):
            sentiment = y_train[i]

            self.categories_to_frequencies[sentiment] += 1

            word_counts = self.categories_to_words_to_frequencies[sentiment]
            for word in words:
                word_counts[word] = word_counts.get(word, 0) + 1

    def predict(self, words):
        """Return the category label with the highest score for ``words``.

        The label itself is returned (an element of ``categories``), not its
        position, so predictions are directly comparable with training
        labels. On a tie the category listed first wins.

        Args:
            words: The document to classify, as a sequence of words.
        """
        return max(
            self.categories,
            key=lambda category: self.compute_category_score(category, words),
        )

    def compute_category_score(self, category, words=()):
        """Compute the unnormalised score ``P(category) * prod P(word | category)``.

        Args:
            category: One of ``categories``.
            words: The document to score. It is an explicit argument so the
                result never depends on a global variable; when omitted the
                document is treated as empty and the score equals the prior.

        Returns:
            The score as a float. It is not normalised across categories.

        Raises:
            ZeroDivisionError: If the model has not been trained (no
                documents have been counted yet).
        """
        prior = self.categories_to_frequencies[category] / sum(self.categories_to_frequencies.values())

        word_counts = self.categories_to_words_to_frequencies[category]
        # The total is the same for every word of the document, so compute it once.
        total_words_in_category = sum(word_counts.values())

        likelihood = 1
        for word in words:
            if word in word_counts:
                p_word_given_category = word_counts[word] / total_words_in_category
            else:
                p_word_given_category = self.unseen_word_probability
            likelihood *= p_word_given_category

        return prior * likelihood

## Test

In [25]:
# Fit the classifier on the training split.
model = NaiveBayes([0, 1])
model.train(X_train, y_train)

# Count the test sentences whose predicted label matches the true one.
n_correct = 0
for words, expected_label in zip(X_test, y_test):
    y_pred = model.predict(words)
    n_correct += int(y_pred == expected_label)

accuracy_pct = n_correct * 100 / len(X_test)
print("{0:.2f}% of sentences are correctly classified".format(accuracy_pct))

78.00% of sentences are correctly classified
