# Sentiment Analysis using Naive Bayes

In [46]:
from nltk.tokenize import TweetTokenizer
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords

tknzr = TweetTokenizer()
sws = stopwords.words('english')

## Preprocess and Split Data

In [71]:
X = []
y = []

def preprocess_data():
    with open('data/amazon_cells_labelled.txt', 'r') as f:
        for l in f:
            sentence, sentiment = l.split('\t')
            sentiment = int(sentiment)
            y.append(sentiment)

            words = tknzr.tokenize(sentence)
            # Remove stopwords does not improve accuracy but makes the model faster as we have less words to process
            words = [word for word in words if word not in sws]
            X.append(words)
            
preprocess_data()
            
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [72]:
print("We have {} train sentences and {} test sentences".format(len(X_train), len(X_test)))

We have 800 train sentences and 200 test sentences


In [73]:
print("{0:.2f}% of the train sentences are positive".format(sum(y_train) * 100 / len(y_train)))

49.12% of the train sentences are positive


In [74]:
print("{0:.2f}% of the test sentences are positive".format(sum(y_test) * 100 / len(y_test)))

53.50% of the test sentences are positive


## Implement Naive Bayes Model
Compute the probabilities needed to make inference

In [75]:
categories_to_frequencies = {0: 0, 1: 0}
categories_to_words_to_frequencies = {0: {}, 1: {}}

for i, words in enumerate(X_train):
    sentiment = y_train[i]

    categories_to_frequencies[sentiment] += 1

    for word in words:
        if word not in categories_to_words_to_frequencies[sentiment]:
            categories_to_words_to_frequencies[sentiment][word] = 0

        categories_to_words_to_frequencies[sentiment][word] += 1        

Implement the function which uses the above computed frequencies to make predictions

In [76]:
def predict(words):
    positive = 1
    negative = 0
    
    p_negative_given_data = 1
    p_positive_given_data = 1
    
    p_negative = categories_to_frequencies[negative] / sum(categories_to_frequencies.values())
    p_negative_given_data *= p_negative
    
    p_positive = categories_to_frequencies[positive] / sum(categories_to_frequencies.values())
    p_positive_given_data *= p_positive
    
    for word in words:
        if word in categories_to_words_to_frequencies[negative]:
            p_word_given_negative = categories_to_words_to_frequencies[negative][word] / sum(categories_to_words_to_frequencies[negative].values())
        else:
            p_word_given_negative = 0.0001
        p_negative_given_data *= p_word_given_negative
        
        if word in categories_to_words_to_frequencies[positive]:
            p_word_given_positive = categories_to_words_to_frequencies[positive][word] / sum(categories_to_words_to_frequencies[negative].values())
        else:
            p_word_given_positive = 0.0001
        p_positive_given_data *= p_word_given_positive
    
    return 0 if p_negative_given_data > p_positive_given_data else 1

## Test

In [77]:
n_correct = 0

for i, words in enumerate(X_test):
    y_pred = predict(words)
    if y_pred == y_test[i]:
        n_correct += 1

print("{0:.2f}% of sentences are correctly classified".format(n_correct * 100 / len(X_test)))

80.00% of sentences are correctly classified
