# Getting the data

In [23]:
# Import libraries
import nltk
from nltk.corpus import stopwords,twitter_samples
import os
import re
import string
import pandas as pd
import numpy as np
import random
from utils import process_tweet,lookup

In [12]:
# Load the raw tweet texts for both sentiment classes from the NLTK corpus
all_positive_tweets, all_negative_tweets = (
    twitter_samples.strings(corpus_file)
    for corpus_file in ('positive_tweets.json', 'negative_tweets.json')
)

In [13]:
# Report how many tweets each sentiment class contains
print("Total positive tweets : ",len(all_positive_tweets))
print("Total negative tweets : ",len(all_negative_tweets))

Total positive tweets :  5000
Total negative tweets :  5000


In [19]:
# Hold out the tail of each class for testing (validation). The cut-off index
# is computed from the positive set and reused unchanged for the negative set.
split_size = int(0.8 * len(all_positive_tweets))
print(split_size)

train_pos, test_pos = all_positive_tweets[:split_size], all_positive_tweets[split_size:]
train_neg, test_neg = all_negative_tweets[:split_size], all_negative_tweets[split_size:]

# Positives always come before negatives, so the label vectors built below
# line up index-for-index with the concatenated tweet lists.
train_tweets = train_pos + train_neg
test_tweets = test_pos + test_neg

train_labels = np.concatenate([np.ones(len(train_pos)), np.zeros(len(train_neg))])
test_labels = np.concatenate([np.ones(len(test_pos)), np.zeros(len(test_neg))])

4000


# Pre-processing the data

Tokenize, remove punctuation, remove stopwords, and perform stemming.

The `process_tweet` helper function in the `utils.py` file does all of that for us.


In [25]:
# Run the helper on a sample tweet to see which tokens survive pre-processing
Example_custom_tweet = "RT @Twitter @chapagain Hello There! Have a great day. :) #good #morning http://chapagain.com.np"
print(process_tweet(Example_custom_tweet))

['hello', 'great', 'day', ':)', 'good', 'morn']


In [32]:
# Build a frequency table that maps each word and its label to a count
# (word, label) : frequency

def count_tweets(result, tweets, ys):
    """Tally how often each processed token occurs under each label.

    Input:
        result: dictionary to populate (normally empty on entry); counts
            already present are incremented in place
        tweets: list of tweets
        ys: label of each tweet, aligned with `tweets`
    Output:
        result: the same dictionary object, mapping (token, label) to the
            number of times that token was seen with that label
    """
    for text, label in zip(tweets, ys):
        for token in process_tweet(text):
            key = (token, label)
            # Missing keys start from zero, so first and repeat sightings
            # share one code path
            result[key] = result.get(key, 0) + 1
    return result

In [33]:
# Sanity-check count_tweets on a tiny hand-labelled sample; the bare call on
# the last line makes the notebook display the populated dictionary

result = {}
tweets = ['i am happy', 'i am tricked', 'i am sad', 'i am tired', 'i am tired']
ys = [1, 0, 0, 0, 0]
count_tweets(result, tweets, ys)

{('happi', 1): 1, ('trick', 0): 1, ('sad', 0): 1, ('tire', 0): 2}

In [36]:
# Build the (word, label) -> count frequency table from the training split
freqs = count_tweets({},train_tweets,train_labels)

# Train and test Naive Bayes

In [41]:
def train_naive_bayes(freqs, train_x, train_y):
    '''
    Fit a binary Naive Bayes model from (word, label) counts.

    Input:
        freqs: dictionary from (word, label) to how often the word appears
        train_x: a list of tweets (not read here; the counts in freqs are
            used instead, and the parameter is kept so existing callers
            keep working)
        train_y: labels corresponding to the tweets (0 or 1)
    Output:
        logprior: the log prior, log(D_pos) - log(D_neg), as a scalar
        loglikelihood: dictionary mapping each vocabulary word to
            log P(word | positive) - log P(word | negative), where both
            probabilities use add-one (Laplace) smoothing
    '''
    # A single pass over the table collects the vocabulary together with
    # the total word count of each class
    vocab = set()
    N_pos = N_neg = 0
    for (word, label), count in freqs.items():
        vocab.add(word)
        # a label greater than zero is positive, anything else is negative
        if label > 0:
            N_pos += count
        else:
            N_neg += count

    # V, the number of unique words in the vocabulary
    V = len(vocab)

    # D, D_pos and D_neg: total, positive and negative document counts.
    # count_nonzero runs vectorized and always yields a scalar, so the log
    # prior stays a scalar even when the labels arrive as a column vector.
    labels = np.asarray(train_y)
    D = labels.size
    D_pos = np.count_nonzero(labels == 1)
    D_neg = D - D_pos

    logprior = np.log(D_pos) - np.log(D_neg)

    loglikelihood = {}
    for word in vocab:
        # positive and negative frequency of the word
        freq_pos = lookup(freqs, word, 1)
        freq_neg = lookup(freqs, word, 0)

        # smoothed probability of the word under each class; the +1 keeps
        # words unseen in one class away from log(0)
        p_w_pos = (freq_pos + 1) / (N_pos + V)
        p_w_neg = (freq_neg + 1) / (N_neg + V)

        loglikelihood[word] = np.log(p_w_pos) - np.log(p_w_neg)

    return logprior, loglikelihood


In [47]:
# Fit the model on the training split, then show the log prior followed by
# the number of words that received a log-likelihood score
logprior, loglikelihood = train_naive_bayes(freqs, train_tweets, train_labels)
print(logprior, len(loglikelihood), sep="\n")

0.0
9085


In [48]:
def naive_bayes_predict(tweet, logprior, loglikelihood):
    """Score a tweet with the trained Naive Bayes parameters.

    Input:
        tweet: a string
        logprior: a number
        loglikelihood: a dictionary of words mapping to numbers
    Output:
        The logprior plus the log likelihood of every processed word of the
        tweet that is present in the dictionary; words missing from the
        dictionary contribute nothing.
    """
    # Begin from the prior so a tweet with no known words scores exactly that
    score = 0 + logprior

    for token in process_tweet(tweet):
        # Words never seen during training carry no evidence either way
        if token not in loglikelihood:
            continue
        score += loglikelihood[token]

    return score


In [49]:
def test_naive_bayes(test_x, test_y, logprior, loglikelihood):
    """
    Input:
        test_x: A list of tweets
        test_y: the corresponding labels for the list of tweets
        logprior: the logprior
        loglikelihood: a dictionary with the loglikelihoods for each word
    Output:
        accuracy: (# of tweets classified correctly)/(total # of tweets)
    """
    accuracy = 0 

    y_hats = []
    for tweet in test_x:
        # if the prediction is > 0
        if naive_bayes_predict(tweet, logprior, loglikelihood) > 0:
            # the predicted class is 1
            y_hat_i = 1
        else:
            # otherwise the predicted class is 0
            y_hat_i = 0

        # append the predicted class to the list y_hats
        y_hats.append(y_hat_i)

    error = np.sum(np.array(y_hats)!=np.array(test_y))/len(test_y)

    accuracy = 1 - error

    return accuracy


In [50]:
# Report accuracy on the held-out test split, formatted to four decimal places
print("Naive Bayes accuracy = %0.4f" %
      (test_naive_bayes(test_tweets, test_labels, logprior, loglikelihood)))

Naive Bayes accuracy = 0.9940


 An accuracy of 99.4% on the held-out test tweets. That's marvellous.

In [59]:
# Try the classifier on a tweet of our own. Words that are not in the
# training vocabulary are skipped, so they have no influence on the result.

my_tweet = 'She was excited to watch the movie'
p = naive_bayes_predict(my_tweet, logprior, loglikelihood)
# Same decision rule as in evaluation: only a strictly positive score is positive
sentiment = "positive" if p > 0 else "negative"
print("The sentiment of the given tweet is",sentiment)

The sentiment of the given tweet is positive
