In [1]:
#from os import getcwd
import string
import sys

import nltk
from nltk.corpus import stopwords, twitter_samples
from nltk.tokenize import TweetTokenizer
import numpy as np
import pandas as pd
import pdb

sys.path.append('..')
from utils import get_all_tweets, process_tweet, lookup

In [2]:
# Relative path to the twitter_samples directory; passed as the first
# argument to get_all_tweets when the tweets are loaded.
DATA = '../../../../data/twitter_samples'

In [4]:
# Load the tweets for each sentiment label through the project helper.
# NOTE(review): the results are sliced and concatenated like lists further
# down -- presumably lists of raw tweet strings; confirm in utils.get_all_tweets.
all_positive_tweets = get_all_tweets(DATA, 'positive')
all_negative_tweets = get_all_tweets(DATA, 'negative')

In [5]:
# Peek at the first ten entries of NLTK's English stop-word list.
stopwords.words('english')[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your']

In [6]:
# Hold-out split: the first 4000 tweets of each class are used for training,
# everything from index 4000 onwards is kept for testing.
train_pos, test_pos = all_positive_tweets[:4000], all_positive_tweets[4000:]
train_neg, test_neg = all_negative_tweets[:4000], all_negative_tweets[4000:]

# Positive tweets come first, negative tweets second, in both feature lists.
train_x = train_pos + train_neg
test_x = test_pos + test_neg

# Labels follow the same ordering: 1.0 for each positive tweet, then 0.0 for
# each negative tweet.
train_y = np.concatenate((np.ones(len(train_pos)), np.zeros(len(train_neg))))
test_y = np.concatenate((np.ones(len(test_pos)), np.zeros(len(test_neg))))

# Process 

In [7]:
# Sanity-check process_tweet on a hand-written tweet that contains a retweet
# marker, @-handles, an emoticon, hashtags and a URL.
custom_tweet = ('RT @Twitter @chapagain Hello There! Have a great day. '
                ':) #good #morning http://chapagain.com.np')
print(process_tweet(custom_tweet))

['hello', 'great', 'day', ':)', 'good', 'morn']


### Helper Funcs

In [10]:
def count_tweets(result, tweets, ys):
    '''
    Tally how often each processed token occurs under each sentiment label.

    Input:
        result: dictionary of (token, label) -> count; it is updated in
          place, so counts already present are kept and added to
        tweets: a list of raw tweets
        ys: the sentiment label of each tweet, in the same order as tweets
          (either 0 [neg] or 1 [pos])
    Output:
        result: the same dictionary object, now including the new counts
    '''
    for tweet, label in zip(tweets, ys):
        for token in process_tweet(tweet):
            # The same token is counted separately for each label.
            key = (token, label)
            if key in result:
                result[key] += 1
            else:
                result[key] = 1
    return result

In [11]:
# Test
result = {}
tweets = [
    'i am happy', 'i am tricked', 'i am sad', 'i am tired', 'i am tired']
ys = [1, 0, 0, 0, 0]
count_tweets(result, tweets, ys)

{('happi', 1): 1, ('trick', 0): 1, ('sad', 0): 1, ('tire', 0): 2}

In [12]:
# Build the (word, label) -> count table over the whole training set,
# starting from an empty dictionary.
freqs = count_tweets({}, train_x, train_y)

# Naïve Bayes: Train

Log Prior:
$$\text{logprior} = \log \left( \frac{P(D_{pos})}{P(D_{neg})} \right) = \log \left( \frac{D_{pos}}{D_{neg}} \right)$$

$$\text{logprior} = \log (P(D_{pos})) - \log (P(D_{neg})) = \log (D_{pos}) - \log (D_{neg})$$


Log Likelihood:     
$$\text{loglikelihood} = \log \left(\frac{P(W_{pos})}{P(W_{neg})} \right)$$

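Where (with Laplace smoothing, so that a word unseen in one class never gets a zero probability): 
$$ P(W_{pos}) = \frac{freq_{pos} + 1}{N_{pos} + V} $$
$$ P(W_{neg}) = \frac{freq_{neg} + 1}{N_{neg} + V} $$

Here $freq_{pos}$ and $freq_{neg}$ are the word's counts in the positive and negative tweets, $N_{pos}$ and $N_{neg}$ are the total word counts of each class, and $V$ is the number of unique words in the vocabulary.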

In [17]:
def train_naive_bayes(freqs, train_x, train_y):
    '''
    Fit the Naive Bayes parameters: the log prior and per-word log likelihoods.

    Input:
        freqs: dictionary from (word, label) to how often the word appears
        train_x: a list of tweets (not read here; every count that is needed
          is already in freqs -- kept so existing callers keep working)
        train_y: labels corresponding to the tweets (0 = negative,
          1 = positive); any array-like is accepted
    Output:
        logprior: log(D_pos) - log(D_neg), where D_pos and D_neg are the
          numbers of positive and negative training tweets
        loglikelihood: dictionary mapping each vocabulary word to
          log(P(W_pos) / P(W_neg)), with add-one (Laplace) smoothing:
          P(W_class) = (freq_class + 1) / (N_class + V)
    '''
    # V: the number of unique words in the vocabulary
    vocab = {pair[0] for pair in freqs}
    V = len(vocab)

    # N_pos / N_neg: total number of word occurrences in each class
    N_pos = N_neg = 0
    for pair, count in freqs.items():
        if pair[1] > 0:
            N_pos += count
        else:
            N_neg += count

    # D_pos / D_neg: number of training tweets in each class. np.asarray lets
    # plain Python lists be compared element-wise as well.
    labels = np.asarray(train_y)
    D_pos = np.count_nonzero(labels == 1)
    D_neg = np.count_nonzero(labels == 0)
    logprior = np.log(D_pos) - np.log(D_neg)

    loglikelihood = {}
    for word in vocab:
        freq_pos = freqs.get((word, 1), 0)
        freq_neg = freqs.get((word, 0), 0)
        # The smoothing term V is part of the denominator. Without the
        # parentheses, division binds tighter than addition and V would be
        # added to the quotient instead.
        p_w_pos = (freq_pos + 1) / (N_pos + V)
        p_w_neg = (freq_neg + 1) / (N_neg + V)
        loglikelihood[word] = np.log(p_w_pos / p_w_neg)
    return logprior, loglikelihood

In [18]:
# Test 
logprior, loglikelihood = train_naive_bayes(freqs, train_x, train_y)
print(logprior)
print(len(loglikelihood))

0.0
9108
