# Naive Bayes

In [1]:
import pdb
import nltk
from nltk.corpus import twitter_samples
import numpy as np
import pandas as pd
import string
from nltk.tokenize import TweetTokenizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import re

%matplotlib inline
%config InlineBackend.figure_format='svg'

In [2]:
# Load the raw tweet strings for each sentiment class from the NLTK corpus
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')

# Split the data: the first 4000 tweets of each class train the model,
# everything after that is held out for testing
train_pos, test_pos = all_positive_tweets[:4000], all_positive_tweets[4000:]
train_neg, test_neg = all_negative_tweets[:4000], all_negative_tweets[4000:]

# Positives come first, then negatives, in both splits
train_x = train_pos + train_neg
test_x = test_pos + test_neg

# Labels aligned with the ordering above: 1.0 = positive, 0.0 = negative
train_y = np.concatenate((np.ones(len(train_pos)), np.zeros(len(train_neg))))
test_y = np.concatenate((np.ones(len(test_pos)), np.zeros(len(test_neg))))

# Process the Data

In [3]:
def process_tweet(tweet):
    """Clean, tokenize and stem a single tweet.

    Steps, in order: strip ``$TICKER`` symbols, a leading "RT " marker,
    hyperlinks and ``#`` signs; tokenize with ``TweetTokenizer`` (configured
    with ``preserve_case=False``, ``strip_handles=True``, ``reduce_len=True``);
    drop English stopwords and punctuation tokens; Porter-stem what is left.

    Args:
        tweet: the raw tweet text (a string).

    Returns:
        A list of stemmed tokens, in their original order.
    """
    stemmer = PorterStemmer()
    # A set gives constant-time membership tests; the list returned by
    # stopwords.words() would otherwise be scanned linearly for every token.
    stopwords_english = set(stopwords.words('english'))

    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT" (only at the very start of the tweet)
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks
    # NOTE(review): `.*` is greedy, so everything after the URL up to the end
    # of that line is removed as well, not just the link. Left as-is because
    # changing it would alter the token counts the rest of the notebook uses.
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    # remove hashtags
    # only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)

    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    # `in string.punctuation` is a substring test against a str: it drops
    # single punctuation marks (and any run of characters that appears
    # contiguously in that string) while emoticons such as ':)' survive.
    return [
        stemmer.stem(word)
        for word in tweet_tokens
        if word not in stopwords_english and word not in string.punctuation
    ]

In [4]:
# Test: the sample contains a retweet marker, two @handles, hashtags,
# an emoticon and a URL, so it exercises each cleaning step of process_tweet
custom_tweet = "RT @Twitter @chapagain Hello There! Have a great day. :) #good #morning http://chapagain.com.np"

# print cleaned tweet
print(process_tweet(custom_tweet))

['hello', 'great', 'day', ':)', 'good', 'morn']


## Implementing your helper functions

In [5]:
def count_tweets(result, tweets, ys):
    """Accumulate (word, label) occurrence counts over a set of tweets.

    Args:
        result: dictionary mapping (word, label) tuples to counts; it is
            updated in place, so existing counts are extended.
        tweets: iterable of raw tweet strings.
        ys: iterable of labels, paired positionally with ``tweets``.

    Returns:
        The same ``result`` dictionary, after updating.
    """
    for label, text in zip(ys, tweets):
        for token in process_tweet(text):
            key = (token, label)
            # Missing keys start from zero
            result[key] = result.get(key, 0) + 1

    return result

In [6]:
# Test: count (stem, label) pairs on a tiny hand-labelled sample

# Start from an empty dictionary; count_tweets fills it in place
result = {}
tweets = ['i am happy', 'i am tricked', 'i am sad', 'i am tired', 'i am tired']
# One label per tweet: only the first tweet is marked positive
ys = [1, 0, 0, 0, 0]
count_tweets(result, tweets, ys)

{('happi', 1): 1, ('trick', 0): 1, ('sad', 0): 1, ('tire', 0): 2}

# Train your model using Naive Bayes

In [7]:
# Build the (word, label) -> count table from the full training split
freqs = count_tweets({}, train_x, train_y)

In [8]:
def lookup(freqs, word, label):
    """Return how often ``word`` was seen with ``label``.

    Args:
        freqs: dictionary mapping (word, label) tuples to counts.
        word: the (processed) word to look up.
        label: the class label paired with the word.

    Returns:
        The stored count, or 0 when the (word, label) pair is absent.
    """
    return freqs.get((word, label), 0)

In [9]:
def train_naive_bayes(freqs, train_x, train_y):
    """Estimate the log prior and per-word log likelihood ratios.

    Args:
        freqs: dictionary mapping (word, label) tuples to counts.
        train_x: the training tweets; not read by this function.
        train_y: iterable of training labels; values > 0 count as positive
            documents, values <= 0 as negative documents.

    Returns:
        A tuple ``(logprior, loglikelihood)`` where ``logprior`` is
        log(D_pos) - log(D_neg) and ``loglikelihood`` maps each vocabulary
        word to log(P(word | pos) / P(word | neg)), both probabilities
        computed with add-one smoothing.
    """
    # V: number of unique words in the vocabulary
    vocab = {key[0] for key in freqs}
    V = len(vocab)

    # N_pos / N_neg: total word occurrences recorded under each class
    N_pos = N_neg = 0
    for key, count in freqs.items():
        if key[1] > 0:
            N_pos += count
        else:
            N_neg += count

    # D_pos / D_neg: number of positive / negative documents
    D_pos = sum(1 for y in train_y if y > 0)
    D_neg = sum(1 for y in train_y if y <= 0)

    logprior = np.log(D_pos) - np.log(D_neg)

    loglikelihood = {}
    for word in vocab:
        # Smoothed probability of the word under each class: the +1 in the
        # numerator and +V in the denominator keep unseen words non-zero
        p_w_pos = (lookup(freqs, word, 1) + 1) / (N_pos + V)
        p_w_neg = (lookup(freqs, word, 0) + 1) / (N_neg + V)

        loglikelihood[word] = np.log(p_w_pos / p_w_neg)

    return logprior, loglikelihood

In [10]:
# Fit the model on the training split, then show the log prior and the
# number of words that received a log likelihood entry
logprior, loglikelihood = train_naive_bayes(freqs, train_x, train_y)
print(logprior)
print(len(loglikelihood))

0.0
9085


# Test your naive bayes

In [11]:
def naive_bayes_predict(tweet, logprior, loglikelihood):
    """Score a tweet: log prior plus the log likelihoods of its known words.

    Args:
        tweet: the raw tweet text.
        logprior: the log prior term added once to the score.
        loglikelihood: dictionary mapping words to log likelihood ratios.

    Returns:
        The summed score. Words missing from ``loglikelihood`` contribute
        nothing.
    """
    # Start from 0 + logprior so the running total is a fresh value
    score = 0 + logprior

    for token in process_tweet(tweet):
        if token not in loglikelihood:
            continue
        score += loglikelihood[token]

    return score

In [12]:
# Test: score a single short tweet

my_tweet = 'She smiled.'
p = naive_bayes_predict(my_tweet, logprior, loglikelihood)
# NOTE: despite the label, the value printed here is the score the model
# just computed, not an independently known reference value
print('The expected output is', p)

The expected output is 1.5737794405738943


## Implement test_naive_bayes

In [13]:
def test_naive_bayes(test_x, test_y, logprior, loglikelihood):
    """Compute the classification accuracy on a labelled set of tweets.

    Args:
        test_x: iterable of raw tweet strings.
        test_y: the true 0/1 labels, one per tweet. May be a list, tuple or
            NumPy array; an (m, 1) column is flattened to length m.
        logprior: the log prior from ``train_naive_bayes``.
        loglikelihood: the word -> log likelihood dictionary.

    Returns:
        The fraction of tweets whose predicted label equals the true label,
        computed as 1 - mean(|y_hat - y|).
    """
    # A strictly positive score predicts class 1; zero or below predicts 0
    y_hats = np.array([
        1 if naive_bayes_predict(tweet, logprior, loglikelihood) > 0 else 0
        for tweet in test_x
    ])

    # Convert explicitly so plain lists/tuples work (list - list is a
    # TypeError), and flatten so an (m, 1) label column cannot broadcast
    # against the (m,) predictions into an (m, m) matrix.
    labels = np.asarray(test_y).ravel()

    error = np.mean(np.absolute(y_hats - labels))

    return 1 - error

In [14]:
# Report accuracy on the held-out test split, formatted to four decimals
print("Naive Bayes accuracy = %0.4f" %
      (test_naive_bayes(test_x, test_y, logprior, loglikelihood)))

Naive Bayes accuracy = 0.9940


In [15]:
# Test: score a handful of short phrases, including 'great' repeated one to
# four times to show how repeated words add up in the score

for tweet in ['I am happy', 'I am bad', 'this movie should have been great.', 'great', 'great great', 'great great great', 'great great great great']:
    p = naive_bayes_predict(tweet, logprior, loglikelihood)
    print(f'{tweet} -> {p:.2f}')

I am happy -> 2.15
I am bad -> -1.29
this movie should have been great. -> 2.14
great -> 2.14
great great -> 4.28
great great great -> 6.41
great great great great -> 8.55


In [16]:
# Test: a tweet containing a sad emoticon; the bare last expression makes the
# notebook display the returned score

my_tweet = 'you are bad :('
naive_bayes_predict(my_tweet, logprior, loglikelihood)

-8.802119484044237

# Filter words by Ratio of positive to negative counts

In [17]:
def get_ratio(freqs, word):
    """Return the positive count, negative count and their ratio for a word.

    Args:
        freqs: dictionary mapping (word, label) tuples to counts.
        word: the (processed) word to look up.

    Returns:
        A dictionary with keys 'positive', 'negative' and 'ratio', where
        ratio = (positive + 1) / (negative + 1).
    """
    positive = lookup(freqs, word, 1)
    negative = lookup(freqs, word, 0)

    return {
        'positive': positive,
        'negative': negative,
        # Adding one to both counts avoids dividing by zero
        'ratio': (positive + 1) / (negative + 1),
    }

In [18]:
# Look up the counts and ratio for the stem 'happi' in the training table
get_ratio(freqs, 'happi')

{'positive': 161, 'negative': 18, 'ratio': 8.526315789473685}

#### Implement `get_words_by_threshold(freqs,label,threshold)`

* If we set the label to 1, then we'll look for all words whose positive/negative ratio is at least as high as the given threshold.
* If we set the label to 0, then we'll look for all words whose positive/negative ratio is at most as low as the given threshold.
* Use the `get_ratio()` function to get a dictionary containing the positive count, negative count, and the ratio of positive to negative counts.
* Add an entry to a dictionary, where the key is the word, and the value is the dictionary `pos_neg_ratio` that is returned by the `get_ratio()` function.
An example key-value pair would have this structure:
```
{'happi':
    {'positive': 10, 'negative': 20, 'ratio': 0.5}
}
```

In [19]:
def get_words_by_threshold(freqs, label, threshold):
    """Select words whose positive/negative ratio passes a threshold.

    Args:
        freqs: dictionary mapping (word, label) tuples to counts.
        label: 1 to keep words with ratio >= threshold, 0 to keep words
            with ratio <= threshold. Any other value selects nothing.
        threshold: the ratio cut-off.

    Returns:
        A dictionary mapping each selected word to the dictionary returned
        by ``get_ratio`` for that word.
    """
    word_list = {}

    for key in freqs:
        word, _ = key
        pos_neg_ratio = get_ratio(freqs, word)
        ratio = pos_neg_ratio['ratio']

        keep = (label == 1 and ratio >= threshold) or \
               (label == 0 and ratio <= threshold)
        if keep:
            word_list[word] = pos_neg_ratio

    return word_list

In [20]:
# Test: label=0 keeps words whose positive/negative ratio is <= 0.05
get_words_by_threshold(freqs, label=0, threshold=0.05)

{':(': {'positive': 1, 'negative': 3663, 'ratio': 0.0005458515283842794},
 ':-(': {'positive': 0, 'negative': 378, 'ratio': 0.002638522427440633},
 'zayniscomingbackonjuli': {'positive': 0, 'negative': 19, 'ratio': 0.05},
 '26': {'positive': 0, 'negative': 20, 'ratio': 0.047619047619047616},
 '>:(': {'positive': 0, 'negative': 43, 'ratio': 0.022727272727272728},
 'lost': {'positive': 0, 'negative': 19, 'ratio': 0.05},
 '♛': {'positive': 0, 'negative': 210, 'ratio': 0.004739336492890996},
 '》': {'positive': 0, 'negative': 210, 'ratio': 0.004739336492890996},
 'beli̇ev': {'positive': 0, 'negative': 35, 'ratio': 0.027777777777777776},
 'wi̇ll': {'positive': 0, 'negative': 35, 'ratio': 0.027777777777777776},
 'justi̇n': {'positive': 0, 'negative': 35, 'ratio': 0.027777777777777776},
 'ｓｅｅ': {'positive': 0, 'negative': 35, 'ratio': 0.027777777777777776},
 'ｍｅ': {'positive': 0, 'negative': 35, 'ratio': 0.027777777777777776}}

In [21]:
# label=1 keeps words whose positive/negative ratio is >= 10
get_words_by_threshold(freqs, label=1, threshold=10)

{'followfriday': {'positive': 23, 'negative': 0, 'ratio': 24.0},
 'commun': {'positive': 27, 'negative': 1, 'ratio': 14.0},
 ':)': {'positive': 2847, 'negative': 2, 'ratio': 949.3333333333334},
 'flipkartfashionfriday': {'positive': 16, 'negative': 0, 'ratio': 17.0},
 ':D': {'positive': 498, 'negative': 0, 'ratio': 499.0},
 ':p': {'positive': 103, 'negative': 0, 'ratio': 104.0},
 'influenc': {'positive': 16, 'negative': 0, 'ratio': 17.0},
 ':-)': {'positive': 543, 'negative': 0, 'ratio': 544.0},
 "here'": {'positive': 20, 'negative': 0, 'ratio': 21.0},
 'youth': {'positive': 14, 'negative': 0, 'ratio': 15.0},
 'bam': {'positive': 44, 'negative': 0, 'ratio': 45.0},
 'warsaw': {'positive': 44, 'negative': 0, 'ratio': 45.0},
 'shout': {'positive': 11, 'negative': 0, 'ratio': 12.0},
 ';)': {'positive': 22, 'negative': 0, 'ratio': 23.0},
 'stat': {'positive': 51, 'negative': 0, 'ratio': 52.0},
 'arriv': {'positive': 57, 'negative': 4, 'ratio': 11.6},
 'via': {'positive': 60, 'negative': 1, 

Notice the difference between the positive and negative ratios. Emojis like :( and words like 'me' tend to have a negative connotation. Other words like 'glad', 'community', and 'arrives' tend to be found in the positive tweets. 

# Error Analysis

In [22]:
# error demo: list every test tweet whose true label disagrees with the
# predicted class (score > 0 means positive)
print('Truth Predicted Tweet')
for x, y in zip(test_x, test_y):
    y_hat = naive_bayes_predict(x, logprior, loglikelihood)
    if y != (np.sign(y_hat) > 0):
        # The boolean prediction is formatted with %0.2f, so it prints as
        # 0.00 / 1.00. encode('ascii', 'ignore') drops non-ASCII characters
        # and yields bytes, which is why each tweet prints as b'...'.
        print('%d\t%0.2f\t%s' % (y, np.sign(y_hat) > 0, ' '.join(
            process_tweet(x)).encode('ascii', 'ignore')))

Truth Predicted Tweet
1	0.00	b''
1	0.00	b'truli later move know queen bee upward bound movingonup'
1	0.00	b'new report talk burn calori cold work harder warm feel better weather :p'
1	0.00	b'harri niall 94 harri born ik stupid wanna chang :D'
1	0.00	b''
1	0.00	b''
1	0.00	b'park get sunlight'
1	0.00	b'uff itna miss karhi thi ap :p'
0	1.00	b'hello info possibl interest jonatha close join beti :( great'
0	1.00	b'u prob fun david'
0	1.00	b'pat jay'
0	1.00	b'whatev stil l young >:-('


# Predict with your own tweet

In [23]:
# Score a custom tweet; a positive value means the model leans positive
my_tweet = 'I am happy because I am learning :)'

p = naive_bayes_predict(my_tweet, logprior, loglikelihood)
print(p)

9.57402369584527
