In [1]:
import nltk 
from  nltk.corpus import stopwords, twitter_samples
import numpy as np 
import pandas as pd 
import re

import string 
from nltk.tokenize import TweetTokenizer 
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer

# from os import getcwdimport w2_unittest'

nltk.download('twitter_samples')
nltk.download('stopwords')

[nltk_data] Downloading package twitter_samples to
[nltk_data]     C:\Users\ishan\AppData\Roaming\nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ishan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')


In [3]:
train_pos = all_positive_tweets[:4000]
train_neg = all_negative_tweets[:4000]
test_pos = all_positive_tweets[4000:]
test_neg = all_negative_tweets[4000:]

train_x = train_pos + train_neg
test_x = test_pos + test_neg 

train_y = np.append(np.ones(len(train_pos)),np.zeros(len(train_neg)))
test_y = np.append(np.ones(len(test_pos)), np.zeros(len(test_neg)))

# Process the data 

In [4]:
def process_tweet(tweet):
    stemmer = PorterStemmer()
    english_stopwords = stopwords.words('english')

    # remove stock  markets symbols from the tweets
    tweet = re.sub(r'\$\w*','', tweet)

    # remove retweet symbol form the tweets
    tweet = re.sub(r'^RT[\s]+','', tweet)

    # remove the url from the tweets 
    tweet = re.sub(r'https?://[^\s\n\r]+','',tweet)
    
    # remove hashtags

    tweet = re.sub(r'#','',tweet)

    # tokenize tweets 
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)

    tweet_tokens = tokenizer.tokenize(tweet)
    clean_tokens=[]
    for word in tweet_tokens:
        
        if (word not in english_stopwords and word not in string.punctuation):
            stem_word = stemmer.stem(word)
            clean_tokens.append(stem_word)
    return clean_tokens
            

In [5]:
custom_tweet = "RT @Twitter @chapagain # Hello There! Have a great day. :) #good #morning http://chapagain.com.np"

# print cleaned tweet
print(process_tweet(custom_tweet))

['hello', 'great', 'day', ':)', 'good', 'morn']


In [6]:
def frequncy_dictionary(tweets, labels):
    freq_dict = {}
    for tweet, label in zip(tweets, labels):
        for word in process_tweet(tweet):
            pair = (word, label)
            if pair in freq_dict:
                freq_dict[pair] +=1
            else:
                freq_dict[pair] = 1
    return freq_dict


In [7]:
freq_dict = frequncy_dictionary(train_x,train_y)

In [8]:
len(freq_dict)

11427

In [9]:
def train_naive_bayes(freqs, train_x, train_y):

    loglikelihood = {}
    logprior = 0 

    # calcuate the number of unique words in the vocabulary 
    vocab = set([k[0] for k, v in freqs.items()])
    V = len(vocab)

    # Calculate the N_pos, N_neg, V_pos, V_neg

    N_pos = N_neg = 0 
    for pair in freqs.keys():
        if pair[1]>0:
            N_pos += freqs[pair]
        else:
            N_neg += freqs[pair]

    # Calculating the number of documents(tweets here)
    D = len(train_y)
    
    # Calculating the number of positive number 
    D_pos = len([x for x in train_y if x == 1])

    # Calculating the number of negative number 
    D_neg = len([x for x in train_y if x ==0])

    logprior = np.log(D_pos) - np.log(D_neg)

    for word in vocab:
        freq_pos = freqs.get((word,1), 0)
        freq_neg = freqs.get((word,0), 0)

        # calculate the proability that each word is positive, and negative
        p_w_pos = (freq_pos +1)/(N_pos + V)
        p_w_neg = (freq_neg +1)/ (N_pos + V)

        # Calculate the loglikelihood of the word 
        loglikelihood[word] = np.log(p_w_pos/p_w_neg)

    return logprior, loglikelihood



In [10]:
logprior, loglikelihood = train_naive_bayes(freq_dict, train_x, train_y)
print(logprior, len(loglikelihood))

0.0 9161


In [11]:
print(loglikelihood)

{'obama': 0.6931471805599453, 'went': -0.9343092373768332, 'sing': -0.6931471805599453, 'nb': 0.6931471805599453, 'jace': -0.6931471805599453, 'dont': -1.203972804325936, '482': -0.6931471805599453, '27juli': 0.6931471805599453, 'interest': 1.041453874828161, 'starbuck': -0.6931471805599453, 'kindest': 1.0986122886681098, 'yall': 0.0, 'seungchan': -0.6931471805599453, 'skin': 1.791759469228055, 'there': -0.5108256237659905, 'disappear': -0.6931471805599453, 'pblogger': 0.6931471805599453, 'inspi': -0.6931471805599453, 'ignor': -2.0794415416798357, 'movement': -0.6931471805599453, 'tree': -0.2876820724517809, 'leed': -1.6094379124341003, 'voucher': 0.6931471805599453, 'return': -0.8472978603872036, 'christina': 0.6931471805599453, 'lead': 0.0, 'citi': 0.8109302162163288, "otp'": -0.6931471805599453, 'carniv': 0.6931471805599453, 'handl': -0.40546510810816444, 'kb': 0.0, 'amaz': 1.1727202608218315, 'derek': -0.6931471805599453, 'yey': 0.6931471805599453, 'lure': -0.6931471805599453, 'req

In [12]:
def naive_bayes_predict(tweet, logprior, loglikelihood):
    word_list = process_tweet(tweet)

    # intializing the probability to 0 
    p = 0 

    # Adding the logproior 

    p+= logprior

    for word in word_list:
        if word in loglikelihood:
            p += loglikelihood[word]

    return p 


In [13]:
my_tweet = "hi she is smiling "
p = naive_bayes_predict(my_tweet, logprior, loglikelihood)
print(p)

3.244745204846618


In [14]:
m_t = "DO you not want to go out ?"

p_ = naive_bayes_predict(m_t, logprior, loglikelihood)
print(p_)

-1.3328056761689044


In [17]:
def test_naive_bayes(test_x, test_y, logprior, loglikelihood,naive_bayes_predict= naive_bayes_predict):
    accuracy= 0 

    y_hats = []

    for tweet in test_x:
        if naive_bayes_predict(tweet,logprior, loglikelihood) > 0:
            y_hat_i = 1
        else:
            y_hat_i = 0

        y_hats.append(y_hat_i)

    error = np.mean(np.abs(test_y - y_hats))

    accuracy = 1 - error

    return accuracy

In [18]:
print("Naive Bayes accuracy = %0.4f" %
      (test_naive_bayes(test_x, test_y, logprior, loglikelihood)))

Naive Bayes accuracy = 0.9955


In [19]:
my_twee = "you are bad "

naive_bayes_predict(my_twee, logprior, loglikelihood)

-1.2992829841302609

Get ratio


In [20]:
def lookup(freq_dict, word, label):
    n = 0
    pair =(word, label)
    if pair in freq_dict:
        n = freq_dict[pair]
    return n 

In [22]:
def get_ratio(freq_dict, word):
    pos_neg_ratio = {'positive':0, 'negative':0, 'ratio':0.0}

    pos_neg_ratio['positive'] = lookup(freq_dict, word, 1)

    pos_neg_ratio['negative'] = lookup(freq_dict, word, 0 )

    pos_neg_ratio['ratio'] = (pos_neg_ratio['positive'] + 1)/ (pos_neg_ratio['negative'] + 1)

    return pos_neg_ratio



In [23]:
get_ratio(freq_dict, 'love')

{'positive': 336, 'negative': 128, 'ratio': 2.612403100775194}

In [27]:
def get_words_by_threshold(freq_dict, label, threshold, get_ratio= get_ratio):
    word_list ={}

    for key in freq_dict.keys():
        word,_ = key

        pos_neg_ratio = get_ratio(freq_dict,word)

        if label == 1 and pos_neg_ratio['ratio'] >= threshold:
            word_list[word] = pos_neg_ratio
        elif label == 0 and pos_neg_ratio['ratio'] <= threshold:
            word_list[word] = pos_neg_ratio

    return word_list


In [28]:
get_words_by_threshold(freq_dict, 1, 0.5)

{'followfriday': {'positive': 23, 'negative': 0, 'ratio': 24.0},
 'top': {'positive': 30, 'negative': 5, 'ratio': 5.166666666666667},
 'engag': {'positive': 7, 'negative': 0, 'ratio': 8.0},
 'member': {'positive': 14, 'negative': 6, 'ratio': 2.142857142857143},
 'commun': {'positive': 27, 'negative': 1, 'ratio': 14.0},
 'week': {'positive': 72, 'negative': 47, 'ratio': 1.5208333333333333},
 ':)': {'positive': 2960, 'negative': 2, 'ratio': 987.0},
 'hey': {'positive': 60, 'negative': 20, 'ratio': 2.9047619047619047},
 'jame': {'positive': 7, 'negative': 3, 'ratio': 2.0},
 'odd': {'positive': 2, 'negative': 2, 'ratio': 1.0},
 ':/': {'positive': 5, 'negative': 8, 'ratio': 0.6666666666666666},
 'call': {'positive': 27, 'negative': 22, 'ratio': 1.2173913043478262},
 'contact': {'positive': 4, 'negative': 7, 'ratio': 0.625},
 'centr': {'positive': 1, 'negative': 2, 'ratio': 0.6666666666666666},
 '02392441234': {'positive': 1, 'negative': 0, 'ratio': 2.0},
 'assist': {'positive': 1, 'negative

In [30]:
get_words_by_threshold(freq_dict, 0, 10)

{'top': {'positive': 30, 'negative': 5, 'ratio': 5.166666666666667},
 'engag': {'positive': 7, 'negative': 0, 'ratio': 8.0},
 'member': {'positive': 14, 'negative': 6, 'ratio': 2.142857142857143},
 'week': {'positive': 72, 'negative': 47, 'ratio': 1.5208333333333333},
 'hey': {'positive': 60, 'negative': 20, 'ratio': 2.9047619047619047},
 'jame': {'positive': 7, 'negative': 3, 'ratio': 2.0},
 'odd': {'positive': 2, 'negative': 2, 'ratio': 1.0},
 ':/': {'positive': 5, 'negative': 8, 'ratio': 0.6666666666666666},
 'pleas': {'positive': 81, 'negative': 243, 'ratio': 0.3360655737704918},
 'call': {'positive': 27, 'negative': 22, 'ratio': 1.2173913043478262},
 'contact': {'positive': 4, 'negative': 7, 'ratio': 0.625},
 'centr': {'positive': 1, 'negative': 2, 'ratio': 0.6666666666666666},
 '02392441234': {'positive': 1, 'negative': 0, 'ratio': 2.0},
 'abl': {'positive': 6, 'negative': 17, 'ratio': 0.3888888888888889},
 'assist': {'positive': 1, 'negative': 0, 'ratio': 2.0},
 'mani': {'positi

# Error analysis

In [31]:
print('Truth Predicted Tweet')

for x, y in zip(test_x, test_y):
    y_hat = naive_bayes_predict(x,logprior, loglikelihood)
    if y != (np.sign(y_hat)>0):
        print('%d\t%0.2f\t%s' % (y, np.sign(y_hat) > 0 , ' '.join(
            process_tweet(x)).encode('ascii','ignore')))

Truth Predicted Tweet
1	0.00	b'truli later move know queen bee upward bound movingonup'
1	0.00	b'new report talk burn calori cold work harder warm feel better weather :p'
1	0.00	b'harri niall 94 harri born ik stupid wanna chang :d'
1	0.00	b'park get sunlight'
1	0.00	b'uff itna miss karhi thi ap :p'
0	1.00	b'hello info possibl interest jonatha close join beti :( great'
0	1.00	b'u prob fun david'
0	1.00	b'pat jay'
0	1.00	b'sr financi analyst expedia inc bellevu wa financ expediajob job job hire'


# Predcit with your own tweet 

In [32]:
my_tweet = "I am happy because I am Learning :)"

p = naive_bayes_predict(my_tweet, logprior, loglikelihood)
print(p)

9.603597049009228
