In [1]:
import re
import string

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer


def process_tweet(tweet):
    '''
    Input:
        tweet: a string containing a tweet
    Output:
        tweets_clean: a list of words containing the processed tweet

    '''
    stemmer = PorterStemmer()
    stopwords_english = stopwords.words('english')
    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    # remove hashtags
    # only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)
    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    tweets_clean = []
    for word in tweet_tokens:
        if (word not in stopwords_english and  # remove stopwords
            word not in string.punctuation):  # remove punctuation
            # tweets_clean.append(word)
            stem_word = stemmer.stem(word)  # stemming word
            tweets_clean.append(stem_word)

    return tweets_clean


def test_lookup(func):
    freqs = {('sad', 0): 4,
             ('happy', 1): 12,
             ('oppressed', 0): 7}
    word = 'happy'
    label = 1
    if func(freqs, word, label) == 12:
        return 'SUCCESS!!'
    return 'Failed Sanity Check!'


def lookup(freqs, word, label):
    '''
    Input:
        freqs: a dictionary with the frequency of each pair (or tuple)
        word: the word to look up
        label: the label corresponding to the word
    Output:
        n: the number of times the word with its corresponding label appears.
    '''
    n = 0  # freqs.get((word, label), 0)

    pair = (word, label)
    if (pair in freqs):
        n = freqs[pair]

    return n

In [2]:
import pdb
from nltk.corpus import stopwords, twitter_samples
import numpy as np
import pandas as pd
import nltk
import string
from nltk.tokenize import TweetTokenizer
from os import getcwd
nltk.download('stopwords')
nltk.download('twitter_samples')
filePath = f"{getcwd()}/../tmp2/"
nltk.data.path.append(filePath)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\choud\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package twitter_samples to
[nltk_data]     C:\Users\choud\AppData\Roaming\nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!


In [3]:
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')
test_pos = all_positive_tweets[4000:]
train_pos = all_positive_tweets[:4000]
test_neg = all_negative_tweets[4000:]
train_neg = all_negative_tweets[:4000]
train_x = train_pos + train_neg
test_x = test_pos + test_neg
train_y = np.append(np.ones(len(train_pos)), np.zeros(len(train_neg)))
test_y = np.append(np.ones(len(test_pos)), np.zeros(len(test_neg)))
custom_tweet = "RT @Twitter @chapagain Hello There! Have a great day. :) #good #morning http://chapagain.com.np"
print(process_tweet(custom_tweet))

['hello', 'great', 'day', ':)', 'good', 'morn']


In [4]:
def count_tweets(result,tweets,ys):
    for y,tweet in zip(ys,tweets):
        for word in process_tweet(tweet):
            pair = (word,y)
            if pair in result:
                result[pair] += 1
            else:
                result[pair] = 1
    return result
result = {}
tweets = ['i am happy', 'i am tricked', 'i am sad', 'i am tired', 'i am tired']
ys = [1,0,0,0,0]
count_tweets(result,tweets,ys)
print(result)

{('happi', 1): 1, ('trick', 0): 1, ('sad', 0): 1, ('tire', 0): 2}


In [5]:
freqs=count_tweets({},train_x,train_y)
def train_naive_bayes(freqs,train_x,train_y):
    loglikelihood = {}
    logprior = 0
    vocab=set([pair[0] for pair in freqs.keys()])
    V = len(vocab)
    N_pos = N_neg = V_pos = V_neg = 0
    for pair in freqs.keys():
        if pair[1]>0:
            V_pos += 1
            N_pos += freqs[pair]
        else:
            V_neg += 1
            N_neg += freqs[pair]
    D = len(train_y)
    D_pos=len(list(filter(lambda x:x>0,train_y)))
    D_neg = D-D_pos
    logprior = np.log(D_pos)-np.log(D_neg)
    for word in vocab:
        freq_pos = lookup(freqs,word,1)
        freq_neg = lookup(freqs,word,0)
        p_w_pos = (freq_pos+1)/(N_pos+V)
        p_w_neg = (freq_neg+1)/(N_neg+V)
        loglikelihood[word] = np.log(p_w_pos/p_w_neg)
    return logprior,loglikelihood
logprior,loglikelihood = train_naive_bayes(freqs,train_x,train_y)
print(logprior)
print(len(loglikelihood))

0.0
9085


In [6]:
def naive_bayes_predict(tweet,logprior,loglikelihood):
    word_l = process_tweet(tweet)
    p = 0
    p += logprior
    for word in word_l:
        if word in loglikelihood:
            p += loglikelihood[word]
    return p
my_tweet = 'She smiled.'
p = naive_bayes_predict(my_tweet, logprior, loglikelihood)
print('The expected output is', p)

The expected output is 1.5737244858565678


In [7]:
def test_naive_bayes(test_x,test_y,logprior,loglikelihood):
    accuracy = 0
    y_hats = []
    for tweet in test_x:
        if naive_bayes_predict(tweet,logprior,loglikelihood)>0:
            y_hat_i = 1
        else:
            y_hat_i = 0
        y_hats.append(y_hat_i)
    error = np.mean(np.absolute(y_hats-test_y))
    accuracy = 1-error
    return accuracy
print("Naive Bayes accuracy = %0.4f" %
      (test_naive_bayes(test_x, test_y, logprior, loglikelihood)))


Naive Bayes accuracy = 0.9940


In [8]:
for tweet in ['I am happy', 'I am bad', 'this movie should have been great.', 'great', 'great great', 'great great great', 'great great great great']:
    p = naive_bayes_predict(tweet, logprior, loglikelihood)
    print(f'{tweet} -> {p}')


I am happy -> 2.148265924008666
I am bad -> -1.2941744161875384
this movie should have been great. -> 2.1429031920840935
great -> 2.137794624141371
great great -> 4.275589248282742
great great great -> 6.413383872424113
great great great great -> 8.551178496565484


In [9]:
def get_ratio(freqs,word):
    pos_neg_ratio = {'positive':0,'negative':0,'ratio':0.0}
    pos_neg_ratio['positive'] = lookup(freqs,word,1)
    pos_neg_ratio['negative'] = lookup(freqs,word,0)
    pos_neg_ratio['ratio'] = (pos_neg_ratio['positive']+1)/(pos_neg_ratio['negative']+1)
    return pos_neg_ratio

get_ratio(freqs,'happi')

{'positive': 161, 'negative': 18, 'ratio': 8.526315789473685}

In [10]:
def get_words_by_threshold(freqs,label,threshold):
    word_list = {}
    for key in freqs.keys():
        word,lab = key
        if lab == label:
            if freqs[key]>=threshold:
                word_list[key] = freqs[key]
    return word_list
get_words_by_threshold(freqs,1,10)
get_words_by_threshold(freqs,0,10)
#Error Analysis
print('Truth Predicted Tweet')
for x,y in zip(test_x,test_y):
    y_hat = naive_bayes_predict(x,logprior,loglikelihood)
    if y!= (np.sign(y_hat)>0):
        print('%d\t%0.2f\t%s' % (y,y_hat,x))

my_tweet = 'I am happy because I am learning :)'
p = naive_bayes_predict(my_tweet, logprior, loglikelihood)
print(p)

Truth Predicted Tweet
1	0.00	http://t.co/t2z9ax4qyd - hey, now! Come see us at the @NTCalkeAbbey Summer Fine Food Fair on Sunday, 11am-4pm in the Riding School :)
1	-1.50	@jaredNOTsubway @iluvmariah @Bravotv Then that truly is a LATERAL move! Now, we all know the Queen Bee is UPWARD BOUND : ) #MovingOnUp
1	-0.89	A new report talks about how we burn more calories in the cold, because we work harder to warm up. Feel any better about the weather? :p
1	-0.42	Harry and niall and -94 (when harry was born) ik it's stupid and i wanna change it :D https://t.co/gHAt8ZDAfF
1	0.00	@miabellasesso http://t.co/FtI5vLQJks @SBNation and a few select others.. will get to you :)
1	0.00	https://t.co/gBIMDzQBY5 Must watch :D #BajrangiBhaijaanHighestWeek1
1	-0.94	off to the park to get some sunlight : )
1	-0.40	@msarosh Uff Itna Miss karhy thy ap :p
0	0.74	@rcdlccom hello, any info about possible interest in Jonathas ?? He is close to join Betis :( greatings
0	1.58	@phenomyoutube u probs had more fun with d