In [12]:
# !pip install googletrans==3.1.0a0
# !pip install sinling


##### Imports


In [13]:
from googletrans import Translator
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
from sinling import SinhalaStemmer
from sklearn.model_selection import train_test_split
import nltk
import numpy as np
import pandas as pd
import re
import string


##### Download stop words


In [14]:
from nltk.corpus import stopwords
nltk.download("stopwords")


[nltk_data] Downloading package stopwords to /home/chamal/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

##### Defining Sinhala stop words


In [15]:
stopwords_sinhala = ["‡∑É‡∑Ñ", "‡∑É‡∂∏‡∂ú", "‡∑É‡∂∏‡∂ü", "‡∂Ö‡∑Ñ‡∑è", "‡∂Ü‡∑Ñ‡∑ä", "‡∂Ü", "‡∂ï‡∑Ñ‡∑ù", "‡∂Ö‡∂±‡∑ö", "‡∂Ö‡∂≥‡∑ù", "‡∂Ö‡∂¥‡∑ú‡∂∫‡∑í", "‡∂Ö‡∂¥‡∑ù", "‡∂Ö‡∂∫‡∑í‡∂∫‡∑ù", "‡∂Ü‡∂∫‡∑í", "‡∂å‡∂∫‡∑í", "‡∂†‡∑ì", "‡∂†‡∑í‡∑Ñ‡∑ä", "‡∂†‡∑í‡∂ö‡∑ä", "‡∑Ñ‡∑ù‚Äç", "‡∂Ø‡∑ù", "‡∂Ø‡∑ù‡∑Ñ‡∑ù", "‡∂∏‡∑ô‡∂±‡∑ä", "‡∑É‡∑ö", "‡∑Ä‡∑ê‡∂±‡∑í", "‡∂∂‡∂≥‡∑î", "‡∑Ä‡∂±‡∑ä", "‡∂Ö‡∂∫‡∑î‡∂ª‡∑î", "‡∂Ö‡∂∫‡∑î‡∂ª‡∑í‡∂±‡∑ä", "‡∂Ω‡∑ô‡∑É", "‡∑Ä‡∑ê‡∂©‡∑í", "‡∑Å‡∑ä‚Äç‡∂ª‡∑ì", "‡∑Ñ‡∑è", "‡∂∫", "‡∂±‡∑í‡∑É‡∑è", "‡∂±‡∑í‡∑É‡∑è‡∑Ä‡∑ô‡∂±‡∑ä", "‡∂∂‡∑Ä‡∂ß", "‡∂∂‡∑Ä", "‡∂∂‡∑Ä‡∑ô‡∂±‡∑ä", "‡∂±‡∂∏‡∑ä", "‡∑Ä‡∑ê‡∂©‡∑í", "‡∑É‡∑í‡∂ß", "‡∂Ø‡∑ì", "‡∂∏‡∑Ñ‡∑è", "‡∂∏‡∑Ñ", "‡∂¥‡∂∏‡∂´", "‡∂¥‡∂∏‡∂´‡∑í‡∂±‡∑ä", "‡∂¥‡∂∏‡∂±", "‡∑Ä‡∂±", "‡∑Ä‡∑í‡∂ß", "‡∑Ä‡∑í‡∂ß‡∑í‡∂±‡∑ä", "‡∂∏‡∑ö", "‡∂∏‡∑ô‡∂Ω‡∑ô‡∑É", "‡∂∏‡∑ô‡∂∫‡∑í‡∂±‡∑ä", "‡∂á‡∂≠‡∑í", "‡∂Ω‡∑ô‡∑É", "‡∑É‡∑í‡∂Ø‡∑î", "‡∑Ä‡∑Å‡∂∫‡∑ô‡∂±‡∑ä", "‡∂∫‡∂±", "‡∑É‡∂≥‡∑Ñ‡∑è", "‡∂∏‡∂ú‡∑í‡∂±‡∑ä", "‡∑Ñ‡∑ù‚Äç", "‡∂â‡∂≠‡∑è", "‡∂í", "‡∂ë‡∂∏", "‡∂Ø", "‡∂Ö‡∂≠‡∂ª", "‡∑Ä‡∑í‡∑É‡∑í‡∂±‡∑ä", "‡∑É‡∂∏‡∂ú", "‡∂¥‡∑í‡∑Ö‡∑í‡∂∂‡∂≥‡∑Ä", "‡∂¥‡∑í‡∑Ö‡∑í‡∂∂‡∂≥", "‡∂≠‡∑î‡∑Ö", "‡∂∂‡∑Ä", "‡∑Ä‡∑ê‡∂±‡∑í", "‡∂∏‡∑Ñ", "‡∂∏‡∑ô‡∂∏", "‡∂∏‡∑ô‡∑Ñ‡∑í", "‡∂∏‡∑ö", "‡∑Ä‡∑ô‡∂≠", "‡∑Ä‡∑ô‡∂≠‡∑í‡∂±‡∑ä", "‡∑Ä‡∑ô‡∂≠‡∂ß", "‡∑Ä‡∑ô‡∂±‡∑î‡∑Ä‡∑ô‡∂±‡∑ä", "‡∑Ä‡∑ô‡∂±‡∑î‡∑Ä‡∂ß", "‡∑Ä‡∑ô‡∂±", "‡∂ú‡∑ê‡∂±", "‡∂±‡∑ë", "‡∂Ö‡∂±‡∑î‡∑Ä", "‡∂±‡∑Ä", "‡∂¥‡∑í‡∑Ö‡∑í‡∂∂‡∂≥", "‡∑Ä‡∑í‡∑Å‡∑ö‡∑Ç", "‡∂Ø‡∑ê‡∂±‡∂ß", "‡∂ë‡∑Ñ‡∑ô‡∂±‡∑ä", "‡∂∏‡∑ô‡∑Ñ‡∑ô‡∂±‡∑ä", "‡∂ë‡∑Ñ‡∑ö", "‡∂∏‡∑ô‡∑Ñ‡∑ö", "‡∂∏", "‡∂≠‡∑Ä‡∂≠‡∑ä", "‡∂≠‡∑Ä ", "‡∑É‡∑Ñ",
                     "‡∂Ø‡∂ö‡∑ä‡∑Ä‡∑è", "‡∂ß", "‡∂ú‡∑ö", "‡∂ë", "‡∂ö", "‡∂ö‡∑ä", "‡∂∂‡∑Ä‡∂≠‡∑ä", "‡∂∂‡∑Ä‡∂Ø", "‡∂∏‡∂≠", "‡∂á‡∂≠‡∑î‡∂Ω‡∑î", "‡∂á‡∂≠‡∑î‡∑Ö‡∑î", "‡∂∏‡∑ô‡∑É‡∑ö", "‡∑Ä‡∂©‡∑è", "‡∑Ä‡∂©‡∑è‡∂≠‡∑ä‡∂∏", "‡∂±‡∑í‡∂≠‡∑í", "‡∂±‡∑í‡∂≠‡∑í‡∂≠‡∑ä", "‡∂±‡∑í‡∂≠‡∑ú‡∂ª", "‡∂±‡∑í‡∂≠‡∂ª", "‡∂â‡∂ö‡∑ä‡∂∂‡∑í‡∂≠‡∑í", "‡∂Ø‡∑ê‡∂±‡∑ä", "‡∂∫‡∂Ω‡∑í", "‡∂¥‡∑î‡∂±", "‡∂â‡∂≠‡∑í‡∂±‡∑ä", "‡∑É‡∑í‡∂ß", "‡∑É‡∑í‡∂ß‡∂±‡∑ä", "‡∂¥‡∂ß‡∂±‡∑ä", "‡∂≠‡∑ô‡∂ö‡∑ä", "‡∂Ø‡∂ö‡∑ä‡∑Ä‡∑è", "‡∑É‡∑è", "‡∂≠‡∑è‡∂ö‡∑ä", "‡∂≠‡∑î‡∑Ä‡∂ö‡∑ä", "‡∂¥‡∑Ä‡∑è", "‡∂Ø", "‡∑Ñ‡∑ù‚Äç", "‡∑Ä‡∂≠‡∑ä", "‡∑Ä‡∑í‡∂±‡∑è", "‡∑Ñ‡∑ê‡∂ª", "‡∂∏‡∑í‡∑É", "‡∂∏‡∑î‡∂≠‡∑ä", "‡∂ö‡∑í‡∂∏", "‡∂ö‡∑í‡∂∏‡∑ä", "‡∂á‡∂∫‡∑í", "‡∂∏‡∂±‡∑ä‡∂Ø", "‡∑Ñ‡∑ô‡∑Ä‡∂≠‡∑ä", "‡∂±‡∑ú‡∑Ñ‡∑ú‡∂≠‡∑ä", "‡∂¥‡∂≠‡∑è", "‡∂¥‡∑è‡∑É‡∑è", "‡∂ú‡∑è‡∂±‡∑ô", "‡∂≠‡∑Ä", "‡∂â‡∂≠‡∑è", "‡∂∂‡∑ú‡∑Ñ‡∑ù", "‡∑Ä‡∑Ñ‡∑è", "‡∑É‡∑ô‡∂Ø", "‡∑É‡∑ê‡∂±‡∑í‡∂±‡∑ä", "‡∑Ñ‡∂±‡∑í‡∂ö", "‡∂ë‡∂∏‡∑ä‡∂∂‡∑è", "‡∂ë‡∂∏‡∑ä‡∂∂‡∂Ω", "‡∂∂‡∑ú‡∂Ω", "‡∂±‡∂∏‡∑ä", "‡∑Ä‡∂±‡∑è‡∑Ñ‡∑í", "‡∂ö‡∂Ω‡∑ì", "‡∂â‡∂≥‡∑î‡∂ª‡∑è", "‡∂Ö‡∂±‡∑ä‡∂±", "‡∂î‡∂±‡∑ä‡∂±", "‡∂∏‡∑ô‡∂±‡∑ä‡∂±", "‡∂ã‡∂Ø‡∑ô‡∑É‡∑è", "‡∂¥‡∑í‡∂´‡∑í‡∑É", "‡∑É‡∂≥‡∑Ñ‡∑è", "‡∂Ö‡∂ª‡∂∂‡∂∫‡∑è", "‡∂±‡∑í‡∑É‡∑è", "‡∂ë‡∂±‡∑í‡∑É‡∑è", "‡∂ë‡∂∂‡∑ê‡∑Ä‡∑í‡∂±‡∑ä", "‡∂∂‡∑ê‡∑Ä‡∑í‡∂±‡∑ä", "‡∑Ñ‡∑ô‡∂∫‡∑í‡∂±‡∑ä", "‡∑É‡∑ö‡∂ö‡∑ä", "‡∑É‡∑ö‡∂ö", "‡∂ú‡∑ê‡∂±", "‡∂Ö‡∂±‡∑î‡∑Ä", "‡∂¥‡∂ª‡∑í‡∂Ø‡∑í", "‡∑Ä‡∑í‡∂ß", "‡∂≠‡∑ô‡∂ö‡∑ä", "‡∂∏‡∑ô‡∂≠‡∑ô‡∂ö‡∑ä", "‡∂∏‡∑ö‡∂≠‡∑è‡∂ö‡∑ä", "‡∂≠‡∑î‡∂ª‡∑î", "‡∂≠‡∑î‡∂ª‡∑è", "‡∂≠‡∑î‡∂ª‡∑è‡∑Ä‡∂ß", "‡∂≠‡∑î‡∂Ω‡∑í‡∂±‡∑ä", "‡∂±‡∂∏‡∑î‡∂≠‡∑ä", "‡∂ë‡∂±‡∂∏‡∑î‡∂≠‡∑ä", "‡∑Ä‡∑É‡∑ä", "‡∂∏‡∑ô‡∂±‡∑ä", "‡∂Ω‡∑ô‡∑É", "‡∂¥‡∂ª‡∑í‡∂Ø‡∑í", "‡∂ë‡∑Ñ‡∑ô‡∂≠‡∑ä"]


##### Load the dataset


In [16]:

df = pd.read_csv("./PublicFigureStatementsSinglish.xls", encoding="utf-16")
df.head(10)


Unnamed: 0,Statement,Impact,StateLength
0,‡∂∏‡∂∏ ‡∂±‡∑î‡∑Ä‡∂ª‡∂ë‡∑Ö‡∑í‡∂∫‡∑ö ‡∂â‡∂Ø‡∂±‡∑ä ‡∂ö‡∑ú‡∑Ö‡∂π‡∂ß ‡∂á‡∑Ä‡∑í‡∂Ω‡∑ä‡∂Ω‡∑è ‡∂ú‡∑è‡∂∫‡∂ö‡∂∫‡∑ô‡∂ö‡∑ä ‡∑Ä‡∑ô‡∂±‡∑ä‡∂±...,Positive,403
1,‡∂Ö‡∂Ø ‡∂ã‡∂Ø‡∑ö ‡∂Ø‡∑ê‡∂ö‡∂¥‡∑î ‡∑É‡∑î‡∂±‡∑ä‡∂Ø‡∂ª ‡∂Ø‡∂ª‡∑ä‡∑Å‡∂´‡∂∫‡∂ö‡∑ä. ‡∂±‡∑î‡∂ú‡∑ö‡∂ú‡∑ú‡∂© St.Johns...,Positive,298
2,"Smoking is a bad habit \nMenda , Daneeüèè‚úå.\nCri...",Positive,91
3,‡∑Ñ‡∑ô‡∑ÖPay for Business\r\niOS App ‡∂ë‡∂ö ‡∂∏‡∑ö ‡∑Ä‡∂± ‡∑Ä‡∑í‡∂ß Ap...,Positive,112
4,Shooting ‡∑Ä‡∂Ω‡∂ß ‡∂±‡∑î‡∑Ä‡∂ª ‡∂ú‡∑í‡∂∫‡∂¥‡∑î ‡∂∏‡∂ú‡∑ö ‡∑Ñ‡∑í‡∂≠ ‡∂ú‡∑í‡∂∫ ‡∂≠‡∑ê‡∂±‡∂ö‡∑ä.. ‚ù§Ô∏è...,Positive,359
5,‡∂∏‡∑ö ‡∂Ω‡∑É‡∑ä‡∑É‡∂± ‡∂Ö‡∑Ñ‡∑í‡∂Ç‡∑É‡∂ö ‡∂∏‡∑î‡∑Ñ‡∑î‡∂±‡∑î ‡∑Ä‡∂Ω‡∂ß ‡∑Ñ‡∑í‡∂±‡∑è‡∑Ä‡∂ö‡∑ä ‡∂ú‡∑ö‡∂±‡∑ä‡∂± ‡∂Ö‡∂¥‡∑í‡∂ß ...,Positive,386
6,‡∑Ñ‡∑ô‡∂ß ‡∂Ø‡∑í‡∂±‡∂∫‡∑ö ‡∂Ü‡∂ª‡∂∏‡∑ä‡∂∑ ‡∑Ä‡∂± ‡∂Ü‡∑É‡∑í‡∂∫‡∑è‡∂±‡∑î ‡∂ö‡∑î‡∑É‡∂Ω‡∑è‡∂± ‡∂ö‡∑ä‚Äç‡∂ª‡∑í‡∂ö‡∂ß‡∑ä ‡∂≠‡∂ª‡∂ü...,Positive,170
7,‡∂Ö‡∂ª‡∂ú‡∂Ω‡∂∫‡∑ö ‡∂±‡∑í‡∂∫‡∂∏‡∑î‡∑Ä‡∂±‡∑ä ‡∂Ø‡∂©‡∂∫‡∂∏ ‡∂±‡∑Ä‡∂≠‡∂±‡∑î !! \nLive video ‡∂ë‡∂ö‡∂ö...,Negative,412
8,‡∂Ö‡∂¥‡∑í ‡∂Ö‡∂¥‡∑ä‡∂¥‡∂†‡∑ä‡∂†‡∑í ‡∑Ä‡∂ß‡∑ö ‡∂Ü‡∂©‡∂∏‡∑ä‡∂∂‡∂ª‡∑ô‡∂±‡∑ä ‡∂â‡∂±‡∑ä‡∂±‡∑Ä‡∑è ‡∑Ä‡∂ú‡∑ö ‡∂ë‡∂∫‡∑è ‡∑É‡∑î‡∑Ä ...,Negative,270
9,‡∂∂‡∑ä‚Äç‡∂ª‡∑Ñ‡∑ä‡∂∏‡∑è‡∑É‡∑ä‡∂≠‡∑ä‚Äç‡∂ª ‡∂†‡∑í‡∂≠‡∑ä‚Äç‡∂ª‡∂¥‡∂ß‡∑í‡∂∫‡∑ö ‚Äú‡∂ö‡∑ö‡∑É‡∂ª‡∑í‡∂∫‡∑è‚Äù ‡∑É‡∑í‡∂Ç‡∂Ø‡∑î‡∑Ä‡∂ß c...,Positive,362


##### Filtering dataset into positives and negatives


In [17]:
all_positive_tweets = df[df["Impact"] ==
                         "Positive"]["Statement"].values.tolist()
all_negative_tweets = df[df["Impact"] ==
                         "Negative"]["Statement"].values.tolist()

print("Number of positive tweets: ", len(all_positive_tweets))
print("Number of negative tweets: ", len(all_negative_tweets))


Number of positive tweets:  517
Number of negative tweets:  505


##### Randomly choosing 300 samples for each positives and negatives


In [18]:
dataset_size = 300
all_positive_tweets = np.random.choice(
    all_positive_tweets, size=dataset_size, replace=False).tolist()
all_negative_tweets = np.random.choice(
    all_negative_tweets, size=dataset_size, replace=False).tolist()


print("Number of positive tweets: ", len(all_positive_tweets))
print("Number of negative tweets: ", len(all_negative_tweets))


Number of positive tweets:  300
Number of negative tweets:  300


##### Tweet processing function depending on the language to translate to


In [19]:
def process_tweet(tweet, language="si"):
    """Process tweet function.
    Input:
        tweet: a string containing a tweet
        language: language to translate to "si" or "en"
    Output:
        tweets_clean: a list of words containing the processed tweet
    """

    english_stemmer = PorterStemmer()
    sinhala_stemmer = SinhalaStemmer()

    stopwords_english = stopwords.words("english")

    # remove stock market tickers like $GE
    tweet = re.sub(r"\$\w*", "", tweet)

    # remove old style retweet text "RT"
    tweet = re.sub(r"^RT[\s]+", "", tweet)

    # remove hyperlinks
    tweet = re.sub(r"(http|https|ftp):\/\/(\S*)", "", tweet)

    # remove hashtag sign from words
    tweet = tweet.replace("#", "")

    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    tweets_clean = []
    translator = Translator()

    # remove stopwords and punctuation
    def filter_english(
        word): return word not in stopwords_english and word not in string.punctuation

    def filter_sinhala(
        word): return word not in stopwords_sinhala and word not in string.punctuation

    def stem_english(word):
        if (filter_english(word)):
            stem_word = english_stemmer.stem(word)  # stemming word
            tweets_clean.append(stem_word)

    def stem_sinhala(word):
        if (filter_sinhala(word)):
            stem_word = sinhala_stemmer.stem(word)  # stemming word
            tweets_clean.append(stem_word[0])

    if language == "en":

        for word in tweet_tokens:
            detected_language = translator.detect(word)
            if "en" in detected_language.lang:  # parse English word
                stem_english(word)
            elif "si" in detected_language.lang:  # parse Sinhala word
                if filter_sinhala(word):
                    translated_text = translator.translate(word)
                    tokenized_text = tokenizer.tokenize(translated_text.text)
                    for word in tokenized_text:
                        stem_english(word)

    elif language == "si":

        for word in tweet_tokens:
            detected_language = translator.detect(word)
            if "en" in detected_language.lang:  # parse English word
                if (filter_english(word)):
                    translated_text = translator.translate(word)
                    tokenized_text = tokenizer.tokenize(translated_text.text)
                    for word in tokenized_text:
                        stem_sinhala(word)
            elif "si" in detected_language.lang:  # parse Sinhala word
                stem_sinhala(word)

    return tweets_clean


##### Word frequency dictionary `(word, label): frequency`


In [20]:
def build_freqs(tweets, ys):
    """Build frequencies.
    Input:
        tweets: a list of tweets
        ys: an m x 1 array with the sentiment label of each tweet
            (either 0 or 1)
    Output:
        freqs: a dictionary mapping each (word, sentiment) pair to its
        frequency
    """
    # Convert np array to list since zip needs an iterable.
    # The squeeze is necessary or the list ends up with one element.
    # Also note that this is just a NOP if ys is already a list.
    yslist = np.squeeze(ys).tolist()

    # Start with an empty dictionary and populate it by looping over all tweets
    # and over all processed words in each tweet.
    freqs = {}
    for y, tweet in zip(yslist, tweets):
        for word in process_tweet(tweet):
            pair = (word, y)
            if pair in freqs:
                freqs[pair] += 1
            else:
                freqs[pair] = 1
    return freqs


##### Concatenate tweets and create labels accordingly


In [21]:
tweets = all_positive_tweets + all_negative_tweets
labels = np.append(np.ones((len(all_positive_tweets))),
                   np.zeros((len(all_negative_tweets))))

freqs = build_freqs(tweets, labels)


##### Divide train and test split data


In [22]:
# split the data into two pieces, one for training and one for testing (validation set)
train_x, test_x, train_y, test_y = train_test_split(
    tweets, labels, test_size=0.2, random_state=42)


##### Frequency lookup helper function


In [23]:
def lookup(freqs, word, label):
    """
    Input:
        freqs: a dictionary with the frequency of each pair (or tuple)
        word: the word to look up
        label: the label corresponding to the word
    Output:
        n: the number of times the word with its corresponding label appears.
    """
    n = 0

    n = freqs.get((word, label), 0)

    return n


##### Naive bayes model training function returning the logprior and loglikelihood


In [24]:
def train_naive_bayes(freqs, train_x, train_y):
    '''
    Input:
        freqs: dictionary from (word, label) to how often the word appears
        train_x: a list of tweets
        train_y: a list of labels correponding to the tweets (0,1)
    Output:
        logprior: the log prior. (equation 3 above)
        loglikelihood: the log likelihood of you Naive bayes equation.
    '''
    loglikelihood = {}
    logprior = 0

    # calculate V, the number of unique words in the vocabulary
    vocab = set([pair[0] for pair in freqs.keys()])
    V = len(vocab)

    # calculate N_pos, N_neg, V_pos, V_neg
    N_pos = N_neg = V_pos = V_neg = 0

    for pair in freqs.keys():
        # if the label is positive (greater than zero)
        if pair[1] > 0:
            # increment the count of unique positive words by 1
            V_pos += 1

            # Increment the number of positive words by the count for this (word, label) pair
            N_pos += freqs[pair]

        # else, the label is negative
        else:
            # increment the count of unique negative words by 1
            V_neg += 1

            # increment the number of negative words by the count for this (word,label) pair
            N_neg += freqs[pair]

    # Calculate D, the number of documents
    D = len(train_y)

    # Calculate D_pos, the number of positive documents
    D_pos = (len(list(filter(lambda x: x > 0, train_y))))

    # Calculate D_neg, the number of negative documents
    D_neg = (len(list(filter(lambda x: x <= 0, train_y))))

    # Calculate logprior
    logprior = np.log(D_pos) - np.log(D_neg)

    # For each word in the vocabulary...
    for word in vocab:
        # get the positive and negative frequency of the word
        freq_pos = lookup(freqs, word, 1)
        freq_neg = lookup(freqs, word, 0)

        # calculate the probability that each word is positive, and negative
        p_w_pos = (freq_pos + 1) / (N_pos + V)
        p_w_neg = (freq_neg + 1) / (N_neg + V)

        # calculate the log likelihood of the word
        loglikelihood[word] = np.log(p_w_pos) - np.log(p_w_neg)

    return logprior, loglikelihood


##### Creating the model


In [25]:
logprior, loglikelihood = train_naive_bayes(freqs, train_x, train_y)

print(logprior)
print(len(loglikelihood))


-0.01666705248521172
4441


##### Tweet prediction function

In [26]:
def naive_bayes_predict(tweet, logprior, loglikelihood):
    '''
    Input:
        tweet: a string
        logprior: a number
        loglikelihood: a dictionary of words mapping to numbers
    Output:
        p: the sum of all the logliklihoods of each word in the tweet (if found in the dictionary) + logprior (a number)

    '''
    # process the tweet to get a list of words
    word_l = process_tweet(tweet)

    # initialize probability to zero
    p = 0

    # add the logprior
    p += logprior

    for word in word_l:

        # check if the word exists in the loglikelihood dictionary
        if word in loglikelihood:
            # add the log likelihood of that word to the probability
            p += loglikelihood[word]

    return p


In [30]:
def predictions(sample):
    y_hats = []
    for tweet in sample:
        # if the prediction is > 0
        if naive_bayes_predict(tweet, logprior, loglikelihood) > 0:
            # the predicted class is 1
            y_hat_i = 1
        else:
            # otherwise the predicted class is 0
            y_hat_i = 0

        # append the predicted class to the list y_hats
        y_hats.append(y_hat_i)
    return y_hats


##### Predict random samples

In [39]:
random_indices = np.random.randint(0, len(test_x) - 1, 5)
y_hats = predictions(np.take(test_x, random_indices))
print("Outputs")
data = []
for i in range(5):
    data.append([test_x[i], "positive" if y_hats[i] >
                0 else "negative", "‚úÖ" if y_hats[i] - test_y[i] == 0 else "‚ùå"])
pd.DataFrame(data, columns=["Statement", "Impact", "Result"])


Outputs


Unnamed: 0,Statement,Impact,Result
0,Shopify ‡∂∑‡∑è‡∑Ä‡∑í‡∂≠‡∑è ‡∂ö‡∂ª‡∂± ‡∂Ø‡∑ö‡∑Å‡∑ì‡∂∫ eCommerce ‡∑Ä‡∑ä‚Äç‡∂∫‡∑è‡∂¥‡∑è‡∂ª ‡∑É‡∂≥...,negative,‚ùå
1,Good Morning Everyone \n‡∂î‡∂∫‡∑è‡∂Ω ‡∂∂‡∂Ω‡∂±‡∑ä ‡∂â‡∂±‡∑ä‡∂±‡∑ô ‡∂Ö‡∑Ä‡∑î‡∂ª‡∑ê‡∂Ø...,negative,‚úÖ
2,Independent Televison Network Itn ‡∂ë‡∂ö‡∑ö ‡∂∂‡∑î‡∂Ø‡∑ä‡∂∞‡∑í‡∂∏‡∂≠...,positive,‚ùå
3,‡∂á‡∂≠‡∑ä‡∂≠‡∂ß‡∂∏ ‡∂î‡∂∫ ‡∂∫‡∂ß‡∂≠‡∑ä ‡∂ö‡∂ª‡∂ú‡∂≠‡∑ä‡∂≠‡∑è ‡∂ö‡∑í‡∑Ä‡∑ä‡∑Ä‡∂ß ‡∂í‡∂ö ‡∂í ‡∂ö‡∑è‡∂Ω‡∑ô Upper ...,negative,‚ùå
4,‡∂ö‡∑ú‡∑Ñ‡∑ö ‡∂¥‡∑í‡∂¥‡∑î‡∂´‡∂≠‡∑ä ‡∂∏‡∂Ω‡∑ä ‡∂ë‡∂ö‡∑Ä‡∂ú‡∑ö‡∂∏ ‡∑É‡∑î‡∑Ä‡∂≥‡∂∫‡∑ì.‡∑Ñ‡∂ª‡∑í‡∂∏ ‡∂Ω‡∑É‡∑ä‡∑É‡∂±‡∂ß ‡∂ú‡∑è‡∂∫...,positive,‚úÖ


In [31]:
def test_naive_bayes(test_x, test_y, logprior, loglikelihood):
    """
    Input:
        test_x: A list of tweets
        test_y: the corresponding labels for the list of tweets
        logprior: the logprior
        loglikelihood: a dictionary with the loglikelihoods for each word
    Output:
        accuracy: (# of tweets classified correctly)/(total # of tweets)
    """
    accuracy = 0  # return this properly

    y_hats = predictions(test_x)

    # error is the average of the absolute values of the differences between y_hats and test_y
    error = np.mean(np.absolute(test_y - y_hats))

    # Accuracy is 1 minus the error
    accuracy = 1 - error

    return accuracy


In [32]:
print("Naive Bayes accuracy = %0.4f" %
      (test_naive_bayes(test_x, test_y, logprior, loglikelihood)))


Naive Bayes accuracy = 0.9833
