First, handle our imports.

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
from nltk.stem import PorterStemmer
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn
from sklearn.utils import resample
import math

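We declare a function, clean_comment, that strips non-letter characters from a comment, lowercases and tokenizes it, removes English stop words, and stems the remaining words.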

In [None]:
stop_words = set(stopwords.words("english"))


def clean_comment(comment):
    """Turn a raw comment into a list of stemmed, stop-word-free tokens.

    Every character other than a space or an ASCII letter is deleted, the
    remainder is lowercased and tokenized, tokens found in ``stop_words``
    are discarded, and each surviving token is Porter-stemmed.
    """
    stemmer = PorterStemmer()
    letters_only = re.sub('[^ a-zA-Z]', '', comment)
    tokens = word_tokenize(letters_only.lower())
    return [stemmer.stem(token) for token in tokens if token not in stop_words]

We start by creating a df of our training data.

In [None]:
# Comma-separated file (pandas' default delimiter), decoded as latin-1.
df = pd.read_csv('stock_data.csv', encoding='latin-1')

Let us look into class balance in the above df.

In [None]:
# Bar chart of how many rows carry each Sentiment value in the raw data.
# The theme is applied after the figure exists but before the axes are drawn.
plt.figure()
sns.set_theme(style="darkgrid")
sns.countplot(data=df, x="Sentiment").set_title("Pos vs. Negative Sentiment", fontsize=15)
plt.show()

Based on the above, we need to upsample our negative sentiment.

In [None]:
# Split the training data by class: 1 is the majority (positive) label and
# -1 the minority (negative) label.
df_majority = df[df['Sentiment'] == 1]
df_minority = df[df['Sentiment'] == -1]

# Sample the minority class with replacement until it matches the majority size.
minority_upsample = resample(df_minority, replace=True, n_samples=len(df_majority), random_state=101)

# Combine and shuffle. The shuffle is seeded as well, so the row order (and
# everything derived from it) is reproducible on a fresh kernel.
df_upsampled = pd.concat([minority_upsample, df_majority])
df_upsampled = df_upsampled.sample(frac=1, random_state=101)

Check the class balance again after upsampling.

In [None]:
# Bar chart of how many rows carry each Sentiment value after upsampling.
# The theme is applied after the figure exists but before the axes are drawn.
plt.figure()
sns.set_theme(style="darkgrid")
sns.countplot(data=df_upsampled, x="Sentiment").set_title("Pos vs. Negative Sentiment", fontsize=15)
plt.show()

Comment Counter & Occurrence Finder

In [None]:
def find_occurrence(frequency, word, label):
    '''
    Look up how often a word was seen with a given label.

    Params:
        frequency: a dictionary with the frequency of each (word, label) pair
        word: the word to look up
        label: the label corresponding to the word
    Return:
        n: the number of times the word with its corresponding label appears,
           or 0 if the pair was never recorded.
    '''
    # A pair that was never counted has frequency 0; it must not raise KeyError.
    return frequency.get((word, label), 0)

In [None]:
def comment_counter(output_occurrence, comments, sentiment):
    '''
    Count how often each cleaned word appears under each sentiment label.

    Params:
        output_occurrence: a dictionary mapping (word, label) pairs to counts.
            It is updated in place, so existing counts are kept and added to.
            Pass {} (or None) to start from scratch.
        comments: an iterable of raw comment strings
        sentiment: an iterable of labels, one per comment, in the same order
    Return:
        output_occurrence: the same dictionary, mapping each (word, label)
            pair to its frequency
    '''
    if output_occurrence is None:
        output_occurrence = {}

    for label, comment in zip(sentiment, comments):
        for word in clean_comment(comment):
            composite_key = (word, label)
            # New pairs start at 0, then every occurrence adds one.
            output_occurrence[composite_key] = output_occurrence.get(composite_key, 0) + 1

    return output_occurrence

Create our frequency dictionary

In [None]:
comments = df_upsampled['Text']
sentiment = df_upsampled['Sentiment']
frequencies = comment_counter(output_occurrence={}, comments=comments, sentiment=sentiment)

# Summarise rather than dumping the whole dictionary: report its size and
# show the ten most frequent (word, label) pairs.
print(f"{len(frequencies)} distinct (word, label) pairs")
sorted(frequencies.items(), key=lambda item: item[1], reverse=True)[:10]

Implementation of the Naive Bayes Function. This implementation is from A1.

In [None]:
def train_naive_bayes(frequencies, comments, sentiment):
    '''
    Train a binary Naive Bayes classifier with Laplace (add-one) smoothing.

    A label greater than zero is treated as positive; any other label
    (e.g. 0 or -1) is treated as negative.

    Input:
        frequencies: dictionary from (word, label) to how often the word
            appears with that label
        comments: the training comments. Not read here (all word counts come
            from `frequencies`); kept so existing callers keep working.
        sentiment: an iterable of labels, one per training comment
    Output:
        logprior: log(D_pos) - log(D_neg), where D_pos / D_neg are the numbers
            of positive / negative training comments
        loglikelihood: dictionary from word to
            log(P(word | positive) / P(word | negative))
    Raises:
        ValueError: if `sentiment` does not contain at least one positive and
            one negative label (the log prior would be undefined).
    '''
    # Per-word counts split by class, decided by the LABEL in the key
    # (not by the count, which is always positive).
    pos_counts = {}
    neg_counts = {}
    for (word, label), count in frequencies.items():
        class_counts = pos_counts if label > 0 else neg_counts
        class_counts[word] = class_counts.get(word, 0) + count

    # Unique vocabulary, kept in first-seen order so the output is deterministic.
    vocab = list(dict.fromkeys(word for word, _ in frequencies))
    vocab_size = len(vocab)

    # Total number of word occurrences in each class.
    num_pos = sum(pos_counts.values())
    num_neg = sum(neg_counts.values())

    # Number of positive and negative documents.
    pos_num_docs = 0
    neg_num_docs = 0
    for label in sentiment:
        if label > 0:
            pos_num_docs += 1
        else:
            neg_num_docs += 1

    if pos_num_docs == 0 or neg_num_docs == 0:
        raise ValueError(
            "train_naive_bayes needs at least one positive and one negative "
            "training label to compute the log prior"
        )

    logprior = math.log(pos_num_docs) - math.log(neg_num_docs)

    loglikelihood = {}
    for word in vocab:
        # Add-one smoothing: the denominator is the class word total plus |V|.
        p_w_pos = (pos_counts.get(word, 0) + 1) / (num_pos + vocab_size)
        p_w_neg = (neg_counts.get(word, 0) + 1) / (num_neg + vocab_size)
        loglikelihood[word] = math.log(p_w_pos / p_w_neg)

    return logprior, loglikelihood

In [None]:
# Fit the model on the upsampled training comments and their labels; the
# resulting logprior and loglikelihood are what naive_bayes_predict consumes.
logprior, loglikelihood = train_naive_bayes(frequencies, comments, sentiment)

Now that we've implemented Naive Bayes, we should test it.

In [None]:
# TODO: add sanity checks for the trained model here, e.g. assertions on a
# tiny hand-built frequency dictionary with known log-likelihoods.

We can now implement our naive bayes predict function.

In [None]:
def naive_bayes_predict(comment, logprior, loglikelihood):
    """Classify a raw comment as positive (1) or negative (-1).

    The score starts at ``logprior`` and adds the log-likelihood of every
    cleaned token that has an entry in ``loglikelihood``; tokens without an
    entry contribute nothing. A score strictly above zero yields 1, anything
    else (including exactly zero) yields -1.
    """
    score = 0 + logprior
    for token in clean_comment(comment):
        if token in loglikelihood:
            score += loglikelihood[token]

    return 1 if score > 0 else -1
    

We can then apply this to the WSB comments.

In [None]:
# Read one comment per line. The context manager closes the file even if
# reading fails. Line terminators are stripped so they do not end up in the
# dictionary keys, and blank lines are skipped.
with open('wsb_comments.txt', 'r') as text:
    comments = [line.rstrip('\r\n') for line in text if line.strip()]

# Map each comment to its predicted sentiment (1 or -1). Identical comments
# share a single entry.
rated_comments = {
    comment: naive_bayes_predict(comment, logprior, loglikelihood)
    for comment in comments
}

print(rated_comments)