In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
from nltk.stem import PorterStemmer
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn
from sklearn.utils import resample
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from nltk.classify.scikitlearn import SklearnClassifier
import math

We declare a function, clean_comment, to regex and tokenize comments.

In [None]:
stop_words = set(stopwords.words("english"))

def clean_comment(comment):
    ps = PorterStemmer()
    regex = re.compile('[^ a-zA-Z]')
    cleaned_comment = regex.sub('', comment)
    tokenized_words = word_tokenize(cleaned_comment.lower())
    cleaned_comments = []

    for word in tokenized_words:
        if word not in stop_words:
            cleaned_comments.append(ps.stem(word))

    return cleaned_comments

We start by creating a df of our sample data.

In [None]:
df = pd.read_csv('stock_data.csv', sep=',', encoding='latin-1')

Let us look into class balance in the above df.

In [None]:
plt.figure(figsize = None)
sns.set_theme(style="darkgrid")
sns.countplot(x="Sentiment", data=df)
plt.title("Pos vs. Negative Sentiment", fontsize = 15)
plt.show()

Based on the above, we need to upsample our negative sentiment.

In [None]:
df_majority = df[df['Sentiment'] == 1]
df_minority = df[df['Sentiment'] == -1]

minority_upsample = resample(df_minority, replace = True, n_samples = df_majority.shape[0], random_state=101)

df_upsampled = pd.concat([minority_upsample, df_majority])
df_upsampled = df_upsampled.sample(frac=1)

Check our upsample

In [None]:
plt.figure(figsize = None)
sns.set_theme(style="darkgrid")
sns.countplot(x="Sentiment", data=df_upsampled)
plt.title("Pos vs. Negative Sentiment", fontsize = 15)
plt.show()

In [None]:
print(df_upsampled)

 We begin by compiling a feature list of words from our positive and negative comments to begin to see trends in which words fall into which category generally.

We begin by building a frequency distribution of words in our comments. We can also use this to build our vocab.

In [35]:
# TASK CELL
def word_counter(comments, sentiment):
    '''
    Params:
        comments: a list of comments
        sentiment: a list corresponding to the sentiment of each message (either 0 or 1)
    Return:
        output_occurence: a dictionary mapping each pair to its frequency
    '''

    output_occurence = {}
    vocab = []


    for label, comment in zip(sentiment, comments):
        for word in clean_comment(comment):
            vocab.append(word)
            composite_key = (word, label)
            keys = output_occurence.keys()
            if composite_key in keys:
                output_occurence[composite_key] += 1
            else:
                output_occurence[composite_key] = 1

    vocab = set(vocab)
    
    return output_occurence, vocab

Now, we categorize each word as positive or negative. We build a function that finds words in our comments and gets their most frequent classification from the dictionary created in find_occurence.

In [38]:
def classify_word(frequency, word):
    '''
    Params:
        frequency: a dictionary with the frequency of each word
        word: the word to look up
        label: the label corresponding to the word
    Return:
        n: the number of times the word with its corresponding label appears.
    '''
    composite_key_pos = (word, 1)
    composite_key_neg = (word, -1)
    n_pos = 0
    n_neg = 0


    if composite_key_pos in frequency:
        n_pos = frequency[composite_key_pos]
    if composite_key_neg in frequency:
        n_neg = frequency[composite_key_neg]

    if n_pos >= n_neg:
        return (word, 1)
    else:
        return (word, -1)
        



In [39]:
word_counts, vocab = word_counter(df_upsampled['Text'], df_upsampled['Sentiment'])
feature_set = [classify_word(word_counts, w) for w in vocab]

print(feature_set)

[('xi', 1), ('eventu', 1), ('deleverag', 1), ('sieg', -1), ('wbac', 1), ('whose', -1), ('pronounc', 1), ('fox', 1), ('httpstcocojsgvyph', 1), ('allgarbag', -1), ('payday', 1), ('moodi', -1), ('hangout', 1), ('relief', 1), ('weather', -1), ('achiev', 1), ('fomc', 1), ('goreal', 1), ('mtz', -1), ('inc', -1), ('sub', -1), ('screener', 1), ('restart', -1), ('touch', 1), ('traffic', -1), ('rhythm', 1), ('hire', 1), ('packagei', 1), ('snta', 1), ('seem', -1), ('cytxpdat', -1), ('monthshttpstcoztullqf', 1), ('dontbeshort', 1), ('httpstcoqvbcwl', 1), ('pki', 1), ('sec', -1), ('design', 1), ('httpstcorhrcnzinh', 1), ('dn', 1), ('marijuana', 1), ('consider', 1), ('tivo', 1), ('x', 1), ('ine', -1), ('byproduct', 1), ('augh', 1), ('thursday', 1), ('juic', 1), ('som', 1), ('lawsuit', -1), ('tinker', -1), ('bondshttpstcozpfzung', -1), ('trail', 1), ('intend', -1), ('quit', 1), ('tobacco', 1), ('round', 1), ('rememb', 1), ('atampt', 1), ('pf', -1), ('httpstcokkcdftbko', 1), ('inform', 1), ('myx', 1),

Let us see how this works on our training data.