In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
from nltk.stem import PorterStemmer
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn
from sklearn.utils import resample
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from nltk.classify.scikitlearn import SklearnClassifier
from nltk.classify import ClassifierI
import math
import random
import pickle
from statistics import mode

We declare a function, clean_comment, that strips non-letter characters with a regex, tokenizes the comment, removes stop words, and stems the remaining words.

In [None]:
stop_words = set(stopwords.words("english"))

# Built once: neither object depends on the comment being cleaned, so there is
# no reason to recreate them on every call.
_NON_ALPHA_PATTERN = re.compile(r'[^\sa-zA-Z]')
_STEMMER = PorterStemmer()


def clean_comment(comment):
    '''
    Normalize a raw comment into a list of stemmed, lower-cased tokens.

    Params:
        comment: the raw comment text (a string)
    Return:
        cleaned_comments: a list of Porter-stemmed tokens, in order of
            appearance, with English stop words removed
    '''
    # Drop everything except ASCII letters and whitespace. Whitespace other
    # than a plain space (tabs, newlines) is deliberately kept: deleting it
    # would glue the words on either side of it into a single token.
    letters_only = _NON_ALPHA_PATTERN.sub('', comment)
    tokenized_words = word_tokenize(letters_only.lower())

    # Stop words are filtered before stemming, so the check is made against the
    # unstemmed token.
    cleaned_comments = [
        _STEMMER.stem(word) for word in tokenized_words if word not in stop_words
    ]

    return cleaned_comments

We start by creating a df of our sample data.

In [None]:
# Load the labelled comments; later cells read the 'Text' and 'Sentiment' columns of this frame.
df = pd.read_csv('stock_data.csv', sep=',', encoding='latin-1')

Let us look into class balance in the above df.

In [None]:
# Bar chart of how many rows carry each sentiment label in the raw data.
plt.figure(figsize=None)
sns.set_theme(style="darkgrid")
balance_ax = sns.countplot(data=df, x="Sentiment")
balance_ax.set_title("Pos vs. Negative Sentiment", fontsize=15)
plt.show()

Based on the above, we need to upsample our negative sentiment.

In [None]:
# Split the frame by label: rows labelled 1 form the majority class, rows labelled -1 the minority.
df_majority = df[df['Sentiment'] == 1]
df_minority = df[df['Sentiment'] == -1]

# Sample the minority class with replacement until it matches the majority in size.
minority_upsample = resample(
    df_minority,
    replace=True,
    n_samples=df_majority.shape[0],
    random_state=101,
)

# Recombine and shuffle the rows. The shuffle is seeded like the resample above
# so that Restart & Run All reproduces the same row order.
df_upsampled = pd.concat([minority_upsample, df_majority])
df_upsampled = df_upsampled.sample(frac=1, random_state=101)

Check that upsampling has balanced the two classes.

In [None]:
# Same bar chart as before, now drawn from the upsampled frame.
plt.figure(figsize=None)
sns.set_theme(style="darkgrid")
upsampled_ax = sns.countplot(data=df_upsampled, x="Sentiment")
upsampled_ax.set_title("Pos vs. Negative Sentiment", fontsize=15)
plt.show()

In [None]:
# Preview the first few rows with the notebook's rich table display instead of
# printing the whole frame as plain text.
df_upsampled.head()

We begin by compiling a feature list of words from our positive and negative comments, so that we can start to see which words tend to fall into which category.

We begin by building a frequency distribution of words in our comments. We can also use this to build our vocab.

In [None]:
# TASK CELL
def word_counter(comments, sentiment):
    '''
    Count how often each cleaned word occurs under each sentiment label.

    Params:
        comments: a list of raw comments (each one is passed through clean_comment)
        sentiment: a list holding the sentiment label of each comment, in the same
            order as comments (the data in this notebook is labelled 1 and -1)
    Return:
        output_occurence: a dictionary mapping each (word, label) pair to its frequency
        vocab: the set of distinct cleaned words seen across all comments
    '''

    output_occurence = {}
    vocab = set()

    # zip stops at the shorter of the two inputs, so unmatched trailing items are ignored.
    for label, comment in zip(sentiment, comments):
        for word in clean_comment(comment):
            vocab.add(word)
            composite_key = (word, label)
            output_occurence[composite_key] = output_occurence.get(composite_key, 0) + 1

    return output_occurence, vocab

In [None]:
# Clean every comment once and pair it with its sentiment label.
comments = [clean_comment(comment) for comment in df_upsampled['Text']]
sentiments = list(df_upsampled['Sentiment'])
documents = list(zip(comments, sentiments))

# Shuffle with a private, seeded generator so the train/test split made later is
# reproducible, without touching the global `random` state.
random.Random(101).shuffle(documents)

# Frequency distribution over every cleaned token; its keys form the vocabulary.
vocab = nltk.FreqDist(word for comment in comments for word in comment)

vocab_features = list(vocab.keys())

print(documents[0][0])


Now, we prepare the features used to categorize comments as positive or negative. We build a function, find_features, that records for every word in our vocabulary (built above) whether or not it appears in a given comment.

In [None]:
def find_features(comment):
    '''
    Build a bag-of-words presence vector for one cleaned comment.

    Params:
        comment: the tokens of a single comment (e.g. the list returned by clean_comment)
    Return:
        features: a dictionary mapping every word in vocab_features to True if
            that word occurs in the comment and False otherwise
    '''
    # A set makes each membership test O(1); scanning the token list once per
    # vocabulary word would cost O(len(vocab_features) * len(comment)).
    words = set(comment)
    features = {w: (w in words) for w in vocab_features}

    return features
    
        



In [None]:
# Turn every (tokens, label) document into a (feature dictionary, label) pair.
featuresets = []
for doc_tokens, doc_label in documents:
    featuresets.append((find_features(doc_tokens), doc_label))

print(featuresets[1])

Let us train a Naive Bayes classifier on 80% of our data and see how it performs on the held-out remainder.

In [None]:
# Use the first 80% of the (already shuffled) feature sets for training.
cutoff = round(len(featuresets) * 0.8)
training_set = featuresets[:cutoff]
# Start the test slice at `cutoff` itself: slicing from cutoff + 1 would silently
# drop the sample at index `cutoff` from both sets.
testing_set = featuresets[cutoff:]

naive_bayes_classifier = nltk.NaiveBayesClassifier.train(training_set)
print("Classifier accuracy percent:",(nltk.classify.accuracy(naive_bayes_classifier, testing_set))*100)


We can see our most influential words below.

In [None]:
# Print the 15 features the Naive Bayes classifier found most informative.
naive_bayes_classifier.show_most_informative_features(15)

Excellent, we have an accuracy of 82%! Now, we should save this classifier to use when voting later.

In [None]:
# Persist the trained classifier. The context manager closes the file even if
# pickling raises part-way through.
with open('naivebayes.pickle', 'wb') as save_classifier:
    pickle.dump(naive_bayes_classifier, save_classifier)

Other classifiers - this may take a substantial amount of time to run.

In [None]:
# Wrap each scikit-learn estimator in NLTK's SklearnClassifier and fit it on the
# training set. train() returns the wrapper itself, so construction and fitting
# are chained into one expression per model.
MNB_classifier = SklearnClassifier(MultinomialNB()).train(training_set)

BNB_classifier = SklearnClassifier(BernoulliNB()).train(training_set)

LogisticRegression_classifier = SklearnClassifier(LogisticRegression()).train(training_set)

SGDClassifier_classifier = SklearnClassifier(SGDClassifier()).train(training_set)

SVC_classifier = SklearnClassifier(SVC()).train(training_set)

LinearSVC_classifier = SklearnClassifier(LinearSVC()).train(training_set)

NuSVC_classifier = SklearnClassifier(NuSVC()).train(training_set)

Let's check the accuracy of the above models.

In [None]:
# Report held-out accuracy for every scikit-learn backed classifier, in training order.
_accuracy_report = (
    ("MNB_classifier accuracy percent:", MNB_classifier),
    ("BNB accuracy percent:", BNB_classifier),
    ("LogisticRegression_classifier accuracy percent:", LogisticRegression_classifier),
    ("SGDClassifier_classifier accuracy percent:", SGDClassifier_classifier),
    ("SVC_classifier accuracy percent:", SVC_classifier),
    ("LinearSVC_classifier accuracy percent:", LinearSVC_classifier),
    ("NuSVC_classifier accuracy percent:", NuSVC_classifier),
)

for _report_label, _reported_classifier in _accuracy_report:
    print(_report_label, (nltk.classify.accuracy(_reported_classifier, testing_set))*100)


Declaration of our vote classifier class

In [18]:
class VotingClassifier(ClassifierI):
    '''
    Ensemble that labels a sample by majority vote of its member classifiers.

    Every member must expose classify(features); the label returned by the most
    members wins.
    '''

    def __init__(self, *classifiers):
        '''
        Params:
            classifiers: the already-trained classifiers that take part in the vote
        '''
        self._classifiers = classifiers

    def _collect_votes(self, features):
        '''Return the label predicted by each member classifier, in member order.'''
        return [classifier.classify(features) for classifier in self._classifiers]

    # classify and evaluate_confidence must be defined at class level: nested inside
    # __init__ they would only be local functions and never become methods.
    def classify(self, features):
        '''
        Params:
            features: the feature dictionary of a single sample
        Return:
            the label chosen by the most member classifiers
        '''
        # On a tie, statistics.mode returns the first label encountered
        # (Python 3.8+; earlier versions raise StatisticsError).
        return mode(self._collect_votes(features))

    def evaluate_confidence(self, features):
        '''
        Params:
            features: the feature dictionary of a single sample
        Return:
            confidence: the fraction of member classifiers that voted for the
                winning label
        '''
        votes = self._collect_votes(features)

        choice_votes = votes.count(mode(votes))
        confidence = choice_votes / len(votes)
        return confidence




NameError: name 'ClassifierI' is not defined

In [None]:
# Combine every trained model into one ensemble and score it on the held-out set.
_ensemble_members = (
    naive_bayes_classifier,
    MNB_classifier,
    BNB_classifier,
    LogisticRegression_classifier,
    SGDClassifier_classifier,
    SVC_classifier,
    LinearSVC_classifier,
    NuSVC_classifier,
)
voting_classifier = VotingClassifier(*_ensemble_members)
print("voted_classifier accuracy percent:", (nltk.classify.accuracy(voting_classifier, testing_set))*100)
