### ISA 414 Final Project - Stock Predictions through Unsupervised Sentiment Analysis
##### Ethan Buege, Jorge Nadjar, Mac Magyaros

Libraries being used:
- Pandas
- Numpy
- PSAW (Python Pushshift.io API Wrapper)
- PRAW (Python Reddit API Wrapper)
- PyTorch
- sklearn

In [409]:
# standard python libraries
import requests
from requests import auth
import time
from time import sleep
import datetime as dt
from datetime import timezone
import collections
import random
import logging

# data handling
import pandas as pd
import numpy as np
import re
import pickle
import pymongo

# reddit API
import praw
from psaw import PushshiftAPI
from praw.models import MoreComments

# unsupervised sentiment analysis - doc2vec encoding
import gensim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import minmax_scale

# pytorch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchtext as tt

# visualization
import matplotlib.pyplot as plt

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
use_cuda = torch.cuda.is_available()

top_10_tickers = ['GME', 'AMC', 'TSLA', 'AAPL', 'AMZN', 'AMD', 'NVDA', 'MSFT', 'SPY', 'QQQ']

In [410]:
doc2vec_model = pickle.load(open('doc2vec_model.p', "rb"))

In [411]:
def aggregate_posts_by_day(ticker):
    aggregated_daily_posts = {}
    daily_upvotes = {}
    daily_volume = {}

    posts = pickle.load(open(f'E:/reddit_data/{ticker}_series.p', "rb"))
    # replace UTC timestamp with date for easy aggregation
    for post in posts:
        day = dt.datetime.utcfromtimestamp(post.created_utc).date()
        delta = day.weekday() - 4
        if delta > 0:  # if day is weekend, aggregate to friday
            days_back = dt.timedelta(delta)
            day -= days_back

        if day not in aggregated_daily_posts:
            aggregated_daily_posts[day] = ''
            daily_upvotes[day] = 0
            daily_volume[day] = 0


        if 'deleted by user' not in post.title and len(post.title) > 5:
            aggregated_daily_posts[day] = ' '.join([aggregated_daily_posts[day], post.title])
            daily_upvotes[day] += post.score
            daily_volume[day] += 1
            # add bodytext if it exists and has not been deleted
            # WSB has minimum length requirements so the only posts less than 20 chars have been deleted
            if len(post.selftext) > 20:
                aggregated_daily_posts[day] = ' '.join([aggregated_daily_posts[day], post.selftext])

    return aggregated_daily_posts, daily_upvotes, daily_volume


# daily_posts = aggregate_posts_by_day('TSLA')


for ticker in top_10_tickers:
    daily_posts, daily_upvotes, daily_volume = aggregate_posts_by_day(ticker)
    for day in daily_posts:
        tokens = gensim.utils.simple_preprocess(daily_posts[day])
        daily_posts[day] = doc2vec_model.infer_vector(tokens)

    print(f'aggregated and vectorized {ticker} posts saved to disk')
    # this is terribly inefficient but I needed to get this thing finished
    pickle.dump(daily_posts, open(f'E:/reddit_data/{ticker}_vectorized_posts.p', "wb"))
    pickle.dump(daily_upvotes, open(f'E:/reddit_data/{ticker}_daily_upvotes.p', "wb"))
    pickle.dump(daily_volume, open(f'E:/reddit_data/{ticker}_daily_volume.p', "wb"))


aggregated and vectorized GME posts saved to disk
aggregated and vectorized AMC posts saved to disk
aggregated and vectorized TSLA posts saved to disk
aggregated and vectorized AAPL posts saved to disk
aggregated and vectorized AMZN posts saved to disk
aggregated and vectorized AMD posts saved to disk
aggregated and vectorized NVDA posts saved to disk
aggregated and vectorized MSFT posts saved to disk
aggregated and vectorized SPY posts saved to disk
aggregated and vectorized QQQ posts saved to disk


In [441]:
ticker = 'TSLA'

vectorized_posts = pickle.load(open(f'E:/reddit_data/{ticker}_vectorized_posts.p', "rb"))
daily_upvotes = pickle.load(open(f'E:/reddit_data/{ticker}_daily_upvotes.p', "rb"))
daily_volume = pickle.load(open(f'E:/reddit_data/{ticker}_daily_volume.p', "rb"))
prices = pd.read_csv(f'{ticker}_prices.csv')


### Addition - Jorge

In [442]:
keys_values = vectorized_posts.items()
posts_string = {key.strftime('%Y-%m-%d'): value for key, value in keys_values}
prices['docvec'] = prices['from'].map(posts_string)

keys_values = daily_upvotes.items()
posts_string = {key.strftime('%Y-%m-%d'): value for key, value in keys_values}
prices['upvotes'] = prices['from'].map(posts_string)

keys_values = daily_volume.items()
posts_string = {key.strftime('%Y-%m-%d'): value for key, value in keys_values}
prices['post volume'] = prices['from'].map(posts_string)

prices

Unnamed: 0,from,open,high,low,close,volume,afterHours,preMarket,docvec,upvotes,post volume
0,2017-04-24,61.844,62.110,61.2043,61.606,25417525,61.5460,62.000,"[1.7273818, 2.0063844, -0.099462226, -0.209076...",167.0,1.0
1,2017-04-25,61.600,62.796,61.1720,62.758,33688540,62.7500,61.700,,,
2,2017-04-26,62.474,62.900,61.8000,62.034,23475220,62.0020,62.628,,,
3,2017-04-27,62.338,62.618,61.5000,61.726,17342845,61.8200,61.980,"[-1.8340168, 3.2863724, -1.204684, 1.9058441, ...",43.0,2.0
4,2017-04-28,61.966,62.960,61.6000,62.814,22527390,62.9200,61.958,,,
...,...,...,...,...,...,...,...,...,...,...,...
1255,2022-04-18,989.030,1014.920,973.4100,1004.290,17237387,1013.0200,987.250,,,
1256,2022-04-19,1005.060,1034.940,995.3250,1028.150,16604744,1018.0000,1008.180,,,
1257,2022-04-20,1030.000,1034.000,975.2501,977.200,23534922,1031.1800,1015.240,,,
1258,2022-04-21,1074.730,1092.220,996.4150,1008.780,35136565,1011.3999,1031.610,,,


In [443]:
scaler = MinMaxScaler(feature_range=(-1, 1))

price = prices[['close']]
price['close'] = scaler.fit_transform(price['close'].values.reshape(-1, 1))
upvotes = prices[['upvotes']]
upvotes['upvotes'] = scaler.fit_transform(upvotes['upvotes'].values.reshape(-1, 1))
upvotes = upvotes.fillna(0)
volume = prices[['post volume']]
volume['post volume'] = scaler.fit_transform(volume['post volume'].values.reshape(-1, 1))
volume = volume.fillna(0)
print(price.shape)
print(upvotes.shape)
print(volume.shape)

(1260, 1)
(1260, 1)
(1260, 1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  price['close'] = scaler.fit_transform(price['close'].values.reshape(-1, 1))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  upvotes['upvotes'] = scaler.fit_transform(upvotes['upvotes'].values.reshape(-1, 1))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  volume['post volume'] = scaler.fit_transform(

In [445]:
n = len(price)
price_list = price['close'].tolist()
docscaler = MinMaxScaler(feature_range=(-1, 1))

docvecs = prices['docvec'].to_numpy()
for i in range(n):
    if not isinstance(docvecs[i], (list,pd.Series,np.ndarray)):
        docvecs[i] = np.zeros(125)
    else:
        # this method of normalization likely loses some some data contained in doc embeddings since it is refit for
        # every sample but I didn't have enough time left figure out a more thorough solution
        docvecs[i] = docscaler.fit_transform(docvecs[i].reshape(-1, 1)).reshape(125,)

# docvecs = docvecs.reshape(1260, 300)
docvecs = docvecs.tolist()
docvecs = np.array(docvecs)
print(docvecs.shape)
print(docvecs[0])
# print(docvecs[1])



(1260, 125)
[ 8.61668468e-01  1.00000012e+00 -4.40952964e-02 -9.84426737e-02
  6.15872979e-01  1.91651747e-01 -9.96173620e-02 -3.80461156e-01
 -1.00000000e+00 -4.48948443e-01  5.06017208e-01  2.35546067e-01
  1.11667261e-01  5.55926681e-01  6.25601351e-01  1.70470059e-01
 -8.32583964e-01  2.65944541e-01  1.02440618e-01  2.44176686e-01
 -2.51006693e-01  4.90468293e-02 -3.71593028e-01 -3.84453803e-01
  7.51768351e-01  4.09332812e-01  2.29429919e-04  6.52647197e-01
  5.13027841e-03  7.34517816e-04 -4.40690875e-01 -4.76047248e-01
 -9.19867977e-02  5.82740664e-01 -6.19729638e-01  1.54287577e-01
 -2.92563379e-01  9.25148278e-02 -8.39719027e-02  4.03524965e-01
  1.22637525e-01  3.09875458e-01  1.81610510e-02  2.61289746e-01
  1.35278940e-01  3.30417573e-01 -3.04801762e-01  1.28914893e-01
  6.11102760e-01 -8.70962664e-02  2.80238628e-01 -4.47457701e-01
 -7.01964080e-01 -3.16663474e-01  1.25908270e-01  5.36438882e-01
  9.74198580e-01  1.44917414e-01  2.15309352e-01 -4.52265799e-01
 -3.29403847e

In [446]:
data = np.concatenate((price, upvotes, volume, docvecs), axis=1)
print(data.shape)
print(data[0])


(1260, 128)
[-9.56768019e-01 -9.96803675e-01 -9.91735537e-01  8.61668468e-01
  1.00000012e+00 -4.40952964e-02 -9.84426737e-02  6.15872979e-01
  1.91651747e-01 -9.96173620e-02 -3.80461156e-01 -1.00000000e+00
 -4.48948443e-01  5.06017208e-01  2.35546067e-01  1.11667261e-01
  5.55926681e-01  6.25601351e-01  1.70470059e-01 -8.32583964e-01
  2.65944541e-01  1.02440618e-01  2.44176686e-01 -2.51006693e-01
  4.90468293e-02 -3.71593028e-01 -3.84453803e-01  7.51768351e-01
  4.09332812e-01  2.29429919e-04  6.52647197e-01  5.13027841e-03
  7.34517816e-04 -4.40690875e-01 -4.76047248e-01 -9.19867977e-02
  5.82740664e-01 -6.19729638e-01  1.54287577e-01 -2.92563379e-01
  9.25148278e-02 -8.39719027e-02  4.03524965e-01  1.22637525e-01
  3.09875458e-01  1.81610510e-02  2.61289746e-01  1.35278940e-01
  3.30417573e-01 -3.04801762e-01  1.28914893e-01  6.11102760e-01
 -8.70962664e-02  2.80238628e-01 -4.47457701e-01 -7.01964080e-01
 -3.16663474e-01  1.25908270e-01  5.36438882e-01  9.74198580e-01
  1.44917414e

Slice data and construct training/test sets


In [447]:
def split_data(stock_data, lookback):
    data_raw = np.array(stock_data)  # convert to numpy array
    data = []

    # create all possible sequences of length seq_len
    for index in range(len(data_raw) - lookback):
        data.append(data_raw[index: index + lookback])

    data = np.array(data)
    test_set_size = int(np.round(0.2 * data.shape[0]))
    train_set_size = data.shape[0] - (test_set_size)

    x_train = data[:train_set_size, :-1, :]
    y_train = data[:train_set_size, -1, :1]

    x_test = data[train_set_size:, :-1]
    y_test = data[train_set_size:, -1, :1]

    return [x_train, y_train, x_test, y_test]


lookback = 20  # lookback sequence length
x_train, y_train, x_test, y_test = split_data(data, lookback)

x_train = torch.from_numpy(x_train).type(torch.Tensor)
x_test = torch.from_numpy(x_test[:-1]).type(torch.Tensor)
y_train_lstm = torch.from_numpy(y_train).type(torch.Tensor)
y_test_lstm = torch.from_numpy(y_test).type(torch.Tensor)

print('y_train.shape = ',y_train_lstm.shape)
print('y_test.shape = ',y_test_lstm.shape)

# y_train_gru = torch.from_numpy(y_train).type(torch.Tensor)
# y_test_gru = torch.from_numpy(y_test).type(torch.Tensor)

y_train.shape =  torch.Size([992, 1])
y_test.shape =  torch.Size([248, 1])


Create categorical y-vectors where y=0 if stock price went down on the following day, and y=1 if it went up

In [448]:
y_train_categorical = np.array([int(y_train[i+1] > y_train[i]) for i in range(len(y_train_lstm) - 1)])
y_train_categorical = np.append(y_train_categorical, int(y_test[0] > y_train[-1]))
print(len(y_train_categorical))
print(y_train_categorical)

y_test_categorical = np.array([int(y_test[i+1] > y_test[i]) for i in range(len(y_test) - 1)])
print(len(y_test_categorical))
print(y_test_categorical)

y_train_categorical = torch.from_numpy(y_train_categorical).type(torch.Tensor).unsqueeze(-1)
y_test_categorical = torch.from_numpy(y_test_categorical).type(torch.Tensor).unsqueeze(-1)

print('x_train.shape = ',x_train.shape)
print('y_train.shape = ',y_train_categorical.shape)
print('x_test.shape = ',x_test.shape)
print('y_test.shape = ',y_test_categorical.shape)

992
[0 0 1 1 1 1 1 0 0 1 1 1 1 0 1 1 1 0 0 0 1 1 1 1 0 0 1 0 1 0 0 0 1 1 1 1 0
 1 0 1 0 1 0 1 0 1 0 1 0 0 1 1 1 0 1 0 0 1 1 0 1 0 0 0 1 1 1 0 0 1 1 1 0 0
 0 1 0 1 0 1 1 1 1 0 0 0 0 0 1 0 0 1 1 1 1 1 1 0 1 0 1 0 0 1 1 0 0 0 1 0 1
 0 0 1 0 0 1 0 1 0 0 0 1 0 1 1 1 0 1 0 1 1 1 0 1 0 0 0 1 0 1 1 1 0 0 1 0 0
 0 1 0 0 0 1 0 1 0 0 1 1 0 1 1 0 1 1 0 1 1 1 0 0 1 1 0 1 0 0 0 1 1 0 0 1 1
 0 1 1 0 0 1 1 1 0 0 0 1 0 0 1 0 0 1 0 0 0 0 0 0 1 0 0 1 0 0 1 0 1 1 1 0 0
 1 0 0 1 0 0 1 1 0 0 1 0 1 1 0 1 1 0 1 1 0 1 0 0 0 0 1 0 0 1 0 1 0 1 1 1 0
 1 1 0 1 0 1 1 1 1 1 1 1 0 1 0 0 0 1 1 1 0 0 0 0 0 1 1 0 0 1 0 1 1 0 0 0 0
 1 0 0 0 1 1 1 0 0 1 0 0 1 1 0 0 0 0 1 1 0 0 1 0 0 0 0 0 0 0 1 0 1 0 1 0 1
 0 0 1 0 1 1 1 1 0 0 1 0 0 0 0 0 1 0 0 1 1 1 0 0 0 1 1 0 1 1 1 0 1 1 1 0 0
 1 1 0 0 1 1 1 1 0 0 0 0 1 0 1 0 1 1 1 1 0 1 1 0 1 0 0 0 0 0 1 0 1 0 1 0 0
 0 1 1 1 1 1 1 0 1 1 1 0 0 0 1 1 0 1 1 0 1 1 1 0 0 0 1 0 0 0 1 0 0 0 1 1 0
 1 1 0 0 0 0 1 1 1 0 1 1 0 0 0 1 1 0 0 1 1 1 1 1 0 1 0 1 0 0 1 0 0 0 1 0 1
 0 1 0 0 0 1 0 0 1 1 

Define model architecture

In [453]:
class LSTM(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_layers, output_dim):
        super(LSTM, self).__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_dim).requires_grad_()
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_dim).requires_grad_()
        out, (hn, cn) = self.lstm(x, (h0.detach(), c0.detach()))
        out = self.fc(out[:, -1, :])
        out = torch.sigmoid(out)
        return out

In [454]:
input_dim = 128
hidden_dim = 64
num_layers = 2
output_dim = 1
num_epochs = 125

model = LSTM(input_dim=input_dim, hidden_dim=hidden_dim, output_dim=output_dim, num_layers=num_layers)
# criterion = torch.nn.MSELoss(reduction='mean')
criterion = nn.BCELoss()
optimiser = torch.optim.Adam(model.parameters(), lr=0.001)

Train model over 100 epochs

In [455]:
hist = np.zeros(num_epochs)
start_time = time.time()
lstm = []
for t in range(num_epochs):
    y_train_pred = model(x_train)
    loss = criterion(y_train_pred, y_train_categorical)
    print("Epoch ", t, "BCE: ", loss.item())
    hist[t] = loss.item()
    optimiser.zero_grad()
    loss.backward()
    optimiser.step()

training_time = time.time() - start_time
print("Training time: {}".format(training_time))

Epoch  0 BCE:  0.6947706937789917
Epoch  1 BCE:  0.6935557723045349
Epoch  2 BCE:  0.6925581693649292
Epoch  3 BCE:  0.6917356252670288
Epoch  4 BCE:  0.6910592913627625
Epoch  5 BCE:  0.6905038356781006
Epoch  6 BCE:  0.6900373697280884
Epoch  7 BCE:  0.6896174550056458
Epoch  8 BCE:  0.6891989707946777
Epoch  9 BCE:  0.6887533068656921
Epoch  10 BCE:  0.6882672905921936
Epoch  11 BCE:  0.6877256035804749
Epoch  12 BCE:  0.6871008276939392
Epoch  13 BCE:  0.6863611936569214
Epoch  14 BCE:  0.6854808926582336
Epoch  15 BCE:  0.6844464540481567
Epoch  16 BCE:  0.6832595467567444
Epoch  17 BCE:  0.6819337010383606
Epoch  18 BCE:  0.6804800629615784
Epoch  19 BCE:  0.6788831353187561
Epoch  20 BCE:  0.6770941615104675
Epoch  21 BCE:  0.6750699281692505
Epoch  22 BCE:  0.6728337407112122
Epoch  23 BCE:  0.6704623699188232
Epoch  24 BCE:  0.6680166721343994
Epoch  25 BCE:  0.66551673412323
Epoch  26 BCE:  0.6626513004302979
Epoch  27 BCE:  0.6592583656311035
Epoch  28 BCE:  0.65551179647445

Evaluate training and test accuracy


In [456]:
y_train_pred = model(x_train)
y_test_pred = model(x_test)

test_loss = criterion(y_test_pred, y_test_categorical)
print("Test BCE: ", test_loss.item())

y_train_pred = torch.round(y_train_pred).detach().numpy()
y_test_pred = torch.round(y_test_pred).detach().numpy()

print("training accuracy: %.2f" % (np.mean(y_train_pred == y_train_categorical.numpy())))
print("testing accuracy: %.2f" % (np.mean(y_test_pred == y_test_categorical.numpy())))

Test BCE:  1.374145746231079
training accuracy: 0.96
testing accuracy: 0.55


In [457]:
pickle.dump(model, open(f'{ticker}_RNN.p', "wb"))