In [21]:
#imports
import pandas as pd
import numpy as np
import json
from collections import OrderedDict
from sklearn.model_selection import train_test_split

In [22]:
#assembles our dataframes
def prepData(symbol):
    """Assemble the per-day modelling frame for one ticker symbol.

    Three JSON files are read relative to the current working directory:

    * ``stockPrices/{symbol}_prices_2022.json`` -- mapping of date string to price
    * ``sentimentAnalysis/sentiment_analysis_wallstreetbets_{symbol}_posts_2022.json``
      -- loaded with ``pd.read_json``; one column per day
    * ``topAuthorsPerDay/{symbol}_topAuthorPosts_2022.json`` -- mapping of date
      string to a count

    Parameters
    ----------
    symbol : str
        Ticker symbol substituted into the three file names.

    Returns
    -------
    pandas.DataFrame
        One row per day (labelled by the first 10 characters of the sentiment
        file's column label) holding the sentiment columns, ``price``,
        ``day change`` (price minus the previous entry's price, rounded to two
        decimals), ``top_10_poasters`` (0 when the day is missing from the
        top-authors file) and one-hot columns for ``dayAttitude``. Rows that
        still contain a NaN (e.g. days without a price) are dropped.

    Raises
    ------
    FileNotFoundError
        If one of the files opened with ``open`` does not exist.
    """
    #read files -- context managers guarantee the handles are closed again
    with open("stockPrices/{}_prices_2022.json".format(symbol)) as priceFile: # get rid of stockPrices/ if you dont have it in a folder
        price = json.load(priceFile)
    stockFrame = pd.read_json(r'sentimentAnalysis/sentiment_analysis_wallstreetbets_{}_posts_2022.json'.format(symbol)) # get rid of sentimentAnalysis/ if you just have it in the folder
    with open("topAuthorsPerDay/{}_topAuthorPosts_2022.json".format(symbol)) as authorFile:
        topAuthors = json.load(authorFile)

    #setup change in price from day to day
    # The price file is walked in reverse order; presumably it is stored
    # newest-first so that reversing yields chronological order -- TODO confirm.
    pChange = {}
    prev = None # None instead of -1 so a sentinel can never collide with a real price
    for key, value in reversed(list(price.items())):
        current = float(value)
        if prev is None: # first entry has nothing to compare against -> change of 0
            prev = current
        pChange[key] = round(current - prev, 2) # today - yesterday
        prev = current # update our previous for next run

    #setup price dataframe: index = date string, single column 'price'
    price = pd.DataFrame.from_dict(price, orient='index')
    price = price.rename(columns={0: 'price'})

    #manipulate stock dataframe
    # Column labels may have been parsed into timestamps by read_json; keep only
    # the first 10 characters of their string form so they match the date keys
    # of the other two files. One rename call instead of one per column.
    stockFrame = stockFrame.rename(columns={name: str(name)[0:10] for name in stockFrame.columns})
    stockFrame = stockFrame.transpose() # one row per day is easier to put together
    stockFrame.insert(3, "price", float("nan")) # NaN by default; not every day has a price

    #append them together
    for row in list(stockFrame.index):
        if row in price.index:
            stockFrame.loc[row, 'price'] = float(price.loc[row, 'price'])
            stockFrame.loc[row, 'day change'] = pChange[row]
        if row in topAuthors:
            stockFrame.loc[row, 'top_10_poasters'] = int(topAuthors[row])
        else:
            stockFrame.loc[row, 'top_10_poasters'] = 0

    #final prep
    stockFrame = stockFrame.dropna() # get rid of rows with nan values (days without a price)
    stockFrame = pd.get_dummies(stockFrame, columns=['dayAttitude']) # we need dummies for categorical variables

    return stockFrame

In [23]:
# Tickers to analyse -- update this list to whatever symbols you have mined data for
symbols = ['SPY', 'NVDA', 'MSFT', 'GOOG', 'GME', 'AI', 'AAPL']
# One prepared dataframe per ticker, keyed by symbol
frames = {i: prepData(i) for i in symbols}
frames['SPY'].head() # sanity check that we really got a dataframe back



Unnamed: 0,positive,negative,price,top_10_poasters,day change,dayAttitude_negative,dayAttitude_neutual,dayAttitude_positive
2022-01-03,4,1,477.71,0.0,0.0,0,0,1
2022-01-04,4,0,477.55,0.0,-0.16,0,0,1
2022-01-05,7,6,468.38,0.0,-9.17,0,0,1
2022-01-06,4,4,467.94,0.0,-0.44,0,1,0
2022-01-07,7,3,466.09,0.0,-1.85,0,0,1


In [24]:
# Spot-check a single day to confirm the top-10-posters count was merged in
frames['SPY'].T['2022-01-11']

positive                     4
negative                     3
price                   469.75
top_10_poasters            1.0
day change                4.24
dayAttitude_negative         0
dayAttitude_neutual          0
dayAttitude_positive         1
Name: 2022-01-11, dtype: object

In [25]:
# Build a train/test split per symbol so a simple model can be fitted to each
# one and we can see whether any pattern shows up for any of the groups.
# Each entry is stored as [xtrain, xtest, ytrain, ytest].
traintest = {}
for i in symbols:
    features = frames[i].drop(columns=['day change', 'price']) # predictors only
    target = frames[i]['day change'] # value we try to predict
    xtrain, xtest, ytrain, ytest = train_test_split(features, target, test_size=0.2, random_state=47)
    traintest[i] = [xtrain, xtest, ytrain, ytest]

In [26]:
#so first we are going to do regressions on everything and print the results
from sklearn.base import clone
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso

# Unfitted templates that only carry the hyper-parameters.
LR = LinearRegression()
LA = Lasso(alpha=0.01)

# estimator.fit() returns the estimator itself, so storing LR.fit(...) for each
# symbol would put the *same* object under every key and leave all symbols with
# the model fitted on the last one. Fit an independent clone per symbol instead.
modelsLR = {}
modelsLA = {}
for i in symbols: # create our regression models, one pair per symbol
    modelsLR[i] = clone(LR).fit(traintest[i][0], traintest[i][2])
    modelsLA[i] = clone(LA).fit(traintest[i][0], traintest[i][2])

In [29]:
#now test to see if our models are at all accurate:
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

# Score every symbol's linear and lasso model on its held-out test set and
# write the r2 / mean squared error of both to ML_results.txt.
with open('ML_results.txt', 'w') as f:
    for i in symbols:
        ypredLR = modelsLR[i].predict(traintest[i][1]) # run on our test sets
        ypredLA = modelsLA[i].predict(traintest[i][1])

        LRr2 = r2_score(traintest[i][3], ypredLR)
        LAr2 = r2_score(traintest[i][3], ypredLA)

        # NOTE: despite the "rmse" in the names these hold the plain mean
        # squared error (no square root), matching the labels written below.
        LRrmse = mean_squared_error(traintest[i][3], ypredLR)
        LArmse = mean_squared_error(traintest[i][3], ypredLA) # lasso predictions, not the linear ones

        f.write("FOR STOCK: " + i + '\n')
        f.write("\tLINEAR r2: " + str(LRr2) + '\n')
        f.write("\tLINEAR MEAN SQUARED ERROR: " + str(LRrmse) + '\n')
        f.write("\tLASSO r2: " + str(LAr2) + '\n')
        f.write("\tLASSO MEAN SQUARED ERROR: " + str(LArmse) + '\n')


In [36]:
# Report the train/test set size for every symbol, then the average sizes
tsum = 0 # running total of training rows
tesum = 0 # running total of test rows
for i in symbols:
    trainCount = len(traintest[i][0])
    testCount = len(traintest[i][1])
    print(i)
    print("train: {}".format(trainCount))
    print("test: {}".format(testCount))
    tsum += trainCount
    tesum += testCount

print(tsum / len(symbols))
print(tesum / len(symbols))

SPY
train: 101
test: 26
NVDA
train: 137
test: 35
MSFT
train: 109
test: 28
GOOG
train: 68
test: 18
GME
train: 199
test: 50
AI
train: 151
test: 38
AAPL
train: 149
test: 38
130.57142857142858
130.57142857142858
