In [31]:
!pip install wandb -qqq
!apt install tree

Reading package lists... Done
Building dependency tree       
Reading state information... Done
tree is already the newest version (1.7.0-5).
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 20 not upgraded.


In [32]:
# !wandb login --relogin

In [140]:
import wandb
wandb.login()

True

In [141]:
import random
import torch
import torchvision
from torch.utils.data import TensorDataset
from tqdm.auto import tqdm
import pandas as pd
import numpy as np
from torch.autograd import Variable
from sklearn.preprocessing import MinMaxScaler
import torch
import torch.nn as nn
from sklearn.preprocessing import StandardScaler
from math import sqrt
import math, time
from sklearn.metrics import mean_squared_error,mean_absolute_percentage_error
from torch.utils.data import DataLoader

from itertools import cycle
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

In [142]:
#this block connects Google Drive for Google Colab runs; the saved model and the
#helper modules used below are read from the mounted drive
#on a local machine google.colab is not installed, so the mount is skipped
#automatically instead of stopping "Run All" with an ImportError

try:
    from google.colab import drive
except ImportError:
    print("google.colab is not available - skipping the Google Drive mount")
else:
    drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [143]:
#add the directory that holds the project's helper modules to the import path
#NOTE(review): presumably this folder contains earlyStopping.py, LSTM_model.py and
#sequenceDataset.py, which are imported further down - adjust the path to your own setup

import sys
sys.path.append('/content/gdrive/MyDrive/Colab Notebooks/DataCraft_notebook/Code files/One py for each application/')

## FUNCTIONS

In [208]:
#import the custom early stopping, LSTM model and sequence dataset classes from their .py files

from earlyStopping import EarlyStopping
from LSTM_model import LSTM
from sequenceDataset import SequenceDataset

In [209]:
def split(df, SeqLen): #splitdataset
  """Split a time-ordered dataset into train, validation and test parts.

  The first ~80% of the rows form the training set, the next ~10% the
  validation set and the last ~10% the test set. The validation and test
  parts additionally start ``SeqLen`` rows earlier, so they overlap the
  preceding part by ``SeqLen`` rows.

  Args:
    df: object that supports ``len()`` and positional row slicing
      (e.g. a pandas DataFrame or a list).
    SeqLen: number of rows by which the validation and test parts are
      extended backwards.

  Returns:
    Tuple ``(train, val, test)`` of slices of ``df``.
  """
  n_rows = len(df)
  train_end = int(n_rows * 0.8)
  valid_end = int(n_rows * 0.9)

  # A negative start index would be read as "count from the end" and silently
  # return the wrong rows (or none), so the look-back is clamped at row 0.
  valid_start = max(train_end - SeqLen, 0)
  test_start = max(valid_end - SeqLen, 0)

  train = df[:train_end]  #~%80
  val = df[valid_start:valid_end]  # ~%10 plus the SeqLen look-back rows
  test = df[test_start:]  # ~%10 plus the SeqLen look-back rows
  return train, val, test

In [210]:
#create function to get prediction of the test, validation dataset

def predictions(scaled,model,loader,scaler):
  """Run the model over a loader and return actuals and predictions in original units.

  Args:
    scaled: 2-D numpy array of the scaled dataset the loader was built from;
      column 1 is paired with the targets/predictions so that the two-column
      scaler can be inverted. It must have as many rows as the loader yields
      targets, otherwise the concatenation below fails.
    model: torch module called as ``model(X)`` on every batch.
    loader: iterable of ``(X, Y)`` tensor batches.
    scaler: object with an ``inverse_transform`` method accepting an
      ``(n, 2)`` array.

  Returns:
    Tuple ``(price_open, pred_open, last_seq)``: the inverse-transformed
    ``[target, column 1]`` array, the inverse-transformed
    ``[prediction, column 1]`` array, and the last input batch ``X``.

  Raises:
    ValueError: if the loader yields no batches.
  """
  # Work on whatever device the model lives on instead of hard-coding CUDA,
  # so the function also runs on CPU-only machines.
  try:
    device = next(model.parameters()).device
  except StopIteration:  # model without parameters
    device = torch.device("cpu")

  targets = []
  outputs = []
  last_seq = None

  model.eval()
  with torch.no_grad():
      for X, Y in loader:
          X = X.to(device)
          Y = Y.to(device)
          targets.append(torch.reshape(Y, (-1,)))
          outputs.append(torch.reshape(model(X), (-1,)))
          last_seq = X #keep the last sequence for following the prediction process

  if last_seq is None:
    raise ValueError("predictions() received a loader that yields no batches")

  #reshape, concatenate, and apply inverse scaler transform to get predictions
  scaled_open = np.reshape(scaled[:,1], (-1,1))
  y_pred = np.reshape(torch.cat(outputs, 0).cpu().detach().numpy(), (-1,1))
  y_true = np.reshape(torch.cat(targets, 0).cpu().detach().numpy(), (-1,1))

  pred_open = scaler.inverse_transform(np.concatenate((y_pred, scaled_open), axis=1))
  price_open = scaler.inverse_transform(np.concatenate((y_true, scaled_open), axis=1))

  return (price_open,pred_open,last_seq)

## DATASET

Select which dataset this notebook should process. Three datasets are available, one per company: the stock prices of Apple, Samsung, and Xiaomi.

Each company also has its own thresholds (an upper and a lower bound) for classifying a change in the stock price as high or low. The changes in the predicted stock prices are computed from the model's predictions, and at the end of the notebook F1 scores are calculated to evaluate how well the predicted high and low changes match the actual ones with respect to these bounds.

In [211]:
#choose the company to process: 'Apple', 'Samsung' or 'Xiaomi'
company = "Samsung"

In [212]:
#download the selected company's dataset from Weights & Biases
with wandb.init(project="DSS") as r:

    # per company: artifact to download and the (upper, lower) change thresholds
    company_settings = {
        'Apple': ('metu_datacraft/DSS/Apple:v2', 2, -2),
        'Samsung': ('metu_datacraft/DSS/Samsung:v2', 1, -1),
        'Xiaomi': ('metu_datacraft/DSS/Xiaomi:v2', 2, -3),
    }
    if company not in company_settings:
        raise Exception("Sorry, please enter the company name correctly")

    artifact_name, upper_limit, lower_limit = company_settings[company]

    # declare which artifact this run uses, then set the thresholds
    artifact = r.use_artifact(artifact_name, type='Data')
    upper_bound = upper_limit
    lower_bound = lower_limit

    # copy the needed table columns into a DataFrame, keeping this column order
    table = artifact.get('weekday_data')
    dataset = {
        column: table.get_column(column)
        for column in ("Date", "Price", "Open", "High", "Low", "Volume", "Change")
    }
    data = pd.DataFrame(dataset)

[34m[1mwandb[0m:   1 of 1 files downloaded.  


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

Only the Apple dataset is filtered, because this was needed to obtain high prediction performance for that company. No filtering is applied to the Samsung and Xiaomi datasets.

In [213]:
#Apple only: keep the stock prices dated after 2020-01-01, which gave a more powerful model
#the other companies keep their full history
if company == 'Apple':
  data['Date'] = pd.to_datetime(data['Date'])
  #filter and renumber the rows from 0 in one step, without keeping the old index
  data = data[data['Date'] > '2020-01-01'].reset_index(drop=True)

## GET PREDICTIONS

In this section, predictions are obtained from the pretrained, saved model and visualized together with the actual values.

For each company, the following hyperparameters were obtained from the hyperparameter tuning studies:
* Number of hidden layers
* Number of nodes
* Sequence length

These parameters are used to rebuild the architecture of the saved model for each company.

*Note: A fixed batch size was used for the hyperparameter tuning, and the model was saved with that same batch size.*

In [214]:
#run on the GPU when CUDA is available, otherwise fall back to the CPU
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [215]:
#hyperparameters found for each company in the hyperparameter tuning;
#they must match the architecture of the saved model that is loaded afterwards

if company == 'Apple':
  hidden_size=8
  num_layers=1
  SeqLen = 79
elif company == 'Samsung':
  hidden_size=32
  num_layers=1
  SeqLen = 82
elif company == 'Xiaomi':
  hidden_size=32
  num_layers=3
  SeqLen = 20
else:
  #fail fast: otherwise the values would be undefined, or silently left over from an earlier run
  raise ValueError("Unknown company {!r}: expected 'Apple', 'Samsung' or 'Xiaomi'".format(company))

input_dim = 1   #one input feature per time step
output_dim = 1  #one predicted value per sequence
batch_size=1

In [216]:
#rebuild the network with the tuned hyperparameters and load the saved weights
model = LSTM(input_dim=input_dim, hidden_dim=hidden_size, num_layers=num_layers,output_dim=output_dim).to(DEVICE)

weights_path = "/content/gdrive/My Drive/Colab Notebooks/DataCraft_notebook/Code files/One py for each application/model_{company}0.pth".format(company=company)

#map_location remaps the stored tensors to DEVICE, so a checkpoint saved on a GPU
#can also be loaded on a CPU-only machine
model.load_state_dict(torch.load(weights_path, map_location=DEVICE))

#switch to inference mode; as the last expression this also displays the architecture
model.eval()

LSTM(
  (lstm): LSTM(1, 32, batch_first=True)
  (fc): Linear(in_features=32, out_features=1, bias=True)
)

The datasets are split and sequenced in the same way as in the training process. Only the validation and test datasets are used to visualize the predicted and actual stock prices.

In [217]:
#make sure the dates are real timestamps
data['Date'] = pd.to_datetime(data['Date'])

#split the two modelled columns into train / validation / test parts
model_columns = ['Price', 'Open']
train, valid, test = split(data[model_columns], SeqLen)

#fit the scaler on the training part only, then apply it to the other parts
scaler = StandardScaler()
train_scaled = scaler.fit_transform(train)
valid_scaled = scaler.transform(valid)
test_scaled = scaler.transform(test)

#wrap the scaled arrays in DataFrames with the original column names
train_data, valid_data, test_data = (
    pd.DataFrame(scaled_part, columns=['Price', 'Open'])
    for scaled_part in (train_scaled, valid_scaled, test_scaled)
)

In [218]:
#turn each scaled DataFrame into a dataset of sequences
def _to_sequences(frame):
    """Wrap a scaled frame in a SequenceDataset with Open as feature and Price as target."""
    return SequenceDataset(
        frame,
        target="Price",
        features=["Open"],
        sequence_length=SeqLen
    )

train_dataset = _to_sequences(train_data)
test_dataset = _to_sequences(test_data)
valid_dataset = _to_sequences(valid_data)

#loaders for the test and validation datasets; shuffle stays off to keep the time order
testloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
validloader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)

In [219]:
#get validation predictions from the saved model; predictions() returns the inverse-transformed
#actual values, the inverse-transformed predictions and the last input batch of the loader
price_open_valid,pred_open_valid,last_seq_valid = predictions(valid_scaled,model,validloader,scaler)

#get test predictions from the saved model; last_seq_test later seeds the forecast of the future days
price_open_test,pred_open_test,last_seq_test = predictions(test_scaled,model,testloader,scaler)

In [220]:
#wrap the actual values and the predictions of both sets in labelled DataFrames
valid_price_open_ds = pd.DataFrame(price_open_valid, columns=['Price_Actual', 'Open'])
valid_pred_open_ds = pd.DataFrame(pred_open_valid, columns=['Price_Pred', 'Open'])
test_price_open_ds = pd.DataFrame(price_open_test, columns=['Price_Actual', 'Open'])
test_pred_open_ds = pd.DataFrame(pred_open_test, columns=['Price_Pred', 'Open'])

#actual values next to the predictions, first for the validation set, then for the test set
#(both frames of a pair share the same default index, so no re-alignment is needed)
valid_ds = pd.concat([valid_price_open_ds, valid_pred_open_ds], axis=1)
test_ds = pd.concat([test_price_open_ds, test_pred_open_ds], axis=1)

#stack validation and test rows; the first SeqLen test rows overlap the end of the
#validation set, so they are dropped to avoid listing the same days twice
valid_test_ds = pd.concat([valid_ds, test_ds.iloc[SeqLen:]], ignore_index=True)

#take the matching tail of the original dataset (same number of rows)
first_matching_row = len(data) - len(valid_test_ds)
data_to_match = data.iloc[first_matching_row:].reset_index(drop=True)

#original columns next to the predictions, aligned on the rows of valid_test_ds
result = pd.concat([valid_test_ds, data_to_match], axis=1).reindex(valid_test_ds.index)

In [221]:
#plot the predicted and the actual stock prices of the validation and test period
names = cycle(['Price_Pred', 'Price_Actual'])

fig = px.line(
    result,
    x=result.Date,
    y=[result['Price_Pred'], result['Price_Actual']],
    labels={'Date': 'Date', 'value': 'Stock Price'},
    color_discrete_sequence=["#A2AAAD", "steelblue"],
)

#give the traces readable legend names, in the order they were plotted
fig.for_each_trace(lambda trace: trace.update(name=next(names)))

fig.update_layout(
    width=1300,
    height=550,
    title_text='Predictions & Actuals of {company}'.format(company=company),
    font_size=15,
    font_color='black',
    legend_title_text='Stock Parameters',
    plot_bgcolor="white",
)

fig.show()

Future predictions are calculated for the next 7 days only. The forecast starts from the last sequence of the test dataset, and each predicted value is appended to the input sequence to predict the following day.

In [222]:
#predict the next pred_day days by feeding every prediction back into the model
pred_day = 7

#start from the last input sequence of the test dataset
new = last_seq_test

#scaled predictions, one tensor per forecast day
future_steps = []

with torch.no_grad():
    for d in range(pred_day):
        X = new

        predict = torch.reshape(model(X), (-1,))
        future_steps.append(predict)

        #slide the window one step: drop the oldest value and append the new prediction
        #the tensors stay on the device of the input sequence, so this works on GPU and
        #CPU alike (no hard-coded .cuda() and no round trip through Python lists)
        window = torch.cat((torch.reshape(X, (-1,))[1:], predict), 0)
        new = torch.reshape(window, (1, -1, 1))  #(batch, sequence, feature) with batch = feature = 1

y_keep_pred = torch.cat(future_steps, 0)

#inverse transform the predictions
#the scaler was fitted on two columns, so a second column is needed as filler; the last
#pred_day scaled Open values are used, and only column 0 of the result is read afterwards
test_scaled_open = np.reshape(test_scaled[:,1], (-1,1))
y_keep_pred = np.reshape(y_keep_pred.cpu().detach().numpy(), (-1,1))
pred_ = np.concatenate((y_keep_pred, test_scaled_open[-pred_day:]), axis=1)
pred_ = scaler.inverse_transform(pred_) #predictions

In [223]:
#create empty table which includes the future dates
#the future predictions will be put on it
def create_date_table(start='2022-11-01', end='2022-11-07'): #create dataset only for 7 days
    """Return a DataFrame with a single "Date" column holding one row per day from start to end (inclusive)."""
    days = pd.date_range(start, end)
    return pd.DataFrame({"Date": days})

#pair the future dates with the forecast values
#NOTE: create_date_table() defaults to 7 fixed dates, so pred_ must hold exactly 7 rows
data_date = create_date_table().assign(Price_Pred=pred_[:, 0])

#append the forecast to the validation/test predictions
result_df = pd.concat(
    [result[['Date', 'Price_Pred']], data_date],
    ignore_index=True,
)

In [224]:
#keep prediction values for the dashboard (uncomment to export them to Excel)
#result_df.to_excel('/content/gdrive/My Drive/Colab Notebooks/DataCraft_notebook/{company} all predictions of valid&test+7.xlsx'.format(company=company),index=False)

In [225]:
#plot the predictions of the validation and test period followed by the next 7 days
names = cycle(['Price_Pred'])

fig = px.line(
    result_df,
    x=result_df.Date,
    y=[result_df['Price_Pred']],
    labels={'Date': 'Date', 'value': 'Stock Price'},
    color_discrete_sequence=["pink"],
)

#give the single trace a readable legend name
fig.for_each_trace(lambda trace: trace.update(name=next(names)))

fig.update_layout(
    width=1300,
    height=550,
    title_text='Predictions of {company}'.format(company=company),
    font_size=15,
    font_color='black',
    legend_title_text='Stock Parameters',
    plot_bgcolor="white",
)

#fig.update_yaxes(gridcolor='gray', griddash='dot',minor_griddash="dot")

fig.show()

In [226]:
#create a new column with the day-over-day change of the predicted stock price, in percent
#vectorised with shift() instead of a row-by-row .loc loop: faster, and the column is
#float from the start instead of an int column that floats are written into
previous_pred = result['Price_Pred'].shift(1)
change_pred = (result['Price_Pred'] - previous_pred) / previous_pred * 100

#the first row has no previous day, so its change is defined as 0
change_pred.iloc[:1] = 0.0

result['Change_Pred'] = change_pred

Below, the model's ability to correctly detect the direction (positive or negative) of the changes in the stock price is measured with the F1 score.

In [227]:
import numpy as np
from sklearn.metrics import f1_score

#label every day: 1 when the change is positive, 0 otherwise
result["Change_Ac_PN"] = (result["Change"] > 0).astype(int)
result["Change_Pr_PN"] = (result["Change_Pred"] > 0).astype(int)

#column vectors of the actual and the predicted labels
actual_ch = result[["Change_Ac_PN"]].to_numpy()
predict_ch = result[["Change_Pr_PN"]].to_numpy()

f1_s = f1_score(actual_ch, predict_ch)

print(f"F1 score to detect negative and positive change values of predicted stock prices correctly: {f1_s*100: .2f}%")

F1 score to detect negative and positive change values of predicted stock prices correctly:  56.19%


Below, the model's ability to correctly detect high and low changes in the stock price is measured with the F1 score, using the upper and lower bounds determined for each company.

In [228]:
import numpy as np
from sklearn.metrics import f1_score

#label every day: 1 when the change lies outside the [lower_bound, upper_bound] band
#(a high or a low change), 0 otherwise; high and low changes share the same label
actual_outside = (result["Change"] > upper_bound) | (result["Change"] < lower_bound)
predicted_outside = (result["Change_Pred"] > upper_bound) | (result["Change_Pred"] < lower_bound)

result["Change_Ac_PN"] = actual_outside.astype(int)
result["Change_Pr_PN"] = predicted_outside.astype(int)

#column vectors of the actual and the predicted labels
actual_ch = result[["Change_Ac_PN"]].to_numpy()
predict_ch = result[["Change_Pr_PN"]].to_numpy()

f1_s = f1_score(actual_ch, predict_ch)

print(f"F1 score to detect high and low change values of predicted stock prices correctly with respect to the determined upper and lower bound: {f1_s*100: .2f}%")

F1 score to detect high and low change values of predicted stock prices correctly with respect to the determined upper and lower bound:  35.52%
