In [1]:
import torch
from torch import nn
import torchvision
from torch.utils.data import Dataset, DataLoader, random_split
from torchvision import datasets,transforms
from torchvision.transforms import ToTensor,InterpolationMode
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
import random
import requests
import pandas as pd
from pathlib import Path
import numpy as np
import os
import zipfile
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score, balanced_accuracy_score, classification_report
from tqdm.auto import tqdm
from collections import Counter

In [2]:
# set device by checking if gpu is available
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

cuda


In [3]:
# set seed for reproducibilty
SEED = 50

random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
np.random.seed(SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [4]:
# # mount google drive
# from google.colab import drive
# drive.mount('/content/drive')

# # root directory where the food-11 zip file is located
# root_dir = 'drive/MyDrive/'

**DATA PREPROCESSING AND VISUALIZATION**

In [5]:
# extract the food-11 zip file
with zipfile.ZipFile('google_stock_price.zip', 'r') as zip_ref:
    zip_ref.extractall('google_stock_price/')

In [6]:
df = pd.read_csv('google_stock_price/Google_Stock_Price_Train.csv')
df.head(10)

Unnamed: 0,Date,Open,High,Low,Close,Volume
0,1/3/2012,325.25,332.83,324.97,663.59,7380500
1,1/4/2012,331.27,333.87,329.08,666.45,5749400
2,1/5/2012,329.83,330.75,326.89,657.21,6590300
3,1/6/2012,328.34,328.77,323.68,648.24,5405900
4,1/9/2012,322.04,322.29,309.46,620.76,11688800
5,1/10/2012,313.7,315.72,307.3,621.43,8824000
6,1/11/2012,310.59,313.52,309.4,624.25,4817800
7,1/12/2012,314.43,315.26,312.08,627.92,3764400
8,1/13/2012,311.96,312.3,309.37,623.28,4631800
9,1/17/2012,314.81,314.81,311.67,626.86,3832800


In [7]:
class StockDataset(Dataset):
    def __init__(self, data, N):
        data['Volume'] = data['Volume'].str.replace(',', '')

        self.data = data[['Open', 'High', 'Low', 'Close', 'Volume']].apply(pd.to_numeric, errors='coerce')
        self.data.fillna(0, inplace=True)

        self.N = N

    def __len__(self):
        return len(self.data) - self.N

    def __getitem__(self, idx):
        # Get the sequence of features for the last M days
        x = self.data.iloc[idx:idx + self.N, :].values  # Exclude the last column 'Close'
        # Get the closing price of the next day as the target
        y = self.data.iloc[idx + self.N]['Close']

        # Convert to PyTorch tensors
        x = torch.tensor(x, dtype=torch.float32)
        y = torch.tensor(y, dtype=torch.float32)

        return x, y

In [8]:
# Define your parameters
N = 30  # Number of days to look back
batch_size = 32  # Batch size for DataLoader

# Instantiate the dataset
train_data = StockDataset(df, N)

In [9]:
# Create the DataLoader
train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True)

In [10]:
x,y = next(iter(train_dataloader))

print(x.shape,y.shape)

torch.Size([32, 30, 5]) torch.Size([32])


**TRAINING**

In [11]:
class Baseline(nn.Module):
    def __init__(self):

        super().__init__()

        self.rnn = nn.RNN(5,16,batch_first=True)

        self.fc = nn.Linear(16, 1)

    def forward(self, x):

        # embedded = [sent len, batch size, emb dim]

        output, hidden = self.rnn(x)

        hidden = hidden.squeeze()

        out = self.fc(hidden)

        return out

In [12]:
model = Baseline()

In [13]:
optimizer = torch.optim.Adam(model.parameters())

criterion = nn.MSELoss()

In [14]:
num_epochs = 50  # Number of epochs to train
model.train()    # Set the model to training mode

for epoch in range(num_epochs):
    epoch_loss = 0  # Accumulate loss for each epoch

    for x_batch, y_batch in train_dataloader:
        optimizer.zero_grad()  # Zero the gradients

        # Forward pass
        predictions = model(x_batch)
        predictions = predictions.squeeze()  # Ensure predictions have the same shape as y_batch

        # Compute the loss
        loss = criterion(predictions, y_batch)

        # Backward pass and optimization step
        loss.backward()
        optimizer.step()

        # Accumulate loss for reporting
        epoch_loss += loss.item()

    # Print the average loss for each epoch
    avg_loss = epoch_loss / len(train_dataloader)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")


Epoch 1/50, Loss: 429967.6042
Epoch 2/50, Loss: 427456.6378
Epoch 3/50, Loss: 426077.4071
Epoch 4/50, Loss: 426220.6843
Epoch 5/50, Loss: 426368.9239
Epoch 6/50, Loss: 423470.0232
Epoch 7/50, Loss: 423969.4880
Epoch 8/50, Loss: 424377.6346
Epoch 9/50, Loss: 422053.1154
Epoch 10/50, Loss: 419398.0529
Epoch 11/50, Loss: 419909.0497
Epoch 12/50, Loss: 419548.4952
Epoch 13/50, Loss: 418758.0545
Epoch 14/50, Loss: 418419.8934
Epoch 15/50, Loss: 416879.4079
Epoch 16/50, Loss: 415961.3550
Epoch 17/50, Loss: 414401.4479
Epoch 18/50, Loss: 414987.1795
Epoch 19/50, Loss: 414599.3237
Epoch 20/50, Loss: 412914.9447
Epoch 21/50, Loss: 412933.8814
Epoch 22/50, Loss: 413266.5793
Epoch 23/50, Loss: 408850.2845
Epoch 24/50, Loss: 409559.1595
Epoch 25/50, Loss: 411028.1418
Epoch 26/50, Loss: 407844.9960
Epoch 27/50, Loss: 411126.7019
Epoch 28/50, Loss: 406195.9295
Epoch 29/50, Loss: 405605.5144
Epoch 30/50, Loss: 406545.2380
Epoch 31/50, Loss: 403587.0617
Epoch 32/50, Loss: 405275.7500
Epoch 33/50, Loss