# Cuda setup
Check if cuda is available.

In [10]:
import torch
import numpy as np

# check if CUDA is available
train_on_gpu = torch.cuda.is_available()

if not train_on_gpu:
    print('CUDA is not available.  Training on CPU ...')
else:
    print('CUDA is available!  Training on GPU ...')

CUDA is available!  Training on GPU ...


# Import data from source
METHOD 1: install yfinance in the environment hosting python and jupyter. I used Anaconda, and installed through conda terminal into my environment.

*pip install yfinance*

Use the yfinance API to retrieve company data

In [11]:
# yahoo finance api to collect stock data
import yfinance as yf
import os

# datetime imports to work with dates
from datetime import date
from dateutil.relativedelta import relativedelta

# alpaca api to collect stock data
from alpaca_trade_api.rest import REST, TimeFrame, TimeFrameUnit

# Process data from csv files
Use pandas library for processing files, and use matplotlib to display graphs and visualizations.

In [12]:
import pandas as pd
import torch.nn as nn
import matplotlib.pyplot as plt
import torch.optim as optim

from torchvision import datasets
import torchvision.transforms as transforms
from torch.utils.data.sampler import SubsetRandomSampler
%matplotlib inline

import torch.optim.lr_scheduler as lr_scheduler

In [13]:
#Change directory for Lorne's jupyter notebook
# I am mixing windows and wsl on windows so I need to manaually change the directory, so you won't need to when you run it
if True:
    # See files in current directory
    import os

    current_directory = os.getcwd()
    print(current_directory)

    entries = os.listdir('.')
    files = [entry for entry in entries if os.path.isfile(entry)]

    for file_name in files:
        print(file_name)

    %cd "/mnt/c/Users/LPC/Documents/GitHub/CSCI611-NNSTOCKS"

    current_directory = os.getcwd()
    print(current_directory)

    entries = os.listdir('.')
    files = [entry for entry in entries if os.path.isfile(entry)]

    # Print the names of the files
    for file_name in files:
        print(file_name)

/mnt/c/Users/LPC/Documents/GitHub/CSCI611-NNSTOCKS
AAPL_intraday.csv
AAPL_intraday1.csv
AMD_intraday.csv
AMD_intraday1.csv
first_try.ipynb
README.md
RNN.ipynb
/mnt/c/Users/LPC/Documents/GitHub/CSCI611-NNSTOCKS
/mnt/c/Users/LPC/Documents/GitHub/CSCI611-NNSTOCKS
AAPL_intraday.csv
AAPL_intraday1.csv
AMD_intraday.csv
AMD_intraday1.csv
first_try.ipynb
README.md
RNN.ipynb


In [14]:
class StockDataset(torch.utils.data.Dataset[float]):
    def __init__(self, sequences, targets):
        super(StockDataset).__init__()
        self.sequences = sequences
        self.targets = targets
    def __len__(self):
        return len(self.sequences)    
    def __getitem__(self, index):
        sequence = torch.tensor(self.sequences[index], dtype=torch.float32).unsqueeze(-1)#sequence at index 
        target = torch.tensor(self.targets[index], dtype=torch.float32)#test_value at index
        return sequence, target


In [15]:
# Generate stock price data from yahoo finance 
def get_yahoo_stock_data(name, interval="5m", period="7d"):
    data = yf.download(name, interval=interval, period=period)
    return data

def get_alpaca_stock_data(name, interval="15", months="6"):
    name_of_file = name + "_intraday1.csv"

    start_date = date(2021, 6, 1)
    end_date = start_date + relativedelta(months=int(months))  # Adds months

    api = REST('PKJ41QP5QU0TYS4S1BYB', 'o5HVFGx0XWSMoMyeQdRJwG1apYXtuMNcguWpjqqe')

    data = api.get_bars(name, TimeFrame(int(interval), TimeFrameUnit.Minute), start_date, end_date, adjustment='raw').df

    data = data.rename(columns={"close": "Close", "open": "Price", "high": "High", "low": "Low", "volume": "Volume", "datetime": "Datetime", "ticker": "Ticker"})

    data.to_csv(name_of_file)

    return name_of_file

# Display relevant information for formatting purposes

In [None]:
class RNN_initializer:
    #
    def __init__(self, retrieve, name, recomp, nval, ival, pval, batch_size, num_workers, epochs, learning_rate, lr_scheduler_rate, beta1, beta2):
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.epochs = epochs
        self.learning_rate = learning_rate
        self.lr_scheduler_rate = lr_scheduler_rate
        self.beta1 = beta1
        self.beta2 = beta2
        if(retrieve == True):
            if (stock_data_source == "yahoo"):
                self.csv_name = self.retrieve_csv(name, recomp, nval, str(ival[0])+ival[1], str(pval[0])+pval[1])
            else:
                self.csv_name = get_alpaca_stock_data(name, stock_interval, stock_period)
        else:
            self.csv_name = name
        self.df=pd.read_csv(name + "_intraday1.csv")
        #format data, and prepare it for RNN
        if(stock_data_source == "yahoo"):
            price = self.df['Close'].to_list()[2:]
        else:
            price = self.df['Close'].to_list()[2:]
        self.axis_labels = self.df['Price'].to_list()[2:]
        date_format_with_time = "%Y-%m-%d %H:%M:%S"
        self.price_inputs = [float(x) for x in price]
        sequence_length = 6
        #Training sets
        self.train_seq = []
        self.train_tar = []
        #Validation sets
        self.valid_seq = []
        self.valid_tar = []
        #Testing sets
        self.test_seq = []
        self.test_tar = []
        #choose a selected time range
        train_range = len(self.price_inputs)//pval[0] * (pval[0]-1)
        test_range_beg = train_range
        test_range_end = train_range + len(self.price_inputs)//pval[0]
        #generate sequences and targets list for loading data
        for i in range(train_range - sequence_length):
            seq = self.price_inputs[i:i+sequence_length]
            self.train_seq.append(seq)
            temp = self.price_inputs[i+sequence_length]
            self.train_tar.append(temp)
        for j in range(test_range_beg, test_range_end-sequence_length):
            seq = self.price_inputs[j:j+sequence_length]
            self.test_seq.append(seq)
            temp = self.price_inputs[j+sequence_length]
            self.test_tar.append(temp)
        train_data = StockDataset(self.train_seq, self.train_tar)
        test_data = StockDataset(self.test_seq, self.test_tar)
        
        self.train_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, num_workers=num_workers)
        #valid_loader = torch.utils.data.DataLoader(self.test_vals, batch_size=batch_size, 
        #sampler=valid_sampler, num_workers=num_workers)
        self.test_loader = torch.utils.data.DataLoader(test_data, batch_size=batch_size, 
    num_workers=num_workers)
    # retreive_csv is a file that allows the user to extract stock data from yahoo finance.
    # @param: name, name of file
    # @param: recomp, indicates if file needs to be recompiled
    # @param: num, indicates which file need to be recompiled
    def retrieve_csv(self, name, recomp, nval, ival, pval):
        # Example: Get 1-minute intraday data for Apple (AAPL) for 1 day
        data = get_yahoo_stock_data(stock_name, stock_interval, stock_period)
        ext = ".csv"
        pt2 = "_intraday"
        num = 1;
        file_name = name + pt2 + str(num) + ext
        found = False
        if(recomp!=True):
            while(found!=True):
                if os.path.isfile(file_name):
                    num+=1
                    file_name = name + pt2 + str(num) + ext
                else:
                    found = True
        else:
            if(nval >= 1):
                file_name = name + pt2 + str(nval) + ext
            else:
                file_name = name + pt2 + ext
        data.to_csv(file_name)
        return file_name

    def display_fig(self):
        plt.figure(figsize=(20, 20))
        plt.title(name + " Intraday Stock Price")
        plt.plot(self.axis_labels, self.price_inputs)
        plt.xlabel("time")
        plt.ylabel("price")
        plt.xticks(self.axis_labels[::26])
        plt.yticks(self.price_inputs[::30])
        plt.show()

    def show_df_info(self):
        self.df.head(15)
        self.df.tail(10)
        print("Row count: " + len(self.price_inputs))
        print("Selected range: " + selected_range)
        
    def trainAndTest(self):
        #RNN model
        self.rnn1 = RNN()
        # move tensors to GPU if CUDA is available
        min_loss = np.inf

        if train_on_gpu:
            self.rnn1.cuda()
        #use MSELoss instead of MSEAbsoluteLoss (predicting next price compared to next change)
        error = nn.MSELoss()
        # specify optimizer
        optimizer = torch.optim.Adam(self.rnn1.parameters(), lr=self.learning_rate, betas=(beta1, beta2))
        #optimizer = optim.SGD(model.parameters(), lr=0.01)
        #self.price_tensor = torch.tensor(self.sequences, dtype=torch.float32).unsqueeze(-1)#input 
        #self.y_tensor = torch.tensor(self.test_vals, dtype=torch.float32)#test_values
        # Learning rate scheduler
        scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=self.lr_scheduler_rate, patience=10)
        valid_loss_min = np.inf
        for epoch in range(epochs):
            train_loss = 0.0
            valid_loss = 0.0
            #TRAINING
            self.rnn1.train()
            for batch_idx, (data, target) in enumerate(self.train_loader):
                # move tensors to GPU if CUDA is available
                if train_on_gpu:
                    data, target = data.cuda(), target.cuda()
                # clear the gradients of all optimized variables
                optimizer.zero_grad()
                # forward pass: compute predicted outputs by passing inputs to the model
                output = self.rnn1(data)
                # calculate the batch loss
                loss_train = error(output, target)
                # backward pass: compute gradient of the loss with respect to model parameters
                loss_train.backward()
                # perform a single optimization step (parameter update)
                optimizer.step()
                train_loss += loss_train.item()*data.size(0)
                if (loss_train < min_loss):
                    min_loss = loss_train
                    #torch.save(self.rnn1.state_dict(), "rnn1.pth")

            scheduler.step(train_loss)  # Update learning rate

            self.rnn1.eval()
            for batch_idx, (data, target) in enumerate(self.test_loader):
                # move tensors to GPU if CUDA is available
                if train_on_gpu:
                    data, target = data.cuda(), target.cuda()
                # forward pass: compute predicted outputs by passing inputs to the model
                output = self.rnn1(data)
                # calculate the batch loss
                loss_valid = error(output, target)
                # perform a single optimization step (parameter update)
                valid_loss += loss_valid.item()*data.size(0)
                if (loss_valid < min_loss):
                    min_loss = loss_valid
                    #torch.save(self.rnn1.state_dict(), "rnn1.pth")
        
            if (epoch+1) % 10 == 0:
                lr = optimizer.param_groups[0]["lr"]
                print(f"Training: Epoch {epoch+1}/{epochs}, Loss: {loss_train.item():.6f}")
                print(f"Validation: Epoch {epoch+1}/{epochs}, Loss: {loss_valid.item():.6f}")

        return min_loss
        
class RNN(nn.Module):
    def __init__(self):
        super(RNN, self).__init__()
        self.rnn1 = nn.RNN(input_size=1, hidden_size=128, num_layers=2, nonlinearity='tanh', bias=True, batch_first=True, dropout=0.0, bidirectional=False, device=None, dtype=None)
        self.fc = nn.Linear(128, 1)
        
    def forward(self, x):
        self.output, self.hidden = self.rnn1(x)
        prediction = self.fc(self.output[:, -1, :])
        return prediction
        


In [17]:
#Stock variables used when collecting stock data
stock_data_source = "alpaca"

if stock_data_source == "yahoo":
    #Stock variables when using yahoo finance api
    stock_name = "AMD"
    stock_interval="5m"
    stock_period="7d"
    epochs = 100
    lr_scheduler_rate = 0.8
    beta1 = 0.9
    beta2 = 0.999

#Stock variables when using alpaca api
if stock_data_source == "alpaca":
    stock_name = "AMD"
    stock_interval="15"
    stock_period="4" #months
    epochs = 100
    lr_scheduler_rate = 0.5
    beta1 = 0.95
    beta2 = 0.999

batch_size = 16
num_workers = 0

In [18]:
#test various learning rates
learning_rate_list = [0.005, 0.001, 0.0005, 0.0001, 0.00005, 0.00001]
best_lr = [[np.inf, np.inf] , [np.inf, np.inf]] #record two pairs of [loss, learning rate] to tune learning rate later
best_lr_in_loop = np.inf
repeated_loops_per_lr = 1

for lr in learning_rate_list:
    print("\n\n-------------------------------------------------------------------------------")
    print("Learning rate: ", lr, "   Loop: ", repeated_loops_per_lr)
    print(stock_name, "Time interval ", stock_interval, "Time period: ", stock_period)
    print("Batch size: ", batch_size, "Number of workers: ", num_workers, "Epochs: ", epochs)
    print("Learning rate scheduler rate: ", lr_scheduler_rate)
    print("Beta1: ", beta1, "Beta2: ", beta2)
    print("-------------------------------------------------------------------------------")
    
    for i in range(repeated_loops_per_lr):
        rnn1 = RNN_initializer(retrieve=True, name=stock_name, recomp=True, nval=1, ival=[5, "m"], pval=[7,"d"], batch_size=batch_size, num_workers=num_workers, epochs=epochs, learning_rate=lr, lr_scheduler_rate=lr_scheduler_rate, beta1=beta1, beta2=beta2)
        loss = rnn1.trainAndTest()
        if best_lr_in_loop > loss:
            best_lr_in_loop = loss

    if best_lr_in_loop < best_lr[0][0]:
        best_lr[1] = best_lr[0]
        best_lr[0] = [best_lr_in_loop, lr]
    elif best_lr_in_loop < best_lr[1][0]:
        best_lr[1] = [best_lr_in_loop, lr]

    best_lr_in_loop = np.inf

print("\n\nBest learning rate: ", best_lr[0][1], "   Loss: ", best_lr[0][0])
print("Second best learning rate: ", best_lr[1][1], "   Loss: ", best_lr[1][0])


learning_rate_list = []
difference_of_lr = best_lr[0][1] - best_lr[1][1]
number_of_increments = 10
increment = difference_of_lr / number_of_increments

for i in range(number_of_increments):
    learning_rate_list.append(best_lr[0][1] - increment * i)

for lr in learning_rate_list:
    print("\n\n-------------------------------------------------------------------------------")
    print("Learning rate: ", lr, "   Loop: ", repeated_loops_per_lr)
    print(stock_name, "Time interval ", stock_interval, "Time period: ", stock_period)
    print("Batch size: ", batch_size, "Number of workers: ", num_workers, "Epochs: ", epochs)
    print("Learning rate scheduler rate: ", lr_scheduler_rate)
    print("Beta1: ", beta1, "Beta2: ", beta2)
    print("-------------------------------------------------------------------------------")
    
    for i in range(repeated_loops_per_lr):
        rnn1 = RNN_initializer(retrieve=True, name=stock_name, recomp=True, nval=1, ival=[5, "m"], pval=[7,"d"], batch_size=batch_size, num_workers=num_workers, epochs=epochs, learning_rate=lr, lr_scheduler_rate=lr_scheduler_rate, beta1=beta1, beta2=beta2)
        loss = rnn1.trainAndTest()
        if best_lr_in_loop > loss:
            best_lr_in_loop = loss

    if best_lr_in_loop < best_lr[0][0]:
        best_lr[1] = best_lr[0]
        best_lr[0] = [best_lr_in_loop, lr]
    elif best_lr_in_loop < best_lr[1][0]:
        best_lr[1] = [best_lr_in_loop, lr]

    best_lr_in_loop = np.inf


    #Make testing only, no training







-------------------------------------------------------------------------------
Learning rate:  0.005    Loop:  1
AMD Time interval  15 Time period:  4
Batch size:  16 Number of workers:  0 Epochs:  100
Learning rate scheduler rate:  0.5
Beta1:  0.95 Beta2:  0.999
-------------------------------------------------------------------------------


  return F.mse_loss(input, target, reduction=self.reduction)


Training: Epoch 10/100, Loss: 9.030393


UnboundLocalError: local variable 'loss_valid' referenced before assignment

In [None]:
#output,hidden = rnn1(price_tensor)
#print(output.shape)  # (1, 1, 128)
#print(hidden.shape)  # (2, 1, 128)
  # Predict 1 value from hidden_size=128

#prediction = fc(output[:, -1, :])  # Take output at last time step
#print(prediction)

## Test the Trained Network
---
Test your trained model on previously unseen data! Remember we have downloaded `train_data` and `test_data`. We will use `test_data` through `test_loader`.

A "good" result will be a CNN that gets around 70% (or more, try your best!) accuracy on these test images.

The following is working code, but you are encouraged to make your own adjustments and enhance the implementation.

### Specify [Loss Function](http://pytorch.org/docs/stable/nn.html#loss-functions) and [Optimizer](http://pytorch.org/docs/stable/optim.html)
---
Decide on a loss and optimization function that is best suited for this classification task. The linked code examples from above, may be a good starting point; [this PyTorch classification example](https://github.com/pytorch/tutorials/blob/master/beginner_source/blitz/cifar10_tutorial.py) Pay close attention to the value for **learning rate** as this value determines how your model converges to a small error.

The following is working code, but you can make your own adjustments.

**TODO**: try to compare with ADAM optimizer

In [None]:
#error = nn.MSELoss()
#optimizer = torch.optim.Adam(prediction.parameters(), lr=0.001)

#epochs = 50
#for epoch in range(epochs):
#    rnn1.train()
#    fc.train()
    
#      output,hidden = rnn1(price_tensor)
#    prediction = fc(output[:, -1, :])
#    loss = error(prediction, y_tensor)
    
#    optimizer.zero_grad()
#    loss.backward()
#    optimizer.step()

#    if (epoch+1) % 10 == 0:
#        print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item():.6f}")

In [None]:
# Define your models first
#rnn1 = ...  # Your RNN model definition
#fc = ...    # Your fully connected layer definition

# Define loss and optimizer
error = nn.MSELoss()
optimizer = torch.optim.Adam(list(rnn1.parameters()) + list(fc.parameters()), 
                            lr=0.001, weight_decay=0.001)  # L2 regularization
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=3, factor=0.5)

# Add gradient clipping value
clip_value = 1.0

# Add validation set monitoring
best_val_loss = float('inf')
patience = 5
no_improvement = 0

epochs = 50
for epoch in range(epochs):
    rnn1.train()
    fc.train()
    
    # Forward pass
    output, hidden = rnn1(price_tensor)
    prediction = fc(output[:, -1, :])
    loss = error(prediction, y_tensor)
    
    # Backward pass with gradient clipping
    optimizer.zero_grad()
    loss.backward()
    torch.nn.utils.clip_grad_norm_(list(rnn1.parameters()) + list(fc.parameters()), clip_value)
    optimizer.step()
    
    # Update learning rate
    scheduler.step(loss)
    
    # Early stopping check
    if loss.item() < best_val_loss:
        best_val_loss = loss.item()
        no_improvement = 0
    else:
        no_improvement += 1
        if no_improvement >= patience:
            print(f"Early stopping at epoch {epoch+1}")
            break

    if (epoch+1) % 10 == 0:
        print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item():.6f}")

AttributeError: 'RNN_initializer' object has no attribute 'parameters'