## RL Trader using Policy Networks and Gymnasium environments

In [1]:
import matplotlib.pyplot as plt

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import pandas as pd

import random
import copy

from stocktrader.envs import SimpleTrader, ComplexTrader


#### Step 1: Define Stock Data Class

This is a class includes the following methods:

- `get_state` (`int -> np.array`): Returns all the features constructed from stock data used in the RL algorithm at a given time index.
- `get_sell_price` (`int -> np.array`): Returns the sell price of all the stocks considered as a vector at a given time index.
- `get_buy_price` (`int -> np.array`): Returns the buy price of all the stocks considered as a vector at a given time index.

The Stock Data class should also have a variable `n_stocks` which is the number of stocks.

Here is an example of a placeholder class:

In [2]:
class StockData:

    def __init__(self, df, n_stocks=2):
        self.df = df
        self.n_stocks = n_stocks
        self.shape = self.get_state(0).shape

    def get_state(self, time_index):
        current_state = np.array(self.df.iloc[[time_index]])
        # more logic here to remove irrelevant columns (e.g. remove company name or DateTime)
        return current_state
    
    def get_sell_price(self, time_index):
        # sell at open price of the current time step
        sell_price = np.array(self.df.iloc[[time_index]]['OpenAMD'])
        
        return sell_price

    def get_buy_price(self, time_index):
        # buy at close price of the current time step
        buy_price = np.array(self.df.iloc[[time_index]]['CloseAMD'])
        return buy_price

Here is an example of a placeholder dataset:

In [3]:
mydict = [{'OpenAMD': 1, 'CloseAMD': 2, 'c': 3, 'd': 4},
        {'OpenAMD': 10, 'CloseAMD': 23, 'c': 3, 'd': 4},
        {'OpenAMD': 200, 'CloseAMD': 312, 'c': 3, 'd': 4},
        {'OpenAMD': 213, 'CloseAMD': 321, 'c': 3, 'd': 4},
        {'OpenAMD': 214, 'CloseAMD': 342, 'c': 3, 'd': 4},  
        {'OpenAMD': 220, 'CloseAMD': 349, 'c': 3, 'd': 4},  
       ]
df = pd.DataFrame(mydict)
print(df.head)

stock_data = StockData(df, n_stocks=1)

<bound method NDFrame.head of    OpenAMD  CloseAMD  c  d
0        1         2  3  4
1       10        23  3  4
2      200       312  3  4
3      213       321  3  4
4      214       342  3  4
5      220       349  3  4>


#### Step 2: Define Environment

Using the created classes define the gymnasium environment.

In [5]:
stock_data = StockData(df, n_stocks=1)
trader_env = ComplexTrader(stock_data)

#### Step 3: Define Policy Network

In [None]:
input_size = len(trader_env.state) # the input size will be the number of states within the trader environment
hidden_size = 32
output_size = stock_data.n_stocks # output size will be the number of stocks

policy_network = nn.Sequential(
    nn.Linear(input_size, hidden_size),
    nn.ReLU(),
    nn.Linear(hidden_size, hidden_size),
    nn.ReLU(),
    nn.Linear(hidden_size, hidden_size),
    nn.ReLU(),
    nn.Linear(hidden_size, output_size),
    nn.Tanh()
)

learning_rate = 1e-3
optimiser = optim.Adam(policy_network.parameters(), lr=learning_rate)

#### Step 4: Train the Policy Network

TODO...