<a href="https://colab.research.google.com/github/cody-mckeon/Reinforcement_Learning_Trading/blob/collab/rl_trading.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import yfinance as yf
from datetime import datetime, timedelta

# Calculate 15 years ago from today
today = datetime.now()
fifteen_years_ago = today - timedelta(days=15*365.25)  # Account for leap years
yesterday = datetime.now() - timedelta(days=1)

data = yf.download('^GSPC', start=fifteen_years_ago, end=yesterday)

data.head()

[*********************100%***********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2009-10-01,1054.910034,1054.910034,1029.449951,1029.849976,1029.849976,5791450000
2009-10-02,1029.709961,1030.599976,1019.950012,1025.209961,1025.209961,5583240000
2009-10-05,1026.869995,1042.579956,1025.920044,1040.459961,1040.459961,4313310000
2009-10-06,1042.02002,1060.550049,1042.02002,1054.719971,1054.719971,5029840000
2009-10-07,1053.650024,1058.02002,1050.099976,1057.579956,1057.579956,4238220000


In [2]:
import pandas as pd
data.index = pd.to_datetime(data.index)
data.index

DatetimeIndex(['2009-10-01', '2009-10-02', '2009-10-05', '2009-10-06',
               '2009-10-07', '2009-10-08', '2009-10-09', '2009-10-12',
               '2009-10-13', '2009-10-14',
               ...
               '2024-09-16', '2024-09-17', '2024-09-18', '2024-09-19',
               '2024-09-20', '2024-09-23', '2024-09-24', '2024-09-25',
               '2024-09-26', '2024-09-27'],
              dtype='datetime64[ns]', name='Date', length=3773, freq=None)

In [3]:
date_range = pd.date_range(start=data.index.min(), end=data.index.max())
missing_dates = date_range.difference(data.index)
print(f"Missing Dates: {missing_dates}")

Missing Dates: DatetimeIndex(['2009-10-03', '2009-10-04', '2009-10-10', '2009-10-11',
               '2009-10-17', '2009-10-18', '2009-10-24', '2009-10-25',
               '2009-10-31', '2009-11-01',
               ...
               '2024-08-25', '2024-08-31', '2024-09-01', '2024-09-02',
               '2024-09-07', '2024-09-08', '2024-09-14', '2024-09-15',
               '2024-09-21', '2024-09-22'],
              dtype='datetime64[ns]', length=1703, freq=None)


In [5]:
# Reindex the DataFrame to include all the dates
data_full = data.reindex(date_range)

# Fill missing values with forward fill (propogating the last known value)
data_full.ffill(inplace=True)

missing_values = data_full.isnull().sum()
print(f"Missing values after forward fill: \n{missing_values}")

Missing values after forward fill: 
Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume       0
dtype: int64


In [6]:
date_range = pd.date_range(start=data_full.index.min(), end=data_full.index.max())
missing_dates = date_range.difference(data_full.index)
print(f"Missing Dates: {missing_dates}")

Missing Dates: DatetimeIndex([], dtype='datetime64[ns]', freq='D')


In [7]:
data_full.head(30)

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume
2009-10-01,1054.910034,1054.910034,1029.449951,1029.849976,1029.849976,5791450000.0
2009-10-02,1029.709961,1030.599976,1019.950012,1025.209961,1025.209961,5583240000.0
2009-10-03,1029.709961,1030.599976,1019.950012,1025.209961,1025.209961,5583240000.0
2009-10-04,1029.709961,1030.599976,1019.950012,1025.209961,1025.209961,5583240000.0
2009-10-05,1026.869995,1042.579956,1025.920044,1040.459961,1040.459961,4313310000.0
2009-10-06,1042.02002,1060.550049,1042.02002,1054.719971,1054.719971,5029840000.0
2009-10-07,1053.650024,1058.02002,1050.099976,1057.579956,1057.579956,4238220000.0
2009-10-08,1060.030029,1070.670044,1060.030029,1065.47998,1065.47998,4988400000.0
2009-10-09,1065.280029,1071.51001,1063.0,1071.48999,1071.48999,3763780000.0
2009-10-10,1065.280029,1071.51001,1063.0,1071.48999,1071.48999,3763780000.0


In [8]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
data_scaled = scaler.fit_transform(data_full[['Close', 'Volume']])

In [35]:
import gymnasium as gym
from gymnasium import spaces
import numpy as np

class TradingEnv(gym.Env):
    def __init__(self, data, initial_balance=50000):
        super(TradingEnv, self).__init__()

        self.data = data
        self.current_step = 0
        self.initial_balance = initial_balance
        self.balance = initial_balance
        self.shares_held = 0
        self.portfolio_value = self.balance

        self.action_space = spaces.Discrete(3)  # Buy, Hold, Sell
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(data.shape[1],), dtype=np.float32)

    def reset(self, seed=None, return_info=False, options=None):
        # Set the seed if provided
        super().reset(seed=seed)

        self.current_step = 0
        self.balance = self.initial_balance
        self.share_held = 0
        self.portfolio_value = self.balance

        # Return the initial observation and an optional info dict (empty for now)
        if return_info:
            return self.data[self.current_step], {}
        else:
            return self.data[self.current_step]


    def step(self, action):
        terminated = False
        truncated = False

        # Action: 0 = Hold, 1 = Buy, 2 = Sell
        current_price = self.data[self.current_step, 0] # Assuming 'Close' price is the 1st column

        # Action Logic
        if action == 1: # Buy
            if self.balance > 0: # Buy if you have cash available
                self.shares_held = self.balance / current_price
                self.balance = 0 # Cash becomes 0 after buying
        elif action == 2: # Sell
            if self.shares_held > 0: # Sell if you hold shares
                self.balance = self.shares_held * current_price
                self.shares_held = 0 # No more shares held after selling

        # Update Portfolio Value
        self.portfolio_value = self.balance + self.shares_held * current_price

        # Reward is based on the portfolio value increase
        reward = self.portfolio_value - self.initial_balance

        # Move to the next step
        self.current_step += 1

        # End of data
        if self.current_step >= len(self.data) - 1:
            truncated = True

        return self.data[self.current_step], reward, terminated, truncated, {}

    def render(self):
        # This can be customized to display the portfolio performance over time
        print(f'Step: {self.current_step}')
        print(f'Portfolio Value: {self.portfolio_value}')
        print(f'Shares Held: {self.shares_held}')
        print(f'Cash Balance: {self.balance}')

In [10]:
!pip install gymnasium

Collecting gymnasium
  Downloading gymnasium-0.29.1-py3-none-any.whl.metadata (10 kB)
Collecting farama-notifications>=0.0.1 (from gymnasium)
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl.metadata (558 bytes)
Downloading gymnasium-0.29.1-py3-none-any.whl (953 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m953.9/953.9 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Installing collected packages: farama-notifications, gymnasium
Successfully installed farama-notifications-0.0.4 gymnasium-0.29.1


In [39]:
from stable_baselines3 import PPO

env = TradingEnv(data_scaled)
model = PPO("MlpPolicy", env, verbose=1, learning_rate=1e-5)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


  and should_run_async(code)


In [22]:
data_scaled

  and should_run_async(code)


array([[1.53933551e-03, 4.96877349e-01],
       [5.56862319e-04, 4.75172551e-01],
       [5.56862319e-04, 4.75172551e-01],
       ...,
       [9.95106632e-01, 2.71026946e-01],
       [1.00000000e+00, 3.50906564e-01],
       [9.98475436e-01, 2.99550392e-01]])

In [13]:
!pip install stable_baselines3

Collecting stable_baselines3
  Downloading stable_baselines3-2.3.2-py3-none-any.whl.metadata (5.1 kB)
Downloading stable_baselines3-2.3.2-py3-none-any.whl (182 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m182.3/182.3 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: stable_baselines3
Successfully installed stable_baselines3-2.3.2


In [40]:
model.learn(total_timesteps=100000)
model.save("ppo_trading_model")

rewards = []
for _ in range(100):
    obs = env.reset()
    done = False
    while not done:
        action, _states = model.predict(obs)
        obs, reward, done, info = env.step(action)
        rewards.append(reward)




-----------------------------
| time/              |      |
|    fps             | 904  |
|    iterations      | 1    |
|    time_elapsed    | 2    |
|    total_timesteps | 2048 |
-----------------------------
---------------------------------------
| time/                   |           |
|    fps                  | 662       |
|    iterations           | 2         |
|    time_elapsed         | 6         |
|    total_timesteps      | 4096      |
| train/                  |           |
|    approx_kl            | 0.0       |
|    clip_fraction        | 0         |
|    clip_range           | 0.2       |
|    entropy_loss         | -1.1      |
|    explained_variance   | 0         |
|    learning_rate        | 1e-05     |
|    loss                 | 4.03e+11  |
|    n_updates            | 10        |
|    policy_gradient_loss | -5.06e-08 |
|    value_loss           | 8.9e+11   |
---------------------------------------
---------------------------------------
| rollout/                |   

  self.shares_held = self.balance / current_price
  self.portfolio_value = self.balance + self.shares_held * current_price
  self.balance = self.shares_held * current_price


---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 5.48e+03  |
|    ep_rew_mean          | 1.85e+07  |
| time/                   |           |
|    fps                  | 615       |
|    iterations           | 11        |
|    time_elapsed         | 36        |
|    total_timesteps      | 22528     |
| train/                  |           |
|    approx_kl            | 0.0       |
|    clip_fraction        | 0         |
|    clip_range           | 0.2       |
|    entropy_loss         | -1.1      |
|    explained_variance   | -1.79e-06 |
|    learning_rate        | 1e-05     |
|    loss                 | 3.52e+11  |
|    n_updates            | 100       |
|    policy_gradient_loss | 0.00183   |
|    value_loss           | 6.98e+11  |
---------------------------------------


ValueError: Expected parameter logits (Tensor of shape (64, 3)) of distribution Categorical(logits: torch.Size([64, 3])) to satisfy the constraint IndependentConstraint(Real(), 1), but found invalid values:
tensor([[nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan]], grad_fn=<SubBackward0>)

In [23]:
data_scaled.shape

  and should_run_async(code)


(5476, 2)

In [38]:
if np.isnan(data_scaled).any():
  print("NaN values found in data_scaled")
else:
  print("No NaN values found in data_scaled")

No NaN values found in data_scaled


  and should_run_async(code)


In [41]:
!git --version

git version 2.34.1


  and should_run_async(code)


In [42]:
!git clone https://github.com/cody-mckeon/Reinforcement_Learning_Trading.git

  and should_run_async(code)


Cloning into 'Reinforcement_Learning_Trading'...
remote: Enumerating objects: 11, done.[K
remote: Counting objects: 100% (11/11), done.[K
remote: Compressing objects: 100% (8/8), done.[K
remote: Total 11 (delta 3), reused 9 (delta 1), pack-reused 0 (from 0)[K
Receiving objects: 100% (11/11), 9.35 KiB | 683.00 KiB/s, done.
Resolving deltas: 100% (3/3), done.


In [43]:
!ls

Reinforcement_Learning_Trading	sample_data


  and should_run_async(code)


In [44]:
!cd Reinforcement_Learning_Trading/

  and should_run_async(code)


In [45]:
!mv /content/rl_trading.ipynb /content/cody-mckeon/Reinforcement_Learning_Trading.git


mv: cannot stat '/content/rl_trading.ipynb': No such file or directory


In [46]:
!ls

Reinforcement_Learning_Trading	sample_data


  and should_run_async(code)


In [47]:
!cd Reinforcement_Learning_Trading/
!ls

  and should_run_async(code)


Reinforcement_Learning_Trading	sample_data


In [48]:
!git config --global user.name "cody-mckeon"
!git config --global user.email "codymckeon@gmail.com"

  and should_run_async(code)


In [49]:
!git clone https://github.com/cody-mckeon/Reinforcement_Learning_Trading.git

fatal: destination path 'Reinforcement_Learning_Trading' already exists and is not an empty directory.


In [50]:
%cd Reinforcement_Learning_Trading/

/content/Reinforcement_Learning_Trading


  and should_run_async(code)


In [51]:
!ls

rl_trading.ipynb


  and should_run_async(code)


In [52]:
!mv /content/rl_trading.ipynb /content/Reinforcement_Learning_Trading/

mv: cannot stat '/content/rl_trading.ipynb': No such file or directory


  and should_run_async(code)


In [53]:
!ls /content/

Reinforcement_Learning_Trading	sample_data


  and should_run_async(code)


In [54]:
!pwd

/content/Reinforcement_Learning_Trading


  and should_run_async(code)


In [55]:
%cd ..

/content


  and should_run_async(code)


In [56]:
!ls

Reinforcement_Learning_Trading	sample_data


  and should_run_async(code)


In [57]:
%cd ..

/


  and should_run_async(code)


In [58]:
!ls

bin			    datalab  lib     media		       proc	   sbin  tools
boot			    dev      lib32   mnt		       python-apt  srv	 usr
content			    etc      lib64   NGC-DL-CONTAINER-LICENSE  root	   sys	 var
cuda-keyring_1.0-1_all.deb  home     libx32  opt		       run	   tmp


  and should_run_async(code)


In [59]:
%cd /content

/content


  and should_run_async(code)


In [60]:
!ls

Reinforcement_Learning_Trading	sample_data


  and should_run_async(code)
