In [12]:
import os
os.chdir("/home/studio-lab-user/sagemaker-studiolab-notebooks/AI-OT-24/Reinforcement-Learning-Stock-Porfolio-Managment")
!pip install -q -r requirements.txt
import sys
sys.path.append('./utils')
from trading_functions import *
# Core Libraries
import os
import sys
import time
import datetime
import warnings
import itertools
#warnings.filterwarnings("ignore")
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'  # 0 = all messages, 1 = info, 2 = warnings, 3 = errors

# Data Manipulation
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import clear_output

# Financial Data
import yfinance as yf
import quantstats as qs
import ta

# Machine Learning - Supervised Learning
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

# Machine Learning - Deep Learning
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.metrics import BinaryAccuracy
from tensorflow.keras.losses import BinaryCrossentropy

# Reinforcement Learning and Environments
import gymnasium as gym
from gymnasium import spaces
from stable_baselines3 import A2C, DDPG, DQN, HER, PPO, SAC, TD3
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.callbacks import (
    EvalCallback, StopTrainingOnRewardThreshold, StopTrainingOnNoModelImprovement
)
from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv, VecCheckNan, VecNormalize
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.evaluation import evaluate_policy
from sb3_contrib import ARS, MaskablePPO, RecurrentPPO, QRDQN, TRPO

# Imitation Learning
from imitation.algorithms import bc
from imitation.testing.reward_improvement import is_significant_reward_improvement
from imitation.data.types import Transitions

# Interactive Brokers API
from ib_insync import *

from typing import Callable

from collections import Counter

clear_output()


In [14]:
# Define constants
# Shared configuration read by name in the cells below.
# Not referenced by the cells visible in this section: SEED, n_steps, batch_size,
# learning_rate and ent_coef (the A2C constructors pass their own literal values).
SEED = 1
# History lengths to train on; later cells reorder/rebind this name.
history_length = [1, 2, 4, 5, 6, 8, 10, 16, 20]
# Passed unchanged to create_training_env / create_evaluation_env.
reward_type = 'LNR'
# Default ticker basket handed to the environment factories.
stocks = ['AAPL', 'AMZN', 'META', 'MSFT', 'NVDA', 'TSLA']
# Date range handed to create_training_env.
start_date = '2015-01-01'
end_date = '2023-06-30'
# Last positional argument of create_training_env.
n_envs = 8
n_steps = 16
# total_timesteps passed to model.learn() in the Stage 2 and Stage 3 cells.
total_timesteps = 100_000
batch_size = 32
learning_rate = 0.0001
ent_coef = 0.05
# Passed to model.learn(log_interval=...).
log_interval = 1_000
# Passed to EvalCallback(eval_freq=...).
eval_freq = 1_000
# Only used as a label in printed evaluation messages.
model_name= 'A2C'

Stage 0: Set time intervals (start dates for every time split and history length)

In [15]:
# Build `timesplits`: one row per time split (each split is half as long as the previous one,
# always ending on the last available date) and one column per history length (longest first).
# Each cell holds the training start date for that (split, history length) pair.
N_TIMESPLITS = 6

# Normalise to "longest first". Rebinding to a sorted copy (instead of reversing in place)
# keeps the cell idempotent: running it twice yields the same ordering.
history_length = sorted(history_length, reverse=True)
longest_history = history_length[0]

# Training environment built with the longest history; it is used here only for its date index.
env = create_training_env(longest_history, reward_type, start_date, end_date, stocks, n_envs)[0]

# differences[i] is the gap between history_length[i - 1] and history_length[i]
# (0 for the longest history, which has no predecessor).
differences = [0] + [longer - shorter for longer, shorter in zip(history_length, history_length[1:])]
# Running total = longest_history - history_length[i]: how many rows later a shorter
# history starts compared with the longest one.
offsets = list(itertools.accumulate(differences))

results = []

trading_days = env.df_unscaled.index
n_rows = len(trading_days)

timesplit_rows = []
date_length = n_rows  # Rows covered by the current split
for j in range(N_TIMESPLITS):
    if j != 0:
        date_length = date_length // 2  # Halve the window each iteration
    window_start = n_rows - date_length
    # Positive positions raise IndexError when a window is too short for an offset, instead
    # of silently wrapping around to the start of the data as a negative index would.
    # A separate local name is used so the global `start_date` constant is not overwritten.
    timesplit_rows.append([trading_days[window_start + offset] for offset in offsets])

timesplits = pd.DataFrame(timesplit_rows)

Stage 1: Multiple stock learning

In [None]:
# Ticker universe; it is sliced into disjoint, equally sized groups below.
stock_list = ['NVDA', 'TSLA', 'PLTR', 'AVGO', 'F', 'INTC', 'SMCI', 'MU', 'AAPL', 'AMD', 'PFE', 'T', 'UBER', 'AMZN', 'GOOGL', 'WBD', 'BAC', 'WBA', 'GOOG', 'CVS', 'MSFT', 'CSCO', 'CMCSA', 'WFC', 'KO', 'WMT', 'VZ', 'CCL', 'XOM', 'HBAN', 'AMCR', 'HPE', 'OXY', 'SLB', 'LRCX', 'DVN', 'BA', 'NKE', 'C', 'KDP', 'KHC', 'BMY', 'NEM', 'META', 'KMI', 'PCG', 'CSX', 'AES', 'FCX', 'SCHW', 'GM', 'MRK', 'CVX', 'KEY', 'KVUE', 'MDLZ', 'SBUX', 'DIS', 'HAL', 'UNH', 'MCHP', 'NEE', 'DOW', 'JPM', 'PYPL', 'QCOM', 'EXC', 'JNJ', 'PARA', 'ORCL', 'TFC', 'APH', 'CNC', 'COP', 'ANET', 'USB', 'RF', 'MO', 'HPQ', 'CMG', 'NCLH', 'HST', 'VTRS', 'AMAT', 'WMB', 'ON', 'APA', 'V', 'ADBE', 'WDC', 'GILD', 'EQT', 'MRNA', 'DAL', 'DELL', 'PEP', 'CRM', 'CARR', 'LW', 'MS', 'ABBV', 'EW', 'GIS', 'IPG', 'LUV', 'CTRA', 'GE', 'PG', 'CCI', 'TJX', 'CNP', 'MDT']

# Tickers per training group. Groups were previously sized by the history length, which made
# the group size change with every model; the validation basket below holds six tickers, so
# training groups use the same size.
# NOTE(review): presumably the observation space depends on the number of tickers - confirm
# against trading_functions.
GROUP_SIZE = 6
validation_tickers = ['TGT', 'MARA', 'GOOGL', 'WMT', 'V', 'PG']

# Keep the full list under `history_lengths`; `history_length` is reused as the scalar loop
# variable and stays bound to the last value after the loop. The isinstance guard lets the
# cell be re-run after `history_length` has already become a scalar.
if isinstance(history_length, (list, tuple)):
    history_lengths = list(history_length)
history_length = None

# Disjoint groups of GROUP_SIZE tickers; a trailing remainder shorter than GROUP_SIZE is dropped.
n_groups = len(stock_list) // GROUP_SIZE
groups = [stock_list[k * GROUP_SIZE:(k + 1) * GROUP_SIZE] for k in range(n_groups)]

for h_l, history_length in enumerate(history_lengths):
    model_save_dir = f'./models/history_length/{history_length}'
    log_dir = f'./logs/history_length/{history_length}'

    # Validation environment
    valid_env, vec_valid_env = create_evaluation_env(history_length, reward_type, '2023-07-01', '2023-12-30', validation_tickers)

    # Evaluation callback for saving the best model
    eval_callback = EvalCallback(
        vec_valid_env,
        n_eval_episodes=1,
        eval_freq=eval_freq,
        deterministic=True,
        verbose=0,
        best_model_save_path=model_save_dir,
    )

    # Bootstrap environment: only needed so that A2C can be constructed; it is replaced
    # (and closed) as soon as the first group starts training.
    env, vec_env = create_training_env(history_length, reward_type, timesplits.iloc[0, h_l], end_date, groups[0], n_envs)

    model = A2C('MlpPolicy',
                    vec_env,
                    learning_rate=0.002,
                    n_steps=8,
                    gamma=0.99,
                    gae_lambda=1.0,
                    ent_coef=0.05,
                    vf_coef=0.5,
                    max_grad_norm=0.5,
                    rms_prop_eps=1e-05,
                    use_rms_prop=True,
                    use_sde=False,
                    sde_sample_freq=-1,
                    rollout_buffer_class=None,
                    rollout_buffer_kwargs=None,
                    normalize_advantage=False,
                    stats_window_size=100,
                    tensorboard_log=log_dir,
                    policy_kwargs=None,
                    verbose=0,
                    seed=0,
                    device='auto',
                    _init_setup_model=True)

    # Rows of `timesplits` are time splits, columns are history lengths (same order as
    # `history_lengths`), hence the (row=timesplit, column=h_l) lookup.
    for timesplit in range(timesplits.shape[0]):
        date = timesplits.iloc[timesplit, h_l]
        if timesplit != 0:
            # Halve the learning rate for every shorter window. The schedule itself has to
            # change: SB3 re-applies `model.lr_schedule` to the optimizer on every training
            # update, so editing the optimizer's param_groups directly is overwritten.
            model.learning_rate = model.learning_rate / 2
            model._setup_lr_schedule()
        # `group` (not `stocks`) so the global default ticker basket is left untouched.
        for group in groups:
            previous_env, previous_vec_env = env, vec_env
            env, vec_env = create_training_env(history_length, reward_type, date, end_date, group, n_envs)
            model.set_env(vec_env)
            # The replaced environments are no longer referenced by the model; release them.
            previous_env.close()
            previous_vec_env.close()
            model.learn(
                    total_timesteps=10_000,
                    progress_bar=False,
                    log_interval=log_interval,
                    tb_log_name=f"A2C{timesplit}",
                    reset_num_timesteps=False,
                    callback=eval_callback
                )
            # Checkpoint after every group; the file for this timesplit is overwritten each time.
            model.save(f'{model_save_dir}/{timesplit}')

    # Release the environments of this history length before moving on to the next one.
    env.close()
    vec_env.close()
    valid_env.close()
    vec_valid_env.close()
    del env
    del vec_env
    del valid_env
    del vec_valid_env

Stage 2: Multiple time learning

In [None]:
# Continue training the current `model` on progressively shorter, more recent windows,
# halving both the window length and the learning rate at every step.
# NOTE(review): this cell relies on names left in the kernel by earlier cells (`env`, `model`,
# `eval_callback`, scalar `history_length`, `stocks`); make sure they are live before running.
date_length = env.df_unscaled.shape[0]
for i in range(4):
    date_length = date_length // 2
    if date_length < 1:
        # index[-0] would select the FIRST row, i.e. the whole range again.
        break
    date = env.df_unscaled.index[-date_length]
    print(date_length, date)
    _, vec_env = create_training_env(history_length, reward_type, date, end_date, stocks, n_envs)
    try:
        model.set_env(vec_env)
        # Halve the learning rate through the schedule: SB3 re-applies `model.lr_schedule` to
        # the optimizer on every training update, so editing the optimizer's param_groups
        # directly is overwritten.
        model.learning_rate = model.learning_rate / 2
        model._setup_lr_schedule()
        model.learn(
                total_timesteps=total_timesteps,
                progress_bar=False,
                log_interval=log_interval,
                tb_log_name="A2C",
                reset_num_timesteps=False,
                callback=eval_callback
            )
    finally:
        # Release the vectorised environment even if training is interrupted.
        vec_env.close()

Stage 3: Multiple history length models

In [None]:
# Train one A2C model per history length on the full date range and report its training reward.

# The Stage 1 cell rebinds `history_length` to a scalar and keeps the full list under
# `history_lengths`; use that list when it exists, otherwise the list from the constants cell.
history_source = history_lengths if 'history_lengths' in globals() else history_length
# Longest first; sorting a copy keeps this cell idempotent when it is re-run.
history_length = sorted(history_source, reverse=True)
longest_history = history_length[0]

# Reference training environment, used only for its date index. It has its own name so the
# per-model `env` created inside the loop cannot replace it between iterations.
# NOTE(review): assumes `start_date` still holds the full-range start date - confirm.
reference_env, reference_vec_env = create_training_env(longest_history, reward_type, start_date, end_date, stocks, n_envs)

# Test environment
test_env, _ = create_evaluation_env(longest_history, reward_type, '2023-07-01', '2024-12-01', stocks,n_envs=1)

envs = []
vec_envs = []
models = []

# differences[i] is the gap between history_length[i - 1] and history_length[i]
# (0 for the longest history).
differences = [0] + [longer - shorter for longer, shorter in zip(history_length, history_length[1:])]
# Cumulative gap = longest_history - history_length[i]: the number of rows a shorter history
# starts later, matching the offsets used when the time splits are built.
offsets = list(itertools.accumulate(differences))

for i, (hist_len, offset) in enumerate(zip(history_length, offsets)):
    date = reference_env.df_unscaled.index[offset]
    print(offset, date)
    env, vec_env = create_training_env(hist_len, reward_type, date, end_date, stocks, n_envs)
    envs.append(env)
    vec_envs.append(vec_env)

    # Per-history-length TensorBoard directory (same layout as the Stage 1 logs) instead of
    # whatever `log_dir` value an earlier cell happened to leave behind.
    log_dir = f'./logs/history_length/{hist_len}'

    model = A2C('MlpPolicy',
                    vec_envs[i],
                    learning_rate=0.0001,
                    n_steps=16,
                    gamma=0.99,
                    gae_lambda=1.0,
                    ent_coef=0.10,
                    vf_coef=0.5,
                    max_grad_norm=0.5,
                    rms_prop_eps=1e-05,
                    use_rms_prop=True,
                    use_sde=False,
                    sde_sample_freq=-1,
                    rollout_buffer_class=None,
                    rollout_buffer_kwargs=None,
                    normalize_advantage=False,
                    stats_window_size=100,
                    tensorboard_log=log_dir,
                    policy_kwargs=None,
                    verbose=0,
                    seed=0,
                    device='auto',
                    _init_setup_model=True)
    models.append(model)
    models[i].learn(
            total_timesteps=total_timesteps,
            progress_bar=False,
            log_interval=log_interval,
            tb_log_name="A2C",
            reset_num_timesteps=False,
        )

    # Evaluate the model on its own training environment
    print(f"Evaluating model: {model_name,i}")

    # Training evaluation
    mean_train_reward, std_train_reward = evaluate_policy(
        models[i].policy if hasattr(models[i], 'policy') else models[i],
        envs[i],
        n_eval_episodes=1,
        deterministic=True
    )
    print(f"{model_name} Train Mean reward: {mean_train_reward:.2f} ± {std_train_reward:.2f}")
    print('Steps',envs[i].steps)
    vec_envs[i].close()

# The reference environment was only needed for date lookups.
reference_env.close()
reference_vec_env.close()