In [111]:
import copy
import os
import sys

# Report which interpreter, working directory and environment prefix this
# kernel is running with, framed by 70-character rules.
print(
    "=" * 70,
    "NOTEBOOK ENVIRONMENT INFORMATION",
    "=" * 70,
    f"Python executable: {sys.executable}",
    f"Python version: {sys.version}",
    f"Current working directory: {os.getcwd()}",
    f"Virtual environment: {os.path.basename(sys.prefix)}",
    "=" * 70,
    sep="\n",
)

import yfinance as yf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import quantstats as qs
import pypfopt as ppo
import torch as th

from stable_baselines3.common.env_checker import check_env
from stable_baselines3 import A2C, PPO, DDPG, SAC
from stable_baselines3.common.noise import OrnsteinUhlenbeckActionNoise
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize

from typing import Tuple
from stable_baselines3.common.policies import BasePolicy

import onnx
import onnxruntime as ort
import onnxscript

# Summarise the library versions that were just imported.
print(
    "\n✓ All imports successful!",
    f"NumPy: {np.__version__}",
    f"Pandas: {pd.__version__}",
    f"PyTorch: {th.__version__}",
    f"CUDA available: {th.cuda.is_available()}",
    sep="\n",
)
# Name the first CUDA device when one is available, otherwise note the CPU fallback.
print(
    f"CUDA device: {th.cuda.get_device_name(0)}"
    if th.cuda.is_available()
    else "Running on CPU (slower than GPU)"
)

NOTEBOOK ENVIRONMENT INFORMATION
Python executable: c:\Users\abejr\OneDrive\Uniworks\PW25\Thesis\rl_portfolio\.venv_fresh\Scripts\python.exe
Python version: 3.13.5 (tags/v3.13.5:6cb20a2, Jun 11 2025, 16:15:46) [MSC v.1943 64 bit (AMD64)]
Current working directory: c:\Users\abejr\OneDrive\Uniworks\PW25\Thesis\rl_portfolio
Virtual environment: .venv_fresh

✓ All imports successful!
NumPy: 2.4.0
Pandas: 2.3.3
PyTorch: 2.9.1+cpu
CUDA available: False
Running on CPU (slower than GPU)


In [112]:
import importlib
import sys

# Evict previously imported copies of the project modules from the module
# cache so the import statements that follow execute their source again.
sys.modules.pop('utils', None)
sys.modules.pop('models', None)
sys.modules.pop('portfolio_env', None)

import utils
import models
import portfolio_env

# Reaching this line only means the three project imports above did not
# raise; no individual function is actually checked here.
print("\n✓ Modules imported successfully!")


✓ Modules imported successfully!


In [113]:

# Initialize hyperparameter tracking CSV
import csv
import os
from datetime import datetime

# Path of the CSV that accumulates one row per training/evaluation run.
hpt_log_file = 'tables/hpt_log.csv'

# Column order of the CSV: run identification, hyperparameters (columns that
# do not apply to a given model are left empty), then evaluation metrics.
hpt_columns = [
    'timestamp',
    'model_name',
    'learning_rate',
    'n_steps',
    'batch_size',
    'gamma',
    'gae_lambda',
    'ent_coef',
    'vf_coef',
    'buffer_size',
    'tau',
    'total_timesteps',
    'annualized_return',
    'sharpe_ratio',
    'max_drawdown',
    'annualized_volatility',
    'sortino_ratio'
]

# Make sure the parent directory exists first: open(..., 'w') does not create
# it and would raise FileNotFoundError on a fresh checkout.
os.makedirs(os.path.dirname(hpt_log_file) or '.', exist_ok=True)

# Write the header when the log is missing or empty (a zero-byte file would
# otherwise never receive one); an existing, populated log is left untouched.
if not os.path.exists(hpt_log_file) or os.path.getsize(hpt_log_file) == 0:
    with open(hpt_log_file, 'w', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=hpt_columns)
        writer.writeheader()

def log_hpt_results(model_name, hyperparams, total_timesteps, eval_results,
                    log_file=None, columns=None):
    """Append one hyperparameter-tuning run to the CSV log.

    Args:
        model_name: Label written to the ``model_name`` column (e.g. ``'A2C'``).
        hyperparams: Mapping of hyperparameter name -> value. Only keys that
            are log columns are written; any other key is ignored instead of
            making the CSV writer raise. Keys that collide with the run fields
            set here (timestamp, model name, timesteps, metrics) never
            override them. ``None`` is treated as an empty mapping.
        total_timesteps: Value written to the ``total_timesteps`` column.
        eval_results: Mapping holding the evaluation metrics; a metric that is
            absent is written as an empty cell.
        log_file: Destination CSV path. Defaults to the notebook-level
            ``hpt_log_file``.
        columns: Ordered CSV column names. Defaults to the notebook-level
            ``hpt_columns``.
    """
    # Resolve the notebook-level defaults at call time so that re-running the
    # configuration cell is picked up without redefining this function.
    target_path = hpt_log_file if log_file is None else log_file
    fieldnames = list(hpt_columns if columns is None else columns)

    row = {
        'timestamp': datetime.now().isoformat(),
        'model_name': model_name,
        'total_timesteps': total_timesteps,
        'annualized_return': eval_results.get('annualized_return'),
        'sharpe_ratio': eval_results.get('sharpe_ratio'),
        'max_drawdown': eval_results.get('max_drawdown'),
        'annualized_volatility': eval_results.get('annualized_volatility'),
        'sortino_ratio': eval_results.get('sortino_ratio'),
    }
    # Add hyperparameters, restricted to known columns that are not already
    # filled in above, so they can neither clobber a metric nor add a field
    # the CSV has no column for.
    for key, value in (hyperparams or {}).items():
        if key in fieldnames and key not in row:
            row[key] = value

    # The log may have been deleted (or never created) since initialisation:
    # recreate its directory and emit the header before the first data row.
    parent_dir = os.path.dirname(target_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    needs_header = (not os.path.exists(target_path)
                    or os.path.getsize(target_path) == 0)

    with open(target_path, 'a', newline='') as f:
        # extrasaction='ignore' covers a caller-supplied `columns` list that
        # omits one of the metric fields built above.
        writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction='ignore')
        if needs_header:
            writer.writeheader()
        writer.writerow(row)

    print(f"✓ Logged {model_name} results to {target_path}")

print("✓ Hyperparameter tracking initialized!")


✓ Hyperparameter tracking initialized!


# Data Loading & Preprocessing

## Data Extraction

In [114]:
# Sample window and asset universe passed to the downloader.
start = '2010-01-01'
end = '2024-12-31'
# Wider universe kept for reference:
# tickers = ['SPY','QQQ','IWM','AGG','TLT','IEF','SHY','GLD','UUP','FXE','FXY','FXB']
tickers = ['SPY', 'TLT', 'GLD', 'FXY']

raw_data = yf.download(tickers, start=start, end=end, group_by='ticker')

# Gather each ticker's 'Close' column into a single frame, one column per
# ticker, in the order given by `tickers`.
data = pd.DataFrame({ticker: raw_data[ticker]['Close'] for ticker in tickers})

[*********************100%***********************]  4 of 4 completed


## Exploratory Data Analysis

In [115]:
# Pairwise correlation between the columns of `data`, i.e. between the
# 'Close' price levels collected per ticker.
# NOTE(review): correlations of trending price levels are easily inflated;
# if co-movement of returns is what is meant, something like
# `data.pct_change().corr()` may be intended — confirm.
data.corr()

Unnamed: 0,SPY,TLT,GLD,FXY
SPY,1.0,0.475117,0.725497,-0.818465
TLT,0.475117,1.0,0.199021,-0.383163
GLD,0.725497,0.199021,1.0,-0.307484
FXY,-0.818465,-0.383163,-0.307484,1.0


In [116]:
# Export the close-price table to tables/price_data.csv. `index` is left at
# its default, so the frame's index is written as the first CSV column.
data.to_csv('tables/price_data.csv')

# Benchmark Model

## Buy and Hold

In [117]:
# Run the buy-and-hold benchmark from the project's `utils` module on the
# downloaded prices, starting the portfolio on 2022-01-01 with 100000 of capital.
buy_and_hold_results = utils.buy_and_hold(price_data=data, port_initial_date='2022-01-01', initial_capital=100000)
# Last expression of the cell: show the 'annualized_return' entry of the result.
buy_and_hold_results['annualized_return']

'-0.56%'

In [118]:
# Build a DataFrame from every entry of the benchmark result except
# 'current_weights' and write it to CSV.
# Note: this rebinds `buy_and_hold_results` from the object returned by
# utils.buy_and_hold to a DataFrame, so the name means different things
# before and after this cell.
buy_and_hold_results = pd.DataFrame({k: v for k, v in buy_and_hold_results.items() if k != 'current_weights'})
buy_and_hold_results.to_csv('tables/holding_portfolio_results.csv')

## Mean-Variance Optimization

In [119]:
# Run the rebalancing benchmark from the project's `utils` module in
# 'max_sharpe' mode with bounds (0, 1), starting the portfolio on 2022-01-01.
# NOTE(review): the variable is named "1y" but lookback_period is 21; if the
# unit is trading days that is roughly one month, not one year — confirm which
# of the name or the value is intended.
MVO_1y_lookback = utils.rebalance_portfolio(price_data=data,
                                           port_initial_date='2022-01-01',
                                           lookback_period=21,
                                           bounds=(0,1),
                                           mode='max_sharpe')

# Last expression of the cell: show the 'annualized_return' entry of the result.
MVO_1y_lookback['annualized_return']

'-8.56%'

In [120]:
# Build a DataFrame from every entry of the MVO result except
# 'current_weights' and write it to CSV.
# Note: this rebinds `MVO_1y_lookback` from the object returned by
# utils.rebalance_portfolio to a DataFrame.
MVO_1y_lookback = pd.DataFrame({k: v for k, v in MVO_1y_lookback.items() if k != 'current_weights'})
MVO_1y_lookback.to_csv('tables/mvo_portfolio_results.csv')

# Reinforcement Learning

## Environment Building and Testing

In [121]:
# Instantiate the project's portfolio environment on the full price table and
# run Stable-Baselines3's environment checker against it.
env = portfolio_env.PortfolioEnv(
    price_data=data,
    lookback_period=252,
    initial_capital=1000000,
    bounds=(0, 1),
    transaction_cost=0.0015,
    risk_aversion=0.05,
    reward_scale=0.1,
)
check_env(env)



In [122]:
# Ask the project's `models` helper for the training and test environments,
# using 2022-01-01 as the portfolio start date.
train_env, test_env = models.split_build_normalize_env(
    price_data=data,
    port_initial_date='2022-01-01',
    lookback_period=21,
)

## A2C

### A2C - Training

In [123]:
# train the model with improved hyperparameters
policy_kwargs = {"log_std_init": -2.0}

# Values passed to A2C below and later written to the tuning log; entries set
# to None are log columns that are not passed to A2C here.
a2c_hyperparams = {
    'learning_rate': 0.005,  # increased from 0.002 - faster adaptation with new features
    'n_steps': 128,  # reduced from 256 - fresher updates with larger feature space
    'batch_size': None,
    'gamma': 0.99,
    'gae_lambda': 0.93,  # reduced from 0.95 - more recent value weighting
    'ent_coef': 0.05,  # increased from 0.01 - more exploration (5x)
    'vf_coef': 0.5,
    'buffer_size': None,
    'tau': None,
}

a2c_model = A2C(
    policy="MlpPolicy",
    env=train_env,
    learning_rate=a2c_hyperparams['learning_rate'],
    n_steps=a2c_hyperparams['n_steps'],
    gamma=a2c_hyperparams['gamma'],
    gae_lambda=a2c_hyperparams['gae_lambda'],
    ent_coef=a2c_hyperparams['ent_coef'],
    vf_coef=a2c_hyperparams['vf_coef'],
    verbose=1,
    seed=42,
    policy_kwargs=policy_kwargs
)
a2c_model.learn(total_timesteps=600000)

# Copy normalization statistics.
# A plain assignment would only alias train_env's statistics objects, and
# train_env is reused by the other training cells, so their updates would
# silently change what test_env normalizes with. deepcopy gives test_env an
# independent snapshot taken right after this model finished training.
# NOTE(review): if test_env itself keeps updating its statistics during
# evaluation, that now affects only this snapshot — confirm that
# models.split_build_normalize_env freezes the test environment.
test_env.obs_rms = copy.deepcopy(train_env.obs_rms)
test_env.ret_rms = copy.deepcopy(train_env.ret_rms)

# save the model
a2c_model.save("model/a2c_portfolio_model.zip")
print("✓ A2C model trained and saved")

Using cpu device
------------------------------------
| time/                 |          |
|    fps                | 654      |
|    iterations         | 100      |
|    time_elapsed       | 19       |
|    total_timesteps    | 12800    |
| train/                |          |
|    entropy_loss       | -0.313   |
|    explained_variance | -0.189   |
|    learning_rate      | 0.005    |
|    n_updates          | 99       |
|    policy_loss        | -0.0215  |
|    std                | 0.263    |
|    value_loss         | 0.0518   |
------------------------------------
------------------------------------
| time/                 |          |
|    fps                | 544      |
|    iterations         | 200      |
|    time_elapsed       | 47       |
|    total_timesteps    | 25600    |
| train/                |          |
|    entropy_loss       | -2.41    |
|    explained_variance | 0.131    |
|    learning_rate      | 0.005    |
|    n_updates          | 199      |
|    policy_loss     

### A2C - Testing

In [124]:
# Reload the saved A2C model from disk and evaluate it on test_env
a2c_model_loaded = A2C.load("model/a2c_portfolio_model.zip")
a2c_results = models.evaluate_model_sb3(a2c_model_loaded, test_env, num_episodes=10)

print(f"A2C Annualized Return: {a2c_results['annualized_return']:.4f}")

# Append this run (hyperparameters + metrics) to the tuning log
log_hpt_results('A2C', a2c_hyperparams, 600000, a2c_results)

# Keep every result entry that is not a NumPy array and store it as a
# single-row table
a2c_results_df = pd.DataFrame(
    {
        metric: [value]
        for metric, value in a2c_results.items()
        if not isinstance(value, np.ndarray)
    }
)
a2c_results_df.to_csv('tables/a2c_portfolio_results.csv', index=False)

A2C Annualized Return: 0.0050
✓ Logged A2C results to tables/hpt_log.csv


## PPO

### PPO - Training

In [125]:
policy_kwargs = {
    "log_std_init": -1.0,
    "net_arch": {"pi": [256, 256], "vf": [256, 256]},  # increased from [128,128]
}

# Values passed to PPO below and later written to the tuning log; entries set
# to None are log columns that are not passed to PPO here (kept so every
# model's dict carries the same keys).
ppo_hyperparams = {
    'learning_rate': 0.0001,  # reduced from 0.0002 for stability
    'n_steps': 512,  # reduced from 1024 - fresher data updates
    'batch_size': 128,  # reduced from 256 for better batch quality
    'gamma': 0.99,
    'gae_lambda': 0.96,  # reduced from 0.98
    'ent_coef': 0.01,  # raised 10x from 0.001 - was too conservative!
    'vf_coef': 0.5,
    'buffer_size': None,
    'tau': None,
}

ppo_model = PPO(
    policy="MlpPolicy",
    env=train_env,
    learning_rate=ppo_hyperparams['learning_rate'],
    n_steps=ppo_hyperparams['n_steps'],
    batch_size=ppo_hyperparams['batch_size'],
    gamma=ppo_hyperparams['gamma'],
    gae_lambda=ppo_hyperparams['gae_lambda'],
    clip_range=0.2,
    ent_coef=ppo_hyperparams['ent_coef'],
    # Pass the logged value explicitly so the tuning log cannot drift from
    # what the model was actually trained with.
    vf_coef=ppo_hyperparams['vf_coef'],
    n_epochs=10,
    max_grad_norm=0.5,
    verbose=1,
    seed=42,
    policy_kwargs=policy_kwargs
)

ppo_model.learn(total_timesteps=600000)

# Copy normalization statistics.
# A plain assignment would only alias train_env's statistics objects, and
# train_env is reused by the other training cells, so their updates would
# silently change what test_env normalizes with. deepcopy gives test_env an
# independent snapshot taken right after this model finished training.
# NOTE(review): if test_env itself keeps updating its statistics during
# evaluation, that now affects only this snapshot — confirm that
# models.split_build_normalize_env freezes the test environment.
test_env.obs_rms = copy.deepcopy(train_env.obs_rms)
test_env.ret_rms = copy.deepcopy(train_env.ret_rms)

# save the model
ppo_model.save("model/ppo_portfolio_model.zip")
print("✓ PPO model trained and saved")

Using cpu device
----------------------------
| time/              |     |
|    fps             | 581 |
|    iterations      | 1   |
|    time_elapsed    | 0   |
|    total_timesteps | 512 |
----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 522         |
|    iterations           | 2           |
|    time_elapsed         | 1           |
|    total_timesteps      | 1024        |
| train/                  |             |
|    approx_kl            | 0.016891971 |
|    clip_fraction        | 0.147       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.67       |
|    explained_variance   | -9.22       |
|    learning_rate        | 0.0001      |
|    loss                 | -0.0785     |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.054      |
|    std                  | 0.368       |
|    value_loss           | 0.157       |
------------------------

### PPO - Testing

In [126]:
# Reload the saved PPO model from disk and evaluate it on test_env
ppo_model_loaded = PPO.load("model/ppo_portfolio_model.zip")
ppo_results = models.evaluate_model_sb3(ppo_model_loaded, test_env, num_episodes=10)

print(f"PPO Annualized Return: {ppo_results['annualized_return']:.4f}")

# Append this run (hyperparameters + metrics) to the tuning log
log_hpt_results('PPO', ppo_hyperparams, 600000, ppo_results)

# Keep every result entry that is not a NumPy array and store it as a
# single-row table
ppo_results_df = pd.DataFrame(
    {
        metric: [value]
        for metric, value in ppo_results.items()
        if not isinstance(value, np.ndarray)
    }
)
ppo_results_df.to_csv('tables/ppo_portfolio_results.csv', index=False)

PPO Annualized Return: 0.0107
✓ Logged PPO results to tables/hpt_log.csv


## DDPG

### DDPG - Training

In [127]:
# Size of the last axis of the action space; one noise component per action.
n_actions = train_env.action_space.shape[-1]

# Zero-mean Ornstein-Uhlenbeck exploration noise added to the actions.
action_noise = OrnsteinUhlenbeckActionNoise(
    mean=np.zeros(n_actions),
    sigma=0.15 * np.ones(n_actions)  # reduced from 0.2 for more deterministic behavior
)

# Values passed to DDPG below and later written to the tuning log; entries
# set to None are log columns that are not passed to DDPG here.
ddpg_hyperparams = {
    'learning_rate': 5e-5,  # reduced from 1e-4 for stability with new features
    'n_steps': None,
    'batch_size': 128,  # reduced from 256
    'gamma': 0.99,
    'gae_lambda': None,
    'ent_coef': None,
    'vf_coef': None,
    'buffer_size': 50000,  # increased from 30000 for more diverse samples
    'tau': 0.01,  # increased from 0.005 for faster target network updates
}

ddpg_model = DDPG(
    policy="MlpPolicy",
    env=train_env,
    learning_rate=ddpg_hyperparams['learning_rate'],
    buffer_size=ddpg_hyperparams['buffer_size'],
    learning_starts=5000,
    batch_size=ddpg_hyperparams['batch_size'],
    tau=ddpg_hyperparams['tau'],
    gamma=ddpg_hyperparams['gamma'],
    action_noise=action_noise,
    verbose=1,
    seed=42
)

ddpg_model.learn(total_timesteps=150000)

# Copy normalization statistics.
# A plain assignment would only alias train_env's statistics objects, and
# train_env is reused by the other training cells, so their updates would
# silently change what test_env normalizes with. deepcopy gives test_env an
# independent snapshot taken right after this model finished training.
# NOTE(review): if test_env itself keeps updating its statistics during
# evaluation, that now affects only this snapshot — confirm that
# models.split_build_normalize_env freezes the test environment.
test_env.obs_rms = copy.deepcopy(train_env.obs_rms)
test_env.ret_rms = copy.deepcopy(train_env.ret_rms)

# save the model
ddpg_model.save("model/ddpg_portfolio_model.zip")
print("✓ DDPG model trained and saved")

Using cpu device
---------------------------------
| time/              |          |
|    episodes        | 4        |
|    fps             | 67       |
|    time_elapsed    | 176      |
|    total_timesteps | 11996    |
| train/             |          |
|    actor_loss      | 0.0436   |
|    critic_loss     | 7.31e-05 |
|    learning_rate   | 5e-05    |
|    n_updates       | 6995     |
---------------------------------
---------------------------------
| time/              |          |
|    episodes        | 8        |
|    fps             | 48       |
|    time_elapsed    | 495      |
|    total_timesteps | 23992    |
| train/             |          |
|    actor_loss      | -0.0684  |
|    critic_loss     | 3.13e-05 |
|    learning_rate   | 5e-05    |
|    n_updates       | 18991    |
---------------------------------
---------------------------------
| time/              |          |
|    episodes        | 12       |
|    fps             | 43       |
|    time_elapsed    | 830     

### DDPG - Testing

In [128]:
# Reload the saved DDPG model from disk and evaluate it on test_env
ddpg_model_loaded = DDPG.load("model/ddpg_portfolio_model.zip")
ddpg_results = models.evaluate_model_sb3(ddpg_model_loaded, test_env, num_episodes=10)

print(f"DDPG Annualized Return: {ddpg_results['annualized_return']:.4f}")

# Append this run (hyperparameters + metrics) to the tuning log
log_hpt_results('DDPG', ddpg_hyperparams, 150000, ddpg_results)

# Keep every result entry that is not a NumPy array and store it as a
# single-row table
ddpg_results_df = pd.DataFrame(
    {
        metric: [value]
        for metric, value in ddpg_results.items()
        if not isinstance(value, np.ndarray)
    }
)
ddpg_results_df.to_csv('tables/ddpg_portfolio_results.csv', index=False)

DDPG Annualized Return: 0.0093
✓ Logged DDPG results to tables/hpt_log.csv


## SAC

### SAC - Training

In [129]:
# Values passed to SAC below and later written to the tuning log; entries set
# to None are log columns that are not passed to SAC here.
sac_hyperparams = {
    'learning_rate': 3e-4,  # reduced from 5e-4 for finer-grained learning
    'n_steps': None,
    'batch_size': 256,  # reduced from 512 for better stability
    'gamma': 0.99,
    'gae_lambda': None,
    'ent_coef': 0.02,  # increased from 0.01 - more exploration to break past 0.97%
    'vf_coef': None,
    'buffer_size': 75000,  # increased from 50000 for even more diverse experience
    'tau': 0.015,  # increased from 0.01 for faster target updates
}

sac_model = SAC(
    policy="MlpPolicy",
    env=train_env,
    learning_rate=sac_hyperparams['learning_rate'],
    buffer_size=sac_hyperparams['buffer_size'],
    learning_starts=5000,
    batch_size=sac_hyperparams['batch_size'],
    gamma=sac_hyperparams['gamma'],
    tau=sac_hyperparams['tau'],
    ent_coef=sac_hyperparams['ent_coef'],
    verbose=1,
    seed=42
)

sac_model.learn(total_timesteps=150000)

# Copy normalization statistics.
# A plain assignment would only alias train_env's statistics objects, and
# train_env is reused by the other training cells, so their updates would
# silently change what test_env normalizes with. deepcopy gives test_env an
# independent snapshot taken right after this model finished training.
# NOTE(review): if test_env itself keeps updating its statistics during
# evaluation, that now affects only this snapshot — confirm that
# models.split_build_normalize_env freezes the test environment.
test_env.obs_rms = copy.deepcopy(train_env.obs_rms)
test_env.ret_rms = copy.deepcopy(train_env.ret_rms)

# save the model
sac_model.save("model/sac_portfolio_model.zip")
print("✓ SAC model trained and saved")

Using cpu device
---------------------------------
| time/              |          |
|    episodes        | 4        |
|    fps             | 55       |
|    time_elapsed    | 214      |
|    total_timesteps | 11996    |
| train/             |          |
|    actor_loss      | -2.31    |
|    critic_loss     | 0.000321 |
|    ent_coef        | 0.02     |
|    learning_rate   | 0.0003   |
|    n_updates       | 6995     |
---------------------------------
---------------------------------
| time/              |          |
|    episodes        | 8        |
|    fps             | 42       |
|    time_elapsed    | 571      |
|    total_timesteps | 23992    |
| train/             |          |
|    actor_loss      | -3.33    |
|    critic_loss     | 0.000437 |
|    ent_coef        | 0.02     |
|    learning_rate   | 0.0003   |
|    n_updates       | 18991    |
---------------------------------
---------------------------------
| time/              |          |
|    episodes        | 12      

### SAC - Testing

In [130]:
# Reload the saved SAC model from disk and evaluate it on test_env
sac_model_loaded = SAC.load("model/sac_portfolio_model.zip")
sac_results = models.evaluate_model_sb3(sac_model_loaded, test_env, num_episodes=10)

print(f"SAC Annualized Return: {sac_results['annualized_return']:.4f}")

# Append this run (hyperparameters + metrics) to the tuning log
log_hpt_results('SAC', sac_hyperparams, 150000, sac_results)

# Keep every result entry that is not a NumPy array and store it as a
# single-row table
sac_results_df = pd.DataFrame(
    {
        metric: [value]
        for metric, value in sac_results.items()
        if not isinstance(value, np.ndarray)
    }
)
sac_results_df.to_csv('tables/sac_portfolio_results.csv', index=False)

SAC Annualized Return: 0.0099
✓ Logged SAC results to tables/hpt_log.csv


In [131]:
# Read the accumulated tuning log back from disk and render it under a
# 100-character banner.
hpt_log_df = pd.read_csv(hpt_log_file)
print("\n" + "=" * 100, "HYPERPARAMETER TUNING LOG", "=" * 100, sep="\n")
display(hpt_log_df)


HYPERPARAMETER TUNING LOG


Unnamed: 0,timestamp,model_name,learning_rate,n_steps,batch_size,gamma,gae_lambda,ent_coef,vf_coef,buffer_size,tau,total_timesteps,annualized_return,sharpe_ratio,max_drawdown,annualized_volatility,sortino_ratio
0,2026-01-05T06:16:59.242639,A2C,0.001,512.0,,0.99,0.99,0.0001,0.5,,,800000,-0.026862,-0.569022,-0.234634,0.099929,
1,2026-01-05T06:19:05.792765,A2C,0.001,512.0,,0.99,0.99,0.0001,0.5,,,800000,-0.026862,-0.569022,-0.234634,0.099929,
2,2026-01-05T06:48:51.639838,PPO,0.0005,2048.0,512.0,0.99,0.99,0,0.5,,,600000,-0.066686,-0.930306,-0.281837,0.103929,
3,2026-01-05T10:10:46.333540,DDPG,0.0001,,256.0,0.99,,,,30000.0,0.005,120000,0.058895,0.221094,-0.228421,0.130692,
4,2026-01-05T13:53:13.919015,SAC,0.0003,,256.0,0.99,,auto,,30000.0,0.005,120000,0.005882,-0.178872,-0.230759,0.134835,
5,2026-01-05T21:54:36.043960,A2C,0.002,256.0,,0.99,0.95,0.01,0.5,,,600000,-0.053759,-0.547528,-0.312315,0.152977,
6,2026-01-06T03:23:30.536904,A2C,0.002,256.0,,0.99,0.95,0.01,0.5,,,600000,0.009636,0.0,0.0,0.0,0.0
7,2026-01-06T03:24:17.964262,DDPG,0.00015,,256.0,0.99,,,,50000.0,0.001,150000,0.008788,0.0,0.0,0.0,0.0
8,2026-01-06T03:24:32.043354,SAC,0.0003,,256.0,0.99,,auto,,30000.0,0.005,150000,0.008648,0.0,0.0,0.0,0.0
9,2026-01-06T03:54:43.486458,PPO,0.0002,1024.0,256.0,0.99,0.98,0.001,0.5,,,600000,0.009636,0.0,0.0,0.0,0.0
