In [255]:
import sys
import os

# Banner describing the interpreter and environment this notebook is running in.
banner_rule = "=" * 70
environment_report = [
    banner_rule,
    "NOTEBOOK ENVIRONMENT INFORMATION",
    banner_rule,
    f"Python executable: {sys.executable}",
    f"Python version: {sys.version}",
    f"Current working directory: {os.getcwd()}",
    # The last path component of sys.prefix is used as the environment name.
    f"Virtual environment: {os.path.basename(sys.prefix)}",
    banner_rule,
]
print("\n".join(environment_report))

import yfinance as yf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import quantstats as qs
import pypfopt as ppo
import torch as th

from stable_baselines3.common.env_checker import check_env
from stable_baselines3 import A2C, PPO, DDPG, SAC
from stable_baselines3.common.noise import OrnsteinUhlenbeckActionNoise
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize

from typing import Tuple
from stable_baselines3.common.policies import BasePolicy

import onnx
import onnxruntime as ort
import onnxscript

# Report the versions of the core numeric libraries and whether PyTorch can see a CUDA device.
print("\n✓ All imports successful!")
version_lines = (
    f"NumPy: {np.__version__}",
    f"Pandas: {pd.__version__}",
    f"PyTorch: {th.__version__}",
)
for version_line in version_lines:
    print(version_line)

print(f"CUDA available: {th.cuda.is_available()}")
# Only query the device name when CUDA is actually usable.
print(
    f"CUDA device: {th.cuda.get_device_name(0)}"
    if th.cuda.is_available()
    else "Running on CPU (slower than GPU)"
)

NOTEBOOK ENVIRONMENT INFORMATION
Python executable: c:\Users\abejr\OneDrive\Uniworks\PW25\Thesis\rl_portfolio\.venv_fresh\Scripts\python.exe
Python version: 3.13.5 (tags/v3.13.5:6cb20a2, Jun 11 2025, 16:15:46) [MSC v.1943 64 bit (AMD64)]
Current working directory: c:\Users\abejr\OneDrive\Uniworks\PW25\Thesis\rl_portfolio
Virtual environment: .venv_fresh

✓ All imports successful!
NumPy: 2.4.0
Pandas: 2.3.3
PyTorch: 2.9.1+cpu
CUDA available: False
Running on CPU (slower than GPU)


In [256]:
import importlib
import sys

# Drop any cached copies of the project modules so the imports below
# re-execute the current source files instead of reusing stale module objects.
for cached_module_name in ('utils', 'models', 'portfolio_env'):
    sys.modules.pop(cached_module_name, None)

import utils
import models
import portfolio_env

# Reaching this line means all three imports completed without raising.
print("\n✓ Modules imported successfully!")

✓ Hyperparameter tracking initialized!

✓ Modules imported successfully!


# Data Loading & Preprocessing

## Data Extraction

In [257]:
# Date range and asset universe for the whole study.
start = '2010-01-01'
end = '2024-12-31'
# tickers = ['SPY','QQQ','IWM','AGG','TLT','IEF','SHY','GLD','UUP','FXE','FXY','FXB']
tickers = ['SPY', 'TLT', 'GLD', 'FXY']

# One download for all tickers; group_by='ticker' makes the ticker the outer column level.
raw_data = yf.download(tickers, start=start, end=end, group_by='ticker')

# Keep only the close price of each ticker, one column per ticker in `tickers` order.
data = pd.DataFrame({symbol: raw_data[symbol]['Close'] for symbol in tickers})

[*********************100%***********************]  4 of 4 completed


## Exploratory Data Analysis

In [258]:
# Pairwise correlation matrix of the closing-price levels held in `data`.
# NOTE(review): correlations of price levels are often dominated by shared long-run
# trends; if co-movement of returns is what is wanted, data.pct_change().corr()
# would be the usual choice — confirm intent before changing the reported table.
data.corr()

Unnamed: 0,SPY,TLT,GLD,FXY
SPY,1.0,0.475117,0.725497,-0.818465
TLT,0.475117,1.0,0.199021,-0.383163
GLD,0.725497,0.199021,1.0,-0.307484
FXY,-0.818465,-0.383163,-0.307484,1.0


In [259]:
# Export the close-price table so later analysis can reuse it without re-downloading.
# DataFrame.to_csv does not create missing parent directories, so make sure the
# output folder exists first (no-op when it is already there).
os.makedirs('tables', exist_ok=True)
data.to_csv('tables/price_data.csv')

# Benchmark Model

## Buy and Hold

In [260]:
# Buy-and-hold benchmark: evaluated from 2022-01-01 with an initial capital of 100,000.
# NOTE(review): utils.buy_and_hold is defined outside this notebook; how the capital is
# split across tickers is not visible here — confirm in utils.py.
buy_and_hold_results = utils.buy_and_hold(price_data=data, port_initial_date='2022-01-01', initial_capital=100000)
# Shown as the cell output (a preformatted percentage string in the recorded run).
buy_and_hold_results['annualized_return']

'-0.51%'

In [261]:
# Tabulate every benchmark field except the weights and persist it.
# Note: `buy_and_hold_results` is rebound here from the raw result mapping to a DataFrame.
holding_fields = {}
for field_name, field_value in buy_and_hold_results.items():
    if field_name == 'current_weights':
        continue
    holding_fields[field_name] = field_value

buy_and_hold_results = pd.DataFrame(holding_fields)
buy_and_hold_results.to_csv('tables/holding_portfolio_results.csv')

## Mean-Variance Optimization

In [262]:
# Mean-variance benchmark: rebalanced portfolio in 'max_sharpe' mode with weights bounded to [0, 1].
# NOTE(review): the variable name says "1y" but lookback_period=21 is passed; if the unit is
# trading days this is roughly one month, not one year — confirm which one is intended.
# NOTE(review): port_initial_date is '2022-01-04' here while the buy-and-hold benchmark and the
# RL split use '2022-01-01'; confirm the evaluation windows are meant to differ.
MVO_1y_lookback = utils.rebalance_portfolio(price_data=data,
                                           port_initial_date='2022-01-04',
                                           lookback_period=21,
                                           bounds=(0,1),
                                           mode='max_sharpe')

# Shown as the cell output.
MVO_1y_lookback['annualized_return']

'-8.25%'

In [263]:
# Tabulate every MVO result field except the weights and persist it.
# Note: `MVO_1y_lookback` is rebound here from the raw result mapping to a DataFrame.
mvo_fields = {}
for field_name, field_value in MVO_1y_lookback.items():
    if field_name == 'current_weights':
        continue
    mvo_fields[field_name] = field_value

MVO_1y_lookback = pd.DataFrame(mvo_fields)
MVO_1y_lookback.to_csv('tables/mvo_portfolio_results.csv')

# Reinforcement Learning

## Environment Building and Testing

In [264]:
# Build one stand-alone environment instance and run the Stable-Baselines3
# environment checker on it before any training happens.
env_settings = dict(
    price_data=data,
    lookback_period=252,
    initial_capital=1000000,
    bounds=(0, 1),
    transaction_cost=0.0015,
    risk_aversion=0.05,
    reward_scale=0.1,
)
env = portfolio_env.PortfolioEnv(**env_settings)
check_env(env)



In [265]:
# Build the train/test environment pair that every RL agent below trains and evaluates on.
# NOTE(review): presumably data before port_initial_date is used for training and the rest
# for testing, and both are wrapped for normalization (the training cells copy
# obs_rms/ret_rms between them) — confirm in models.py.
# NOTE(review): lookback_period=21 here differs from the 252 passed to PortfolioEnv for check_env.
train_env, test_env = models.split_build_normalize_env(price_data=data,
                                                       port_initial_date='2022-01-01',
                                                       lookback_period=21)

## A2C

### A2C - Training

In [278]:
# Train an A2C agent on the normalized training environment and save it.
policy_kwargs = {"log_std_init": -1.0}  # More reasonable initial policy std

# Hyperparameters live in one dict so the testing cell can log exactly what was used.
# Keys set to None are not A2C arguments; presumably they only keep the log columns
# uniform across algorithms — confirm against utils.log_hpt_results.
a2c_hyperparams = {
    'learning_rate': 0.0005,  # REDUCED from 0.01 - too high caused instability
    'n_steps': 256,  # INCREASED from 64 - more stable trajectory sampling
    'batch_size': None,
    'gamma': 0.99,
    'gae_lambda': 0.95,  # INCREASED from 0.90 - better long-term credit assignment
    'ent_coef': 0.05,  # INCREASED from 0.01 - more exploration for pattern discovery
    'vf_coef': 0.5,  # REDUCED from 0.75 - balance policy and value learning
    'buffer_size': None,
    'tau': None,
}

a2c_model = A2C(
    policy="MlpPolicy",
    env=train_env,
    learning_rate=a2c_hyperparams['learning_rate'],
    n_steps=a2c_hyperparams['n_steps'],
    gamma=a2c_hyperparams['gamma'],
    gae_lambda=a2c_hyperparams['gae_lambda'],
    ent_coef=a2c_hyperparams['ent_coef'],
    vf_coef=a2c_hyperparams['vf_coef'],
    verbose=1,
    seed=42,
    policy_kwargs=policy_kwargs
)

# 600k environment steps; keep in sync with the total_timesteps logged in the testing cell.
a2c_model.learn(total_timesteps=600000)

# Hand the test environment a *snapshot* of the training normalization statistics.
# Assigning the objects directly would share them by reference, so any later update
# (e.g. training the next agent on train_env) would silently change the statistics
# used for evaluation as well.
import copy
test_env.obs_rms = copy.deepcopy(train_env.obs_rms)
test_env.ret_rms = copy.deepcopy(train_env.ret_rms)

# save the model
a2c_model.save("model/a2c_portfolio_model.zip")
print("✓ A2C model trained and saved")

Using cpu device
------------------------------------
| time/                 |          |
|    fps                | 711      |
|    iterations         | 100      |
|    time_elapsed       | 35       |
|    total_timesteps    | 25600    |
| train/                |          |
|    entropy_loss       | -1.96    |
|    explained_variance | -0.0205  |
|    learning_rate      | 0.0005   |
|    n_updates          | 99       |
|    policy_loss        | -0.661   |
|    std                | 0.395    |
|    value_loss         | 0.204    |
------------------------------------
------------------------------------
| time/                 |          |
|    fps                | 740      |
|    iterations         | 200      |
|    time_elapsed       | 69       |
|    total_timesteps    | 51200    |
| train/                |          |
|    entropy_loss       | -2.15    |
|    explained_variance | 0.611    |
|    learning_rate      | 0.0005   |
|    n_updates          | 199      |
|    policy_loss     

### A2C - Testing

In [279]:
# Reload the saved A2C agent and evaluate it on the test environment.
a2c_model_loaded = A2C.load("model/a2c_portfolio_model.zip")
a2c_results = models.evaluate_model_sb3(a2c_model_loaded, test_env, num_episodes=10)

print(f"A2C Annualized Return: {a2c_results['annualized_return']:.4f}")

# log results to hyperparameter tracking
# (total_timesteps must match the value passed to a2c_model.learn in the training cell)
utils.log_hpt_results(
    model_name='A2C',
    hyperparams=a2c_hyperparams,
    total_timesteps=600000,
    eval_results=a2c_results
)

# save scalar results only: arrays AND pandas objects (such as 'portfolio_df') are
# excluded, otherwise the DataFrame would be written into a single CSV cell as its repr
a2c_results_df = pd.DataFrame({
    k: [v]
    for k, v in a2c_results.items()
    if not isinstance(v, (np.ndarray, pd.DataFrame, pd.Series))
})
a2c_results_df.to_csv('tables/a2c_portfolio_results.csv', index=False)

# save portfolio values in a separate csv file
a2c_results['portfolio_df'].to_csv('tables/a2c_portfolio_values.csv', index=False)

A2C Annualized Return: -0.0069
✓ Logged A2C results to tables/hpt_log.csv


## PPO

### PPO - Training

In [None]:
# Train a PPO agent on the normalized training environment and save it.
policy_kwargs = {
    "log_std_init": -0.5,  # INCREASED from -1.0 - allow higher initial exploration
    "net_arch": {"pi": [256, 256], "vf": [256, 256]},  # INCREASED from [128,128] - larger networks for complex task
}

# Hyperparameters live in one dict so the testing cell can log exactly what was used.
ppo_hyperparams = {
    'learning_rate': 0.0003,  # REDUCED from 0.0005 - more stable gradient descent
    'n_steps': 512,  # INCREASED from 256 - collect more diverse trajectories
    'batch_size': 32,  # REDUCED from 64 - smaller batches for better gradient estimates
    'gamma': 0.99,
    'gae_lambda': 0.98,  # INCREASED from 0.94 - capture long-term dependencies
    'ent_coef': 0.02,  # INCREASED from 0.005 - encourage exploration of trading strategies
    'vf_coef': 0.5,  # REDUCED from 0.75 - balance policy and value learning
}

ppo_model = PPO(
    policy="MlpPolicy",
    env=train_env,
    learning_rate=ppo_hyperparams['learning_rate'],
    n_steps=ppo_hyperparams['n_steps'],
    batch_size=ppo_hyperparams['batch_size'],
    gamma=ppo_hyperparams['gamma'],
    gae_lambda=ppo_hyperparams['gae_lambda'],
    clip_range=0.1,  # REDUCED from 0.2 - more conservative policy updates
    ent_coef=ppo_hyperparams['ent_coef'],
    # Pass vf_coef explicitly so the logged value is the one actually used
    # (previously it was logged but the constructor fell back to its default).
    vf_coef=ppo_hyperparams['vf_coef'],
    n_epochs=20,  # INCREASED from 10 - better optimization per batch
    max_grad_norm=0.5,
    verbose=1,
    seed=42,
    policy_kwargs=policy_kwargs
)

# 600k environment steps; keep in sync with the total_timesteps logged in the testing cell.
ppo_model.learn(total_timesteps=600000)

# Hand the test environment a *snapshot* of the training normalization statistics.
# Assigning the objects directly would share them by reference, so any later update
# to train_env's statistics would silently change the evaluation normalization too.
import copy
test_env.obs_rms = copy.deepcopy(train_env.obs_rms)
test_env.ret_rms = copy.deepcopy(train_env.ret_rms)

# save the model
ppo_model.save("model/ppo_portfolio_model.zip")
print("✓ PPO model trained and saved")

Using cpu device
----------------------------
| time/              |     |
|    fps             | 548 |
|    iterations      | 1   |
|    time_elapsed    | 0   |
|    total_timesteps | 512 |
----------------------------
---------------------------------------
| time/                   |           |
|    fps                  | 229       |
|    iterations           | 2         |
|    time_elapsed         | 4         |
|    total_timesteps      | 1024      |
| train/                  |           |
|    approx_kl            | 0.1020763 |
|    clip_fraction        | 0.758     |
|    clip_range           | 0.1       |
|    entropy_loss         | -3.7      |
|    explained_variance   | -4.7      |
|    learning_rate        | 0.0003    |
|    loss                 | -0.151    |
|    n_updates            | 20        |
|    policy_gradient_loss | -0.0645   |
|    std                  | 0.619     |
|    value_loss           | 0.105     |
---------------------------------------
--------------------

### PPO - Testing

In [None]:
# load and evaluate the model on the test environment
ppo_model_loaded = PPO.load("model/ppo_portfolio_model.zip")
ppo_results = models.evaluate_model_sb3(ppo_model_loaded, test_env, num_episodes=10)

print(f"PPO Annualized Return: {ppo_results['annualized_return']:.4f}")

# log results to hyperparameter tracking
# (total_timesteps must match the value passed to ppo_model.learn in the training cell)
utils.log_hpt_results(
    model_name='PPO',
    hyperparams=ppo_hyperparams,
    total_timesteps=600000,
    eval_results=ppo_results
)

# save scalar results only: arrays AND pandas objects (such as 'portfolio_df') are
# excluded, otherwise the DataFrame would be written into a single CSV cell as its repr
ppo_results_df = pd.DataFrame({
    k: [v]
    for k, v in ppo_results.items()
    if not isinstance(v, (np.ndarray, pd.DataFrame, pd.Series))
})
ppo_results_df.to_csv('tables/ppo_portfolio_results.csv', index=False)

# save portfolio values in a separate csv file
ppo_results['portfolio_df'].to_csv('tables/ppo_portfolio_values.csv', index=False)

PPO Annualized Return: -0.0256
✓ Logged PPO results to tables/hpt_log.csv


## DDPG

### DDPG - Training

In [None]:
# Train a DDPG agent on the normalized training environment and save it.
# The action-noise vector needs one entry per action dimension.
n_actions = train_env.action_space.shape[-1]

# Zero-mean Ornstein-Uhlenbeck exploration noise added to the deterministic policy's actions.
action_noise = OrnsteinUhlenbeckActionNoise(
    mean=np.zeros(n_actions),
    sigma=0.1 * np.ones(n_actions)  # REDUCED from 0.25 - avoid erratic trading actions
)

# Hyperparameters live in one dict so the testing cell can log exactly what was used.
# Keys set to None are not DDPG arguments; presumably they only keep the log columns
# uniform across algorithms — confirm against utils.log_hpt_results.
ddpg_hyperparams = {
    'learning_rate': 1e-4,  # REDUCED from 3e-4 - too high caused divergence
    'n_steps': None,
    'batch_size': 128,  # INCREASED from 64 - more stable batch gradients
    'gamma': 0.99,
    'gae_lambda': None,
    'ent_coef': None,
    'vf_coef': None,
    'buffer_size': 200000,  # INCREASED from 100k - richer experience replay for stability
    'tau': 0.005,  # REDUCED from 0.015 - slower, more stable target network updates
}

ddpg_model = DDPG(
    policy="MlpPolicy",
    env=train_env,
    learning_rate=ddpg_hyperparams['learning_rate'],
    buffer_size=ddpg_hyperparams['buffer_size'],
    learning_starts=10000,  # INCREASED from 5k - more random exploration before learning
    batch_size=ddpg_hyperparams['batch_size'],
    tau=ddpg_hyperparams['tau'],
    gamma=ddpg_hyperparams['gamma'],
    action_noise=action_noise,
    verbose=1,
    seed=42
)

# 150k environment steps; keep in sync with the total_timesteps logged in the testing cell.
ddpg_model.learn(total_timesteps=150000)

# Hand the test environment a *snapshot* of the training normalization statistics.
# Assigning the objects directly would share them by reference, so any later update
# to train_env's statistics would silently change the evaluation normalization too.
import copy
test_env.obs_rms = copy.deepcopy(train_env.obs_rms)
test_env.ret_rms = copy.deepcopy(train_env.ret_rms)

# save the model
ddpg_model.save("model/ddpg_portfolio_model.zip")
print("✓ DDPG model trained and saved")

Using cpu device
---------------------------------
| time/              |          |
|    episodes        | 4        |
|    fps             | 77       |
|    time_elapsed    | 155      |
|    total_timesteps | 11992    |
| train/             |          |
|    actor_loss      | -0.234   |
|    critic_loss     | 0.00104  |
|    learning_rate   | 0.0003   |
|    n_updates       | 6991     |
---------------------------------
---------------------------------
| time/              |          |
|    episodes        | 8        |
|    fps             | 57       |
|    time_elapsed    | 414      |
|    total_timesteps | 23984    |
| train/             |          |
|    actor_loss      | -0.692   |
|    critic_loss     | 0.00035  |
|    learning_rate   | 0.0003   |
|    n_updates       | 18983    |
---------------------------------
---------------------------------
| time/              |          |
|    episodes        | 12       |
|    fps             | 55       |
|    time_elapsed    | 653     

### DDPG - Testing

In [None]:
# load and evaluate the model on the test environment
ddpg_model_loaded = DDPG.load("model/ddpg_portfolio_model.zip")
ddpg_results = models.evaluate_model_sb3(ddpg_model_loaded, test_env, num_episodes=10)

print(f"DDPG Annualized Return: {ddpg_results['annualized_return']:.4f}")

# log results to hyperparameter tracking
# (total_timesteps must match the value passed to ddpg_model.learn in the training cell)
utils.log_hpt_results(
    model_name='DDPG',
    hyperparams=ddpg_hyperparams,
    total_timesteps=150000,
    eval_results=ddpg_results
)

# save scalar results only: arrays AND pandas objects (such as 'portfolio_df') are
# excluded, otherwise the DataFrame would be written into a single CSV cell as its repr
ddpg_results_df = pd.DataFrame({
    k: [v]
    for k, v in ddpg_results.items()
    if not isinstance(v, (np.ndarray, pd.DataFrame, pd.Series))
})
ddpg_results_df.to_csv('tables/ddpg_portfolio_results.csv', index=False)

# save portfolio values in a separate csv file
ddpg_results['portfolio_df'].to_csv('tables/ddpg_portfolio_values.csv', index=False)

DDPG Annualized Return: -0.0047
✓ Logged DDPG results to tables/hpt_log.csv


## SAC

### SAC - Training

In [None]:
# Train a SAC agent on the normalized training environment and save it.
# Hyperparameters live in one dict so the testing cell can log exactly what was used.
# Keys set to None are not SAC arguments; presumably they only keep the log columns
# uniform across algorithms — confirm against utils.log_hpt_results.
sac_hyperparams = {
    'learning_rate': 3e-4,  # INCREASED from 1e-4 - too low hindered learning
    'n_steps': None,
    'batch_size': 64,  # REDUCED from 128 - better gradient estimates from smaller batches
    'gamma': 0.99,
    'gae_lambda': None,
    # Automatic entropy tuning. This entry previously held 0.2 while the model was
    # built with 'auto', so the tracking log recorded a value that was never used.
    # To seed the automatic tuning with an initial coefficient of 0.2, use 'auto_0.2'.
    'ent_coef': 'auto',
    'vf_coef': None,
    'buffer_size': 150000,  # INCREASED from 100k - richer replay buffer for diverse experiences
    'tau': 0.01,  # KEPT at 0.01 - good balance for SAC
}

sac_model = SAC(
    policy="MlpPolicy",
    env=train_env,
    learning_rate=sac_hyperparams['learning_rate'],
    buffer_size=sac_hyperparams['buffer_size'],
    learning_starts=10000,  # INCREASED from 5k - more random exploration before learning
    batch_size=sac_hyperparams['batch_size'],
    gamma=sac_hyperparams['gamma'],
    tau=sac_hyperparams['tau'],
    ent_coef=sac_hyperparams['ent_coef'],  # single source of truth shared with the log
    verbose=1,
    seed=42
)

sac_model.learn(total_timesteps=300000)  # INCREASED from 200k - needs more training for convergence

# Hand the test environment a *snapshot* of the training normalization statistics.
# Assigning the objects directly would share them by reference, so any later update
# to train_env's statistics would silently change the evaluation normalization too.
import copy
test_env.obs_rms = copy.deepcopy(train_env.obs_rms)
test_env.ret_rms = copy.deepcopy(train_env.ret_rms)

# save the model
sac_model.save("model/sac_portfolio_model.zip")
print("✓ SAC model trained and saved")

Using cpu device
---------------------------------
| time/              |          |
|    episodes        | 4        |
|    fps             | 61       |
|    time_elapsed    | 194      |
|    total_timesteps | 11992    |
| train/             |          |
|    actor_loss      | 0.974    |
|    critic_loss     | 0.000597 |
|    ent_coef        | 0.01     |
|    learning_rate   | 0.0001   |
|    n_updates       | 6991     |
---------------------------------
---------------------------------
| time/              |          |
|    episodes        | 8        |
|    fps             | 40       |
|    time_elapsed    | 589      |
|    total_timesteps | 23984    |
| train/             |          |
|    actor_loss      | 1.19     |
|    critic_loss     | 0.000446 |
|    ent_coef        | 0.01     |
|    learning_rate   | 0.0001   |
|    n_updates       | 18983    |
---------------------------------
---------------------------------
| time/              |          |
|    episodes        | 12      

### SAC - Testing

In [None]:
# load and evaluate the model on the test environment
sac_model_loaded = SAC.load("model/sac_portfolio_model.zip")
sac_results = models.evaluate_model_sb3(sac_model_loaded, test_env, num_episodes=10)

print(f"SAC Annualized Return: {sac_results['annualized_return']:.4f}")

# log results to hyperparameter tracking
# (total_timesteps must match the value passed to sac_model.learn in the training cell)
utils.log_hpt_results(
    model_name='SAC',
    hyperparams=sac_hyperparams,
    total_timesteps=300000,
    eval_results=sac_results
)

# save scalar results only: arrays AND pandas objects (such as 'portfolio_df') are
# excluded, otherwise the DataFrame would be written into a single CSV cell as its repr
sac_results_df = pd.DataFrame({
    k: [v]
    for k, v in sac_results.items()
    if not isinstance(v, (np.ndarray, pd.DataFrame, pd.Series))
})
sac_results_df.to_csv('tables/sac_portfolio_results.csv', index=False)

# save portfolio values in a separate csv file
sac_results['portfolio_df'].to_csv('tables/sac_portfolio_values.csv', index=False)

SAC Annualized Return: -0.0078
✓ Logged SAC results to tables/hpt_log.csv


In [None]:
# Load the accumulated tuning log from disk and show it under a banner.
hpt_log_df = pd.read_csv(utils.hpt_log_file)

section_rule = "="*100
for banner_line in ("\n" + section_rule, "HYPERPARAMETER TUNING LOG", section_rule):
    print(banner_line)

# Rich (HTML) rendering of the log table.
display(hpt_log_df)


HYPERPARAMETER TUNING LOG



HYPERPARAMETER TUNING LOG


Unnamed: 0,timestamp,model_name,learning_rate,n_steps,batch_size,gamma,gae_lambda,ent_coef,vf_coef,buffer_size,tau,total_timesteps,annualized_return,sharpe_ratio,max_drawdown,annualized_volatility,sortino_ratio
0,2026-01-05T06:16:59.242639,A2C,0.0010,512.0,,0.99,0.99,0.0001,0.50,,,800000,-0.026862,-0.569022,-0.234634,0.099929,
1,2026-01-05T06:19:05.792765,A2C,0.0010,512.0,,0.99,0.99,0.0001,0.50,,,800000,-0.026862,-0.569022,-0.234634,0.099929,
2,2026-01-05T06:48:51.639838,PPO,0.0005,2048.0,512.0,0.99,0.99,0,0.50,,,600000,-0.066686,-0.930306,-0.281837,0.103929,
3,2026-01-05T10:10:46.333540,DDPG,0.0001,,256.0,0.99,,,,30000.0,0.005,120000,0.058895,0.221094,-0.228421,0.130692,
4,2026-01-05T13:53:13.919015,SAC,0.0003,,256.0,0.99,,auto,,30000.0,0.005,120000,0.005882,-0.178872,-0.230759,0.134835,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62,2026-01-08T01:28:49.260382,A2C,0.0100,64.0,,0.99,0.90,0.01,0.75,,,600000,-0.021919,-0.516765,-0.248959,0.100469,-0.868668
63,2026-01-08T01:29:09.386759,A2C,0.0100,64.0,,0.99,0.90,0.01,0.75,,,600000,-0.021919,-0.516765,-0.248959,0.100469,-0.868668
64,2026-01-08T02:04:35.759958,A2C,0.0100,64.0,,0.99,0.90,0.01,0.75,,,600000,-0.005133,-0.352328,-0.221312,0.099716,-0.584979
65,2026-01-08T02:32:36.782035,PPO,0.0005,256.0,64.0,0.99,0.94,0.005,0.75,,,600000,-0.025555,-0.557747,-0.244251,0.099606,-0.917270


## 📦 Saved Models Summary

All models have been trained and saved successfully:

| Model | File Path | Timesteps | Annualized Return (test) |
|-------|-----------|-----------|--------------------------|
| **A2C** | `model/a2c_portfolio_model.zip` | 600,000 | -0.69% |
| **PPO** | `model/ppo_portfolio_model.zip` | 600,000 | -2.56% |
| **DDPG** | `model/ddpg_portfolio_model.zip` | 150,000 | -0.47% |
| **SAC** | `model/sac_portfolio_model.zip` | 300,000 | -0.78% |

**Environment Configuration:**
- Lookback period: 21 days (daily rebalancing strategy)
- Features: 5 per asset (Returns, RSI, MACD, ATR, ADX)
- Observation space: 424 dimensions
- Assets: SPY, TLT, GLD, FXY

**To reload models:**
```python
from stable_baselines3 import A2C, PPO, DDPG, SAC

# Example: Load A2C model
a2c_model_loaded = A2C.load("model/a2c_portfolio_model.zip")

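# Use for prediction. The observation must be normalized with the training
# statistics (e.g. taken from `test_env` after obs_rms/ret_rms were copied over).
action, _ = a2c_model_loaded.predict(observation, deterministic=True)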
```

**Evaluation Results:**
- All models now calculate: annualized_return, sharpe_ratio, max_drawdown, annualized_volatility, sortino_ratio
- Portfolio value history tracked for visualization
- Detailed hyperparameter logs in `tables/hpt_log.csv`