# 04 - RL Training

Train Reinforcement Learning agents for trading.

In [None]:
import sys
sys.path.insert(0, '../src')

import pandas as pd
import numpy as np
import yaml
from pathlib import Path

from rl import TradingEnvironment, RLTrainer, RewardFunctions
from rl.environment import FlatActionWrapper

In [None]:
# Load the project configuration and echo the RL environment settings.
config_path = Path('../config/default.yaml')
# Pin the encoding: without it open() falls back to the platform locale,
# which can mis-decode non-ASCII characters in the YAML file.
with open(config_path, encoding='utf-8') as f:
    config = yaml.safe_load(f)

print("RL Configuration:")
rl_config = config['rl']
# These three keys are the only parts of the RL section read in this cell.
print(f"  Observation features: {rl_config['environment']['observation_features']}")
print(f"  Reward function: {rl_config['environment']['reward_function']}")
print(f"  Episode length: {rl_config['environment']['episode_length']}")

In [None]:
# Load the processed feature frame for the configured symbol, or fall back to
# a reproducible synthetic frame so the rest of the notebook can still run.
SYMBOL = config['market']['symbol']
features_path = Path('../data/processed') / f'{SYMBOL}_features.parquet'

if features_path.exists():
    data = pd.read_parquet(features_path)
else:
    # Create synthetic data
    print("Creating synthetic data...")
    n = 10000
    np.random.seed(42)
    returns = np.random.randn(n) * 0.001
    prices = 50000 * np.exp(np.cumsum(returns))

    # Random draws happen in the same order as the columns are listed below,
    # so the underlying random stream matches the column order.
    open_ = prices * (1 + np.random.randn(n) * 0.0001)
    high = prices * (1 + np.abs(np.random.randn(n) * 0.0005))
    low = prices * (1 - np.abs(np.random.randn(n) * 0.0005))
    volume = np.random.randint(10, 100, n) * 0.1
    ofi = np.random.randn(n) * 0.3
    tfi = np.random.randn(n) * 0.3
    rsi = 50 + np.random.randn(n) * 15
    adx = 25 + np.random.randn(n) * 10
    regime = np.random.randint(0, 3, n)

    # The open is drawn independently of high/low, so it can land outside the
    # bar. Widen the bar so that low <= open, close <= high always holds.
    high = np.maximum(high, open_)
    low = np.minimum(low, open_)

    # RSI and ADX are bounded oscillators; Gaussian noise can push them out
    # of range, so clamp them to [0, 100].
    rsi = np.clip(rsi, 0, 100)
    adx = np.clip(adx, 0, 100)

    data = pd.DataFrame({
        'open': open_,
        'high': high,
        'low': low,
        'close': prices,
        'volume': volume,
        'ofi': ofi,
        'tfi': tfi,
        'rsi': rsi,
        'adx': adx,
        'atr': prices * 0.02,
        'regime': regime,
    }, index=pd.date_range(start='2024-01-01', periods=n, freq='1min'))

print(f"Data shape: {data.shape}")

## Create Environment

In [None]:
# Build the trading environment from the loaded config and feature frame,
# then show the spaces it exposes.
env = TradingEnvironment(config, data)

observation_space = env.observation_space
print(f"Observation space: {observation_space}")
action_space = env.action_space
print(f"Action space: {action_space}")

In [None]:
# Smoke-test the environment: reset it and inspect the first observation.
obs, info = env.reset()
initial_shape = obs.shape
print(f"Initial observation shape: {initial_shape}")
print(f"Initial observation: {obs}")

In [None]:
# Drive the environment with random actions for up to 100 steps and
# accumulate the reward, stopping early if the episode ends.

# Exclusive upper bound used when sampling each action component.
action_bounds = {
    'direction': 3,
    'position_size': 5,
    'sl_mult': 5,
    'tp_mult': 5,
}

total_reward = 0
for _ in range(100):
    # One integer per component, drawn in the key order above.
    action = {
        name: np.random.randint(0, bound)
        for name, bound in action_bounds.items()
    }
    obs, reward, terminated, truncated, info = env.step(action)
    total_reward += reward

    episode_over = terminated or truncated
    if episode_over:
        break

print(f"Total reward after random actions: {total_reward:.4f}")
print(f"Episode stats: {env.get_episode_stats()}")

## Train RL Agent

In [None]:
# Probe for stable-baselines3 and, when it is importable, build the training
# environment and the trainer. Later cells are gated on SB3_AVAILABLE.
try:
    from stable_baselines3 import PPO
except ImportError:
    SB3_AVAILABLE = False
    print("stable-baselines3 not installed. Install with:")
    print("  pip install stable-baselines3")
else:
    SB3_AVAILABLE = True

if SB3_AVAILABLE:
    # A separate environment instance is used for training.
    train_env = TradingEnvironment(config, data)
    # NOTE(review): wrapped_env is created here but the trainer below is
    # given the unwrapped train_env. Confirm whether RLTrainer is meant to
    # receive the wrapped environment instead.
    wrapped_env = FlatActionWrapper(train_env)

    trainer = RLTrainer(config, train_env)

In [None]:
if SB3_AVAILABLE:
    # Short PPO run for demonstration purposes.
    print("Training PPO agent...")
    trainer.create_agent('ppo')

    # Increase total_timesteps for real training.
    train_kwargs = {'total_timesteps': 5000}
    results = trainer.train('ppo', **train_kwargs)

    print(f"Training results: {results}")

In [None]:
if SB3_AVAILABLE:
    # Score the trained PPO agent over five episodes on a fresh environment.
    # NOTE(review): eval_env is built from the same `data` used for training,
    # so this looks like an in-sample evaluation. Confirm this is intended.
    eval_env = TradingEnvironment(config, data)
    eval_results = trainer.evaluate('ppo', eval_env, n_episodes=5)

    print("Evaluation results:")
    # Each metric is printed with four decimal places.
    for metric, value in eval_results.items():
        print(f"  {metric}: {value:.4f}")

In [None]:
if SB3_AVAILABLE:
    # Persist the PPO model, creating the target directory if needed.
    model_dir = Path('../models/rl')
    model_dir.mkdir(parents=True, exist_ok=True)

    model_path = model_dir / 'ppo_trading'
    trainer.save('ppo', model_path)
    print(f"Saved model to {model_path}")

## Test Reward Functions

In [None]:
# Compare the reward helpers on a small fixed return series:
# an eight-value pattern repeated five times.
return_pattern = [0.01, -0.005, 0.02, 0.015, -0.01, 0.008, 0.005, -0.003]
returns = return_pattern * 5

print("Reward function comparisons:")

sharpe = RewardFunctions.sharpe_reward(returns)
print(f"  Sharpe reward: {sharpe:.4f}")

sortino = RewardFunctions.sortino_reward(returns)
print(f"  Sortino reward: {sortino:.4f}")

pnl = RewardFunctions.pnl_reward(100, 10000)
print(f"  P&L reward (100 pnl, 10000 balance): {pnl:.4f}")

risk_adjusted = RewardFunctions.risk_adjusted_reward(100, 5)
print(f"  Risk-adjusted (100 pnl, 5% dd): {risk_adjusted:.4f}")

In [None]:
# Show the asymmetric reward for a gain and a loss of equal size.
print("\nAsymmetric reward (2x loss penalty):")

win_reward = RewardFunctions.asymmetric_reward(100)
print(f"  Win $100: {win_reward:.2f}")

loss_reward = RewardFunctions.asymmetric_reward(-100)
print(f"  Lose $100: {loss_reward:.2f}")

## Notes for Colab Training

For longer training runs, use Google Colab:

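1. Upload this notebook to Colab, together with the project's `src/` and `config/` directories (the notebook imports from `../src` and reads `../config/default.yaml`)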
2. Enable GPU runtime
3. Install dependencies:
   ```
   !pip install stable-baselines3 gymnasium
   ```
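4. Train with more timesteps (100k+) by raising `total_timesteps` in the training cell
5. Download the trained model (the save cell writes it to `../models/rl/ppo_trading`)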