In [None]:
!pip install stable-baselines3[extra] sb3-contrib gymnasium --quiet

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/93.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m93.2/93.2 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/187.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m187.2/187.2 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
def calculate_adjusted_price(symbol, df):
    """Back-adjust raw prices and volumes for stock splits and cash dividends.

    Every row *before* a corporate action is rescaled so the series is
    continuous across the event. The row of the event itself already trades
    on the new basis and is therefore left untouched.

    Args:
        symbol: Ticker label. Not used by the calculation; kept so existing
            callers keep working.
        df: Chronologically sorted frame with ``Close``, ``Volume``,
            ``Stock Splits`` (e.g. 2.0 for a 2-for-1 split, 0 for "none") and
            ``Dividends`` (cash per share, 0 for "none") columns.
            ``Close``/``Volume`` are assumed to be raw, i.e. not already
            adjusted by the data provider.
            NOTE(review): yfinance's ``Ticker.history`` adjusts prices by
            default (``auto_adjust=True``) -- confirm the input really is
            unadjusted, otherwise every event is applied twice.

    Returns:
        A copy of ``df`` (the input is not modified) with three extra columns:
        ``AdjFactor`` (cumulative price multiplier), ``AdjClose``
        (``Close * AdjFactor``) and ``AdjVolume`` (volume restated in
        post-split share units).
    """
    out = df.copy()

    close = out['Close'].astype(float)
    splits = out['Stock Splits'].fillna(0.0).astype(float)
    dividends = out['Dividends'].fillna(0.0).astype(float)

    def _factor_for_earlier_rows(event_factor):
        # Product of the event factors on all *later* rows: reverse cumulative
        # product (which includes the row itself), then shift one row up so
        # the event row is excluded. The last row has nothing after it -> 1.0.
        inclusive = event_factor.iloc[::-1].cumprod().iloc[::-1]
        return inclusive.shift(-1, fill_value=1.0)

    # A k-for-1 split divides all earlier prices by k.
    split_event = 1.0 / splits.where(splits > 0, 1.0)

    # A dividend D scales earlier prices by (P_prev - D) / P_prev, where P_prev
    # is the last close *before* the ex-dividend row. Rows without a usable
    # previous close (first row, non-positive result) contribute a factor of 1.
    prev_close = close.shift(1)
    usable = (dividends > 0) & (prev_close > dividends)
    dividend_event = (1.0 - dividends / prev_close).where(usable, 1.0)

    split_factor = _factor_for_earlier_rows(split_event)
    dividend_factor = _factor_for_earlier_rows(dividend_event)

    out['AdjFactor'] = split_factor * dividend_factor
    out['AdjClose'] = close * out['AdjFactor']
    # Share counts move inversely to prices on a split and are unaffected by
    # dividends, so only the split factor applies (as a divisor).
    out['AdjVolume'] = out['Volume'] / split_factor
    return out


In [None]:
import yfinance as yf
import pandas as pd

START_DATE = "2010-01-04"
END_DATE = "2025-09-01"

FEATURE_COLUMNS = [
    "EREGL_Volume", "EREGL_AdjClose",
    "SAHOL_Volume", "SAHOL_AdjClose",
    "KOC_Volume", "KOC_AdjClose",
    "XU100_Close", "USDTRY_Close",
]


def _download_history(ticker, drop_first_row=True):
    """Download daily history for ``ticker`` indexed by tz-naive calendar dates.

    ``auto_adjust=True`` (also yfinance's default for ``Ticker.history``) makes
    Yahoo return prices that are already adjusted for splits and dividends, so
    ``Close`` can be used as the adjusted close directly. Re-applying the
    ``Stock Splits`` / ``Dividends`` events on top of it would count every
    corporate action twice.
    """
    history = yf.Ticker(ticker).history(start=START_DATE, end=END_DATE, auto_adjust=True)
    if history.empty:
        raise ValueError(f"No price data returned for {ticker}")
    if drop_first_row:
        # Same as the original per-ticker ``.iloc[1:]``.
        # NOTE(review): reason for skipping the first bar is not documented -- confirm.
        history = history.iloc[1:].copy()
    # Drop the exchange time zone and the time of day so that all sources
    # join on the same calendar-date key.
    history.index = pd.to_datetime(history.index).tz_localize(None).normalize()
    return history


def _load_equity(ticker, prefix):
    """Return the full history of ``ticker`` with prefixed volume / adjusted-close columns."""
    return _download_history(ticker).rename(
        columns={"Volume": f"{prefix}_Volume", "Close": f"{prefix}_AdjClose"}
    )


def _load_close(ticker, column, drop_first_row=True):
    """Return only the close of ``ticker`` as a one-column frame named ``column``."""
    history = _download_history(ticker, drop_first_row=drop_first_row)
    return history[["Close"]].rename(columns={"Close": column})


# === Equities ===
df_stock = _load_equity("KCHOL.IS", "KOC")
df_sahol = _load_equity("SAHOL.IS", "SAHOL")
df_eregli = _load_equity("EREGL.IS", "EREGL")

# === BIST 100 index (Yahoo Finance ticker "XU100.IS") ===
df_xu100 = _load_close("XU100.IS", "XU100_Close")

# === USD/TRY exchange rate (the first row was never dropped for this series) ===
df_usdtry = _load_close("USDTRY=X", "USDTRY_Close", drop_first_row=False)

# === Merge all datasets on Date (KOC trading days are the reference calendar) ===
df_merged = (
    df_stock
    .join(df_sahol, how="left", rsuffix="_sahol")
    .join(df_eregli, how="left", rsuffix="_eregli")
    .join(df_xu100, how="left", rsuffix="_xu100")
    .join(df_usdtry, how="left", rsuffix="_usdtry")
)

# === Keep the model features and forward fill gaps (holidays, weekends) ===
# ``.ffill()`` replaces the deprecated ``fillna(method="ffill", inplace=True)``
# and returns a new frame instead of mutating a column slice.
df = df_merged[FEATURE_COLUMNS].ffill()

# === Save to CSV for reuse ===
df.to_csv("assets.csv")

print(df.head())
print(df.tail())


            EREGL_Volume  EREGL_AdjClose  SAHOL_Volume  SAHOL_AdjClose  \
2010-01-05      73491364        0.013520       5058266        2.207235   
2010-01-06      27708427        0.013580       4992228        2.169179   
2010-01-07      40402260        0.013342       2732948        2.169179   
2010-01-08      46893994        0.013342      20476268        2.226263   
2010-01-11      68805135        0.013520       8156529        2.169179   

            KOC_Volume  KOC_AdjClose  XU100_Close  USDTRY_Close  
2010-01-05    11165434      1.726188   541.148132        1.4727  
2010-01-06     3897700      1.733472   545.471130        1.4714  
2010-01-07     3662297      1.726188   549.726074        1.4715  
2010-01-08     5993209      1.726188   547.976074        1.4580  
2010-01-11     3878719      1.675204   539.208130        1.4502  
            EREGL_Volume  EREGL_AdjClose  SAHOL_Volume  SAHOL_AdjClose  \
2025-08-25     187416068       29.660000      24798986       98.650002   
2025-08-26 

  df.fillna(method="ffill", inplace=True)


In [None]:
# Preview the first and the last rows of the merged feature table.
for preview in (df.head(), df.tail()):
    print(preview)

            EREGL_Volume  EREGL_AdjClose  SAHOL_Volume  SAHOL_AdjClose  \
2010-01-05      73491364        0.013520       5058266        2.207236   
2010-01-06      27708427        0.013580       4992228        2.169180   
2010-01-07      40402260        0.013342       2732948        2.169180   
2010-01-08      46893994        0.013342      20476268        2.226263   
2010-01-11      68805135        0.013520       8156529        2.169180   

            KOC_Volume  KOC_AdjClose  XU100_Close  USDTRY_Close  
2010-01-05    11165434      1.726188   541.148132        1.4727  
2010-01-06     3897700      1.733472   545.471130        1.4714  
2010-01-07     3662297      1.726188   549.726074        1.4715  
2010-01-08     5993209      1.726188   547.976074        1.4580  
2010-01-11     3878719      1.675204   539.208130        1.4502  
            EREGL_Volume  EREGL_AdjClose  SAHOL_Volume  SAHOL_AdjClose  \
2025-08-25     187416068       29.660000      24798986       98.650002   
2025-08-26 

In [None]:
# ===============================
# Chronological split: first 75% train, next 10% validation, last 15% test
# ===============================
val_start = df.index[int(len(df)*0.75)]
test_start = df.index[int(len(df)*0.85)]

is_train = df.index < val_start
is_val = (df.index >= val_start) & (df.index < test_start)
is_test = df.index >= test_start

train_df = df.loc[is_train]
val_df = df.loc[is_val]
test_df = df.loc[is_test]

print("Train:", train_df.shape, "Val:", val_df.shape, "Test:", test_df.shape)

# One "episode" is counted as one pass over the training rows;
# the training budget is 50 such passes.
num_episodes = 50
episode_length = len(train_df)
total_timesteps = episode_length * num_episodes

print(f"Episode length: {episode_length} steps")
print(f"Training for {num_episodes} episodes")
print(f"Total timesteps = {total_timesteps}")


Train: (3015, 8) Val: (402, 8) Test: (603, 8)
Episode length: 3015 steps
Training for 50 episodes
Total timesteps = 150750


In [None]:
import gymnasium as gym
import numpy as np
from gymnasium import spaces
from sb3_contrib import MaskablePPO
from sb3_contrib.common.wrappers import ActionMasker
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.vec_env import DummyVecEnv
from typing import Optional, Any, Dict


class MultiAssetTradingEnv(gym.Env):
    """
    Single-position, multi-asset trading environment (Gymnasium API) for SB3.

    - The agent is either fully in cash or fully invested in exactly one asset.
    - ``action_masks()`` exposes which actions are valid in the current state
      (consumed by ``ActionMasker`` / ``MaskablePPO``).
    - An observation is a flattened ``window_size x n_features`` matrix:
      asset prices, market features and, when ``normalize_obs`` is True, the
      portfolio state (one-hot of the held asset + portfolio value ratio).
    """

    metadata = {"render_modes": ["human"], "render_fps": 1}

    def __init__(
        self,
        df,
        window_size: int = 30,
        initial_balance: float = 10_000.0,
        fee_pct: float = 0.001,
        normalize_obs: bool = True,
        render_mode: Optional[str] = None
    ):
        """
        Args:
            df: Frame containing the columns listed in ``self.assets`` and
                ``self.other_features``; rows are consumed in order.
            window_size: Number of past rows in each observation.
            initial_balance: Starting cash.
            fee_pct: Proportional fee charged on the traded notional.
            normalize_obs: If True, z-score the asset prices and append the
                portfolio-state features to every row of the window.
            render_mode: ``"human"`` to enable ``render()`` printing.

        Raises:
            ValueError: If ``window_size`` is not positive or ``df`` is too
                short to allow at least one step after the first window.
        """
        super().__init__()
        self.df = df.reset_index(drop=True)
        self.assets = ["EREGL_AdjClose", "SAHOL_AdjClose", "KOC_AdjClose"]
        self.other_features = ["EREGL_Volume", "SAHOL_Volume", "KOC_Volume", "XU100_Close", "USDTRY_Close"]
        self.n_assets = len(self.assets)
        self.render_mode = render_mode

        self.window_size = window_size
        self.initial_balance = float(initial_balance)
        self.fee_pct = float(fee_pct)
        self.normalize_obs = normalize_obs

        if self.window_size < 1:
            raise ValueError("window_size must be a positive integer")
        # reset() starts at row ``window_size`` and step() reads the next row.
        if len(self.df) < self.window_size + 2:
            raise ValueError(
                f"df has {len(self.df)} rows but at least {self.window_size + 2} "
                f"are required for window_size={self.window_size}"
            )

        # Cache the numeric columns as arrays: step()/_get_obs() run once per
        # timestep and per-row pandas lookups dominate the runtime otherwise.
        self._asset_prices = self.df[self.assets].to_numpy(dtype=np.float64)
        self._other_values = self.df[self.other_features].to_numpy(dtype=np.float64)

        # Statistics for price normalization
        if self.normalize_obs:
            self.asset_means = self.df[self.assets].mean().values
            self.asset_stds = self.df[self.assets].std().values
            # Avoid division by zero
            self.asset_stds = np.maximum(self.asset_stds, 1e-8)
            # NOTE(review): ``other_features`` (raw volumes, index level, FX
            # rate) are passed through unscaled -- consider normalizing them.

        # Columns per time step: prices + market features, plus the portfolio
        # state (one-hot of the held asset and the portfolio ratio) when
        # normalize_obs is enabled.
        obs_features = len(self.assets) + len(self.other_features)
        if self.normalize_obs:
            obs_features += self.n_assets + 1
        self.n_obs_features = obs_features

        self.observation_space = spaces.Box(
            low=-np.inf,
            high=np.inf,
            shape=(window_size * obs_features,),  # Flattened for SB3
            dtype=np.float32
        )

        # Action space (asset i refers to self.assets[i]):
        # 0 = Hold
        # 1 = Buy EREGL, 2 = Sell EREGL
        # 3 = Buy SAHOL, 4 = Sell SAHOL
        # 5 = Buy KOC,   6 = Sell KOC
        self.action_space = spaces.Discrete(1 + 2 * self.n_assets)

        # Track performance metrics
        self.episode_trades = 0
        self.episode_fees = 0.0
        self.max_portfolio_value = 0.0
        self.portfolio_history = []

    def reset(self, *, seed: Optional[int] = None, options: Optional[Dict[str, Any]] = None):
        """Start a new episode in cash at row ``window_size``; returns ``(obs, info)``."""
        super().reset(seed=seed)

        self.balance = self.initial_balance
        self.holdings = np.zeros(self.n_assets)
        self.current_step = self.window_size
        self.current_asset = None  # None means in cash

        # Reset episode metrics
        self.episode_trades = 0
        self.episode_fees = 0.0
        self.max_portfolio_value = self.initial_balance
        self.portfolio_history = [self.initial_balance]

        obs = self._get_obs()
        info = {
            "portfolio_value": self.initial_balance,
            "episode_trades": 0,
            "episode_fees": 0.0,
            "cash": self.balance,
            "holdings": self.holdings.copy(),
            "current_asset": self.current_asset
        }

        return obs, info

    def _get_obs(self):
        """Build the flattened float32 observation for the current step.

        The window covers the ``window_size`` rows strictly before
        ``current_step``; the portfolio-state features are valued at the
        ``current_step`` prices.
        """
        end_idx = self.current_step
        start_idx = max(0, end_idx - self.window_size)

        price_window = self._asset_prices[start_idx:end_idx]
        other_features_window = self._other_values[start_idx:end_idx]

        # Pad at the beginning by repeating the first row if the window is
        # short. Both blocks are padded so they stay aligned row by row.
        missing = self.window_size - price_window.shape[0]
        if missing > 0:
            price_window = np.vstack([np.tile(price_window[0:1], (missing, 1)), price_window])
            other_features_window = np.vstack(
                [np.tile(other_features_window[0:1], (missing, 1)), other_features_window]
            )

        if self.normalize_obs:
            # Normalize prices
            price_window = (price_window - self.asset_means) / self.asset_stds

            current_prices = self._asset_prices[self.current_step]
            portfolio_value = self._get_portfolio_value(current_prices)

            # One-hot encoding for current asset (or all zeros if in cash)
            asset_onehot = np.zeros(self.n_assets)
            if self.current_asset is not None:
                asset_onehot[self.current_asset] = 1.0

            portfolio_ratio = portfolio_value / self.initial_balance
            state_features = np.concatenate([asset_onehot, [portfolio_ratio]])

            # Repeat the state features on every row of the window
            state_window = np.tile(state_features, (self.window_size, 1))
            obs_window = np.concatenate([price_window, other_features_window, state_window], axis=1)
        else:
            # Raw prices + market features: this is exactly what the declared
            # observation_space promises when normalize_obs is False.
            obs_window = np.concatenate([price_window, other_features_window], axis=1)

        return obs_window.flatten().astype(np.float32)

    def action_masks(self):
        """Return the boolean validity mask over actions for MaskablePPO."""
        mask = np.zeros(self.action_space.n, dtype=bool)

        # Hold is always valid
        mask[0] = True

        if self.current_asset is None:
            # In cash -> only buys are valid (and hold)
            for i in range(self.n_assets):
                mask[1 + 2 * i] = True  # buy_i
        else:
            # Holding an asset -> only sell that asset is valid (and hold)
            idx = self.current_asset
            mask[2 + 2 * idx] = True  # sell_idx

        return mask

    def step(self, action):
        """Execute ``action`` at the current row's prices and advance one row.

        Returns the Gymnasium 5-tuple ``(obs, reward, terminated, truncated, info)``.
        Invalid actions are reported with a warning and behave like a hold.
        """
        # Policies hand over numpy scalars / size-1 arrays; use a plain int.
        action = int(np.asarray(action).item())

        # Check if action is valid (for debugging)
        if not self.action_masks()[action]:
            print(f"Warning: Invalid action {action} taken. Current asset: {self.current_asset}")

        price_vec = self._asset_prices[self.current_step]
        prev_portfolio_value = self._get_portfolio_value(price_vec)

        executed = "hold"
        fee_paid = 0.0
        trade_executed = False

        # Process action
        if action == 0:  # Hold
            pass
        elif action % 2 == 1:  # Buy (odd numbers)
            asset_idx = (action - 1) // 2
            if self.current_asset is None and self.balance > 0:
                price = price_vec[asset_idx]
                fee_paid = self.balance * self.fee_pct
                self.holdings[asset_idx] = (self.balance - fee_paid) / price
                self.balance = 0.0
                self.current_asset = asset_idx
                executed = f"buy_{self.assets[asset_idx]}"
                trade_executed = True
        elif action % 2 == 0 and action > 0:  # Sell (even numbers > 0)
            asset_idx = (action - 2) // 2
            if self.current_asset == asset_idx:
                price = price_vec[asset_idx]
                notional = self.holdings[asset_idx] * price
                fee_paid = notional * self.fee_pct
                self.balance += notional - fee_paid
                self.holdings[asset_idx] = 0.0
                self.current_asset = None
                executed = f"sell_{self.assets[asset_idx]}"
                trade_executed = True

        # Update metrics
        if trade_executed:
            self.episode_trades += 1
            self.episode_fees += fee_paid

        # Advance step; the episode ends on the last row of the frame
        self.current_step += 1
        terminated = self.current_step >= len(self.df) - 1
        truncated = False

        # Calculate new portfolio value and reward
        price_vec_next = self._asset_prices[self.current_step]
        portfolio_value = self._get_portfolio_value(price_vec_next)
        self.portfolio_history.append(portfolio_value)

        # Update max portfolio value for drawdown calculation
        self.max_portfolio_value = max(self.max_portfolio_value, portfolio_value)

        reward = self._calculate_reward(prev_portfolio_value, portfolio_value, fee_paid)

        obs = self._get_obs()
        info = {
            "portfolio_value": portfolio_value,
            "executed": executed,
            "fee_paid": fee_paid,
            "holdings": self.holdings.copy(),
            "cash": self.balance,
            "episode_trades": self.episode_trades,
            "episode_fees": self.episode_fees,
            "drawdown": (self.max_portfolio_value - portfolio_value) / self.max_portfolio_value,
            "return": (portfolio_value - self.initial_balance) / self.initial_balance,
            "current_asset": self.current_asset
        }

        return obs, reward, terminated, truncated, info

    def _calculate_reward(self, prev_value, current_value, fee_paid):
        """Step return plus an explicit fee penalty and a volatility penalty.

        The fee already lowers ``current_value``; ``fee_penalty`` is an
        additional, deliberate discouragement of trading.
        """
        # Basic return
        raw_return = (current_value - prev_value) / prev_value if prev_value > 0 else 0

        # Fee penalty (scaled)
        fee_penalty = -fee_paid / self.initial_balance

        # Risk adjustment based on the volatility of the most recent returns
        if len(self.portfolio_history) > 10:
            last_10 = np.array(self.portfolio_history[-10:])
            # Each difference v[t+1] - v[t] is divided by its own base v[t].
            returns = np.diff(last_10) / last_10[:-1]
            volatility = np.std(returns) if len(returns) > 1 else 0
            # Sharpe-like adjustment (prefer consistent returns)
            risk_adjustment = -volatility * 0.1 if volatility > 0 else 0
        else:
            risk_adjustment = 0

        return raw_return + fee_penalty + risk_adjustment

    def _get_portfolio_value(self, price_vec):
        """Cash plus the market value of the holdings at ``price_vec``."""
        return self.balance + np.sum(self.holdings * price_vec)

    def render(self):
        """Print the current portfolio state when ``render_mode == "human"``."""
        if self.render_mode == "human":
            price_vec = self._asset_prices[self.current_step]
            portfolio_value = self._get_portfolio_value(price_vec)

            print(f"\n--- Step: {self.current_step} ---")
            print(f"Portfolio Value: ${portfolio_value:.2f}")
            print(f"Return: {((portfolio_value - self.initial_balance) / self.initial_balance * 100):.2f}%")
            print(f"Cash: ${self.balance:.2f}")
            print(f"Current Asset: {self.current_asset}")
            print(f"Holdings: {self.holdings}")
            print(f"Prices: {dict(zip(self.assets, price_vec))}")
            print(f"Episode Trades: {self.episode_trades}")
            print(f"Episode Fees: ${self.episode_fees:.2f}")
            print("-" * 50)

    def close(self):
        """Nothing to release."""
        pass


def mask_fn(env: gym.Env) -> np.ndarray:
    """Mask provider handed to ``ActionMasker``: defers to the env's own ``action_masks()``."""
    valid_actions = env.action_masks()
    return valid_actions


# Wrapper function to create masked environment
def make_masked_env(df, **kwargs):
    """Return a zero-argument factory that builds an action-masked trading env.

    The environment is only constructed when the factory is called, which is
    the form vectorized-env constructors expect.
    """
    def build_env():
        return ActionMasker(MultiAssetTradingEnv(df, **kwargs), mask_fn)

    return build_env


Gym has been unmaintained since 2022 and does not support NumPy 2.0 amongst other critical functionality.
Please upgrade to Gymnasium, the maintained drop-in replacement of Gym, or contact the authors of your software and request that they upgrade.
See the migration guide at https://gymnasium.farama.org/introduction/migration_guide/ for additional information.
  return datetime.utcnow().replace(tzinfo=utc)


In [None]:
# Training utilities
class TradingEnvTrainer:
    """Utility class for training and evaluating trading agents with SB3."""

    def __init__(self, df_train, df_val=None, env_kwargs=None):
        """
        Args:
            df_train: Data used to build training environments.
            df_val: Optional data used by default for evaluation.
            env_kwargs: Keyword arguments forwarded to the environment.
        """
        self.df_train = df_train
        self.df_val = df_val
        self.env_kwargs = env_kwargs or {}

    def create_env(self, df=None, n_envs=1):
        """Create one masked env (``n_envs == 1``) or a ``DummyVecEnv`` of them.

        ``df`` defaults to the training data.
        """
        df = df if df is not None else self.df_train

        if n_envs == 1:
            return make_masked_env(df, **self.env_kwargs)()
        else:
            envs = [make_masked_env(df, **self.env_kwargs) for _ in range(n_envs)]
            return DummyVecEnv(envs)

    def train_agent(self, total_timesteps=100000, **ppo_kwargs):
        """Train a MaskablePPO agent with an MLP policy on 4 parallel envs.

        ``ppo_kwargs`` override the default hyper-parameters below.
        """
        env = self.create_env(n_envs=4)  # Use 4 parallel environments

        # Default PPO configuration
        default_config = {
            "learning_rate": 3e-4,
            "n_steps": 512,
            "batch_size": 64,
            "n_epochs": 10,
            "gamma": 0.99,
            "gae_lambda": 0.95,
            "clip_range": 0.2,
            "ent_coef": 0.01,
            "vf_coef": 0.5,
            "max_grad_norm": 0.5,
            "verbose": 0,
        }
        default_config.update(ppo_kwargs)

        # Create and train agent
        model = MaskablePPO("MlpPolicy", env, **default_config)
        model.learn(total_timesteps=total_timesteps)

        return model

    def evaluate_agent(self, model, df=None, n_episodes=10):
        """Run ``n_episodes`` deterministic episodes and summarize them.

        Args:
            model: Trained agent exposing ``predict(obs, action_masks=..., deterministic=...)``.
            df: Evaluation data; defaults to the validation data when present,
                otherwise to the training data.
            n_episodes: Number of episodes to run.

        Returns:
            Dict with ``avg_return``, ``avg_trades``, the per-episode
            ``returns`` and ``trades`` lists, and ``episode_rewards`` (the sum
            of step rewards per episode).
        """
        # DataFrames have no truth value, so `a or b` cannot be used here.
        if df is None:
            df = self.df_val if self.df_val is not None else self.df_train
        env = self.create_env(df)

        episode_returns = []
        episode_trades = []
        episode_rewards = []

        for episode in range(n_episodes):
            obs, info = env.reset()
            done = False
            episode_return = 0

            while not done:
                # The mask has to be passed explicitly at prediction time
                action, _states = model.predict(obs, action_masks=env.action_masks(), deterministic=True)
                obs, reward, terminated, truncated, info = env.step(action)
                episode_return += reward
                done = terminated or truncated

            final_portfolio_value = info["portfolio_value"]
            final_return = info["return"]
            episode_returns.append(final_return)
            episode_trades.append(info["episode_trades"])
            episode_rewards.append(episode_return)

            print(f"Episode {episode + 1}: Return: {final_return:.2%}, "
                  f"Value: ${final_portfolio_value:.2f}, Trades: {info['episode_trades']}")

        avg_return = np.mean(episode_returns)
        avg_trades = np.mean(episode_trades)

        print(f"\nAverage Return: {avg_return:.2%}")
        print(f"Average Trades per Episode: {avg_trades:.1f}")
        print(f"Return Std: {np.std(episode_returns):.2%}")

        return {
            "avg_return": avg_return,
            "avg_trades": avg_trades,
            "returns": episode_returns,
            "trades": episode_trades,
            "episode_rewards": episode_rewards
        }


In [None]:
import torch
import torch.nn as nn
import numpy as np
from typing import List, Tuple, Type, Union, Dict, Any
from gymnasium import spaces
from stable_baselines3.common.policies import ActorCriticPolicy
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
from sb3_contrib import MaskablePPO
from sb3_contrib.common.maskable.policies import MaskableActorCriticPolicy

class CNNTradingFeaturesExtractor(BaseFeaturesExtractor):
    """
    CNN-based feature extractor for trading time series data.

    The flat observation is reshaped to ``(window_size, n_features)`` and the
    feature columns are treated as Conv1d channels over the time axis.
    """

    def __init__(self, observation_space: spaces.Box, features_dim: int = 128, window_size = 30, n_features = 12):
        """
        Args:
            observation_space: Flat Box space of size ``window_size * n_features``.
            features_dim: Size of the feature vector returned by ``forward``.
            window_size: Number of time steps in one observation.
            n_features: Number of columns per time step.

        Raises:
            ValueError: If the observation size does not equal
                ``window_size * n_features``.
        """
        # Initialise the nn.Module machinery before touching any attribute.
        super().__init__(observation_space, features_dim)

        flat_dim = int(observation_space.shape[0])
        if window_size * n_features != flat_dim:
            raise ValueError(
                f"Observation of size {flat_dim} cannot be reshaped to "
                f"(window_size={window_size}, n_features={n_features})"
            )

        self.window_size = window_size
        self.n_features = n_features

        print(f"CNN Feature Extractor: window_size={self.window_size}, n_features={self.n_features}")

        # 1D CNN for temporal patterns (input is reshaped in forward())
        self.cnn = nn.Sequential(
            nn.Conv1d(self.n_features, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv1d(64, 32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.AdaptiveAvgPool1d(1)  # Global average pooling over time
        )

        # Final feature layers
        self.fc = nn.Sequential(
            nn.Linear(32, features_dim),
            nn.ReLU(),
            nn.Dropout(0.1)
        )

    def forward(self, observations: torch.Tensor) -> torch.Tensor:
        """Map ``(batch, window_size * n_features)`` to ``(batch, features_dim)``."""
        batch_size = observations.size(0)

        # Flat -> (batch, window_size, n_features); reshape also accepts
        # non-contiguous inputs, unlike view.
        obs_2d = observations.reshape(batch_size, self.window_size, self.n_features)

        # Conv1d expects channels first: (batch, n_features, window_size)
        obs_conv = obs_2d.transpose(1, 2)

        cnn_out = self.cnn(obs_conv)  # (batch, 32, 1)
        cnn_flat = cnn_out.squeeze(-1)  # (batch, 32)

        features = self.fc(cnn_flat)
        # Keep non-finite activations from reaching the policy heads
        features = torch.nan_to_num(features, nan=0.0, posinf=1e6, neginf=-1e6)

        return features


class LSTMTradingFeaturesExtractor(BaseFeaturesExtractor):
    """
    LSTM-based feature extractor for trading time series data.

    The flat observation is reshaped to ``(window_size, n_features)`` and fed
    through a 2-layer LSTM; the output at the last time step is projected to
    ``features_dim``.
    """

    def __init__(self, observation_space: spaces.Box, features_dim: int = 128, window_size = 30, n_features = 12):
        """
        Args:
            observation_space: Flat Box space of size ``window_size * n_features``.
            features_dim: Size of the feature vector returned by ``forward``.
            window_size: Number of time steps in one observation.
            n_features: Number of columns per time step.

        Raises:
            ValueError: If the observation size does not equal
                ``window_size * n_features``.
        """
        # Initialise the nn.Module machinery before touching any attribute.
        super().__init__(observation_space, features_dim)

        flat_dim = int(observation_space.shape[0])
        if window_size * n_features != flat_dim:
            raise ValueError(
                f"Observation of size {flat_dim} cannot be reshaped to "
                f"(window_size={window_size}, n_features={n_features})"
            )

        self.window_size = window_size
        self.n_features = n_features

        print(f"LSTM Feature Extractor: window_size={self.window_size}, n_features={self.n_features}")

        # LSTM layers
        self.lstm = nn.LSTM(
            input_size=self.n_features,
            hidden_size=64,
            num_layers=2,
            batch_first=True,
            dropout=0.1
        )

        # Final feature layers
        self.fc = nn.Sequential(
            nn.Linear(64, features_dim),
            nn.ReLU(),
            nn.Dropout(0.1)
        )

    def forward(self, observations: torch.Tensor) -> torch.Tensor:
        """Map ``(batch, window_size * n_features)`` to ``(batch, features_dim)``."""
        batch_size = observations.size(0)

        # Flat -> (batch, window_size, n_features); reshape also accepts
        # non-contiguous inputs, unlike view.
        obs_2d = observations.reshape(batch_size, self.window_size, self.n_features)

        # Only the per-step outputs are needed; the final (h, c) state is unused.
        lstm_out, _ = self.lstm(obs_2d)

        # Top-layer output at the last time step
        last_hidden = lstm_out[:, -1, :]  # (batch, hidden_size)

        features = self.fc(last_hidden)

        return features


class AttentionTradingFeaturesExtractor(BaseFeaturesExtractor):
    """
    Attention-based feature extractor for trading data.

    Applies multi-head self-attention across the time steps of the window,
    followed by a residual connection, layer norm and mean pooling over time.
    """

    def __init__(self, observation_space: spaces.Box, features_dim: int = 128, window_size = 30, n_features = 12):
        """
        Args:
            observation_space: Flat Box space of size ``window_size * n_features``.
            features_dim: Size of the feature vector returned by ``forward``.
            window_size: Number of time steps in one observation.
            n_features: Number of columns per time step; it is used as the
                attention embedding size and must be divisible by the 3 heads.

        Raises:
            ValueError: If the observation size does not equal
                ``window_size * n_features``.
        """
        # Initialise the nn.Module machinery before touching any attribute.
        super().__init__(observation_space, features_dim)

        flat_dim = int(observation_space.shape[0])
        if window_size * n_features != flat_dim:
            raise ValueError(
                f"Observation of size {flat_dim} cannot be reshaped to "
                f"(window_size={window_size}, n_features={n_features})"
            )

        self.window_size = window_size
        self.n_features = n_features

        print(f"Attention Feature Extractor: window_size={self.window_size}, n_features={self.n_features}")

        # Multi-head attention
        self.attention = nn.MultiheadAttention(
            embed_dim=self.n_features,
            num_heads=3,
            dropout=0.1,
            batch_first=True
        )

        # Layer norm
        self.layer_norm = nn.LayerNorm(self.n_features)

        # Final feature layers
        self.fc = nn.Sequential(
            nn.Linear(self.n_features, features_dim),
            nn.ReLU(),
            nn.Dropout(0.1)
        )

    def forward(self, observations: torch.Tensor) -> torch.Tensor:
        """Map ``(batch, window_size * n_features)`` to ``(batch, features_dim)``."""
        batch_size = observations.size(0)

        # Flat -> (batch, window_size, n_features); reshape also accepts
        # non-contiguous inputs, unlike view.
        obs_2d = observations.reshape(batch_size, self.window_size, self.n_features)

        # Self-attention (query = key = value)
        attn_out, _ = self.attention(obs_2d, obs_2d, obs_2d)

        # Residual connection and layer norm
        obs_attended = self.layer_norm(obs_2d + attn_out)

        # Global average pooling over time dimension
        pooled = torch.mean(obs_attended, dim=1)  # (batch, n_features)

        features = self.fc(pooled)

        return features


class CustomTradingPolicy(MaskableActorCriticPolicy):
    """
    Maskable actor-critic policy wired to the trading feature extractors.

    Only the defaults differ from the parent class: a CNN feature extractor
    with ``features_dim=128`` and separate 128-128 policy / value networks.
    """

    def __init__(
        self,
        observation_space: spaces.Space,
        action_space: spaces.Space,
        lr_schedule,
        net_arch: Union[List[int], Dict[str, List[int]], None] = None,
        activation_fn: Type[nn.Module] = nn.Tanh,
        features_extractor_class: Type[BaseFeaturesExtractor] = CNNTradingFeaturesExtractor,
        features_extractor_kwargs: Dict[str, Any] = None,
        *args,
        **kwargs,
    ):
        """
        Args:
            observation_space, action_space, lr_schedule: Forwarded unchanged.
            net_arch: Layer sizes of the policy (``pi``) and value (``vf``)
                networks; defaults to two 128-unit layers each.
            activation_fn: Activation used in those networks.
            features_extractor_class: Feature extractor to instantiate.
            features_extractor_kwargs: Its keyword arguments; defaults to
                ``{"features_dim": 128}``.
        """
        if net_arch is None:
            # Plain dict form: wrapping the dict in a list is the pre-1.8 SB3
            # spelling and only triggers a deprecation warning today.
            net_arch = dict(pi=[128, 128], vf=[128, 128])

        if features_extractor_kwargs is None:
            features_extractor_kwargs = dict(features_dim=128)

        super().__init__(
            observation_space,
            action_space,
            lr_schedule,
            net_arch=net_arch,
            activation_fn=activation_fn,
            features_extractor_class=features_extractor_class,
            features_extractor_kwargs=features_extractor_kwargs,
            *args,
            **kwargs,
        )


# Training configurations for different models
def get_cnn_model_config():
    """Return a fresh MaskablePPO keyword-argument dict for the CNN extractor."""
    return {
        "policy": CustomTradingPolicy,
        "policy_kwargs": {
            "features_extractor_class": CNNTradingFeaturesExtractor,
            "features_extractor_kwargs": {"features_dim": 128},
            # Plain dict: the list-wrapped form is deprecated since SB3 1.8.
            "net_arch": dict(pi=[128, 64], vf=[128, 64]),
            "activation_fn": nn.ReLU,
        },
        "learning_rate": 3e-4,
        "n_steps": 512,
        "batch_size": 32,
        "n_epochs": 5,
        "gamma": 0.99,
        "gae_lambda": 0.95,
        "clip_range": 0.2,
        "ent_coef": 0.01,
        "vf_coef": 0.5,
        "max_grad_norm": 0.5,
    }


def get_lstm_model_config():
    """Return a fresh MaskablePPO keyword-argument dict for the LSTM extractor."""
    return {
        "policy": CustomTradingPolicy,
        "policy_kwargs": {
            "features_extractor_class": LSTMTradingFeaturesExtractor,
            "features_extractor_kwargs": {"features_dim": 128},
            # Plain dict: the list-wrapped form is deprecated since SB3 1.8.
            "net_arch": dict(pi=[128, 64], vf=[128, 64]),
            "activation_fn": nn.ReLU,
        },
        "learning_rate": 2e-4,  # Slightly lower for LSTM
        "n_steps": 1024,        # Longer rollout per env before each update
        "batch_size": 32,
        "n_epochs": 5,
        "gamma": 0.99,
        "gae_lambda": 0.95,
        "clip_range": 0.2,
        "ent_coef": 0.01,
        "vf_coef": 0.5,
        "max_grad_norm": 0.5,
    }


def get_attention_model_config():
    """Return a fresh MaskablePPO keyword-argument dict for the attention extractor."""
    return {
        "policy": CustomTradingPolicy,
        "policy_kwargs": {
            "features_extractor_class": AttentionTradingFeaturesExtractor,
            "features_extractor_kwargs": {"features_dim": 128},
            # Plain dict: the list-wrapped form is deprecated since SB3 1.8.
            "net_arch": dict(pi=[128, 64], vf=[128, 64]),
            "activation_fn": nn.ReLU,
        },
        "learning_rate": 2e-4,
        "n_steps": 512,
        "batch_size": 32,
        "n_epochs": 5,
        "gamma": 0.99,
        "gae_lambda": 0.95,
        "clip_range": 0.2,
        "ent_coef": 0.01,
        "vf_coef": 0.5,
        "max_grad_norm": 0.5,
    }


# Enhanced trainer with custom models
class EnhancedTradingTrainer(TradingEnvTrainer):
    """Trainer variant that plugs the custom feature extractors into MaskablePPO."""

    def train_with_custom_model(self, model_type="cnn", total_timesteps=100000):
        """Train MaskablePPO with the requested feature extractor.

        Args:
            model_type: ``"cnn"``, ``"lstm"`` or ``"attention"`` (case-insensitive).
            total_timesteps: Training budget passed to ``model.learn``.

        Returns:
            The trained model.

        Raises:
            ValueError: If ``model_type`` is not one of the supported names.
        """
        # Looked up at call time so re-running a config cell takes effect.
        config_factories = {
            "cnn": get_cnn_model_config,
            "lstm": get_lstm_model_config,
            "attention": get_attention_model_config,
        }
        factory = config_factories.get(model_type.lower())
        if factory is None:
            raise ValueError("model_type must be 'cnn', 'lstm', or 'attention'")
        config = factory()

        # Create environment
        env = self.create_env(n_envs=4)

        # Keep the extractor geometry in sync with the environment instead of
        # relying on the extractors' hard-coded defaults. 30 is the window size
        # used when env_kwargs does not set one.
        window_size = int(self.env_kwargs.get("window_size", 30))
        flat_dim = int(env.observation_space.shape[0])
        policy_kwargs = dict(config["policy_kwargs"])
        extractor_kwargs = dict(policy_kwargs.get("features_extractor_kwargs") or {})
        extractor_kwargs.setdefault("window_size", window_size)
        extractor_kwargs.setdefault("n_features", flat_dim // window_size)
        policy_kwargs["features_extractor_kwargs"] = extractor_kwargs
        config["policy_kwargs"] = policy_kwargs

        print(f"Training with {model_type.upper()} model...")

        # Create and train model
        model = MaskablePPO(env=env, **config)
        model.learn(total_timesteps=total_timesteps)

        return model


# Complete example usage
def complete_training_example(df):
    """Train, save, evaluate and compare the CNN, LSTM and attention agents.

    The frame is split by row position: the first 80% of rows are used for
    training and the remainder for validation. Each architecture is trained
    for 50000 timesteps, saved as ``trading_model_<type>`` and evaluated on
    the validation rows. A failure for one architecture is reported and the
    loop moves on to the next one.

    Args:
        df: DataFrame holding the data the trading environment is built on.

    Returns:
        tuple: ``(models, results)`` dictionaries keyed by model type. An
        architecture that fails before its model is stored appears in
        neither dictionary.
    """
    # Positional 80/20 split
    cutoff = int(len(df) * 0.8)
    df_train, df_val = df.iloc[:cutoff], df.iloc[cutoff:]

    print(f"Training data: {len(df_train)} samples")
    print(f"Validation data: {len(df_val)} samples")

    # Shared environment settings for every architecture
    env_settings = {
        "window_size": 30,
        "initial_balance": 10000,
        "fee_pct": 0.001,
        "normalize_obs": True
    }
    trainer = EnhancedTradingTrainer(df_train, df_val, env_kwargs=env_settings)

    models, results = {}, {}
    rule = '=' * 50

    for model_type in ("cnn", "lstm", "attention"):
        label = model_type.upper()
        print(f"\n{rule}")
        print(f"Training {label} model")
        print(f"{rule}")

        try:
            model = trainer.train_with_custom_model(
                model_type=model_type,
                total_timesteps=50000
            )

            # Persist the model before evaluating so it is kept even if
            # the evaluation step raises.
            model.save(f"trading_model_{model_type}")
            models[model_type] = model

            print(f"\nEvaluating {label} model...")
            results[model_type] = trainer.evaluate_agent(model, df_val, n_episodes=10)

        except Exception as e:
            # Best effort: report the failure and try the next architecture.
            print(f"Error training {model_type} model: {e}")

    # Side-by-side summary of every model that produced evaluation results
    print(f"\n{rule}")
    print("MODEL COMPARISON")
    print(f"{rule}")

    for name, stats in results.items():
        print(f"{name.upper()} Model:")
        print(f"  Average Return: {stats['avg_return']:.2%}")
        print(f"  Return Std: {np.std(stats['returns']):.2%}")
        print(f"  Average Trades: {stats['avg_trades']:.1f}")
        print(f"  Best Return: {max(stats['returns']):.2%}")
        print(f"  Worst Return: {min(stats['returns']):.2%}")
        print()

    return models, results

In [None]:
import warnings
import os

def suppress_warnings():
    """Install "ignore" filters for warnings that only clutter the output.

    The filters are registered one after another in the order listed in
    the table below.
    """
    # (message regex, warning category); an empty regex matches any message.
    silenced = (
        # datetime deprecation warning seen under Jupyter
        ("datetime.datetime.utcnow\\(\\) is deprecated", DeprecationWarning),
        # blanket suppression of the noisiest categories
        ("", UserWarning),
        ("", FutureWarning),
        # gym / gymnasium
        (".*Box bound precision lowered.*", UserWarning),
        # stable-baselines3
        (".*Using `continuous_actions=False`.*", UserWarning),
    )

    for pattern, category in silenced:
        warnings.filterwarnings("ignore", message=pattern, category=category)


def set_clean_environment():
    """Export ``PYTHONWARNINGS=ignore`` into this process's environment.

    Any previous value of the variable is overwritten.
    """
    # Python reads PYTHONWARNINGS at interpreter start-up, so this is mainly
    # relevant to Python processes launched after this call.
    os.environ.update({'PYTHONWARNINGS': 'ignore'})

# Run both helpers as soon as this cell executes: install the "ignore"
# warning filters and export PYTHONWARNINGS=ignore.
suppress_warnings()
set_clean_environment()

In [None]:
"""
Multi-Asset Trading Environment with Stable-Baselines3
Works with DataFrame structure containing KOC_Close, XU100_Close, USDTRY_Close
"""

import pandas as pd
import numpy as np
from stable_baselines3.common.env_checker import check_env
from sb3_contrib import MaskablePPO

def quick_test_run(df):
    """Smoke-test ``MultiAssetTradingEnv`` on a small slice of ``df``.

    Builds the environment on at most the last 200 rows, validates it with
    ``check_env``, resets it and takes up to five random actions drawn from
    the currently valid (unmasked) ones, printing the outcome of each step.

    Args:
        df: DataFrame the environment is constructed from.

    Returns:
        bool: Always ``True``; any failure surfaces as an exception.
    """
    banner = "=" * 50
    print("\n" + banner)
    print("QUICK TEST RUN")
    print(banner)

    # Keep the check fast: only the most recent 200 rows are used
    sample = df
    if len(df) > 200:
        sample = df.iloc[-200:]

    env = MultiAssetTradingEnv(
        sample,
        window_size=20,
        initial_balance=10000,
        fee_pct=0.001,
        normalize_obs=True,
        render_mode="human"
    )

    # Validate the environment before stepping through it
    check_env(env)
    print("✓ Environment validation passed!")

    obs, info = env.reset()
    print(f"✓ Environment reset successful. Observation shape: {obs.shape}")

    for step_no in range(1, 6):
        # Sample uniformly among the actions the mask currently allows
        mask = env.action_masks()
        allowed = np.where(mask)[0]
        action = np.random.choice(allowed)

        obs, reward, terminated, truncated, info = env.step(action)

        print(f"Step {step_no}: Action={action}, Reward={reward:.4f}, "
              f"Portfolio=${info['portfolio_value']:.2f}, Executed={info['executed']}")

        # Stop early once the episode is over
        if terminated or truncated:
            break

    print("✓ Quick test completed successfully!")
    return True


def train_simple_model(df, timesteps=10000):
    """Train, save and evaluate the baseline agent.

    The first 80% of rows (by position) are used for training and the rest
    for validation. The trained model is saved as ``simple_trading_model``
    and evaluated for five episodes on the validation rows.

    Args:
        df: DataFrame the trading environments are built on.
        timesteps: Total training timesteps passed to ``train_agent``.

    Returns:
        tuple: ``(model, results)`` where ``results`` is whatever
        ``trainer.evaluate_agent`` returns.
    """
    banner = "=" * 50
    print("\n" + banner)
    print("TRAINING SIMPLE BASELINE MODEL")
    print(banner)

    # Positional 80/20 split
    cutoff = int(len(df) * 0.8)
    df_train = df.iloc[:cutoff]
    print(df_train.head())
    df_val = df.iloc[cutoff:]

    print(f"Training data: {len(df_train)} samples")
    print(f"Validation data: {len(df_val)} samples")

    env_settings = {
        "window_size": 30,
        "initial_balance": 10000,
        "fee_pct": 0.001,
        "normalize_obs": True
    }
    trainer = TradingEnvTrainer(df_train, df_val, env_kwargs=env_settings)

    # Fit the baseline with its fixed hyperparameters
    print("Training baseline model...")
    training_options = dict(
        total_timesteps=timesteps,
        learning_rate=3e-4,
        n_steps=256,
        batch_size=32,
        n_epochs=5,
        verbose=0,
    )
    model = trainer.train_agent(**training_options)

    model.save("simple_trading_model")
    print("✓ Model saved as 'simple_trading_model'")

    # Evaluate on the held-out rows
    print("\nEvaluating model...")
    results = trainer.evaluate_agent(model, df_val, n_episodes=5)

    print(f"\nBaseline Results:")
    print(f"Average Return: {results['avg_return']:.2%}")
    print(f"Average Trades: {results['avg_trades']:.1f}")

    return model, results


def train_advanced_models(df, timesteps=20000):
    """Train, save and evaluate the CNN, LSTM and attention agents.

    The first 80% of rows (by position) are used for training and the rest
    for validation. Each architecture is handled independently: a failure
    is printed and the remaining architectures are still attempted.

    Args:
        df: DataFrame the trading environments are built on.
        timesteps: Total training timesteps for each architecture.

    Returns:
        tuple: ``(models, results)`` dictionaries keyed by ``"cnn"``,
        ``"lstm"`` and ``"attn"``. An architecture that fails before its
        model is stored appears in neither dictionary.
    """
    banner = "=" * 50
    print("\n" + banner)
    print("TRAINING ADVANCED MODELS")
    print(banner)

    # Positional 80/20 split
    cutoff = int(len(df) * 0.8)
    df_train, df_val = df.iloc[:cutoff], df.iloc[cutoff:]

    env_settings = {
        "window_size": 30,
        "initial_balance": 10000,
        "fee_pct": 0.001,
        "normalize_obs": True
    }
    trainer = EnhancedTradingTrainer(df_train, df_val, env_kwargs=env_settings)

    models, results = {}, {}

    # (dictionary key / file prefix, architecture name for the trainer, log label)
    variants = (
        ("cnn", "cnn", "CNN"),
        ("lstm", "lstm", "LSTM"),
        ("attn", "attention", "ATTENTION"),
    )

    for key, architecture, label in variants:
        try:
            print(f"\nTraining {label} model...")
            model = trainer.train_with_custom_model(architecture, timesteps)
            model.save(f"{key}_trading_model")
            models[key] = model

            print(f"Evaluating {label} model...")
            results[key] = trainer.evaluate_agent(model, df_val, n_episodes=5)

        except Exception as e:
            # Best effort: report and continue with the next architecture.
            print(f"{label} training failed: {e}")

    # Summary table, printed only when at least one evaluation succeeded
    if results:
        rule = '=' * 30
        print(f"\n{rule}")
        print("ADVANCED MODEL COMPARISON")
        print(f"{rule}")

        for name, stats in results.items():
            print(f"{name.upper()}:")
            print(f"  Avg Return: {stats['avg_return']:.2%}")
            print(f"  Avg Trades: {stats['avg_trades']:.1f}")
            print()

    return models, results


"""Main training pipeline"""

# Runs the whole pipeline at cell execution: environment smoke test,
# baseline training, then the advanced architectures.
# NOTE: `df` is not defined in this cell; it must already exist in the
# notebook namespace when the cell runs.
print("Multi-Asset Trading Agent Training Pipeline")
print("="*60)

# Quick validation test
print("\nValidate environment")
print("-" * 30)
quick_test_run(df)

# Train simple baseline
print("\nTrain baseline model")
print("-" * 30)
simple_model, simple_results = train_simple_model(df, timesteps=100000)

# Train advanced models (optional)
print("\nTrain advanced models")
print("-" * 30)
advanced_models, advanced_results = train_advanced_models(df, timesteps=200000)

print("\n" + "="*60)
print("TRAINING COMPLETE!")
print("="*60)
print("Models saved:")
# List what was actually written instead of leaving the header empty.
# The baseline is saved as 'simple_trading_model'; each advanced model that
# was stored in `advanced_models` was saved as '<key>_trading_model'.
print("  - simple_trading_model")
for model_key in advanced_models:
    print(f"  - {model_key}_trading_model")

print("\nTo use a trained model:")
print("model = MaskablePPO.load('simple_trading_model')")
print("# Then use model.predict(obs, action_masks...) for trading decisions")


Multi-Asset Trading Agent Training Pipeline

Validate environment
------------------------------

QUICK TEST RUN
✓ Environment validation passed!
✓ Environment reset successful. Observation shape: (240,)
Step 1: Action=1, Reward=-0.0058, Portfolio=$9951.81, Executed=buy_EREGL_AdjClose
Step 2: Action=2, Reward=-0.0020, Portfolio=$9941.86, Executed=sell_EREGL_AdjClose
Step 3: Action=5, Reward=0.0021, Portfolio=$9972.46, Executed=buy_KOC_AdjClose
Step 4: Action=6, Reward=-0.0020, Portfolio=$9962.48, Executed=sell_KOC_AdjClose
Step 5: Action=1, Reward=0.0018, Portfolio=$9990.71, Executed=buy_EREGL_AdjClose
✓ Quick test completed successfully!

Train baseline model
------------------------------

TRAINING SIMPLE BASELINE MODEL
            EREGL_Volume  EREGL_AdjClose  SAHOL_Volume  SAHOL_AdjClose  \
2010-01-05      73491364        0.013520       5058266        2.207235   
2010-01-06      27708427        0.013580       4992228        2.169179   
2010-01-07      40402260        0.013342      