<a href="https://colab.research.google.com/github/gaixen/RebalanceAI_-241027-/blob/main/Rebalance_AI_Final_DQN_Task.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Importing dependencies

In [20]:
import yfinance as yf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
import gym
from gym import spaces
import random
from collections import deque, namedtuple
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

In [43]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

## Fetching the data

In [22]:
def fetch_data(symbols, start_date, end_date):
    """Fetch data for multiple symbols"""
    all_data = []
    for symbol in symbols:
        try:
            df = yf.download(symbol, start=start_date, end=end_date)
            df = df[['Open', 'High', 'Low', 'Close', 'Volume']].dropna()
            df['Symbol'] = symbol
            all_data.append(df)
            print(f"✓ Fetched {len(df)} records for {symbol}")
        except Exception as e:
            print(f"✗ Error fetching {symbol}: {e}")

    if all_data:
        combined_df = pd.concat(all_data, ignore_index=True)
        print(f"\n📊 Total combined records: {len(combined_df)}")
        return combined_df.reset_index(drop=True) # Reset index here
    return pd.DataFrame()

def add_indicators(df):
    """Add comprehensive technical indicators to stock DataFrame."""
    import numpy as np
    print("Adding technical indicators...")
    result_dfs = []

    for symbol in df['Symbol'].unique():
        symbol_df = df[df['Symbol'] == symbol].copy()
        print(f"Processing {symbol}...")

        # Moving Averages
        symbol_df['SMA_10'] = symbol_df['Close'].rolling(window=10).mean()
        symbol_df['SMA_20'] = symbol_df['Close'].rolling(window=20).mean()
        symbol_df['EMA_12'] = symbol_df['Close'].ewm(span=12, adjust=False).mean()
        symbol_df['EMA_26'] = symbol_df['Close'].ewm(span=26, adjust=False).mean()

        # RSI
        delta = symbol_df['Close'].diff()
        gain = delta.clip(lower=0).rolling(window=14).mean()
        loss = -delta.clip(upper=0).rolling(window=14).mean()
        rs = gain / (loss + 1e-10)  # avoid divide by zero
        symbol_df['RSI'] = 100 - (100 / (1 + rs))

        # MACD
        symbol_df['MACD'] = symbol_df['EMA_12'] - symbol_df['EMA_26']
        symbol_df['MACD_Signal'] = symbol_df['MACD'].ewm(span=9, adjust=False).mean()
        symbol_df['MACD_Hist'] = symbol_df['MACD'] - symbol_df['MACD_Signal']

        # Bollinger Bands
        sma20 = symbol_df['Close'].rolling(window=20).mean()
        std20 = symbol_df['Close'].rolling(window=20).std()
        symbol_df['BB_Upper'] = sma20 + (2 * std20)
        symbol_df['BB_Lower'] = sma20 - (2 * std20)
        symbol_df['BB_Width'] = (symbol_df['BB_Upper'] - symbol_df['BB_Lower']) / sma20
        symbol_df['BB_Position'] = (symbol_df['Close'] - symbol_df['BB_Lower']) / (symbol_df['BB_Upper'] - symbol_df['BB_Lower'] + 1e-10)

        # ATR (Average True Range)
        high_low = symbol_df['High'] - symbol_df['Low']
        high_close = (symbol_df['High'] - symbol_df['Close'].shift()).abs()
        low_close = (symbol_df['Low'] - symbol_df['Close'].shift()).abs()
        true_range = pd.concat([high_low, high_close, low_close], axis=1).max(axis=1)
        symbol_df['ATR'] = true_range.rolling(window=14).mean()

        # Stochastic Oscillator
        low14 = symbol_df['Low'].rolling(window=14).min()
        high14 = symbol_df['High'].rolling(window=14).max()
        denom = (high14 - low14).replace(0, 1e-10)
        symbol_df['Stoch_K'] = 100 * ((symbol_df['Close'] - low14) / denom)
        symbol_df['Stoch_D'] = symbol_df['Stoch_K'].rolling(window=3).mean()

        # Momentum
        symbol_df['Momentum'] = symbol_df['Close'] / symbol_df['Close'].shift(10) - 1
        symbol_df['ROC'] = symbol_df['Close'].pct_change(10) * 100

        # Volume-based
        symbol_df['Volume_SMA'] = symbol_df['Volume'].rolling(window=20).mean()
        symbol_df['Volume_Ratio'] = symbol_df['Volume'] / (symbol_df['Volume_SMA'] + 1e-10)

        # Price position in recent range
        low_20 = symbol_df['Low'].rolling(window=20).min()
        high_20 = symbol_df['High'].rolling(window=20).max()
        denom_pos = (high_20 - low_20).replace(0, 1e-10)
        symbol_df['Price_Position'] = (symbol_df['Close'] - low_20) / denom_pos

        # Volatility
        symbol_df['Volatility'] = symbol_df['Close'].pct_change().rolling(20).std()

        symbol_df = symbol_df.dropna()
        result_dfs.append(symbol_df)

    final_df = pd.concat(result_dfs, ignore_index=True)
    print(f"final dataset: {len(final_df)} records")
    return final_df


## Environment and agents


In [33]:
class AdvancedTradingEnv(gym.Env):
    """Enhanced Gym environment for multi-stock trading"""
    metadata = {'render.modes': ['human']}

    def __init__(self, df, initial_balance=10000, max_steps=200, normalize=True):
        super(AdvancedTradingEnv, self).__init__()
        self.df = df.reset_index(drop=True)
        self.initial_balance = initial_balance
        self.max_steps = min(max_steps, len(df) - 1)
        self.normalize = normalize
        self.current_symbol = None

        # Shuffle the data
        self.df = self.df.sample(frac=1).reset_index(drop=True)

        # Feature columns
        self.feature_columns = [
            'Close', 'Volume', 'SMA_10', 'SMA_20', 'EMA_12', 'EMA_26',
            'RSI', 'MACD', 'MACD_Signal', 'MACD_Hist', 'BB_Upper', 'BB_Lower',
            'BB_Width', 'BB_Position', 'ATR', 'Stoch_K', 'Stoch_D',
            'Momentum', 'ROC', 'Volume_Ratio', 'Price_Position', 'Volatility'
        ]

        self.original_data = df.copy()
        self.scaler = None
        self.price_scaler = None

        # Normalize features
        if self.normalize:
            self.scaler = StandardScaler()
            self.price_scaler = StandardScaler()
            available_features = [col for col in self.feature_columns if col in df.columns]

            if available_features :
                self.df[available_features] = self.scaler.fit_transform(self.df[available_features])
            else:
                print("No available features for normalization.")

        # Actions: 0 = hold, 1 = buy, 2 = sell
        self.action_space = spaces.Discrete(3)

        # Observation space: features + portfolio state
        n_features = len(self.feature_columns) + 4  # +4 for shares_held, balance, net_worth , current_price
        self.observation_space = spaces.Box(
            low=-np.inf, high=np.inf, shape=(n_features,), dtype=np.float32
        )

    def reset(self):
        self.balance = float(self.initial_balance)
        self.net_worth = float(self.initial_balance)
        self.shares_held = 0
        self.current_step = 0
        self.previous_net_worth = self.initial_balance
        return self._get_obs()

    def _get_obs(self):
        if self.current_step >= len(self.df):
            self.current_step = len(self.df) - 1

        row = self.df.iloc[self.current_step]

        current_price = self._get_denormalized_price(row['Close'])

        # Get feature values
        features = [row[col] for col in self.feature_columns]

        max_possible_shares = self.initial_balance / max(current_price, 1)

        # Add portfolio state (normalized)
        portfolio_features = [
            self.shares_held / max(max_possible_shares, 1),  # Normalize shares
            (self.balance - self.initial_balance) / self.initial_balance,  # Normalize balance
            (self.net_worth - self.initial_balance) / self.initial_balance,  # Normalize net worth
            min(current_price / self.initial_balance, 1.0)  # Normalize current price
        ]

        obs = np.array(features + portfolio_features, dtype=np.float32)

        obs = np.nan_to_num(obs, nan=0.0, posinf=1.0, neginf=-1.0)

        return obs

    def _get_denormalized_price(self, normalized_price):

        if self.normalize and self.price_scaler is not None:
            try:

                denormalized = self.price_scaler.inverse_transform([[normalized_price]])[0][0]
                return max(denormalized, 0.01)
            except:
                return max(normalized_price, 0.01)
        return max(normalized_price, 0.01)

    def step(self, action):
        if self.current_step >= len(self.df) - 1:
            return self._get_obs(), 0, True, {}

        row = self.df.iloc[self.current_step]
        current_price = self._get_denormalized_price(row['Close'])

        prev_net_worth = self.net_worth
        transaction_cost = 1e-3

        reward = 0

        # Execute action
        if action == 1:  # Buy
            if self.balance >= current_price * (1 + transaction_cost):
                shares_to_buy = int(self.balance / (current_price * (1 + transaction_cost)))
                if shares_to_buy > 0:
                    cost = shares_to_buy * current_price * (1 + transaction_cost)
                    self.shares_held += shares_to_buy
                    self.balance -= cost
                    reward = 0.01  # Small positive reward for successful buy
                else:
                    reward = -0.05
            else:
                reward = -0.05

        elif action == 2:  # Sell
            if self.shares_held > 0:
                revenue = self.shares_held * current_price * (1 - transaction_cost)
                self.balance += revenue
                self.shares_held = 0
                reward = 0.01
            else:
                reward = -0.05  # Penalty for invalid sell

        # Move to next step
        self.current_step += 1
        done = self.current_step >= self.max_steps or self.current_step >= len(self.df) - 1

        if not done and self.current_step < len(self.df):
            next_row = self.df.iloc[self.current_step]
            next_price = self._get_denormalized_price(next_row['Close'])
            self.net_worth = self.balance + self.shares_held * next_price
        else:
            self.net_worth = self.balance + self.shares_held * current_price

        # Calculate reward
        portfolio_change = (self.net_worth - prev_net_worth) / self.initial_balance
        reward += portfolio_change * 10  # Scale portfolio change

        reward = np.clip(reward, -1.0, 1.0)

        info = {
            'net_worth': self.net_worth,
            'balance': self.balance,
            'shares': self.shares_held,
            'current_price': current_price,
            'action': action
        }

        return self._get_obs(), reward, done, info



In [24]:
class ImprovedDQN(nn.Module):
    """Enhanced DQN with Dueling architecture"""
    def __init__(self, input_dim, action_dim, hidden_dim=512):
        super(ImprovedDQN, self).__init__()

        self.feature_extractor = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.BatchNorm1d(hidden_dim),
            nn.Dropout(0.3),

            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.BatchNorm1d(hidden_dim),
            nn.Dropout(0.3),

            nn.Linear(hidden_dim, hidden_dim // 2),
            nn.ReLU(),
            nn.BatchNorm1d(hidden_dim // 2),
            nn.Dropout(0.2),
        )

        self.value_head = nn.Linear(hidden_dim // 2, 1)
        self.advantage_head = nn.Linear(hidden_dim // 2, action_dim)

    def forward(self, x):
        features = self.feature_extractor(x)

        value = self.value_head(features)
        advantage = self.advantage_head(features)

        # Bellman
        q_values = value + (advantage - advantage.mean(dim=1, keepdim=True))
        return q_values


architecture definition successful


In [25]:
class PrioritizedReplayBuffer:
    def __init__(self, capacity=50000, alpha=0.6):
        self.capacity = capacity
        self.alpha = alpha
        self.buffer = []
        self.pos = 0
        self.priorities = np.zeros((capacity,), dtype=np.float32)

    def push(self, state, action, reward, next_state, done):
        max_prio = self.priorities.max() if self.buffer else 1.0

        if len(self.buffer) < self.capacity:
            self.buffer.append((state, action, reward, next_state, done))
        else:
            self.buffer[self.pos] = (state, action, reward, next_state, done)

        self.priorities[self.pos] = max_prio
        self.pos = (self.pos + 1) % self.capacity

    def sample(self, batch_size, beta=0.4):
        if len(self.buffer) == self.capacity:
            prios = self.priorities
        else:
            prios = self.priorities[:self.pos]

        probs = prios ** self.alpha
        probs /= probs.sum()

        indices = np.random.choice(len(self.buffer), batch_size, p=probs)
        samples = [self.buffer[idx] for idx in indices]

        total = len(self.buffer)
        weights = (total * probs[indices]) ** (-beta)
        weights /= weights.max()

        return samples, indices, weights

    def update_priorities(self, indices, priorities):
        for idx, prio in zip(indices, priorities):
            self.priorities[idx] = prio

    def __len__(self):
        return len(self.buffer)

## Training

### Fetching the data

In [26]:
# symbols = ['META']
symbols = ['AAPL', 'GOOGL', 'MSFT', 'TSLA', 'AMZN', 'NVDA', 'META' , 'NFLX']
print(f"Training symbols: {symbols}")

df = fetch_data(symbols, '2018-01-01', '2023-01-01')

if df.empty:
    print("No data fetched.")
else:
    print(f"symbols: {len(df['Symbol'].unique())} ")

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed

Training symbols: ['AAPL', 'GOOGL', 'MSFT', 'TSLA', 'AMZN', 'NVDA', 'META', 'NFLX']
✓ Fetched 1259 records for AAPL
✓ Fetched 1259 records for GOOGL
✓ Fetched 1259 records for MSFT
✓ Fetched 1259 records for TSLA
✓ Fetched 1259 records for AMZN



[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


✓ Fetched 1259 records for NVDA
✓ Fetched 1259 records for META
✓ Fetched 1259 records for NFLX

📊 Total combined records: 10072
✅ Successfully fetched data for 8 symbols


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10072 entries, 0 to 10071
Data columns (total 41 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   (Open, AAPL)     1259 non-null   float64
 1   (High, AAPL)     1259 non-null   float64
 2   (Low, AAPL)      1259 non-null   float64
 3   (Close, AAPL)    1259 non-null   float64
 4   (Volume, AAPL)   1259 non-null   float64
 5   (Symbol, )       10072 non-null  object 
 6   (Open, GOOGL)    1259 non-null   float64
 7   (High, GOOGL)    1259 non-null   float64
 8   (Low, GOOGL)     1259 non-null   float64
 9   (Close, GOOGL)   1259 non-null   float64
 10  (Volume, GOOGL)  1259 non-null   float64
 11  (Open, MSFT)     1259 non-null   float64
 12  (High, MSFT)     1259 non-null   float64
 13  (Low, MSFT)      1259 non-null   float64
 14  (Close, MSFT)    1259 non-null   float64
 15  (Volume, MSFT)   1259 non-null   float64
 16  (Open, TSLA)     1259 non-null   float64
 17  (High, TSLA)

### Adding the indicators

In [27]:
def melt_stock_data(df):
    df.columns = pd.MultiIndex.from_tuples([eval(col) if isinstance(col, str) else col for col in df.columns])

    symbols = sorted(set(sym for (_, sym) in df.columns if sym not in (None, '')))
    long_df = []

    for sym in symbols:
        sym_df = df.xs(key=sym, axis=1, level=1).copy()
        sym_df['Symbol'] = sym
        sym_df['Date'] = df.index
        long_df.append(sym_df)

    final_df = pd.concat(long_df, ignore_index=True)
    final_df = final_df[['Date', 'Symbol', 'Open', 'High', 'Low', 'Close', 'Volume']]
    return final_df

In [28]:
df_melted = melt_stock_data(df)
df_with_indicators = add_indicators(df_melted)

Adding technical indicators...
Processing AAPL...
Processing AMZN...
Processing GOOGL...
Processing META...
Processing MSFT...
Processing NFLX...
Processing NVDA...
Processing TSLA...
✓ Technical indicators added. Final dataset: 9912 records


In [29]:
print(f"📈 Dataset ready with {len(df_with_indicators.columns)} features")
print(f"Feature columns: {df_with_indicators.columns.tolist()}")
print("Sample of processed data:")
print(df_with_indicators.head())

📈 Dataset ready with 28 features
Feature columns: ['Date', 'Symbol', 'Open', 'High', 'Low', 'Close', 'Volume', 'SMA_10', 'SMA_20', 'EMA_12', 'EMA_26', 'RSI', 'MACD', 'MACD_Signal', 'MACD_Hist', 'BB_Upper', 'BB_Lower', 'BB_Width', 'BB_Position', 'ATR', 'Stoch_K', 'Stoch_D', 'Momentum', 'ROC', 'Volume_SMA', 'Volume_Ratio', 'Price_Position', 'Volatility']

📊 Sample of processed data:
   Date Symbol       Open       High        Low      Close       Volume  \
0    20   AAPL  39.161878  39.530335  39.075045  39.293301  129915600.0   
1    21   AAPL  39.232277  39.572569  39.136055  39.375435  188923200.0   
2    22   AAPL  38.957694  39.145443  37.573055  37.666927  346375200.0   
3    23   AAPL  37.338362  38.460155  36.610837  36.725834  290954000.0   
4    24   AAPL  36.336269  38.422618  36.141480  38.260685  272975200.0   

      SMA_10     SMA_20     EMA_12  ...  BB_Position       ATR    Stoch_K  \
0  40.623021  40.856885  40.334648  ...     0.046834  0.658962  17.727309   
1  40.35360

## Training the model

In [30]:
# Training function
def train_multi_stock_dqn(df, episodes=1000, max_steps=300):

    gamma = 0.99
    epsilon_start = 1.0
    epsilon_end = 0.05
    epsilon_decay = 0.9995
    batch_size = 32
    target_update = 20
    learning_rate = 1e-4

    print(f"Training Configuration:")
    print(f"   Episodes: {episodes}")
    print(f"   Max steps per episode: {max_steps}")
    print(f"   Batch size: {batch_size}")
    print(f"   Learning rate: {learning_rate}")

    # Initialize environment
    env = AdvancedTradingEnv(df, initial_balance=10000, max_steps=max_steps, normalize=True)
    print(f"   Observation space: {env.observation_space.shape}")
    print(f"   Action space: {env.action_space.n}")

    # Initialize networks
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"   Device: {device}")

    policy_net = ImprovedDQN(env.observation_space.shape[0], env.action_space.n).to(device)
    target_net = ImprovedDQN(env.observation_space.shape[0], env.action_space.n).to(device)
    target_net.load_state_dict(policy_net.state_dict())
    target_net.eval()

    optimizer = optim.Adam(policy_net.parameters(), lr=learning_rate, weight_decay=1e-4)
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=200, gamma=0.9)
    memory = PrioritizedReplayBuffer(capacity=50000)

    # Training loop
    epsilon = epsilon_start
    episode_rewards = []
    episode_net_worths = []
    action_counts = [0, 0, 0]
    best_reward = float('-inf')

    print("Starting training...")

    for episode in range(episodes):
        state = env.reset()
        total_reward = 0
        episode_actions = []

        for step in range(max_steps):
            # Epsilon-greedy action selection
            if random.random() < epsilon:
                action = random.randrange(env.action_space.n)
            else:
                policy_net.eval() # Set policy_net to evaluation mode
                with torch.no_grad():
                    state_tensor = torch.FloatTensor(state).unsqueeze(0).to(device)
                    if len(state_tensor.shape) == 1:
                        state_tensor = state_tensor.unsqueeze(0)
                    q_values = policy_net(state_tensor)
                    action = q_values.argmax().item()
                policy_net.train() # Set policy_net back to training mode


            next_state, reward, done, info = env.step(action)
            if np.any(np.isnan(state)) or np.any(np.isnan(next_state)):
                continue
            memory.push(state, action, reward, next_state, done)

            state = next_state
            total_reward += reward
            episode_actions.append(action)
            action_counts[action] += 1

            # Training
            if len(memory) >= batch_size*2:
                samples, indices, weights = memory.sample(batch_size)

                states = torch.FloatTensor([s[0] for s in samples]).to(device)
                actions = torch.LongTensor([s[1] for s in samples]).to(device)
                rewards = torch.FloatTensor([s[2] for s in samples]).to(device)
                next_states = torch.FloatTensor([s[3] for s in samples]).to(device)
                dones = torch.BoolTensor([s[4] for s in samples]).to(device)
                weights_tensor = torch.FloatTensor(weights).to(device)

                current_q_values = policy_net(states).gather(1, actions.unsqueeze(1))
                next_q_values = target_net(next_states).max(1)[0].detach()
                target_q_values = rewards + (gamma * next_q_values * ~dones)

                # Compute loss with importance sampling weights
                td_errors = current_q_values.squeeze() - target_q_values
                loss = (weights_tensor * td_errors.pow(2)).mean()

                optimizer.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(policy_net.parameters(), 1.0)
                optimizer.step()

                # Update priorities
                priorities = np.abs(td_errors.detach().cpu().numpy()) + 1e-6
                memory.update_priorities(indices, priorities)

            if done:
                break

        # Update target network
        if episode % target_update == 0:
            target_net.load_state_dict(policy_net.state_dict())

        # Decay epsilon and update scheduler
        epsilon = max(epsilon_end, epsilon * epsilon_decay)
        scheduler.step()

        # Track performance
        episode_rewards.append(total_reward)
        episode_net_worths.append(info.get('net_worth', env.initial_balance))

        # Save best model

        if total_reward > best_reward:
            best_reward = total_reward
            torch.save(policy_net.state_dict(), 'best_dqn_model.pth')

        # Progress reporting
        if episode % 25 == 0:
            avg_reward = np.mean(episode_rewards[-50:])
            avg_net_worth = np.mean(episode_net_worths[-50:])
            action_dist = [c/sum(action_counts)*100 for c in action_counts]
            print(f"Episode {episode:4d}/{episodes} | "
                  f"Avg Reward: {avg_reward:8.2f} | "
                  f"Avg Net Worth: ${avg_net_worth:8.2f} | "
                  f"Epsilon: {epsilon:.3f} | "
                  f"Actions: H{action_dist[0]:.1f}% B{action_dist[1]:.1f}% S{action_dist[2]:.1f}%")

    print("Training completed!")
    print(f"Final action distribution: Hold {action_counts[0]}, Buy {action_counts[1]}, Sell {action_counts[2]}")
    return policy_net, env, episode_rewards, episode_net_worths

In [31]:
def debug_environment(df, initial_balance=10000, max_steps=10):
    """Debug the environment to understand action behavior"""
    env = AdvancedTradingEnv(df, initial_balance=initial_balance, max_steps=max_steps, normalize=True)

    print("=== ENVIRONMENT DEBUG ===")
    print(f"Initial balance: ${initial_balance}")
    print(f"Data shape: {df.shape}")
    print(f"Max steps: {max_steps}")

    state = env.reset()
    print(f"Initial state shape: {state.shape}")
    print(f"Initial state: {state}")
    print(f"Initial net worth: ${env.net_worth}")
    print(f"Initial balance: ${env.balance}")
    print(f"Initial shares: {env.shares_held}")

    for step in range(min(5, max_steps)):
        for action in range(3):
            # Save state
            saved_balance = env.balance
            saved_shares = env.shares_held
            saved_step = env.current_step

            # Try action
            next_state, reward, done, info = env.step(action)

            print(f"\nStep {step}, Action {action} ({'Hold' if action==0 else 'Buy' if action==1 else 'Sell'}):")
            print(f"  Reward: {reward:.4f}")
            print(f"  Net worth: ${info['net_worth']:.2f}")
            print(f"  Balance: ${info['balance']:.2f}")
            print(f"  Shares: {info['shares']}")
            print(f"  Price: ${info['current_price']:.2f}")
            print(f"  Done: {done}")

            # Reset state for next action test
            env.balance = saved_balance
            env.shares_held = saved_shares
            env.current_step = saved_step

        # Actually take hold action to advance
        env.step(0)
        if done:
            break

In [34]:
debug_environment(df_with_indicators, initial_balance=10000, max_steps=10)

=== ENVIRONMENT DEBUG ===
Initial balance: $10000
Data shape: (9912, 28)
Max steps: 10
Initial state shape: (26,)
Initial state: [ 3.5891861e-01 -2.5954154e-01  3.5925537e-01  3.7596849e-01
  3.8079530e-01  4.2420346e-01  4.8856180e-02 -9.0987808e-01
 -1.3026569e+00  1.1251770e+00  4.1165644e-01  3.3114782e-01
  4.0938827e-01 -2.9675043e-01  6.5532506e-01  2.9986805e-01
  8.3912653e-01  4.5360580e-01  4.5360580e-01 -5.5934042e-01
 -5.4327786e-01  2.7346188e-01  0.0000000e+00  0.0000000e+00
  0.0000000e+00  3.5891862e-05]
Initial net worth: $10000.0
Initial balance: $10000.0
Initial shares: 0

Step 0, Action 0 (Hold):
  Reward: 0.0000
  Net worth: $10000.00
  Balance: $10000.00
  Shares: 0
  Price: $0.36
  Done: False

Step 0, Action 1 (Buy):
  Reward: 1.0000
  Net worth: $28324.64
  Balance: $0.23
  Shares: 27833
  Price: $0.36
  Done: False

Step 0, Action 2 (Sell):
  Reward: -1.0000
  Net worth: $10000.00
  Balance: $10000.00
  Shares: 0
  Price: $0.36
  Done: False

Step 1, Action 0

In [35]:
trained_model, trained_env, training_rewards, training_net_worths = train_multi_stock_dqn(
    df_with_indicators,
    episodes=500,
    max_steps=50
)

Training Configuration:
   Episodes: 500
   Max steps per episode: 50
   Batch size: 32
   Learning rate: 0.0001
   Observation space: (26,)
   Action space: 3
   Device: cuda
Starting training...
Episode    0/500 | Avg Reward:    -1.48 | Avg Net Worth: $ 1139.08 | Epsilon: 1.000 | Actions: H34.0% B30.0% S36.0%
Episode   25/500 | Avg Reward:    -3.68 | Avg Net Worth: $812665626.43 | Epsilon: 0.987 | Actions: H30.3% B34.0% S35.7%
Episode   50/500 | Avg Reward:    -3.83 | Avg Net Worth: $1113166913.96 | Epsilon: 0.975 | Actions: H32.4% B34.3% S33.4%
Episode   75/500 | Avg Reward:    -3.32 | Avg Net Worth: $791495108.25 | Epsilon: 0.963 | Actions: H32.4% B34.9% S32.7%
Episode  100/500 | Avg Reward:    -3.29 | Avg Net Worth: $29621047935.27 | Epsilon: 0.951 | Actions: H32.5% B34.9% S32.6%
Episode  125/500 | Avg Reward:    -3.64 | Avg Net Worth: $29972530949.12 | Epsilon: 0.939 | Actions: H32.1% B35.7% S32.1%
Episode  150/500 | Avg Reward:    -3.47 | Avg Net Worth: $1209997915.24 | Epsilon:

### Evaluation

In [37]:
def evaluate_model(policy_net, test_symbols, start_date, end_date , initial_balance = 10000 , max_steps = 200):
    # Fetch test data
    test_df = fetch_data(test_symbols, start_date, end_date)
    test_df = melt_stock_data(test_df)
    test_df = add_indicators(test_df)

    if initial_balance <= 0:
        initial_balance = 10000
        print("initial_balance value reset to 10,000")

    # Create test environment
    test_env = AdvancedTradingEnv(test_df, initial_balance=10000, max_steps=200, normalize=True)

    # Test the trained model
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    policy_net.to(device)
    state = test_env.reset()
    dqn_values = [test_env.initial_balance]
    actions_taken = []
    prices = []

    policy_net.eval()
    with torch.no_grad():
        for step in range(min(max_steps, len(test_df) - 1)):
            try:

                state_tensor = torch.FloatTensor(state).to(device)
                if len(state_tensor.shape) == 1:
                    state_tensor = state_tensor.unsqueeze(0)
                q_values = policy_net(state_tensor)

                # Use epsilon-greedy algorithm
                if random.random() < 0.1:  # 10% random actions
                    action = random.randrange(test_env.action_space.n)
                else:
                    action = q_values.argmax().item()

                next_state, reward, done, info = test_env.step(action)

                if np.any(np.isnan(next_state)):
                    print(f"Warning: NaN detected in state at step {step}")
                    break

                state = next_state
                dqn_values.append(info['net_worth'])
                actions_taken.append(action)
                prices.append(info['current_price'])

                if step % 50 == 0:
                    print(f"Step {step}: Action={action}, Price=${info['current_price']:.2f}, "
                          f"Net Worth=${info['net_worth']:.2f}, Balance=${info['balance']:.2f}, "
                          f"Shares={info['shares']}")

                if done:
                    break

            except Exception as e:
                print(f"Error at step {step}: {e}")
                break

    # Calculate buy-and-hold baseline
    if prices:
        initial_price = prices[0]
        shares_bh = initial_balance / initial_price
        buy_hold_values = [shares_bh * price for price in prices]
        # Align lengths
        min_len = min(len(dqn_values), len(buy_hold_values))
        dqn_values = dqn_values[:min_len]
        buy_hold_values = buy_hold_values[:min_len]
    else:
        buy_hold_values = [initial_balance] * len(dqn_values)

    print(f"evaluation completed on {len(dqn_values)} time steps")
    print(f"Action distribution: Hold {actions_taken.count(0)}, Buy {actions_taken.count(1)}, Sell {actions_taken.count(2)}")

    return dqn_values, buy_hold_values, actions_taken

In [38]:
# Evaluate on test data
test_symbols = ['AAPL', 'GOOGL', 'MSFT']  # Subset for testing
dqn_values, buy_hold_values, actions = evaluate_model(
    trained_model, test_symbols, '2022-01-01', '2023-01-01' , initial_balance = 0 , max_steps = 500
)

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


✓ Fetched 251 records for AAPL
✓ Fetched 251 records for GOOGL
✓ Fetched 251 records for MSFT

📊 Total combined records: 753
Adding technical indicators...
Processing AAPL...
Processing GOOGL...
Processing MSFT...
✓ Technical indicators added. Final dataset: 693 records
initial_balance value reset to 10,000
Step 0: Action=2, Price=$0.01, Net Worth=$10000.00, Balance=$10000.00, Shares=0
Step 50: Action=0, Price=$0.01, Net Worth=$9990.01, Balance=$0.01, Shares=999000
Step 100: Action=0, Price=$0.01, Net Worth=$1177746.77, Balance=$0.01, Shares=999000
Step 150: Action=0, Price=$1.18, Net Worth=$9990.01, Balance=$0.01, Shares=999000
✅ Evaluation completed on 200 time steps
Action distribution: Hold 187, Buy 6, Sell 7


In [39]:
def plot_results(dqn_values, buy_hold_values, actions):
    """Plot results with action annotations"""
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))

    # Portfolio comparison
    axes[0,0].plot(dqn_values, label='DQN Agent', linewidth=2)
    axes[0,0].plot(buy_hold_values, label='Buy & Hold', linewidth=2)
    axes[0,0].set_title('Portfolio Value Comparison')
    axes[0,0].set_xlabel('Time Steps')
    axes[0,0].set_ylabel('Portfolio Value ($)')
    axes[0,0].legend()
    axes[0,0].grid(True, alpha=0.3)

    # Action distribution
    action_names = ['Hold', 'Buy', 'Sell']
    action_counts = [actions.count(i) for i in range(3)]

    # Handle case where no actions were taken
    if sum(action_counts) == 0:
        action_counts = [1, 1, 1]  # Equal distribution for visualization

    axes[0,1].pie(action_counts, labels=action_names, autopct='%1.1f%%')
    axes[0,1].set_title('Action Distribution')

    # Actions over time
    if actions:
        axes[1,0].plot(actions, 'o-', alpha=0.6, markersize=2)
        axes[1,0].set_title('Actions Over Time')
        axes[1,0].set_xlabel('Time Steps')
        axes[1,0].set_ylabel('Action (0=Hold, 1=Buy, 2=Sell)')
        axes[1,0].set_ylim(-0.5, 2.5)
        axes[1,0].grid(True, alpha=0.3)

    # Performance metrics
    if len(dqn_values) > 1 and len(buy_hold_values) > 1:
        dqn_return = (dqn_values[-1] - dqn_values[0]) / dqn_values[0] * 100
        bh_return = (buy_hold_values[-1] - buy_hold_values[0]) / buy_hold_values[0] * 100

        metrics = ['Total Return (%)', 'Final Value ($)']
        dqn_metrics = [dqn_return, dqn_values[-1]]
        bh_metrics = [bh_return, buy_hold_values[-1]]

        x = np.arange(len(metrics))
        width = 0.35

        axes[1,1].bar(x - width/2, dqn_metrics, width, label='DQN Agent', alpha=0.8)
        axes[1,1].bar(x + width/2, bh_metrics, width, label='Buy & Hold', alpha=0.8)
        axes[1,1].set_title('Performance Metrics')
        axes[1,1].set_xticks(x)
        axes[1,1].set_xticklabels(metrics)
        axes[1,1].legend()
        axes[1,1].grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    # Print summary
    if len(dqn_values) > 1 and len(buy_hold_values) > 1:
        print(f"\n{'='*50}")
        print("PERFORMANCE SUMMARY")
        print(f"{'='*50}")
        print(f"DQN Agent Return: {dqn_return:.2f}%")
        print(f"Buy & Hold Return: {bh_return:.2f}%")
        print(f"Outperformance: {dqn_return - bh_return:.2f}%")
        print(f"Final DQN Value: ${dqn_values[-1]:.2f}")
        print(f"Final B&H Value: ${buy_hold_values[-1]:.2f}")
        print(f"Action Counts: Hold={action_counts[0]}, Buy={action_counts[1]}, Sell={action_counts[2]}")



In [41]:
# plot_results(dqn_values, buy_hold_values, actions)