# Multi-Agent Deep Deterministic Policy Gradient (MADDPG) framework for portfolio optimization

## Summary

- **MADDPG Algorithm**: Five agents, each with an actor (policy) and critic (value estimator) neural network, learn to adjust their asset’s weight. They share the portfolio’s reward to encourage collaboration, using a replay buffer for stable training.
- **Training**: The agents train over 500 episodes, each up to 252 steps (a trading year), updating policies with batched experiences. Exploration noise helps discover optimal allocations.
- **Data**: Uses five years of daily stock returns, ensuring realistic market dynamics.

## Dependencies


In [45]:
# ! pip install --upgrade yfinance plotly nbformat seaborn scipy torch gymnasium --quiet

In [46]:
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
import gymnasium as gym
from gymnasium import spaces

import torch
import torch.nn as nn
import torch.optim as optim

import yfinance as yf

## Portfolio Environment

In [47]:
class PortfolioEnv(gym.Env):
    def __init__(self, asset_data, transaction_cost=0.001, max_steps=252):
        super(PortfolioEnv, self).__init__()
        self.asset_data = asset_data  # DataFrame with asset prices
        self.n_assets = asset_data.shape[1]
        self.transaction_cost = transaction_cost
        self.max_steps = max_steps
        self.current_step = 0
        
        # State: returns, volatility, weights
        self.observation_space = spaces.Box(
            low=-np.inf, high=np.inf, shape=(self.n_assets * 3,), dtype=np.float32
        )
        # Action: weight adjustments for each asset
        self.action_space = spaces.Box(
            low=-1.0, high=1.0, shape=(self.n_assets,), dtype=np.float32
        )
        
        self.weights = np.ones(self.n_assets) / self.n_assets
        
    def reset(self):
        self.current_step = 0
        self.weights = np.ones(self.n_assets) / self.n_assets
        return self._get_state()
    
    def step(self, action):
        # Apply action to adjust weights
        weights_new = self.weights + action * 0.1  # Scale action for stability
        weights_new = np.clip(weights_new, 0, 1)
        weights_new /= np.sum(weights_new)  # Normalize to sum to 1
        
        # Calculate portfolio return
        returns = self.asset_data.iloc[self.current_step].values
        portfolio_return = np.sum(weights_new * returns)
        
        # Calculate portfolio volatility (simplified, based on recent returns)
        window = min(self.current_step, 20)
        recent_returns = self.asset_data.iloc[max(0, self.current_step - window):self.current_step].values
        portfolio_vol = np.std(recent_returns @ weights_new) if window > 0 else 0.0
        
        # Reward: Approximate Sharpe ratio (return - risk penalty)
        reward = portfolio_return - 0.5 * portfolio_vol ** 2
        
        # Transaction costs
        cost = self.transaction_cost * np.sum(np.abs(weights_new - self.weights))
        reward -= cost
        
        self.weights = weights_new
        self.current_step += 1
        
        done = self.current_step >= self.max_steps or self.current_step >= len(self.asset_data)
        state = self._get_state()
        
        return state, reward, done, {}
    
    def _get_state(self):
        returns = self.asset_data.iloc[self.current_step].values
        window = min(self.current_step, 20)
        vols = self.asset_data.iloc[max(0, self.current_step - window):self.current_step].std().values
        return np.concatenate([returns, vols, self.weights]).astype(np.float32)


## Neural Network for Actor and Critic

In [48]:
class Actor(nn.Module):
    def __init__(self, state_dim, action_dim):
        super(Actor, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(state_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, action_dim),
            nn.Tanh()
        )
        
    def forward(self, state):
        return self.net(state)

class Critic(nn.Module):
    def __init__(self, state_dim, total_action_dim):
        super(Critic, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(state_dim + total_action_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )
        
    def forward(self, state, action):
        return self.net(torch.cat([state, action], dim=-1))

## MADDPG Agent

In [49]:
class MADDPGAgent:
    def __init__(self, state_dim, action_dim, total_action_dim, lr_actor=1e-4, lr_critic=1e-3, tau=0.01):
        self.actor = Actor(state_dim, action_dim)
        self.actor_target = Actor(state_dim, action_dim)
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=lr_actor)
        
        self.critic = Critic(state_dim, total_action_dim)
        self.critic_target = Critic(state_dim, total_action_dim)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=lr_critic)
        
        self.tau = tau
        self.action_dim = action_dim
        
    def select_action(self, state, noise_scale=0.1):
        state = torch.FloatTensor(state).unsqueeze(0)
        action = self.actor(state).detach().numpy()[0]
        noise = noise_scale * np.random.randn(self.action_dim)
        return np.clip(action + noise, -1.0, 1.0)
    
    def update(self, replay_buffer, batch_size, agents, gamma=0.99):
        states, actions, rewards, next_states, dones = replay_buffer.sample(batch_size)
        
        state = torch.FloatTensor(states[:, 0, :])  # [batch_size, state_dim]
        action = torch.FloatTensor(actions.reshape(batch_size, -1))  # [batch_size, total_action_dim]
        reward = torch.FloatTensor(rewards[:, 0])  # [batch_size]
        next_state = torch.FloatTensor(next_states[:, 0, :])  # [batch_size, state_dim]
        done = torch.FloatTensor(dones[:, 0])  # [batch_size]
        
        # Critic update
        with torch.no_grad():
            next_actions = torch.cat([agent.actor_target(next_state) for agent in agents], dim=-1)  # [batch_size, total_action_dim]
            q_next = self.critic_target(next_state, next_actions)
            q_target = reward + gamma * q_next * (1 - done)
        
        q_value = self.critic(state, action)
        critic_loss = nn.MSELoss()(q_value, q_target)
        
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()
        
        # Actor update
        pred_action = self.actor(state)
        all_actions = []
        for i, agent in enumerate(agents):
            if i == 0:  # Current agent's predicted action
                all_actions.append(pred_action)
            else:  # Other agents' actions from replay buffer
                all_actions.append(torch.FloatTensor(actions[:, i, :]))
        all_actions = torch.cat(all_actions, dim=-1)
        actor_loss = -self.critic(state, all_actions).mean()
        
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()
        
        # Soft update target networks
        for target_param, param in zip(self.critic_target.parameters(), self.critic.parameters()):
            target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)
        for target_param, param in zip(self.actor_target.parameters(), self.actor.parameters()):
            target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)

## Replay Buffer

In [50]:
class ReplayBuffer:
    def __init__(self, max_size, state_dim, action_dim, n_agents):
        self.max_size = max_size
        self.ptr = 0
        self.size = 0
        
        self.state = np.zeros((max_size, n_agents, state_dim))
        self.action = np.zeros((max_size, n_agents, action_dim))
        self.reward = np.zeros((max_size, n_agents))
        self.next_state = np.zeros((max_size, n_agents, state_dim))
        self.done = np.zeros((max_size, n_agents))
        
    def add(self, state, action, reward, next_state, done):
        self.state[self.ptr] = state
        self.action[self.ptr] = action
        self.reward[self.ptr] = reward
        self.next_state[self.ptr] = next_state
        self.done[self.ptr] = done
        
        self.ptr = (self.ptr + 1) % self.max_size
        self.size = min(self.size + 1, self.max_size)
    
    def sample(self, batch_size):
        idx = np.random.randint(0, self.size, size=batch_size)
        return (
            self.state[idx],
            self.action[idx],
            self.reward[idx],
            self.next_state[idx],
            self.done[idx]
        )

## Training Set Up

### Parameters

In [51]:
tickers = ['AAPL', 'MSFT', 'GOOGL', 'AMZN', 'TSLA']
start_date = (datetime.now() - timedelta(days=5*365)).strftime('%Y-%m-%d') # 5 years of data
end_date = datetime.now().strftime('%Y-%m-%d')
max_episodes = 500
max_steps = 252  # ~1 year of trading days
batch_size = 64
replay_buffer_size = 100000

### Fetch data

In [52]:
def fetch_data(tickers, start_date, end_date):
    data = yf.download(tickers, start=start_date, end=end_date, auto_adjust=False,)['Adj Close']
    returns = data.pct_change().dropna()
    return returns

asset_data = fetch_data(tickers, start_date, end_date)
asset_data.sample(frac=1, random_state=42)  # Shuffle data for randomness

[*********************100%***********************]  5 of 5 completed


Ticker,AAPL,AMZN,GOOGL,MSFT,TSLA
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2025-01-21,-0.031916,0.021112,0.010459,-0.001235,-0.005698
2020-09-10,-0.032646,-0.028605,-0.013689,-0.028018,0.013815
2020-06-30,0.008348,0.029264,0.014944,0.025549,0.069807
2020-07-17,-0.002021,-0.012644,0.001274,-0.005100,0.000133
2024-07-19,0.000580,-0.003374,-0.000169,-0.007403,-0.040244
...,...,...,...,...,...
2024-06-11,0.072649,0.000909,0.009200,0.011242,-0.018010
2024-08-23,0.010288,0.005167,0.011111,0.002984,0.045856
2024-10-14,0.016480,-0.006779,0.010537,0.006774,0.006244
2023-09-18,0.016913,-0.002920,0.005895,-0.003513,-0.033201


### Environment and Agents

In [53]:
env = PortfolioEnv(asset_data)
n_agents = len(tickers)
state_dim = env.observation_space.shape[0]  # 15
action_dim = 1  # Each agent controls one asset
total_action_dim = n_agents * action_dim  # 5

# Initialize agents
agents = [
    MADDPGAgent(state_dim, action_dim, total_action_dim)
    for _ in range(n_agents)
]
# Replay buffer
replay_buffer = ReplayBuffer(replay_buffer_size, state_dim, action_dim, n_agents)    

### Training Loop

In [54]:
for episode in range(max_episodes):
    state = env.reset()
    episode_reward = 0

    for step in range(max_steps):
        actions = []
        for agent in agents:
            action = agent.select_action(
                state, noise_scale=0.1 * (1 - episode / max_episodes)
            )
            actions.append(action)
        actions = np.concatenate(actions)

        next_state, reward, done, _ = env.step(actions)

        # Store transition
        replay_buffer.add(
            np.array([state] * n_agents),
            np.array([actions.reshape(n_agents, action_dim)]),
            np.array([reward] * n_agents),
            np.array([next_state] * n_agents),
            np.array([done] * n_agents),
        )

        # Update agents
        if replay_buffer.size > batch_size:
            for agent in agents:
                agent.update(replay_buffer, batch_size, agents)

        state = next_state
        episode_reward += reward

        if done:
            break

    print(f"Episode {episode + 1}/{max_episodes}, Reward: {episode_reward:.4f}")

  return F.mse_loss(input, target, reduction=self.reduction)


Episode 1/500, Reward: nan
Episode 2/500, Reward: nan
Episode 3/500, Reward: nan
Episode 4/500, Reward: nan
Episode 5/500, Reward: nan
Episode 6/500, Reward: nan
Episode 7/500, Reward: nan
Episode 8/500, Reward: nan
Episode 9/500, Reward: nan
Episode 10/500, Reward: nan
Episode 11/500, Reward: nan
Episode 12/500, Reward: nan
Episode 13/500, Reward: nan
Episode 14/500, Reward: nan
Episode 15/500, Reward: nan
Episode 16/500, Reward: nan
Episode 17/500, Reward: nan
Episode 18/500, Reward: nan
Episode 19/500, Reward: nan
Episode 20/500, Reward: nan
Episode 21/500, Reward: nan
Episode 22/500, Reward: nan
Episode 23/500, Reward: nan
Episode 24/500, Reward: nan
Episode 25/500, Reward: nan
Episode 26/500, Reward: nan
Episode 27/500, Reward: nan
Episode 28/500, Reward: nan
Episode 29/500, Reward: nan
Episode 30/500, Reward: nan
Episode 31/500, Reward: nan
Episode 32/500, Reward: nan
Episode 33/500, Reward: nan
Episode 34/500, Reward: nan
Episode 35/500, Reward: nan
Episode 36/500, Reward: nan
E

KeyboardInterrupt: 

### Save models

In [None]:
for i, agent in enumerate(agents):
    torch.save(agent.actor.state_dict(), f"actor_agent_{i}.pth")
    torch.save(agent.critic.state_dict(), f"critic_agent_{i}.pth")