# 1. Libraries & Sample Data
The first step is to load our Python Libraries and download the sample data. The dataset represents Apple stock price (1d bars) for the year 2010

In [1]:
# Load Python Libraries
import random
import numpy as np
import pandas as pd
from collections import deque

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

%config InlineBackend.figure_format = 'retina'

device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

In [2]:
# load data, cleaned, not normalized, with features
data = pd.read_csv("../datasets/AAPL_2009-2010_6m_features_1d.csv")
state_features = ["Date", "Close", "BB_upper", "BB_lower"]
data = data[state_features]
data.set_index("Date", inplace=True)
data.head()

Unnamed: 0_level_0,Close,BB_upper,BB_lower
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2009-04-30,4.493929,4.570118,4.055596
2009-05-01,4.544286,4.578017,4.09959
2009-05-04,4.716786,4.642685,4.092351
2009-05-05,4.716786,4.699974,4.083704
2009-05-06,4.732143,4.750376,4.083481


# 2. Train / Test Split
Now that we have loaded our cleaned price dataset, we are ready to feed the data into our model. With this in mind, we select Close as our singular training feature, and split the data ito train and test data (80/20 split)

In [3]:
# split dataset df into train (80%) and test (20%) datasets
train_df = data.sample(frac=0.8, random_state=42)
test_df = data.drop(train_df.index)

In [4]:
# display train and test dfs (ensure no overlap)
print("Train df")
print(train_df.head())
print("Test df")
print(test_df.head())

Train df
               Close  BB_upper  BB_lower
Date                                    
2009-09-23  6.625000  6.735803  5.710983
2009-07-20  5.461071  5.419319  4.678538
2009-07-21  5.411071  5.471442  4.676915
2009-11-16  7.379643  7.522641  6.647538
2009-12-11  6.952500  7.530241  6.692152
Test df
               Close  BB_upper  BB_lower
Date                                    
2009-05-01  4.544286  4.578017  4.099590
2009-05-20  4.495357  4.763943  4.271236
2009-05-29  4.850357  4.901817  4.249398
2009-06-01  4.976786  4.968877  4.225587
2009-06-23  4.786071  5.192462  4.702146


In [5]:
# convert train and test dfs to np arrays with dtype=float
X_train = train_df.values.astype(float)
X_test = test_df.values.astype(float)
# print the shape of X_train to remind yourself how many examples and features are in the dataset
print(X_train.shape)

(137, 3)


# 3. Define the Agent
Now that our data is ready to use, we can define the Reinforcement Learning Agent.

### Define the DQN Model
The first step in defining our agent is the Deep Q-Network model definition. 
- we are creating a model sequential model with four layers. The first three layers have output shape of 64, 32, and 8, respectively, and a RELU activation. 
- The output layer has an output shape of the size of our action space (buy, sell, hold), and a linear activation.

In [6]:
class QNetwork(nn.Module):
    """Actor (Policy) Model."""

    def __init__(self, state_size, action_size=3, seed=42):
        """Initialize parameters and build model.
        Params
        ======
            state_size (int): Dimension of each state
            action_size (int): Dimension of each action (default=3 for buy, sell, hold)
            seed (int): Random seed
        """
        super(QNetwork, self).__init__()
        self.seed = torch.manual_seed(seed)
        self.Q = nn.Sequential(
            nn.Linear(state_size, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 8),
            nn.ReLU(),
            nn.Linear(8, action_size),  # Linear activation by default
        )

    def forward(self, state):
        """Build a network that maps state -> action values."""
        actions = self.Q(state)
        return actions


# test model
q_net = QNetwork(state_size=8)  # action_size defaults to 3 (buy, sell, hold)
# fake input, batch size 4
states = torch.rand((4, 8))
# fake output
print(q_net(states).shape)  # Should print torch.Size([4, 3])

torch.Size([4, 3])


### Define Experience Replay Buffer
The Experience Replay Buffer is a key component of our agent implementation. It consists of a deque data structure with a fixed maximum length (buffer_size) that stores experiences as tuples of (state, action, reward, next_state, done). 

The buffer serves two main purposes:
1. Storing and sampling experiences for training - Random batches of experiences are drawn from the buffer to train the agent
2. Maintaining recent experiences - The last n experiences (where n is the batch size) are kept to calculate target Q-values for the current state during training


In [7]:
from collections import namedtuple


class ReplayBuffer:
    """Fixed-size buffer to store experience tuples."""

    def __init__(self, buffer_size, batch_size, seed):
        """Initialize a ReplayBuffer object.

        Params
        ======
            batch_size (int): size of each training batch
            buffer_size (int): maximum size of buffer
            seed (int): random seed
        """
        self.batch_size = batch_size
        self.memory = deque(maxlen=buffer_size)
        self.experience = namedtuple(
            "Experience",
            field_names=["state", "action", "reward", "next_state", "done"],
        )
        self.seed = random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        """Add a new experience to memory."""
        e = self.experience(state, action, reward, next_state, done)
        self.memory.append(e)

    def _to_tensor(self, data, dtype=torch.float):
        """Convert numpy array to tensor with specified dtype and device in one operation"""
        return torch.from_numpy(np.vstack(data)).to(device=device, dtype=dtype)

    def sample(self):
        """Randomly sample a batch of experiences from memory."""

        experiences = random.sample(self.memory, k=self.batch_size)
        experiences = [e for e in experiences if e is not None]
        states, actions, rewards, next_states, dones = zip(*experiences)

        states = self._to_tensor(states)
        actions = self._to_tensor(actions, dtype=torch.long)
        rewards = self._to_tensor(rewards)
        next_states = self._to_tensor(next_states)
        dones = self._to_tensor(dones, dtype=torch.uint8)

        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        """Return the current size of internal memory."""
        return len(self.memory)

### Define Agent Class

The Agent class implements a Deep Q-Learning agent that interacts with a trading environment. This implementation features a Double DQN (Deep Q-Network) architecture with experience replay for stable learning.

#### Main Methods

1. **step(state, action, reward, next_state, done)**
   - Stores experience in replay buffer
   - Triggers learning every `update_step` steps
   - Returns loss value when learning occurs

2. **select_action(state, epsilon)**
   - Implements epsilon-greedy policy for action selection
   - In test mode: Always selects best action
   - In training mode: Balances exploration and exploitation

3. **learn(experiences)**
   - Updates Q-Network parameters using sampled experiences
   - Implements Double DQN learning algorithm:
     1. Computes TD targets using target network
     2. Computes current Q-values using local network
     3. Updates networks using MSE loss
     4. Performs soft update of target network

4. **Network Updates**
   - `soft_update()`: Gradually updates target network (θ_target = α×θ + (1-α)×θ_target)
   - `hard_update()`: Directly copies parameters from local to target network

5. **Model Persistence**
   - `save(filename)`: Saves model parameters to file
   - `load(filename)`: Loads model parameters from file


In [8]:
class Agent:
    """Interacts with and learns from the environment."""

    def __init__(
        self,
        num_features,
        window_size,
        test_mode=False,
        buffer_size=int(1e5),
        batch_size=64,
        gamma=0.99,
        alpha=1e-3,
        lr=5e-4,
        update_step=4,
        seed=42,
    ):
        """Initialize an Agent object.

        Key Features:
            - State space: Concatenated window of features (window_size * num_features)
            - Action space: 3 discrete actions (0=hold, 1=buy, 2=sell)
            - Experience replay: Stores and samples past experiences for stable learning
            - Target network: Updated softly for stable training

        Params
        ======
            buffer_size (int): Maximum size of experience replay buffer (default: 1e5)
            batch_size (int): Size of each training batch (default: 64)
            gamma (float): Discount factor for future rewards (default: 0.99)
            alpha (float): Soft update interpolation parameter (default: 1e-3)
            lr (float): Learning rate for optimizer (default: 5e-4)
            update_step (int): Frequency of network updates (default: 4)
            test_mode (bool): Flag for switching between training and testing behavior
        """
        self.seed = random.seed(seed)
        self.test_mode = test_mode

        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.lr = lr
        self.gamma = gamma
        self.alpha = alpha
        self.update_step = update_step

        self.state_size = window_size * num_features
        self.action_size = 3

        # Q-Network
        self.Q = QNetwork(self.state_size, self.action_size, seed).to(device)
        self.Q_target = QNetwork(self.state_size, self.action_size, seed).to(device)
        self.optimizer = optim.Adam(self.Q.parameters(), lr=self.lr)

        # Replay memory
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size, seed)
        # Initialize time step (for updating every update step)
        self.t_step = 0

    def __repr__(self):
        return (
            f"Q Network Arch: {self.Q}\n"
            f"State space size: {self.state_size}\n"
            f"Action space size: {self.action_size}\n"
            f"Current Memory size: {len(self.memory)}"
        )

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every update_step time steps.
        self.t_step = (self.t_step + 1) % self.update_step
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > self.batch_size:
                experiences = self.memory.sample()
                loss = self.learn(experiences)
                return loss

    def select_action(self, state, epsilon=0.0):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.Q.eval()
        with torch.no_grad():
            actions = self.Q(state)

        if self.test_mode:
            return np.argmax(actions.cpu().data.numpy())

        self.Q.train()
        # Epsilon-greedy action selection
        if random.random() <= epsilon:
            return random.choice(np.arange(self.action_size))
        else:
            return np.argmax(actions.cpu().data.numpy())

    def learn(self, experiences):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        self.optimizer.zero_grad()

        # get experiences
        states, actions, rewards, next_states, dones = experiences

        # compute td targets using target network
        with torch.no_grad():
            Q_targets_next = torch.max(
                self.Q_target(next_states), dim=-1, keepdim=True
            )[0]
            Q_targets = rewards + (1 - dones) * self.gamma * Q_targets_next

        # compute curr values using local network
        Q_expected = torch.gather(self.Q(states), dim=-1, index=actions)

        # compute mean squared loss using td error
        loss = F.mse_loss(Q_expected, Q_targets)
        loss.backward()

        # update local network parameters
        self.optimizer.step()

        # update target network parameters
        self.soft_update()

        return loss

    def soft_update(self):
        """Soft update model parameters.
        θ_target = alpha*θ + (1 - alpha)*θ_target
        =>
        θ_target = θ_target + alpha*(θ - θ_target)

        Params
        ======
            Q (PyTorch model): weights will be copied from
            Q_target (PyTorch model): weights will be copied to
            alpha (float): interpolation parameter
        """
        for target_param, local_param in zip(
            self.Q_target.parameters(), self.Q.parameters()
        ):
            target_param.data.copy_(
                target_param.data + self.alpha * (local_param.data - target_param.data)
            )

    def hard_update(self):
        """Hard update: θ_target = θ"""
        for target_param, local_param in zip(
            self.Q_target.parameters(), self.Q.parameters()
        ):
            target_param.data.copy_(local_param.data)

    def save(self, filename):
        """Save model parameters."""
        torch.save(self.Q.state_dict(), filename)

    def load(self, filename):
        """Load model parameters."""
        checkpoint = torch.load(filename)
        self.Q.load_state_dict(checkpoint)

#### Usage Modes

1. **Training Mode** (`test_mode=False`)
   - Enables exploration through epsilon-greedy policy
   - Updates networks through learning process
   - Stores experiences in replay buffer

2. **Testing Mode** (`test_mode=True`)
   - Disables exploration (always selects best action)
   - No network updates
   - Useful for evaluation and deployment

In [9]:
agent = Agent(num_features=X_train.shape[1], window_size=1)
print(agent)

Q Network Arch: QNetwork(
  (Q): Sequential(
    (0): Linear(in_features=3, out_features=64, bias=True)
    (1): ReLU()
    (2): Linear(in_features=64, out_features=32, bias=True)
    (3): ReLU()
    (4): Linear(in_features=32, out_features=8, bias=True)
    (5): ReLU()
    (6): Linear(in_features=8, out_features=3, bias=True)
  )
)
State space size: 3
Action space size: 3
Current Memory size: 0
