In [None]:
# Cell 1: Imports and Environment Configuration
# Imports necessary libraries for Deep Reinforcement Learning (DRL), numerical computation, and visualization.

import math
import random
import matplotlib
import matplotlib.pyplot as plt
from collections import namedtuple, deque
from itertools import count

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

In [None]:
# Cell 2: Device Configuration
# Place tensors on the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
# Cell 3: Module Import from Definition Notebook
# Imports functions and classes directly from the definition notebook.
import import_ipynb
from parity_definition import qc  # Imports the Quantum Environment class

In [None]:
# Cell 4: Experience Replay Memory
# Implements a cyclic buffer to store transitions (experiences) for off-policy training.
# This breaks temporal correlations in the data sequence, stabilizing the learning process.

# Record type for one MDP transition; `done` is stored next to (s, a, s', r).
Transition = namedtuple(
    'Transition',
    ['state', 'action', 'next_state', 'reward', 'done'],
)


class ReplayMemory:
    """
    Fixed-capacity FIFO store of `Transition` records for experience replay.

    Once `capacity` records are held, every new record evicts the oldest one.
    """

    def __init__(self, capacity):
        """
        Creates an empty buffer.
        Args:
            capacity (int): Largest number of transitions kept at any time.
        """
        self.memory = deque(maxlen=capacity)

    def push(self, *args):
        """
        Packs the positional values into a `Transition` and appends it.
        Args:
            *args: state, action, next_state, reward, done (in that order).
        """
        record = Transition(*args)
        self.memory.append(record)

    def sample(self, batch_size):
        """
        Draws `batch_size` distinct stored transitions uniformly at random.
        Returns:
            list: The selected `Transition` records.
        """
        picked = random.sample(self.memory, batch_size)
        return picked

    def __len__(self):
        """
        Reports how many transitions are currently stored.
        """
        return len(self.memory)

In [None]:
# Cell 5: Deep Q-Network (DQN) Model Definition
# Defines the neural network architecture used to approximate the Q-value function.

class DQN(nn.Module):
    """
    Fully connected Q-network: flattened observation in, one Q-value per action out.
    Two ReLU hidden layers of 256 units feed a linear output layer.
    """

    def __init__(self, n_observations, n_actions):
        """
        Builds the layers.
        Args:
            n_observations: Number of input features per sample after flattening.
            n_actions: Number of actions, i.e. the width of the output layer.
        """
        super().__init__()
        hidden_units = 256
        self.flatten = nn.Flatten()  # collapses every dimension after the batch axis
        self.layer1 = nn.Linear(n_observations, hidden_units)  # input  -> hidden
        self.layer2 = nn.Linear(hidden_units, hidden_units)    # hidden -> hidden
        self.layer3 = nn.Linear(hidden_units, n_actions)       # hidden -> Q-values

    def forward(self, x):
        """
        Computes Q-values for a batch of observations.
        Returns:
            Tensor with one row per sample and one unactivated Q-value per action.
        """
        features = self.flatten(x)
        for hidden in (self.layer1, self.layer2):
            features = F.relu(hidden(features))
        return self.layer3(features)

In [None]:
# Cell 6: Hyperparameters Configuration
# Defines key parameters controlling the Reinforcement Learning process.

# Replay sampling and return discounting
BATCH_SIZE = 128     # transitions drawn from replay memory for each optimisation step
GAMMA = 0.8          # discount applied to the bootstrapped next-state value

# Epsilon-greedy schedule: eps = EPS_END + (EPS_START - EPS_END) * exp(-steps / EPS_DECAY)
EPS_START = 0.9      # epsilon at step 0 (mostly random actions)
EPS_END = 0.05       # floor that epsilon decays towards
EPS_DECAY = 200      # decay constant, measured in action-selection steps

# Optimisation
TAU = 5e-3           # blend factor of the soft target-network update
LR = 1e-3            # learning rate handed to the optimizer

In [None]:
# Cell 7: Environment Initialization
# Sets up the quantum circuit environment and defines state/action dimensions.

# Build the parity environment exported by the definition notebook.
env = qc()

# Number of discrete actions, as reported by the environment.
n_actions = env.act_space
env.reset()

# Input width of the Q-network.
# The network flattens each observation tensor, so the width must equal the total
# number of elements in one observation. Measuring the tensor directly replaces the
# earlier `len(env.obs * 4)`, which relied on list repetition and a hard-coded factor
# of 4 and would give the wrong size if `env.obs` were a 2-D array.
# NOTE(review): assumes `env.obs` after reset() has the same shape as every later
# observation — confirm against the environment definition.
n_observations = torch.tensor(env.obs, dtype=torch.float32).numel()

In [None]:
# Cell 8: Network Initialization and Optimizer Setup
# Initializes the Policy Network and Target Network for the DQN algorithm.

# Online (policy) network first, target network second; both live on `device`.
policy_net, target_net = (
    DQN(n_observations, n_actions).to(device) for _ in range(2)
)
# Start the target network as an exact copy of the policy network.
target_net.load_state_dict(policy_net.state_dict())

# Only the policy network is trained; AdamW with the AMSGrad variant enabled.
optimizer = optim.AdamW(params=policy_net.parameters(), lr=LR, amsgrad=True)

# Replay buffer holding at most 10,000 transitions.
memory = ReplayMemory(10_000)

# Global count of action selections; drives the epsilon decay.
steps_done = 0

In [None]:
# Cell 9: Action Selection Strategy (Epsilon-Greedy)
# Implements the policy for selecting actions based on the current state.

def select_action(state):
    """
    Picks an action for `state` with an epsilon-greedy rule.

    Epsilon decays exponentially from EPS_START towards EPS_END as the global
    `steps_done` counter grows; the counter is incremented on every call.

    Args:
        state: Input handed unchanged to `policy_net`.
    Returns:
        A 1x1 tensor holding the chosen action index: a random one from
        `env.sample()` when exploring, otherwise the index of the largest
        Q-value predicted by `policy_net`.
    """
    global steps_done
    draw = random.random()

    # Current exploration probability, then advance the schedule.
    decay = math.exp(-1. * steps_done / EPS_DECAY)
    epsilon = EPS_END + (EPS_START - EPS_END) * decay
    steps_done += 1

    if draw <= epsilon:
        # Explore: let the environment propose a random action.
        return torch.tensor([[env.sample()]], device=device, dtype=torch.long)

    # Exploit: greedy action under the policy network, no gradient tracking.
    with torch.no_grad():
        q_values = policy_net(state)
        return q_values.max(1)[1].view(1, 1)

In [None]:
# Cell 10: Model Optimization Function
# Performs a single step of Stochastic Gradient Descent (SGD) to update the Policy Network.

def optimize_model():
    """
    Runs one optimisation step of the policy network on a random replay batch.

    Does nothing until `memory` holds at least BATCH_SIZE transitions. Otherwise it
    samples BATCH_SIZE transitions, builds the one-step target
    `reward + GAMMA * max_a' Q_target(next_state, a')`, and minimises the Huber
    (smooth L1) loss between that target and `Q_policy(state, action)`.

    Transitions whose stored `done` flag is truthy contribute `reward` only: their
    next-state value is forced to zero instead of being bootstrapped from the
    target network. Previously the flag was stored but never used.
    NOTE(review): assumes `done` marks transitions after which no further reward
    can be collected — confirm against the environment definition.

    Returns:
        None. The policy network parameters are updated in place via `optimizer`.
    """
    if len(memory) < BATCH_SIZE:
        return

    # Sample a batch and regroup it field by field (list of Transitions -> Transition of tuples).
    transitions = memory.sample(BATCH_SIZE)
    batch = Transition(*zip(*transitions))

    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)
    next_state_batch = torch.cat(batch.next_state)
    done_mask = torch.tensor([bool(flag) for flag in batch.done],
                             dtype=torch.bool, device=device)

    # Q(s, a) for the actions that were actually taken; one column per sample.
    state_action_values = policy_net(state_batch).gather(1, action_batch)

    # max_a' Q_target(s', a'), with finished transitions zeroed out.
    with torch.no_grad():
        next_state_values = target_net(next_state_batch).max(1)[0]
        next_state_values = next_state_values.masked_fill(done_mask, 0.0)

    # Bellman target, cast to the prediction dtype so the loss never mixes dtypes
    # (e.g. when rewards were stored as integers or float64).
    expected_state_action_values = (next_state_values * GAMMA) + reward_batch
    expected_state_action_values = expected_state_action_values.to(state_action_values.dtype)

    # Huber loss between predictions and targets, both shaped (BATCH_SIZE, 1).
    criterion = nn.SmoothL1Loss()
    loss = criterion(state_action_values, expected_state_action_values.unsqueeze(1))

    # Gradient step with element-wise gradient clipping to [-100, 100].
    optimizer.zero_grad()
    loss.backward()
    torch.nn.utils.clip_grad_value_(policy_net.parameters(), 100)
    optimizer.step()

In [None]:
# Cell 11: Logging Initialization
# Initializes lists to track performance metrics across all training episodes.

# Per-episode histories; each list gains one entry when an episode terminates.
reward_ep_list = list()      # highest single-step reward of the episode
reward_sum_ep_list = list()  # sum of all step rewards of the episode
obs_ep_list = list()         # `env.draw` captured at the highest-reward step
outs_ep_list = list()        # `env.outs` captured at the highest-reward step

In [None]:
# Cell 12: Training Configuration
# Number of episodes the training loop will run.
num_episodes = 5_000

In [None]:
# Cell 13: Main Training Loop
# Executes the reinforcement learning process.

# %%time
for i_episode in range(num_episodes):
    # Start a fresh episode; the initial observation gets a leading batch axis.
    env.reset()
    state = env.obs
    state = torch.tensor(state, dtype=torch.float32, device=device).unsqueeze(0)

    # Per-step logs for this episode only.
    reward_list = []
    obs_list = []
    outs_list = []

    for t in count():
        # 1. Action Selection
        action = select_action(state)
        # A falsy return from env.step() is treated as a failed step and aborts the
        # episode without logging it.
        truncated = not env.step(action.item())

        if truncated:
            print('truncated error')
            break

        # 2. Read the outcome of the step from the environment's attributes.
        observation = env.obs
        reward = env.reward
        terminated = env.term
        done = env.done

        # Pin the reward to float32 so it matches the network outputs no matter
        # whether the environment reports a Python int/float or a NumPy scalar.
        reward = torch.tensor([reward], dtype=torch.float32, device=device)
        next_state = torch.tensor(observation, dtype=torch.float32, device=device).unsqueeze(0)

        # 3. Store Transition
        memory.push(state, action, next_state, reward, done)
        state = next_state

        # 4. One optimisation step on the policy network (no-op until the buffer is large enough).
        optimize_model()

        # 5. Soft update of the target network: theta_target <- TAU*theta_policy + (1-TAU)*theta_target
        target_net_state_dict = target_net.state_dict()
        policy_net_state_dict = policy_net.state_dict()
        for key in policy_net_state_dict:
            target_net_state_dict[key] = policy_net_state_dict[key]*TAU + target_net_state_dict[key]*(1-TAU)
        target_net.load_state_dict(target_net_state_dict)

        # 6. Logging
        reward_list.append(reward)
        obs_list.append(env.draw)
        outs_list.append(env.outs)

        # 7. Episode end: keep the best step's reward/circuit/log and the reward total.
        if terminated:
            max_value = max(reward_list)
            max_index = reward_list.index(max_value)

            reward_ep_list.append(max_value)
            reward_sum_ep_list.append(sum(reward_list))
            obs_ep_list.append(obs_list[max_index])
            outs_ep_list.append(outs_list[max_index])

            print("Episode complete : ", i_episode+1,"(", t+1, ")")
            break

In [None]:
# Cell 14: Data Persistence Setup
import pickle

In [None]:
# Cell 15: Saving Experiment Results
# Serializes and saves the training results to disk.

# Each history list is pickled to its own file in the current working directory.
for filename, payload in (
    ('reward_ep_list.pkl', reward_ep_list),
    ('reward_sum_ep_list.pkl', reward_sum_ep_list),
    ('obs_ep_list.pkl', obs_ep_list),
    ('outs_ep_list.pkl', outs_ep_list),
):
    with open(filename, 'wb') as file:
        pickle.dump(payload, file)