In [None]:
# Cell 1: Imports and Environment Configuration (Model C: Pure RL)
# Imports necessary libraries for Deep Reinforcement Learning (DRL), numerical computation, and visualization.

import math
import random
import matplotlib
import matplotlib.pyplot as plt
from collections import namedtuple, deque
from itertools import count

import json
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

# Pick the compute device once: the GPU when CUDA is usable, otherwise the CPU.
# Both `device` and `torch_device` are defined and refer to the same device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch_device = device

# Report the CUDA status; GPU name and CUDA build are printed only when a GPU exists.
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("Device name:", torch.cuda.get_device_name(0))
    print("Torch CUDA version:", torch.version.cuda)

In [None]:
# Cell 2: Module Import from Definition Notebook
# Enables importing functions and classes directly from Jupyter Notebook files (.ipynb).

import import_ipynb

# Imports the Quantum Environment ('qc') and the Optimization Function ('opt_classifier')
# from the definition notebook 'model_c_definition_rl.ipynb'.
from model_c_definition_rl  import qc, opt_classifier

In [None]:
# Cell 3: Experience Replay Memory
# Implements a cyclic buffer to store transitions (experiences) for off-policy training.
# This breaks temporal correlations in the data sequence, stabilizing the learning process.

# Record type for one stored experience. Field order matters because callers
# build it positionally: state, action, next_state, reward, done.
Transition = namedtuple(
    'Transition',
    ['state', 'action', 'next_state', 'reward', 'done'],
)

class ReplayMemory:
    """
    Fixed-capacity experience buffer backed by a bounded deque.

    Once `capacity` items are stored, appending a new one discards the oldest.
    """

    def __init__(self, capacity):
        """
        Creates an empty buffer.
        Args:
            capacity (int): Upper bound on the number of stored transitions.
        """
        self.memory = deque(maxlen=capacity)

    def push(self, *args):
        """
        Packs the positional arguments into a Transition and appends it.
        """
        transition = Transition(*args)
        self.memory.append(transition)

    def sample(self, batch_size):
        """
        Returns `batch_size` stored items drawn at random without replacement
        (delegates to random.sample, which raises ValueError if too few are stored).
        """
        return random.sample(self.memory, batch_size)

    def __len__(self):
        """
        Returns how many items are currently stored.
        """
        return len(self.memory)

In [None]:
# Cell 4: Deep Q-Network (DQN) Model Definition
# Defines the neural network architecture used to approximate the Q-value function.

class DQN(nn.Module):
    """
    Fully connected Q-network: flatten -> Linear(256) -> ReLU -> Linear(256) -> ReLU -> Linear.
    Produces one Q-value per action for every observation in the batch.
    """

    def __init__(self, n_observations, n_actions):
        """
        Builds the three linear layers.
        Args:
            n_observations: Number of input features after flattening.
            n_actions: Number of outputs (one Q-value per action).
        """
        super().__init__()
        hidden_units = 256
        self.flatten = nn.Flatten()                             # collapses all non-batch dims
        self.layer1 = nn.Linear(n_observations, hidden_units)   # input  -> hidden
        self.layer2 = nn.Linear(hidden_units, hidden_units)     # hidden -> hidden
        self.layer3 = nn.Linear(hidden_units, n_actions)        # hidden -> Q-values

    def forward(self, x):
        """
        Computes Q-values for a batch of observations.
        ReLU follows each hidden layer; the output layer is left linear.
        """
        hidden = F.relu(self.layer1(self.flatten(x)))
        hidden = F.relu(self.layer2(hidden))
        return self.layer3(hidden)

In [None]:
# Cell 5: Hyperparameters Configuration
# Defines key parameters controlling the Reinforcement Learning process.

# --- Replay / update settings ---
BATCH_SIZE = 128      # transitions drawn from replay memory for each update
GAMMA = 0.8           # discount factor; 0.8 weights near-term rewards more heavily
TAU = 5e-3            # soft-update rate for the target network parameters
LR = 1e-3             # learning rate handed to the optimizer

# --- Epsilon-greedy exploration schedule ---
EPS_START = 0.9       # epsilon at step 0 (mostly exploration)
EPS_END = 0.05        # floor that epsilon decays towards (mostly exploitation)
EPS_DECAY = 500       # decay time constant in steps (raised from 200 for the large action space)

# --- Run length ---
num_episodes = 500    # number of training episodes

In [None]:
# Cell 6: Environment Initialization
# Sets up the quantum circuit environment and defines state/action dimensions.

# Create the environment object exported by the definition notebook (`qc`).
env = qc()

# Size of the discrete action space, read from the environment's `act_space` attribute.
n_actions = env.act_space 

# Note: Unlike Model B, Model C (Pure RL) does NOT use 'weights_ent_global'.
# No pre-trained entanglement parameters are initialized or passed.

# Reset the environment (called without arguments) so that `env.obs` holds the
# starting observation before its size is measured below.
env.reset()

# Input width of the DQN (the network flattens each observation before layer1).
# NOTE(review): `env.obs * 4` is evaluated first and `len(...)` second. If env.obs is a
# Python list, `* 4` repeats it four times, so this equals 4 * len(env.obs) — which
# matches the flattened size only when every entry of env.obs has exactly 4 features
# (presumably [Rot, CNOT, Param1, Param2]; confirm against the `qc` definition).
# If env.obs were an array type, `* 4` would scale values instead and the length
# would not be multiplied — verify the type of env.obs.
n_observations = len(env.obs * 4)

In [None]:
# Cell 7: Network Initialization and Optimizer Setup
# Initializes the Policy Network and Target Network for the DQN algorithm.
# The Target Network provides stable Q-value targets (Fixed Q-Targets) to prevent oscillation.

# Build the two Q-networks with identical shapes (policy first, then target) and
# place both on `device`.
policy_net, target_net = (
    DQN(n_observations, n_actions).to(device) for _ in range(2)
)

# The target network starts as an exact copy of the policy network's weights.
initial_weights = policy_net.state_dict()
target_net.load_state_dict(initial_weights)

# AdamW with the AMSGrad variant enabled updates the policy network only.
optimizer = optim.AdamW(params=policy_net.parameters(), lr=LR, amsgrad=True)

# Replay buffer holding at most 10,000 transitions.
memory = ReplayMemory(capacity=10_000)

# Global count of action selections; drives the epsilon decay schedule.
steps_done = 0

In [None]:
# Cell 8: Action Selection Strategy (Epsilon-Greedy)
# Implements the policy for selecting actions based on the current state.
# Balances Exploration (random action) and Exploitation (best known action) using an epsilon decay schedule.

def select_action(state):
    """
    Chooses an action with an epsilon-greedy rule and advances the global step counter.

    Epsilon decays exponentially from EPS_START towards EPS_END with time
    constant EPS_DECAY, evaluated at the current value of `steps_done`.

    Args:
        state (tensor): Current observation, batched so that policy_net(state)
            has actions along dimension 1.
    Returns:
        tensor: A (1, 1) long tensor holding the chosen action index.
    """
    global steps_done

    # Draw first, then evaluate the schedule at the pre-increment step count.
    draw = random.random()
    decay = math.exp(-1. * steps_done / EPS_DECAY)
    eps_threshold = EPS_END + (EPS_START - EPS_END) * decay
    steps_done += 1

    # Exploration: the draw fell at or below epsilon, so ask the environment for an action.
    if draw <= eps_threshold:
        return torch.tensor([[env.sample()]], device=device, dtype=torch.long)

    # Exploitation: take the index of the largest Q-value predicted by the policy network.
    with torch.no_grad():
        q_values = policy_net(state)
        best_action = q_values.max(1)[1]   # max(1) -> (values, indices); keep indices
        return best_action.view(1, 1)

In [None]:
# Cell 9: Model Optimization Function
# Performs a single step of Stochastic Gradient Descent (SGD) to update the Policy Network.
# Utilizes Experience Replay and Fixed Q-Targets to stabilize training.

def optimize_model():
    """
    Runs one gradient step on the policy network from a replay-memory minibatch.

    Minimizes the Huber loss between Q(s, a) predicted by `policy_net` and the
    Bellman target
        y = r + GAMMA * max_a' Q_target(s', a')
    where the bootstrap term is dropped (y = r) for transitions whose stored
    `done` flag is truthy, so finished transitions do not borrow value from a
    successor state.

    Returns None immediately while the replay memory holds fewer than
    BATCH_SIZE transitions. Reads the module-level `memory`, `policy_net`,
    `target_net`, `optimizer`, `device`, `BATCH_SIZE` and `GAMMA`.
    """
    # 1. Wait until a full minibatch can be drawn.
    if len(memory) < BATCH_SIZE:
        return

    # 2. Sample transitions at random and transpose the list of Transitions
    #    into a single Transition whose fields are tuples of length BATCH_SIZE.
    transitions = memory.sample(BATCH_SIZE)
    batch = Transition(*zip(*transitions))

    # 3. Stack the per-transition tensors along the batch dimension.
    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)
    next_state_batch = torch.cat(batch.next_state)

    # The `done` field is stored as a plain Python value per transition;
    # bool() normalizes it before building the mask.
    done_mask = torch.tensor([bool(flag) for flag in batch.done],
                             dtype=torch.bool, device=device)

    # 4. Q(s_t, a_t): evaluate the policy network and pick the column of the
    #    action that was actually taken.
    state_action_values = policy_net(state_batch).gather(1, action_batch)

    # 5. V(s_{t+1}) = max_a Q_target(s_{t+1}, a), computed without gradients
    #    because the target is treated as a constant. Entries for finished
    #    transitions are zeroed so their target reduces to the reward alone.
    with torch.no_grad():
        next_state_values = target_net(next_state_batch).max(1)[0]
        next_state_values = next_state_values.masked_fill(done_mask, 0.0)

    # 6. Bellman target.
    expected_state_action_values = (next_state_values * GAMMA) + reward_batch

    # 7. Huber loss (SmoothL1Loss); the target is unsqueezed to match the
    #    (BATCH_SIZE, 1) shape produced by gather().
    criterion = nn.SmoothL1Loss()
    loss = criterion(state_action_values, expected_state_action_values.unsqueeze(1))

    # 8. Backpropagate and update the policy network.
    optimizer.zero_grad()
    loss.backward()

    # Clamp every gradient element to [-100, 100] before the optimizer step.
    torch.nn.utils.clip_grad_value_(policy_net.parameters(), 100)

    optimizer.step()

In [None]:
# Cell 10: Training Log Initialization
# Initializes lists to store performance metrics and circuit data across all episodes.

# Per-episode reward logs: the best single-step reward, and the sum over all steps.
reward_ep_list, reward_sum_ep_list = [], []

# Per-episode snapshots taken at the best-reward step: circuit drawing and optimization log.
obs_ep_list, outs_ep_list = [], []

# Per-episode visualization payloads: one for the best-reward step, one for the final step.
figset_ep_list, final_ep_list = [], []

# Per-episode accuracy metrics (last value and maximum, for training and validation).
acc_train_last_ep, acc_val_last_ep = [], []
acc_train_max_ep, acc_val_max_ep = [], []

# Per-episode final cost value.
cost_ep = []

In [None]:
# Cell 11: Main Training Loop
# Executes the reinforcement learning process over a specified number of episodes.
# Interactions between the agent (DQN) and the environment (Quantum Circuit) occur here.

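# To measure this cell's wall-clock time, uncomment `%%time` and move it to the very
# first line of the cell (a cell magic is only recognized as the first line of a cell).
# %%time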

tau = 0.05 # NOTE(review): never read below — the soft update uses TAU (0.005, Cell 5), not this lower-case value.

# Outer loop: one iteration per training episode.
for i_episode in range(num_episodes):
    # 1. Episode Initialization
    # Reset the environment to start a new circuit search.
    # Note: Unlike Model B, Model C does not pass 'weights_ent_global' to reset().
    env.reset()
    
    # Read the initial observation and wrap it as a float32 tensor with a leading batch dimension.
    state = env.obs
    state = torch.tensor(state, dtype=torch.float32, device=device).unsqueeze(0)

    # Per-step logs for this episode only (reset every episode).
    reward_list = []; obs_list = []; outs_list = []; figset_list = []
    
    # 2. Step-by-Step Interaction
    # count() is unbounded: the episode ends only through one of the `break`s below.
    for t in count():
        # Action Selection (Epsilon-Greedy); returns a (1, 1) tensor.
        action = select_action(state)
        
        # Execute Action in the Environment
        # A falsy return value from env.step() is treated as a failed step.
        truncated = not env.step(action.item())

        # On a failed step the episode is abandoned immediately: nothing is stored in
        # replay memory and none of the per-episode (*_ep) lists receive an entry.
        if truncated:
            print('truncated error')
            break
        
        # Retrieve Transition Data from the environment's attributes after the step.
        observation = env.obs
        reward = env.reward
        terminated = env.term
        done = env.done

        # Tensor conversion for storage in Replay Memory
        reward = torch.tensor([reward], device=device)
        next_state = torch.tensor(observation, dtype=torch.float32, device=device).unsqueeze(0)
        
        # Store Transition in Replay Memory.
        # NOTE(review): the stored flag is env.done, while the loop below ends on env.term;
        # confirm in the `qc` definition how these two flags differ.
        memory.push(state, action, next_state, reward, done)
        
        # Move to the next state
        state = next_state

        # 3. Model Optimization (DQN Update)
        optimize_model()

        # 4. Target Network Soft Update
        # Slowly update target network parameters towards policy network parameters.
        # theta_target = TAU * theta_policy + (1 - TAU) * theta_target
        target_net_state_dict = target_net.state_dict()
        policy_net_state_dict = policy_net.state_dict()
        for key in policy_net_state_dict:
            target_net_state_dict[key] = policy_net_state_dict[key]*TAU + target_net_state_dict[key]*(1-TAU)
        target_net.load_state_dict(target_net_state_dict)
        
        # 5. Logging (Per Step)
        reward_list.append(float(reward))  # reward is a 1-element tensor here
        obs_list.append(env.draw)     # Snapshot of env.draw (presumably the circuit diagram)
        outs_list.append(env.outs)    # Snapshot of env.outs (optimization log)
        figset_list.append(env.figset)# Snapshot of env.figset (presumably weights/data for figures)
        
        # 6. Episode Termination Handling
        if terminated:
            # Extract metric columns from env.outs: index 1 is used as cost, 2 as training
            # accuracy and 3 as validation accuracy (column meaning inferred from the
            # variable names — confirm against opt_classifier's output format).
            cost_series = [row[1] for row in env.outs]
            acc_train_series = [row[2] for row in env.outs]
            acc_val_series   = [row[3] for row in env.outs]

            # "last" is the final row's value; "max" is the best value over all rows.
            acc_train_last = float(acc_train_series[-1])
            acc_val_last   = float(acc_val_series[-1])
            acc_train_max  = float(max(acc_train_series))
            acc_val_max    = float(max(acc_val_series))
            cost_val = float(cost_series[-1])

            # Append episode metrics to global lists
            acc_train_last_ep.append(acc_train_last)
            acc_val_last_ep.append(acc_val_last)
            acc_train_max_ep.append(acc_train_max)
            acc_val_max_ep.append(acc_val_max)
            cost_ep.append(cost_val)

            # Identify the best step within the episode (highest reward).
            # list.index returns the first occurrence, so ties resolve to the earliest step.
            max_value = max(reward_list)
            max_index = reward_list.index(max_value)
            
            # Store the best single-step reward, the episode's reward sum, and the
            # snapshots recorded at the best step.
            reward_ep_list.append(max_value)
            reward_sum_ep_list.append(sum(reward_list))
            obs_ep_list.append(obs_list[max_index])
            outs_ep_list.append(outs_list[max_index])
            
            # Store visualization data for the best step and the final step, each paired
            # with the prefix of env.gatestream up to and including that step.
            figset_ep_list.append([figset_list[max_index], env.gatestream[:max_index+1]])
            final_ep_list.append([figset_list[-1], env.gatestream[:t+1]])

            # 7. Final Episode Processing (Visualization & Saving) — last episode only.
            if i_episode == num_episodes - 1:
                # Call opt_classifier on the final gate sequence with iters=0 and draw=True;
                # only the second returned value (the drawing) is used below.
                outs_final, draw_p, _ = opt_classifier(
                    env.gatestream,
                    iters=0, draw=True
                )
                # Print and save the drawing only when one was returned.
                if draw_p is not None:
                    print(f"\n=== Final circuit (episode #{i_episode+1}, steps={t+1}) ===")
                    print(draw_p)
                    with open("final_circuit.txt", "w", encoding="utf-8") as f:
                        f.write(draw_p)

                # Save the final gate sequence as JSON (written even when no drawing exists).
                with open("final_gatestream.json", "w", encoding="utf-8") as f:
                    json.dump(env.gatestream, f, ensure_ascii=False, indent=2)
                    
            # One summary line per completed episode: episode number, step count, and metrics.
            print(f"Episode complete : {i_episode+1} ({t+1}) | cost={cost_val:.3f}, acc_val_last={acc_val_last:.3f}, acc_val_max={acc_val_max:.3f}")
            break

In [None]:
# Cell 12: Training Performance Visualization (Cumulative Reward)
# Plots the total reward accumulated per episode to analyze the agent's learning convergence.
# An upward trend indicates that the agent is successfully learning to construct better circuits.

# Draw the per-episode reward totals, then label both axes and render the figure.
plt.plot(reward_sum_ep_list)
plt.ylabel('Reward sum')
plt.xlabel('Episode')
plt.show()

In [None]:
# Cell 13: Training Performance Visualization (Maximum Reward)
# Plots the maximum reward achieved in each episode.
# This metric helps identify if the agent is discovering high-quality solutions (peak performance),
# even if the average performance fluctuates due to exploration.

# Draw the best single-step reward of each episode, then label both axes and render.
plt.plot(reward_ep_list)
plt.ylabel('Max Reward')
plt.xlabel('Episode')
plt.show()

In [None]:
# Cell 14: Data Persistence (Saving Experiment Results)
# Serializes and saves all training metrics, circuit structures, and performance logs to disk.
# This ensures that the experimental results are preserved for post-training analysis and visualization.

import pickle

# Each entry pairs an output file name with the list pickled into it.
# Files are written in the order listed, in the current working directory.
# NOTE(review): acc_train_last_ep and acc_train_max_ep are collected during
# training but are not written here — confirm whether that is intentional.
_PICKLE_TARGETS = (
    # Reward metrics (training trajectory)
    ('reward_ep_list_non.pkl', reward_ep_list),
    ('reward_sum_ep_list_non.pkl', reward_sum_ep_list),
    # Circuit snapshots and optimization logs
    ('obs_ep_list_non.pkl', obs_ep_list),
    ('outs_ep_list_non.pkl', outs_ep_list),
    # Visualization data for the best and the final step
    ('figset_ep_list_non.pkl', figset_ep_list),
    ('final_ep_list_non.pkl', final_ep_list),
    # Validation accuracy and cost
    ('acc_val_max_ep_non.pkl', acc_val_max_ep),
    ('acc_val_last_ep_non.pkl', acc_val_last_ep),
    ('cost_ep_non.pkl', cost_ep),
)

for _pkl_path, _pkl_payload in _PICKLE_TARGETS:
    with open(_pkl_path, 'wb') as f:
        pickle.dump(_pkl_payload, f)