In [1]:
import pennylane as qml
import pennylane.numpy as np
import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.evaluation import evaluate_policy
import matplotlib.pyplot as plt

!nvidia-smi

In [2]:
def Cliff2():
    """
    Random 2-qubit Clifford circuit.

    Arguments:
        -nodes (np.ndarray): 
    
    Returns:
        -null
    """
    
    weights = np.random.randint(2, size=(2, 10))
    
    return qml.matrix(qml.RandomLayers(weights=weights,wires=[0,1])).numpy()

In [3]:
def RandomLayers(N_QUBITS, DEPTH):
    """
    Generates brick wall pattern of random 2 qubit Clifford gates

    Arguments:
        -N_QUBITS (int): Number of qubits
        -DEPTH (int): Depth of the circuit

    Returns:
        -random_layers (np.ndarray): Array of 4x4 unitaries (N_QUBITS, DEPTH, 4, 4)
    
    """

    random_layers = []
    for t in range(DEPTH):
        layer = []
        for x in range(0,N_QUBITS,2):
                layer.append(Cliff2())
        random_layers.append(layer)

    return random_layers


In [4]:
N_QUBITS = 2*3
DEPTH = 2

# random_layers = []
# # for t in range(DEPTH):
# #         layer = []
# #         for x in range(0,N_QUBITS,2):
# #                 layer.append(Cliff2())
# #         random_layers.append(layer)

random_layers = RandomLayers(N_QUBITS,DEPTH)

dev = qml.device("default.qubit", wires=N_QUBITS)

@qml.qnode(dev)
def circuit(theta):
    """
    Quantum circuit with random entangling Clifford layers and disentangling layers.
    
    Arguments:
        -theta (np.ndarray): Binary matrix representing the positions of projections. (N_QUBITS, DEPTH)
    
    Returns:
        -Average Von Neumann entropy (float32): Average of 2-qubit Von Neumann entropies over all neighbors.
    """

    theta = theta.T
    DEPTH,N_QUBITS = np.shape(theta)

    for t in range(DEPTH):
        layer = random_layers[t]
        if t%2==0:
            for x in range(0,N_QUBITS,2):
                brick = layer[int(x/2)]
                qml.QubitUnitary(brick,wires=[x,x+1])
        elif t%2==1:
            for x in range(1,N_QUBITS-2,2):
                brick = layer[int((x-1)/2)]
                qml.QubitUnitary(brick,wires=[x,x+1])
            brick = layer[-1]
            qml.QubitUnitary(brick,wires=[N_QUBITS-1,0])
            
        projections = theta[t]
        for x in range(N_QUBITS):
            if projections[x]==1:
                qml.Projector(state=[0],wires=[x])
            
    entropies = []
    for x in range(N_QUBITS-1):
        entropies.append(qml.vn_entropy(wires=[x,x+1]))
    entropies.append(qml.vn_entropy(wires=[N_QUBITS-1,0]))
        
    return entropies

In [5]:
random_layers[0][1]

array([[-4.97490862e-01-2.48823376e-02j, -1.38777878e-17-4.45242875e-01j,
        -3.54012799e-01+5.67876398e-01j,  2.85887162e-01+1.55249344e-01j],
       [ 2.90777382e-01+1.54465493e-01j,  2.85887162e-01-6.09898057e-01j,
         3.71203570e-01-2.40565752e-01j,  2.91926582e-01+3.96228110e-01j],
       [ 9.11822553e-02-6.62943785e-01j,  1.55249344e-01+2.85887162e-01j,
         3.98294357e-01+2.99128557e-01j,  4.45242875e-01+1.38777878e-17j],
       [-2.40565752e-01+3.71203570e-01j,  2.90960383e-02+4.91295496e-01j,
        -1.54465493e-01-2.90777382e-01j,  6.09898057e-01+2.85887162e-01j]])

In [6]:
theta = np.random.randint(2, size=(N_QUBITS,DEPTH))
print(circuit(theta))
print(type(circuit(theta)))
drawer = qml.draw(circuit)

print(drawer(theta))

[0.06723857332377185, 1.4539090546957973e-15, 0.2971991799364353, 4.696891297018427e-15, 0.22996060661266332, 1.6236596776405806e-15]
<class 'list'>
0: ─╭U(M0)────────────────╭U(M5)──|0⟩⟨0|─┤ ╭vnentropy                                            
1: ─╰U(M0)─────────╭U(M3)─│──────────────┤ ╰vnentropy ╭vnentropy                                 
2: ─╭U(M1)──|0⟩⟨0|─╰U(M3)─│──────────────┤            ╰vnentropy ╭vnentropy                      
3: ─╰U(M1)──|0⟩⟨0|─╭U(M4)─│──────────────┤                       ╰vnentropy ╭vnentropy           
4: ─╭U(M2)─────────╰U(M4)─│──────────────┤                                  ╰vnentropy ╭vnentropy
5: ─╰U(M2)──|0⟩⟨0|────────╰U(M5)─────────┤                                             ╰vnentropy

  ╭vnentropy
  │         
  │         
  │         
  │         
  ╰vnentropy

M0 = 
[[-0.41310137+0.10651587j  0.2179702 +0.33404486j -0.75994442+0.18379512j
  -0.11647631-0.18450404j]
 [-0.12048319-0.1814011j   0.47829843-0.61791042j -0.21578124-0.33574001j
  

In [7]:
class Disentangler(gym.Env):
    """
    Reinforcement learning environment for the disentangler.
    """
    
    def __init__(self, n_qubits, depth):
        super(Disentangler, self).__init__()
        
        self.N_QUBITS = n_qubits
        self.DEPTH = depth

        self.action_space = gym.spaces.Discrete(self.N_QUBITS * self.DEPTH)
        self.observation_space = gym.spaces.Box(low=0, high=1, shape=(self.N_QUBITS, self.DEPTH), dtype=np.int8)
        self.state = np.zeros((self.N_QUBITS, self.DEPTH), dtype=np.int8)

    def step(self, action):
        # Initialize reward and done
        reward = 0
        done = False
        truncate = False

        # Apply the action
        h = np.zeros(self.N_QUBITS * self.DEPTH, dtype=np.int8)
        h[action] = 1
        h = h.reshape((self.N_QUBITS, self.DEPTH))
        self.state = (self.state + h) % 2

        # Calculate entropy (assumes circuit is a predefined function)
        entropies = circuit(self.state)
        entropy = np.mean(entropies)

        # Check if the state is trivial
        trivial1 = (np.sum(self.state[:, -1]) == self.N_QUBITS)
        trivial2 = (np.sum(self.state[:, -1]) == self.N_QUBITS - 1)
        trivial = trivial1 or trivial2

        # Determine reward and done conditions
        if entropy < 1e-17:
            reward = 100
            done = True
        elif trivial:
            reward = -1000
            truncate = True
        
        # Return the state, reward, done flag, and info
        info = {}
        return self.state, reward, done, truncate, info
    
    def reset(self, seed=None):
        # Seed the random number generator if a seed is provided
        if seed is not None:
            np.random.seed(seed)
        
        # Reset the state to an all-zero matrix
        self.state = np.zeros((self.N_QUBITS, self.DEPTH), dtype=np.int8)

        info = {}
        return self.state, info
    
    def render(self):
        print()

    def close(self):
        # Optional: Implement any cleanup
        pass


In [8]:
env = Disentangler(n_qubits=N_QUBITS,depth=DEPTH)
env = gym.wrappers.TimeLimit(env, max_episode_steps=10)
env = Monitor(env, allow_early_resets=True)

In [9]:
# Testing the environment
env = Disentangler(n_qubits=N_QUBITS, depth=DEPTH)
obs = env.reset()
print(f"Initial Observation: \n {obs}")

action = env.action_space.sample()
obs, reward, done, truncate, info = env.step(action)
print(f"Observation: \n {obs}, Reward: {reward}, Done: {done}, Truncate: {truncate}, Info: {info}")

Initial Observation: 
 (tensor([[0, 0],
        [0, 0],
        [0, 0],
        [0, 0],
        [0, 0],
        [0, 0]], dtype=int8, requires_grad=True), {})
Observation: 
 [[0 0]
 [0 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]], Reward: 0, Done: False, Truncate: False, Info: {}


In [10]:
model = PPO('MlpPolicy', env, verbose=1, learning_rate=0.01)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [11]:
model.learn(total_timesteps=100)

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 36.3     |
|    ep_rew_mean     | -686     |
| time/              |          |
|    fps             | 128      |
|    iterations      | 1        |
|    time_elapsed    | 15       |
|    total_timesteps | 2048     |
---------------------------------


<stable_baselines3.ppo.ppo.PPO at 0x13a32773a90>

In [12]:
model.save("disentangler_ppo")

In [13]:
env = Disentangler(n_qubits=N_QUBITS,depth=DEPTH)

In [19]:
env.training = False
model = PPO.load("disentangler_ppo", env=env)

num_episodes = 10
total_rewards = []

for _ in range(num_episodes):
    obs = env.reset()
    done = False
    episode_reward = 0

    while not done:
        action, _states = model.predict(obs)
        obs, reward, done, info = env.step(action)
        episode_reward += reward

    total_rewards.append(episode_reward)
    print(f"Episode Reward: {episode_reward}")

avg_reward = np.mean(total_rewards)
print(f"Average Reward: {avg_reward}")
plt.plot(total_rewards)

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


ValueError: You have passed a tuple to the predict() function instead of a Numpy array or a Dict. You are probably mixing Gym API with SB3 VecEnv API: `obs, info = env.reset()` (Gym) vs `obs = vec_env.reset()` (SB3 VecEnv). See related issue https://github.com/DLR-RM/stable-baselines3/issues/1694 and documentation for more information: https://stable-baselines3.readthedocs.io/en/master/guide/vec_envs.html#vecenv-api-vs-gym-api