# Alessio Code

In [1]:
import gymnasium as gym
import numpy as np
import pennylane as qml

from stable_baselines3.common.env_checker import check_env
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.vec_env import SubprocVecEnv

# Plotting and monitoring
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common import results_plotter
from stable_baselines3.common.results_plotter import ts2xy, load_results
from stable_baselines3.common.noise import NormalActionNoise
from stable_baselines3.common.callbacks import BaseCallback

import os
LOG_DIR = os.path.join( os.getcwd(), 'log')
os.makedirs(LOG_DIR, exist_ok=True)

## Chapter 1: Single quibt superposition state

In [37]:
X, Y, Z = qml.PauliX, qml.PauliY, qml.PauliZ
Id = qml.Identity(wires=[0]).matrix()

def layer(params):
    U = qml.RX(params[1],wires=0).matrix() @ qml.RZ(params[0],wires=0).matrix()
    return U

def profit(state,target_state):
    return np.abs(np.dot(target_state.conj(), state))**2

# Solution:
# [0, np.pi/2], [np.pi/2, 0] = [2, 4], [4, 2]
# [0, -np.pi/2], [-np.pi/2, 0] = [2, 0], [0, 2]
# target_state = qml.Hadamard(wires=0).matrix() @ np.array([1,0])
# state = layer([np.pi/2, 0]) @ layer([0, np.pi/2]) @ np.array([1,0])
# profit(target_state, state)

In [3]:
psi0 = np.array([1,0])

class Parametric_env(gym.Env):
    MAX_STEPS = 4
    INFIDELITY_THRESHOLD = 1e-06

    def __init__(self, env_conf):
        self.target = env_conf["Target"]
        self.alpha = env_conf["Lagrange_time"]

        self.U = Id
        self.psi=psi0
        self.psi0=psi0
        
        self.action_space = gym.spaces.MultiDiscrete([5,5])
        self.observation_space = gym.spaces.Box(low=-1, high=1, shape=(4,), dtype=np.float64)
        
    def _get_obs(self):
        ob = np.concatenate([np.real(self.psi),np.imag(self.psi)])
        return ob
    
    def reset(self,seed=None, options=None):
        super().reset(seed=seed)
        self.U = Id
        self.psi=psi0
        self.fidelity = profit(self.psi,self.target)
        self.count = 0
        self.reward = 0
        self.done = False
        self.duration = 0
        observation = self._get_obs()
        self.info = {}

        return observation,{}
    
    def _get_params(self,para):
        az = 0
    
        if para==0:
            az = -np.pi/2
        elif para == 1:
            az = -np.pi/4
        elif para == 2:
            az = 0
        elif para == 3:
            az = np.pi/4
        elif para == 4:
            az = np.pi/2
        return az

    def step(self, action):
        if self.done:
            print("EPISODE DONE!!!")
        elif (self.count == self.MAX_STEPS):
            self.done = True
        else:
            assert self.action_space.contains(action)
            self.count += 1
            
        self.params = [self._get_params(action[0]),self._get_params(action[1])] 
        op = layer(self.params)

        self.U = op @ self.U  
        self.psi = self.U @ self.psi0

        self.fidelity = profit(self.psi,self.target)
        self.info = {"Fidelity": self.fidelity}

        if self.done:
            self.reward = self.fidelity
        else:
            self.reward = 0

        observation = self._get_obs()
        return [observation, self.reward, self.done,self.done, self.info]

In [4]:
rho_target = np.array([0,1])
env = Parametric_env(env_conf={"Target": rho_target,"Lagrange_time":1})
check_env(env, warn=True)

In [5]:
target_gate = qml.Hadamard(wires=0).matrix()
target_state = target_gate @ psi0

array([0.70710678, 0.70710678])

In [6]:
vec_env_test = make_vec_env(lambda:Parametric_env(env_conf={"Target": target_state, "Lagrange_time":0})
, n_envs=1)

vec_env = make_vec_env(lambda:Parametric_env(env_conf={"Target": target_state, "Lagrange_time":0})
, n_envs=2,seed=0, vec_env_cls=SubprocVecEnv)

model = PPO("MlpPolicy", vec_env, verbose=1)

Using cpu device


In [7]:
mean_reward, std_reward = evaluate_policy(model, vec_env_test, n_eval_episodes=10)
print(f"Mean reward: {mean_reward} +/- {std_reward:.2f}")

Mean reward: 0.6401649999999999 +/- 0.00


In [8]:
model.learn(50_000)
# model.save("Simple_example")
# model_loaded = PPO.load("Simple_example")

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 5        |
|    ep_rew_mean     | 0.519    |
| time/              |          |
|    fps             | 641      |
|    iterations      | 1        |
|    time_elapsed    | 6        |
|    total_timesteps | 4096     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 5           |
|    ep_rew_mean          | 0.52        |
| time/                   |             |
|    fps                  | 456         |
|    iterations           | 2           |
|    time_elapsed         | 17          |
|    total_timesteps      | 8192        |
| train/                  |             |
|    approx_kl            | 0.012180576 |
|    clip_fraction        | 0.138       |
|    clip_range           | 0.2         |
|    entropy_loss         | -3.21       |
|    explained_variance   | -1.22       |
|    learning_rate        | 0.

<stable_baselines3.ppo.ppo.PPO at 0x2627858bf90>

In [9]:
mean_reward, std_reward = evaluate_policy(model, vec_env_test, n_eval_episodes=10)
print(f"Mean reward: {mean_reward} +/- {std_reward:.2f}")

Mean reward: 1.0 +/- 0.00


In [10]:
obs = vec_env.reset()
n_steps = 8
for step in range(n_steps):
    action, _ = model.predict(obs, deterministic= False)
    print(f"Step {step + 1}")
    print("Action: ", action)
    obs, reward, done, info = vec_env.step(action)
    print("reward=", reward)
    print("Fidelity:", info)
    if done.all():
        if info[0]["Fidelity"] > 0.99:
            print("Goal reached", "Fidelity=", info)
        else:
            print(" Max number of layers", "Fidelity=", info[0]["Fidelity"],"Fidelity=",info[1]["Fidelity"])
        break

Step 1
Action:  [[1 1]
 [3 2]]
reward= [0 0]
Fidelity: ({'Fidelity': np.float64(0.5000000000000001), 'TimeLimit.truncated': False}, {'Fidelity': np.float64(0.4999999999999999), 'TimeLimit.truncated': False})
Step 2
Action:  [[0 4]
 [1 3]]
reward= [0 0]
Fidelity: ({'Fidelity': np.float64(0.8535533905932742), 'TimeLimit.truncated': False}, {'Fidelity': np.float64(0.4999999999999999), 'TimeLimit.truncated': False})
Step 3
Action:  [[3 0]
 [3 0]]
reward= [0 0]
Fidelity: ({'Fidelity': np.float64(1.0000000000000004), 'TimeLimit.truncated': False}, {'Fidelity': np.float64(0.7499999999999999), 'TimeLimit.truncated': False})
Step 4
Action:  [[2 4]
 [1 3]]
reward= [0 0]
Fidelity: ({'Fidelity': np.float64(1.0000000000000009), 'TimeLimit.truncated': False}, {'Fidelity': np.float64(0.9267766952966368), 'TimeLimit.truncated': False})
Step 5
Action:  [[2 4]
 [2 2]]
reward= [1.        0.9267767]
Fidelity: ({'Fidelity': np.float64(1.0000000000000009), 'episode': {'r': 1.0, 'l': 5, 't': 136.604352}, 'Ti

### Monitoring

In [14]:
class SaveOnBestTrainingRewardCallback(BaseCallback):
    def __init__(self, check_freq, log_dir, verbose=1):
        
        super().__init__(verbose)
        self.check_freq = check_freq
        self.log_dir = log_dir
        self.save_path = os.path.join(log_dir, "best_model")
        self.best_mean_reward = -np.inf

    def _on_step(self):
        if self.n_calls % self.check_freq == 0:

            # Retrieve training reward
            x, y = ts2xy(load_results(self.log_dir), "timesteps")
            if len(x) > 0:
                # Mean training reward over the last 100 episodes
                mean_reward = np.mean(y[-100:])
                if self.verbose > 0:
                    print(f"Num timesteps: {self.num_timesteps}")
                    print(
                        f"Best mean reward: {self.best_mean_reward:.2f} - Last mean reward per episode: {mean_reward:.2f}"
                    )

                # New best model, you could save the agent here
                if mean_reward > self.best_mean_reward:
                    self.best_mean_reward = mean_reward
                    # Example for saving best model
                    if self.verbose > 0:
                        print(f"Saving new best model to {self.save_path}.zip")
                    self.model.save(self.save_path)

        return True

In [None]:
vec_env = Parametric_env(env_conf={"Target": target,"Lagrange_time":alpha})

vec_env = Monitor(vec_env, LOG_DIR)
callback = SaveOnBestTrainingRewardCallback(check_freq=1000, log_dir=LOG_DIR)

model_monitor = PPO("MlpPolicy", vec_env, verbose=1)
model_monitor.learn(total_timesteps=50_000, callback=callback)

Using cpu device
Wrapping the env in a DummyVecEnv.
Num timesteps: 1000
Best mean reward: -inf - Last mean reward per episode: 0.53
Saving new best model to C:\Users\Dennis Herb\OneDrive\2_Uni\Doktor\python_projects\NVcenter\development\log\best_model.zip
Num timesteps: 2000
Best mean reward: 0.53 - Last mean reward per episode: 0.51
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 5        |
|    ep_rew_mean     | 0.49     |
| time/              |          |
|    fps             | 676      |
|    iterations      | 1        |
|    time_elapsed    | 3        |
|    total_timesteps | 2048     |
---------------------------------
Num timesteps: 3000
Best mean reward: 0.53 - Last mean reward per episode: 0.52
Num timesteps: 4000
Best mean reward: 0.53 - Last mean reward per episode: 0.50
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 5           |
|    ep_rew_mean          | 0.493     

In [None]:
def moving_average(values, window):
    weights = np.repeat(1.0, window) / window
    return np.convolve(values, weights, "valid")

def plot_results(log_dir, title="Learning Curve"):

    x, y = ts2xy(load_results(log_dir), "timesteps")
    y = moving_average(y, window=50)
    # Truncate x
    x = x[len(x) - len(y) :]

    fig = plt.figure(title)
    plt.plot(x, y)
    plt.xlabel("Number of Timesteps")
    plt.ylabel("Rewards")
    plt.title(title + " Smoothed")
    plt.show()

In [None]:
plot_results(LOG_DIR)

## Chapter 2: Theory