# Reinforcement Learning

In [1]:
%load_ext autoreload
%autoreload 2

import os
ROOT_DIR = os.getcwd()[:os.getcwd().rfind('NVcenter')]+ 'NVcenter'
os.chdir(ROOT_DIR)

from NVcenter import *
plt.style.use('NVcenter-default')

import numpy as np 
import matplotlib.pyplot as plt
import qutip as q

## Alessio

In [2]:
# relaxation
T1 = 6e-3 
relax_rate = 0#1/T1 # relaxation
relax_op = np.sqrt(relax_rate) * q.tensor(q.sigmap(), q.qeye(2))

# dephasing
T2 = 35e-6
deph_rate = 0#(-1/T1+2/T2)/2 # dephasing
deph_op = np.sqrt(0.5 * deph_rate) * q.tensor(q.sigmaz(), q.qeye(2))

rabi_freq = 0.5e6
proj_NV0 = q.ket2dm(q.basis(2,0))
proj_NV1 = q.ket2dm(q.basis(2,1))

def H_Suter():   
    H = (-0.158e6) * q.tensor(proj_NV0, 0.5*q.sigmaz()) + (-0.158e6 + 0.152e6) * q.tensor(proj_NV1, 0.5*q.sigmaz()) + (-0.110e6) * q.tensor(proj_NV1, 0.5*q.sigmax()) 
    return 2 * np.pi * H

def H_MW(phi):
    H = rabi_freq * np.cos(phi)/2 * q.tensor(q.sigmax(), q.qeye(2)) + rabi_freq * np.sin(phi)/2 * q.tensor(q.sigmay(), q.qeye(2))
    return 2 * np.pi * H

def U_free_super(free_time):
    H = H_Suter()
    c_ops = [relax_op, deph_op]
    L = q.liouvillian(H, c_ops)
    return (free_time*L).expm()

def U_pulse_super(pulse_time, phi):
    H = H_Suter() + H_MW(phi)
    c_ops = [relax_op, deph_op]
    L = q.liouvillian(H, c_ops)
    return (pulse_time*L).expm()
    # return scipy.linalg.expm(pulse_time*L.full())

def calc_superop(pulse_seq, num_pulses):
    U = 1
    for i in range(num_pulses):
        free_time, pulse_time, phi = pulse_seq[i], pulse_seq[i+num_pulses], pulse_seq[i+2*num_pulses]
        U = U_pulse_super(pulse_time, phi) * U_free_super(free_time) * U
    if len(pulse_seq) != 3*num_pulses:
        U = U_free_super(pulse_seq[-1]) * U
    return U

In [3]:
num_pulses = 3
pulse_seq = [0.74e-6, 0.22e-6, 0.43e-6, 0.23e-6, 1.26e-6, 1.50e-6, 3*np.pi/2, 3*np.pi/2, np.pi/2, 0.89e-6] # Suter Hadamard
superop = calc_superop(pulse_seq, num_pulses)

target_hada = q.tensor(q.qeye(2), q.gates.hadamard_transform())
target_superop = q.tensor(target_hada.conj(), target_hada)
target_superop.dims = [target_hada.dims, target_hada.dims]

calc_fidelity(superop, target_superop)

np.float64(0.9437899973536489)

In [4]:
num_pulses = 3
pulse_seq = [3.78e-6, 2.11e-6, 2.15e-6, 1.88e-6, 3.96e-6, 1.9e-6, 0, np.pi/5, np.pi/2, 0.63e-6] # Suter CNOT
superop = calc_superop(pulse_seq, num_pulses)

target_cnot = q.tensor(proj_NV0, q.qeye(2))  -1j * q.tensor(proj_NV1, q.sigmax())
target_superop = q.tensor(target_cnot.conj(), target_cnot)
target_superop.dims = [target_cnot.dims, target_cnot.dims]

calc_fidelity(superop, target_superop)

np.float64(0.9797014459277309)

## Lindblad Environment 

In [8]:
import gymnasium as gym

class Parametric_env(gym.Env):
    MAX_STEPS = 2
    INFIDELITY_THRESHOLD = 0.1

    def __init__(self, target_superop):
        self.target_superop = target_superop

        # action and observation spaces
        self.action_space = gym.spaces.Box(low=-1, high= 1, shape=(3,), dtype=np.float32)
        self.observation_space = gym.spaces.Box(low=-1, high=1, shape=(512,), dtype=np.float64)

        self.reset()

    def _get_obs(self):
        A = self.superop.full()
        real = A.flatten().real.tolist()
        imag = A.flatten().imag.tolist()
        return np.array(real + imag)

    def reset(self,seed=None, options=None):
        super().reset(seed=seed)
        self.superop = U_free_super(0)
        self.fidelity = calc_fidelity(self.superop, self.target_superop)
        self.count = 0
        self.reward = 0
        self.done = False
        self.duration = 0
        self.observation = self._get_obs()
        self.info = {}

        return self.observation,{}

    def step(self, action):
        if self.done:
            print("EPISODE DONE!!!")
        elif (self.count == self.MAX_STEPS):
            self.done = True
        else:
            assert self.action_space.contains(action)
            self.count += 1
            
        pulse_seq = [
            2.0e-6*np.abs(action[0]), #+ np.abs(0.1),
            2.0e-6*np.abs(action[1]), #+ np.abs(0.1),
            2*np.pi*action[2]
        ]
        superop_layer =  calc_superop(pulse_seq, 1)
        
        self.superop = superop_layer @ self.superop
        self.fidelity = calc_fidelity(self.superop, self.target_superop)
        self.info = {"Fidelity": self.fidelity}

        if 1-self.fidelity < self.INFIDELITY_THRESHOLD:
            self.done = True

        if self.done:
            self.reward = -np.log(1-self.fidelity)
        else:
            self.reward = 0

        self.observation = self._get_obs()

        return (self.observation, self.reward, self.done,self.done, self.info)

In [10]:
target_hada = q.tensor(q.qeye(2), q.gates.hadamard_transform())
target_superop = q.tensor(target_hada.conj(), target_hada)
target_superop.dims = [target_hada.dims, target_hada.dims]

param_env = Parametric_env(target_superop=target_superop) 
param_env.step( [0.74/2, 0.23/2, 3/4] )
param_env.step( [0.22/2, 1.26/2, 3/4] )
param_env.step([0.43/2, 1.50/2, 1/4] )
param_env.step( [0.89/2, 0, 0] )
param_env.fidelity

EPISODE DONE!!!


np.float64(0.9437899973536489)

In [9]:
from stable_baselines3.common.env_checker import check_env

param_env = Parametric_env(target_superop = U_free_super(0))
check_env(param_env, warn=True)

In [10]:
from stable_baselines3 import PPO

target_hada = q.tensor(q.qeye(2), q.gates.hadamard_transform())
target_superop = q.tensor(target_hada.conj(), target_hada)
target_superop.dims = [target_hada.dims, target_hada.dims]

param_env = Parametric_env(target_superop=target_superop)
model = PPO("MlpPolicy", param_env, learning_rate=0.00005,
            gamma=0.99,n_steps=1024,batch_size=256,
            clip_range=0.3, n_epochs=10,ent_coef=0.003, 
            verbose = 1,tensorboard_log="./ppo_cartpole_tensorboard/diss3/").learn(10000)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Logging to ./ppo_cartpole_tensorboard/diss3/PPO_5
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 3        |
|    ep_rew_mean     | 0.14     |
| time/              |          |
|    fps             | 697      |
|    iterations      | 1        |
|    time_elapsed    | 1        |
|    total_timesteps | 1024     |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 3            |
|    ep_rew_mean          | 0.166        |
| time/                   |              |
|    fps                  | 665          |
|    iterations           | 2            |
|    time_elapsed         | 3            |
|    total_timesteps      | 2048         |
| train/                  |              |
|    approx_kl            | 6.786431e-05 |
|    clip_fraction        | 0            |


## Cluster Expansion Environment

In [2]:
C13_pos = (8.728883757198979e-10, 0.0, 1.8558998769620693e-10) # Dominik
register_config = [('NV', (0, 0, 0), 0, {}), ('C13', C13_pos, 0, {})]

directory = os.getcwd()
filename = os.path.join('baths', 'dominik_bath')
bath_configs_nested = load_spin_baths(filename, directory)
bath_configs = [item for sublist in bath_configs_nested for item in sublist]

# Bell (Hadamard and CNOT)
init_state = q.tensor( q.fock_dm(2,0), q.fock_dm(2,0) )
bell_gate = get_cnot_gate(2, 0, 1) * get_hada_gate(2, 0)
bell_state = bell_gate * init_state * bell_gate.dag()

kwargs = dict(verbose=False, env_approx_level="gCCE1", bath_configs=bath_configs, suter_method=True, target=bell_state)
env = Environment2(register_config, **kwargs)

In [3]:
env.reset()
env.step([2*0-1, 2*0.99813568/(2*np.pi)-1, 2*0.69459264/(2*np.pi)-1 ])
env.step([ 2*4.06620465/5-1, 2*3.57557112/(2*np.pi)-1, 2*1.97327426/(2*np.pi)-1])
env.step([2*1.57022726/5-1, 2*1.68300382/(2*np.pi)-1, 2*0.50816523/(2*np.pi)-1])
env.step([2*1.50788214/5-1, 2*0-1, 2*0-1 ], instant_pulses=True)
env.fidelity

np.float64(0.9847317310965369)

In [4]:
from stable_baselines3.common.env_checker import check_env
check_env(env, warn=True)

In [5]:
from stable_baselines3 import PPO

kwargs = dict(verbose=False, env_approx_level="gCCE1", bath_configs=bath_configs, suter_method=True, target=bell_state)
env = Environment2(register_config, **kwargs)

model = PPO("MlpPolicy", env, learning_rate=0.00005,
            gamma=0.99,n_steps=1024,batch_size=256,
            clip_range=0.3, n_epochs=10,ent_coef=0.003, 
            verbose = 1,tensorboard_log="./ppo_cartpole_tensorboard/diss3/").learn(300)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Logging to ./ppo_cartpole_tensorboard/diss3/PPO_29
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 4        |
|    ep_rew_mean     | 0.322    |
| time/              |          |
|    fps             | 50       |
|    iterations      | 1        |
|    time_elapsed    | 20       |
|    total_timesteps | 1024     |
---------------------------------


In [17]:
from stable_baselines3 import PPO

kwargs = dict(verbose=False, env_approx_level="gCCE0", bath_configs=bath_configs, suter_method=True, target=bell_state)
env = Environment2(register_config, **kwargs)

model = PPO("MlpPolicy", env, learning_rate=0.00005,
            gamma=0.99,n_steps=1024,batch_size=256,
            clip_range=0.3, n_epochs=10,ent_coef=0.003, 
            verbose = 1,tensorboard_log="./ppo_cartpole_tensorboard/diss3/").learn(300)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Logging to ./ppo_cartpole_tensorboard/diss3/PPO_28
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 4        |
|    ep_rew_mean     | 0.375    |
| time/              |          |
|    fps             | 117      |
|    iterations      | 1        |
|    time_elapsed    | 8        |
|    total_timesteps | 1024     |
---------------------------------


In [31]:
from stable_baselines3.common.env_util import make_vec_env
vec_env = make_vec_env(lambda:Environment2(register_config, **kwargs), n_envs=1)

obs = vec_env.reset()
n_steps = 17
for step in range(n_steps):
    action, _ = model.predict(obs, deterministic= True)
    print(f"Step {step + 1}")
    print("Action: ", [action[0][i] for i in range(3)])
    obs, reward, done, info = vec_env.step(action)
    #print("obs=", obs, "reward=", reward, "done=", done)
    print("reward=", reward,"fidelity=",info[0]["Fidelity"], "done=", done)
    #vec_env.render()
    if done:
        # Note that the VecEnv resets automatically
        # when a done signal is encountered
        if reward > 0.99:
            print("Goal reached", "Fidelity=", info[0])
        else:
            print(" Max number of layers", "Fidelity=", info)
        break

Step 1
Action:  [np.float32(-0.00935421), np.float32(-0.020883545), np.float32(-0.0066122487)]
reward= [0.] fidelity= 0.0005567095795421482 done= [False]
Step 2
Action:  [np.float32(-0.007822891), np.float32(-0.0066960733), np.float32(0.011714209)]
reward= [0.] fidelity= 0.2300443675468704 done= [False]
Step 3
Action:  [np.float32(-0.007455373), np.float32(-0.013428304), np.float32(-0.0022534973)]
reward= [0.] fidelity= 0.27681449731612284 done= [False]
Step 4
Action:  [np.float32(0.003688519), np.float32(-0.0028641312), np.float32(0.009643178)]
reward= [0.6000857] fidelity= 0.45123537254091994 done= [ True]
 Max number of layers Fidelity= [{'Fidelity': np.float64(0.45123537254091994), 'episode': {'r': 0.600086, 'l': 4, 't': 0.031826}, 'TimeLimit.truncated': False, 'terminal_observation': array([ 9.24476903e-01,  2.61246171e-01,  2.78832107e-02, -1.13344043e-02,
        2.61246171e-01,  7.40681101e-02,  7.98507638e-03, -2.86109335e-03,
        2.78832107e-02,  7.98507638e-03,  9.523362

## Open Quantum Systems

A good source is Hashim 2024 (https://arxiv.org/abs/2408.12064)

In [None]:
def calc_process_fidelity(U, U_target):
    """ Calculates the process fidelity in Eq. (237). """
    
    dim = U.shape[0]
    PTM_U = calc_pauli_transfer_matrix([U])
    PTM_U_target = calc_pauli_transfer_matrix([U_target])
    return (PTM_U * PTM_U_target.inv()).tr() / dim**2

# Martinez2020 (10.1109/ACCESS.2020.3025619): amplitude-phase damping (APD) superoperator channel

# gamma = 1-np.exp(-t/T1) # amplitude damping channel, eq.(8)
# lam = 1-np.exp(t/T1-2*t/T2)  # dephasing channel, eq.(13)

# eq. (15)
# E0 = ((1+np.sqrt(1-gamma-(1-gamma)*lam))/2)*Id+((1-np.sqrt(1-gamma-(1-gamma)*lam))/2)*Z # eq.()
# E1 = (np.sqrt(gamma)/2)*X+ 1j*(np.sqrt(gamma)/2)*Y
# E2 = (np.sqrt((1-gamma)*lam)/2)*Id-(np.sqrt((1-gamma)*lam)/2)*Z

# Schlimgen2022 (10.1103/PhysRevResearch.4.023216): eq.(3)