In [1]:
import cheetah
from gym import spaces
import numpy as np
import torch
from torch import distributions
from torch import nn
from torch import optim

from environments import ARESlatticeStage3v1_9 as lattice
from environments import utils
from environments.absolute import ARESEAAbsolute

initializing ocelot...


## Create model

In [2]:
obs_dim = 13
act_dim = 5

class GaussianActor(nn.Module):

    def __init__(self, obs_dim, act_dim):
        super().__init__()

        self.log_std = nn.Parameter(0.5 * torch.ones(act_dim))
        self.mu_net = nn.Sequential(
            nn.Linear(obs_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, act_dim)
        )
    
    def forward(self, observation, action=None):
        mu = self.mu_net(observation)
        std = torch.exp(self.log_std)
        pi = distributions.Normal(mu, std)

        if action is None:
            return pi
        else:
            log_probs = pi.log_prob(action).sum(axis=-1)
            return pi, log_probs

policy = GaussianActor(obs_dim, act_dim)
optimizer = optim.Adam(policy.parameters())

## Create simulation

In [3]:
screen_resolution = (2448, 2040)
pixel_size = (3.3198e-6, 2.4469e-6)

cell = utils.subcell_of(lattice.cell, "AREASOLA1", "AREABSCR1")

segment = cheetah.Segment.from_ocelot(cell)
segment.AREABSCR1.resolution = screen_resolution
segment.AREABSCR1.pixel_size = pixel_size
segment.AREABSCR1.is_active = True

segment.AREABSCR1.binning = 4



In [4]:
actuator_space = spaces.Box(
    low=np.array([-30, -30, -30, -3e-3, -6e-3], dtype=np.float32),
    high=np.array([30, 30, 30, 3e-3, 6e-3], dtype=np.float32)
)
goal_space = spaces.Box(
    low=np.array([-2e-3, -2e-3, 0, 0], dtype=np.float32),
    high=np.array([2e-3, 2e-3, 5e-4, 5e-4], dtype=np.float32)
)

## Gather experience

In [5]:
batch_size = 4

In [6]:
def track(actuators, incoming):
    segment.AREAMQZM1.k1, segment.AREAMQZM2.k1, segment.AREAMQZM3.k1 = actuators[:3]
    segment.AREAMCVM1.angle, segment.AREAMCHM1.angle = actuators[3:]

    outgoing = segment(incoming)
    
    return np.array([
        segment.AREABSCR1.read_beam.mu_x,
        segment.AREABSCR1.read_beam.mu_y,
        segment.AREABSCR1.read_beam.sigma_x,
        segment.AREABSCR1.read_beam.sigma_y
    ])

In [7]:
incoming = [cheetah.Beam.make_random(
    n=int(1e5),
    mu_x=np.random.uniform(-3e-3, 3e-3),
    mu_y=np.random.uniform(-3e-4, 3e-4),
    mu_xp=np.random.uniform(-1e-4, 1e-4),
    mu_yp=np.random.uniform(-1e-4, 1e-4),
    sigma_x=np.random.uniform(0, 2e-3),
    sigma_y=np.random.uniform(0, 2e-3),
    sigma_xp=np.random.uniform(0, 1e-4),
    sigma_yp=np.random.uniform(0, 1e-4),
    sigma_s=np.random.uniform(0, 2e-3),
    sigma_p=np.random.uniform(0, 5e-3),
    energy=np.random.uniform(80e6, 160e6)
) for _ in range(batch_size)]
initial_actuators = [actuator_space.sample() for _ in range(batch_size)]
desired = [goal_space.sample() for _ in range(batch_size)]
achieved = [track(actuators=x, incoming=y) for x, y in zip(initial_actuators, incoming)]

actuators_normalized = [a / actuator_space.high for a in initial_actuators]
desired_normalized = [d / goal_space.high for d in desired]
achieved_normalized = [a / goal_space.high for a in achieved]

observations = [np.concatenate([act, des, ach]) for act, des, ach in zip(actuators_normalized, desired_normalized, achieved_normalized)]
observations = torch.tensor(observations, dtype=torch.float32)

In [8]:
actuators_normalized = policy(observations).sample()
actuators = actuators_normalized.detach().numpy() * actuator_space.high
actuators

array([[ 2.8479309e+01, -3.4326981e+01,  2.8535709e+01, -2.5276872e-03,
        -3.0465245e-03],
       [-9.6333328e+01, -3.2546627e+01,  8.5156364e+00,  2.9103868e-03,
         1.9680706e-03],
       [-9.2171669e+01, -4.0584469e+01,  2.7397198e+01,  4.8694834e-03,
        -8.1009697e-03],
       [ 4.2317543e+01, -6.7360062e+01, -4.8868019e+01, -3.5976479e-04,
        -2.5459831e-03]], dtype=float32)

In [9]:
new_achieved = [track(a, i) for a, i in zip(actuators, incoming)]

In [10]:
new_achieved

[array([-0.00189775, -0.00600762,  0.00181811,  0.00821099]),
 array([-0.12700735,  0.00299701,  0.04130615,  0.00225607]),
 array([0.01800249, 0.01066626, 0.00688471, 0.01080656]),
 array([ 0.21646318, -0.00201749,  0.03719341,  0.0121554 ])]

In [11]:
def objective_fn(achieved, desired):
    offset = achieved - desired
    weights = np.array([1, 1, 2, 2])

    return np.log((weights * np.abs(offset)).sum())

In [12]:
objectives = torch.tensor([objective_fn(a, d) for a, d in zip(new_achieved, desired)], dtype=torch.float32)
objectives

tensor([-3.5180, -1.5374, -2.7500, -1.1577])

## Update model

In [13]:
policy_distributions = policy(observations)
log_probs = policy_distributions.log_prob(actuators_normalized).sum(axis=-1)
log_probs

tensor([-7.5909, -9.0304, -8.9039, -9.9233], grad_fn=<SumBackward1>)

In [14]:
mean = objectives.mean()
std = objectives.std().clamp_min(1e-12)
normalized_objectives = (objectives - mean) / std
normalized_objectives

tensor([-1.1727,  0.6458, -0.4676,  0.9945])

In [15]:
loss = (log_probs * normalized_objectives).mean()
loss

tensor(-0.6588, grad_fn=<MeanBackward0>)

In [16]:
policy.zero_grad()
loss.backward()
optimizer.step()

In [4]:
def rollout(env, policy, batch_size):
    observations = torch.zeros(batch_size, env.observation_space.shape[0])
    actions = torch.zeros(batch_size, env.action_space.shape[0])
    objectives = torch.zeros(batch_size)

    for i in range(batch_size):
        observation = env.reset()
        observation = torch.tensor(observation, dtype=torch.float32)
        observations[i] = observation

        observation = torch.unsqueeze(observation, 0)
        action = policy(observation)
        action = torch.squeeze(action)
        actions[i] = action
        
        action = action.detach().numpy()
        _, objective, _, _ = env.step(action)
        objectives[i] = objective
    
    return observations, actions, objectives

In [5]:
n_steps = 1000
batch_size = 64

for i in range(n_steps):
    observations, actions, objectives = rollout(env, policy, batch_size)

    loss = objectives.mean()
    
    policy.zero_grad()
    loss.backward()
    optimizer.step()

    print(f"Finished step {i}")

RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn

In [None]:
class GaussianActor(nn.Module):

    def __init__(self, obs_dim, act_dim):
        super().__init__()

        self.log_std = nn.Parameter(0.5 * torch.ones(act_dim))
        self.mu_net = nn.Sequential(
            nn.Linear(obs_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, act_dim)
        )
    
    def forward(self, observation, action=None):
        mu = self.mu_net(observation)
        std = torch.exp(self.log_std)
        pi = distributions.Normal(mu, std)

        if action is None:
            return pi
        else:
            log_probs = pi.log_prob(action).sum(axis=-1)
            return pi, log_probs


class OneShotPolicyGradient:

    def __init__(self, env, batch_size=64):
        self.env = env
        self.batch_size = batch_size

        self.actor = GaussianActor(env.observation_space.shape, env.action_space.shape)
        self.optimizer = optim.Adam(self.actor.parameters)
    
    def learn(self, n_steps=1000):
        step = 0
        while step < n_steps:
            
            
            

            step += self.batch_size


if __name__ == "__main__":
    env = ARESEAAbsolute()
    env = FlattenObservation(env)