### Evolution Strategies

$\alpha$ is the learning rate, $\sigma$ is the noise standard deviation, and $\theta_0$ are the initial policy parameters.
- Sample a batch of noise vectors $\epsilon_i \sim \mathcal{N}(0, I)$, for $i = 1, \dots, n$
- Compute Returns $R_i = R(\theta_t + \sigma \epsilon_i)$
- Update the weights as : $\theta_{t+1} = \theta_t + \alpha \frac{1}{n \sigma} \sum_{i=1}^n R_i \epsilon_i$

In [4]:
import gym
import time
import numpy as np

import torch
import torch.nn as nn
import matplotlib.pyplot as plt
from tensorboardX import SummaryWriter

In [2]:
# Upper bound on the number of mirrored noise pairs sampled per training batch.
MAX_BATCH_EPISODES = 100
# Soft cap on environment steps per batch; sampling stops once it is exceeded.
MAX_BATCH_STEPS = 10000
# Noise standard deviation (sigma) that scales every parameter perturbation.
NOISE_STD = 0.01
# Step size (alpha) applied to the estimated parameter update.
LEARNING_RATE = 0.001

class Net(nn.Module):
    """Two-layer perceptron policy that maps observations to action probabilities.

    The network is a single hidden layer of 32 ReLU units followed by a
    softmax over the action dimension (dim 1), so inputs are expected to be
    batched as ``(batch, obs_size)``.
    """

    def __init__(self, obs_size, action_size):
        """Build the policy.

        Args:
            obs_size: Number of features in one observation.
            action_size: Number of discrete actions.
        """
        super().__init__()
        hidden_size = 32
        layers = [
            nn.Linear(obs_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, action_size),
            nn.Softmax(dim=1),
        ]
        # Keep the attribute name and layer order stable: they define the
        # state_dict keys ("net.0.weight", ...).
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return per-action probabilities for a batch of observations ``x``."""
        policy = self.net
        return policy(x)

In [7]:
def evaluate(env, net):
    """Play one episode with the greedy policy and report how it went.

    At every step the action with the highest probability from ``net`` is
    taken. The episode ends as soon as the environment reports either
    termination or truncation.

    Args:
        env: Environment whose ``reset()`` returns ``(obs, info)`` and whose
            ``step(action)`` returns
            ``(obs, reward, terminated, truncated, info)``.
        net: Callable mapping a ``(1, obs_size)`` float tensor to a
            ``(1, n_actions)`` tensor of action scores.

    Returns:
        Tuple ``(reward, steps)``: the summed reward and the number of steps
        taken in the episode.
    """
    obs, _ = env.reset()
    reward = 0.0
    steps = 0
    # Pure inference: no gradients are ever used by evolution strategies, so
    # skip autograd bookkeeping on every forward pass.
    with torch.no_grad():
        while True:
            # Convert through a single float32 ndarray; building a tensor from
            # a list of ndarrays is slow and triggers a UserWarning.
            obs_v = torch.as_tensor(np.asarray(obs, dtype=np.float32)).unsqueeze(0)
            act_prob = net(obs_v)
            action = act_prob.argmax(dim=1).item()
            obs, r, terminated, truncated, _ = env.step(action)
            reward += r
            steps += 1
            if terminated or truncated:
                break
    return reward, steps

def sample_noise(net):
    """Draw Gaussian noise for every parameter, together with its negation.

    One standard-normal float32 tensor is sampled (via ``np.random``) per
    parameter tensor of ``net``, in ``net.parameters()`` order. The negated
    copies are returned alongside for mirrored sampling.

    Returns:
        Tuple ``(pos, neg)`` of two lists of tensors, where ``neg[i] == -pos[i]``.
    """
    pos = [
        torch.from_numpy(
            np.random.normal(size=tuple(param.shape)).astype(np.float32)
        )
        for param in net.parameters()
    ]
    neg = [-eps for eps in pos]
    return pos, neg

def eval_with_noise(env, net, noise):
    r"""Compute the return $R_i = R(\theta_t + \sigma \epsilon_i)$ of a perturbed policy.

    The parameters of ``net`` are shifted in place by ``NOISE_STD * noise``,
    one episode is played with ``evaluate``, and the original parameters are
    restored before returning (also when the evaluation raises).

    Args:
        env: Environment passed through to ``evaluate``.
        net: Policy network whose parameters are temporarily perturbed.
        noise: Iterable of tensors, one per parameter of ``net`` and in
            ``net.parameters()`` order.

    Returns:
        Tuple ``(reward, steps)`` as returned by ``evaluate``.
    """
    # state_dict() returns references to the live parameter tensors, not
    # copies. Without cloning, the in-place perturbation below would modify
    # the "backup" too and load_state_dict() would restore nothing.
    old_params = {
        name: tensor.detach().clone()
        for name, tensor in net.state_dict().items()
    }
    try:
        for p, p_n in zip(net.parameters(), noise):
            p.data += NOISE_STD * p_n
        return evaluate(env, net)
    finally:
        net.load_state_dict(old_params)

def train_step(net, batch_noise, batch_reward, writer, step_idx):
    """Apply one evolution-strategies update to ``net`` in place.

    Rewards are standardised (zero mean and, unless the spread is below 1e-6,
    unit standard deviation). Each noise sample is then weighted by its
    standardised reward and summed, and the parameters are moved by
    ``LEARNING_RATE * sum / (n * NOISE_STD)`` where ``n = len(batch_reward)``.

    Args:
        net: Policy network to update.
        batch_noise: Sequence of noise samples; each sample is a sequence of
            tensors aligned with ``net.parameters()``.
        batch_reward: Sequence of episode returns, aligned with ``batch_noise``.
        writer: Object with ``add_scalar(tag, value, step)``; receives the
            mean L2 norm of the per-parameter updates under ``"update_l2"``.
        step_idx: Global step passed to ``writer``.

    Raises:
        ValueError: If the batch contains no samples.
    """
    if len(batch_reward) == 0 or len(batch_noise) == 0:
        raise ValueError("train_step needs at least one (noise, reward) sample")

    # Force a float dtype so the in-place normalisation below also works when
    # the caller hands in integer rewards.
    norm_reward = np.array(batch_reward, dtype=np.float64)
    norm_reward -= norm_reward.mean()
    s = norm_reward.std()
    if s > 1e-6:
        norm_reward /= s

    # Start from zeros shaped like the first sample instead of a lazily
    # initialised None accumulator.
    weighted_noise = [torch.zeros_like(p_n) for p_n in batch_noise[0]]
    for noise, reward in zip(batch_noise, norm_reward):
        weight = float(reward)
        for w_n, p_n in zip(weighted_noise, noise):
            w_n += weight * p_n

    scale = len(batch_reward) * NOISE_STD
    m_updates = []
    for p, p_update in zip(net.parameters(), weighted_noise):
        update = p_update / scale
        p.data += LEARNING_RATE * update
        # Store plain floats so numpy never has to coerce a list of tensors.
        m_updates.append(torch.norm(update).item())
    writer.add_scalar("update_l2", np.mean(m_updates), step_idx)

In [8]:
# Train a CartPole policy with evolution strategies until the mean batch
# reward exceeds 199, logging progress to TensorBoard and stdout.
writer = SummaryWriter(comment="-cartpole-es")
env = gym.make("CartPole-v0")

net = Net(env.observation_space.shape[0], env.action_space.n)
print(net)

step_idx = 0
try:
    while True:
        t_start = time.time()
        batch_noise = []
        batch_reward = []
        batch_steps = 0
        for _ in range(MAX_BATCH_EPISODES):
            # Mirrored sampling: evaluate each perturbation and its negation,
            # keeping noise and reward lists aligned index by index.
            for sample in sample_noise(net):
                batch_noise.append(sample)
                reward, steps = eval_with_noise(env, net, sample)
                batch_reward.append(reward)
                batch_steps += steps
            if batch_steps > MAX_BATCH_STEPS:
                break

        step_idx += 1
        m_reward = np.mean(batch_reward)
        if m_reward > 199:
            print("Solved in %d steps" % step_idx)
            break

        train_step(net, batch_noise, batch_reward, writer, step_idx)
        writer.add_scalar("reward_mean", m_reward, step_idx)
        writer.add_scalar("reward_std", np.std(batch_reward), step_idx)
        writer.add_scalar("reward_max", np.max(batch_reward), step_idx)
        writer.add_scalar("batch_episodes", len(batch_reward), step_idx)
        writer.add_scalar("batch_steps", batch_steps, step_idx)
        speed = batch_steps / (time.time() - t_start)
        writer.add_scalar("speed", speed, step_idx)
        print("%d: reward=%.2f, speed=%.2f f/s" % (step_idx, m_reward, speed))
finally:
    # The loop has no iteration limit, so a stalled run is stopped by hand.
    # Close the writer and the environment in that case as well.
    writer.close()
    env.close()

  logger.warn(
  if not isinstance(terminated, (bool, np.bool8)):


Net(
  (net): Sequential(
    (0): Linear(in_features=4, out_features=32, bias=True)
    (1): ReLU()
    (2): Linear(in_features=32, out_features=2, bias=True)
    (3): Softmax(dim=1)
  )
)
1: reward=9.34, speed=15046.72 f/s
2: reward=9.26, speed=17307.45 f/s
3: reward=9.36, speed=16948.38 f/s
4: reward=9.40, speed=17628.25 f/s
5: reward=9.43, speed=17670.83 f/s
6: reward=9.35, speed=17564.64 f/s
7: reward=9.23, speed=17755.73 f/s
8: reward=9.43, speed=17852.25 f/s
9: reward=9.39, speed=17672.25 f/s
10: reward=9.28, speed=17613.80 f/s
11: reward=9.27, speed=17647.84 f/s
12: reward=9.43, speed=17771.32 f/s
13: reward=9.35, speed=17960.27 f/s
14: reward=9.43, speed=18146.71 f/s
15: reward=9.32, speed=18026.71 f/s
16: reward=9.31, speed=17982.86 f/s
17: reward=9.29, speed=17906.90 f/s
18: reward=9.23, speed=17980.61 f/s
19: reward=9.32, speed=17603.97 f/s
20: reward=9.38, speed=17594.07 f/s
21: reward=9.36, speed=17988.44 f/s
22: reward=9.38, speed=17868.69 f/s
23: reward=9.30, speed=1792