In [1]:
import torch
import torch.nn as nn
from torch.distributions.categorical import Categorical

In [2]:
def mlp(layer_sizes, activation=nn.Tanh, output_activation=nn.Identity):
    """Build a fully connected network as an ``nn.Sequential``.

    Every consecutive pair in ``layer_sizes`` becomes one ``nn.Linear`` layer
    followed by an activation module.

    Args:
        layer_sizes: sequence of layer widths, input width first and output
            width last.
        activation: module class instantiated after every layer except the last.
        output_activation: module class instantiated after the last layer.

    Returns:
        ``nn.Sequential`` alternating ``nn.Linear`` and activation modules
        (empty when fewer than two sizes are given).
    """
    final_idx = len(layer_sizes) - 2  # index of the last Linear layer
    modules = []

    for idx, (n_in, n_out) in enumerate(zip(layer_sizes[:-1], layer_sizes[1:])):
        nonlinearity = output_activation if idx >= final_idx else activation
        modules.append(nn.Linear(n_in, n_out))
        modules.append(nonlinearity())

    return nn.Sequential(*modules)

In [3]:
# Quick check: build a small network and let the notebook display its layer layout.
mlp([10, 32, 2])

Sequential(
  (0): Linear(in_features=10, out_features=32, bias=True)
  (1): Tanh()
  (2): Linear(in_features=32, out_features=2, bias=True)
  (3): Identity()
)

In [4]:
import gym
import numpy as np

import matplotlib.pyplot as plt
from IPython import display
%matplotlib inline

In [5]:
# Id of the Gym environment to train on.
gym_env = "CartPole-v0"

env = gym.make(gym_env)

# The policy network takes an observation vector as input and produces one
# logit per action, so only Box observation spaces and Discrete action spaces
# are supported.
assert isinstance(env.observation_space, gym.spaces.Box), "This example only works for environments with continuous state spaces"
assert isinstance(env.action_space, gym.spaces.Discrete), "This example only works for environments with discrete action spaces"



In [6]:
# Size of the first axis of the observation space; used as the network input width.
obs_dim = env.observation_space.shape[0]
# Number of discrete actions; used as the network output width.
act_dim = env.action_space.n
# Show both values as the cell output.
obs_dim, act_dim

(4, 2)

In [7]:
# Widths of the hidden layers of the policy network (a single hidden layer here).
HIDDEN_SIZES = [32]

# Policy network: maps an observation to one logit per discrete action.
# get_policy reads this module-level name directly.
logits_net = mlp([obs_dim]+HIDDEN_SIZES+[act_dim])
logits_net

Sequential(
  (0): Linear(in_features=4, out_features=32, bias=True)
  (1): Tanh()
  (2): Linear(in_features=32, out_features=2, bias=True)
  (3): Identity()
)

In [8]:
def get_policy(obs):
    """Return the action distribution of the current policy for ``obs``.

    ``obs`` is passed through the module-level ``logits_net`` and the resulting
    logits parameterize a ``Categorical`` distribution over actions.
    """
    return Categorical(logits=logits_net(obs))

In [9]:
def get_action(obs):
    """Sample an action for ``obs`` from the current policy.

    The sample is converted with ``.item()``, so ``obs`` must yield a
    single-element sample (i.e. one observation, not a batch).
    """
    action_dist = get_policy(obs)
    sampled = action_dist.sample()
    return sampled.item()

In [10]:
def compute_loss(obs, acts, weights):
    """Return the weighted negative log-likelihood of the taken actions.

    Args:
        obs: observations fed to the policy.
        acts: actions that were taken for those observations.
        weights: per-step weights multiplied with the log-probabilities.

    Returns:
        Scalar tensor ``-(log_prob(acts) * weights).mean()``.
    """
    log_probs = get_policy(obs).log_prob(acts)
    weighted = log_probs * weights
    return -weighted.mean()

In [11]:
# Sanity-check the policy on a batch of random observations.
obs = torch.randn(100, obs_dim) # batch, obs_dim
policy = get_policy(obs) # Categorical Policy - a distribution is returned

print("Policy:", policy)
print(policy.sample())
# Draw random actions from the full action range (upper bound is exclusive)
# instead of hard-coding the CartPole action count.
acts = torch.randint(0, act_dim, [100])
policy.log_prob(acts)

Policy: Categorical(logits: torch.Size([100, 2]))
tensor([0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0,
        1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0,
        0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0,
        0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0,
        0, 1, 1, 1])


tensor([-0.8048, -0.7958, -0.6597, -0.6978, -0.6389, -0.5997, -0.4492, -0.4949,
        -0.6686, -1.0140, -0.8576, -0.7843, -0.5691, -0.6071, -0.5967, -0.6030,
        -0.5098, -0.9116, -0.6056, -0.6143, -0.4514, -0.5895, -0.6495, -0.8055,
        -0.5390, -0.9810, -0.9446, -0.4827, -0.6688, -0.7687, -0.4493, -0.8314,
        -0.7401, -0.8872, -0.7082, -0.7315, -0.7846, -0.5859, -0.7569, -0.6626,
        -0.4447, -0.5974, -0.8226, -0.7267, -0.8007, -0.7033, -0.9083, -0.6631,
        -0.7833, -0.5968, -0.7593, -0.6720, -0.7442, -0.8379, -0.5073, -0.6141,
        -0.8344, -0.9061, -0.7730, -0.6391, -0.5788, -0.4734, -0.7579, -0.4852,
        -0.5709, -0.6351, -0.7606, -0.6004, -0.5593, -0.4578, -0.7238, -0.7585,
        -0.6900, -0.4528, -0.6389, -0.7694, -0.5779, -0.8883, -0.5350, -0.7532,
        -0.9592, -0.9247, -0.6001, -0.6218, -0.7731, -0.6907, -0.6701, -0.8141,
        -0.5559, -0.6480, -0.6216, -0.5374, -0.7134, -0.7037, -0.4731, -0.8225,
        -0.6979, -1.0025, -0.5567, -0.94

In [12]:
def reward_to_go(ep_rews, gamma=1.0):
    """Compute the (optionally discounted) reward-to-go for every step of an episode.

    ``rtgs[i] = ep_rews[i] + gamma * rtgs[i + 1]``, with the value after the
    last step taken as 0. With the default ``gamma=1.0`` this is the plain sum
    of the rewards from step ``i`` to the end of the episode.

    Args:
        ep_rews: sequence of per-step rewards of one episode (may be empty).
        gamma: discount factor applied to future rewards.

    Returns:
        1-D float64 ``np.ndarray`` with the same length as ``ep_rews``.
    """
    # Always accumulate in float so integer rewards are not truncated when
    # combined with a fractional discount factor.
    rews = np.asarray(ep_rews, dtype=np.float64)
    rtgs = np.zeros_like(rews)

    running = 0.0  # reward-to-go of the step after the current one
    for i in reversed(range(len(rews))):
        running = rews[i] + gamma * running
        rtgs[i] = running
    return rtgs

In [13]:
def train_one_epoch(env, optimizer, batch_size, render=True):
    """Collect one batch of experience with the current policy and do one update.

    Whole episodes are played until more than ``batch_size`` environment steps
    have been collected (the check happens only at episode ends, so the batch
    always consists of complete episodes). Every step is weighted by its
    reward-to-go, and a single optimizer step is taken on ``compute_loss``.

    Args:
        env: environment using the classic Gym API as called here:
            ``reset()`` returns an observation and ``step(act)`` returns
            ``(obs, rew, done, info)``.
        optimizer: optimizer on which ``zero_grad()`` and ``step()`` are called once.
        batch_size: number of collected steps that must be exceeded before
            the epoch ends.
        render: if True, ``env.render()`` is called for every step of the
            first episode of the epoch.

    Returns:
        Tuple ``(batch_loss, batch_rets, batch_lens)``: the loss value as a
        Python float, the list of episode returns and the list of episode lengths.
    """
    batch_obs = []      # observations of every collected step
    batch_acts = []     # actions taken at those steps
    batch_weights = []  # reward-to-go weight of every step
    batch_rets = []     # total return of every finished episode
    batch_lens = []     # length of every finished episode

    # get initial observation from starting distribution
    obs = env.reset()
    ep_rews = []

    # Render only the first episode of the epoch
    finish_rendering_this_epoch = False

    while True:
        if (not finish_rendering_this_epoch) and render:
            env.render()
            # Alternative for rendering inside the notebook:
            # plt.imshow(env.render(mode="rgb_array"))
            # display.display(plt.gcf())
            # display.clear_output(wait=True)

        # save the current observation
        batch_obs.append(obs.copy())

        # Sampling an action needs no gradients; the log-probabilities are
        # recomputed with gradients in compute_loss below.
        with torch.no_grad():
            act = get_action(torch.as_tensor(obs, dtype=torch.float32))
        obs, rew, done, _ = env.step(act)

        # save the action and reward
        ep_rews.append(rew)
        batch_acts.append(act)

        if done:
            # record info about the episode
            batch_rets.append(sum(ep_rews))
            batch_lens.append(len(ep_rews))

            # Reward-to-go policy gradient: each step is weighted only by the
            # rewards obtained from that step onwards (the plain variant would
            # use the full episode return for every step instead).
            batch_weights += list(reward_to_go(ep_rews))

            # reset the environment for the next episode
            obs = env.reset()
            ep_rews = []

            # won't render again after first episode in the epoch
            finish_rendering_this_epoch = True

            # end experience loop if we have enough of it
            if len(batch_obs) > batch_size:
                break

    # perform a single update step
    optimizer.zero_grad()
    batch_loss = compute_loss(
        # Stack into one ndarray first: converting a list of ndarrays directly is slow.
        obs=torch.as_tensor(np.array(batch_obs), dtype=torch.float32),
        # Actions are category indices, so pass them as integers.
        acts=torch.as_tensor(batch_acts, dtype=torch.int64),
        weights=torch.as_tensor(batch_weights, dtype=torch.float32),
    )
    batch_loss.backward()
    optimizer.step()
    # NOTE(review): the env is closed here although the training loop passes the
    # same env again next epoch; this relies on the env staying usable after close().
    env.close()

    return batch_loss.item(), batch_rets, batch_lens
        

In [14]:
# Learning rate passed to Adam.
# NOTE(review): this is a fairly large step size for Adam — confirm it is intentional.
LR = 0.1

# Adam updates only the parameters of the policy network `logits_net`.
optimizer = torch.optim.Adam(logits_net.parameters(), lr=LR)

In [15]:
# Number of environment steps that must be exceeded per epoch; train_one_epoch
# only checks this at episode ends, so a batch can be somewhat larger.
BATCH_SIZES = 5_000
# Number of epochs; each epoch performs exactly one optimizer step.
EPOCHS = 50

# Training Loop
# render=True shows the first episode of every epoch.
for epoch in range(EPOCHS):
    batch_loss, batch_rets, batch_lens = train_one_epoch(env, optimizer, BATCH_SIZES, render=True)
    print(f"Epoch [{epoch+1}/{EPOCHS}], Loss: {batch_loss:.2f} Average Return: {np.mean(batch_rets):.1f} Average Steps: {np.mean(batch_lens):.1f}")

Epoch [1/50], Loss: 9.43 Average Return: 20.5 Average Steps: 20.5
Epoch [2/50], Loss: 10.75 Average Return: 26.0 Average Steps: 26.0
Epoch [3/50], Loss: 18.53 Average Return: 51.9 Average Steps: 51.9
Epoch [4/50], Loss: 19.67 Average Return: 65.6 Average Steps: 65.6
Epoch [5/50], Loss: 18.45 Average Return: 71.3 Average Steps: 71.3
Epoch [6/50], Loss: 19.12 Average Return: 79.5 Average Steps: 79.5
Epoch [7/50], Loss: 19.66 Average Return: 94.9 Average Steps: 94.9
Epoch [8/50], Loss: 23.87 Average Return: 127.5 Average Steps: 127.5
Epoch [9/50], Loss: 30.05 Average Return: 178.1 Average Steps: 178.1
Epoch [10/50], Loss: 28.96 Average Return: 186.4 Average Steps: 186.4
Epoch [11/50], Loss: 25.63 Average Return: 177.1 Average Steps: 177.1
Epoch [12/50], Loss: 25.16 Average Return: 182.9 Average Steps: 182.9
Epoch [13/50], Loss: 25.23 Average Return: 195.1 Average Steps: 195.1
Epoch [14/50], Loss: 24.83 Average Return: 199.2 Average Steps: 199.2
Epoch [15/50], Loss: 22.97 Average Return: 2

KeyboardInterrupt: 

In [None]:
# NOTE: get_policy, get_action and compute_loss read the module-level
# `logits_net`; they have to take the network as a parameter before
# `logits_net2` can actually be trained.

env2 = gym.make("LunarLander-v2")

# Use separate names so the CartPole dimensions (`obs_dim`, `act_dim`) that the
# earlier cells depend on are not overwritten when this cell is run.
obs_dim2 = env2.observation_space.shape[0]
act_dim2 = env2.action_space.n
HIDDEN_SIZES2 = [32]

logits_net2 = mlp([obs_dim2] + HIDDEN_SIZES2 + [act_dim2])

# Training Loop (disabled): `optimizer` is bound to the parameters of
# `logits_net`, so an optimizer over `logits_net2.parameters()` is needed as
# well before this loop is enabled.
# for epoch in range(EPOCHS):
#     batch_loss, batch_rets, batch_lens = train_one_epoch(env2, optimizer, BATCH_SIZES, render=True)
#     print(f"Epoch [{epoch+1}/{EPOCHS}], Loss: {batch_loss:.2f} Average Return: {np.mean(batch_rets):.1f} Average Steps: {np.mean(batch_lens):.1f}")