In [1]:
!pip install gym



In [2]:
import torch
from torch import nn
from torch.distributions.categorical import Categorical
from torch import optim

import numpy as np
import gym
from gym.spaces import Discrete, Box

In [3]:
print(f"torch version: {torch.__version__}")
print(f"gym version: {gym.__version__}")

torch version: 1.7.0
gym version: 0.18.0


## Define Model

In [28]:
def mlp(sizes, hidden_act_fn=nn.Tanh, output_act_fn=nn.Identity):
    '''
    sizes is list of integers specifying the number of nodes
        in each layer of the network, including input and output layers.
    Returns a torch.nn.Sequential object.
    '''
    assert isinstance(sizes, list)
    layers = []
    num_gaps = len(sizes) - 1
    for i in range(num_gaps):
        act_fn = hidden_act_fn if i < num_gaps-1 else output_act_fn
        layers.extend([nn.Linear(sizes[i], sizes[i+1]), act_fn()])
    return nn.Sequential(*layers)

In [29]:
net = mlp([4,32,2])
net

Sequential(
  (0): Linear(in_features=4, out_features=32, bias=True)
  (1): Tanh()
  (2): Linear(in_features=32, out_features=2, bias=True)
  (3): Identity()
)

## Cumulative Future Discounted Rewards helper function

In [25]:
def accumulate_discount(trajec_rewards, gamma=0.99):
    '''
    trajec_rewards must be a list of scalar reward values for each step.
    Returns a list of reverse-accumlated, discounted rewards, where each value
        represents the cumulative discounted rewards from that step onwards up to the end of the trajectory.
    '''
    assert isinstance(trajec_rewards, list)
    trajec_len = len(trajec_rewards)
    cum_disc_rewards = [None for i in range(trajec_len)]
    for step in reversed(range(trajec_len)):
        cum_disc_rewards[step] = trajec_rewards[step] + gamma * (cum_disc_rewards[step+1] if step+1 < trajec_len else 0)
    return cum_disc_rewards

In [26]:
ep_rewards = np.random.randint(-2, 6, size=(12,)).tolist()
ep_rewards

[5, 3, 0, -1, -1, 2, 0, -2, -1, 3, 5, 1]

In [27]:
accumulate_discount(ep_rewards)

[13.312010071311759,
 8.395969769001777,
 5.45047451414321,
 5.505529812265868,
 6.571242234611988,
 7.647719428900999,
 5.704767099899999,
 5.762391009999999,
 7.840798999999999,
 8.9301,
 5.99,
 1.0]

## Training Loop

Test action sampling:

In [30]:
state = np.random.randn(4)
state

array([2.30437415, 0.7246936 , 1.55096719, 0.12279306])

In [31]:
activations = net(torch.tensor(state, d)
activations

AttributeError: 'numpy.ndarray' object has no attribute 'dim'

In [None]:
def train(env_name='CartPole-v0', 
          hidden_sizes=[32], 
          lr=1e-2, 
          num_epochs=50, 
          step_batch_size=5000, 
          render=True
         ):
    
    env = gym.make(env_name)
    assert isinstance(env.observation_space, Box), \
        "This example only works for envs with continuous state spaces."
    assert isinstance(env.action_space, Discrete), \
        "This example only works for envs with discrete action spaces."
    
    obs_dim = env.observation_space.shape[0]
    n_acts = env.action_space.n
    
    net = mlp(sizes=[obs_dim]+hidden_sizes+[n_acts])
    
    optimizer = optim.Adam(net.parameters(), lr=lr)
    
    for epoch in range(1, num_epochs+1):
        
        # Epoch-specific variables, resets each epoch
        batch_states = []      # State at each step, shape is (num steps over all episodes this epoch ie. >= step_batch_size, obs_dim)
        batch_acts = []        # Action at each step, shape is (num steps over all episodes this epoch, n_acts)
        batch_weights = []     # Cumulative future discounted reward at each step, shape is (num steps over all episodes this epoch)
        batch_ep_rets = []     # Returns for each episode in epoch, shape is (num episodes this epoch)
        batch_ep_lens = []     # Lengths (number of steps) of each episode in epoch, shape is (num episodes this epoch)
        
        # Episode-specific variables, resets each episode
        cur_state = env.reset()
        done = False
        ep_rewards = []
        render_episode = True
        
        while True:
            
            activations = net(torch.tensor(cur_state, dtype=float32))
            action = Categorical(logits=activations).sample().item()
            
            if render_episode and render:
                env.render()
            
            if done:
                
                if len(batch_obs) >= step_batch_size:
                    '''
                    We are only allowed to break at the end of an episode.
                    If at the end of this episode we finally have enough steps,
                        then we take this opportunity to break and call it an epoch.
                    '''
                    break