In [1]:
!pip install gym



In [2]:
import torch
from torch import nn
from torch.distributions.categorical import Categorical
from torch import optim

import numpy as np
import gym
from gym.spaces import Discrete, Box

In [3]:
print(f"torch version: {torch.__version__}")
print(f"gym version: {gym.__version__}")

torch version: 1.7.0
gym version: 0.18.0


In [47]:
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

## Define Model

In [4]:
def mlp(sizes, hidden_act_fn=nn.Tanh, output_act_fn=nn.Identity):
    '''
    sizes is list of integers specifying the number of nodes
        in each layer of the network, including input and output layers.
    Returns a torch.nn.Sequential object.
    '''
    assert isinstance(sizes, list)
    layers = []
    num_gaps = len(sizes) - 1
    for i in range(num_gaps):
        act_fn = hidden_act_fn if i < num_gaps-1 else output_act_fn
        layers.extend([nn.Linear(sizes[i], sizes[i+1]), act_fn()])
    return nn.Sequential(*layers)

In [48]:
net = mlp([4,32,2]).to(device)
net

Sequential(
  (0): Linear(in_features=4, out_features=32, bias=True)
  (1): Tanh()
  (2): Linear(in_features=32, out_features=2, bias=True)
  (3): Identity()
)

## Cumulative Future Discounted Rewards helper function

In [6]:
def accumulate_discount(trajec_rewards, gamma=0.99):
    '''
    trajec_rewards must be a list of scalar reward values for each step.
    Returns a list of reverse-accumlated, discounted rewards, where each value
        represents the cumulative discounted rewards from that step onwards up to the end of the trajectory.
    '''
    assert isinstance(trajec_rewards, list)
    trajec_len = len(trajec_rewards)
    cum_disc_rewards = [None for i in range(trajec_len)]
    for step in reversed(range(trajec_len)):
        cum_disc_rewards[step] = trajec_rewards[step] + gamma * (cum_disc_rewards[step+1] if step+1 < trajec_len else 0)
    return cum_disc_rewards

In [7]:
ep_rewards = np.random.randint(-2, 6, size=(12,)).tolist()
ep_rewards

[1, -1, 4, 5, 3, 5, 2, 0, -1, 5, -2, 1]

In [8]:
accumulate_discount(ep_rewards)

[21.03300922553339,
 20.235362854074133,
 21.44986146876175,
 17.62612269571894,
 12.753659288604991,
 9.852181099601001,
 4.9011930299,
 2.9304980099999995,
 2.9600989999999996,
 4.0001,
 -1.01,
 1.0]

## Loss Function

The combination of chosen actions and weights sort of behaves like labels. This loss function is essentially just cross entropy loss (negative log likelihood loss), except the loss for each sample is weighted by the expected return at that timestep.

In [9]:
step_batch_size = 5

In [10]:
batch_states = [np.random.randn(4) for i in range(step_batch_size)]
batch_states

[array([-0.19327361, -1.47855409,  1.27444737, -0.41548711]),
 array([ 0.51988474,  1.54889377,  1.2939353 , -0.45033854]),
 array([ 0.11627388, -0.95530747, -0.72564121, -1.69475162]),
 array([ 1.98532673,  2.0308256 , -0.31792879,  0.97846826]),
 array([ 0.22967791,  1.3047228 , -0.30795531, -0.75400581])]

In [12]:
activations = net(torch.tensor(batch_states, dtype=torch.float32))
activations

tensor([[-0.3359, -0.3889],
        [ 0.0339, -0.2518],
        [ 0.2909, -0.2671],
        [ 0.0165,  0.1521],
        [ 0.3116, -0.0699]], grad_fn=<AddmmBackward>)

In [13]:
actions = np.random.randint(0, 2, size=(5,)).tolist()
actions

[0, 0, 1, 0, 1]

In [14]:
cum_disc_rewards = accumulate_discount(np.random.randint(-2, 6, size=(5,)).tolist())
cum_disc_rewards

[2.8419900199999995, 3.8807979999999995, 5.9402, 3.98, 2.0]

### Loss function using weird PyTorch objects eww.

In [15]:
def loss_func(activations, chosen_acts, weights):
    '''
    outputs must be a tensor, shape is (~step_batch_size, n_acts), dtype is torch.float32
    chosen_acts must be a tensor, shape is (~step_batch_size) dtype is torch.int32
    weights must be a tensor, shape is (~step_batch_size) dtype is torch.float32
    '''
    assert activations.dtype == torch.float32
    assert chosen_acts.dtype == torch.int32
    assert weights.dtype == torch.float32
    selected_log_probs = Categorical(logits=activations).log_prob(chosen_acts)  # Returns a batch of nll losses
        # log_prob does cross entropy loss (softmax --> take prob corres to chosen class --> log --> negative)
    return -(selected_log_probs * weights).mean()

batch_loss = loss_func(activations=torch.tensor(activations, dtype=torch.float32), 
                       chosen_acts=torch.tensor(actions, dtype=torch.int32), 
                       weights=torch.tensor(cum_disc_rewards, dtype=torch.float32)
                      )
batch_loss

  batch_loss = loss_func(activations=torch.tensor(activations, dtype=torch.float32),


tensor(2.9831)

### Loss function manually yay.

In [16]:
def loss_func_manual(activations, chosen_acts, weights):
    '''
    outputs must be a tensor, shape is (~step_batch_size, n_acts), dtype is torch.float32
    chosen_acts must be a tensor, shape is (~step_batch_size), dtype is torch.int64 because torch.gather is picky
    weights must be a tensor, shape is (~step_batch_size), dtype is torch.float32
    '''
    assert activations.dtype == torch.float32
    assert chosen_acts.dtype == torch.int64
    assert weights.dtype == torch.float32
    log_probs = nn.LogSoftmax(dim=-1)(activations) # Output has autoograd relationship to activations, for back propagation
    selected_log_probs = torch.gather(log_probs, -1, chosen_acts.unsqueeze(1)).squeeze()
    return -(selected_log_probs * weights).mean() # Negative expected return to trick optimizer into doing gradient ascent

batch_loss = loss_func_manual(activations=torch.tensor(activations, dtype=torch.float32), 
                       chosen_acts=torch.tensor(actions, dtype=torch.int64), # torch.gather only works with int64
                       weights=torch.tensor(cum_disc_rewards, dtype=torch.float32)
                      )
batch_loss

  batch_loss = loss_func_manual(activations=torch.tensor(activations, dtype=torch.float32),


tensor(2.9831)

`nn.LogSoftmax` returns a function object or whatever it's called. Its argument `dim` is the dimension over which to softmax.

In [17]:
log_probs = nn.LogSoftmax(dim=-1)(activations)
chosen_acts = torch.randint(0,2,size=(5,), dtype=torch.int64)
log_probs, chosen_acts

(tensor([[-0.6670, -0.7200],
         [-0.5605, -0.8462],
         [-0.4526, -1.0106],
         [-0.7632, -0.6277],
         [-0.5205, -0.9020]], grad_fn=<LogSoftmaxBackward>),
 tensor([0, 1, 1, 1, 0]))

`torch.gather` works like this. `log_probs` has shape (step_batch_size, n_acts) while chosen_acts has shape (step_batch_size). You need to put each of the indices in chosen_acts in its own array for `gather` to work, so use `unsqueeze` in the second dimension to make the shape (step_batch_size, n_acts). Finally, after gather, squeeze out the redundant second dimension.

In [35]:
selected_log_probs = torch.gather(log_probs, -1, chosen_acts.unsqueeze(1)).squeeze()
selected_log_probs

tensor([-0.6670, -0.8462, -1.0106, -0.6277, -0.5205],
       grad_fn=<SqueezeBackward0>)

In [36]:
weights = torch.tensor(cum_disc_rewards, dtype=torch.float32)
weights

tensor([2.8420, 3.8808, 5.9402, 3.9800, 2.0000])

In [37]:
selected_log_probs * weights

tensor([-1.8955, -3.2838, -6.0032, -2.4982, -1.0410], grad_fn=<MulBackward0>)

## Training Loop

### Test forward propagation:

In [38]:
state = np.random.randn(4)
state

array([ 0.03926912,  2.31835182, -0.97210445, -0.22401287])

Looks like the `nn.Sequential` object will take a single sample with no problem.

In [39]:
activations = net(torch.tensor(state, dtype=torch.float32))
activations

tensor([0.3876, 0.1584], grad_fn=<AddBackward0>)

### Test action sampling:

`Categorical` exhibits weird behaviour...

`Categorical`'s `probs` argument takes in a tensor of 'probabilities' in range `[0, inf)` ie. non-negative but does not need to sum to 1, as the class will automatically normalize the values to make the distribution. Make sure to sigmoid or softmax activations before passing this argument.

`Categorical`'s `logits` argument takes a tensor of values in range `(-inf, inf)` and will turn it into a probability distribution that sums to 1, probably with softmax but idk.

In [40]:
probs_list = [0.25, 0.25, 0.21, 0.56]
dist = torch.distributions.categorical.Categorical(probs=torch.tensor(probs_list))
print(f"Normalize: {dist.probs}\nThen natural log: {dist.logits}")

Normalize: tensor([0.1969, 0.1969, 0.1654, 0.4409])
Then natural log: tensor([-1.6253, -1.6253, -1.7997, -0.8188])


In [41]:
logits_list = [-1.05, -0.15, 0.41, 1.20]
dist = torch.distributions.categorical.Categorical(logits=torch.tensor(logits_list))
print(f"Softmax of logits: {dist.probs}\nThen natural log: {dist.logits}")

Softmax of logits: tensor([0.0580, 0.1426, 0.2496, 0.5499])
Then natural log: tensor([-2.8480, -1.9480, -1.3880, -0.5980])


### Training loop:

In [53]:
def train(env_name='CartPole-v0', 
          hidden_sizes=[32], 
          lr=1e-2, 
          num_epochs=50, 
          step_batch_size=5000, 
          render=False
         ):
    
    env = gym.make(env_name)
    assert isinstance(env.observation_space, Box), \
        "This example only works for envs with continuous state spaces."
    assert isinstance(env.action_space, Discrete), \
        "This example only works for envs with discrete action spaces."
    
    obs_dim = env.observation_space.shape[0]
    n_acts = env.action_space.n
    action_space = np.arange(n_acts, dtype=np.int64)
    
    net = mlp(sizes=[obs_dim]+hidden_sizes+[n_acts]).to(device)
    
    optimizer = optim.Adam(net.parameters(), lr=lr)
    
    for epoch in range(1, num_epochs+1):
        
        # Epoch-specific variables, resets each epoch
        batch_states = []      # State at each step, shape is (num steps over all episodes this epoch ie. >= step_batch_size, obs_dim)
        batch_acts = []        # Action at each step, shape is (num steps over all episodes this epoch, n_acts)
        batch_weights = []     # Cumulative future discounted reward at each step, shape is (num steps over all episodes this epoch)
        batch_ep_rets = []     # Returns for each episode in epoch, shape is (num episodes this epoch)
        batch_ep_lens = []     # Lengths (number of steps) of each episode in epoch, shape is (num episodes this epoch)
        
        # Episode-specific variables, resets each episode
        cur_state = env.reset()
        done = False
        ep_rewards = []
        render_episode = True
        
        while True:
            
            activations = net(torch.tensor(cur_state, dtype=torch.float32).to(device)).clone().detach()
                # clone: make deep copy
                # detach: separate from autograd computation graph,
                    # to prevent accidentally changing gradient buffers with this calculation
            action_probs = nn.Softmax(dim=-1)(activations).cpu().numpy()
#             action = Categorical(probs=action_probs).sample().item()
            action = np.random.choice(action_space, p=action_probs)
            
            next_state, reward, done, _ = env.step(action)
            
            batch_states.append(cur_state.copy())
            batch_acts.append(action)
            ep_rewards.append(reward)
            
            cur_state = next_state
            
            if render_episode and render:
                env.render()
            
            if done:
                # If episode over record info about episode
                ep_ret, ep_len = sum(ep_rewards), len(ep_rewards)
                batch_ep_rets.append(ep_ret)
                batch_ep_lens.append(ep_len)
                
                batch_weights.extend(accumulate_discount(ep_rewards, gamma=0.99))
                
                # Reset episode-specific variables
                cur_state = env.reset()
                done = False
                ep_rewards = []        
                render_episode = False
                
                if len(batch_states) >= step_batch_size:
                    '''
                    We are only allowed to break at the end of an episode.
                    If at the end of this episode we finally have enough steps,
                        then we take this opportunity to break and call it an epoch.
                    '''
                    break

                    
        optimizer.zero_grad()
        batch_loss = loss_func_manual(activations=net(torch.tensor(batch_states, dtype=torch.float32).to(device)), 
                                      chosen_acts=torch.tensor(batch_acts, dtype=torch.int64).to(device), 
                                      weights=torch.tensor(batch_weights, dtype=torch.float32).to(device)
                                     )
        # Gradient flows from batch_loss -> activations -> parameters & hidden activations in network
        batch_loss.backward()
        optimizer.step()
        print('epoch: %3d \t loss: %.3f \t return: %.3f \t ep_len: %.3f'%
                (epoch, batch_loss, np.mean(batch_ep_rets), np.mean(batch_ep_lens)))

In [52]:
train()

epoch:   1 	 loss: 6.855 	 return: 17.106 	 ep_len: 17.106
epoch:   2 	 loss: 7.233 	 return: 18.087 	 ep_len: 18.087
epoch:   3 	 loss: 8.578 	 return: 20.946 	 ep_len: 20.946
epoch:   4 	 loss: 8.806 	 return: 21.996 	 ep_len: 21.996
epoch:   5 	 loss: 10.113 	 return: 24.995 	 ep_len: 24.995
epoch:   6 	 loss: 11.703 	 return: 29.763 	 ep_len: 29.763
epoch:   7 	 loss: 10.670 	 return: 28.764 	 ep_len: 28.764
epoch:   8 	 loss: 12.359 	 return: 33.939 	 ep_len: 33.939
epoch:   9 	 loss: 13.270 	 return: 37.722 	 ep_len: 37.722
epoch:  10 	 loss: 14.625 	 return: 40.248 	 ep_len: 40.248
epoch:  11 	 loss: 15.362 	 return: 44.009 	 ep_len: 44.009
epoch:  12 	 loss: 15.018 	 return: 45.964 	 ep_len: 45.964
epoch:  13 	 loss: 14.695 	 return: 45.090 	 ep_len: 45.090
epoch:  14 	 loss: 16.569 	 return: 53.617 	 ep_len: 53.617
epoch:  15 	 loss: 15.742 	 return: 51.429 	 ep_len: 51.429
epoch:  16 	 loss: 17.410 	 return: 57.598 	 ep_len: 57.598
epoch:  17 	 loss: 17.067 	 return: 59.388 	