Source https://github.com/rgilman33/simple-A2C/blob/master/3_A2C-nstep-TUTORIAL.ipynb


In [19]:
import gym
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.autograd as autograd
from torch.autograd import Variable



Adding another head to A2C: a next-state predictor. If we can train the model to accurately predict its own next state and reward, we could use it to generate additional training data. Inspired by how humans do "mental practice" by imagining scenarios in their head. Like that study with basketball players taking free throws: Those who practiced mentally performed better, even with same amount of "live" data. This sort of sample efficiency isn't really necessary when we have access to an env simulator, eg Gym, but could be very helpful for robotics.


In [20]:
#N_STEPS = 5
SEED = 1
N_GAMES = 1000
N_ACTIONS = 2
N_INPUTS = 4

states = []
actions = []
rewards = []

env = gym.make('CartPole-v0')
env.reset()

array([-0.02601533, -0.0124399 , -0.04527301,  0.02549711])

In [21]:
class ActorCritic(nn.Module):
    def __init__(self):
        super(ActorCritic, self).__init__()
        
        self.common_nn = nn.Sequential(
            nn.Linear(N_INPUTS, 64),
            nn.ReLU(),
            nn.Linear(64, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU()
        )
        
        self.actor = nn.Linear(64, N_ACTIONS)
        self.critic = nn.Linear(64, 1)
        self.predictor = nn.Linear(64, N_INPUTS)
    
    def forward(self, x):
        x = self.common_nn(x)     
        return x
    
    def get_action_probs(self, x):
        # convert states, compute logits, use softmax to get probability
        torch_states = torch.as_tensor(x, dtype=torch.float32)
        common_result = self(torch_states)
        actor_result = self.actor(common_result)
        return F.softmax(actor_result, dim=1).detach().numpy()

    def evaluate_actions(self, x):
        x = self(x)
       
        action_probs = F.softmax(self.actor(x))
        state_values = self.critic(x)
        next_state = self.predictor(x)
        
        return action_probs, state_values, next_state

In [22]:
def generate_session(model, t_max=1000):
    """ 
    play a full session with agent and train at the session end.
    returns sequences of states, actions and rewards
    """
    states, actions, rewards = [], [], []
    s = env.reset()
    global action_probs
    for t in range(t_max):
        
        action_probs = model.get_action_probs(np.array([s]))[0]
        # Sample action with given probabilities.
        a = np.random.choice(N_ACTIONS, p=action_probs)
        next_s, r, done, info = env.step(a)
        
        # record session history to train later
        states.append(s)
        actions.append(a)
        rewards.append(r)
        
        s = next_s
        if done:
            break
    return states, actions, rewards

In [23]:
# test it
states, actions, rewards = generate_session(ActorCritic())
print(actions)

[1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1]


In [24]:
def get_cumulative_rewards(rewards,  # rewards at each step
                           gamma=0.99  # discount for reward
                           ):
    """
    take a list of immediate rewards r(s,a) for the whole session 
    compute cumulative returns (a.k.a. G(s,a) in Sutton '16)
    G_t = r_t + gamma*r_{t+1} + gamma^2*r_{t+2} + ...

    The simple way to compute cumulative rewards is to iterate from last to first time tick
    and compute G_t = r_t + gamma*G_{t+1} recurrently

    You must return an array/list of cumulative rewards with as many elements as in the initial rewards.
    """
    G = [rewards[-1]]
    
    for r in rewards[-2::-1]:
        G.append(r + gamma * G[-1])
    return G[::-1]


In [25]:
def to_one_hot(y_tensor, ndims):
    """ helper: take an integer vector and convert it to 1-hot matrix. """
    y_tensor = y_tensor.type(torch.LongTensor).view(-1, 1)
    y_one_hot = torch.zeros(
        y_tensor.size()[0], ndims).scatter_(1, y_tensor, 1)
    return y_one_hot

In [32]:
model = ActorCritic()
optimizer = optim.Adam(model.parameters(), 1e-3)

In [33]:
print(torch.tensor([1,2,3]))
print(torch.tensor([1,2,3]).unsqueeze(1))

tensor([1, 2, 3])
tensor([[1],
        [2],
        [3]])


In [36]:
def train_on_session(states, actions, rewards, gamma=0.99, entropy_coef=1e-3):
    """
    Takes a sequence of states, actions and rewards produced by generate_session.
    Updates agent's weights by following the policy gradient above.
    Please use Adam optimizer with default parameters.
    """
    # cast everything into torch tensors
    states = torch.tensor(states, dtype=torch.float32)
    # print("**********************")
    # print("states", states.shape)
    # print("must be length * state_dim")
    actions = torch.tensor(actions, dtype=torch.int32)
    
    # print(states)
    # print("**********************")
    # print("actions", actions.shape)
    # print("must be length * actions_dim")
    # print(actions)
    # 
    # print("**********************")
    cumulative_returns = np.array(get_cumulative_rewards(rewards, gamma))
    cumulative_returns = torch.tensor(cumulative_returns, dtype=torch.float32)
    
    # print("cum_returns", cumulative_returns.shape)    
    # print("must be length * 1")
    action_probs, state_values, next_states = model.evaluate_actions(states)

    next_state_pred_loss = (states[1:] - next_states[:-1]).pow(2).mean()
    
    # print("next_state_pred_loss", next_state_pred_loss)
    # print("must be single number computed as mean of  St-S'")
    # print("**********************")
    # print("action_probs", action_probs.shape)
    # print("must be length * action_dim")
    # print(action_probs)
    log_probs = action_probs.log()
    # print("log_probs", log_probs.shape)
    # print(log_probs)
    log_probs_for_actions = torch.sum(log_probs * to_one_hot(actions, 
                                                             env.action_space.n), 
                                      dim=1).unsqueeze(1)
    # print("**********************")
    # print("log_probs_for_actions", log_probs_for_actions.shape)
    # print("must be length * 1")
    # print(log_probs_for_actions)
    advantages = cumulative_returns.unsqueeze(1) - state_values
    # print("**********************")
    # print("advantages", advantages.shape)
    # print("must be length * 1")
    # print(advantages)
    # print("**********************")
    # print("state_values", state_values.shape)
    # print("must be length * 1")
    # print(state_values)
    # print("**********************")
    # print("log_probs_for_actions * advantages", (log_probs_for_actions * advantages).shape)
    # print(log_probs_for_actions * advantages)
    # print("**********************")
    action_gain = (log_probs_for_actions * advantages).mean()
    # print("action_gain", action_gain.shape)
    # print("must be single number")
    # print(action_gain)
    # print("**********************")
    entropy = -(action_probs * log_probs).sum(-1).mean()
    # print("**********************")
    # print("entropy", entropy.shape)
    # print("entropy", entropy)
    # print("**********************")
    value_loss = advantages.pow(2).mean()
    
    total_loss = value_loss/50.0 - action_gain - entropy_coef*entropy + next_state_pred_loss
  

    nn.utils.clip_grad_norm(model.parameters(), 0.5)

    total_loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    
    return np.sum(rewards)


In [37]:
for i in range(100):
    rewards = [train_on_session(*generate_session(model))
               for _ in range(100)]  # generate new sessions
    print("mean reward:%.3f" % (np.mean(rewards)))
    if np.mean(rewards) > 500:
        print("You Win!")  # but you can train even further
        break



mean reward:26.640


mean reward:34.580


mean reward:41.070


mean reward:114.130


mean reward:112.070


mean reward:179.210


mean reward:191.950


mean reward:192.030


mean reward:152.510


mean reward:181.890


mean reward:147.070


mean reward:200.000


mean reward:197.920


mean reward:197.480


mean reward:195.460


mean reward:195.260


mean reward:199.250


mean reward:191.260


mean reward:200.000


mean reward:199.810


mean reward:200.000


mean reward:200.000


mean reward:193.240


mean reward:198.770


mean reward:190.670


mean reward:200.000


mean reward:200.000


KeyboardInterrupt: 

**********************
states torch.Size([10, 4])
tensor([[ 2.2296e-02,  3.5152e-02,  5.8884e-03,  3.8122e-03],
        [ 2.2999e-02,  2.3019e-01,  5.9647e-03, -2.8701e-01],
        [ 2.7603e-02,  4.2523e-01,  2.2452e-04, -5.7780e-01],
        [ 3.6107e-02,  6.2034e-01, -1.1332e-02, -8.7042e-01],
        [ 4.8514e-02,  8.1562e-01, -2.8740e-02, -1.1666e+00],
        [ 6.4827e-02,  1.0111e+00, -5.2073e-02, -1.4682e+00],
        [ 8.5049e-02,  1.2068e+00, -8.1436e-02, -1.7767e+00],
        [ 1.0919e-01,  1.4028e+00, -1.1697e-01, -2.0935e+00],
        [ 1.3724e-01,  1.2090e+00, -1.5884e-01, -1.8392e+00],
        [ 1.6142e-01,  1.4055e+00, -1.9562e-01, -2.1767e+00]])
**********************
actions torch.Size([10])
tensor([1, 1, 1, 1, 1, 1, 1, 0, 1, 1], dtype=torch.int32)
**********************
cum_returns torch.Size([10])
**********************
action_probs  torch.Size([10, 2])
tensor([[0.0742, 0.9258],
        [0.0250, 0.9750],
        [0.0152, 0.9848],
        [0.0151, 0.9849],
        [0



RuntimeError: grad can be implicitly created only for scalar outputs