In [1]:
%matplotlib notebook

import torch
import torch.nn.functional as F
from torch import nn
from torch import optim
from torch.distributions import Categorical

import numpy as np
import scipy.signal
import cv2
import matplotlib.pyplot as plt

from statistics import stdev, mean
import copy

import gym
import gym_2048

from model import Network
from utils import Batch, normalize, plot, set_seed

In [2]:
def discount(delta, g_l):
    return scipy.signal.lfilter([1], [1, -g_l], delta.detach().numpy()[::-1], axis=0)[::-1]

In [3]:
def gae(rewards, state_values, dones, gamma=0.9, lam=0.15):
    return discount(rewards[:-1] + gamma*state_values[1:]*(1 - dones[1:]) - state_values[:-1], gamma*lam)

In [4]:
def closure():
    optimizer.zero_grad()
    output = model(input)
    loss = loss_fn(output, target)
    loss.backward()
    return loss

In [5]:
def ac_update(model, state_values, log_probs, old_log_probs, entropies, rewards, 
              dones, clip=0.2, gamma=0.9, lam=0.95, beta=0.1):
    R = 0
    returns = []
    policy_loss = []
    critic_loss = []

    # calculate return from each time step ('reward-to-go')
    for r in rewards[::-1]:
        R = r + gamma * R
        returns.insert(0, R)
        
    # convert lists of required values into tensors
    returns = torch.tensor(returns).unsqueeze(1)
    state_values = torch.cat(state_values)
    log_probs = torch.cat(log_probs).unsqueeze(1)
    dones = torch.tensor(dones).unsqueeze(1).type(torch.FloatTensor)
    
    # calculate entropy
    # want high entropy (initially), as it corresponds to 'surprise' that a state was encountered,
    # which means that to encourage entropy is to encourage exploration
    entropy = torch.cat(entropies).detach().mean()
    
    # calculate GAE, normalize advantage estimate, and set to correct size
    advantage = gae(returns, state_values, dones, gamma, lam)
    advantage = torch.tensor((advantage-advantage.mean())/(advantage.std()+1e-8), 
                             dtype=torch.float32)   
    advantage = torch.cat([advantage, torch.tensor([0], dtype=torch.float32).unsqueeze(0)])

    # normalize returns
    # !!!!!!!!!!!!!!!!! NEED TO CALCULATE GAE BEFORE NORMALIZING RETURNS !!!!!!!!!!!!!!!!!!!!!
    returns = (returns-returns.mean())/(returns.std()+1e-8)
    
    # policy (actor) loss
#     if old_log_probs is not None:
#         # ratio of old policy to new one
#         # helps ensure that that update step is not too big
#         print(log_probs.size(), old_log_probs.size())
#         ratio = (log_probs - old_log_probs).exp()
#         obj = ratio * advantages
#         # cliped update
#         obj_clipped = ratio.clamp(1.0 - clip,
#                                   1.0 + clip) * advantages
#         # set loss to the minimum 
#         policy_loss = -torch.min(obj, obj_clipped).mean() - beta * entropy
#     else:
    policy_loss = -log_probs * advantage
        
    # critic loss
    critic_loss = F.smooth_l1_loss(state_values, returns, reduction='none')
    # calculate combined loss
    loss = policy_loss.sum() + critic_loss.sum()

    # backwards pass of network, parameter update, and lr decay
    model['optim'].zero_grad()
    loss.backward()
#     nn.utils.clip_grad_norm_(model['net'].parameters(), clip)
    model['optim'].step()
    model['scheduler'].step()
    
#     with torch.no_grad():
#         for param in model['net'].parameters():
#             param.add_(torch.randn(param.size()) * 0.00001)
            
    return log_probs

In [6]:
def select_action(action_probs):
    m = Categorical(action_probs)    # multinomial over actions
    action = m.sample()
    return action.item(), m.log_prob(action), m.entropy()

In [7]:
def update(model, state_values, log_probs, old_log_probs, entropies, rewards, dones):
    old_log_probs = ac_update(model, state_values, log_probs, old_log_probs, entropies, rewards, dones)
    return [], [], [], [], [], old_log_probs

In [8]:
def train(env, model, rollout=7, rollout_double=20, n_episodes=200, max_timesteps=2000):
    fig, ax = plt.subplots(1,1, figsize=(9,4))
    
    avg_returns, returns, stds = [], [], [0,0]
    max_values, n_zeros, steps = [], [], []
    
    state_values, rewards, log_probs, entropies, dones = [], [], [], [], []
    old_log_probs = None
    
    for episode in range(1, n_episodes+1):
        r, mv, zeros = [], [], []
        
        obs = env.reset()

        for ts in range(1, max_timesteps+1):
            obs = np.ravel(obs)    # flatten state for network
            obs = torch.from_numpy(obs).float().unsqueeze(0)
            
            mv.append(torch.max(obs).item())
            zeros.append(list(obs[obs==0].size())[0])
         
            action_probs, state_value = model['net'](obs)
            action, log_prob, entropy = select_action(action_probs)
            obs, reward, done, _ = env.step(action)

            # capute values for update
            state_values.append(state_value)
            log_probs.append(log_prob)
            rewards.append(reward)
            entropies.append(entropy)
            dones.append(done)
            
            # capture reward for end-of-episode statistics
            r.append(reward)
            
#             if ts % rollout == 0:
#                 state_values, rewards, entropies, log_probs, dones, old_log_probs = \
#                         update(model, state_values, log_probs, old_log_probs entropies, rewards, dones)

            if done:
                # calculate end-of-episode statistics
#                 print(r)
                returns.append(sum(r))
                avg_returns.append(mean(returns))
                if episode > 1:
#                     print(episode, returns)
                    stds.append(stdev(returns))
            
                max_values.append(max(mv))
                n_zeros.append(mean(zeros))
                steps.append(ts)
                
                break
        
        state_values, rewards, entropies, log_probs, dones, old_log_probs = \
                        update(model, state_values, log_probs, old_log_probs, entropies, rewards, dones) 
                                                                                   
                                                                       
        
        if episode % 10 == 0:
            print('Episode: {} - Avg Steps: {:.2f} - Avg Returns: {:.2f} - Avg Zeros: {:.2f} - Max Value: {:.2f}'
                  .format(episode, mean(steps), mean(returns), mean(n_zeros), max(max_values))
                 )
            
            max_values, n_zeros = [], []
            
        plot(returns, avg_returns, stds, fig, ax)

In [9]:
def main(environment='2048-v0', rollout=200, rollout_double=20, n_episodes=300, max_timesteps=2000):
    env = gym.make('CartPole-v0')
    
    set_seed(env, 0)

    obs_shape = env.observation_space.shape
    obs_dim = obs_shape[0]# * obs_shape[1]
    n_actions = env.action_space.n

    model = {}

    model['net'] = Network(obs_dim, n_actions)
    model['optim'] = optim.Adam(model['net'].parameters(), lr=1e-2)

    T_max = 400
    eta_min = 1e-4
    model['scheduler'] = optim.lr_scheduler.CosineAnnealingLR(model['optim'], T_max, eta_min)

    train(env, model, rollout, rollout_double, n_episodes, max_timesteps)

In [10]:
if __name__ == '__main__':
    main()

<IPython.core.display.Javascript object>

torch.Size([35, 1]) torch.Size([14, 1])


RuntimeError: The size of tensor a (35) must match the size of tensor b (14) at non-singleton dimension 0