In [1]:
%matplotlib notebook

import torch
import torch.nn.functional as F
from torch import nn
from torch import optim
from torch.distributions import Categorical, Normal

import numpy as np
import cv2
import matplotlib.pyplot as plt

from statistics import stdev, mean
import copy

import gym
import gym_2048

from model import Network
from utils import Trajectory, plot, set_seed

In [2]:
def select_action(action_probs):
    m = Categorical(action_probs)    # multinomial over actions
    action = m.sample()
    return action.item(), m.log_prob(action), m.entropy()

In [3]:
def update(model, clip=0.2, beta=0.01):
    observations = model['batch'].get_obs()
    
    log_probs, state_values, entropies = [], [], []
    
    # collect trajectory for update
    for obs in observations:
        action_probs, state_value = model['net'](obs)
        _, log_prob, entropy = select_action(action_probs)
        state_values.append(state_value)
        log_probs.append(log_prob)
        entropies.append(entropy)
    
    state_values = torch.cat(state_values)
    log_probs = torch.stack(log_probs, dim=0).unsqueeze(1)
    entropy = torch.tensor(entropies).mean()
    
    policy_loss = []
    critic_loss = []

    for traj in range(model['batch'].n_traj):
        # retrieve stored rollout data
        returns, advantages, old_log_probs = model['batch'].fetch(traj)
    
        # subtract log probs to get ratio
        ratio = (log_probs - old_log_probs).exp()
        obj = ratio * advantages
        # cliped update to regulate size of each update
        obj_clipped = ratio.clamp(1.0 - clip, 1.0 + clip) * advantages
        # set loss to the minimum 
        policy_loss.append(-torch.min(obj, obj_clipped) - beta * entropy)
        
        # critic loss
        critic_loss.append(F.mse_loss(state_values, returns, reduction='none'))

        loss = torch.stack(policy_loss).mean() + torch.stack(critic_loss).mean()
        
        # backwards pass of network, parameter update, and lr decay
    model['optim'].zero_grad()
    loss.backward()
    nn.utils.clip_grad_norm_(model['net'].parameters(), clip)
    model['optim'].step()
        
    model['sched'].step()
    
#     with torch.no_grad():
#         for param in model['net'].parameters():
#             param.add_(torch.randn(param.size()) * 0.0001)
            
#     model['batch'].set_old_policy(log_probs.detach())
    model['batch'].clear()    

In [4]:
def train(env, model, n_episodes=20, max_timesteps=2000):
    fig, ax = plt.subplots(1,1, figsize=(9,4))
    
    avg_returns, returns, stds = [], [], [0,0]
    max_values, steps = [], []
    
    for episode in range(1, n_episodes+1):
        r, mv = [], []
        
        raw_obs = env.reset()

        for ts in range(1, max_timesteps+1):
            state = torch.from_numpy(np.ravel(raw_obs)).float().unsqueeze(0)
            
            mv.append(torch.max(state).item())
         
            action_probs, state_value = model['net'](state)
            action, log_prob, _ = select_action(action_probs)
            raw_obs, reward, done, _ = env.step(action)

            # capute values for update
            model['batch'].add(state_value, reward, log_prob, state, done)
            
            # capture reward for end-of-episode statistics
            r.append(reward)
            
            if model['batch'].batch_full():
                update(model)

            if done:
                # calculate end-of-episode statistics
                returns.append(sum(r))
                avg_returns.append(mean(returns))
                if episode > 1:
                    stds.append(stdev(returns))
            
                max_values.append(max(mv))
                steps.append(ts)
                
                break
        
        if episode % 10 == 0:
            print('Episode: {} - Avg Steps: {:.2f} - Avg Returns: {:.2f} - Max Value: {:.2f}'
                  .format(episode, mean(steps), mean(returns), max(max_values))
                 )
            
            max_values, n_zeros = [], []
            
        plot(returns, avg_returns, stds, fig, ax)

In [5]:
def main(environment='2048-v0', n_traj=100, rollout=20, n_episodes=5000, max_timesteps=2000):
    env = gym.make('CartPole-v0')
    
    set_seed(env, 0)

    obs_shape = env.observation_space.shape
    obs_dim = obs_shape[0]# * obs_shape[1]
    n_actions = env.action_space.n

    model = {}

    model['net'] = Network(obs_dim, n_actions)
    model['optim'] = optim.Adam(model['net'].parameters(), lr=1e-2)

    T_max = 400
    eta_min = 1e-4
    model['sched'] = optim.lr_scheduler.CosineAnnealingLR(model['optim'], T_max, eta_min)
    
    model['batch'] = Trajectory(n_traj=n_traj, rollout=rollout, gamma=0.9, lam=0.95)

    train(env, model, n_episodes, max_timesteps)

In [6]:
if __name__ == '__main__':
    main()

<IPython.core.display.Javascript object>

Episode: 10 - Avg Steps: 24.10 - Avg Returns: 24.10 - Max Value: 2.67
Episode: 20 - Avg Steps: 23.60 - Avg Returns: 23.60 - Max Value: 1.89
Episode: 30 - Avg Steps: 23.33 - Avg Returns: 23.33 - Max Value: 1.91
Episode: 40 - Avg Steps: 24.48 - Avg Returns: 24.48 - Max Value: 2.68
Episode: 50 - Avg Steps: 24.22 - Avg Returns: 24.22 - Max Value: 2.78
Episode: 60 - Avg Steps: 23.38 - Avg Returns: 23.38 - Max Value: 2.30
Episode: 70 - Avg Steps: 23.24 - Avg Returns: 23.24 - Max Value: 2.37


KeyboardInterrupt: 