In [3]:
# Module-level placeholders. load_checkpoint() declares all three as globals
# and overwrites them with the values stored in the checkpoint's parameters file.
hp = None                 # hyperparameter object (checkpoint.hp)
ENV = None                # environment id handed to gym.make (checkpoint.env)
ENV_MASK_VELOCITY = None  # when truthy, the env is wrapped in MaskVelocityWrapper

In [4]:
# Colab only: mount Google Drive so the workspace folder is reachable.
# Checkpoints are read from, and videos written under, WORKSPACE_PATH.
from google.colab import drive
drive.mount("/google_drive")
WORKSPACE_PATH = "/google_drive/My Drive/workspace"

Mounted at /google_drive


In [5]:
# Install the RL environment and headless-rendering dependencies.
# Only stdout is redirected to /dev/null, so pip errors written to stderr still show up.
# NOTE(review): only gym is version-pinned; pin the other packages for reproducibility.
# NOTE(review): `piglet` is a different package from `pyglet` (the windowing library gym
# renders with) -- `pyglet` may have been intended; confirm before changing.
!pip install -q gym[box2d]==0.15.4 dotmap line_profiler > /dev/null
!apt-get install -qq python-opengl xvfb x11-utils > /dev/null
!pip install -q pyvirtualdisplay piglet > /dev/null

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow-probability 0.16.0 requires cloudpickle>=1.3, but you have cloudpickle 1.2.2 which is incompatible.[0m


In [6]:
# Start an invisible 400x300 virtual display so environments can render
# frames on a machine without a physical screen.
from pyvirtualdisplay import Display
display = Display(visible=0, size=(400, 300))
display.start()

<pyvirtualdisplay.display.Display at 0x7fde379ab850>

In [7]:
import torch
from torch import nn
from torch import distributions
import gym
import copy
from torch.utils.tensorboard import SummaryWriter
from torch import optim
import numpy as np
from itertools import count
import torch.nn.functional as F
import re
import copy
import time
import math
import pathlib
import time
import pickle
import os
from dataclasses import dataclass
import gc
from dotmap import DotMap
from base64 import b64encode
from IPython.display import HTML

# Select version 2 of tensorflow to avoid warnings.
%tensorflow_version 2.x

In [8]:
class Actor(nn.Module):
    """
    Recurrent policy network: LSTM -> hidden linear layer (ELU) -> policy head.

    Layer sizes are read from the module-level `hp` object (`hp.hidden_size`,
    `hp.recurrent_layers`), so `hp` must be populated before construction.
    The LSTM hidden/cell state is kept on the instance in `self.hidden_cell`
    and carried over between calls to `forward`.
    """
    def __init__(self, state_dim, action_dim, continuous_action_space, trainable_std_dev, init_log_std_dev=None):
        """
        Args:
            state_dim: size of one observation vector (LSTM input size).
            action_dim: number of action dimensions (continuous) or number of
                discrete actions.
            continuous_action_space: if True `forward` returns a
                MultivariateNormal, otherwise a Categorical distribution.
            trainable_std_dev: whether `log_std_dev` receives gradients.
            init_log_std_dev: initial value for every entry of `log_std_dev`.
                `None` (the declared default) is treated as 0.0.
        """
        super().__init__()
        self.lstm = nn.LSTM(state_dim, hp.hidden_size, num_layers=hp.recurrent_layers)
        self.layer_hidden = nn.Linear(hp.hidden_size, hp.hidden_size)
        self.layer_policy_logits = nn.Linear(hp.hidden_size, action_dim)
        self.action_dim = action_dim
        self.continuous_action_space = continuous_action_space
        # The default of None previously raised a TypeError (`None * tensor`);
        # fall back to a neutral log value of 0.0 so the default is usable.
        if init_log_std_dev is None:
            init_log_std_dev = 0.
        self.log_std_dev = nn.Parameter(init_log_std_dev * torch.ones((action_dim), dtype=torch.float), requires_grad=trainable_std_dev)
        # Plain tensor (not a registered buffer), so it is moved to the input's device in forward().
        self.covariance_eye = torch.eye(self.action_dim).unsqueeze(0)
        self.hidden_cell = None

    def get_init_state(self, batch_size, device):
        """Reset `self.hidden_cell` to an all-zero (hidden, cell) tuple on `device`."""
        self.hidden_cell = (torch.zeros(hp.recurrent_layers, batch_size, hp.hidden_size).to(device),
                            torch.zeros(hp.recurrent_layers, batch_size, hp.hidden_size).to(device))

    def forward(self, state, terminal=None):
        """
        Advance the recurrent state with `state` and return the action distribution.

        Args:
            state: tensor whose dimension 1 is the batch dimension; it is fed
                directly to the LSTM.
            terminal: optional tensor with `batch_size` elements; entries equal
                to 1 zero the stored recurrent state for that batch element
                before the LSTM is applied.

        Returns:
            A MultivariateNormal (continuous action space, defined on the CPU)
            or a Categorical distribution (discrete action space).
        """
        batch_size = state.shape[1]
        device = state.device
        # (Re)initialise the recurrent state on first use or when the batch size changes.
        if self.hidden_cell is None or batch_size != self.hidden_cell[0].shape[1]:
            self.get_init_state(batch_size, device)
        if terminal is not None:
            keep_mask = (1. - terminal).reshape(1, batch_size, 1)
            # Keep the state a tuple, matching get_init_state() and the LSTM's own output.
            self.hidden_cell = tuple(value * keep_mask for value in self.hidden_cell)
        _, self.hidden_cell = self.lstm(state, self.hidden_cell)
        # Only the final hidden state of the top LSTM layer feeds the policy head.
        hidden_out = F.elu(self.layer_hidden(self.hidden_cell[0][-1]))
        policy_logits_out = self.layer_policy_logits(hidden_out)
        if self.continuous_action_space:
            # NOTE(review): exp(log_std_dev) is placed directly on the covariance diagonal,
            # i.e. it acts as a variance rather than a squared standard deviation. Left
            # unchanged so that existing checkpoints keep their behaviour.
            cov_matrix = self.covariance_eye.to(device).expand(batch_size, self.action_dim, self.action_dim) * torch.exp(self.log_std_dev.to(device))
            # We define the distribution on the CPU since otherwise operations fail with CUDA illegal memory access error.
            policy_dist = torch.distributions.multivariate_normal.MultivariateNormal(policy_logits_out.to("cpu"), cov_matrix.to("cpu"))
        else:
            policy_dist = distributions.Categorical(F.softmax(policy_logits_out, dim=1))
        return policy_dist
    
class Critic(nn.Module):
    """
    Recurrent value network: LSTM -> hidden linear layer (ELU) -> scalar value head.

    Layer sizes come from the module-level `hp` object. The LSTM state lives in
    `self.hidden_cell` and persists between calls to `forward`.
    """
    def __init__(self, state_dim):
        super().__init__()
        width = hp.hidden_size
        self.layer_lstm = nn.LSTM(state_dim, width, num_layers=hp.recurrent_layers)
        self.layer_hidden = nn.Linear(width, width)
        self.layer_value = nn.Linear(width, 1)
        self.hidden_cell = None

    def get_init_state(self, batch_size, device):
        """Replace `self.hidden_cell` with a zero-filled (hidden, cell) tuple on `device`."""
        state_shape = (hp.recurrent_layers, batch_size, hp.hidden_size)
        hidden = torch.zeros(*state_shape).to(device)
        cell = torch.zeros(*state_shape).to(device)
        self.hidden_cell = (hidden, cell)

    def forward(self, state, terminal=None):
        """
        Step the LSTM with `state` and return the value estimate per batch element.

        `terminal`, when given, must hold `batch_size` elements; entries equal
        to 1 zero the stored recurrent state for that batch element first.
        """
        n_batch = state.shape[1]
        # Start from a fresh state on first use or whenever the batch size changes.
        needs_reset = self.hidden_cell is None or self.hidden_cell[0].shape[1] != n_batch
        if needs_reset:
            self.get_init_state(n_batch, state.device)
        if terminal is not None:
            self.hidden_cell = [
                part * (1. - terminal).reshape(1, n_batch, 1)
                for part in self.hidden_cell
            ]
        _, self.hidden_cell = self.layer_lstm(state, self.hidden_cell)
        # Final hidden state of the top LSTM layer.
        top_hidden = self.hidden_cell[0][-1]
        features = F.elu(self.layer_hidden(top_hidden))
        return self.layer_value(features)

In [9]:
@dataclass
class HyperParameters():
    """
    Container for training and model hyperparameters.

    Instances are restored from checkpoints and exposed through the
    module-level `hp` variable. Fields without a comment below are consumed by
    code outside this section of the notebook.
    """
    scale_reward:         float
    min_reward:           float
    # Width of the LSTM state and of the hidden linear layers; it is used as a
    # layer size and tensor dimension, so it must be an integer.
    hidden_size:          int
    batch_size:           int
    discount:             float
    gae_lambda:           float
    ppo_clip:             float
    ppo_epochs:           int
    max_grad_norm:        float
    entropy_factor:       float
    # Learning rates passed to the actor / critic AdamW optimizers.
    actor_learning_rate:  float
    critic_learning_rate: float
    recurrent_seq_len:    int
    # Number of stacked LSTM layers in both the actor and the critic.
    recurrent_layers:     int
    rollout_steps:        int
    parallel_rollouts:    int
    patience:             int
    # Apply to continuous action spaces only
    trainable_std_dev:    bool
    init_log_std_dev:     float
@dataclass
class StopConditions:
    """
    Parameters and running variables used to decide when training stops.
    """
    # Starts at a very low sentinel value (minus one million).
    best_reward: float = -1_000_000.0
    # Starts at zero; presumably counts iterations without improvement -- confirm against the training loop.
    fail_to_improve_count: int = 0
    # Default iteration limit of one million.
    max_iterations: int = 1_000_000
def get_env_space():
    """
    Return observation dimensions, action dimensions and whether or not the action space is continuous.

    A throwaway instance of the environment named by the global `ENV` is
    created to inspect its spaces and is closed again before returning.

    Returns:
        Tuple `(obsv_dim, action_dim, continuous_action_space)`.
    """
    env = gym.make(ENV)
    try:
        # isinstance also accepts subclasses of Box, unlike an exact type comparison.
        continuous_action_space = isinstance(env.action_space, gym.spaces.box.Box)
        if continuous_action_space:
            action_dim = env.action_space.shape[0]
        else:
            action_dim = env.action_space.n
        obsv_dim = env.observation_space.shape[0]
    finally:
        # Release the probe environment's resources even if inspection fails.
        env.close()
    return obsv_dim, action_dim, continuous_action_space

def load_checkpoint(iteration):
    """
    Read the checkpoint saved for `iteration` from BASE_CHECKPOINT_PATH.

    Side effect: the globals ENV, ENV_MASK_VELOCITY and hp are overwritten with
    the values stored in the checkpoint's parameters file.

    Returns:
        Tuple of (actor state dict, critic state dict, actor optimizer state
        dict, critic optimizer state dict, stop conditions). All state dicts
        are mapped onto the CPU.
    """
    global ENV, ENV_MASK_VELOCITY, hp

    checkpoint_dir = BASE_CHECKPOINT_PATH + f"{iteration}/"

    # NOTE(review): pickle.load can execute arbitrary code; only load checkpoints you trust.
    with open(checkpoint_dir + "parameters.pt", "rb") as params_file:
        checkpoint = pickle.load(params_file)

    ENV = checkpoint.env
    ENV_MASK_VELOCITY = checkpoint.env_mask_velocity
    hp = checkpoint.hp

    # Same order as the returned tuple: actor, critic, then their optimizers.
    weight_files = ("actor.pt", "critic.pt", "actor_optimizer.pt", "critic_optimizer.pt")
    state_dicts = [
        torch.load(checkpoint_dir + file_name, map_location=torch.device("cpu"))
        for file_name in weight_files
    ]

    return (*state_dicts, checkpoint.stop_conditions)

def load_from_checkpoint(max_checkpoint_iteration):
    """
    Rebuild the actor, the critic and their AdamW optimizers from a checkpoint.

    Returns:
        Tuple `(actor, critic, actor_optimizer, critic_optimizer,
        max_checkpoint_iteration, stop_conditions)`.
    """
    # Loading the checkpoint also sets the globals hp / ENV that the steps below rely on.
    (actor_weights, critic_weights,
     actor_opt_state, critic_opt_state,
     stop_conditions) = load_checkpoint(max_checkpoint_iteration)

    obsv_dim, action_dim, continuous_action_space = get_env_space()
    actor = Actor(obsv_dim,
                  action_dim,
                  continuous_action_space=continuous_action_space,
                  trainable_std_dev=hp.trainable_std_dev,
                  init_log_std_dev=hp.init_log_std_dev)
    critic = Critic(obsv_dim)

    actor_optimizer = optim.AdamW(actor.parameters(), lr=hp.actor_learning_rate)
    critic_optimizer = optim.AdamW(critic.parameters(), lr=hp.critic_learning_rate)

    actor.load_state_dict(actor_weights, strict=True)
    critic.load_state_dict(critic_weights, strict=True)
    actor_optimizer.load_state_dict(actor_opt_state)
    critic_optimizer.load_state_dict(critic_opt_state)

    # Optimizers have no "to" method, so every tensor held in their state is
    # moved to the CPU by hand.
    for optimizer in (actor_optimizer, critic_optimizer):
        for param_state in optimizer.state.values():
            for key, value in param_state.items():
                if isinstance(value, torch.Tensor):
                    param_state[key] = value.to("cpu")

    return actor, critic, actor_optimizer, critic_optimizer, max_checkpoint_iteration, stop_conditions


def visualise_policy(actor):
    """
    Run one episode with `actor` on the CPU while recording a video.

    The environment named by the global `ENV` is created (wrapped in
    MaskVelocityWrapper when `ENV_MASK_VELOCITY` is truthy) and monitored into
    f"{WORKSPACE_PATH}/videos". Actions are sampled from the policy
    distribution, so repeated runs can differ.

    Args:
        actor: policy network; it is moved to the CPU and put in eval mode.

    Returns:
        The monitor's `file_infix`, which is part of the recorded file names.
    """
    env = gym.make(ENV)
    if ENV_MASK_VELOCITY:
        env = MaskVelocityWrapper(env)
    env = gym.wrappers.Monitor(env, f"{WORKSPACE_PATH}/videos", force=True)
    try:
        observation = env.reset()
        # All zeros: the recurrent state is never reset within the single episode.
        done_mask = torch.zeros(1)
        done = False
        step_idx = 0
        actor = actor.to("cpu")
        actor.eval()
        actor.get_init_state(1, "cpu")
        total_reward = 0.
        print("Testing policy...")
        # Inference only: without no_grad the recurrent state would chain an
        # ever-growing autograd graph across the whole episode.
        with torch.no_grad():
            while not done:
                # Choose next action
                state = torch.tensor(observation, dtype=torch.float32)
                dist = actor(state.reshape([1, 1, -1]), done_mask)
                action = dist.sample().squeeze(0)
                # Apply action
                action_np = action.cpu().numpy()
                observation, reward, done, info = env.step(action_np)
                step_idx += 1
                total_reward += reward
        print(f"Steps to done: {step_idx}, Total reward: {total_reward}")
        return env.file_infix
    finally:
        # Close explicitly so the monitor releases its resources (and can flush
        # its recordings) before the caller tries to read the video file.
        env.close()

class MaskVelocityWrapper(gym.ObservationWrapper):
    """
    Observation wrapper that multiplies selected observation entries by zero.

    Per the original author the zeroed entries are the velocity terms, which
    makes the MDP partially observable. The mask is chosen from the global
    `ENV` name, not from the wrapped environment itself.
    """
    def __init__(self, env):
        super().__init__(env)
        # Both LunarLander variants share the same mask.
        lunar_lander_mask = [1., 1., 0., 0., 1., 0., 1., 1.]
        masks_by_env = (
            ("CartPole-v1", [1., 0., 1., 0.]),
            ("Pendulum-v0", [1., 1., 0.]),
            ("LunarLander-v2", lunar_lander_mask),
            ("LunarLanderContinuous-v2", lunar_lander_mask),
        )
        for env_name, mask_values in masks_by_env:
            if ENV == env_name:
                self.mask = np.array(mask_values)
                break
        else:
            # No mask is defined for any other environment.
            raise NotImplementedError

    def observation(self, observation):
        """Return `observation` with the masked entries set to zero."""
        masked = observation * self.mask
        return masked

In [10]:
# Name of the training run to inspect; it selects the checkpoint sub-folder.
EXPERIMENT_NAME = "experiment-003"
# load_checkpoint() appends "<iteration>/" to this path, so the trailing slash is required.
BASE_CHECKPOINT_PATH = f"{WORKSPACE_PATH}/checkpoints/{EXPERIMENT_NAME}/"

In [17]:
# Replace the number with the checkpoint iteration you want to evaluate/visualise.
# Only `actor` is used by the cells below; the other returned objects are kept for completeness.
actor, critic, actor_optimizer, critic_optimizer, iteration, stop_conditions = load_from_checkpoint(490)


In [18]:
# Roll out one recorded episode; the returned infix identifies the video file written by the monitor.
file_infix = visualise_policy(actor=actor)

Testing policy...
Steps to done: 203, Total reward: 264.0322568587684


In [19]:
# Embed the recorded episode inline as a base64 data URL.
video_path = f"{WORKSPACE_PATH}/videos/openaigym.video.{file_infix}.video000000.mp4"
# Use a context manager so the file handle is closed as soon as the bytes are read.
with open(video_path, 'rb') as video_file:
    mp4 = video_file.read()
data_url = "data:video/mp4;base64," + b64encode(mp4).decode()
HTML("""
<video width=400 controls>
      <source src="%s" type="video/mp4">
</video>
""" % data_url)