Enable CUDA

When running on Google Colab, enable the GPU hardware accelerator via Runtime -> Change runtime type -> Hardware accelerator -> GPU.



In [2]:
import torch
torch.cuda.is_available()

True

In [3]:
# install procgen dependencies
!pip install procgen



Define the training parameters.



In [4]:
# Total number of environment steps to train for, in millions.
million_steps = 10
# Number of procgen levels handed to the training environment.
num_levels = 10000
# Name of the procgen game to train on.
game = 'coinrun'
# Model: 1 - impala, 2 - leaky impala, 3 - 5 blocks impala
model = 1
# Data augmentation strategy, 0=identity, 1=crop, 2=translate, 3=cutout, 4=colormix, 5=random sequences of all
training_aug = 0
# Size of the feature vector produced by the encoder.
num_features = 256
# Validation augmentation strategy (0-4). This strategy is also excluded from the random draws used when training_aug is 5.
validation_aug = 0


Derive the run configuration from the parameters defined above.

In [5]:
import sys
from random import randrange


# Total number of environment steps; kept as a float so the printed value is unchanged.
total_steps = (int(million_steps) * 1e6)
print("Total steps: " + str(total_steps))
# lowering envs. in colab. Runs out of memory with 64.
num_envs = 16

print("Num levels: " + str(num_levels))
num_levels = int(num_levels)

# Evaluation cadence (in environment steps) and PPO hyper-parameters.
eval_frequency_min = 0.33e6
eval_frequency_max = 2e6
num_steps = 256
num_epochs = 3
batch_size = 1024  #Use lower for low VRAM
eps = .2
grad_eps = .5
value_coef = .5
entropy_coef = .01
gamma = 0.99

print("Game: " + game)
env_name = game

print("Model: " + str(model))
model = int(model)

print("Augmentation mode: " + str(training_aug))
augmentationMode = int(training_aug)

print("nr_features: " + str(num_features))
nr_features = int(num_features)

print("Augmentation mode validation: " + str(validation_aug))
# A validation augmentation is optional: when it is None, evaluation falls back
# to identity (0) and no strategy is held out of the random training rounds.
useHoldoutAugmentation = validation_aug is not None
augmentationModeValidation = int(validation_aug) if useHoldoutAugmentation else 0

# Modes above 4 mean "pick a random augmentation per environment".
randomAugmentation = augmentationMode > 4

Total steps: 10000000.0
Num levels: 10000
Game: coinrun
Model: 1
Augmentation mode: 0
nr_features: 256
Augmentation mode validation: 0


Load different IMPALA models


In [6]:
import torch.nn as nn


class Flatten(nn.Module):
    """Collapse every dimension after the first (batch) dimension into one."""

    def forward(self, x):
        batch = x.shape[0]
        return x.view(batch, -1)


def xavier_uniform_initializer(module, gain=1.0):
    """Xavier-uniform initialise the weights of linear and 2-D convolution layers.

    Intended to be passed to ``nn.Module.apply``. Modules of any other type are
    returned untouched.

    Args:
        module: The module to initialise.
        gain: Scaling factor forwarded to ``nn.init.xavier_uniform_``.

    Returns:
        The same ``module`` instance.
    """
    if isinstance(module, (nn.Linear, nn.Conv2d)):
        nn.init.xavier_uniform_(module.weight.data, gain)
        # Layers created with bias=False expose bias as None.
        if module.bias is not None:
            nn.init.constant_(module.bias.data, 0)
    return module


class ResidualBlock(nn.Module):
    """Residual unit built from three shape-preserving 3x3 convolutions.

    The input is always passed through a LeakyReLU first; ``a_fn`` is applied
    between the convolutions, and the block input is added to the output of the
    last convolution.
    """

    def __init__(self,
                 in_channels,
                 a_fn = nn.ReLU()):
        super().__init__()
        # All three convolutions keep both the channel count and the spatial size.
        conv_kwargs = dict(in_channels=in_channels, out_channels=in_channels, kernel_size=3, stride=1, padding=1)
        self.conv1 = nn.Conv2d(**conv_kwargs)
        self.conv2 = nn.Conv2d(**conv_kwargs)
        self.conv3 = nn.Conv2d(**conv_kwargs)
        self.activation = a_fn

    def forward(self, x):
        hidden = self.conv1(nn.functional.leaky_relu(x))
        hidden = self.conv2(self.activation(hidden))
        hidden = self.conv3(self.activation(hidden))
        return hidden + x


class ImpalaBlock(nn.Module):
    """3x3 convolution, stride-2 max pooling, then two residual blocks.

    ``a_fn`` is forwarded to both residual blocks as their activation.
    """

    def __init__(self, in_channels, out_channels, a_fn = nn.ReLU()):
        super().__init__()
        self.conv = nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=3, stride=1, padding=1)
        self.res1 = ResidualBlock(out_channels, a_fn=a_fn)
        self.res2 = ResidualBlock(out_channels, a_fn=a_fn)

    def forward(self, x):
        pooled = nn.functional.max_pool2d(self.conv(x), kernel_size=3, stride=2, padding=1)
        return self.res2(self.res1(pooled))


class ImpalaModel(nn.Module):
    """Three ImpalaBlocks (16, 32, 64 channels) followed by a linear layer.

    The linear layer expects 64 * 8 * 8 input features. ReLU is applied before
    flattening and again after the linear layer. Extra keyword arguments are
    accepted and ignored.
    """

    def __init__(self,
                 in_channels, out_channels=256,
                 **kwargs):
        super().__init__()
        widths = (16, 32, 64)
        self.block1 = ImpalaBlock(in_channels=in_channels, out_channels=widths[0])
        self.block2 = ImpalaBlock(in_channels=widths[0], out_channels=widths[1])
        self.block3 = ImpalaBlock(in_channels=widths[1], out_channels=widths[2])
        self.fc = nn.Linear(in_features=widths[2] * 8 * 8, out_features=out_channels)

        self.output_dim = out_channels
        self.apply(xavier_uniform_initializer)

    def forward(self, x):
        for block in (self.block1, self.block2, self.block3):
            x = block(x)
        features = nn.functional.relu(x)
        # Flatten everything after the batch dimension.
        features = features.view(features.size(0), -1)
        return nn.functional.relu(self.fc(features))


In [7]:

class LeakyImpalaModel(nn.Module):
    """ImpalaModel variant that uses LeakyReLU for every activation.

    Three ImpalaBlocks (16, 32, 64 channels) with LeakyReLU activations feed a
    linear layer expecting 64 * 8 * 8 input features. Extra keyword arguments
    are accepted and ignored.
    """

    def __init__(self,
                 in_channels, out_channels = 256,
                 **kwargs):
        super().__init__()
        widths = (16, 32, 64)
        self.block1 = ImpalaBlock(in_channels=in_channels, out_channels=widths[0], a_fn=nn.LeakyReLU())
        self.block2 = ImpalaBlock(in_channels=widths[0], out_channels=widths[1], a_fn=nn.LeakyReLU())
        self.block3 = ImpalaBlock(in_channels=widths[1], out_channels=widths[2], a_fn=nn.LeakyReLU())
        self.fc = nn.Linear(in_features=widths[2] * 8 * 8, out_features=out_channels)

        self.output_dim = out_channels
        self.apply(xavier_uniform_initializer)

    def forward(self, x):
        for block in (self.block1, self.block2, self.block3):
            x = block(x)
        features = nn.functional.leaky_relu(x)
        # Flatten everything after the batch dimension.
        features = features.view(features.size(0), -1)
        return nn.functional.leaky_relu(self.fc(features))


In [8]:
class FiveBlocksImpala(nn.Module):
    """Deeper encoder with five ImpalaBlocks (8, 16, 32, 64, 128 channels).

    The final linear layer expects 128 * 4 input features. ReLU is applied
    before flattening and after the linear layer. Extra keyword arguments are
    accepted and ignored.
    """

    def __init__(self,
                 in_channels, out_channels = 256,
                 **kwargs):
        super().__init__()
        widths = (8, 16, 32, 64, 128)
        self.block1 = ImpalaBlock(in_channels=in_channels, out_channels=widths[0])
        self.block2 = ImpalaBlock(in_channels=widths[0], out_channels=widths[1])
        self.block3 = ImpalaBlock(in_channels=widths[1], out_channels=widths[2])
        self.block4 = ImpalaBlock(in_channels=widths[2], out_channels=widths[3])
        self.block5 = ImpalaBlock(in_channels=widths[3], out_channels=widths[4])
        self.fc = nn.Linear(in_features=widths[4] * 4, out_features=out_channels)

        self.output_dim = out_channels
        self.apply(xavier_uniform_initializer)

    def forward(self, x):
        for block in (self.block1, self.block2, self.block3, self.block4, self.block5):
            x = block(x)
        features = nn.functional.relu(x)
        # Flatten everything after the batch dimension.
        features = features.view(features.size(0), -1)
        return nn.functional.relu(self.fc(features))

Load the utilities: environment wrappers, rollout storage and initializers.

In [9]:
import contextlib
import os
from abc import ABC, abstractmethod
import numpy as np
import gym
import random
from gym import spaces
import time
from collections import deque
import os
import torch
import torch.nn as nn
from torch.utils.data.sampler import BatchSampler, SubsetRandomSampler
from procgen import ProcgenEnv
from collections import deque

"""
Utility functions for the deep RL projects that I supervise in 02456 Deep Learning @ DTU.
"""


def make_env(
	n_envs=32,
	env_name='starpilot',
	start_level=0,
	num_levels=100,
	use_backgrounds=True,
	normalize_obs=False,
	normalize_reward=True,
	seed=0,
	gamma=0.99
	):
	"""Build the wrapped, vectorized procgen environment used in the experiments.

	Seeds the global RNGs with ``seed`` and sets the gym log level to 40 before
	creating the environment. The procgen environment always uses the 'easy'
	distribution mode, and themes are restricted whenever backgrounds are off.
	The wrappers are applied in this order: VecExtractDictObs("rgb"),
	VecNormalize, TransposeFrame, ScaledFloatFrame, TensorEnv.
	"""
	set_global_seeds(seed)
	set_global_log_levels(40)
	procgen_kwargs = dict(
		num_envs=n_envs,
		env_name=env_name,
		start_level=start_level,
		num_levels=num_levels,
		distribution_mode='easy',
		use_backgrounds=use_backgrounds,
		restrict_themes=not use_backgrounds,
		render_mode='rgb_array',
		rand_seed=seed,
	)
	venv = ProcgenEnv(**procgen_kwargs)
	venv = VecExtractDictObs(venv, "rgb")
	venv = VecNormalize(venv, ob=normalize_obs, ret=normalize_reward, gamma=gamma)
	# The remaining wrappers take only the wrapped environment.
	for wrapper in (TransposeFrame, ScaledFloatFrame, TensorEnv):
		venv = wrapper(venv)
	return venv


class Storage():
	"""Rollout buffer holding ``num_steps`` transitions for ``num_envs`` environments.

	Observations and values have one extra slot (index ``num_steps``) for the
	bootstrap observation/value written by ``store_last``.
	"""

	def __init__(self, obs_shape, num_steps, num_envs, gamma=0.99, lmbda=0.95, normalize_advantage=True):
		self.obs_shape = obs_shape
		self.num_steps = num_steps
		self.num_envs = num_envs
		self.gamma = gamma
		self.lmbda = lmbda
		self.normalize_advantage = normalize_advantage
		self.reset()

	def reset(self):
		"""Allocate zeroed buffers and rewind the write position."""
		self.obs = torch.zeros(self.num_steps+1, self.num_envs, *self.obs_shape)
		self.action = torch.zeros(self.num_steps, self.num_envs)
		self.reward = torch.zeros(self.num_steps, self.num_envs)
		self.done = torch.zeros(self.num_steps, self.num_envs)
		self.log_prob = torch.zeros(self.num_steps, self.num_envs)
		self.value = torch.zeros(self.num_steps+1, self.num_envs)
		self.returns = torch.zeros(self.num_steps, self.num_envs)
		self.advantage = torch.zeros(self.num_steps, self.num_envs)
		# Bounded deque: older info entries are dropped once num_steps is exceeded.
		self.info = deque(maxlen=self.num_steps)
		self.step = 0

	def store(self, obs, action, reward, done, info, log_prob, value):
		"""Write one transition at the current position and advance it (wrapping around).

		``reward`` and ``done`` are numpy arrays; the remaining array-like
		arguments are tensors.
		"""
		self.obs[self.step] = obs.clone()
		self.action[self.step] = action.clone()
		self.reward[self.step] = torch.from_numpy(reward.copy())
		self.done[self.step] = torch.from_numpy(done.copy())
		self.info.append(info)
		self.log_prob[self.step] = log_prob.clone()
		self.value[self.step] = value.clone()
		self.step = (self.step + 1) % self.num_steps

	def store_last(self, obs, value):
		"""Store the observation and value that follow the final transition."""
		self.obs[-1] = obs.clone()
		self.value[-1] = value.clone()

	def compute_return_advantage(self):
		"""Fill ``advantage`` and ``returns`` by a backward recursion over the rollout.

		Uses ``gamma`` and ``lmbda`` as discount factors and masks the bootstrap
		terms with ``1 - done``. Returns are the un-normalized advantages plus
		the stored values; the advantages are standardised afterwards when
		``normalize_advantage`` is set.
		"""
		advantage = 0
		for i in reversed(range(self.num_steps)):
			delta = (self.reward[i] + self.gamma * self.value[i+1] * (1 - self.done[i])) - self.value[i]
			advantage = self.gamma * self.lmbda * advantage * (1 - self.done[i]) + delta
			self.advantage[i] = advantage

		self.returns = self.advantage + self.value[:-1]
		if self.normalize_advantage:
			self.advantage = (self.advantage - self.advantage.mean()) / (self.advantage.std() + 1e-9)

	def get_generator(self, batch_size=1024, device='cuda'):
		"""Yield shuffled mini-batches of the stored rollout.

		Args:
			batch_size: Number of transitions per batch; a trailing incomplete
				batch is dropped.
			device: Device each batch is moved to. Defaults to 'cuda', which
				matches the previous hard-coded behaviour.

		Yields:
			Tuples ``(obs, action, log_prob, value, returns, advantage)``.
		"""
		# Flatten (step, env) into a single sample axis once, outside the batch loop.
		flat_obs = self.obs[:-1].reshape(-1, *self.obs_shape)
		flat_action = self.action.reshape(-1)
		flat_log_prob = self.log_prob.reshape(-1)
		flat_value = self.value[:-1].reshape(-1)
		flat_returns = self.returns.reshape(-1)
		flat_advantage = self.advantage.reshape(-1)

		iterator = BatchSampler(SubsetRandomSampler(range(self.num_steps*self.num_envs)), batch_size, drop_last=True)
		for indices in iterator:
			obs = flat_obs[indices].to(device)
			action = flat_action[indices].to(device)
			log_prob = flat_log_prob[indices].to(device)
			value = flat_value[indices].to(device)
			returns = flat_returns[indices].to(device)
			advantage = flat_advantage[indices].to(device)
			yield obs, action, log_prob, value, returns, advantage

	def get_reward(self, normalized_reward=True):
		"""Return the reward summed over steps after averaging over environments.

		When ``normalized_reward`` is true the rewards are read from the
		'reward' entry of the stored info dicts; otherwise the stored reward
		tensor is used.
		"""
		if normalized_reward:
			reward = []
			for step in range(self.num_steps):
				info = self.info[step]
				reward.append([d['reward'] for d in info])
			reward = torch.Tensor(reward)
		else:
			reward = self.reward
		
		return reward.mean(1).sum(0)


def orthogonal_init(module, gain=nn.init.calculate_gain('relu')):
	"""Orthogonal weight initialization: https://arxiv.org/abs/1312.6120

	Applies to ``nn.Linear`` and ``nn.Conv2d`` modules only: the weight is
	initialised orthogonally with ``gain`` and the bias, if the layer has one,
	is set to zero. Returns the same ``module`` instance.
	"""
	if isinstance(module, (nn.Linear, nn.Conv2d)):
		nn.init.orthogonal_(module.weight.data, gain)
		# Layers created with bias=False expose bias as None.
		if module.bias is not None:
			nn.init.constant_(module.bias.data, 0)
	return module


"""
Helper functions that set global seeds and gym logging preferences
"""

def set_global_seeds(seed):
	"""Make cuDNN deterministic and seed torch (CPU and CUDA), numpy and ``random``."""
	cudnn = torch.backends.cudnn
	cudnn.deterministic = True
	cudnn.benchmark = False
	seeders = (torch.manual_seed, torch.cuda.manual_seed_all, np.random.seed, random.seed)
	for seed_fn in seeders:
		seed_fn(seed)


def set_global_log_levels(level):
	"""Set the level of gym's logger to ``level``."""
	gym_logger = gym.logger
	gym_logger.set_level(level)


"""
Copy-pasted from OpenAI to obviate dependency on Baselines. Required for vectorized environments.
You will never have to look beyond this line.
"""

class AlreadySteppingError(Exception):
	"""
	Signals that step_async() was called while an earlier
	asynchronous step was still running.
	"""

	def __init__(self):
		super().__init__('already running an async step')


class NotSteppingError(Exception):
	"""
	Signals that step_wait() was called although no
	asynchronous step was running.
	"""

	def __init__(self):
		super().__init__('not running an async step')


class VecEnv(ABC):
	"""
	An abstract asynchronous, vectorized environment.
	Used to batch data from multiple copies of an environment, so that
	each observation becomes an batch of observations, and expected action is a batch of actions to
	be applied per-environment.
	"""
	closed = False
	viewer = None

	metadata = {
		'render.modes': ['human', 'rgb_array']
	}

	def __init__(self, num_envs, observation_space, action_space):
		self.num_envs = num_envs
		self.observation_space = observation_space
		self.action_space = action_space

	@abstractmethod
	def reset(self):
		"""
		Reset all the environments and return an array of
		observations, or a dict of observation arrays.

		If step_async is still doing work, that work will
		be cancelled and step_wait() should not be called
		until step_async() is invoked again.
		"""
		pass

	@abstractmethod
	def step_async(self, actions):
		"""
		Tell all the environments to start taking a step
		with the given actions.
		Call step_wait() to get the results of the step.

		You should not call this if a step_async run is
		already pending.
		"""
		pass

	@abstractmethod
	def step_wait(self):
		"""
		Wait for the step taken with step_async().

		Returns (obs, rews, dones, infos):
		 - obs: an array of observations, or a dict of
				arrays of observations.
		 - rews: an array of rewards
		 - dones: an array of "episode done" booleans
		 - infos: a sequence of info objects
		"""
		pass

	def close_extras(self):
		"""
		Clean up the  extra resources, beyond what's in this base class.
		Only runs when not self.closed.
		"""
		pass

	def close(self):
		"""Close the viewer (if any) and the extra resources; does nothing once closed."""
		if self.closed:
			return
		if self.viewer is not None:
			self.viewer.close()
		self.close_extras()
		self.closed = True

	def step(self, actions):
		"""
		Step the environments synchronously.

		This is available for backwards compatibility.
		"""
		self.step_async(actions)
		return self.step_wait()

	@staticmethod
	def _tile_images(imgs):
		"""
		Tile N images of identical shape (h, w, c) into a single image.

		The grid has ceil(sqrt(N)) rows; unused grid cells are filled with
		zeros. Requires at least one image.
		"""
		img_nhwc = np.asarray(imgs)
		n_images, height, width, channels = img_nhwc.shape
		rows = int(np.ceil(np.sqrt(n_images)))
		cols = int(np.ceil(float(n_images) / rows))
		padding = np.zeros((rows * cols - n_images, height, width, channels), dtype=img_nhwc.dtype)
		grid = np.concatenate([img_nhwc, padding], axis=0)
		grid = grid.reshape(rows, cols, height, width, channels)
		# Bring each grid row next to its pixel rows before merging the axes.
		grid = grid.transpose(0, 2, 1, 3, 4)
		return grid.reshape(rows * height, cols * width, channels)

	def render(self, mode='human'):
		"""
		Render all environments as one tiled image.

		In 'human' mode the image is shown in the viewer and the viewer's
		open state is returned; in 'rgb_array' mode the tiled image itself is
		returned. Any other mode raises NotImplementedError.
		"""
		imgs = self.get_images()
		bigimg = self._tile_images(imgs)
		if mode == 'human':
			self.get_viewer().imshow(bigimg)
			return self.get_viewer().isopen
		elif mode == 'rgb_array':
			return bigimg
		else:
			raise NotImplementedError

	def get_images(self):
		"""
		Return RGB images from each environment
		"""
		raise NotImplementedError

	@property
	def unwrapped(self):
		"""The innermost environment, reached by following ``venv`` through wrappers."""
		if isinstance(self, VecEnvWrapper):
			return self.venv.unwrapped
		else:
			return self

	def get_viewer(self):
		"""Return the image viewer, creating it on first use."""
		if self.viewer is None:
			from gym.envs.classic_control import rendering
			self.viewer = rendering.SimpleImageViewer()
		return self.viewer

	
class VecEnvWrapper(VecEnv):
	"""
	Base class for wrappers that act on a whole batch of
	environments at once. Stepping, closing, rendering and unknown
	public attributes are delegated to the wrapped ``venv``.
	"""

	def __init__(self, venv, observation_space=None, action_space=None):
		self.venv = venv
		wrapped_num_envs = venv.num_envs
		# Fall back to the wrapped environment's spaces when none are given.
		obs_space = observation_space or venv.observation_space
		act_space = action_space or venv.action_space
		super().__init__(num_envs=wrapped_num_envs,
						observation_space=obs_space,
						action_space=act_space)

	def step_async(self, actions):
		self.venv.step_async(actions)

	@abstractmethod
	def reset(self):
		pass

	@abstractmethod
	def step_wait(self):
		pass

	def close(self):
		return self.venv.close()

	def render(self, mode='human'):
		return self.venv.render(mode=mode)

	def get_images(self):
		return self.venv.get_images()

	def __getattr__(self, name):
		# Only reached when normal lookup fails; private names are never forwarded.
		if not name.startswith('_'):
			return getattr(self.venv, name)
		raise AttributeError("attempted to get missing private attribute '{}'".format(name))

	
class VecEnvObservationWrapper(VecEnvWrapper):
	"""Wrapper that passes every observation through ``process`` and leaves the rest unchanged."""

	@abstractmethod
	def process(self, obs):
		pass

	def reset(self):
		return self.process(self.venv.reset())

	def step_wait(self):
		raw_obs, rews, dones, infos = self.venv.step_wait()
		processed = self.process(raw_obs)
		return processed, rews, dones, infos

	
class CloudpickleWrapper(object):
	"""
	Holds a value and serializes it with cloudpickle instead of the
	pickle that multiprocessing would otherwise use.
	"""

	def __init__(self, x):
		self.x = x

	def __getstate__(self):
		from cloudpickle import dumps
		return dumps(self.x)

	def __setstate__(self, ob):
		# The payload is restored with the standard pickle module.
		from pickle import loads
		self.x = loads(ob)

		
@contextlib.contextmanager
def clear_mpi_env_vars():
	"""
	Temporarily remove the MPI-related environment variables.

	from mpi4py import MPI will call MPI_Init by default. If a child process inherits MPI environment variables, MPI will treat it as an MPI process just like the parent and may do bad things such as hang.
	This context manager removes every variable whose name starts with 'OMPI_' or 'PMI_' for the duration of the block
	(for example while starting multiprocessing Processes) and puts them back afterwards.
	"""
	mpi_prefixes = ('OMPI_', 'PMI_')
	removed_environment = {
		key: value
		for key, value in list(os.environ.items())
		if key.startswith(mpi_prefixes)
	}
	for key in removed_environment:
		del os.environ[key]
	try:
		yield
	finally:
		os.environ.update(removed_environment)

		
class VecFrameStack(VecEnvWrapper):
	"""Wrapper that keeps the ``nstack`` most recent observations stacked along the last axis."""

	def __init__(self, venv, nstack):
		self.venv = venv
		self.nstack = nstack
		wrapped_space = venv.observation_space
		# Repeat the bounds of the wrapped space once per stacked frame.
		low = np.repeat(wrapped_space.low, self.nstack, axis=-1)
		high = np.repeat(wrapped_space.high, self.nstack, axis=-1)
		self.stackedobs = np.zeros((venv.num_envs,) + low.shape, low.dtype)
		observation_space = spaces.Box(low=low, high=high, dtype=venv.observation_space.dtype)
		VecEnvWrapper.__init__(self, venv, observation_space=observation_space)

	def step_wait(self):
		obs, rews, news, infos = self.venv.step_wait()
		self.stackedobs = np.roll(self.stackedobs, shift=-1, axis=-1)
		# Clear the history of every environment flagged in ``news``.
		for env_index, is_new in enumerate(news):
			if is_new:
				self.stackedobs[env_index] = 0
		newest = obs.shape[-1]
		self.stackedobs[..., -newest:] = obs
		return self.stackedobs, rews, news, infos

	def reset(self):
		obs = self.venv.reset()
		self.stackedobs[...] = 0
		newest = obs.shape[-1]
		self.stackedobs[..., -newest:] = obs
		return self.stackedobs
	
class VecExtractDictObs(VecEnvObservationWrapper):
	"""Observation wrapper that keeps a single entry (``key``) of dict observations."""

	def __init__(self, venv, key):
		self.key = key
		extracted_space = venv.observation_space.spaces[self.key]
		super().__init__(venv=venv, observation_space=extracted_space)

	def process(self, obs):
		selected = obs[self.key]
		return selected
	
	
class RunningMeanStd(object):
	"""Running mean, variance and sample count, updated batch by batch.

	See https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm
	"""

	def __init__(self, epsilon=1e-4, shape=()):
		self.mean = np.zeros(shape, 'float64')
		self.var = np.ones(shape, 'float64')
		# The count starts at epsilon rather than zero.
		self.count = epsilon

	def update(self, x):
		"""Fold a batch ``x`` (samples along axis 0) into the running statistics."""
		self.update_from_moments(np.mean(x, axis=0), np.var(x, axis=0), x.shape[0])

	def update_from_moments(self, batch_mean, batch_var, batch_count):
		"""Fold precomputed batch moments into the running statistics."""
		merged = update_mean_var_count_from_moments(
			self.mean, self.var, self.count, batch_mean, batch_var, batch_count)
		self.mean, self.var, self.count = merged

		
def update_mean_var_count_from_moments(mean, var, count, batch_mean, batch_var, batch_count):
	"""Merge running moments with the moments of a new batch.

	Returns the combined ``(mean, variance, count)``.
	"""
	tot_count = count + batch_count
	delta = batch_mean - mean

	# Sum of squared deviations of both parts plus the correction for the mean shift.
	M2 = var * count + batch_var * batch_count + np.square(delta) * count * batch_count / tot_count

	return mean + delta * batch_count / tot_count, M2 / tot_count, tot_count


class VecNormalize(VecEnvWrapper):
	"""
	Vectorized wrapper that optionally normalizes observations and
	scales rewards by the running standard deviation of the discounted return.
	"""

	def __init__(self, venv, ob=True, ret=True, clipob=10., cliprew=10., gamma=0.99, epsilon=1e-8):
		VecEnvWrapper.__init__(self, venv)

		# Either statistic can be switched off; the matching step is then skipped.
		self.ob_rms = RunningMeanStd(shape=self.observation_space.shape) if ob else None
		self.ret_rms = RunningMeanStd(shape=()) if ret else None
		
		self.clipob = clipob
		self.cliprew = cliprew
		self.ret = np.zeros(self.num_envs)
		self.gamma = gamma
		self.epsilon = epsilon

	def step_wait(self):
		obs, rews, news, infos = self.venv.step_wait()
		# Keep the reward as received from the wrapped env in each info dict.
		for env_index, info in enumerate(infos):
			info['reward'] = rews[env_index]
		self.ret = self.ret * self.gamma + rews
		obs = self._obfilt(obs)
		if self.ret_rms:
			self.ret_rms.update(self.ret)
			return_std = np.sqrt(self.ret_rms.var + self.epsilon)
			rews = np.clip(rews / return_std, -self.cliprew, self.cliprew)
		# Restart the discounted return of every environment flagged in ``news``.
		self.ret[news] = 0.
		return obs, rews, news, infos

	def _obfilt(self, obs):
		if not self.ob_rms:
			return obs
		self.ob_rms.update(obs)
		centered = obs - self.ob_rms.mean
		obs = np.clip(centered / np.sqrt(self.ob_rms.var + self.epsilon), -self.clipob, self.clipob)
		return obs

	def reset(self):
		self.ret = np.zeros(self.num_envs)
		return self._obfilt(self.venv.reset())


class TransposeFrame(VecEnvWrapper):
	"""Moves the last observation axis to position 1 (axis order 0, 3, 1, 2)."""

	def __init__(self, env):
		super().__init__(venv=env)
		shape = self.observation_space.shape
		transposed_shape = (shape[2], shape[0], shape[1])
		self.observation_space = gym.spaces.Box(low=0, high=255, shape=transposed_shape, dtype=np.float32)

	def step_wait(self):
		frames, reward, done, info = self.venv.step_wait()
		return frames.transpose(0,3,1,2), reward, done, info

	def reset(self):
		return self.venv.reset().transpose(0,3,1,2)


class ScaledFloatFrame(VecEnvWrapper):
	"""Divides every observation by 255 and declares a [0, 1] float32 observation space."""

	def __init__(self, env):
		super().__init__(venv=env)
		unchanged_shape = self.observation_space.shape
		self.observation_space = gym.spaces.Box(low=0, high=1, shape=unchanged_shape, dtype=np.float32)

	def step_wait(self):
		frames, reward, done, info = self.venv.step_wait()
		return frames/255.0, reward, done, info

	def reset(self):
		return self.venv.reset()/255.0


class TensorEnv(VecEnvWrapper):
	"""Takes tensor actions (converted to numpy) and returns observations as torch tensors."""

	def __init__(self, env):
		super().__init__(venv=env)

	def step_async(self, actions):
		# Tensor actions are detached and moved to the CPU before being passed on.
		as_numpy = actions.detach().cpu().numpy() if isinstance(actions, torch.Tensor) else actions
		self.venv.step_async(as_numpy)

	def step_wait(self):
		frames, reward, done, info = self.venv.step_wait()
		return torch.Tensor(frames), reward, done, info

	def reset(self):
		return torch.Tensor(self.venv.reset())


Initialize procgen environment

In [10]:
# Training environment: num_envs parallel copies of the configured game (start_level keeps its default of 0).
env = make_env(num_envs, num_levels=num_levels, gamma=gamma, env_name=env_name)

Create PPO policy.

In [11]:
class Policy(nn.Module):
    """Actor-critic network: a shared encoder with a policy head and a value head.

    Args:
        encoder: Module mapping observations to feature vectors.
        feature_dim: Size of the encoder's feature vector.
        num_actions: Number of discrete actions.
    """

    def __init__(self, encoder, feature_dim, num_actions):
        super().__init__()
        self.encoder = encoder
        self.policy = orthogonal_init(nn.Linear(feature_dim, num_actions), gain=.01)
        self.value = orthogonal_init(nn.Linear(feature_dim, 1), gain=1.)

    def _select_action(self, x, greedy):
        """Run the network without gradients and pick an action.

        The input is moved to whichever device the parameters live on, so the
        same code runs on CPU and GPU. Returns ``(action, log_prob, value)``
        as CPU tensors.
        """
        device = next(self.parameters()).device
        with torch.no_grad():
            x = x.to(device).contiguous()
            dist, value, maxAction = self.forward(x)
            action = maxAction if greedy else dist.sample()
            log_prob = dist.log_prob(action)

        return action.cpu(), log_prob.cpu(), value.cpu()

    def act(self, x):
        """Sample an action from the policy distribution for each observation."""
        return self._select_action(x, greedy=False)

    def actMax(self, x):
        """Pick the highest-logit action for each observation."""
        return self._select_action(x, greedy=True)

    def forward(self, x):
        """Return ``(dist, value, maxAction)`` for a batch of observations.

        ``dist`` is a categorical distribution over the policy logits,
        ``value`` has the trailing unit dimension squeezed away, and
        ``maxAction`` is the argmax of the logits.
        """
        x = self.encoder(x)
        logits = self.policy(x)
        value = self.value(x).squeeze(1)
        maxAction = logits.argmax(dim=1)
        dist = torch.distributions.Categorical(logits=logits)

        return dist, value, maxAction

Initialize the model

In [12]:
# Number of input channels handed to the encoder.
in_channels = 3

# Pick the encoder for the configured model id; fail early on an unknown id
# instead of leaving ``encoder`` undefined.
if model == 1:
    encoder = ImpalaModel(in_channels, nr_features)
elif model == 2:
    encoder = LeakyImpalaModel(in_channels, nr_features)
elif model == 3:
    encoder = FiveBlocksImpala(in_channels, nr_features)
else:
    raise ValueError("Unknown model: " + str(model) + " (expected 1, 2 or 3)")

policy = Policy(encoder, nr_features, env.action_space.n)
policy.cuda()

Policy(
  (encoder): ImpalaModel(
    (block1): ImpalaBlock(
      (conv): Conv2d(3, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (res1): ResidualBlock(
        (conv1): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (conv2): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (conv3): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (activation): ReLU()
      )
      (res2): ResidualBlock(
        (conv1): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (conv2): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (conv3): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (activation): ReLU()
      )
    )
    (block2): ImpalaBlock(
      (conv): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (res1): ResidualBlock(
        (conv1): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
     

Define augmentation used during training and validation

In [13]:
import imageio
import torch.nn.functional as F
import warnings
warnings.filterwarnings("ignore")


def stretch(frame, orgW, orgH):
    """Resize ``frame`` towards ``orgW`` x ``orgH`` with ``F.interpolate``.

    The frame is converted to float. Scale factors are the target size divided
    by the current size of axes 1 and 2, each increased by 1e-5.
    """
    scaleW = orgW / frame.shape[1] + 1e-5
    scaleH = orgH / frame.shape[2] + 1e-5
    batch = frame.float().unsqueeze(0)
    resized = F.interpolate(batch, scale_factor=(scaleW, scaleH))
    return resized[0]

def position(frame, orgW, orgH):
    """Paste ``frame`` at a random offset onto a zero canvas of size 3 x orgW x orgH.

    Every placement that keeps the frame inside the canvas can be drawn,
    including the ones touching the far edges; a frame that already fills the
    canvas is placed at offset 0.

    Raises:
        ValueError: If the frame is larger than the canvas.
    """
    width = frame.shape[1]
    height = frame.shape[2]
    if width > orgW or height > orgH:
        raise ValueError("frame of size {}x{} does not fit into {}x{}".format(width, height, orgW, orgH))
    canvas = torch.zeros(3, orgW, orgH)
    # +1 makes the largest valid offset reachable (randrange excludes its stop).
    offsetX = randrange(orgW - width + 1)
    offsetY = randrange(orgH - height + 1)
    canvas[:, offsetX:offsetX + width, offsetY:offsetY + height] = frame
    return canvas



def identity(frame):
    """No-op augmentation: return ``frame`` unchanged."""
    return frame

def crop(frame):
    """Cut a random square out of ``frame`` and stretch it back to the original size.

    The side length is drawn from ``min(width, height) - 32`` up to
    ``min(width, height) - 1``, so the smaller side must exceed 32 for the
    crop to be non-empty. The offset range includes the last valid position,
    so crops can reach the far edges of the frame.
    """
    orgW = frame.shape[1]
    orgH = frame.shape[2]
    cropsize = randrange(32) + min(orgW, orgH) - 32
    # +1 makes the largest valid offset reachable (randrange excludes its stop).
    offsetX = randrange(orgW - cropsize + 1)
    offsetY = randrange(orgH - cropsize + 1)
    cropped = frame[:, offsetX:offsetX + cropsize, offsetY:offsetY + cropsize]
    return stretch(cropped, orgW, orgH)

def translate(frame):
    """Cut a random square out of ``frame`` and paste it at a random position.

    The side length is drawn from ``min(width, height) - 32`` up to
    ``min(width, height) - 1``. The crop offset range includes the last valid
    position, so the cut-out can reach the far edges of the frame. Placement
    on a canvas of the original size is delegated to ``position``.
    """
    orgW = frame.shape[1]
    orgH = frame.shape[2]
    cropsize = randrange(32) + min(orgW, orgH) - 32
    # +1 makes the largest valid offset reachable (randrange excludes its stop).
    offsetX = randrange(orgW - cropsize + 1)
    offsetY = randrange(orgH - cropsize + 1)
    cropped = frame[:, offsetX:offsetX + cropsize, offsetY:offsetY + cropsize]
    return position(cropped, orgW, orgH)

def cutout(frame):
    """Zero out a random rectangle of ``frame`` in place and return the frame.

    Both sides of the rectangle are drawn from 4 to 23. The offset range
    includes the last valid position, so the rectangle can touch the far
    edges and a rectangle as large as the frame no longer raises.
    """
    orgW = frame.shape[1]
    orgH = frame.shape[2]
    cutoutWidth = randrange(20) + 4
    cutoutHeight = randrange(20) + 4
    # +1 makes the largest valid offset reachable (randrange excludes its stop).
    offsetX = randrange(orgW - cutoutWidth + 1)
    offsetY = randrange(orgH - cutoutHeight + 1)
    # A scalar fill works for any channel count and on the frame's own device.
    frame[:, offsetX:offsetX + cutoutWidth, offsetY:offsetY + cutoutHeight] = 0
    return frame

def colormix(frame):
    """Scale channels 0, 1 and 2 of ``frame`` in place by independent random weights.

    Each weight is 0.3 plus a random multiple of 0.01 below 1.0. Returns the
    same ``frame`` object.
    """
    # Draw all three weights first, in channel order 0, 1, 2.
    weights = [0.3 + randrange(100) / 100.0 for _ in range(3)]
    for channel, weight in enumerate(weights):
        frame[channel] = frame[channel, :] * weight
    return frame

# Augmentation functions currently in use; augment() picks entry i modulo the list length.
AugmentationFuncArr = []

# When RemoveModeFromRandom is True, ModeToRemove is never drawn by setRandomAugmentationMode.
RemoveModeFromRandom = False
ModeToRemove = 0

def setHoldoutAgumentation(mode):
    """Exclude augmentation ``mode`` from the modes drawn by setRandomAugmentationMode."""
    global RemoveModeFromRandom, ModeToRemove
    RemoveModeFromRandom, ModeToRemove = True, mode

def setAugmentationMode(mode, environments):
    """Use the same augmentation for all ``environments`` slots.

    Args:
        mode: 0=identity, 1=crop, 2=translate, 3=cutout, 4=colormix.
        environments: Number of entries to put into ``AugmentationFuncArr``.

    Raises:
        ValueError: If ``mode`` is not one of the modes above. Without this
            check the list would stay empty and ``augment`` would later fail
            with a modulo-by-zero.
    """
    global AugmentationFuncArr
    augmentations = {0: identity, 1: crop, 2: translate, 3: cutout, 4: colormix}
    if mode not in augmentations:
        raise ValueError("Unknown augmentation mode: " + str(mode) + " (expected 0-4)")
    AugmentationFuncArr = [augmentations[mode]] * environments

def setRandomAugmentationMode(environments):
    """Draw an independent random augmentation for each of ``environments`` slots.

    Modes are drawn uniformly from 0-4. While ``RemoveModeFromRandom`` is set,
    a draw equal to ``ModeToRemove`` is repeated until a different mode comes up.
    """
    global AugmentationFuncArr
    global RemoveModeFromRandom
    global ModeToRemove
    AugmentationFuncArr = []
    # Index in this tuple == augmentation mode.
    by_mode = (identity, crop, translate, cutout, colormix)
    for _ in range(environments):
        mode = randrange(5)
        while RemoveModeFromRandom and mode == ModeToRemove:
            mode = randrange(5)
        AugmentationFuncArr.append(by_mode[mode])

def augment(obs):
    """Apply the configured augmentation to every frame of ``obs`` in place.

    Frame ``i`` uses entry ``i`` of ``AugmentationFuncArr``, wrapping around
    when there are more frames than entries. Returns the same ``obs`` object.
    """
    frame_count = obs.shape[0]
    for index in range(frame_count):
        transform = AugmentationFuncArr[index % len(AugmentationFuncArr)]
        obs[index] = transform(obs[index])
    return obs

def testCrop():
    env = make_env(4, num_levels=10, gamma=0.999, env_name='coinrun')
    obs = env.reset()
    for i in range(obs.shape[0]):
        frame = (obs[0,:]*255.)
        imageio.imsave("crop" + str(i) + ".png",crop(frame).T.byte())

def testTranslate():
    env = make_env(4, num_levels=10, gamma=0.999, env_name='coinrun')
    obs = env.reset()
    for i in range(obs.shape[0]):
        frame = (obs[0,:]*255.)
        imageio.imsave("translate" + str(i) + ".png",translate(frame).T.byte())

def testCutout():
    env = make_env(4, num_levels=10, gamma=0.999, env_name='coinrun')
    obs = env.reset()
    for i in range(obs.shape[0]):
        frame = (obs[0,:]*255.)
        imageio.imsave("cutout" + str(i) + ".png",cutout(frame).T.byte())

def testColorMix():
    env = make_env(4, num_levels=10, gamma=0.999, env_name='coinrun')
    obs = env.reset()
    for i in range(obs.shape[0]):
        frame = (obs[0,:]*255.)
        imageio.imsave("colorMix" + str(i) + ".png",colormix(frame).T.byte())


Initialize all the requisites for the training.



In [14]:
# Define optimizer
# these are reasonable values but probably not optimal
optimizer = torch.optim.Adam(policy.parameters(), lr=5e-4, eps=1e-5)

# Define temporary storage
# we use this to collect transitions during each iteration
# (num_steps transitions for each of the num_envs environments)
storage = Storage(
    env.observation_space.shape,
    num_steps,
    num_envs,
    gamma
)

def resetAugmentationMode():
    """Re-apply the training augmentation: random per environment, or the fixed configured mode."""
    if randomAugmentation:
        setRandomAugmentationMode(num_envs)
        return
    setAugmentationMode(augmentationMode, num_envs)

# Keep the validation augmentation out of the random training augmentations.
if useHoldoutAugmentation:
    setHoldoutAgumentation(augmentationModeValidation)


def evaluate(step, testEnv, testEnvAugmentationMode = 0):
    """Run the greedy policy for 2048 steps and return the mean summed reward.

    Args:
        step: Current training step (not used by the evaluation itself).
        testEnv: When true, evaluate on levels starting at ``num_levels``
            (100000 levels); otherwise on the first ``num_levels`` levels.
        testEnvAugmentationMode: Augmentation mode (0-4) applied to the
            observations during evaluation.

    Returns:
        The per-environment reward sums, averaged over environments.

    The training augmentation is restored and the evaluation environment is
    closed on every exit path, so the global augmentation state never depends
    on which evaluation ran last.
    """
    setAugmentationMode(testEnvAugmentationMode, num_envs)
    # Make evaluation environment
    startlvl = 0
    numlevels = num_levels
    if testEnv:
        startlvl = num_levels
        numlevels = 100000
    eval_env = make_env(num_envs, start_level=startlvl, num_levels=numlevels, gamma=gamma, env_name=env_name)
    try:
        obs = eval_env.reset()

        total_reward = []

        # Evaluate policy
        policy.eval()
        for _ in range(2048):
            # Use policy
            action, log_prob, value = policy.actMax(augment(obs))

            # Take step in environment
            obs, reward, done, info = eval_env.step(action)
            total_reward.append(torch.Tensor(reward))
    finally:
        eval_env.close()
        resetAugmentationMode()

    # Calculate average return
    total_reward = torch.stack(total_reward).sum(0).mean(0)
    return total_reward


# Activate the training augmentation before the first rollout.
resetAugmentationMode()

In [None]:
# Run training
obs = augment(env.reset())
step = 0
lastEval = step
while step < total_steps:
    # Use policy to collect data for num_steps steps
    policy.eval()
    for _ in range(num_steps):
        # Use policy
        action, log_prob, value = policy.act(obs)

        # Take step in environment
        next_obs, reward, done, info = env.step(action)
        # Update augmentation mode if we have random augmentations
        # (re-drawn with probability 1/3 after each environment step)
        if (randomAugmentation and randrange(3) == 0):
            setRandomAugmentationMode(num_envs)

        # Store data
        storage.store(obs, action, reward, done, info, log_prob, value)

        # Update current observation
        obs = augment(next_obs)

    # Add the last observation to collected data
    _, _, value = policy.act(obs)
    storage.store_last(obs, value)

    # Compute return and advantage
    storage.compute_return_advantage()

    # Optimize policy
    policy.train()
    for epoch in range(num_epochs):

        # Iterate over batches of transitions
        generator = storage.get_generator(batch_size)
        for batch in generator:
            b_obs, b_action, b_log_prob, b_value, b_returns, b_advantage = batch

            # Get current policy outputs
            # NOTE(review): the stored observations were already augmented during
            # collection, so they are augmented a second time here - confirm intent.
            new_dist, new_value, _ = policy(augment(b_obs))
            new_log_prob = new_dist.log_prob(b_action)

            ratio = (new_log_prob - b_log_prob).exp()
            surr1 = ratio * b_advantage
            surr2 = torch.clamp(ratio, 1.0 - eps, 1.0 + eps) * b_advantage

            # Clipped policy objective
            pi_loss = - torch.min(surr1, surr2).mean()
            # Value function objective (plain squared error, not clipped)
            value_loss = (new_value - b_returns).pow(2).mean()

            # Entropy loss
            # NOTE(review): this uses p*log(p) of the taken action only, not the
            # full distribution entropy (new_dist.entropy()) - confirm intent.
            entropy_loss = torch.mean(torch.exp(new_log_prob) * new_log_prob)

            # Backpropagate losses
            loss = pi_loss + value_coef * value_loss + entropy_coef * entropy_loss
            loss.backward()

            # Clip gradients
            torch.nn.utils.clip_grad_norm_(policy.parameters(), grad_eps)

            # Update policy
            optimizer.step()
            optimizer.zero_grad()

    # Update stats
    step += num_envs * num_steps
    # Evaluate more often during the first million steps, then at the slower
    # cadence, and always once training has reached total_steps.
    if ((lastEval < 1e6 and (step - lastEval) >= eval_frequency_min) or ((step - lastEval) >= eval_frequency_max) or step >= total_steps):
        trainScore = evaluate(step, False, augmentationModeValidation)
        testScore = evaluate(step, True, augmentationModeValidation)
        print(f'Step: {step}\t({trainScore},{testScore})')
        lastEval = step

print('Completed training!')
# state_dict must be called: saving the bound method would not store the weights.
torch.save(policy.state_dict(), 'checkpoint.pt')

Generate a video of the trained model.

In [None]:
import imageio

# Make evaluation environment
# Levels start at num_levels, i.e. right after the levels given to the training environment.
eval_env = make_env(num_envs, start_level=num_levels, num_levels=num_levels, gamma=gamma, env_name=env_name)
obs = eval_env.reset()

frames = []
total_reward = []

# Evaluate policy
policy.eval()
for _ in range(2048):
    # Use policy (greedy action; observations are not augmented here)
    action, log_prob, value = policy.actMax(obs)

    # Take step in environment
    obs, reward, done, info = eval_env.step(action)

    total_reward.append(torch.Tensor(reward))

    # Render environment and store
    # NOTE(review): the rendered frame is multiplied by 255 before the byte cast;
    # this assumes render() returns values in [0, 1] - confirm.
    frame = (torch.Tensor(eval_env.render(mode='rgb_array')) * 255.).byte()
    frames.append(frame)

# Calculate average return
total_reward = torch.stack(total_reward).sum(0).mean(0)
print('Average return:', total_reward)
# Save frames as video
frames = torch.stack(frames)
imageio.mimsave('vid.mp4', frames, fps=25)
