In [2]:
import math
import random
import pdb
import copy
from typing import *

import collections as cc
import sortedcontainers as sc
import itertools as it
import functools as ft

import einops as eo
import scipy as sp
import numpy as np
import numpy.random as npr
import sklearn as skl
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
import torch as tc
import torch.nn as tcn
import torch.nn.functional as tcf
import torch.optim as tco
import torch.distributions as tcd
import einops.layers.torch as eol

# Pandas display limits for this notebook: frames longer than 40 rows are
# truncated to 20 rows, and up to 100 columns are shown.
pd.options.display.max_rows = 40
pd.options.display.min_rows = 20
pd.options.display.max_columns = 100

from IPython.display import display, HTML, clear_output

%matplotlib inline

import gym
import gym.spaces

## Global Configs

In [3]:
class Config:
	"""Hyper-parameters and runtime settings shared by the environment, model and training loop.

	Args:
		_epoch: Epoch counter to start from; it also selects the checkpoint file name.
		_debug: When true, the CPU device is used even if CUDA is available.
	"""

	def __init__(self, _epoch = 0, _debug = True):
		self.maxSteps = 20  # step limit for one evaluation rollout
		self.lamb = 0.8  # factor applied to the bootstrapped next-state value
		self.epochSize = 40960  # number of states sampled per training epoch
		self.miniBatchSize = 64  # states per optimizer step
		self.delayBatchs = 128  # mini-batches between refreshes of the frozen target model
		self.totalEpochs = 4
		self.testEpochs = 4
		self.epoch = _epoch
		self.lr = 1E-3
		self.targetRewards = 0
		self.modelPath = "./checkpoints/model"  # checkpoint path prefix, see stepCheckpoint
		self.debug = _debug
		self.device = tc.device("cpu") if _debug or not tc.cuda.is_available() else tc.device("cuda")
		self.mapName = "4x4"
		print(self)

	def __repr__(self):
		"""List every setting so that printing the config is actually informative."""
		fields_ = ", ".join("{0}={1!r}".format(key, value) for key, value in vars(self).items())
		return "Config({0})".format(fields_)

	def stepCheckpoint(self, _epoch = None):
		"""Return the checkpoint path for the current epoch.

		Args:
			_epoch: If given, the stored epoch counter is overwritten with it first.

		Returns:
			The path "<modelPath>_<epoch, two digits>.bin".
		"""
		if _epoch is not None:
			self.epoch = _epoch
		return "{path}_{epoch:02d}.bin".format(path = self.modelPath, epoch = self.epoch)


In [5]:
# Preview the 8x8 FrozenLake layout. The environment is released even if
# rendering raises.
env_ = gym.make("FrozenLake-v1", map_name = "8x8")
try:
	env_.render()
finally:
	env_.close()


[41mS[0mFFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG


In [7]:
class ReplayEnvironment:
	"""Samples single transitions from FrozenLake's transition table for arbitrary (state, action) pairs.

	No episode is rolled out: `step` looks every requested pair up in the
	environment's `P` table, so any state can be queried at any time.
	"""

	def __init__(self, config: "Config"):
		"""Create the gym environment for `config.mapName` and render it once."""
		self.name = "FrozenLake-v1"
		self.instance = gym.make(self.name, map_name = config.mapName)
		self.actionSpace = self.instance.action_space.n
		self.stateSpace = self.instance.observation_space.n
		self.config = config
		self.instance.render()

	@staticmethod
	def createOneHot(data: tc.Tensor, classes: int) -> tc.Tensor:
		"""One-hot encode `data` over `classes` categories; the result does not require gradients."""
		return tcf.one_hot(data, classes).requires_grad_(False)

	def __str__(self):
		return "ReplayEnvironment: " + self.name

	# (newState, reward, done)
	def step(self, states, actions) -> Tuple[tc.Tensor, tc.Tensor, tc.Tensor]:
		"""Sample one transition for every (state, action) pair.

		Args:
			states: Iterable of state indices (ints, numpy integers or 0-d tensors).
			actions: Iterable of action indices, paired with `states` by position.

		Returns:
			Three tensors on `config.device`: (newState, reward, done), one entry
			per pair. An empty batch yields three empty tensors.
		"""
		device_ = self.config.device
		data_ = list()
		for s, a in zip(states, actions):
			# int() makes numpy scalars and 0-d tensors usable as keys of the P table
			outcomes_ = self.instance.P[int(s)][int(a)]
			# each outcome starts with its probability; the remaining fields are kept
			probs_ = np.asarray([outcome_[0] for outcome_ in outcomes_])
			data_.append(outcomes_[npr.choice(len(outcomes_), p = probs_)][1:])
		if not data_:
			# zip(*[]) would produce an empty tuple and break three-way unpacking in callers
			return (
				tc.zeros(0, dtype = tc.long, device = device_),
				tc.zeros(0, device = device_),
				tc.zeros(0, dtype = tc.bool, device = device_),
			)
		return tuple(map(lambda x: tc.tensor(x, requires_grad = False, device = device_), zip(*data_)))

In [9]:
class ActorModel(tcn.Module):
	"""Small MLP that maps a discrete state index to one scalar output per state."""

	def __init__(self, env: "ReplayEnvironment"):
		"""Size the network from `env.stateSpace` (the number of discrete states)."""
		super().__init__()
		self.stateSize = env.stateSpace

		self.baseModel = tcn.Sequential(
			tcn.Linear(self.stateSize, self.stateSize),
			tcn.GELU(),
			tcn.Linear(self.stateSize, self.stateSize * 2),
			tcn.GELU(),
			tcn.Linear(self.stateSize * 2, self.stateSize),
			tcn.GELU(),
			tcn.Linear(self.stateSize, 1),
		)

	def decayParameters(self):
		"""Iterate over the parameters whose name does not contain "bias"."""
		return map(lambda x: x[1], filter(lambda x: "bias" not in x[0], self.named_parameters()))

	def nondecayParameters(self):
		"""Iterate over the parameters whose name contains "bias"."""
		return map(lambda x: x[1], filter(lambda x: "bias" in x[0], self.named_parameters()))

	def train(self, _device = True):
		"""Switch to training mode, optionally moving the model to a device.

		This method shadows `nn.Module.train(mode)`, which `nn.Module.eval()`
		calls with a boolean. Booleans are therefore forwarded to the base
		class; anything else is treated as a device.

		Args:
			_device: A device (or device string) to move to, or a bool `mode`
				with the standard `nn.Module.train` meaning.

		Returns:
			The model itself.
		"""
		if isinstance(_device, bool):
			return super().train(_device)
		super().train(True)
		return self.to(_device)

	def test(self, _device):
		"""Switch to evaluation mode and move the model to `_device`; returns the model."""
		super().train(False)
		return self.to(_device)

	def save(self, path):
		"""Write the state dict to `path`."""
		tc.save(self.state_dict(), path)

	def load(self, path, device = None):
		"""Load a state dict from `path`.

		Args:
			path: Checkpoint written by `save`.
			device: Optional `map_location` for the stored tensors; `None`
				keeps the locations recorded in the checkpoint.
		"""
		self.load_state_dict(tc.load(path, map_location = device))

	def forward(self, state: tc.Tensor) -> tc.Tensor:
		"""Return one scalar per state index.

		Args:
			state: Tensor of state indices; it is cast to long and one-hot encoded.

		Returns:
			Tensor with the same shape as `state`.
		"""
		temp_ = self.baseModel(tcf.one_hot(state.long(), self.stateSize).float())
		# drop the trailing size-1 output dimension; -1 also works for a scalar state
		return temp_.squeeze(-1)

In [None]:
def testAllStates(model: ActorModel):
	"""Print the model's output for every state index, evaluated on the CPU."""
	model.test("cpu")
	everyState_ = tc.arange(model.stateSize)
	outputs_ = model(everyState_)
	print(outputs_)

def testTrain(model: ActorModel):
	"""Return the MSE between the model's outputs and a fixed toy target.

	The target is 1 for state 0 and 0 for every other state. The raw
	predictions and the loss are printed along the way.
	"""
	count_ = model.stateSize
	everyState_ = tc.arange(count_)
	target_ = tc.zeros(count_)
	target_[0] = 1
	print(model(everyState_))
	# a second forward pass produces the predictions that feed the loss
	predictions_ = model(everyState_)
	mse_ = tcf.mse_loss(predictions_, target_)
	print(mse_)
	return mse_

def testTrains():
	"""Smoke test: run 20 AdamW steps of `testTrain` on a freshly built debug-mode model."""
	config_ = Config(0, True)
	model_ = ActorModel(ReplayEnvironment(config_))
	opt_ = tco.AdamW([
		{"params": model_.decayParameters(), "weight_decay": 0.01},
		# explicit zero: without it this group inherits AdamW's default weight decay
		{"params": model_.nondecayParameters(), "weight_decay": 0.0},
	], config_.lr)

	for _ in range(20):
		loss_ = testTrain(model_)
		opt_.zero_grad()
		loss_.backward()
		opt_.step()

# Run the optimizer smoke test as soon as this cell executes.
testTrains()



	

In [None]:
def computeTdUpdateValue(states: Tuple[int], env: ReplayEnvironment, model: ActorModel, config: Config):
	"""Build bootstrapped targets for a batch of states without tracking gradients.

	For every action one transition per state is sampled from `env.step`. Each
	candidate is `model(next) * (1 - done) * config.lamb + reward * 10`, and the
	per-state maximum over all actions is kept.

	Returns:
		(targets, mask). `mask` is an int tensor holding 1 for each state that
		had at least one sampled transition whose done flag was not set.
	"""
	batch_ = len(states)
	with tc.no_grad():
		anyAlive_ = tc.zeros(batch_, dtype = tc.int, device = config.device)
		candidates_ = []
		for action_ in range(env.actionSpace):
			sameAction_ = [action_ for _ in range(batch_)]
			nextStates_, rewards_, done_ = env.step(states, sameAction_)
			alive_ = 1 - done_.int()
			candidates_.append(model(nextStates_) * alive_ * config.lamb + rewards_ * 10)
			anyAlive_ = anyAlive_ | alive_
		# one row per action; reduce over the rows to get the best candidate per state
		best_ = tc.vstack(candidates_).max(dim = 0).values

		assert len(best_) == batch_, "return dimension doesn't match"
	return best_, anyAlive_

In [None]:
def trainModel(env: ReplayEnvironment, model: ActorModel, config: Config):
	"""Train `model` for one epoch on uniformly sampled states and return the per-batch losses.

	`config.epochSize` states are drawn with replacement and consumed in
	mini-batches of `config.miniBatchSize`. Targets come from a frozen copy of
	the model that is refreshed every `config.delayBatchs` mini-batches.
	"""
	model.train(config.device)
	paramGroups_ = [
		{"params": model.decayParameters(), "weight_decay": 0.01},
		{"params": model.nondecayParameters(), "weight_decay": 0.0},
	]
	opt_ = tco.AdamW(paramGroups_, config.lr)

	print("begin epoch {0}".format(config.epoch))
	sampleStream_ = iter(npr.choice(env.stateSpace, config.epochSize, True))
	# zipping one shared iterator with itself yields consecutive full-size tuples;
	# a trailing partial batch is dropped
	chunks_ = zip(*(sampleStream_ for _ in range(config.miniBatchSize)))

	history_ = []
	for index_, chunk_ in enumerate(chunks_):
		assert len(chunk_) == config.miniBatchSize, "minibatch size doesn't match"
		opt_.zero_grad()

		# refresh the frozen target network on schedule (always on the first batch)
		if index_ % config.delayBatchs == 0:
			frozen_ = copy.deepcopy(model)
		target_, keep_ = computeTdUpdateValue(chunk_, env, frozen_, config)
		predicted_ = model(tc.tensor(chunk_, device = config.device))
		loss_ = tcf.mse_loss(predicted_ * keep_, target_ * keep_)

		history_.append(loss_.item())
		loss_.backward()
		opt_.step()

	print("finish epoch {0}".format(config.epoch))
	return history_

def testRLModel(_env: Environment, _model: ActorModel, _config: Config) -> tc.Tensor:
	state_ = _env.reset().cpu()
	scores_ = tc.zeros(state_.shape[0]).cpu()
	completed_ = tc.zeros(state_.shape[0]).cpu()
	_model = _model.test(tc.device("cpu"))
	with tc.no_grad():
		limits_ = _config.maxSteps
		while completed_.sum() and limits_ > 0:
			actionDist_, _ = _model(state_.to(_config.device))
			action_ = actionDist_.sample()
			state_, rewards_, completed_ = _env.step(action_)
			scores_ += rewards_
			limits_ -= 1
	
	return scores_.tolist()

In [None]:
def trainAll(_env: ReplayEnvironment, _config: Config) -> ActorModel:
	"""Train a new `ActorModel` from `_config.epoch` up to `_config.totalEpochs`.

	When the starting epoch is greater than zero, the checkpoint for that
	epoch is loaded first.

	Args:
		_env: Environment used to size the model and to sample transitions.
		_config: Training settings; its `epoch` field is updated as training proceeds.

	Returns:
		The trained model.
	"""
	model_ = ActorModel(_env)
	epoch_ = _config.epoch
	if epoch_ > 0:
		print("loading checkpoint {0}".format(epoch_))
		# the target device is passed explicitly so the stored tensors are mapped onto it
		model_.load(_config.stepCheckpoint(), _config.device)

	for t in range(epoch_, _config.totalEpochs):
		print("start training epoch {0}".format(t))
		# called only to record the running epoch on the config; the returned path is unused
		_config.stepCheckpoint(t)
		loss_ = trainModel(_env, model_, _config)
		print("finish training with {0} loss".format(np.mean(loss_)))

		# Periodic evaluation and checkpointing, currently disabled:
		# if t % _config.testEpochs == 0:
		# 	print("start evaluation")
		# 	reward_ = np.mean(testRLModel(_env, model_, _config))
		# 	print("finish evaluation with {0} rewards".format(reward_))
		# 	print("save model check point")
		# 	model_.save(_config.stepCheckpoint(t + 1))
		# 	if _config.targetRewards > 0 and reward_ > _config.targetRewards:
		# 		print("target reward {0} reached, early stop on epoch {1}".format(_config.targetRewards, t))
		# 		break
		print("finish training epoch {0}".format(t))

	print("finish all training steps")
	return model_

**11. PPO Algorithm: Actor**

For `T` steps, generate random actions and collect their effects in the environment.

At the very first step, the algorithm considers the current state of the environment and, using the current policy, evaluates that state: it associates with each action the probability of being the best suited to that state.
The agent samples an action from the policy according to these probabilities. The agent then executes that action in the environment; the action modifies the state of the environment and new percepts are collected (value-function estimates, rewards and the probabilities of each action). In addition, a *masks* vector is added to the collection to keep track of whether the environment has reached its end state (the `done` state).

At the end of this first empirical step, the algorithm computes the generalized advantage estimation to estimate, for each action, whether it improves the value function for the next state (i.e. whether the action has taken the agent to a better state).

Now that all the necessary data have been collected, the second step of the algorithm analyzes them and updates the weights of the model using the Adam algorithm.

At the end of the computation the epoch ends.


**12. Implementation of PPO Algorithm: Actor-Critic Style**

Main function: `N` agents (in `N` environments) take part in parallel *PPO* training.

The policy model is a *CNN* with three convolutional layers and two dense layers of hidden size *256*. It is optimized with the *Adam* algorithm.

It is possible to set transfer learning by setting the `PATH` to the model and `TRANSFER_LEARNING` to `True` in section **4**. 

`test_rewards` is a list that records the rewards of previous checkpoints. `train_epoch` records the number of epochs needed to train the model to its current state.



In [None]:
def main():
	"""Train from epoch 0, save the final checkpoint and print the model's output for every state."""
	import os

	config_ = Config(0, False)
	# tc.save does not create missing directories; create the checkpoint directory
	# up front so a missing folder cannot fail the save after training has finished
	os.makedirs(os.path.dirname(config_.modelPath) or ".", exist_ok = True)

	model_ = trainAll(ReplayEnvironment(config_), config_)
	model_.save(config_.stepCheckpoint(config_.totalEpochs))

	testAllStates(model_)

# Start the full training run when this cell (or the exported script) is executed directly.
if __name__ == "__main__":
	main()

<__main__.Config object at 0x000001DC36D8AEB0>

[41mS[0mFFF
FHFH
FFFH
HFFG
start training epoch 0
begin epoch 0
finish epoch 0
finish training with 1.4645611109561287 loss
finish training epoch 0
start training epoch 1
begin epoch 1
finish epoch 1
finish training with 0.6166809556074441 loss
finish training epoch 1
start training epoch 2
begin epoch 2
finish epoch 2
finish training with 0.4973745384020731 loss
finish training epoch 2
start training epoch 3
begin epoch 3
finish epoch 3
finish training with 0.4987373967305757 loss
finish training epoch 3
start training epoch 4
begin epoch 4
finish epoch 4
finish training with 0.4951305897673592 loss
finish training epoch 4
start training epoch 5
begin epoch 5
finish epoch 5
finish training with 0.4770012002205476 loss
finish training epoch 5
start training epoch 6
begin epoch 6
finish epoch 6
finish training with 0.49658066583797333 loss
finish training epoch 6
start training epoch 7
begin epoch 7
finish epoch 7
finish training with 0.