In [1]:
import math
import random
from typing import *

import itertools as it

import numpy as np
import numpy.random as npr
import pandas as pd
import torch as tc
import torch.nn as tcn
import torch.nn.functional as tcf
import torch.optim as tco
import torch.distributions as tcd
import einops.layers.torch as eol

pd.options.display.max_rows = 40
pd.options.display.min_rows = 20
pd.options.display.max_columns = 100
tc.set_printoptions(edgeitems= 100, linewidth = 160)


from IPython.display import display, HTML, clear_output

%matplotlib inline

import gym
import gym.spaces

## Global Configs

In [3]:
class Config:
	def __init__(self, _epoch = 0, _debug = True):
		self.clip = 0.2
		self.c1 = 0.5
		self.c2 = 0.01
		self.maxSteps = 20
		self.gamma = 0.95
		self.lamb = 0.9
		self.agentCount = 4096
		self.miniBatchSize = 256
		self.epochIterations = 64
		self.totalEpochs = 4
		self.testEpochs = 1
		self.testAgentCount = 128
		self.epoch = _epoch
		self.lr = 1E-4
		self.targetRewards = 0
		self.modelPath = "./checkpoints/model"
		self.debug = _debug
		self.device = tc.device("cpu") if _debug or not tc.cuda.is_available() else tc.device("cuda")

		print(self)
	
	def stepCheckpoint(self, _epoch = None):
		if _epoch is not None:
			self.epoch = _epoch
		return "{path}_{epoch:02d}.bin".format(path = self.modelPath, epoch = self.epoch)


## Generalized Advantage Estimation

For each result collected with respect to each action by the algorithm, it computes the delta, which is the sum of current reward and the expected success of the next state (next state value minus current state value), zero if 'done' is reached.
So, it computes the sum over all collected states, until last is reached.

GAE is computed summing all GAE of previous actions. It is higher if near actions had success, lower otherwhise.

At the end sum again the value of current state, so that actions in different states obtain bonus or malus if a better states are reached thanks to those actions.

In [3]:
def calculateGAE(_values: List[tc.Tensor], _rewards: List[tc.Tensor], _masks: List[tc.Tensor], _config: Config) -> tc.Tensor:
	assert 0 < len(_values) == len(_masks) == len(_rewards), "tensor list size must be greater than 0 and consistent"
	assert not any(map(lambda x: x.requires_grad, it.chain(_masks, _rewards))), "masks and rewards should not require grad"
	
	decay_ = _config.lamb * _config.gamma
	advantages_ = list()
	gae_ = tc.zeros_like(_values[0])
	for t in range(len(_values) - 1, 0, -1):
		delta_ = _rewards[t - 1] + _config.gamma * _values[t] * _masks[t] - _values[t - 1]
		gae_ = delta_ + gae_ * decay_ * _masks[t]
		advantages_.append(gae_)
	advantages_.reverse()
	
	return advantages_

 **10. PPO Algorithm: Critic**

For `K` times analyze all the collected data to make gradient updates.

At first, the algorithm grabs random mini-batches several times until it covers all data. 
It takes a state in the batch, from the policy it computes how good is stay in it, then it takes the action that generated that state and computes the new log probability for the action with respect to the current policy (updates during *P* iterations, so, first iteration for new action non visited will be 1).

Now, it computes the ratio (`new_prob/old_prob`) of changing of the probability of the action for that state (depending on the training of the policy).

The algorithm computes first surrogate function, which promotes the action of which probability variation increased the advantage.
After that, it repeats the same computation as before but cutting the ratio values out of a certain range to avoid hysteresis, this is called *CLIP*, and is made to penalize changes to the policy that move the ratio away from 1. It only ignores the change in probability ratio when it would make the objective improve, and it includes the ratio when ratio makes the objective worse.

At this point, the algorithm takes the minimum of the two surrogate and do the mean, it computes the 'surrogate loss' in which include only actions that decrease the performances of the actor. 
It also computes a squared loss for the actor, that is the squared loss of the reward of the action in the state with respect to the policy estimation of the action in the state.

Now, the algorithm can compute the loss function, adding the clipped actor loss and the squared critic loss, summing also some entropy bonus, to guarantee the model to promote exploration and not only exploitation.

At the end of each step, the algorithm updates the weights of the model thanks to the gradient, generated thanks to Adam algorithm.


In [4]:
env_ = gym.make("FrozenLake-v1")
env_.render()
env_.close()


[41mS[0mFFF
FHFH
FFFH
HFFG


In [28]:
class Environment:
	def __init__(self, instanceSize: int):
		self.name = "FrozenLake-v1"
		self.instance = gym.make(self.name)
		self.actionSpace = self.instance.action_space
		self.stateSpace = self.instance.observation_space
		self.trans = self.instance.P
	
	def reset(self, n: int) -> tc.Tensor:
		return tc.stack([tc.as_tensor([self.instance.reset()]) for i in range(n)]).requires_grad_(False)

	def randomReset(self, n: int) -> tc.Tensor:
		return tc.randint(self.stateSpace.n, size = (n,)).requires_grad_(False)
	def __str__(self):
		return "Environment: " + self.name
	
	def step(self, state: tc.Tensor, action: tc.Tensor) -> Tuple[tc.Tensor, tc.Tensor, tc.Tensor]:
		with tc.no_grad():
			trans_ = [self.trans[s.item()][a.item()] for s, a in zip(state, action)]
			transP_ = [npr.choice(len(p), size = None, replace = True, p = list(map(lambda x: x[0], p))) for p in trans_]
			states_ = zip(*[t[p][1:] for t, p in zip(trans_, transP_)])
			return tc.as_tensor(next(states_), dtype = tc.int, device = state.device), tc.as_tensor(next(states_), dtype = tc.float, device = state.device) * 100, tc.as_tensor(next(states_), dtype = tc.int, device = state.device)


In [19]:
class ActorModel(tcn.Module):
	def __init__(self, env: Environment):
		super().__init__()
		self.stateSize = env.stateSpace.n
		self.actionSize = env.actionSpace.n

		self.baseModel = tcn.Sequential(
			tcn.Linear(self.stateSize, self.stateSize),
			tcn.GELU(),
			tcn.Linear(self.stateSize, self.stateSize * 2),
			tcn.GELU(),
			tcn.Linear(self.stateSize * 2, self.stateSize * 2),
			tcn.GELU(),
		)

		self.actionHead = tcn.Sequential(
			tcn.Linear(self.stateSize * 2, self.stateSize * 2),
			tcn.GELU(),
			tcn.Linear(self.stateSize * 2, self.actionSize, bias = False),
			tcn.Softmax()
		)

		self.predHead = tcn.Sequential(
			tcn.Linear(self.stateSize * 2, self.stateSize),
			tcn.GELU(),
			tcn.Linear(self.stateSize, 1)
		)
	
	def decayParameters(self):
		return map(lambda x: x[1], filter(lambda x: "bias" not in x[0], self.named_parameters()))

	def nondecayParameters(self):
		return map(lambda x: x[1], filter(lambda x: "bias" in x[0], self.named_parameters()))
	
	def save(self, path):
		tc.save(self.state_dict(), path)

	def load(self, path, device):
		self.load_state_dict(tc.load(path, device))
	
	def forward(self, state: tc.Tensor) -> Tuple[tcd.Distribution, tc.Tensor]:
		temp_ = self.baseModel(tcf.one_hot(state.long(), self.stateSize).float())
		return tcd.Categorical(self.actionHead(temp_)), self.predHead(temp_).squeeze(1)

In [7]:
class BatchLog:
	def __init__(self, _states: tc.Tensor, _actions: tc.Tensor, _actionLogProbs: tc.Tensor, _estimatedValues: tc.Tensor, _advantages: tc.Tensor):
		assert len(set(map(lambda x: x.shape[0], [_states, _actions, _actionLogProbs, _estimatedValues, _advantages]))) == 1, "all inputs must have consist size"
		self.states = _states
		self.actions = _actions
		self.actionLogProbs = _actionLogProbs
		self.estimatedValues = _estimatedValues
		self.advantages = _advantages
	
	def generateMiniBatchs(self, _batchSize: int, _dropLast = False) -> Tuple[tc.Tensor, tc.Tensor, tc.Tensor, tc.Tensor, tc.Tensor]:
		size_ = self.states.shape[0]
		randIndex_ = tc.randperm(size_)
		for p in range(0, size_, _batchSize):
			if _dropLast and p + _batchSize > size_:
				break
			batchIndex_ = randIndex_[p : p + _batchSize]
			yield self.states[batchIndex_], self.actions[batchIndex_], self.actionLogProbs[batchIndex_], self.estimatedValues[batchIndex_], self.advantages[batchIndex_]

In [8]:
def collectPPOModel(_state: tc.Tensor, _action: tc.Tensor, _oldLogProb: tc.Tensor, _estimatedValue: tc.Tensor, _advantage: tc.Tensor,
	_model: ActorModel, _config: Config) -> tc.Tensor:
	
	actionDist_, value_ = _model(_state)
	newLogProb_ = actionDist_.log_prob(tc.squeeze(_action))
	ratio_ = (newLogProb_ - tc.squeeze(_oldLogProb).detach()).exp() # new_prob/old_prob
	surr1_ = ratio_ * _advantage.detach()
	surr2_ = tc.clamp(ratio_, 1.0 - _config.clip, 1.0 + _config.clip) * _advantage.detach()
	actor_loss_ = - tc.min(surr1_, surr2_).mean()
	critic_loss_ = tcf.mse_loss(value_, _estimatedValue)
	entropy_loss_ = -actionDist_.entropy().mean()
	return actor_loss_ + _config.c1 * critic_loss_ + _config.c2 * entropy_loss_ 

In [9]:
def testRLModel(_env: Environment, _model: ActorModel, _config: Config) -> tc.Tensor:
	state_ = _env.reset(_config.testAgentCount)
	scores_ = tc.zeros(state_.shape[0], dtype = tc.float).cpu()
	completed_ = tc.zeros(state_.shape[0], dtype = tc.int).cpu()
	_model = _model.to("cpu")
	_model.eval()
	
	with tc.no_grad():
		limits_ = _config.maxSteps
		while (1 - completed_).sum() and limits_ > 0:
			actionDist_, _ = _model(state_.to(_config.device))
			action_ = actionDist_.sample()
			state_, rewards_, completed_ = _env.step(state_, action_)
			scores_ += rewards_
			limits_ -= 1
	
	return scores_.tolist()

In [10]:
def standizeAdvantage(advantage: tc.Tensor) -> tc.Tensor:
	with tc.no_grad():
		return (advantage - advantage.mean()) / (advantage.std() + 1e-8)

def runRLModel(_env: Environment, _model: ActorModel, _config: Config) -> BatchLog:
	assert _config.maxSteps > 2, "step size must be greater than 2"
	
	with tc.no_grad():
		_model = _model.to(_config.device)
		_model.eval()
		state_ = _env.randomReset(_config.agentCount).to(_config.device)
		mask_ = tc.ones(state_.shape[0], dtype = tc.int).to(_config.device)
		states_ = list()
		actions_ = list()
		actionProbs_ = list()
		rewards_ = list()
		values_ = list()
		masks_ = list()
		for _ in range(_config.maxSteps):
			states_.append(state_)
			masks_.append(mask_)
			actionDist_, estimatedValue_ = _model(state_)
			values_.append(estimatedValue_)
			action_ = actionDist_.sample()
			actions_.append(action_)
			actionProb_ = actionDist_.log_prob(action_)
			actionProbs_.append(actionProb_)
			state_, reward_, completed_ = _env.step(state_, action_)
			mask_ = 1 - completed_
			rewards_.append(reward_)
		
		advantages_ = tc.cat(calculateGAE(values_, rewards_, masks_, _config))
		states_.pop()
		states_ = tc.cat(states_)
		actions_.pop()
		actions_ = tc.cat(actions_)
		actionProbs_.pop()
		actionProbs_ = tc.cat(actionProbs_)
		values_.pop()
		values_ = (advantages_ + tc.cat(values_))
		advantages_ = standizeAdvantage(advantages_)
		masks_.pop()
		masks_ = tc.cat(masks_).bool()
		return BatchLog(states_[masks_].detach(), actions_[masks_].detach(), actionProbs_[masks_].detach(), values_[masks_].detach(), advantages_[masks_].detach())

In [25]:
def trainModel(_env: Environment, _model: ActorModel, _config: Config) -> List[float]:
	opt_ = tco.AdamW([{"params": _model.decayParameters(), "weight_decay": 0.01}, {"params": _model.nondecayParameters(), "weight_decay": 0.0}], _config.lr)
	_model = _model.to(_config.device)
	_model.train()
	
	for iter_ in range(_config.epochIterations):
		losses_ = list()
		logs_ = runRLModel(_env, _model, _config)
		for batch_ in logs_.generateMiniBatchs(_config.miniBatchSize):
			opt_.zero_grad()
			loss_ = collectPPOModel(*batch_, _model, _config)
			losses_.append(loss_.item())
			loss_.backward()
			opt_.step()
		print("epoch {0} iteration {1}, loss_ = {2}".format(_config.epoch, iter_, np.mean(losses_)))
	return losses_

In [12]:
def trainAll(_env: Environment, _config: Config) -> ActorModel:
	model_ = ActorModel(_env)
	epoch_ = _config.epoch
	if epoch_ > 0:
		print("loading checkpoint {0}".format(epoch_))
		model_.load(_config.stepCheckpoint())

	for t in range(epoch_, _config.totalEpochs):
		print("start training epoch {0}".format(t))
		loss_ = np.mean(trainModel(_env, model_, _config))
		print("finish training with {0} loss".format(loss_))

		if t % _config.testEpochs == 0:
			print("start evaluation")
			reward_ = np.mean(testRLModel(_env, model_, _config))
			print("finish evaluation with {0} rewards".format(reward_))
			print("save model check point")
			model_.save(_config.stepCheckpoint(t + 1))
			if _config.targetRewards > 0 and reward_ > _config.targetRewards:
				print("target reward {0} reached, early stop on epoch {1}".format(_config.targetRewards, t))
				break
	
		print("finish training epoch {0}".format(t))
	
	print("finish all training steps")
	return model_

**11. PPO Algorithm: Actor**

For `T` steps generate random actions and collect effects in the environment.

In the very first moment, consider the current state of the environment and thanks to the current policy evaluates the state, it associate to each action the probability to be best suited to that state.
The agent picks from the policy an action with some probability. Then the agent execute that action in the environment, the action modifies the state in the environment and new percepts are collected (data of value function, rewards and probabilities for each actions). Plus, add to collection a vector *masks* that keeps track of the end state of the environment (if the environment has reached `done` state).

At then end of this first empirical step,the algorithm computes the generalized advantage estimation to estimate for each action if it improves the value function for the next state (if the action has taken the agent in a better state).

Now that all necessary data are collected, the second step of the algorithm analyze and update the weights of the model thanks to the Adam algorithm.

At the end of the computation the epoch ends.


**12. Implementation of PPO Algorithm: Actor-Critic Style**

Main function, for `N` agents (in `N` environments) join parallel *PPO* training.

The policy model is a *CNN* with three convolutional and two *256* hidden size dense layers. It implements *Adam* algorithm.

It is possible to set transfer learning by setting the `PATH` to the model and `TRANSFER_LEARNING` to `True` in section **4**. 

`test_rewards` is a list that record previous checkpoints rewards. `train_epoch` record the epoch needed to train the model at the current state.



In [30]:
def main():
	config_ = Config(0, False)
	model_ = trainAll(Environment(config_.agentCount), config_)
	model_.save(config_.stepCheckpoint(config_.totalEpochs))

if __name__ == "__main__":
	main()

<__main__.Config object at 0x000002683BF24CD0>
start training epoch 0


  input = module(input)


epoch 0 iteration 0, loss_ = 199.93720334923785
epoch 0 iteration 1, loss_ = 208.61294486604888
epoch 0 iteration 2, loss_ = 164.57565647516495
epoch 0 iteration 3, loss_ = 188.14210349635073
epoch 0 iteration 4, loss_ = 242.36613750457764
epoch 0 iteration 5, loss_ = 268.42747637960645
epoch 0 iteration 6, loss_ = 322.4516826740746
epoch 0 iteration 7, loss_ = 358.67720598043854
epoch 0 iteration 8, loss_ = 340.9190360945885
epoch 0 iteration 9, loss_ = 350.7608364423116
epoch 0 iteration 10, loss_ = 324.15786635273633
epoch 0 iteration 11, loss_ = 319.3127734375
epoch 0 iteration 12, loss_ = 297.9696681213379
epoch 0 iteration 13, loss_ = 281.3952464094066
epoch 0 iteration 14, loss_ = 270.1421759033203
epoch 0 iteration 15, loss_ = 271.7441306993799
epoch 0 iteration 16, loss_ = 262.5396153376653
epoch 0 iteration 17, loss_ = 251.57675903918695
epoch 0 iteration 18, loss_ = 249.2711899097149
epoch 0 iteration 19, loss_ = 241.03758066350764
epoch 0 iteration 20, loss_ = 245.988061232

KeyboardInterrupt: 