# Intro

In this homework you will work with the TRPO and PPO algorithms. You need to adjust parameters and fill in the placeholders with your own code. As a result, you will get a trained virtual robot.

Ways to run this notebook:

1) open the notebook directly by its URL

2) or upload the .ipynb file to Google Colab and run it there

![picture](https://raw.githubusercontent.com/dim2r/rl_cource/master/img/ant.jpg)

Train this robot to walk!

## Global parameters. Define the environment name and training time [[*EDIT HERE*]]

In [None]:
# Notebook-wide configuration. Every name prefixed with `config_` is a global
# that later cells read.

# PyBullet environment name passed to gym.make()
config_env_name = 'AntPyBulletEnv-v0' 


# Time limit in minutes for training one agent.
# The limit applies to each agent separately, so the total time is up to
# twice this value when both algorithms are enabled.
# 30 min - poor result
# 120 min - fair result
config_train_time_minutes = 120 


# Choose which algorithms to run
config_use_trpo = True  # optional part of the homework
config_use_ppo = True




# Sub-directory (relative to config_pg_folder_path) where sessions are saved
config_save_dir = 'saved-sessions'

import sys
# Configure the path to your pg repository.
# You may mount Google Drive using the following two commands:
#from google.colab import drive
#drive.mount('/content/drive')

# example: config_pg_folder_path = '/content/drive/MyDrive/PG/'
config_pg_folder_path = './'
# Make modules stored in that folder importable (layers.py, pg_utils.py, trpo.py).
sys.path.append(config_pg_folder_path)

# Detect Google Colab by looking for 'google.colab' in the string form of the
# active IPython shell. get_ipython() is only defined when running under IPython.
if 'google.colab' in str(get_ipython()):
  config_i_am_in_collab = True
  print('Running in google collab')
else:
  config_i_am_in_collab = False
  print('I am not in google collab')



## Install modules and initialize the environment. Run this before you start. It takes about 5 minutes.


In [None]:
# Step 1. Install required packages


import os
# On Colab the helper modules are fetched from the course repository into the
# current working directory.
# NOTE: the exit status of os.system() is ignored, so a failed download is not
# reported here; it will surface later as an ImportError.
if config_i_am_in_collab:
    print('downloading modules')
    os.system("curl -o layers.py https://raw.githubusercontent.com/dim2r/rl_cource/master/src/layers.py")
    os.system("curl -o pg_utils.py https://raw.githubusercontent.com/dim2r/rl_cource/master/src/pg_utils.py")
    os.system("curl -o trpo.py https://raw.githubusercontent.com/dim2r/rl_cource/master/src/trpo.py")


!pip install torch torchvision gym numpngw numpy pyglet PyYAML jdc matplotlib  
!pip install pybullet
!pip install git+https://github.com/benelot/pybullet-gym

    
# Step 2. Fix installation
import shutil 
import pybulletgym
# Two dirname() calls up from the package's __file__: the directory that
# contains the installed `pybulletgym` package (presumably site-packages).
libdir = os.path.dirname(os.path.dirname(pybulletgym.__file__))
def recursive_copy(src, dest):
    '''
    Copy src to dest, descending into directories.

    Parameters
    ----------
    src : str
        path of the file or directory to copy

    dest : str
        target path; a missing target directory is created, an existing
        target file is overwritten by shutil.copyfile
    '''
    # Anything that is not a directory is copied as a single file.
    if not os.path.isdir(src):
        shutil.copyfile(src, dest)
        return

    if not os.path.isdir(dest):
        os.makedirs(dest)

    for entry in os.listdir(src):
        recursive_copy(os.path.join(src, entry), os.path.join(dest, entry))

# Copy the contents of pybullet_data into the two asset directories inside the
# installed pybulletgym package (presumably where its environments look for
# scene and robot files -- TODO confirm against pybulletgym).
recursive_copy(f"{libdir}/pybullet_data/",f"{libdir}/pybulletgym/envs/roboschool/scenes/../../assets/scenes/stadium/")
recursive_copy(f"{libdir}/pybullet_data/",f"{libdir}/pybulletgym/envs/roboschool/robots/../../assets/")

# Step 3. Create 10x10 pixel empty images in order to avoid a render error
import numpy as np
import numpngw 
# zeros + 255 on uint8 gives an all-white 10x10 RGB frame
img = np.zeros((10, 10, 3), dtype=np.uint8)
img=img+255
images=[img]
# One-frame placeholder animations under the file names used by later cells
numpngw.write_apng('anim_random_actions.png', images, delay=1)
numpngw.write_apng('anim_trpo.png', images, delay=1)
numpngw.write_apng('anim_PPO.png', images, delay=1)
print('done')

## Import all libraries

In [None]:
import jdc
import gym
import pybullet
import pybulletgym
import torch
import torch.nn as nn
# import collections   
import IPython.display  

# import os 
# import yaml  
# import glob

from datetime import datetime as dt
from datetime import timedelta

gym.logger.set_level(40) #avoid some warning

# import time
# from datetime import datetime
# import matplotlib.pyplot as plt

from pg_utils import get_device, show_plot
from pg_utils import apply_update, flatten, flat_grad, get_flat_params, detach_dist
from pg_utils import Transform, Bound, ZFilter

print('done')
 

## PyBullet environment description. Run this cell to get familiar with the ant you will train.



The Ant has 8 joints/actions and 28 observations. The reward function has a positive reward signal for the pelvic velocity, a negative reward signal for the effort of the current action state, and a negative signal if the joints are at their limit. The termination function triggers if the body of the ant rotates more than 0.2 units from upright.

You may investigate source code in order to get more information https://github.com/openai/gym/blob/master/gym/envs/mujoco/ant.py

In [None]:
# Run this cell in order to get familiar with the environment.
# No agent. Just random actions.

  
env_name = config_env_name  #see list of environments https://github.com/benelot/pybullet-gym/blob/master/README.md

# Number of random steps (and rendered frames) to record
number_of_steps = 100

env = gym.make(env_name)
state = env.reset()
 
# Rendered RGB frames, one per step
images = []
print("action_space=")
print(env.action_space.shape)    
print("observation_space=")
print(env.observation_space.shape)    

for _ in range(number_of_steps):
    random_action = env.action_space.sample()
    
    # NOTE: `done` is not checked, so the loop keeps stepping for
    # number_of_steps even if the episode terminates earlier.
    state, reward, done, _ = env.step(random_action)
    img = env.render(mode='rgb_array')
    images.append(img)
env.close()

 
# Write the frames as an animated PNG and show it as the cell output
# (the Image expression must stay the last statement of the cell).
numpngw.write_apng('anim_random_actions.png', images, delay=1)
IPython.display.Image(filename='anim_random_actions.png')

# \-------------------------------------------

# TRPO algorithm (optional, no placeholders)

Just study the code and answer some questions

## Math functions

### Conjugate gradient implementation

In [None]:
def cg_solver(Avp_fun, b, max_iter=10, residual_tol=1e-10):
    '''
    Finds an approximate solution to a set of linear equations Ax = b using
    the conjugate gradient method

    Parameters
    ----------
    Avp_fun : callable
        a function that right multiplies a matrix A by a vector; it is called
        as Avp_fun(p, retain_graph=True)

    b : torch.FloatTensor
        the right hand term in the set of linear equations Ax = b

    max_iter : int
        the maximum number of iterations (default is 10)

    residual_tol : float
        relative tolerance: iteration stops early once the squared residual
        norm drops to residual_tol times its initial value (default is 1e-10)

    Returns
    -------
    x : torch.FloatTensor
        the approximate solution to the system of equations defined by Avp_fun
        and b; a zero vector when b is zero or max_iter <= 0
    '''

    # zeros_like already places x on the same device (and dtype) as b
    x = torch.zeros_like(b)
    r = b.clone()
    p = b.clone()

    rdotr = torch.matmul(r, r)
    threshold = residual_tol * rdotr

    for _ in range(max_iter):
        # Stop once the residual is (numerically) zero. Continuing would
        # evaluate 0 / 0 below and fill x with NaN. This also covers b == 0.
        if rdotr <= threshold:
            break

        Avp = Avp_fun(p, retain_graph=True)

        alpha = rdotr / torch.matmul(p, Avp)
        x += alpha * p

        r = r - alpha * Avp
        new_rdotr = torch.matmul(r, r)
        beta = new_rdotr / rdotr
        p = r + beta * p
        rdotr = new_rdotr

    return x


### Kullback–Leibler divergence (D_KL)
$D_{KL}\left( P \,\|\, Q \right)=\sum_{x\in \mathcal{X}} P(x) \log\left( \frac{P(x)}{Q(x)} \right)$


In [None]:
def mean_kl_first_fixed(dist_1, dist_2):
    '''
    Mean KL-divergence KL(dist_1 || dist_2) in which dist_1 is treated as a
    constant: it is detached from the computational graph first, so gradients
    only flow through dist_2

    Parameters
    ----------
    dist_1 : torch.distributions.distribution.Distribution
        the first argument to the kl-divergence function (will be fixed)

    dist_2 : torch.distributions.distribution.Distribution
        the second argument to the kl-divergence function (will not be fixed)

    Returns
    -------
    mean_kl : torch.float
        the kl-divergence between dist_1 and dist_2, averaged over all elements
    '''
    fixed = detach_dist(dist_1)
    per_sample_kl = torch.distributions.kl.kl_divergence(fixed, dist_2)

    return per_sample_kl.mean()

### line_search()

In [None]:
def line_search(search_dir, max_step_len, constraints_satisfied, line_search_coef=0.9,
                max_iter=10):
    '''
    Perform a backtracking line search that terminates when constraints_satisfied
    returns True and return the calculated step length. Return 0.0 if no step
    length can be found for which constraints_satisfied returns True

    Parameters
    ----------
    search_dir : torch.FloatTensor
        the search direction along which the line search is done

    max_step_len : torch.FloatTensor
        the maximum step length to consider in the line search; it is the
        first candidate that is tried

    constraints_satisfied : callable
        a function called as constraints_satisfied(step, step_len) that returns
        a boolean indicating whether the constraints are met by the current
        step length

    line_search_coef : float
        the proportion by which to reduce the step length after each iteration

    max_iter : int
        the maximum number of candidates to try before returning 0.0

    Returns
    -------
    the maximum step length coefficient for which constraints_satisfied evaluates
    to True, or a zero tensor (same device and dtype as max_step_len) if none
    of the max_iter candidates is accepted
    '''

    step_len = max_step_len

    for _ in range(max_iter):
        if constraints_satisfied(step_len * search_dir, step_len):
            return step_len

        # Out-of-place multiply: never mutates the caller's max_step_len.
        step_len = step_len * line_search_coef

    # This is a free function, so there is no `self.device` to consult; take
    # device and dtype from max_step_len instead.
    if isinstance(max_step_len, torch.Tensor):
        return torch.zeros_like(max_step_len)

    return torch.tensor(0.0)


## Import TRPO class

In [None]:
from trpo import Simulator, TRPO

## Add some methods to TRPO class 

### Define train() method

In [None]:
%%add_to TRPO
def train(self, n_episodes):
    '''
    Run the TRPO training loop

    Each iteration samples trajectories, normalizes the advantages, updates
    the policy and the value function, logs and plots the mean reward and
    optionally saves the session. Training stops when self.episode_num
    reaches n_episodes or when the accumulated training time exceeds
    config_train_time_minutes.

    Parameters
    ----------
    n_episodes : int
        the episode number at which training stops (self.episode_num is
        compared against it, so it is a total, not an increment)
    '''

    time_limit_seconds = config_train_time_minutes * 60

    # Targets and inputs of the previous iteration; they are appended to the
    # current batch when fitting the value function.
    last_q = None
    last_states = None
    time_elapsed = False
    reward_log = []
    while self.episode_num < n_episodes and not time_elapsed:
        start_time = dt.now()
        self.episode_num += 1
        samples = self.simulator.sample_trajectories()
        states, actions, rewards, q_vals = self.unroll_samples(samples)

        # Standardize advantages to zero mean and unit standard deviation
        advantages, states_with_time = self.get_advantages(samples)
        advantages -= torch.mean(advantages)
        advantages /= torch.std(advantages)

        self.update_policy(states, actions, advantages)

        if last_q is not None:
            self.update_value_fun(torch.cat([states_with_time, last_states]), torch.cat([q_vals, last_q]))
        else:
            self.update_value_fun(states_with_time, q_vals)

        last_q = q_vals
        last_states = states_with_time

        # Mean undiscounted return over the sampled trajectories
        mean_reward = np.mean([np.sum(trajectory['rewards']) for trajectory in samples])
        self.mean_rewards.append(mean_reward)
        self.elapsed_time += dt.now() - start_time
        self.print_update()
        reward_log.append([self.episode_num, mean_reward])
        show_plot(reward_log, config_pg_folder_path, "TRPO", 0.5)

        # timedelta.seconds is only the sub-day component and wraps around
        # every 24 hours; total_seconds() is the full duration.
        if self.elapsed_time.total_seconds() > time_limit_seconds:
            print('training time elapsed')
            time_elapsed = True

        if self.save_every and not self.episode_num % self.save_every:
            self.save_session(f"{config_pg_folder_path}/{config_save_dir}")

### Define get_advantages()  

TASK: Define the discounted residuals as the element-wise product of td_residuals and discount_pows

In [None]:
%%add_to TRPO
def get_advantages(self, samples):
    '''
    Compute an advantage estimate for every step of every sampled trajectory

    For a trajectory with TD residuals d_t = r_t + discount * V(s_{t+1}) - V(s_t)
    the estimate at step t is sum_{k >= t} (discount * lam)^(k - t) * d_k,
    computed with a reversed cumulative sum.

    Parameters
    ----------
    samples : iterable of dict
        trajectories; each has a 'states' entry (a sequence of tensors that
        can be stacked) and a 'rewards' entry (a sequence of numbers)

    Returns
    -------
    advantages : torch.FloatTensor
        the advantage estimates of all trajectories, concatenated

    states_with_time : torch.FloatTensor
        the states of all trajectories (on the CPU), each extended by a
        trailing time feature, concatenated in the same order
    '''
    advantages = []
    states_with_time = []
    T = self.simulator.trajectory_len

    for trajectory in samples:
        # Extra value-function input: the step index divided by T.
        time = torch.arange(0, len(trajectory['rewards'])).unsqueeze(1).float() / T
        states = torch.stack(trajectory['states'])
        states = torch.cat([states, time], dim=-1)
        states = states.to(self.device)
        states_with_time.append(states.cpu())
        rewards = torch.tensor(trajectory['rewards'])

        state_values = self.value_fun(states)
        state_values = state_values.view(-1)
        state_values = state_values.cpu()
        # V(s_{t+1}) for every step; the value after the last step is taken as 0.
        state_values_next = torch.cat([state_values[1:], torch.tensor([0.0])])

        
        # One-step TD residuals d_t
        td_residuals = rewards + self.discount * state_values_next - state_values
        
        # Index vector n-1, ..., 0 used to flip a tensor
        reverse = torch.arange(rewards.size(0) - 1, -1, -1)
        
        # (discount * lam)^t for t = 0 .. n-1
        discount_pows = torch.pow(self.discount * self.lam, torch.arange(0, rewards.size(0)).float())
        
        discounted_residuals = td_residuals * discount_pows
        
        # flip -> cumsum -> flip back: entry t holds sum_{k >= t} (discount*lam)^k * d_k
        disc_res_sums = torch.cumsum(discounted_residuals[reverse], dim=-1)[reverse]



        # Dividing by (discount*lam)^t turns the exponent k into k - t.
        trajectory_advs = disc_res_sums / discount_pows
        advantages.append(trajectory_advs)

    advantages = torch.cat(advantages)

    states_with_time = torch.cat(states_with_time)

    return advantages, states_with_time


### Define update_value_fun() 

TASK: Add L2 regularization to the value function loss. Use torch.sum() and torch.pow() or torch.square().


In [None]:
%%add_to TRPO
def update_value_fun(self, states, q_vals):
    '''
    Fit the value function to the given targets

    Runs self.vf_iters optimizer steps on the loss
    mse_loss(V(states), q_vals) + vf_l2_reg_coef * sum(w^2 over all parameters).

    Parameters
    ----------
    states : torch.FloatTensor
        the inputs of the value function

    q_vals : torch.FloatTensor
        the regression targets, one per row of states
    '''
    self.value_fun.train()

    inputs = states.to(self.device)
    targets = q_vals.to(self.device)

    def closure():
        # Re-evaluates loss and gradients; optimizers that take a closure
        # may call it more than once per step.
        self.value_optimizer.zero_grad()

        predictions = self.value_fun(inputs).view(-1)
        weights = torch.cat([w.view(-1) for w in self.value_fun.parameters()])
        penalty = self.vf_l2_reg_coef * torch.sum(torch.pow(weights, 2))

        total = self.mse_loss(predictions, targets) + penalty
        total.backward()

        return total

    for _ in range(self.vf_iters):
        self.value_optimizer.step(closure)


### Define Hessian closure function 


$H_f=\begin{pmatrix} \frac{\partial^2 f}{\partial x_1^2} & \frac{\partial^2 f}{\partial x_1 \partial x_2} & \cdots & \frac{\partial^2 f}{\partial x_1 \partial x_n} \\ \frac{\partial^2 f}{\partial x_2 \partial x_1} & \frac{\partial^2 f}{\partial x_2^2} & \cdots & \frac{\partial^2 f}{\partial x_2 \partial x_n} \\ \vdots & \vdots & \ddots & \vdots \\ \frac{\partial^2 f}{\partial x_n \partial x_1} & \frac{\partial^2 f}{\partial x_n \partial x_2} & \cdots & \frac{\partial^2 f}{\partial x_n^2}\end{pmatrix}$


This function is used by the conjugate gradient algorithm in order to find the maximum of the surrogate function.


In [None]:
%%add_to TRPO
def get_Hvp_fun(self,functional_output, inputs, damping_coef=0.0):
    '''
    Build a function that computes Hessian-vector products for the Hessian of
    functional_output w.r.t. inputs, without ever forming the Hessian itself

    Parameters
    ----------
    functional_output : torch.FloatTensor (with requires_grad=True)
        the output of the function of which the Hessian is calculated

    inputs : iterable of torch.FloatTensor
        the inputs w.r.t. which the Hessian is calculated

    damping_coef : float
        the multiple of the identity matrix to be added to the Hessian

    Returns
    -------
    Hvp_fun : callable
        Hvp_fun(v, retain_graph=True) returns (H + damping_coef * I) v
    '''

    # Materialize once: `inputs` may be a generator (e.g. module.parameters())
    # and is differentiated against on every call of the closure.
    params = list(inputs)

    # The first derivative keeps its graph so that it can be differentiated again.
    first_order = flat_grad(functional_output, params, create_graph=True)

    def Hvp_fun(v, retain_graph=True):
        # d/dx (grad . v) = H v for a constant vector v
        directional = torch.matmul(first_order, v)

        product = flat_grad(directional, params, retain_graph=retain_graph)
        product += damping_coef * v

        return product

    return Hvp_fun

### Define update_policy() 




In [None]:
%%add_to TRPO
def update_policy(self, states, actions, advantages):
    '''
    Perform one policy update: compute a search direction with the conjugate
    gradient solver, then pick a step length with a backtracking line search
    and apply the resulting step to the policy parameters

    Parameters
    ----------
    states : torch.FloatTensor
        the states fed to the policy

    actions : torch.FloatTensor
        the actions taken in those states

    advantages : torch.FloatTensor
        the advantage estimate of each (state, action) pair
    '''
    self.policy.train()

    states = states.to(self.device)
    actions = actions.to(self.device)
    advantages = advantages.to(self.device)

    action_dists = self.policy(states)
    log_action_probs = action_dists.log_prob(actions)

    # Surrogate loss at the current parameters: numerator and (detached)
    # denominator are the same log-probabilities.
    loss = self.surrogate_loss(log_action_probs, log_action_probs.detach(), advantages)
    
    # Flattened gradient of the surrogate loss w.r.t. the policy parameters;
    # the graph is retained because it is reused for the KL term below.
    grads = torch.autograd.grad(loss, self.policy.parameters(), retain_graph=True, create_graph=False)
    flat_grads =  torch.cat([v.view(-1) for v in grads])
    loss_grad=flat_grads

    # KL of the current action distribution against a detached copy of itself;
    # only its second derivative is used (via get_Hvp_fun).
    mean_kl = mean_kl_first_fixed(action_dists, action_dists)

    Fvp_fun = self.get_Hvp_fun(mean_kl, self.policy.parameters())
    # Approximate solution of F * search_dir = loss_grad
    search_dir = cg_solver(Fvp_fun, loss_grad, self.cg_max_iters)

    # First-order estimate of the loss change for a unit step along search_dir
    expected_improvement = torch.matmul(loss_grad, search_dir)

    def constraints_satisfied(step, beta):
        # Temporarily apply the candidate step, evaluate it, then undo it.
        apply_update(self.policy, step)

        with torch.no_grad():
            new_action_dists = self.policy(states)
            new_log_action_probs = new_action_dists.log_prob(actions)

            new_loss = self.surrogate_loss(new_log_action_probs, log_action_probs, advantages)
            mean_kl = mean_kl_first_fixed(action_dists, new_action_dists)
             

        # Measured change of the surrogate loss relative to the first-order
        # prediction for step length beta.
        actual_improvement = new_loss - loss
        improvement_ratio = actual_improvement / (expected_improvement * beta)

        apply_update(self.policy, -step)

        # Accept only if the loss really improved by a sufficient fraction of
        # the prediction and the KL constraint holds.
        surrogate_cond = improvement_ratio >= self.line_search_accept_ratio and actual_improvement > 0.0
        kl_cond = mean_kl <= self.max_kl_div

        return surrogate_cond and kl_cond

    # get_max_step_len is defined outside this notebook (presumably the largest
    # step along search_dir that satisfies the KL bound under a quadratic
    # approximation -- TODO confirm in trpo.py).
    max_step_len = self.get_max_step_len(search_dir, Fvp_fun, self.max_kl_div, retain_graph=True)
    step_len = line_search(search_dir, max_step_len, constraints_satisfied)

    opt_step = step_len * search_dir
    apply_update(self.policy, opt_step)

###  Define surrogate loss  

Implement the following formula for surrogate loss function:  


$\mathbb{E}_{a \sim  q}\left[ \frac{\pi_{\theta}(a|s_n)}{q(a|s_n)} A_{\theta_{old}}(s_n,a) \right]$

Take into account that the program works with log-probabilities, so you need to use torch.exp() to turn their difference into the probability ratio.

In [None]:
%%add_to TRPO
def surrogate_loss(self, log_action_probs, imp_sample_probs, advantages):
    '''
    Importance-sampled surrogate objective

    Parameters
    ----------
    log_action_probs : torch.FloatTensor
        log-probabilities of the actions under the policy being evaluated

    imp_sample_probs : torch.FloatTensor
        log-probabilities of the same actions under the sampling policy

    advantages : torch.FloatTensor
        the advantage estimate of each action

    Returns
    -------
    the mean of ratio * advantage, where ratio = exp(log_action_probs - imp_sample_probs)
    '''
    # A difference of logs, exponentiated, is the ratio of probabilities.
    ratio = (log_action_probs - imp_sample_probs).exp()
    weighted = ratio * advantages
    return weighted.mean()


## Train it!

You may increase the number of training episodes (n_episodes) to get a smarter bot

In [None]:
from layers import build_diag_gauss_policy, build_mlp

In [None]:
# Build the policy, value function, simulator and TRPO agent, then train.
# The names defined here (model_name, env, policy, ...) are cell-level globals;
# the render cell below reads model_name.
if config_use_trpo:

    # Initialize the simulator
    env_name = config_env_name #config['env_name']
    n_episodes=1000 
    n_trajectories = 50 
    max_timesteps = 1000



    continue_from_file = False
    device = get_device()


    env = gym.make(env_name)
    action_space = env.action_space
    observation_space = env.observation_space
    policy_hidden_dims = [64, 64] 
    vf_hidden_dims = [64, 64]     
    # +1 input: get_advantages() appends a time feature to every state
    vf_args = (observation_space.shape[0] + 1, vf_hidden_dims, 1)

    policy_args = (observation_space.shape[0], policy_hidden_dims, action_space.shape[0])
    # build_diag_gauss_policy - implements a layer that outputs a Gaussian distribution with a diagonal
    # covariance matrix
    policy = build_diag_gauss_policy(*policy_args)


    # Initialize the value function
    # build_mlp - build a multilayer perceptron (mlp) with tanh activations with the specified input,
    # output, and hidden layer sizes
    value_fun = build_mlp(*vf_args)
    policy.to(device)
    value_fun.to(device)



    # No extra keyword arguments are passed to the Simulator
    env_args = {}


    # Initialize the state transformation
    # ZFilter - a z-scoring filter
    z_filter = ZFilter()

    # Bound - implements a bounding function
    state_bound = Bound(-5, 5)

    # Transform - composes several transformations and applies them sequentially
    state_filter = Transform(state_bound, z_filter)


    # It uses 50 gym environment objects at once (n_trajectories).
    simulator = Simulator(env_name, policy, n_trajectories,
                          max_timesteps, state_filter=state_filter,
                          **env_args)
    # No extra keyword arguments: TRPO runs with its default hyperparameters
    trpo_args = {}
    model_name='env1'
    trpo = TRPO(policy, value_fun, simulator, model_name=model_name,
                continue_from_file=continue_from_file, **trpo_args)

    print(f'TRPO Starting training policy {model_name} on {env_name} environment...\n')

    # Stops at n_episodes or at the config_train_time_minutes limit
    trpo.train(n_episodes)

    print('\nTRPO Training complete.\n')


## Render result. Run this cell in order to view policy in action.

In [None]:
# Render result: load the saved TRPO policy, roll it out for one episode and
# write the rendered frames to an animated PNG.
# Relies on `model_name` defined by the training cell above.
if config_use_trpo:
    max_timesteps = 1000
    run_speed= 1

    all_configs = {} 
    config = {} 

    device = get_device()

    env_name = config_env_name
    env = gym.make(env_name)
    env._max_episode_steps = 1000

    
    action_space = env.action_space
    observation_space = env.observation_space
    # Must match the architecture used for training, otherwise
    # load_state_dict() below fails.
    policy_hidden_dims = [64,64] 

    policy_args = (observation_space.shape[0], policy_hidden_dims, action_space.shape[0])
    policy = build_diag_gauss_policy(*policy_args)

    session_dir = config_save_dir 
    load_path = os.path.join(config_pg_folder_path, session_dir, model_name + '.pt')

    # Without CUDA, remap all stored tensors to the CPU while loading.
    if device.type == 'cuda':
        ckpt = torch.load(load_path)
    else:
        ckpt = torch.load(load_path, map_location='cpu')

    policy.load_state_dict(ckpt['policy_state_dict'])
    policy.to(device)
    # The checkpoint also stores the state filter object (presumably the one
    # fitted during training -- verify in trpo.py).
    state_filter = ckpt['state_filter']

    # Adjust the camera angle
    # env.viewer.cam.lookat[0], env.viewer.cam.lookat[1], env.viewer.cam.lookat[2] = [0, -1.0, 1.15]
    # env.viewer.cam.azimuth = 0
    # env.viewer.cam.elevation = -30
    # env.viewer.cam.fixedcamid = 0

    # Run the simulation
    policy.eval()
    state = env.reset()
    done = False

    images = []
    step=0

    while not done:
        # The filter is applied on the CPU tensor before moving it to `device`.
        state = torch.tensor(state).float()
        state = state_filter(state).to(device)

        # Actions are sampled from the policy distribution (stochastic rollout).
        action_dist = policy(state)
        action = action_dist.sample().cpu()

        state, reward, done, _ = env.step(action.numpy())
        # env.render()
        img = env.render(mode='rgb_array')
        images.append(img)
        step +=1
    #    print(step)
        # Hard cap on the episode length
        if step>1000:
            done=True

    env.close()

    numpngw.write_apng(f"{config_pg_folder_path}/anim_trpo.png", images, delay=10)
    # NOTE: this expression is nested inside the `if` block, so it is not the
    # cell's final top-level expression; the next cell displays the same file.
    IPython.display.Image(filename=f"{config_pg_folder_path}/anim_trpo.png")

In [None]:
IPython.display.Image(filename=f"{config_pg_folder_path}/anim_trpo.png")

# \-------------------------------------------


# PPO algorithm

## Prepare PPO training 1. Define PPO helper classes.

### class RolloutBuffer

Used to keep a trajectory

In [None]:
class RolloutBuffer:
    """Parallel lists holding the transitions collected since the last update.

    Index i of every list refers to the same time step.
    """

    # Names of the per-step lists, in the order they are created.
    _FIELDS = ('actions', 'states', 'logprobs', 'rewards', 'is_terminals')

    def __init__(self):
        for field in self._FIELDS:
            setattr(self, field, [])

    def clear(self):
        """Empty every list in place (existing references stay valid)."""
        for field in self._FIELDS:
            getattr(self, field).clear()


### class ActorCritic 

In [None]:

class ActorCritic(torch.nn.Module):
    """Actor and critic networks for a continuous-action PPO agent.

    The actor maps a state to the mean of the action distribution (squashed
    to [-1, 1] by a final tanh); the critic maps a state to a scalar value.
    The action variance is not learned: it is a fixed vector kept in
    ``action_var`` and changed only through ``set_action_std``.

    Relies on the notebook-level global ``device``.
    """

    def __init__(self, state_dim, action_dim,   action_std_init):
        super().__init__()

        self.action_dim = action_dim
        # Per-dimension variance (std squared) of the action distribution
        self.action_var = torch.full((action_dim,), action_std_init * action_std_init).to(device)

        # The actor is built before the critic so that parameter
        # initialization consumes the random stream in the same order.
        self.actor = self._tanh_mlp(state_dim, action_dim, squash_output=True)
        self.critic = self._tanh_mlp(state_dim, 1, squash_output=False)

    @staticmethod
    def _tanh_mlp(in_dim, out_dim, squash_output):
        # Two hidden tanh layers of width 64, optionally a tanh on the output.
        layers = [
            nn.Linear(in_dim, 64),
            nn.Tanh(),
            nn.Linear(64, 64),
            nn.Tanh(),
            nn.Linear(64, out_dim),
        ]
        if squash_output:
            layers.append(nn.Tanh())
        return nn.Sequential(*layers)

    def set_action_std(self, new_action_std):
        """Replace the fixed action variance with ``new_action_std`` squared."""
        self.action_var = torch.full((self.action_dim,), new_action_std * new_action_std).to(device)

    def forward(self):
        # Not used: callers go through act() and evaluate() instead.
        raise NotImplementedError


#### add act() method to ActorCritic class 

In [None]:
%%add_to ActorCritic
def act(self, state):
    """Sample an action for ``state`` from the current Gaussian policy.

    Returns the sampled action and its log-probability, both detached from
    the autograd graph. The covariance matrix gets a leading axis of size 1,
    so both results carry that leading axis as well.
    """
    mean = self.actor(state)
    # Diagonal covariance built from the fixed per-dimension variance
    covariance = torch.diag(self.action_var).unsqueeze(dim=0)
    policy_dist = torch.distributions.MultivariateNormal(mean, covariance)

    sampled = policy_dist.sample()
    sampled_logprob = policy_dist.log_prob(sampled)

    return sampled.detach(), sampled_logprob.detach()

#### add evaluate() method to ActorCritic class 

In [None]:
%%add_to ActorCritic
def evaluate(self, state, action):
    """Score previously taken actions under the current actor and critic.

    Returns a tuple ``(action_logprobs, state_values, dist_entropy)``: the
    log-probability of each action, the critic output for each state and the
    entropy of each action distribution.
    """
    mean = self.actor(state)

    # Broadcast the fixed variance vector to the shape of the means and turn
    # each row into a diagonal covariance matrix.
    variance = self.action_var.expand_as(mean)
    covariance = torch.diag_embed(variance).to(device)
    policy_dist = torch.distributions.MultivariateNormal(mean, covariance)

    # For single-action environments make sure the action keeps a trailing axis.
    if self.action_dim == 1:
        action = action.reshape(-1, self.action_dim)

    return policy_dist.log_prob(action), self.critic(state), policy_dist.entropy()

## Prepare PPO training 2. Define PPO class 

In [None]:
device = get_device()
 

################################## PPO Policy ##################################



class PPO:
    """PPO agent: a trainable policy, a frozen copy used for acting, a buffer.

    ``policy`` is the network being optimized; ``policy_old`` holds the same
    architecture initialized with the same weights, and is what ``save`` and
    ``load`` persist. Relies on the notebook-level global ``device``.
    """

    def __init__(self, state_dim, action_dim, lr_actor, lr_critic, gamma, K_epochs, eps_clip,  action_std_init=0.6):
        # Current standard deviation of the action distribution
        self.action_std = action_std_init

        self.gamma = gamma        # discount factor
        self.eps_clip = eps_clip  # clip parameter
        self.K_epochs = K_epochs  # optimization epochs per update
        
        self.buffer = RolloutBuffer()

        self.policy = ActorCritic(state_dim, action_dim, action_std_init).to(device)
        # One optimizer, separate learning rates for actor and critic
        self.optimizer = torch.optim.Adam([
                        {'params': self.policy.actor.parameters(), 'lr': lr_actor},
                        {'params': self.policy.critic.parameters(), 'lr': lr_critic}
                    ])

        self.policy_old = ActorCritic(state_dim, action_dim,  action_std_init).to(device)
        self.policy_old.load_state_dict(self.policy.state_dict())
        
        self.MseLoss = torch.nn.MSELoss()


    def set_action_std(self, new_action_std):
        """Set the action standard deviation on both networks."""
        self.action_std = new_action_std
        self.policy.set_action_std(new_action_std)
        self.policy_old.set_action_std(new_action_std)


    def decay_action_std(self, action_std_decay_rate, min_action_std):
        """Lower the action std by ``action_std_decay_rate``, not below ``min_action_std``."""
        print("--------------------------------------------------------------------------------------------")

        self.action_std = self.action_std - action_std_decay_rate
        # Rounding avoids accumulating floating-point noise across decays
        self.action_std = round(self.action_std, 4)
        if (self.action_std <= min_action_std):
            self.action_std = min_action_std
            print("setting actor output action_std to min_action_std : ", self.action_std)
        else:
            print("setting actor output action_std to : ", self.action_std)
        self.set_action_std(self.action_std)


        print("--------------------------------------------------------------------------------------------")
    
    def save(self, checkpoint_path):
        """Write the weights of ``policy_old`` to ``checkpoint_path``."""
        torch.save(self.policy_old.state_dict(), checkpoint_path)
   

    def load(self, checkpoint_path):
        """Load weights from ``checkpoint_path`` into both networks."""
        # Read the file once; the map_location keeps tensors on the CPU and
        # load_state_dict() copies the values into each network's own
        # parameters, so the two networks can share the loaded dict.
        state_dict = torch.load(checkpoint_path, map_location=lambda storage, loc: storage)
        self.policy_old.load_state_dict(state_dict)
        self.policy.load_state_dict(state_dict)

### Define select_action() 

In [None]:
%%add_to PPO
def select_action(self, state):
    """Sample an action from ``policy_old`` and record the step in the buffer.

    The state tensor, the sampled action and its log-probability are appended
    to the rollout buffer; the action is returned as a flat NumPy array.
    """
    with torch.no_grad():
        state_tensor = torch.FloatTensor(state).to(device)
        action, action_logprob = self.policy_old.act(state_tensor)

    rollout = self.buffer
    rollout.states.append(state_tensor)
    rollout.actions.append(action)
    rollout.logprobs.append(action_logprob)

    # act() returns the action with a leading axis; flatten it for env.step()
    action_array = action.detach().cpu().numpy()
    return action_array.flatten()



### Define update_policy()[[*YOUR CODE HERE*]]

 

$L^{CLIP}(\theta)=\hat{\mathbb{E}}_t \left[ \min \left( r_t(\theta)\hat{A}_t,\ \operatorname{clip}\left(r_t(\theta),\,1-\epsilon,\,1+\epsilon \right) \hat{A}_t \right)\right]$

In [None]:
%%add_to PPO
def update_policy(self):
    """Run one PPO update on the transitions stored in ``self.buffer``.

    Computes discounted returns, optimizes ``self.policy`` for
    ``self.K_epochs`` epochs, copies the result into ``self.policy_old`` and
    clears the buffer.

    The clipped surrogate term ``L_CLIP`` is the exercise placeholder and is
    left for you to implement.
    """

    # Monte Carlo estimate of returns, accumulated backwards in time;
    # the running sum restarts at every terminal step.
    rewards = []
    discounted_reward = 0
    for reward, is_terminal in zip(reversed(self.buffer.rewards), reversed(self.buffer.is_terminals)):
        if is_terminal:
            discounted_reward = 0
        discounted_reward = reward + (self.gamma * discounted_reward)
        rewards.insert(0, discounted_reward)
        
    # Normalizing the returns (the small constant guards against a zero std)
    rewards = torch.tensor(rewards, dtype=torch.float32).to(device)
    rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-7)

    # Convert the buffered lists to tensors; squeeze drops the size-1 axes
    # (e.g. the leading axis that act() adds to actions and log-probs).
    old_states = torch.squeeze(torch.stack(self.buffer.states, dim=0)).detach().to(device)
    old_actions = torch.squeeze(torch.stack(self.buffer.actions, dim=0)).detach().to(device)
    old_logprobs = torch.squeeze(torch.stack(self.buffer.logprobs, dim=0)).detach().to(device)

    
    # Optimize policy for K epochs
    for _ in range(self.K_epochs):

        # Evaluating old actions and values with the current policy
        logprobs, state_values, dist_entropy = self.policy.evaluate(old_states, old_actions)

        # match state_values tensor dimensions with rewards tensor
        state_values = torch.squeeze(state_values)
        
        # Finding the ratio (pi_theta / pi_theta_old)
        
        # exp of a log-probability difference is the probability ratio
        ratios = torch.exp(logprobs - old_logprobs.detach())

        # Advantage estimate: normalized return minus the (detached) value
        advantages = rewards - state_values.detach()

        ## [[Your code here]]
        ## Implement the clipped surrogate loss function.
        ## Take the epsilon value from self.eps_clip
        ## r=ratios
        ## A=advantages
        ## use the torch.min() and torch.clamp() functions

        L_CLIP = 0 #[[Your code here]]
        # Total loss: negated surrogate + 0.5 * value MSE - 0.01 * entropy
        # bonus. dist_entropy has one entry per sample, so `loss` is a vector
        # that is reduced with .mean() below.
        loss = -L_CLIP + 0.5*self.MseLoss(state_values, rewards) - 0.01*dist_entropy
        
        # take gradient step
        self.optimizer.zero_grad()
        loss.mean().backward()
        self.optimizer.step()
        
    # Copy new weights into old policy
    self.policy_old.load_state_dict(self.policy.state_dict())

    # clear buffer
    self.buffer.clear()

## Train PPO!

In [None]:
####### Initialize PPO environment hyperparameters ######
# All names below are cell-level globals read by the training cell.


env_name = config_env_name

has_continuous_action_space = True  # continuous action space; else discrete

max_ep_len = 500                   # max timesteps in one episode
max_training_timesteps = int(3e6)   # break training loop if timesteps > max_training_timesteps

print_freq = max_ep_len * 10        # print avg reward in the interval (in num timesteps)
log_freq = max_ep_len * 2           # log avg reward in the interval (in num timesteps)
save_model_freq = int(1e5)          # save model frequency (in num timesteps)

action_std = 0.6                    # starting std for action distribution (Multivariate Normal)
action_std_decay_rate = 0.05        # linearly decay action_std (action_std = action_std - action_std_decay_rate)
min_action_std = 0.1                # minimum action_std (stop decay after action_std <= min_action_std)
action_std_decay_freq = int(2.5e5)  # action_std decay frequency (in num timesteps)

#####################################################


## Note : print/log frequencies should be greater than max_ep_len


################ PPO hyperparameters ################

update_timestep = max_ep_len * 4      # update policy every n timesteps
K_epochs = 80               # update policy for K epochs in one PPO update

eps_clip = 0.2          # clip parameter for PPO
gamma = 0.99            # discount factor

lr_actor = 0.0003       # learning rate for actor network
lr_critic = 0.001       # learning rate for critic network

random_seed = 0         # set random seed if required (0 = no random seed)

#####################################################



In [None]:

################################### Train PPO ###################################

def train_PPO():
    """Train a PPO agent on ``env_name`` until a step or wall-clock budget is hit.

    Everything is configured through module-level names set in earlier cells:
    the PPO hyperparameters (``max_ep_len``, ``update_timestep``,
    ``action_std``, ...), ``config_train_time_minutes`` and
    ``config_pg_folder_path``.

    Side effects:
        * writes a CSV log (episode, timestep, average reward) under
          ``PPO_logs/<env_name>/``, one new file per run;
        * saves the agent to ``<config_pg_folder_path>/PPO.pth`` every
          ``save_model_freq`` timesteps and once when the time budget expires;
        * prints progress and refreshes the reward plot via ``show_plot``.

    The loop condition is evaluated between episodes, so training stops after
    the episode in which ``max_training_timesteps`` is exceeded, or one episode
    after the time budget has been detected as exhausted.
    """

    print("============================================================================================")

    print("training environment name : " + env_name)

    env = gym.make(env_name)

    # state space dimension
    state_dim = env.observation_space.shape[0]

    # action space dimension
    action_dim = env.action_space.shape[0]

    ###################### logging ######################

    # One CSV per run: the run number is the count of files already present in
    # the log directory, so earlier logs are normally not overwritten.
    log_dir = "PPO_logs" + '/' + env_name + '/'
    os.makedirs(log_dir, exist_ok=True)

    run_num = len(next(os.walk(log_dir))[2])

    log_f_name = log_dir + '/PPO_' + env_name + "_log_" + str(run_num) + ".csv"

    print("current logging run number for " + env_name + " : ", run_num)
    print("logging at : " + log_f_name)

    #####################################################

    ################### checkpointing ###################

    # Single checkpoint file, overwritten on every save. Report the path that
    # is really written instead of an unrelated, never-used location.
    checkpoint_path = f'{config_pg_folder_path}/PPO.pth'
    print("save checkpoint path : " + checkpoint_path)

    #####################################################

    if random_seed:
        print("--------------------------------------------------------------------------------------------")
        print("setting random seed to ", random_seed)
        torch.manual_seed(random_seed)
        env.seed(random_seed)
        np.random.seed(random_seed)

    #####################################################

    print("============================================================================================")

    ################# training procedure ################

    # initialize a PPO agent
    ppo_agent = PPO(state_dim, action_dim, lr_actor, lr_critic, gamma, K_epochs, eps_clip,  action_std)

    # track total training time (one reference point for the time limit,
    # the periodic "Elapsed Time" print and the final summary)
    start_time = dt.now().replace(microsecond=0)
    print("Started training at (GMT) : ", start_time)

    print("============================================================================================")

    # printing and logging variables
    print_running_reward = 0
    print_running_episodes = 0

    log_running_reward = 0
    log_running_episodes = 0

    time_step = 0
    i_episode = 0
    time_elapsed = False
    reward_log = []
    time_limit_seconds = config_train_time_minutes * 60

    # logging file
    log_f = open(log_f_name, "w+")
    try:
        log_f.write('episode,timestep,reward\n')

        # training loop
        while time_step <= max_training_timesteps and not time_elapsed:
            # total_seconds() covers the whole duration; the `.seconds`
            # attribute drops full days and would wrap for budgets >= 24 h.
            if (dt.now() - start_time).total_seconds() > time_limit_seconds:
                ppo_agent.save(checkpoint_path)
                print('training time elapsed')
                # the flag ends the loop after this last episode
                time_elapsed = True

            state = env.reset()
            current_ep_reward = 0

            for _ in range(1, max_ep_len + 1):  # stops early on done

                # select action with policy
                action = ppo_agent.select_action(state)
                state, reward, done, _info = env.step(action)

                # saving reward and is_terminals
                ppo_agent.buffer.rewards.append(reward)
                ppo_agent.buffer.is_terminals.append(done)

                time_step += 1
                current_ep_reward += reward

                # update PPO agent
                if time_step % update_timestep == 0:
                    ppo_agent.update_policy()

                # periodically decay the action std of the output action distribution
                if time_step % action_std_decay_freq == 0:
                    ppo_agent.decay_action_std(action_std_decay_rate, min_action_std)

                # log in logging file
                if time_step % log_freq == 0:
                    # average reward over the episodes finished since the last
                    # log entry; skip the entry if none finished (avoids a
                    # division by zero when log_freq is set too small)
                    if log_running_episodes > 0:
                        log_avg_reward = round(log_running_reward / log_running_episodes, 4)
                        log_f.write('{},{},{}\n'.format(i_episode, time_step, log_avg_reward))
                        log_f.flush()

                    log_running_reward = 0
                    log_running_episodes = 0

                # printing average reward
                if time_step % print_freq == 0:
                    # same guard as for the log file
                    if print_running_episodes > 0:
                        print_avg_reward = round(print_running_reward / print_running_episodes, 2)
                        print("PPO Episode : {} \t\t Timestep : {} \t\t Average Reward : {}".format(i_episode, time_step, print_avg_reward))

                    print_running_reward = 0
                    print_running_episodes = 0

                # save model weights
                if time_step % save_model_freq == 0:
                    print("--------------------------------------------------------------------------------------------")
                    ppo_agent.save(checkpoint_path)
                    print("Elapsed Time  : ", dt.now().replace(microsecond=0) - start_time)
                    print("--------------------------------------------------------------------------------------------")
                    show_plot(reward_log, config_pg_folder_path, "PPO")

                # break; if the episode is over
                if done:
                    break

            reward_log.append([i_episode, current_ep_reward])
            print_running_reward += current_ep_reward
            print_running_episodes += 1

            log_running_reward += current_ep_reward
            log_running_episodes += 1

            i_episode += 1
            if time_step % print_freq == 0 or time_step % save_model_freq == 0 or time_elapsed:
                show_plot(reward_log, config_pg_folder_path, "PPO")
    finally:
        # release the log file and the environment even if training is
        # interrupted or fails part-way through
        log_f.close()
        env.close()

    # print total training time
    print("============================================================================================")
    end_time = dt.now().replace(microsecond=0)
    print("Started training at (GMT) : ", start_time)
    print("Finished training at (GMT) : ", end_time)
    print("Total training time  : ", end_time - start_time)
    print("============================================================================================")


# Run PPO training only when it is enabled in the global configuration cell.
if config_use_ppo:
    train_PPO()

## Render result. Run this cell to view the trained PPO policy in action.

In [None]:
def test_PPO():
    """Roll out the saved PPO policy, print its rewards and save an animation.

    Loads the agent from ``<config_pg_folder_path>/PPO.pth``, runs
    ``total_test_episodes`` episodes of at most ``max_ep_len`` steps in
    ``config_env_name``, collects one ``rgb_array`` frame per step and writes
    all frames to ``<config_pg_folder_path>/anim_PPO.png`` as an animated PNG.

    Returns nothing; results are printed and written to disk.
    """

    print("============================================================================================")

    env_name = config_env_name
    max_ep_len = 300           # max timesteps in one episode
    action_std = 0.1            # set same std for action distribution which was used while saving

    total_test_episodes = 1    # total num of testing episodes

    # Required by the PPO constructor; this function never updates the policy.
    K_epochs = 80               # update policy for K epochs
    eps_clip = 0.2              # clip parameter for PPO
    gamma = 0.99                # discount factor

    lr_actor = 0.0003           # learning rate for actor
    lr_critic = 0.001           # learning rate for critic

    #####################################################

    env = gym.make(env_name)
    try:
        # state space dimension
        state_dim = env.observation_space.shape[0]

        # action space dimension
        action_dim = env.action_space.shape[0]

        # initialize a PPO agent
        ppo_agent = PPO(state_dim, action_dim, lr_actor, lr_critic, gamma, K_epochs, eps_clip,  action_std)

        print("loading network from PPO.pth " )
        ppo_agent.load(f"{config_pg_folder_path}/PPO.pth")

        print("--------------------------------------------------------------------------------------------")

        test_running_reward = 0
        images = []
        for ep in range(1, total_test_episodes + 1):
            ep_reward = 0
            state = env.reset()

            for _ in range(1, max_ep_len + 1):
                action = ppo_agent.select_action(state)
                state, reward, done, _info = env.step(action)
                ep_reward += reward
                # keep every rendered frame for the animation
                images.append(env.render(mode='rgb_array'))
                if done:
                    break

            # clear buffer (presumably filled by select_action; no update follows here)
            ppo_agent.buffer.clear()

            test_running_reward += ep_reward
            print('Episode: {} \t\t Reward: {}'.format(ep, round(ep_reward, 2)))
    finally:
        # close the environment even if loading or the rollout fails
        env.close()

    print("============================================================================================")

    avg_test_reward = round(test_running_reward / total_test_episodes, 2)
    print("average test reward : " + str(avg_test_reward))

    print("============================================================================================")

    numpngw.write_apng(f'{config_pg_folder_path}/anim_PPO.png', images, delay=10)



# Evaluate the trained PPO policy and show the resulting animation, but only
# when PPO is enabled in the global configuration cell.
if config_use_ppo:
    test_PPO()
    # A bare expression nested inside an `if` body is not auto-displayed by the
    # notebook (only a cell's last top-level expression is), so pass the image
    # to display() explicitly.
    IPython.display.display(IPython.display.Image(filename=f'{config_pg_folder_path}/anim_PPO.png'))

In [None]:
# Show the animated PNG that test_PPO() wrote; as the last expression of the
# cell, the Image object is rendered inline by the notebook.
IPython.display.Image(filename=f'{config_pg_folder_path}/anim_PPO.png')