In [1]:
#All imports here
## Feel free to add or remove

import os
import random
import time
from tqdm import tqdm
import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions.categorical import Categorical

In [2]:
# Hyperparameters

# --- Experiment setup ---
gym_id = "CartPole-v1"        # Id of the gym environment
learning_rate = 0.0001        # Learning rate handed to the optimizer
seed = 1                      # Seed for the random number generators
total_timesteps = 10000       # Total environment steps for the experiment
torch_deterministic = True    # Value assigned to torch.backends.cudnn.deterministic
cuda = True                   # Use CUDA when it is available

# --- Rollout collection ---
num_envs = 4                  # Number of parallel environments (PPO works with vectorized environments)
num_steps = 128               # Steps run in each environment per policy rollout

# --- Optimization ---
anneal_lr = True              # Toggle learning-rate annealing for the policy and value networks
gae = True                    # Use GAE for advantage computation
gamma = 0.99                  # Discount factor
gae_lambda = 0.95             # Lambda for generalized advantage estimation
num_minibatches = 4           # Number of minibatches each rollout batch is split into
update_epochs = 10            # The K epochs used to update the policy
norm_adv = True               # Toggle advantage normalization
clip_coef = 0.2               # Surrogate clipping coefficient (see what the paper recommends)
clip_vloss = True             # Toggle the clipped value-function loss, as per the paper
ent_coef = 0.001              # Entropy coefficient
vf_coef = 0.1                 # Value-function coefficient
max_grad_norm = 1             # Maximum gradient norm used for clipping
target_kl = None              # Target KL-divergence threshold (None disables it)

# --- Derived sizes ---
batch_size = num_envs * num_steps
minibatch_size = batch_size // num_minibatches


In [3]:
#PPO works with vectorized environments, so this helper builds one.
#See the gymnasium documentation on how vectorized environments are created.
def make_env(gym_id, seed):
    """Create a vectorized environment and seed it with an initial reset.

    Args:
        gym_id: Environment id passed to ``gym.make_vec``.
        seed: Seed forwarded to the environment's first ``reset`` call.

    Returns:
        The vectorized environment holding ``num_envs`` (module-level
        hyperparameter) copies of ``gym_id``.
    """
    vec_env = gym.make_vec(gym_id, num_envs=num_envs)
    # The observations from this seeding reset are discarded on purpose.
    vec_env.reset(seed=seed)
    return vec_env

In [4]:
def orthogonal_init(tensor, gain=1):
    """Fill ``tensor`` in place with a (semi-)orthogonal matrix scaled by ``gain``.

    The tensor is viewed as a ``(rows, cols)`` matrix, where ``rows`` is its
    first dimension and ``cols`` is the number of elements in one slice along
    that dimension. A Gaussian matrix of that shape is decomposed with an SVD
    and one of its orthonormal factors is copied into ``tensor``.

    Args:
        tensor: Tensor with at least two dimensions; modified in place.
        gain: Multiplier applied to the tensor after the orthogonal fill.
    """
    assert tensor.ndimension() >= 2
    n_rows, n_cols = tensor.size(0), tensor[0].numel()
    is_wide = n_rows < n_cols

    noise = tensor.new(n_rows, n_cols).normal_(0, 1)
    if is_wide:
        # Decompose the tall orientation so the reduced factor has the
        # required number of orthonormal vectors.
        noise.t_()
    left, _, right = torch.svd(noise, some=True)
    if is_wide:
        left.t_()

    # Use whichever factor matches the target (rows, cols) shape.
    basis = left if tuple(left.shape) == (n_rows, n_cols) else right
    with torch.no_grad():
        tensor.view_as(basis).copy_(basis)
        tensor.mul_(gain)

In [5]:
#PPO implementations initialize their layers orthogonally.
#This helper applies that initialization to a single layer.
def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
    """Orthogonally initialize a layer's weight and set its bias to a constant.

    Modules that do not expose both ``weight`` and ``bias`` attributes (for
    example activation modules) are left untouched. A layer whose ``bias`` is
    ``None`` (e.g. ``nn.Linear(..., bias=False)``) gets its weight initialized
    and the bias step skipped instead of raising.

    Args:
        layer: Module to initialize in place.
        std: Gain passed to ``orthogonal_init`` for the weight.
        bias_const: Value every bias element is set to.

    Returns:
        None; the layer is modified in place.
    """
    if not (hasattr(layer, 'weight') and hasattr(layer, 'bias')):
        return
    if layer.bias is not None:
        layer.bias.data.fill_(bias_const)
    orthogonal_init(layer.weight.data, gain=std)

In [6]:
def get_path_indices(dones):
    """Split each actor's timeline into contiguous segments.

    NOTE(review): despite the parameter name, an entry equal to 0 is what
    *ends* a segment here, so the argument behaves like a continuation
    (not-done) mask.

    Args:
        dones: 2-D array-like indexed as ``[actor, timestep]``.

    Returns:
        List of ``(actor, start, end)`` tuples with ``end`` exclusive. A
        segment closes right after every timestep whose entry is 0, and any
        trailing timesteps form a final segment.
    """
    n_actors, horizon = dones.shape[0], dones.shape[1]
    segments = []
    for actor in range(n_actors):
        seg_start = 0
        for t in range(horizon):
            if dones[actor, t] != 0.:
                continue
            # Entry is 0: the segment ends at t (inclusive).
            segments.append((actor, seg_start, t + 1))
            seg_start = t + 1
        if seg_start != horizon:
            segments.append((actor, seg_start, horizon))
    return segments


def discount_path(path, h):
    """Compute discounted suffix sums of ``path``.

    Element ``i`` of the result equals
    ``path[i] + h * path[i + 1] + h**2 * path[i + 2] + ...``.

    Args:
        path: Sequence of tensors (or a 1-D tensor) to accumulate.
        h: Discount factor applied per step.

    Returns:
        Tensor stacking the discounted sums in the original order.
    """
    length = len(path)
    discounted = [None] * length
    running = 0
    # Walk backwards so every entry reuses the sum of the entries after it.
    for idx in range(length - 1, -1, -1):
        running = running * h + path[idx]
        discounted[idx] = running
    return torch.stack(discounted, 0)


def compute_advantage_return(rewards, values, dones, gamma=gamma, gae_lambda=gae_lambda):
    """Compute per-step advantages and discounted returns for a rollout.

    All inputs are transposed on entry, processed path by path, and the
    results are transposed back before being returned.

    NOTE(review): ``dones`` is multiplied into the next-step values and a 0
    entry closes a path (see ``get_path_indices``), so this code treats it as
    a continuation (not-done) mask -- confirm callers pass it that way.

    The value following the last stored step is approximated by the last
    stored value itself (``values[:, -1:]`` after the transpose).

    Args:
        rewards: Rewards; transposed before use.
        values: Value estimates, same layout as ``rewards``.
        dones: Mask with the same layout (see the note above).
        gamma: Discount factor.
        gae_lambda: GAE lambda; advantages are discounted by ``gae_lambda * gamma``.

    Returns:
        ``(advantages, returns)`` in the input layout, cloned and detached.
    """
    rewards, values, dones = rewards.T, values.T, dones.T

    # Next-step values, masked by `dones`.
    next_values = torch.cat([values[:, 1:], values[:, -1:]], 1) * dones
    td_errors = rewards + gamma * next_values - values

    advantages = torch.zeros_like(rewards)
    returns = torch.zeros_like(rewards)
    for env_idx, lo, hi in get_path_indices(dones):
        advantages[env_idx, lo:hi] = discount_path(td_errors[env_idx, lo:hi], gae_lambda*gamma)
        returns[env_idx, lo:hi] = discount_path(rewards[env_idx, lo:hi], gamma)

    return advantages.T.clone().detach(), returns.T.clone().detach()



In [7]:
class Net(nn.Module):
    """Fully connected network: Linear layers separated by one activation.

    Args:
        inDim: Size of the input features.
        outDim: Size of the output.
        hDim: Sequence of hidden-layer widths (must be non-empty).
        act: ``'tanh'`` selects ``nn.Tanh``; any other value selects ``nn.ReLU``.
    """

    def __init__(self, inDim, outDim, hDim, act='tanh'):
        super().__init__()
        # A single activation module instance is reused between all layers.
        activation = nn.Tanh() if act == 'tanh' else nn.ReLU()
        widths = [inDim, *hDim]
        modules = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            modules.append(nn.Linear(fan_in, fan_out))
            modules.append(activation)
        # Output layer has no activation after it.
        modules.append(nn.Linear(hDim[-1], outDim))
        self.net = nn.Sequential(*modules)
        self.weight_init()

    def weight_init(self):
        """Run ``layer_init`` with its default arguments on every module in the stack."""
        for module in self.net:
            layer_init(module)

    def forward(self, x):
        """Apply the layer stack to ``x``."""
        return self.net(x)
            

In [8]:
#Let's make the main agent class
class Agent(nn.Module):
    """Actor-critic agent for a vectorized environment with discrete actions.

    ``critic`` maps an observation to a single value and ``actor`` maps an
    observation to one logit per action. Both are ``Net`` MLPs with two
    hidden layers of 64 units and tanh activations.

    Args:
        envs: Vectorized environment; ``observation_space.shape[-1]`` gives
            the input size and ``single_action_space.n`` the number of actions.
    """

    def __init__(self, envs):
        super(Agent, self).__init__()
        obs_dim = envs.observation_space.shape[-1]

        # Returns a single value for the observation.
        # 'tanh' must be spelled exactly: Net falls back to ReLU for any other string.
        self.critic = Net(inDim=obs_dim, outDim=1, hDim=[64, 64], act='tanh')

        # Returns the logits of the actions for the observation.
        self.actor = Net(inDim=obs_dim, outDim=envs.single_action_space.n,
                         hDim=[64, 64], act='tanh')

    def get_value(self, x):
        """Return the critic's value estimate for observation(s) ``x``."""
        return self.critic(x)

    def get_action_and_value(self, x, action=None):
        """Evaluate the policy and the critic on ``x``.

        Args:
            x: Batch of observations.
            action: Optional actions to evaluate. When ``None`` an action is
                sampled from the policy; otherwise the given actions are used,
                so the returned log-probability belongs to them.

        Returns:
            Tuple ``(action, log_prob, entropy, value)``.
        """
        logits = self.actor(x)
        dist = torch.distributions.Categorical(logits=logits)
        if action is None:
            action = dist.sample()
        return action, dist.log_prob(action), dist.entropy(), self.get_value(x)


In [15]:
# Seed the PyTorch, NumPy and Python RNGs from the shared `seed` hyperparameter.
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)
torch.backends.cudnn.deterministic = torch_deterministic

# Run on the GPU only when one is present and the `cuda` flag is enabled.
if torch.cuda.is_available() and cuda:
    device = torch.device("cuda")
else:
    device = torch.device("cpu")


In [16]:
#Make the vectorized environments with the helper declared above.
#Use the configured `gym_id` and `seed` hyperparameters rather than hard-coded
#values, so changing the configuration actually changes the experiment.
envs = make_env(gym_id, seed=seed)

In [17]:
agent = Agent(envs=envs).to(device)
# eps=1e-5 is not PyTorch's default for Adam.
optimizer = optim.Adam(agent.parameters(), lr=learning_rate, eps=1e-5)

# ALGO Logic: rollout storage, indexed as [step, env, ...] and allocated on `device`.
rollout_shape = (num_steps, num_envs)
obs = torch.zeros(rollout_shape + envs.single_observation_space.shape, device=device)
actions = torch.zeros(rollout_shape + envs.single_action_space.shape, device=device)
logprobs = torch.zeros(rollout_shape, device=device)
rewards = torch.zeros(rollout_shape, device=device)
dones = torch.zeros(rollout_shape, device=device)
values = torch.zeros(rollout_shape, device=device)


In [18]:
# Start the game: counters first, then the initial observation and done flags.
global_step = 0
start_time = time.time()
num_updates = total_timesteps // batch_size  # number of rollout/update iterations

next_obs, info = envs.reset()
next_obs = torch.Tensor(next_obs).to(device)
next_done = torch.zeros(num_envs, device=device)  # no environment starts out done

In [19]:
def _linear_decay(f):
    """Learning-rate multiplier after ``f`` scheduler steps: 1 at f=0, 0 at f=num_updates."""
    return 1 - f / num_updates


scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=_linear_decay)

In [20]:
# Main training loop: collect a rollout, estimate advantages, then run several
# epochs of clipped-surrogate minibatch updates on the policy and value networks.

for update in tqdm(range(1, num_updates + 1)):

    # Anneal the learning rate if instructed to do so. The scheduler is stepped
    # from the second update on, so the first update uses the full learning rate.
    if anneal_lr and update > 1:
        scheduler.step()

    for step in range(0, num_steps):
        global_step += 1 * num_envs  # We are taking a step in each environment
        obs[step] = next_obs
        dones[step] = next_done

        # ALGO LOGIC: action logic
        with torch.no_grad():
            action, logprob, _, value = agent.get_action_and_value(next_obs)
            values[step] = value.flatten()
        actions[step] = action
        logprobs[step] = logprob

        # Execute the game and log data.
        next_obs, reward, done, truncated, info = envs.step(action.cpu().numpy())
        rewards[step] = torch.tensor(reward).to(device).view(-1)
        # An episode is over on termination OR truncation (time limit); both
        # must cut the advantage recursion at this step.
        episode_over = np.logical_or(done, truncated)
        next_obs = torch.Tensor(next_obs).to(device)
        next_done = torch.as_tensor(episode_over, dtype=torch.float32, device=device)

        # Report the episodic return of environment 0 when it is available.
        final_info = info.get("final_info")
        if final_info is not None and final_info[0] and "episode" in final_info[0]:
            print(f"global_step={global_step}, episodic_return={final_info[0]['episode']['r']}")

    with torch.no_grad():
        # `dones[t]` holds the flag observed *before* step t, so the flag that
        # says "the episode ended on step t" lives at index t + 1, and in
        # `next_done` for the final step. not_dones[t] == 0 marks such a step.
        not_dones = 1.0 - torch.cat([dones[1:], next_done.unsqueeze(0)], dim=0)
        if gae:
            # compute_advantage_return multiplies the next-step values by its
            # third argument and closes a path wherever that argument is 0, so
            # it must receive the continuation mask, not the raw done flags.
            advantages, returns = compute_advantage_return(rewards, values, not_dones)
        else:
            # Plain discounted returns, bootstrapped from the value of the
            # observation that follows the rollout.
            next_value = agent.get_value(next_obs).reshape(-1)
            returns = torch.zeros_like(rewards)
            running = next_value
            for t in reversed(range(num_steps)):
                running = rewards[t] + gamma * not_dones[t] * running
                returns[t] = running
            advantages = returns - values

    # flatten the batch
    b_obs = obs.reshape((-1,) + envs.single_observation_space.shape)
    b_logprobs = logprobs.reshape(-1)
    b_actions = actions.reshape((-1,) + envs.single_action_space.shape)
    b_advantages = advantages.reshape(-1)
    b_returns = returns.reshape(-1)
    b_values = values.reshape(-1)

    # Optimizing the policy and value network
    b_inds = np.arange(batch_size)
    clipfracs = []
    for epoch in range(update_epochs):
        # Visit the batch in a fresh random order every epoch.
        np.random.shuffle(b_inds)
        for start in range(0, batch_size, minibatch_size):
            end = start + minibatch_size
            mb_inds = b_inds[start:end]
            mb_obs = b_obs[mb_inds]
            mb_actions = b_actions[mb_inds].long()
            mb_values = b_values[mb_inds]

            # Evaluate the actions that were actually taken during the rollout
            # under the current policy; the ratio is only meaningful for those
            # actions, not for freshly sampled ones.
            dist = Categorical(logits=agent.actor(mb_obs))
            newlogprob = dist.log_prob(mb_actions)
            entropy = dist.entropy()
            newvalue = agent.get_value(mb_obs).view(-1)

            # ratio = pi_new(a|s) / pi_old(a|s)
            logratio = newlogprob - b_logprobs[mb_inds]
            ratio = torch.exp(logratio)

            with torch.no_grad():
                # Approximate KL estimators, see http://joschu.net/blog/kl-approx.html
                old_approx_kl = (-logratio).mean()
                approx_kl = ((ratio - 1.0) - logratio).mean()
                clipfracs += [((ratio - 1.0).abs() > clip_coef).float().mean().item()]

            mb_advantages = b_advantages[mb_inds]
            if norm_adv:
                mb_advantages = (mb_advantages - mb_advantages.mean()) / (mb_advantages.std() + 1e-8)

            # Policy loss: clipped surrogate objective (negated for minimization).
            pg_loss = -torch.min(
                mb_advantages * ratio,
                mb_advantages * torch.clamp(ratio, 1 - clip_coef, 1 + clip_coef),
            ).mean()

            # Value loss. The target is value + *unnormalized* advantage;
            # normalized advantages are not on the scale of the returns.
            vs_tgt = (mb_values + b_advantages[mb_inds]).detach()
            v_clipped = mb_values + torch.clamp(newvalue - mb_values, -clip_coef, clip_coef)
            # Every sample contributes: masking on the done flags would leave
            # most minibatches with an empty selection and a NaN mean.
            val_loss_unclipped = (newvalue - vs_tgt).pow(2)
            val_loss_clipped = (v_clipped - vs_tgt).pow(2)
            if clip_vloss:
                v_loss = torch.max(val_loss_unclipped, val_loss_clipped).mean()
            else:
                v_loss = val_loss_unclipped.mean()

            # Entropy bonus
            entropy_loss = entropy.mean()

            # Total loss
            loss = pg_loss - ent_coef * entropy_loss + v_loss * vf_coef

            optimizer.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm_(agent.parameters(), max_grad_norm)
            optimizer.step()

        # Optional early stop once the policy has moved too far from the old one.
        if target_kl is not None and approx_kl > target_kl:
            break

    # Fraction of the return variance explained by the value predictions.
    y_pred, y_true = b_values.cpu().numpy(), b_returns.cpu().numpy()
    var_y = np.var(y_true)
    explained_var = np.nan if var_y == 0 else 1 - np.var(y_true - y_pred) / var_y


envs.close()

  0%|          | 0/19 [00:00<?, ?it/s]

100%|██████████| 19/19 [00:04<00:00,  4.21it/s]
