## PPO

### Algorithms:

1. Initialize actor and critic network
2. Collect data
   1. Initialize environment and its states
   2. Let the agent interact with the environment
      1. Store the state, rewards, actions, next states, and log probability of the action into a list
         1. Note that here, actions are simply from equal probability
         2. Rewards are defined by OpenAI environment, so we don't really need to care
   3. Once you finish collecting these states, rewards, etc.
      1. Compute G_t, i.e., the discounted rewards
3. Calculate probability
   1. $\displaystyle\frac{\pi}{\pi_\text{old}} \hat{A}_t$
   2. $\text{clip}(\displaystyle\frac{\pi}{\pi_\text{old}}, 1 - \epsilon, 1 + \epsilon) \hat{A}_t$
4. Calculate loss
   1. Actor loss = - minimum of the 3.1 and 3.2
   2. Critic loss = $(G_t - V)^2$
5. Backpropagate

### Backbone neural network for actor and critic networks

In [1]:
import torch
from torch import nn
import torch.nn.functional as F
import numpy as np

class FeedForwardNN(nn.Module):
    def __init__(self, in_dim, out_dim):
        super(FeedForwardNN, self).__init__()

        self.layer1 = nn.Linear(in_dim, 64)
        self.layer2 = nn.Linear(64, 64)
        self.layer3 = nn.Linear(64, out_dim)
        
    def forward(self, states):
        if isinstance(states, np.ndarray):
            states = torch.tensor(states, dtype = torch.float)
                    
        activation1 = F.relu(self.layer1(states))
        activation2 = F.relu(self.layer2(activation1))
        out         = self.layer3(activation2)
        
        return out

### PPO class

In [2]:

from torch.distributions import MultivariateNormal
from torch.optim import Adam

class PPO:
    def __init__(self, env):
        self._init_params()
        
        #extract info from environment
        self.env = env
        self.states_dim = env.observation_space.shape[0]
        self.act_dim    = env.action_space.shape[0]
        
        ## STEP 1
        #input is state for both actor and critic networks
        #output is a value for critic networks, and action distribution for actor networks 
        self.actor  = FeedForwardNN(self.states_dim, self.act_dim) 
        self.critic = FeedForwardNN(self.states_dim, 1)
        
        ##this is for sampling actions when collecting data
        self.cov_var = torch.full(size = (self.act_dim, ), fill_value=0.5)
        self.cov_mat = torch.diag(self.cov_var)  #basically every action has a probabiliy of 0.5
        
        self.actor_optim = Adam(self.actor.parameters(), lr=self.lr)
        self.critic_optim = Adam(self.critic.parameters(), lr=self.lr)
    
    def _init_params(self):
        torch.manual_seed(999)  #just for reproducibility
        self.timesteps_per_batch = 4800
        self.max_timesteps_per_episode = 1600
        self.gamma = 0.95
        self.n_updates_per_iteration = 5
        self.clip = 0.2
        self.lr = 0.005
        self.entropy_weight = 0.05 #higher means more exploration; we can set it very low for pendulum because it's a very simple problem
    
    ## STEP 2
    def collect_data(self):
        #rollout
        batch_states    = [] #shape: (number of timesteps per batch, states_dim)
        batch_acts      = [] #shape: (number of timesteps per batch, act_dim)
        batch_log_probs = [] #(number of timesteps per batch, )
        batch_rewards   = [] #(number of episodes, number of timesteps per episode)
        batch_discounted_rewards = [] #(number of timesteps per batch, )
        batch_lens      = [] #(number of episodes, )
        
        #Number of timesteps run so far this batch
        t = 0
        ep_rewards = []
        
        #batch means one batch of data we collect, which can span multiple episodes
        #one episode means you start the env, until you reach the terminal state
        
        while t < self.timesteps_per_batch:  #30
            
            #Rewards this episode
            ep_rewards = []
            
            states = self.env.reset()[0]  ## STEP 2.1
            done   = False
            
            ## STEP 2.2
            for ep_t in range(self.max_timesteps_per_episode):
                t += 1
                
                #collect states
                batch_states.append(states)
                
                action, log_prob = self.get_action(states)    
                states, rewards, done, _, _ = self.env.step(action)
                
                #collect reward, action, and log prob
                ep_rewards.append(rewards)                
                batch_acts.append(action)
                batch_log_probs.append(log_prob)
                
                if done:
                    break
                
            batch_lens.append(ep_t + 1)           
            batch_rewards.append(ep_rewards)
        
        #convert to tensor; note that converting the list first to np array then to tensor is much faster
        batch_states    = torch.tensor(np.array(batch_states), dtype=torch.float)
        batch_acts      = torch.tensor(np.array(batch_acts), dtype=torch.float)
        batch_log_probs = torch.tensor(np.array(batch_log_probs), dtype=torch.float)

        ## STEP 2.3
        #compute G_t
        batch_discounted_rewards = self.compute_discounted_rewards(batch_rewards)
        
        return batch_states, batch_acts, batch_log_probs, batch_discounted_rewards, batch_lens
                
    def fit(self, total_timesteps):
        t = 0 # Timesteps simulated until now
        i = 0
        actor_losses  = [] #for reporting
        critic_losses = []
        discounted_rewards = []
        
        while t < total_timesteps:
                        
            batch_states, batch_acts, batch_log_probs, batch_discounted_rewards, batch_lens = self.collect_data()
                        
            t += np.sum(batch_lens)
            i += 1
                    
            # Calculate V
            V, _ , _ = self.predict(batch_states, batch_acts)

            # Calculate advantage
            A_k = batch_discounted_rewards - V.detach()
            
            # For faster convergence
            A_k = (A_k - A_k.mean()) / (A_k.std() + 1e-10)
            
            for _ in range(self.n_updates_per_iteration):
                V, curr_log_probs, entropy = self.predict(batch_states, batch_acts)
                ratios = torch.exp(curr_log_probs - batch_log_probs) #log ratio become minus
                
                # Calculate surrogate losses
                surr1 = ratios * A_k
                surr2 = torch.clamp(ratios, 1 - self.clip, 1 + self.clip) * A_k
                actor_loss = (-torch.min(surr1, surr2)).mean()
                entropy_loss = entropy.mean()
                actor_loss = actor_loss - self.entropy_weight * entropy_loss
                critic_loss = nn.MSELoss()(V, batch_discounted_rewards)
                
                discounted_rewards.append(batch_discounted_rewards.mean())
                
                actor_losses.append(actor_loss.detach())
                critic_losses.append(critic_loss.detach())
                
                # Backprop
                self.actor_optim.zero_grad()
                actor_loss.backward()
                self.actor_optim.step()
                
                self.critic_optim.zero_grad()    
                critic_loss.backward()    
                self.critic_optim.step()
                
            self.print_summary(i, t, discounted_rewards, critic_losses, actor_losses)
                
    def get_action(self, states):
        mean = self.actor(states)
        dist = MultivariateNormal(mean, self.cov_mat)
        action = dist.sample()
        log_prob = dist.log_prob(action)
        
        #detach from computational graph
        return action.detach().numpy(), log_prob.detach()
    
    def compute_discounted_rewards(self, batch_rewards):
        # batch_rewards: shape (number of episodes, number of timesteps per episode)
        batch_discounted_rewards = []  #shape: (num of timesteps in batch)
                        
        # Iterate through each episode backwards to maintain same order in batch_discounted_rewards
        for episode_reward in reversed(batch_rewards):
        
            discounted_reward = 0
            for reward in reversed(episode_reward):
                discounted_reward = reward + discounted_reward * self.gamma
                batch_discounted_rewards.insert(0, discounted_reward)
                
        batch_discounted_rewards = torch.tensor(batch_discounted_rewards, dtype=torch.float)
        
        return batch_discounted_rewards
    
    def predict(self, batch_states, batch_acts):
        # Query critic network for a value V for each state in batch_states.
        V = self.critic(batch_states).squeeze()
        
        mean = self.actor(batch_states)
        dist = MultivariateNormal(mean, self.cov_mat)
                
        log_probs = dist.log_prob(batch_acts)
        
        return V, log_probs, dist.entropy()
    
    def print_summary(self, i, t, discounted_rewards, critic_losses, actor_losses):
        avg_discounted_rewards  = np.mean([rewards.float().mean() for rewards in discounted_rewards])
        avg_actor_loss  = np.mean([losses.float().mean() for losses in actor_losses])
        avg_critic_loss = np.mean([losses.float().mean() for losses in critic_losses])
        
        if(i+1) % 10 == 0:
            print(f"#{i+1:3.0f} | Timesteps: {t:7.0f} |  Critic Loss: {avg_critic_loss:10.3f} | Actor Loss: {avg_actor_loss:10.6f} | Dis. Rewards: {avg_discounted_rewards:5.3f}")
        

### Training

In [3]:
#pip install gymnasium
#brew install swig
#pip install box2d-py

import gymnasium as gym
import pickle

env = gym.make("Pendulum-v1")

model = PPO(env)
model.fit(500000)

filename = 'model/pendulumv1'
with open(f'{filename}.pkl', 'wb') as file:
    pickle.dump(model, file)

# 10 | Timesteps:   43200 |  Critic Loss:   6701.342 | Actor Loss:  -0.049411 | Dis. Rewards: -105.265
# 20 | Timesteps:   91200 |  Critic Loss:   3994.854 | Actor Loss:  -0.051814 | Dis. Rewards: -98.304
# 30 | Timesteps:  139200 |  Critic Loss:   2823.071 | Actor Loss:  -0.052744 | Dis. Rewards: -93.360
# 40 | Timesteps:  187200 |  Critic Loss:   2222.132 | Actor Loss:  -0.053453 | Dis. Rewards: -87.557
# 50 | Timesteps:  235200 |  Critic Loss:   1870.752 | Actor Loss:  -0.053845 | Dis. Rewards: -75.523
# 60 | Timesteps:  283200 |  Critic Loss:   1588.402 | Actor Loss:  -0.053969 | Dis. Rewards: -64.052
# 70 | Timesteps:  331200 |  Critic Loss:   1390.032 | Actor Loss:  -0.053761 | Dis. Rewards: -56.503
# 80 | Timesteps:  379200 |  Critic Loss:   1220.994 | Actor Loss:  -0.053910 | Dis. Rewards: -49.834
# 90 | Timesteps:  427200 |  Critic Loss:   1089.057 | Actor Loss:  -0.054012 | Dis. Rewards: -44.565
#100 | Timesteps:  475200 |  Critic Loss:    984.298 | Actor Loss:  -0.054108 | D

## Testing

In [4]:
import gymnasium as gym
import pickle

filename = 'model/pendulumv1'

with open(f'{filename}.pkl', 'rb') as file:
    model = pickle.load(file)

env = gym.make('Pendulum-v1', render_mode='human')
num_episodes = 1

for episode in range(num_episodes):
    state, _ = env.reset()
    done = False
    total_reward = 0
    
    i = 0
    while not done:
        env.render()

        action, log_probabilities = model.get_action(state)
        next_state, reward, done, truncated, info = env.step(action)    
        
        angle = np.arctan2(next_state[1], next_state[0])
        angle_threshold = 0.005
        if abs(angle) < angle_threshold:
            done = True
                       
        state = next_state
        
        i += 1
        #the more negative the rewards, the farther it is from the upright position.
        print(f"Iteration {i: 4.0f} | Angle: {angle:6.3f} | Reward: {reward:3.5f}")

env.close() #env.close() won't close the window; just restart the kernel and it will close the window

Iteration    1 | Angle:  2.190 | Reward: -4.69727
Iteration    2 | Angle:  2.248 | Reward: -4.82139
Iteration    3 | Angle:  2.344 | Reward: -5.19183
Iteration    4 | Angle:  2.481 | Reward: -5.86241
Iteration    5 | Angle:  2.657 | Reward: -6.91433
Iteration    6 | Angle:  2.864 | Reward: -8.29007
Iteration    7 | Angle:  3.097 | Reward: -9.93492
Iteration    8 | Angle: -2.937 | Reward: -11.76946
Iteration    9 | Angle: -2.680 | Reward: -11.11169
Iteration   10 | Angle: -2.425 | Reward: -9.81979
Iteration   11 | Angle: -2.180 | Reward: -8.48397
Iteration   12 | Angle: -1.959 | Reward: -7.15898
Iteration   13 | Angle: -1.757 | Reward: -5.80084
Iteration   14 | Angle: -1.577 | Reward: -4.71790
Iteration   15 | Angle: -1.423 | Reward: -3.78353
Iteration   16 | Angle: -1.290 | Reward: -2.98249
Iteration   17 | Angle: -1.179 | Reward: -2.36992
Iteration   18 | Angle: -1.087 | Reward: -1.88994
Iteration   19 | Angle: -1.015 | Reward: -1.52198
Iteration   20 | Angle: -0.959 | Reward: -1.2438