## Backbone neural network for actor and critic networks

In [1]:
import torch
from torch import nn
import torch.nn.functional as F
import numpy as np

class FeedForwardNN(nn.Module):
    def __init__(self, in_dim, out_dim):
        super(FeedForwardNN, self).__init__()

        self.layer1 = nn.Linear(in_dim, 64)
        self.layer2 = nn.Linear(64, 64)
        self.layer3 = nn.Linear(64, out_dim)
        
    def forward(self, states):
        if isinstance(states, np.ndarray):
            states = torch.tensor(states, dtype = torch.float)
                    
        activation1 = F.relu(self.layer1(states))
        activation2 = F.relu(self.layer2(activation1))
        out         = self.layer3(activation2)
        
        return out

## PPO

### Algorithms:

1. Initialize actor and critic network
2. Collect data
   1. Initialize environment and its states
   2. Let the agent interact with the environment
      1. Store the state, rewards, actions, next states, and log probability of the action into a list
         1. Note that here, actions are simply from equal probability
         2. Rewards are defined by OpenAI environment, so we don't really need to care
   3. Once you finish collecting these states, rewards, etc.
      1. Compute G_t, i.e., the discounted rewards
3. Calculate probability
   1. $\displaystyle\frac{\pi}{\pi_\text{old}} \hat{A}_t$
   2. $\text{clip}(\displaystyle\frac{\pi}{\pi_\text{old}}, 1 - \epsilon, 1 + \epsilon) \hat{A}_t$
4. Calculate loss
   1. Actor loss = - minimum of the 3.1 and 3.2
   2. Critic loss = $(G_t - V)^2$
5. Backpropagate

In [2]:
from torch.distributions import MultivariateNormal
from torch.optim import Adam

class PPO:
    def __init__(self, env):
        self._init_params()
        
        #extract info from environment
        self.env = env
        self.states_dim = env.observation_space.shape[0]
        self.act_dim    = env.action_space.shape[0]
        
        ## STEP 1
        #input is state for both actor and critic networks
        #output is a value for critic networks, and action distribution for actor networks 
        self.actor  = FeedForwardNN(self.states_dim, self.act_dim) 
        self.critic = FeedForwardNN(self.states_dim, 1)
        
        ##this is for sampling actions when collecting data
        self.cov_var = torch.full(size = (self.act_dim, ), fill_value=0.5)
        self.cov_mat = torch.diag(self.cov_var)  #basically every action has a probabiliy of 0.5
        
        self.actor_optim = Adam(self.actor.parameters(), lr=self.lr)
        self.critic_optim = Adam(self.critic.parameters(), lr=self.lr)
    
    def _init_params(self):
        torch.manual_seed(999)  #just for reproducibility
        self.timesteps_per_batch = 4800
        self.max_timesteps_per_episode = 1600
        self.gamma = 0.95
        self.n_updates_per_iteration = 5
        self.clip = 0.2
        self.lr = 0.005
    
    ## STEP 2
    def collect_data(self):
        #rollout
        batch_states    = [] #shape: (number of timesteps per batch, states_dim)
        batch_acts      = [] #shape: (number of timesteps per batch, act_dim)
        batch_log_probs = [] #(number of timesteps per batch, )
        batch_rewards   = [] #(number of episodes, number of timesteps per episode)
        batch_discounted_rewards = [] #(number of timesteps per batch, )
        batch_lens      = [] #(number of episodes, )
        
        #Number of timesteps run so far this batch
        t = 0
        ep_rewards = []
        
        #batch means one batch of data we collect, which can span multiple episodes
        #one episode means you start the env, until you reach the terminal state
        
        while t < self.timesteps_per_batch:  #30
            
            #Rewards this episode
            ep_rewards = []
            
            states = self.env.reset()[0]  ## STEP 2.1
            done   = False
            
            ## STEP 2.2
            for ep_t in range(self.max_timesteps_per_episode):
                t += 1
                                
                #collect states
                batch_states.append(states)
                
                action, log_prob = self.get_action(states)    
                states, rewards, done, _, _ = self.env.step(action)
                
                #collect reward, action, and log prob
                ep_rewards.append(rewards)                
                batch_acts.append(action)
                batch_log_probs.append(log_prob)
                
                if done:
                    break
                
            batch_lens.append(ep_t + 1)           
            batch_rewards.append(ep_rewards)
        
        # Reshape data as tensors in the shape specified before returning
        batch_states = torch.tensor(batch_states, dtype=torch.float)
        batch_acts = torch.tensor(batch_acts, dtype=torch.float)
        batch_log_probs = torch.tensor(batch_log_probs, dtype=torch.float)
        
        ## STEP 2.3
        #compute G_t
        batch_discounted_rewards = self.compute_discounted_rewards(batch_rewards)
        
        return batch_states, batch_acts, batch_log_probs, batch_discounted_rewards, batch_lens
                
    def learn(self, total_timesteps):
        t_so_far = 0 # Timesteps simulated until now
        
        while t_so_far < total_timesteps:
                        
            batch_states, batch_acts, batch_log_probs, batch_discounted_rewards, batch_lens = self.collect_data()
                        
            t_so_far += np.sum(batch_lens)
                    
            # Calculate V
            V, _ = self.evaluate(batch_states, batch_acts)

            # Calculate advantage
            A_k = batch_discounted_rewards - V.detach()
            
            # For faster convergence
            A_k = (A_k - A_k.mean()) / (A_k.std() + 1e-10)
            
            for _ in range(self.n_updates_per_iteration):
                V, curr_log_probs = self.evaluate(batch_states, batch_acts)
                ratios = torch.exp(curr_log_probs - batch_log_probs) #log ratio become minus
                
                # Calculate surrogate losses
                surr1 = ratios * A_k
                surr2 = torch.clamp(ratios, 1 - self.clip, 1 + self.clip) * A_k
                
                actor_loss = (-torch.min(surr1, surr2)).mean()
                critic_loss = nn.MSELoss()(V, batch_discounted_rewards)
                
                self.actor_optim.zero_grad()
                actor_loss.backward()
                self.actor_optim.step()
                
                self.critic_optim.zero_grad()    
                critic_loss.backward()    
                self.critic_optim.step()
                
    def get_action(self, states):
        mean = self.actor(states)
        dist = MultivariateNormal(mean, self.cov_mat)
        action = dist.sample()
        log_prob = dist.log_prob(action)
        
        #detach from computational graph
        return action.detach().numpy(), log_prob.detach()
    
    def compute_discounted_rewards(self, batch_rewards):
        # batch_rewards: shape (number of episodes, number of timesteps per episode)
        batch_discounted_rewards = []  #shape: (num of timesteps in batch)
                        
        # Iterate through each episode backwards to maintain same order in batch_discounted_rewards
        for episode_reward in reversed(batch_rewards):
        
            discounted_reward = 0
            for reward in reversed(episode_reward):
                discounted_reward = reward + discounted_reward * self.gamma
                batch_discounted_rewards.insert(0, discounted_reward)
                
        batch_discounted_rewards = torch.tensor(batch_discounted_rewards, dtype=torch.float)
        
        return batch_discounted_rewards
    
    def evaluate(self, batch_states, batch_acts):
        # Query critic network for a value V for each state in batch_states.
        V = self.critic(batch_states).squeeze()
        
        mean = self.actor(batch_states)
        dist = MultivariateNormal(mean, self.cov_mat)
                
        log_probs = dist.log_prob(batch_acts)
        
        return V, log_probs
    

## Learning

In [3]:
import gym
env = gym.make('Pendulum-v1')
model = PPO(env)
model.learn(100)

V.shape=torch.Size([30])
V.shape=torch.Size([30])
V.shape=torch.Size([30])
V.shape=torch.Size([30])


  batch_states = torch.tensor(batch_states, dtype=torch.float)


## Testing

In [None]:
states_dim = env.observation_space.shape[0]
act_dim    = env.action_space.shape[0]

