# Proximal Policy Optimization

Some background: https://openai.com/research/openai-baselines-ppo

PPO is an on-policy algorithm: every training step uses experience collected by the agent while it follows the current policy, rather than data gathered under an older or different policy.

In [1]:
import gym
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

# Hyperparameters
GAMMA = 0.99  # discount factor passed to compute_returns
LR = 0.001  # learning rate of the Adam optimizer
EPOCHS = 4  # gradient updates performed on each collected episode
CLIP_EPS = 0.2  # half-width of the clipping range applied to the probability ratio
NUM_EPISODES = 1000  # number of training episodes passed to ppo

max_timesteps = 500  # per-episode step cap used by the rendered evaluation loop


class PolicyNetwork(nn.Module):
    """Two-layer MLP that maps a state vector to action probabilities.

    Architecture: Linear(state_size, 64) -> ReLU -> Linear(64, action_size)
    -> Softmax over the last dimension.
    """

    def __init__(self, state_size, action_size):
        """Create the layers.

        Args:
            state_size: Number of input features per state.
            action_size: Number of actions, i.e. the width of the output.
        """
        super().__init__()
        self.fc1 = nn.Linear(state_size, 64)
        self.fc2 = nn.Linear(64, action_size)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x):
        """Return per-action probabilities for the states in ``x``.

        The softmax is taken over the last dimension, so each row of the
        result sums to 1.
        """
        hidden = torch.relu(self.fc1(x))
        logits = self.fc2(hidden)
        return self.softmax(logits)
    

def compute_returns(rewards, gamma):
    """Return the discounted return-to-go for every step of an episode.

    Implements ``g_t = r_t + gamma * g_{t+1}``, with the return after the
    final step taken to be 0.

    Args:
        rewards: Sequence of per-step rewards in chronological order. It must
            support ``reversed()``.
        gamma: Discount factor applied to the return of the following step.

    Returns:
        A list with one entry per reward, where entry ``t`` is the discounted
        sum of rewards from step ``t`` onward. An empty input yields ``[]``.
    """
    returns = []
    running = 0
    for r in reversed(rewards):
        running = r + gamma * running
        returns.append(running)
    # Values were accumulated last-step-first with O(1) appends; flip once to
    # chronological order instead of paying O(n) per insert at the front.
    returns.reverse()
    return returns


# Build the environment and read the network's input/output sizes from it.
env = gym.make("CartPole-v0")
state_size = env.observation_space.shape[0]  # length of the observation vector
action_size = env.action_space.n  # number of discrete actions

# Policy network and the Adam optimizer that updates its parameters.
policy = PolicyNetwork(state_size, action_size)
optimizer = optim.Adam(policy.parameters(), lr=LR)


In [2]:
# Untrained model
# Roll out the current policy greedily for a few rendered episodes and print
# the total reward of each one.
import time

n_test_episodes = 10
for episode in range(n_test_episodes):
    state = env.reset()
    total_reward = 0

    for t in range(max_timesteps):
        env.render()  # Render the environment
        time.sleep(0.02)  # Slow down the visualization

        state_tensor = torch.FloatTensor(state).unsqueeze(0)
        # Inference only: no gradient is needed here, so skip autograd tracking.
        with torch.no_grad():
            action_probs = policy(state_tensor)
        # Greedy evaluation: always take the most probable action.
        action = torch.argmax(action_probs, 1).item()

        state, reward, done, _ = env.step(action)
        total_reward += reward

        if done:
            break

    print("Test Episode: {}, Reward: {}".format(episode + 1, total_reward))

env.close()

Test Episode: 1, Reward: 11.0
Test Episode: 2, Reward: 11.0
Test Episode: 3, Reward: 8.0
Test Episode: 4, Reward: 9.0
Test Episode: 5, Reward: 9.0
Test Episode: 6, Reward: 8.0
Test Episode: 7, Reward: 9.0
Test Episode: 8, Reward: 11.0
Test Episode: 9, Reward: 8.0
Test Episode: 10, Reward: 9.0


In [61]:
def ppo(env, policy, optimizer, num_episodes):
    """Train ``policy`` with the clipped PPO surrogate, one episode at a time.

    For every episode the current policy is rolled out until the environment
    reports ``done``. Discounted returns are then computed with
    ``compute_returns`` and ``GAMMA``, and ``EPOCHS`` gradient steps are taken
    on the clipped surrogate objective (clip range ``CLIP_EPS``). The raw
    discounted return is used in place of an advantage estimate; there is no
    baseline or value network.

    Args:
        env: Environment following the classic gym API: ``reset()`` returns
            the first observation and ``step(action)`` returns
            ``(observation, reward, done, info)``.
        policy: Module mapping a ``[batch, state_size]`` float tensor to
            action probabilities of shape ``[batch, action_size]``.
        optimizer: Optimizer that updates ``policy``'s parameters.
        num_episodes: Number of episodes to collect and train on.

    Returns:
        None. ``policy`` is updated in place and the total reward is printed
        every 100 episodes.
    """
    for episode in range(num_episodes):
        state = env.reset()
        done = False
        rewards = []
        states = []
        actions = []
        action_probas_arr = []

        # --- Rollout with the current (soon to be "old") policy -------------
        while not done:
            state_tensor = torch.FloatTensor(state).unsqueeze(0)
            # The old-policy probabilities are constants in the loss, so the
            # rollout does not need an autograd graph.
            with torch.no_grad():
                action_probas = policy(state_tensor)  # [1, A]
            action = torch.multinomial(action_probas, 1).item()

            new_state, reward, done, _ = env.step(action)

            rewards.append(reward)
            states.append(state)
            actions.append(action)
            action_probas_arr.append(action_probas)

            state = new_state

        returns = torch.FloatTensor(compute_returns(rewards, GAMMA))  # [L]

        # --- Old-policy quantities: fixed for all epochs, so built once -----
        # Go through one ndarray and force float32 so the batch matches the
        # network's parameter dtype regardless of the env's observation dtype.
        old_states = torch.as_tensor(np.array(states), dtype=torch.float32)  # [L, S]
        old_actions = torch.tensor(actions).unsqueeze(1)  # [L, 1]
        # cat along dim 0 keeps the [L, A] shape even for a one-step episode
        # (a bare squeeze() would drop the batch dimension when L == 1).
        old_action_probas = torch.cat(action_probas_arr, dim=0)  # [L, A]
        old_probas = old_action_probas.gather(1, old_actions).squeeze(1)  # [L]

        # --- Policy update ---------------------------------------------------
        for _ in range(EPOCHS):
            new_action_probs = policy(old_states)  # [L, A]
            new_probas = new_action_probs.gather(1, old_actions).squeeze(1)  # [L]

            # Every stored action was sampled from the old distribution, so
            # its old probability is expected to be non-zero and the division
            # is safe.
            ratios = new_probas / old_probas  # [L]

            surrogate1 = ratios * returns
            surrogate2 = torch.clamp(ratios, 1 - CLIP_EPS, 1 + CLIP_EPS) * returns
            loss = (-torch.min(surrogate1, surrogate2)).mean()

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        if episode % 100 == 0:
            print("Episode:", episode, "Total Reward:", sum(rewards))


# Train the policy in place for NUM_EPISODES episodes.
ppo(env, policy, optimizer, NUM_EPISODES)

Episode: 0 Total Reward: 120.0
Episode: 100 Total Reward: 28.0
Episode: 200 Total Reward: 23.0
Episode: 300 Total Reward: 91.0
Episode: 400 Total Reward: 125.0
Episode: 500 Total Reward: 200.0
Episode: 600 Total Reward: 200.0
Episode: 700 Total Reward: 200.0
Episode: 800 Total Reward: 199.0
Episode: 900 Total Reward: 164.0
