<a href="https://colab.research.google.com/github/donghuna/AI-Expert/blob/main/%EC%96%91%EC%9D%B8%EC%88%9C/trpo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Trust Region Policy Optimization Practice

# -1. Setting

If you are running this notebook in a local Jupyter environment (not Colab), set

```
colab = False
```

In [None]:
colab = True
if colab:
    !pip install gym pyvirtualdisplay > /dev/null 2>&1
    !apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1
    !apt-get update > /dev/null 2>&1
    !apt-get install cmake > /dev/null 2>&1
    !pip install --upgrade setuptools 2>&1
    !pip install ez_setup > /dev/null 2>&1
    !pip install swig
    !pip3 install box2d-py
    !pip3 install gym[Box_2D]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting setuptools
  Downloading setuptools-67.2.0-py3-none-any.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m40.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: setuptools
  Attempting uninstall: setuptools
    Found existing installation: setuptools 57.4.0
    Uninstalling setuptools-57.4.0:
      Successfully uninstalled setuptools-57.4.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
ipython 7.9.0 requires jedi>=0.10, which is not installed.
cvxpy 1.2.3 requires setuptools<=64.0.2, but you have setuptools 67.2.0 which is incompatible.[0m[31m
[0mSuccessfully installed setuptools-67.2.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simp

In [None]:
if colab:
    # Mount Google Drive so the files stored there become visible under /content/drive.
    from google.colab import drive
    drive.mount('/content/drive')

    # Switch to the exercise folder so the local modules imported below (memory, utils, ppo)
    # and the relative paths used later (./snapshots, ./learning_curves, ./video) resolve.
    %cd /content/drive/MyDrive/drlcourse-main/day4/trpo
    !ls

Mounted at /content/drive
/content/drive/MyDrive/drlcourse-main/day4/trpo
chap7_trpo_ppo.pdf  ppo_learning_curves  __pycache__	  trpo.ipynb
learning_curves     ppo.py		 snapshots	  utils.py
memory.py	    ppo_snapshots	 trpo_full.ipynb  video


In [None]:
import numpy as np
import time
import csv
import torch
import os
import copy
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Independent
from torch.distributions.normal import Normal
from torch.optim import Adam
from memory import OnPolicyMemory
from utils import *

  """


In [None]:
# Prefer the GPU whenever PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print('current device : ', device)

current device :  cuda


# 0. Network Architectures

In [None]:
class Actor(nn.Module):
    """Gaussian policy network: maps an observation to the mean and standard deviation
    of a diagonal Normal distribution over actions.

    Layers:
        fc1, fc2 : shared tanh trunk (obs_dim -> hidden1 -> hidden2)
        fc3      : head producing the mean mu            (hidden2 -> act_dim)
        fc4      : head producing log(sigma)             (hidden2 -> act_dim)
    """

    def __init__(self, obs_dim, act_dim, hidden1, hidden2):
        super().__init__()
        # Shared trunk.
        self.fc1 = nn.Linear(obs_dim, hidden1)
        self.fc2 = nn.Linear(hidden1, hidden2)
        # Two separate heads on top of the trunk.
        self.fc3 = nn.Linear(hidden2, act_dim)  # mu
        self.fc4 = nn.Linear(hidden2, act_dim)  # log(sigma)

    def forward(self, obs):
        """Return the tuple (mu, sigma) of the action distribution for `obs`."""
        hidden = self.fc2(self.fc1(obs).tanh()).tanh()

        mean = self.fc3(hidden)
        # fc4 is interpreted as log(sigma) rather than sigma itself: a linear layer can output
        # any real number, and exponentiating it guarantees a strictly positive standard deviation.
        std = self.fc4(hidden).exp()

        return mean, std

    def log_prob(self, obs, act):
        """Return log pi(act | obs), summed over the action dimensions."""
        mean, std = self.forward(obs)
        # Independent(..., 1) treats the last dimension as one event, so the per-dimension
        # log-densities are summed into a single value per sample.
        policy = Independent(Normal(mean, std), 1)
        return policy.log_prob(act)

class Critic(nn.Module):
    """State-value network V(s; theta): two tanh hidden layers followed by a scalar output."""

    def __init__(self, obs_dim, hidden1, hidden2):
        super().__init__()
        self.fc1 = nn.Linear(obs_dim, hidden1)
        self.fc2 = nn.Linear(hidden1, hidden2)
        # A state value is a single number, hence an output width of 1.
        self.fc3 = nn.Linear(hidden2, 1)

    def forward(self, obs):
        """Return the value estimate for `obs`; the last dimension of the result has size 1."""
        hidden = self.fc1(obs).tanh()
        hidden = self.fc2(hidden).tanh()
        return self.fc3(hidden)

# 1. Agent Definition

In [None]:
class TRPOAgent:
    """Bundles the Gaussian policy `pi` (Actor) and the value function `V` (Critic).

    Both networks are created on the module-level `device`. The layer sizes are kept as
    attributes so that another Actor of the same architecture can be built later.
    """

    def __init__(self, obs_dim, act_dim, hidden1=64, hidden2=32):
        self.obs_dim, self.act_dim = obs_dim, act_dim
        self.hidden1, self.hidden2 = hidden1, hidden2

        self.pi = Actor(obs_dim, act_dim, hidden1, hidden2).to(device)
        self.V = Critic(obs_dim, hidden1, hidden2).to(device)

    def act(self, obs, deterministic=False):
        """Choose an action for a single observation.

        Args:
            obs: observation, converted to a float tensor on `device`.
            deterministic: if True, return the policy mean instead of sampling.

        Returns:
            (action, log_prob, val) as numpy arrays. In deterministic mode `log_prob`
            and `val` are both None, because only the action is computed.
        """
        obs_tensor = torch.tensor(obs, dtype=torch.float).to(device)

        # Acting never needs gradients.
        with torch.no_grad():
            mu, sigma = self.pi(obs_tensor)

            if deterministic:
                return mu.cpu().numpy(), None, None

            # Stochastic mode: sample from N(mu, sigma) and record the sample's log-probability
            # together with the critic's value estimate for the same observation.
            policy = Independent(Normal(mu, sigma), 1)
            sampled = policy.sample()
            log_prob = policy.log_prob(sampled).cpu().numpy()
            val = self.V(obs_tensor).cpu().numpy()

        return sampled.cpu().numpy(), log_prob, val

# 2. Policy & Value Function Approximation Update

Objective:
\begin{align*}
g = \nabla_\phi J(\phi) &\approx \nabla_\phi \mathbb{E}_{s \sim \rho_{\phi_{\text{old}}}, a \sim \pi_{\phi_{\text{old}}}}\left( \frac{\pi_{\phi}(s, a)}{\pi_{\phi_{\text{old}}}(s, a)} A^{\pi_{\phi_{\text{old}}}}(s, a) \right) \\
&\approx \nabla_\phi \frac{1}{N} \sum_{i = 1}^N \left( \frac{\pi_{\phi}(s_i, a_i)}{\pi_{\phi_{\text{old}}}(s_i, a_i)} \hat A(s_i, a_i) \right).
\end{align*}
Taking the approximated trust-region constraint into account, the final update direction $s$ is given by
\begin{equation*}
s = H^{-1}g, \quad H s = g,
\end{equation*}
 and the step size is
 \begin{equation*}
\alpha = \sqrt{\frac{2\delta}{g^\top H^{-1} g}}.
 \end{equation*}
 Thus, the update is done as follows:
 \begin{gather*}
 \phi_{\text{old}} \longleftarrow \phi, \\
\phi \longleftarrow \phi + \alpha \cdot s.
 \end{gather*}

In [None]:
def update(agent, memory, critic_optim, delta, num_updates):
    """Run `num_updates` rounds of critic regression followed by a TRPO policy step.

    Each round:
      1. fits the critic to the value targets stored in memory (MSE, one optimizer step);
      2. computes the gradient g of the surrogate objective mean(ratio * A);
      3. obtains the step direction s from `cg` / `fisher_vector_product` (H s = g);
      4. scales it by sqrt(2 * delta / (s^T H s));
      5. hands the proposed step to `backtracking_line_search`.

    Args:
        agent: object exposing `pi` (with `log_prob(states, actions)`) and `V`, plus
            `obs_dim`, `act_dim`, `hidden1`, `hidden2` for cloning the actor.
        memory: object whose `load()` returns a dict with the keys
            'state', 'action', 'val', 'A' and 'log_prob'.
        critic_optim: optimizer over the critic's parameters.
        delta: trust-region size used in the step-size formula.
        num_updates: number of rounds to run on the same batch.

    Returns:
        None. `agent.V` is updated in place through `critic_optim`; the line-search helper
        is presumably what writes the new parameters into `agent.pi` - verify in utils.py.
    """
    batch = memory.load()

    states = torch.Tensor(batch['state']).to(device)
    actions = torch.Tensor(batch['action']).to(device)
    target_v = torch.Tensor(batch['val']).to(device)
    A = torch.Tensor(batch['A']).to(device)
    old_log_probs = torch.Tensor(batch['log_prob']).to(device)

    # One value per sample, as flat vectors. The critic outputs shape (N, 1); if a buffer comes
    # back as (N,), an un-flattened subtraction would silently broadcast to an (N, N) matrix
    # and the loss would average over all pairs instead of matching samples.
    target_v_flat = target_v.reshape(-1)
    A_flat = A.reshape(-1)
    old_log_probs_flat = old_log_probs.reshape(-1)

    for _ in range(num_updates):
        ################
        # train critic #
        ################
        values = agent.V(states).reshape(-1)
        critic_loss = torch.mean((values - target_v_flat) ** 2)

        critic_optim.zero_grad()
        critic_loss.backward()
        critic_optim.step()

        ###################
        # policy gradient #
        ###################
        log_probs = agent.pi.log_prob(states, actions).reshape(-1)
        # probability ratio pi(a_t | s_t ; phi) / pi(a_t | s_t ; phi_old), computed in log space
        prob_ratio = torch.exp(log_probs - old_log_probs_flat)
        actor_loss = torch.mean(prob_ratio * A_flat)

        # gradient of the surrogate objective w.r.t. the policy parameters
        loss_grad = torch.autograd.grad(actor_loss, agent.pi.parameters())

        # flatten gradients of params into one vector, detached from the autograd graph
        g = torch.cat([grad.reshape(-1) for grad in loss_grad]).detach()

        s = cg(fisher_vector_product, g, agent.pi, states)

        # s^T H s, evaluated with one more Fisher-vector product
        sAs = torch.sum(fisher_vector_product(s, agent.pi, states) * s, dim=0, keepdim=True)
        step_size = torch.sqrt(2 * delta / sAs)[0]    # stepsize : move as far as possible within trust region
        step = step_size * s

        # Copy of the current policy, passed to the line search as the "old" policy.
        old_actor = Actor(agent.obs_dim, agent.act_dim, agent.hidden1, agent.hidden2).to(device)
        old_actor.load_state_dict(agent.pi.state_dict())

        params = flat_params(agent.pi)

        # The helper receives the tensors exactly as loaded from memory, so its contract is unchanged.
        backtracking_line_search(old_actor, agent.pi, actor_loss, g,
                                 old_log_probs, params, step, delta, A, states, actions)    # line search => for improvement guarantee!

    return

In [None]:
def evaluate(agent, env, num_episodes=5):
    """Run `num_episodes` episodes with the deterministic policy and summarise the returns.

    Args:
        agent: object whose `act(obs, deterministic=True)` returns a tuple with the action first.
        env: environment with `reset()` returning an observation and `step(action)`
            returning `(obs, reward, done, info)`.
        num_episodes: number of evaluation episodes.

    Returns:
        (mean, standard deviation) of the undiscounted episode returns.
    """
    returns = np.zeros(num_episodes)

    for episode in range(num_episodes):
        observation = env.reset()
        episode_return = 0.
        finished = False

        while not finished:
            greedy_action = agent.act(observation, deterministic=True)[0]
            observation, reward, finished, _ = env.step(greedy_action)
            episode_return += reward

        returns[episode] = episode_return

    return np.mean(returns), np.std(returns)

# 3. Training!

In [None]:
def train(env, agent, max_iter, gamma=0.99, lr=3e-4, lam=0.95, delta=1e-3, steps_per_epoch=10000, eval_interval=10000, snapshot_interval=10000):
    """Train `agent` with TRPO by alternating data collection and policy/critic updates.

    Each epoch collects `steps_per_epoch` transitions with the current stochastic policy,
    stores them in an OnPolicyMemory and then calls `update` once. Evaluation results are
    written to ./learning_curves/res.csv and snapshots to ./snapshots/.

    Args:
        env: training environment; must expose `_max_episode_steps` and `unwrapped.spec.id`.
        agent: TRPO agent providing `act`, `pi` and `V`.
        max_iter: total number of environment steps; `max_iter // steps_per_epoch` epochs are run.
        gamma: passed to OnPolicyMemory (discount factor).
        lr: learning rate of the critic's Adam optimizer.
        lam: passed to OnPolicyMemory (GAE lambda).
        delta: trust-region size forwarded to `update`.
        steps_per_epoch: number of transitions collected per epoch (also the memory size).
        eval_interval: evaluate every this many environment steps.
        snapshot_interval: save a snapshot every this many environment steps.

    Returns:
        None.
    """
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]
    max_ep_len = env._max_episode_steps
    memory = OnPolicyMemory(obs_dim, act_dim, gamma, lam, lim=steps_per_epoch)
    # Separate environment instance so evaluation never disturbs the training episode.
    test_env = gym.make(env.unwrapped.spec.id)
    critic_optim = Adam(agent.V.parameters(), lr=lr)

    save_path = './snapshots/'
    os.makedirs(save_path, exist_ok=True)
    os.makedirs('./learning_curves/', exist_ok=True)

    num_epochs = max_iter // steps_per_epoch
    total_t = 0
    begin = time.time()

    # The context manager closes the log even when training is interrupted (e.g. KeyboardInterrupt).
    with open('./learning_curves/res.csv', 'w', encoding='utf-8', newline='') as log_file:
        logger = csv.writer(log_file)

        for epoch in range(num_epochs):
            # start agent-env interaction
            state = env.reset()
            step_count = 0

            for t in range(steps_per_epoch):
                # Collect one transition by executing the current stochastic policy.
                action, log_prob, v = agent.act(state)
                next_state, reward, done, _ = env.step(action)

                memory.append(state, action, reward, v, log_prob)

                step_count += 1

                timed_out = step_count == max_ep_len
                epoch_ended = t == steps_per_epoch - 1

                if done and not timed_out:
                    # The agent reached a genuine terminal state: nothing follows, so the
                    # bootstrap value is zero. This is checked first so it also holds when the
                    # terminal step happens to be the last step of the epoch.
                    memory.compute_values(0.0)
                elif timed_out or epoch_ended:
                    # Trajectory cut by the episode time limit or by the memory size:
                    # bootstrap from the critic's estimate of the next state.
                    s_last = torch.tensor(next_state, dtype=torch.float).to(device)
                    with torch.no_grad():
                        v_last = agent.V(s_last).item()
                    memory.compute_values(v_last)

                state = next_state

                if done:
                    state = env.reset()
                    step_count = 0

                if total_t % eval_interval == 0:
                    avg_score, std_score = evaluate(agent, test_env, num_episodes=5)
                    elapsed_t = time.time() - begin
                    print('[elapsed time : {:.1f}s| iter {}] score = {:.2f}'.format(elapsed_t, total_t, avg_score), u'\u00B1', '{:.4f}'.format(std_score))
                    # Log the global step count: `t` restarts at 0 every epoch, so it cannot
                    # serve as the x-axis of a learning curve.
                    logger.writerow([total_t, avg_score, std_score])
                    # Push the row to disk right away so partial results survive a crash.
                    log_file.flush()

                if total_t % snapshot_interval == 0:
                    snapshot_path = save_path + 'iter{}_'.format(total_t)
                    # save weight & training progress
                    save_snapshot(agent, snapshot_path)

                total_t += 1

            # train agent at the end of each epoch
            update(agent, memory, critic_optim, delta, num_updates=1)

    return

In [None]:
# Let's move to a robotic environment!
# pybullet is installed here because the next cell imports `pybullet_envs` and uses a Bullet-based task.
!pip install pybullet

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pybullet
  Downloading pybullet-3.2.5-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl (91.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.7/91.7 MB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pybullet
Successfully installed pybullet-3.2.5


In [None]:
import pybullet_envs

# Build the Hopper task (the environment id becomes available through the `pybullet_envs` import above).
env_id = 'HopperBulletEnv-v0'
env = gym.make(env_id)

# Read the size of the observation and action vectors from the first entry of each space's shape.
obs_dim, act_dim = env.observation_space.shape[0], env.action_space.shape[0]
print('observation space dim. : {} / action space dim. : {}'.format(obs_dim, act_dim))

observation space dim. : 15 / action space dim. : 3


  logger.warn(
  deprecation(
  deprecation(


In [None]:
# Build the TRPO agent; actor and critic each get two hidden layers of 128 units.
agent = TRPOAgent(obs_dim, act_dim, hidden1=128, hidden2=128)

In [None]:
# Sanity check: is the first parameter tensor of the actor stored on a CUDA device?
next(agent.pi.parameters()).is_cuda

True

In [None]:
# Long TRPO run: 20M environment steps in epochs of 10k steps, evaluated every 500k steps.
# `snapshot_interval` is not passed, so train() uses its default of 10000 steps between snapshots.
train(env, agent, max_iter=20000000, gamma=0.99, lr=5e-4, lam=0.95, delta=1e-3, steps_per_epoch=10000, eval_interval=500000)

[elapsed time : 0.7s| iter 0] score = 46.02 ± 14.9625
[elapsed time : 1029.7s| iter 500000] score = 23.36 ± 0.4444


KeyboardInterrupt: ignored

# 4. Watch how your agent solves the task!

In [None]:
import gym
from gym.wrappers.record_video import RecordVideo
import os
from IPython.display import HTML
from base64 import b64encode

In [None]:
import matplotlib
import matplotlib.animation as animation
import matplotlib.pyplot as plt
from IPython.display import HTML

def display_video(frames, framerate=30):
    """Turn a sequence of image frames into an inline HTML5 video.

    Args:
        frames: non-empty sequence of arrays whose shape unpacks as (height, width, channels);
            the first frame determines the figure size.
        framerate: playback speed in frames per second.

    Returns:
        An IPython `HTML` object embedding the rendered animation.
    """
    height, width, _ = frames[0].shape
    dpi = 70
    orig_backend = matplotlib.get_backend()
    matplotlib.use('Agg')  # Switch to headless 'Agg' to inhibit figure rendering.
    try:
        fig, ax = plt.subplots(1, 1, figsize=(width / dpi, height / dpi), dpi=dpi)
    finally:
        # Restore the caller's backend even if figure creation fails.
        matplotlib.use(orig_backend)

    try:
        # Make the image fill the whole figure: no axes, no margins.
        ax.set_axis_off()
        ax.set_aspect('equal')
        ax.set_position([0, 0, 1, 1])
        im = ax.imshow(frames[0])

        def _draw_frame(frame):
            # Swap the pixel data in place; returning the artist is required for blitting.
            im.set_data(frame)
            return [im]

        interval = 1000 / framerate  # delay between frames in milliseconds
        anim = animation.FuncAnimation(fig=fig, func=_draw_frame, frames=frames,
                                       interval=interval, blit=True, repeat=False)
        video_html = anim.to_html5_video()
    finally:
        # The figure is only scaffolding for the encoder; close it so repeated calls
        # do not accumulate open figures in the kernel.
        plt.close(fig)

    return HTML(video_html)

In [None]:
# Roll out one deterministic episode with the saved expert weights and replay it inline.
env = gym.make('HopperBulletEnv-v0')
os.makedirs('./video',exist_ok=True)

# Overwrite the agent's weights with the snapshot stored on disk.
load_model(agent, './snapshots/hopper_expert.tar', device)

frames = []
score = 0.
obs = env.reset()
done = False

while not done:
    # Grab the frame for the current state before stepping the simulation.
    frames.append(env.render(mode='rgb_array'))
    action = agent.act(obs, deterministic=True)[0]
    obs, rew, done, _ = env.step(action)
    score += rew

print('score : ', score)
display_video(frames=frames, framerate=30)

# Proximal Policy Optimization

In contrast to TRPO, PPO uses the following simple $1^{\text{st}}$-order objective!
\begin{equation*}
L(\phi) \approx \frac{1}{N} \sum_{i = 1}^N \min\left( r_i(\phi)\hat A_i, \text{clip}(r_i(\phi), 1 - \varepsilon, 1 + \varepsilon) \hat A_i  \right).
\end{equation*}
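Here $r_i(\phi) = \pi_{\phi}(s_i, a_i) / \pi_{\phi_{\text{old}}}(s_i, a_i)$ is the probability ratio and $\varepsilon$ is the clipping range. Whereas TRPO required a complex parameter update, here we simply build the loss above and optimize it with one of the standard optimizers provided by PyTorch.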

In [None]:
from ppo import *

In [None]:
# PPOAgent is not defined in this notebook; presumably it comes from ppo.py via the star import above.
# Same hidden sizes as the TRPO agent, with the device passed explicitly.
ppo_agent = PPOAgent(obs_dim, act_dim, hidden1=128, hidden2=128, device=device)

In [None]:

def ppo_update(agent, memory, optimizer, epsilon, num_updates=1, device='cuda'):
    """Run `num_updates` gradient steps on the clipped PPO loss for the batch held in `memory`.

    The minimised loss is

        -mean(min(r * A, clip(r, 1 - epsilon, 1 + epsilon) * A))
        + 0.5 * mean((V(s) - target_v)^2)
        - 0.01 * mean(entropy)

    where r = exp(log_prob - old_log_prob) is the probability ratio between the current
    policy and the policy that collected the data.

    Args:
        agent: object exposing `pi.compute_log_prob(states, actions) -> (log_probs, entropy)`
            and a value network `V(states)`.
        memory: object whose `load()` returns a dict with the keys
            'state', 'action', 'val', 'A' and 'log_prob'.
        optimizer: optimizer holding the parameters to be trained.
        epsilon: clipping range of the probability ratio.
        num_updates: number of gradient steps taken on the same batch.
        device: device the batch tensors are moved to.

    Returns:
        None; parameters are updated in place through `optimizer`.
    """
    batch = memory.load()
    states = torch.Tensor(batch['state']).to(device)
    actions = torch.Tensor(batch['action']).to(device)
    target_v = torch.Tensor(batch['val']).to(device)
    A = torch.Tensor(batch['A']).to(device)
    old_log_probs = torch.Tensor(batch['log_prob']).to(device)

    for _ in range(num_updates):
        ##########################
        # clipped policy objective #
        ##########################
        log_probs, ent = agent.pi.compute_log_prob(states, actions)

        # compute prob ratio
        # $\frac{\pi(a_t | s_t ; \theta)}{\pi(a_t | s_t ; \theta_\text{old})}$
        # (difference of log-probabilities, then exp)
        r = torch.exp(log_probs - old_log_probs)
        # construct clipped loss
        # $r^\text{clipped}_t(\theta) = \text{clip}(r_t(\theta), 1 - \epsilon, 1 + \epsilon)$
        clipped_r = torch.clamp(r, 1.0 - epsilon, 1.0 + epsilon)
        # surrogate objective for each $t$
        # $\min \{ r_t(\theta) \hat{A}_t, r^\text{clipped}_t(\theta) \hat{A}_t \}$
        single_step_obj = torch.min(r * A, clipped_r * A)
        # The objective is maximised, so its negative is the loss.
        pi_loss = -torch.mean(single_step_obj)

        ################
        # train critic #
        ################
        # Flatten both sides so an (N, 1) prediction against an (N,) target cannot
        # silently broadcast into an (N, N) matrix.
        v = agent.V(states).reshape(-1)
        V_loss = torch.mean((v - target_v.reshape(-1)) ** 2)
        ent_bonus = torch.mean(ent)

        # Joint loss: policy term + weighted value regression - entropy bonus.
        loss = pi_loss + 0.5 * V_loss - 0.01 * ent_bonus
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    return

In [None]:
# Two independent copies of the task are created and both passed to ppo_train.
env = gym.make('HopperBulletEnv-v0')
test_env = gym.make('HopperBulletEnv-v0')
# ppo_train is not defined in this notebook; presumably it is provided by ppo.py via the star import.
# NOTE(review): `delta` is the TRPO trust-region size - confirm that ppo_train actually uses it.
ppo_train(env, test_env, ppo_agent, max_iter=500000, gamma=0.99, lr=3e-4, lam=0.95, delta=1e-3, epsilon=0.2, steps_per_epoch=10000, eval_interval=10000)