<a href="https://colab.research.google.com/github/donghuna/AI-Expert/blob/main/%EC%96%91%EC%9D%B8%EC%88%9C/ddpg_full.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Day 3. DDPG

If you are running this notebook in a local Jupyter environment instead of Colab, set

```
colab = False
```




In [None]:
colab = True
if colab:
    !pip install swig
    !pip install gym==0.21 pyvirtualdisplay > /dev/null 2>&1
    !apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1
    !apt-get update > /dev/null 2>&1
    !apt-get install cmake > /dev/null 2>&1
    !pip install --upgrade setuptools 2>&1
    !pip install ez_setup > /dev/null 2>&1
    !pip3 install box2d-py
    !pip3 install gym[Box_2D]
    !pip install pygame
    !pip install pyglet==1.5.27

Collecting box2d-py
  Using cached box2d-py-2.3.8.tar.gz (374 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: box2d-py
  Building wheel for box2d-py (setup.py) ... [?25l[?25hdone
  Created wheel for box2d-py: filename=box2d_py-2.3.8-cp310-cp310-linux_x86_64.whl size=2376099 sha256=b042db511b48afa517fbf5b41618113097bf53a443d5db5c2a1e0a8c998059e3
  Stored in directory: /root/.cache/pip/wheels/47/01/d2/6a780da77ccb98b1d2facdd520a8d10838a03b590f6f8d50c0
Successfully built box2d-py
Installing collected packages: box2d-py
Successfully installed box2d-py-2.3.8
Collecting pyglet==1.5.27
  Downloading pyglet-1.5.27-py3-none-any.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyglet
Successfully installed pyglet-1.5.27


In [None]:
# Colab only: mount Google Drive and move into the course folder so that the
# local helper modules imported below (buffer.py, utils.py) are found.
if colab:
    from google.colab import drive
    drive.mount('/content/drive')

    # NOTE(review): this path mirrors the author's Drive layout — adjust it to
    # wherever buffer.py / utils.py live in your own Drive.
    %cd /content/drive/MyDrive/day3_ddpg
    !ls

Mounted at /content/drive
/content/drive/MyDrive/day3_ddpg
buffer.py  ddpg_full.ipynb  __pycache__  utils.py


In [None]:
import torch
import torch.nn as nn
from torch.nn import MSELoss
import torch.nn.functional as F
import copy
import os
import numpy as np
from tqdm import tqdm
import torch
from torch.optim import Adam
from buffer import ReplayBuffer
from utils import save_snapshot, recover_snapshot, load_model
import gym

In [None]:
# Run on the GPU whenever one is visible to PyTorch, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print('current device =', device)

current device = cuda


# 0. Define Q-network & policy-network

In [None]:
# critic network definition
# multi-layer perceptron (with 2 hidden layers)
class Critic(nn.Module):
    """Q-network: maps an (observation, action) pair to one scalar value.

    Args:
        obs_dim: size of a single observation vector.
        act_dim: size of a single action vector.
        hidden1: width of the first hidden layer.
        hidden2: width of the second hidden layer.
    """

    def __init__(self, obs_dim, act_dim, hidden1, hidden2):
        super().__init__()
        # observation and action are concatenated, hence the summed input width
        joint_dim = obs_dim + act_dim
        self.fc1 = nn.Linear(joint_dim, hidden1)
        self.fc2 = nn.Linear(hidden1, hidden2)
        self.fc3 = nn.Linear(hidden2, 1)

    def forward(self, obs, act):
        """Return the value estimate for each (obs, act) row; output has one column."""
        joint = torch.cat((obs, act), dim=1)
        hidden = torch.relu(self.fc1(joint))
        hidden = torch.relu(self.fc2(hidden))
        # no activation on the last layer: the value estimate is unbounded
        return self.fc3(hidden)


# actor network definition
# multi-layer perceptron (with 2 hidden layers)
class Actor(nn.Module):
    """Deterministic policy network: maps an observation to a bounded action.

    Args:
        obs_dim: size of a single observation vector.
        act_dim: size of a single action vector.
        ctrl_range: scale applied to the tanh output, so every action
            component lies in [-ctrl_range, +ctrl_range].
        hidden1: width of the first hidden layer.
        hidden2: width of the second hidden layer.
    """

    def __init__(self, obs_dim, act_dim, ctrl_range, hidden1, hidden2):
        super().__init__()
        self.fc1 = nn.Linear(obs_dim, hidden1)
        self.fc2 = nn.Linear(hidden1, hidden2)
        self.fc3 = nn.Linear(hidden2, act_dim)
        self.ctrl_range = ctrl_range

    def forward(self, obs):
        """Return the action for each observation row."""
        hidden = torch.relu(self.fc1(obs))
        hidden = torch.relu(self.fc2(hidden))

        # TODO_1: Return proper action (-ctrl_range ~ +ctrl_range) as an output of the actor network.
        # Hint: Use self.ctrl_range and tanh()!
        # tanh squashes to (-1, 1); scaling stretches that to the control range
        squashed = torch.tanh(self.fc3(hidden))
        return self.ctrl_range * squashed

# 1. Define DDPG agent

In [None]:
class DDPGAgent:
    """Bundles the actor (policy) and critic (Q) networks of a DDPG agent.

    Both networks are created on the notebook-level `device`.
    """

    def __init__(self, obs_dim, act_dim, ctrl_range, hidden1, hidden2):
        super().__init__()
        # networks
        self.actor = Actor(obs_dim, act_dim, ctrl_range, hidden1, hidden2).to(device)
        self.critic = Critic(obs_dim, act_dim, hidden1, hidden2).to(device)

    def act(self, obs):
        """Return the deterministic (noise-free) action for one observation.

        Args:
            obs: a single observation as a numpy array (no batch axis).

        Returns:
            The action as a numpy array, again without a batch axis.
        """
        # the network expects a batch, so wrap the observation in a batch of one
        batched_obs = obs[None, ...]
        with torch.no_grad():
            # TODO_2 : Get an action from policy network (self.actor)
            act_tensor = self.actor(torch.Tensor(batched_obs).to(device))

        # back to numpy, dropping the batch axis added above
        return act_tensor.cpu().detach().numpy().squeeze(axis=0)

## 1.1. Test

In [None]:
# Smoke test: a small, untrained agent should return one action vector of
# length act_dim whose entries lie within [-ctrl_range, ctrl_range].
agent = DDPGAgent(obs_dim=4, act_dim=2, ctrl_range=3, hidden1=32, hidden2=32)
action = agent.act(np.asarray([3., -1., 2., -5.]))
print(action)

[-1.1819849  0.6899658]


# 2. Implement one-step param update

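Actor update? \\
First, observe that by the chain rule
\begin{equation*}
\nabla_{\phi} Q_{\theta}(s_t, \mu_{\phi}(s_t)) =  \nabla_{\phi} \mu_{\phi}(s_t) \cdot \nabla_{a} Q_{\theta}(s_t, a)|_{a={\mu_{\phi}(s_t)}}.
\end{equation*}
Thus, for a mini-batch of states $\{s_i\}_{i=1}^N$ sampled from the replay buffer, we have
\begin{equation*}
\nabla_\phi J(\phi) \approx \frac{1}{N}\sum_{i = 1}^N \nabla_{\phi} \mu_{\phi}(s_i) \cdot \nabla_{a} Q_{\theta}(s_i, a)|_{a={\mu_{\phi}(s_i)}}  = \nabla_\phi\left( \frac{1}{N}\sum_{i = 1}^N Q_{\theta}(s_i, \mu_{\phi}(s_i)) \right).
\end{equation*}
In other words, we can let autograd differentiate the batch mean of $Q_{\theta}(s_i, \mu_{\phi}(s_i))$ with respect to $\phi$; since optimizers minimize, the actor loss is the negative of this mean.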

In [None]:
def update(agent, replay_buf, gamma, actor_optim, critic_optim, target_actor, target_critic, tau, batch_size):
    """Perform one DDPG update: a critic step, an actor step, then a soft target update.

    Args:
        agent: object exposing the networks to train as `agent.actor` / `agent.critic`.
        replay_buf: buffer whose `sample_batch(batch_size=...)` returns an object
            with `obs`, `act`, `next_obs`, `rew` and `done` fields.
        gamma: discount factor.
        actor_optim / critic_optim: torch optimizers for the actor / critic.
        target_actor / target_critic: target networks; they are only read here
            and then moved towards the online networks.
        tau: interpolation factor of the soft target update
            (target <- (1 - tau) * target + tau * online).
        batch_size: number of transitions sampled for this update.

    Returns:
        None. The networks and optimizers are modified in place.
    """
    batch = replay_buf.sample_batch(batch_size=batch_size)

    # Place the batch on the device the networks actually live on, so the
    # function does not depend on a notebook-level global.
    dev = next(agent.critic.parameters()).device

    def _as_float(x):
        return torch.as_tensor(x, dtype=torch.float32, device=dev)

    # unroll batch
    obs = _as_float(batch.obs)
    act = _as_float(batch.act)
    next_obs = _as_float(batch.next_obs)
    # The critic outputs one column per sample. Forcing rew / done into the same
    # column shape prevents a silent (N,) + (N, 1) -> (N, N) broadcast when the
    # buffer hands back 1-D arrays; arrays that are already (N, 1) are unchanged.
    rew = _as_float(batch.rew).reshape(-1, 1)
    done = _as_float(batch.done).reshape(-1, 1)

    ################
    # train critic #
    ################
    # target construction does not need backward ftns
    with torch.no_grad():
        # no bootstrapping from terminal transitions
        mask = 1. - done

        # TODO_3: Calculate target_q := r + gamma * Q_target(s', a')
        next_q = target_critic(next_obs, target_actor(next_obs))
        target = rew + gamma * mask * next_q

    out = agent.critic(obs, act)

    # TODO_4 : Build critic MSELoss by yourself!
    # Hint : Use torch.mean().
    critic_loss = torch.mean((out - target) ** 2)

    critic_optim.zero_grad()
    critic_loss.backward()
    critic_optim.step()

    ###############
    # train actor #
    ###############

    # Freeze the critic during the actor step: the actor loss is back-propagated
    # *through* the critic, but only the actor's weights should receive gradients.
    for p in agent.critic.parameters():
        p.requires_grad_(False)

    try:
        # TODO_5 : Construct the actor loss. (Warning: sign of the loss?)
        # We want to maximise Q(s, mu(s)), and optimizers minimise -> negate.
        actor_loss = -torch.mean(agent.critic(obs, agent.actor(obs)))

        actor_optim.zero_grad()
        actor_loss.backward()
        actor_optim.step()
    finally:
        # unfreeze critic after actor training, even if the actor step raised
        for p in agent.critic.parameters():
            p.requires_grad_(True)

    # soft target update (both actor & critic network);
    # no_grad keeps the interpolation out of the autograd graph
    with torch.no_grad():
        for p, targ_p in zip(agent.actor.parameters(), target_actor.parameters()):
            targ_p.copy_((1. - tau) * targ_p + tau * p)
        for p, targ_p in zip(agent.critic.parameters(), target_critic.parameters()):
            targ_p.copy_((1. - tau) * targ_p + tau * p)

In [None]:
def evaluate(agent, env, num_episodes=5):
    """Return the average undiscounted episode return of `agent` on `env`.

    The agent acts through `agent.act` (no exploration noise is added here).
    Each episode runs until the environment reports `done`.

    Args:
        agent: object with an `act(obs)` method.
        env: environment with `reset()` and a 4-tuple `step(action)`.
        num_episodes: number of episodes to average over.
    """

    def run_episode():
        # play a single episode and hand back its summed reward
        state = env.reset()
        episode_return = 0.
        finished = False
        while not finished:
            state, reward, finished, _ = env.step(agent.act(state))
            episode_return += reward
        return episode_return

    total_return = 0.
    for _ in range(num_episodes):
        total_return += run_episode()

    return total_return / num_episodes

# 3. Combining these, we finally have...

In [None]:
def train(agent, env, gamma,
          actor_lr, critic_lr, tau, noise_std,
          ep_len, num_updates, batch_size,
          init_buffer=5000, buffer_size=100000,
          start_train=2000, train_interval=50,
          eval_interval=2000, snapshot_interval=10000, path=None):
    """Train `agent` on `env` with DDPG.

    The loop interacts with the environment for `num_updates + 1` steps. Along
    the way it fills a replay buffer, periodically evaluates the agent on a
    copy of the environment, runs gradient updates and writes snapshots under
    './snapshots/'.

    Args:
        agent: object exposing `actor`, `critic` and `act(obs)`.
        env: gym-style environment with a 4-tuple `step`.
        gamma: discount factor.
        actor_lr / critic_lr: Adam learning rates for the actor / critic.
        tau: soft target update factor passed on to `update`.
        noise_std: standard deviation of the Gaussian exploration noise.
        ep_len: maximum number of steps per training episode.
        num_updates: index of the last environment step of the loop.
        batch_size: mini-batch size passed on to `update`.
        init_buffer: number of initial steps taken with uniformly sampled actions.
        buffer_size: replay buffer capacity.
        start_train: no gradient update happens at or before this step.
        train_interval: updates run every `train_interval` steps, and
            `train_interval` updates are performed each time.
        eval_interval: evaluate (5 episodes) every this many steps.
        snapshot_interval: save a snapshot every this many steps.
        path: if given, networks and optimizers are restored from this
            snapshot before training starts.
    """
    # target networks start out as exact copies of the online networks
    target_actor = copy.deepcopy(agent.actor)
    target_critic = copy.deepcopy(agent.critic)

    # TODO_6: Freeze target networks.
    # They are only ever changed by the soft update, never by an optimizer.
    for target_net in (target_actor, target_critic):
        for weight in target_net.parameters():
            weight.requires_grad_(False)

    actor_optim = Adam(agent.actor.parameters(), lr=actor_lr)
    critic_optim = Adam(agent.critic.parameters(), lr=critic_lr)

    # load snapshot
    if path is not None:
        recover_snapshot(path, agent.actor, agent.critic,
                         target_actor, target_critic,
                         actor_optim, critic_optim,
                         device=device)

    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]
    # first component of the upper action bound; used as a symmetric clip limit below
    ctrl_range = env.action_space.high[0]

    replay_buf = ReplayBuffer(obs_dim, act_dim, buffer_size)

    save_path = './snapshots/'
    os.makedirs(save_path, exist_ok=True)

    # separate copy for evaluation so that it never disturbs the training episode
    test_env = copy.deepcopy(env)

    # main loop
    obs = env.reset()
    step_count = 0
    ep = 0
    for t in range(num_updates + 1):
        if t >= init_buffer:
            # TODO_7: Execute noisy action.
            # a_t = \pi(s_t) + N(0, \sigma^2)
            policy_action = agent.act(obs)
            noise = noise_std * np.random.randn(act_dim)
            action = np.clip(policy_action + noise, -ctrl_range, ctrl_range)
        else:
            # purely random actions until enough samples are collected (exploration)
            action = env.action_space.sample()

        next_obs, rew, done, _ = env.step(action)
        step_count += 1

        hit_step_limit = (step_count == ep_len)
        if hit_step_limit:
            # reaching the step limit is not a true terminal state, so the
            # stored transition must still bootstrap from next_obs
            done = False

        replay_buf.append(obs, action, next_obs, rew, done)
        obs = next_obs

        if done or hit_step_limit:
            # start a new episode after a terminal state or once the step limit is hit
            obs = env.reset()
            step_count = 0
            ep += 1

        if t % eval_interval == 0:
            avg_score = evaluate(agent, test_env, num_episodes=5)
            print('[iter {} / ep {}] average score = {:.4f} (over 5 episodes)'.format(t, ep, avg_score))

        if t > start_train and t % train_interval == 0:
            # training starts only after a fixed number of steps, which may
            # mitigate overfitting to the few samples collected early on;
            # one gradient update is performed per environment step taken
            for _ in range(train_interval):
                update(agent, replay_buf, gamma,
                       actor_optim, critic_optim,
                       target_actor, target_critic,
                       tau, batch_size)

        if t % snapshot_interval == 0:
            # save weight & training progress
            snapshot_path = save_path + 'iter{}_'.format(t)
            save_snapshot(snapshot_path, agent.actor, agent.critic,
                          target_actor, target_critic,
                          actor_optim, critic_optim)

# 4. Let's test the code!

In [None]:
# Build the environment and read the sizes the networks need from its spaces.
env = gym.make('LunarLanderContinuous-v2')
obs_dim, act_dim = env.observation_space.shape[0], env.action_space.shape[0]
# first component of the upper action bound
ctrl_range = env.action_space.high[0]

print('observation space dim : {} / action space dim : {}'.format(obs_dim, act_dim))
print('ctrl range : ', ctrl_range)

observation space dim : 8 / action space dim : 2
ctrl range :  1.0


  deprecation(
  deprecation(


In [None]:
# The agent to be trained: two hidden layers of 256 units in both actor and critic.
agent = DDPGAgent(
    obs_dim=obs_dim,
    act_dim=act_dim,
    ctrl_range=ctrl_range,
    hidden1=256,
    hidden2=256,
)

In [None]:
# Hyper-parameters handed to train() below.
gamma = 0.99            # discount factor
actor_lr = 0.0001       # Adam learning rate of the actor
critic_lr = 0.001       # Adam learning rate of the critic
tau = 0.001             # soft target update factor
noise_std = 0.1         # std of the Gaussian exploration noise
ep_len = 1_000          # maximum number of steps per training episode
num_updates = 300_000   # index of the last environment step of the training loop
batch_size = 128        # mini-batch size per gradient update

In [None]:
# Launch training; the agent is evaluated every 5000 environment steps.
train(
    agent=agent, env=env, gamma=gamma,
    actor_lr=actor_lr, critic_lr=critic_lr, tau=tau, noise_std=noise_std,
    ep_len=ep_len, num_updates=num_updates, batch_size=batch_size,
    init_buffer=5000, buffer_size=100000,
    start_train=2000, train_interval=50,
    eval_interval=5000,
)

  if not isinstance(terminated, (bool, np.bool8)):


[iter 0 / ep 0] average score = -70.0358 (over 5 episodes)
[iter 5000 / ep 44] average score = -329.0141 (over 5 episodes)
[iter 10000 / ep 63] average score = -331.6033 (over 5 episodes)
[iter 15000 / ep 80] average score = -241.9629 (over 5 episodes)
[iter 20000 / ep 101] average score = -63.6437 (over 5 episodes)
[iter 25000 / ep 121] average score = -85.0020 (over 5 episodes)
[iter 30000 / ep 145] average score = -121.4315 (over 5 episodes)
[iter 35000 / ep 173] average score = -144.2862 (over 5 episodes)
[iter 40000 / ep 201] average score = -170.5884 (over 5 episodes)
[iter 45000 / ep 230] average score = -72.3294 (over 5 episodes)
[iter 50000 / ep 270] average score = -108.0746 (over 5 episodes)
[iter 55000 / ep 292] average score = -60.6794 (over 5 episodes)
[iter 60000 / ep 322] average score = -130.7457 (over 5 episodes)
[iter 65000 / ep 336] average score = -12.3601 (over 5 episodes)
[iter 70000 / ep 343] average score = -17.4470 (over 5 episodes)
[iter 75000 / ep 348] avera

# 5. Watch the trained agent!

In [None]:
!pip install pyvirtualdisplay
!apt-get install xvfb

  and should_run_async(code)


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  libfontenc1 libxfont2 libxkbfile1 x11-xkb-utils xfonts-base xfonts-encodings xfonts-utils
  xserver-common
The following NEW packages will be installed:
  libfontenc1 libxfont2 libxkbfile1 x11-xkb-utils xfonts-base xfonts-encodings xfonts-utils
  xserver-common xvfb
0 upgraded, 9 newly installed, 0 to remove and 48 not upgraded.
Need to get 7,813 kB of archives.
After this operation, 11.9 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/main amd64 libfontenc1 amd64 1:1.1.4-1build3 [14.7 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/main amd64 libxfont2 amd64 1:2.0.5-1build1 [94.5 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy/main amd64 libxkbfile1 amd64 1:1.1.0-1build3 [71.8 kB]
Get:4 http://archive.ubuntu.com/ubuntu jammy/main amd64 x11-xkb-utils amd64 7.7+5build4 [172 kB]
Get:5 http://archiv

In [None]:
# Colab only: start a virtual display and define helpers to record an episode
# to ./video and play it back inline.
if colab:
    import os
    import gym
    from gym.wrappers.record_video import RecordVideo
    import glob
    import io
    import base64
    from IPython.display import HTML
    from pyvirtualdisplay import Display
    from IPython import display as ipythondisplay

    # headless X display so that rendering works without a physical screen
    display = Display(visible=0, size=(1400, 900))
    display.start()

    def show_video():
        """Embed the most recently written mp4 from ./video into the notebook.

        Prints a short message instead if no recording exists.
        """
        mp4list = glob.glob('video/*.mp4')
        if not mp4list:
            print("Could not find video")
            return

        # glob order is arbitrary; pick the newest file so that a stale
        # recording from an earlier run is never shown by accident
        mp4 = max(mp4list, key=os.path.getmtime)
        # read-only, and closed deterministically by the context manager
        with io.open(mp4, 'rb') as video_file:
            encoded = base64.b64encode(video_file.read())
        ipythondisplay.display(HTML(data='''<video alt="test" autoplay
                    loop controls style="height: 400px;">
                    <source src="data:video/mp4;base64,{0}" type="video/mp4" />
                </video>'''.format(encoded.decode('ascii'))))

    def wrap_env(env):
        """Wrap `env` so that its episodes are recorded to ./video."""
        return RecordVideo(env, './video')

    # NOTE(review): the following cell creates a fresh env and wraps it again,
    # so this wrap of the training env looks redundant — confirm before removing.
    env = wrap_env(env)

  deprecation(


In [None]:
# Roll out one seeded episode with the trained agent (no exploration noise),
# print its total reward and, on Colab, replay the recorded video.
env = gym.make('LunarLanderContinuous-v2')
if colab:
    # record the episode so that it can be played back inline afterwards
    env = wrap_env(env)

obs, done, score = env.reset(seed=22), False, 0.
while True:
    if done:
        break
    env.render(mode='human')
    obs, rew, done, _ = env.step(agent.act(obs))
    score += rew

print('score : ', score)
env.close()

if colab:
    show_video()

  deprecation(
  deprecation(
  logger.warn(
  logger.deprecation(
See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(
  if not isinstance(terminated, (bool, np.bool8)):


score :  -63.44120538742763
