In [1]:
# !apt-get install -y \
#     libgl1-mesa-dev \
#     libgl1-mesa-glx \
#     libglew-dev \
#     libosmesa6-dev \
#     software-properties-common

# !apt-get install -y patchelf

# !apt-get update --fix-missing
# !pip install stable-baselines3
# !pip install mujoco
# !pip install  --upgrade gymnasium==0.29
# !pip install free-mujoco-py

In [2]:
import os
import time
import wandb
import random

import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter
from stable_baselines3.common.buffers import ReplayBuffer

In [3]:
def make_env(env_id, seed, idx, capture_video, run_name):
    """Build a zero-argument factory for one wrapped environment.

    Args:
        env_id: Environment id handed to ``gym.make``.
        seed: Seed applied to the environment's action and observation spaces.
        idx: Index of this environment; only index 0 is ever recorded.
        capture_video: When truthy (and ``idx == 0``) episodes are recorded
            into ``videos/{run_name}``.
        run_name: Name of the run, used as the video sub-directory.

    Returns:
        A callable that creates the environment when invoked.
    """
    def thunk():
        record_this_env = capture_video and idx == 0
        if not record_this_env:
            base_env = gym.make(env_id)
        else:
            # Video recording needs frames, hence the rgb_array render mode.
            base_env = gym.wrappers.RecordVideo(
                gym.make(env_id, render_mode="rgb_array"),
                f"videos/{run_name}",
            )

        # Episode statistics are attached regardless of video capture.
        wrapped = gym.wrappers.RecordEpisodeStatistics(base_env)
        for space in (wrapped.action_space, wrapped.observation_space):
            space.seed(seed)
        return wrapped
    return thunk

In [4]:
class QNetwork(nn.Module):
    """Critic: maps an (observation, action) pair to a single Q-value.

    The layer sizes are derived from ``env.single_observation_space.shape``
    and ``env.single_action_space.shape``; two hidden layers of 256 units
    with ReLU activations feed a scalar output head.
    """

    def __init__(self, env):
        super().__init__()
        obs_dim = np.array(env.single_observation_space.shape).prod()
        act_dim = np.prod(env.single_action_space.shape)
        width = 256
        # The first layer consumes the observation and action concatenated.
        self.fc1 = nn.Linear(obs_dim + act_dim, width)
        self.fc2 = nn.Linear(width, width)
        self.fc3 = nn.Linear(width, 1)

    def forward(self, x, a):
        """Return Q(x, a); ``x`` and ``a`` are concatenated along dim 1."""
        joint = torch.cat([x, a], 1)
        hidden = F.relu(self.fc1(joint))
        hidden = F.relu(self.fc2(hidden))
        return self.fc3(hidden)

In [5]:
class Actor(nn.Module):
    """Deterministic policy: maps observations to actions within the action bounds.

    Two hidden layers of 256 units with ReLU activations feed a tanh head;
    the tanh output in [-1, 1] is rescaled to the environment's action range
    with the ``action_scale`` / ``action_bias`` buffers.

    Args:
        env: Object exposing ``single_observation_space.shape`` and a
            ``single_action_space`` with ``shape``, ``low`` and ``high``.
    """

    def __init__(self, env):
        super().__init__()
        self.fc1 = nn.Linear(np.array(env.single_observation_space.shape).prod(), 256)
        self.fc2 = nn.Linear(256, 256)
        self.fc_mu = nn.Linear(256, np.prod(env.single_action_space.shape))

        # Use the per-environment bounds, not the batched ``env.action_space``:
        # the batched bounds carry a leading num_envs dimension, which would
        # fail to broadcast against batches of any other size (for example a
        # replay-buffer batch) as soon as more than one sub-environment is used.
        bounds = env.single_action_space
        self.register_buffer(
            "action_scale", torch.tensor((bounds.high - bounds.low) / 2.0, dtype=torch.float32)
        )
        self.register_buffer(
            "action_bias", torch.tensor((bounds.high + bounds.low) / 2.0, dtype=torch.float32)
        )

    def forward(self, x):
        """Return the action for observation batch ``x``, scaled to the bounds."""
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        # tanh squashes to [-1, 1]; scale and bias map that onto [low, high].
        x = torch.tanh(self.fc_mu(x))

        return x * self.action_scale + self.action_bias

In [6]:
def train(
    env_id,
    seed,
    total_timesteps,
    learning_rate,
    buffer_size,
    gamma,
    tau,
    batch_size,
    exploration_noise,
    learning_starts,
    policy_frequency,
    noise_clip):
    """Train a DDPG agent on a single environment and log to W&B / TensorBoard.

    Metrics are written to ``runs/{run_name}`` (synced to the
    ``ddpg-mujoco-benchmark`` W&B project) and videos are recorded to
    ``videos/{run_name}`` via ``make_env``.

    Args:
        env_id: Environment id passed to ``make_env``.
        seed: Seed for ``random``, NumPy, PyTorch and the environment.
        total_timesteps: Number of environment steps to run.
        learning_rate: Learning rate of both Adam optimizers (actor and critic).
        buffer_size: Capacity of the replay buffer.
        gamma: Discount factor used in the TD target.
        tau: Interpolation factor of the soft target-network update.
        batch_size: Number of transitions sampled per critic update.
        exploration_noise: Std of the Gaussian action noise, as a fraction of
            the actor's ``action_scale``.
        learning_starts: Steps below this index act with uniformly sampled
            actions; gradient updates run on steps strictly above it.
        policy_frequency: The actor and both target networks are updated on
            steps whose index is a multiple of this value.
        noise_clip: Not used by the update rule; only recorded in the W&B
            run config.

    Returns:
        None.
    """

    run_name = f"{env_id}__{seed}__{int(time.time())}"
    wandb.init(
        project="ddpg-mujoco-benchmark",
        config={
            "env":env_id,
            "seed":seed,
            "timesteps":total_timesteps,
            "lr":learning_rate,
            "buffer_size":buffer_size,
            "gamma":gamma,
            "tau": tau,
            "batch_size": batch_size,
            "exploration_noise":exploration_noise,
            "learning_starts":learning_starts,
            "policy_frequency":policy_frequency,
            "noise_clip":noise_clip,
        },
        sync_tensorboard=True,
        monitor_gym=True,
        name=run_name
    )

    writer = None
    envs = None
    # try/finally so the environment, the writer and the W&B run are released
    # even when training is interrupted or raises.
    try:
        writer = SummaryWriter(f"runs/{run_name}")

        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)

        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        envs = gym.vector.SyncVectorEnv(
            [make_env(env_id, seed, 0, True, run_name)]
        )

        actor = Actor(envs).to(device)
        qf1 = QNetwork(envs).to(device)
        qf1_target = QNetwork(envs).to(device)
        target_actor = Actor(envs).to(device)
        # Target networks start as exact copies of the online networks.
        target_actor.load_state_dict(actor.state_dict())
        qf1_target.load_state_dict(qf1.state_dict())

        q_optimizer = optim.Adam(qf1.parameters(), lr=learning_rate)
        actor_optimizer = optim.Adam(actor.parameters(), lr=learning_rate)

        # Declare observations as float32 before the buffer is built
        # (presumably so stored observations match the networks' dtype).
        envs.single_observation_space.dtype = np.float32
        rb = ReplayBuffer(
            buffer_size,
            envs.single_observation_space,
            envs.single_action_space,
            device,
            handle_timeout_termination=False,
        )

        start_time = time.time()
        obs, _ = envs.reset(seed=seed)

        # The actor is only updated every `policy_frequency` steps, so its loss
        # may not exist yet when the first logging step is reached.
        actor_loss = None

        for global_step in range(total_timesteps):
            if global_step < learning_starts:
                # Warm-up: fill the buffer with uniformly sampled actions.
                actions = np.array([envs.single_action_space.sample() for _ in range(envs.num_envs)])
            else:
                with torch.no_grad():
                    actions = actor(torch.Tensor(obs).to(device))
                    # Expand the std to the action batch shape so every entry
                    # draws its own noise sample whatever the buffer's shape.
                    noise_std = (actor.action_scale * exploration_noise).expand_as(actions)
                    actions += torch.normal(0, noise_std)
                    actions = actions.cpu().numpy().clip(envs.single_action_space.low, envs.single_action_space.high)

            next_obs, rewards, terminated, truncated, infos = envs.step(actions)
            if "final_info" in infos:
                for info in infos["final_info"]:
                    # Skip entries without episode statistics (e.g. None for
                    # sub-environments that did not finish on this step).
                    if info is None or "episode" not in info:
                        continue
                    print(f"global_step={global_step} episodic_return={info['episode']['r']}")
                    writer.add_scalar("charts/episodic_return", info["episode"]["r"], global_step)
                    writer.add_scalar("charts/episodic_length", info["episode"]["l"], global_step)
                    break

            # On truncation, store the episode's true last observation instead
            # of the observation `next_obs` holds for that environment.
            real_next_obs = next_obs.copy()
            for idx, was_truncated in enumerate(truncated):
                if was_truncated:
                    real_next_obs[idx] = infos["final_observation"][idx]
            # Only `terminated` is stored as the done flag, so truncated
            # episodes still bootstrap from their final observation.
            rb.add(obs, real_next_obs, actions, rewards, terminated, infos)

            obs = next_obs

            if global_step > learning_starts:
                data = rb.sample(batch_size)
                with torch.no_grad():
                    # TD target: r + (1 - done) * gamma * Q_target(s', pi_target(s'))
                    next_state_actions = target_actor(data.next_observations)
                    qf1_next_target = qf1_target(data.next_observations, next_state_actions)
                    next_q_value = data.rewards.flatten() + (1 - data.dones.flatten()) * gamma * (qf1_next_target).view(-1)

                qf1_a_values = qf1(data.observations, data.actions).view(-1)
                qf1_loss = F.mse_loss(qf1_a_values, next_q_value)

                q_optimizer.zero_grad()
                qf1_loss.backward()
                q_optimizer.step()

                if global_step % policy_frequency == 0:
                    # Policy gradient step: maximise Q(s, pi(s)).
                    actor_loss = -qf1(data.observations, actor(data.observations)).mean()
                    actor_optimizer.zero_grad()
                    actor_loss.backward()
                    actor_optimizer.step()

                    # Soft update: target <- tau * online + (1 - tau) * target.
                    for param, target_param in zip(actor.parameters(), target_actor.parameters()):
                        target_param.data.copy_(tau * param.data + (1 -tau) * target_param.data)
                    for param, target_param in zip(qf1.parameters(), qf1_target.parameters()):
                        target_param.data.copy_(tau * param.data + (1 -tau) * target_param.data)

                if global_step % 100 == 0:
                    writer.add_scalar("losses/qf1_loss", qf1_loss.item(), global_step)
                    if actor_loss is not None:
                        writer.add_scalar("losses/actor_loss", actor_loss.item(), global_step)
                    writer.add_scalar("losses/qf1_values", qf1_a_values.mean().item(), global_step)
                    writer.add_scalar("charts/SPS", int(global_step/ (time.time()-start_time)), global_step)
    finally:
        if envs is not None:
            envs.close()
        if writer is not None:
            writer.close()
        wandb.finish()


In [7]:
# Environment ids keyed by a short task name.
env = dict(
    hopper="Hopper-v2",
    humanoid="Humanoid-v2",
    halfCheetah="HalfCheetah-v2",
    ant="Ant-v2",
)

# Reproducibility and run length.
seed = 1
total_timesteps = 500_000

# Optimisation.
learning_rate = 3e-5        # shared by the actor and critic optimizers
batch_size = 1024
gamma = 0.99                # discount factor
tau = 5e-3                  # soft target-update coefficient

# Replay buffer and exploration.
buffer_size = 100_000
learning_starts = 25_000    # steps of random actions before learning
exploration_noise = 0.1     # noise std as a fraction of the action scale

# Update schedule.
policy_frequency = 2        # actor/target update interval, in steps
noise_clip = 0.5            # passed to train(); only recorded in the run config

In [8]:
# Launch one training run on Hopper with the hyperparameters defined in the
# configuration cell; keyword arguments keep the long parameter list readable.
train(
    env_id=env["hopper"],
    seed=seed,
    total_timesteps=total_timesteps,
    learning_rate=learning_rate,
    buffer_size=buffer_size,
    gamma=gamma,
    tau=tau,
    batch_size=batch_size,
    exploration_noise=exploration_noise,
    learning_starts=learning_starts,
    policy_frequency=policy_frequency,
    noise_clip=noise_clip,
)

[34m[1mwandb[0m: Currently logged in as: [33mchkda[0m. Use [1m`wandb login --relogin`[0m to force relogin


  logger.deprecation(
  logger.deprecation(
  logger.warn(
  logger.warn(f"{pre} is not within the observation space.")
  logger.warn(
  logger.warn(f"{pre} is not within the observation space.")


Moviepy - Building video /notebooks/rl-algos/videos/Hopper-v2__1__1700234374/rl-video-episode-0.mp4.
Moviepy - Writing video /notebooks/rl-algos/videos/Hopper-v2__1__1700234374/rl-video-episode-0.mp4



                                                            

Moviepy - Done !
Moviepy - video ready /notebooks/rl-algos/videos/Hopper-v2__1__1700234374/rl-video-episode-0.mp4
global_step=12 episodic_return=[10.095653]
Moviepy - Building video /notebooks/rl-algos/videos/Hopper-v2__1__1700234374/rl-video-episode-1.mp4.
Moviepy - Writing video /notebooks/rl-algos/videos/Hopper-v2__1__1700234374/rl-video-episode-1.mp4



                                                            

Moviepy - Done !
Moviepy - video ready /notebooks/rl-algos/videos/Hopper-v2__1__1700234374/rl-video-episode-1.mp4
global_step=33 episodic_return=[24.481318]
global_step=77 episodic_return=[30.460733]
global_step=97 episodic_return=[3.5976954]
global_step=113 episodic_return=[10.385685]
global_step=145 episodic_return=[26.22122]
global_step=167 episodic_return=[9.670132]
global_step=208 episodic_return=[31.524872]
Moviepy - Building video /notebooks/rl-algos/videos/Hopper-v2__1__1700234374/rl-video-episode-8.mp4.
Moviepy - Writing video /notebooks/rl-algos/videos/Hopper-v2__1__1700234374/rl-video-episode-8.mp4



                                                            

Moviepy - Done !
Moviepy - video ready /notebooks/rl-algos/videos/Hopper-v2__1__1700234374/rl-video-episode-8.mp4
global_step=224 episodic_return=[16.408388]
global_step=243 episodic_return=[19.952955]
global_step=258 episodic_return=[12.416902]
global_step=277 episodic_return=[18.6857]
global_step=288 episodic_return=[7.5842714]
global_step=328 episodic_return=[36.625114]
global_step=383 episodic_return=[48.878635]
global_step=395 episodic_return=[10.565069]
global_step=413 episodic_return=[15.761967]
global_step=426 episodic_return=[11.247956]
global_step=446 episodic_return=[16.84083]
global_step=464 episodic_return=[10.773517]
global_step=487 episodic_return=[16.906458]
global_step=509 episodic_return=[21.48977]
global_step=524 episodic_return=[10.177545]
global_step=537 episodic_return=[11.206543]
global_step=561 episodic_return=[11.512118]
global_step=570 episodic_return=[5.5846457]
global_step=598 episodic_return=[16.13907]
Moviepy - Building video /notebooks/rl-algos/videos/Hop

                                                            

Moviepy - Done !
Moviepy - video ready /notebooks/rl-algos/videos/Hopper-v2__1__1700234374/rl-video-episode-27.mp4
global_step=623 episodic_return=[15.609553]
global_step=652 episodic_return=[23.70242]
global_step=671 episodic_return=[14.439957]
global_step=687 episodic_return=[14.2062845]
global_step=705 episodic_return=[9.332891]
global_step=720 episodic_return=[13.764719]
global_step=747 episodic_return=[31.680313]
global_step=765 episodic_return=[19.303547]
global_step=779 episodic_return=[9.934476]
global_step=792 episodic_return=[10.275201]
global_step=821 episodic_return=[10.891268]
global_step=845 episodic_return=[12.897434]
global_step=866 episodic_return=[19.623322]
global_step=882 episodic_return=[8.301266]
global_step=892 episodic_return=[8.716988]
global_step=910 episodic_return=[16.481533]
global_step=923 episodic_return=[11.333176]
global_step=943 episodic_return=[4.940121]
global_step=965 episodic_return=[10.237156]
global_step=977 episodic_return=[10.315764]
global_ste

                                                            

Moviepy - Done !
Moviepy - video ready /notebooks/rl-algos/videos/Hopper-v2__1__1700234374/rl-video-episode-64.mp4
global_step=1313 episodic_return=[8.954852]
global_step=1331 episodic_return=[10.208559]
global_step=1351 episodic_return=[13.304131]
global_step=1380 episodic_return=[42.49586]
global_step=1407 episodic_return=[11.751942]
global_step=1418 episodic_return=[10.132871]
global_step=1439 episodic_return=[13.317439]
global_step=1459 episodic_return=[9.63476]
global_step=1471 episodic_return=[8.857546]
global_step=1496 episodic_return=[26.412758]
global_step=1528 episodic_return=[26.794802]
global_step=1541 episodic_return=[11.026333]
global_step=1557 episodic_return=[13.69577]
global_step=1579 episodic_return=[28.716854]
global_step=1599 episodic_return=[9.024298]
global_step=1614 episodic_return=[9.442001]
global_step=1654 episodic_return=[9.242851]
global_step=1666 episodic_return=[8.428309]
global_step=1691 episodic_return=[23.00464]
global_step=1727 episodic_return=[49.4310

                                                             

Moviepy - Done !
Moviepy - video ready /notebooks/rl-algos/videos/Hopper-v2__1__1700234374/rl-video-episode-125.mp4
global_step=2778 episodic_return=[49.823063]
global_step=2802 episodic_return=[15.493103]
global_step=2831 episodic_return=[30.624313]
global_step=2840 episodic_return=[5.999724]
global_step=2885 episodic_return=[65.253204]
global_step=2898 episodic_return=[9.018899]
global_step=2913 episodic_return=[13.841353]
global_step=2929 episodic_return=[12.966997]
global_step=2979 episodic_return=[43.31653]
global_step=3034 episodic_return=[78.35801]
global_step=3047 episodic_return=[10.714614]
global_step=3065 episodic_return=[6.0605927]
global_step=3079 episodic_return=[9.185012]
global_step=3100 episodic_return=[13.44805]
global_step=3116 episodic_return=[12.42791]
global_step=3131 episodic_return=[12.027919]
global_step=3144 episodic_return=[10.850214]
global_step=3174 episodic_return=[37.69775]
global_step=3197 episodic_return=[7.983907]
global_step=3233 episodic_return=[31.6

                                                            

Moviepy - Done !
Moviepy - video ready /notebooks/rl-algos/videos/Hopper-v2__1__1700234374/rl-video-episode-216.mp4
global_step=4715 episodic_return=[4.7981777]
global_step=4735 episodic_return=[6.5388784]
global_step=4753 episodic_return=[10.49064]
global_step=4762 episodic_return=[7.2037997]
global_step=4784 episodic_return=[19.312824]
global_step=4808 episodic_return=[12.622039]
global_step=4824 episodic_return=[12.65584]
global_step=4840 episodic_return=[12.535695]
global_step=4856 episodic_return=[11.774805]
global_step=4876 episodic_return=[17.433376]
global_step=4903 episodic_return=[28.933664]
global_step=4925 episodic_return=[12.023536]
global_step=4948 episodic_return=[23.719315]
global_step=4968 episodic_return=[17.279522]
global_step=4987 episodic_return=[14.031914]
global_step=5006 episodic_return=[10.307442]
global_step=5043 episodic_return=[44.208046]
global_step=5055 episodic_return=[9.103758]
global_step=5076 episodic_return=[11.801316]
global_step=5108 episodic_return

                                                            

Moviepy - Done !
Moviepy - video ready /notebooks/rl-algos/videos/Hopper-v2__1__1700234374/rl-video-episode-343.mp4
global_step=7600 episodic_return=[15.854401]
global_step=7613 episodic_return=[10.2198515]
global_step=7625 episodic_return=[10.379782]
global_step=7641 episodic_return=[10.612514]
global_step=7665 episodic_return=[28.423704]
global_step=7680 episodic_return=[12.555246]
global_step=7730 episodic_return=[76.6611]
global_step=7752 episodic_return=[16.293312]
global_step=7774 episodic_return=[12.400619]
global_step=7792 episodic_return=[11.926154]
global_step=7815 episodic_return=[11.521602]
global_step=7839 episodic_return=[27.055536]
global_step=7873 episodic_return=[19.595879]
global_step=7962 episodic_return=[93.37685]
global_step=7974 episodic_return=[7.926803]
global_step=8009 episodic_return=[19.147287]
global_step=8038 episodic_return=[11.003211]
global_step=8052 episodic_return=[13.674353]
global_step=8079 episodic_return=[28.103722]
global_step=8092 episodic_return

                                                            

Moviepy - Done !
Moviepy - video ready /notebooks/rl-algos/videos/Hopper-v2__1__1700234374/rl-video-episode-512.mp4
global_step=11316 episodic_return=[15.481233]
global_step=11343 episodic_return=[25.667315]
global_step=11367 episodic_return=[20.730427]
global_step=11383 episodic_return=[15.380962]
global_step=11421 episodic_return=[26.607157]
global_step=11431 episodic_return=[7.203114]
global_step=11445 episodic_return=[12.956137]
global_step=11458 episodic_return=[8.291129]
global_step=11475 episodic_return=[17.488598]
global_step=11493 episodic_return=[14.890619]
global_step=11510 episodic_return=[6.215193]
global_step=11520 episodic_return=[7.564742]
global_step=11530 episodic_return=[7.5574517]
global_step=11554 episodic_return=[13.493882]
global_step=11566 episodic_return=[8.964566]
global_step=11587 episodic_return=[11.3729105]
global_step=11615 episodic_return=[15.404486]
global_step=11635 episodic_return=[15.950295]
global_step=11650 episodic_return=[11.795699]
global_step=11

                                                            

Moviepy - Done !
Moviepy - video ready /notebooks/rl-algos/videos/Hopper-v2__1__1700234374/rl-video-episode-729.mp4
global_step=16251 episodic_return=[7.0388546]
global_step=16272 episodic_return=[15.929157]
global_step=16292 episodic_return=[14.264844]
global_step=16373 episodic_return=[121.38163]
global_step=16390 episodic_return=[9.459834]
global_step=16426 episodic_return=[18.502449]
global_step=16436 episodic_return=[7.615998]
global_step=16448 episodic_return=[9.339965]
global_step=16465 episodic_return=[11.2556925]
global_step=16490 episodic_return=[13.644708]
global_step=16501 episodic_return=[6.818907]
global_step=16513 episodic_return=[9.595403]
global_step=16524 episodic_return=[6.805711]
global_step=16556 episodic_return=[11.380069]
global_step=16571 episodic_return=[9.513685]
global_step=16597 episodic_return=[29.97246]
global_step=16609 episodic_return=[8.559511]
global_step=16630 episodic_return=[15.143441]
global_step=16734 episodic_return=[122.04427]
global_step=16757 

                                                             

Moviepy - Done !
Moviepy - video ready /notebooks/rl-algos/videos/Hopper-v2__1__1700234374/rl-video-episode-1000.mp4
global_step=22373 episodic_return=[10.096142]
global_step=22384 episodic_return=[7.0258117]
global_step=22397 episodic_return=[3.7219362]
global_step=22420 episodic_return=[9.825436]
global_step=22454 episodic_return=[14.15803]
global_step=22465 episodic_return=[7.620246]
global_step=22489 episodic_return=[19.603174]
global_step=22515 episodic_return=[14.366137]
global_step=22525 episodic_return=[6.643969]
global_step=22537 episodic_return=[10.237014]
global_step=22553 episodic_return=[11.841006]
global_step=22572 episodic_return=[16.463182]
global_step=22618 episodic_return=[44.89943]
global_step=22640 episodic_return=[27.653717]
global_step=22654 episodic_return=[12.877375]
global_step=22669 episodic_return=[9.727212]
global_step=22709 episodic_return=[28.283169]
global_step=22725 episodic_return=[13.169201]
global_step=22762 episodic_return=[21.200922]
global_step=227

                                                              

Moviepy - Done !
Moviepy - video ready /notebooks/rl-algos/videos/Hopper-v2__1__1700234374/rl-video-episode-2000.mp4
global_step=85657 episodic_return=[337.43048]
global_step=85811 episodic_return=[322.03302]
global_step=85990 episodic_return=[351.85638]
global_step=86159 episodic_return=[337.5193]
global_step=86322 episodic_return=[334.69858]
global_step=86487 episodic_return=[332.96228]
global_step=86643 episodic_return=[323.6351]
global_step=86800 episodic_return=[322.49625]
global_step=86955 episodic_return=[319.60608]
global_step=87115 episodic_return=[325.00436]
global_step=87278 episodic_return=[330.36307]
global_step=87411 episodic_return=[296.64694]
global_step=87571 episodic_return=[326.2152]
global_step=87698 episodic_return=[288.51697]
global_step=87846 episodic_return=[306.14145]
global_step=87987 episodic_return=[302.80508]
global_step=88119 episodic_return=[300.62012]
global_step=88272 episodic_return=[325.4854]
global_step=88406 episodic_return=[305.50104]
global_step=8

                                                              

Moviepy - Done !
Moviepy - video ready /notebooks/rl-algos/videos/Hopper-v2__1__1700234374/rl-video-episode-3000.mp4
global_step=329282 episodic_return=[284.38712]
global_step=329435 episodic_return=[286.03583]
global_step=329592 episodic_return=[296.95917]
global_step=329741 episodic_return=[289.4596]
global_step=329897 episodic_return=[298.84033]
global_step=330072 episodic_return=[371.979]
global_step=330221 episodic_return=[294.2175]
global_step=330389 episodic_return=[307.76233]
global_step=330538 episodic_return=[281.09015]
global_step=330692 episodic_return=[280.32526]
global_step=330853 episodic_return=[311.59134]
global_step=331013 episodic_return=[320.65845]
global_step=331165 episodic_return=[294.75766]
global_step=331313 episodic_return=[285.3443]
global_step=331460 episodic_return=[289.61658]
global_step=331613 episodic_return=[302.57736]
global_step=331761 episodic_return=[305.0718]
global_step=331908 episodic_return=[298.54553]
global_step=332059 episodic_return=[300.086

VBox(children=(Label(value='1.946 MB of 1.946 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
charts/SPS,█▅▄▃▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
charts/episodic_length,▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▃▁▂▃▃▃▂▃▃▂▂▃▂▁▄▅█▃▄▅▆▅
charts/episodic_return,▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▂▃▄▁▁▃▄▃▂▄▃▂▃▄▃▁▄▆▇▃▄██▇
global_step,▁▁▁▁▁▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▆▇▇▇▇▇██
losses/actor_loss,██▇▆▅▄▃▃▃▂▂▂▁▁▁▂▂▂▃▃▄▄▄▃▃▃▄▄▄▄▃▃▃▃▄▄▄▄▄▄
losses/qf1_loss,▁▁▂▃▆▇▆▇█▄▆▄▃▂▃▄▂▁▂▂▁▁▁▂▂▁▁▁▁▁▁▁▁▄▁▁▁▁▁▁
losses/qf1_values,▁▁▂▃▄▅▆▆▆▇▇▇███▇▇▇▇▆▅▅▅▆▆▆▅▅▅▆▆▆▆▆▅▅▅▅▅▅

0,1
charts/SPS,111.0
charts/episodic_length,204.0
charts/episodic_return,499.40213
global_step,499964.0
losses/actor_loss,-348.05139
losses/qf1_loss,23.08812
losses/qf1_values,346.2962
