In [1]:
# !apt-get install -y \
#     libgl1-mesa-dev \
#     libgl1-mesa-glx \
#     libglew-dev \
#     libosmesa6-dev \
#     software-properties-common

# !apt-get install -y patchelf

# !apt-get update --fix-missing
# !pip install stable-baselines3
# !pip install mujoco
# !pip install  --upgrade gymnasium==0.29
# !pip install free-mujoco-py

In [2]:
import os
import time
import wandb
import random

import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter
from stable_baselines3.common.buffers import ReplayBuffer

In [3]:
def make_env(env_id, seed, idx, capture_video, run_name):
    """Build a zero-argument factory that creates one wrapped environment.

    Args:
        env_id: Environment id passed to ``gym.make``.
        seed: Seed applied to the action and observation spaces.
        idx: Position of this environment in the vector; video is only
            recorded for index 0.
        capture_video: Whether video recording is requested at all.
        run_name: Sub-directory of ``videos/`` that receives the recordings.

    Returns:
        A callable taking no arguments that returns the wrapped environment.
    """
    def thunk():
        # Only the first environment is ever rendered/recorded.
        should_record = capture_video and idx == 0
        if should_record:
            env = gym.wrappers.RecordVideo(
                gym.make(env_id, render_mode="rgb_array"),
                f"videos/{run_name}",
            )
        else:
            env = gym.make(env_id)

        env = gym.wrappers.RecordEpisodeStatistics(env)

        # Seed the spaces (action first, then observation).
        for space in (env.action_space, env.observation_space):
            space.seed(seed)
        return env

    return thunk

In [4]:
class QNetwork(nn.Module):
    """State-action value network: two 256-unit hidden layers, scalar output.

    The input width is the flattened observation size plus the flattened
    action size, both read from the ``single_*`` spaces of ``env``.
    """

    def __init__(self, env):
        super().__init__()
        obs_dim = np.array(env.single_observation_space.shape).prod()
        act_dim = np.prod(env.single_action_space.shape)
        self.fc1 = nn.Linear(obs_dim + act_dim, 256)
        self.fc2 = nn.Linear(256, 256)
        self.fc3 = nn.Linear(256, 1)

    def forward(self, x, a):
        """Return the value estimate for observations ``x`` and actions ``a``.

        ``x`` and ``a`` are concatenated along dimension 1 before the MLP.
        """
        joint = torch.cat([x, a], 1)
        hidden = F.relu(self.fc1(joint))
        hidden = F.relu(self.fc2(hidden))
        return self.fc3(hidden)

In [5]:
class Actor(nn.Module):
    """Deterministic policy network: two 256-unit hidden layers, tanh output.

    The tanh output in [-1, 1] is rescaled to the action bounds through the
    ``action_scale`` and ``action_bias`` buffers.

    The bounds are taken from ``env.single_action_space`` (not the batched
    ``env.action_space``) and stored with a leading axis of size 1, so the
    buffers have shape ``(1, *action_shape)`` and broadcast against a batch of
    any size. Building them from the batched space gave a leading axis of
    ``num_envs``, which only broadcasts when ``num_envs == 1`` or the batch
    size happens to equal ``num_envs``.
    """

    def __init__(self, env):
        super().__init__()
        self.fc1 = nn.Linear(np.array(env.single_observation_space.shape).prod(), 256)
        self.fc2 = nn.Linear(256, 256)
        self.fc_mu = nn.Linear(256, np.prod(env.single_action_space.shape))

        bounds = env.single_action_space
        # Half-range and midpoint of the action interval, per action dimension.
        self.register_buffer(
            "action_scale",
            torch.tensor((bounds.high - bounds.low) / 2.0, dtype=torch.float32).unsqueeze(0),
        )
        self.register_buffer(
            "action_bias",
            torch.tensor((bounds.high + bounds.low) / 2.0, dtype=torch.float32).unsqueeze(0),
        )

    def forward(self, x):
        """Map a batch of observations ``x`` to actions within the bounds."""
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = torch.tanh(self.fc_mu(x))

        return x * self.action_scale + self.action_bias

In [6]:
def train(
    env_id,
    seed,
    total_timesteps,
    learning_rate,
    buffer_size,
    gamma,
    tau,
    batch_size,
    exploration_noise,
    learning_starts,
    policy_frequency,
    noise_clip):
    """Train a DDPG agent on a single environment, logging to TensorBoard and W&B.

    Args:
        env_id: Environment id handed to ``make_env``.
        seed: Seed for ``random``, NumPy, PyTorch and the environment reset.
        total_timesteps: Number of environment steps to run.
        learning_rate: Adam learning rate for both the critic and the actor.
        buffer_size: Capacity of the replay buffer.
        gamma: Discount factor used in the TD target.
        tau: Interpolation coefficient for the soft target-network updates.
        batch_size: Number of transitions sampled per update.
        exploration_noise: Std of the Gaussian action noise, expressed as a
            fraction of ``actor.action_scale``.
        learning_starts: Steps with ``global_step < learning_starts`` use
            uniformly sampled actions; updates run once
            ``global_step > learning_starts``.
        policy_frequency: The actor and both target networks are updated on
            steps where ``global_step % policy_frequency == 0``.
        noise_clip: Not used by the training loop; it is only recorded in the
            W&B run config.

    Returns:
        None. Metrics go to ``runs/<run_name>`` and W&B; videos are written by
        the environment wrapper created in ``make_env``.
    """
    run_name = f"{env_id}__{seed}__{int(time.time())}"
    wandb.init(
        project="ddpg-mujoco-benchmark",
        config={
            "env":env_id,
            "seed":seed,
            "timesteps":total_timesteps,
            "lr":learning_rate,
            "buffer_size":buffer_size,
            "gamma":gamma,
            "tau": tau,
            "batch_size": batch_size,
            "exploration_noise":exploration_noise,
            "learning_starts":learning_starts,
            "policy_frequency":policy_frequency,
            "noise_clip":noise_clip,
        },
        sync_tensorboard=True,
        monitor_gym=True,
        name=run_name
    )

    # Created inside the try block so the finally clause can release whatever
    # was successfully opened, even if setup or training raises.
    writer = None
    envs = None
    try:
        writer = SummaryWriter(f"runs/{run_name}")

        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)

        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        envs = gym.vector.SyncVectorEnv(
            [make_env(env_id, seed, 0, True, run_name)]
        )

        actor = Actor(envs).to(device)
        qf1 = QNetwork(envs).to(device)
        qf1_target = QNetwork(envs).to(device)
        target_actor = Actor(envs).to(device)
        # Target networks start as exact copies of the online networks.
        target_actor.load_state_dict(actor.state_dict())
        qf1_target.load_state_dict(qf1.state_dict())

        q_optimizer = optim.Adam(qf1.parameters(), lr=learning_rate)
        actor_optimizer = optim.Adam(actor.parameters(), lr=learning_rate)

        # Store observations in the replay buffer as float32.
        envs.single_observation_space.dtype = np.float32
        rb = ReplayBuffer(
            buffer_size,
            envs.single_observation_space,
            envs.single_action_space,
            device,
            handle_timeout_termination=False,
        )

        start_time = time.time()
        obs, _ = envs.reset(seed=seed)

        # Most recent actor loss; stays None until the first policy update so
        # the logging branch never reads an unbound name.
        actor_loss = None

        for global_step in range(total_timesteps):
            if global_step < learning_starts:
                # Warm-up phase: uniformly random actions.
                actions = np.array([envs.single_action_space.sample() for _ in range(envs.num_envs)])
            else:
                with torch.no_grad():
                    actions = actor(torch.Tensor(obs).to(device))
                    actions += torch.normal(0, actor.action_scale * exploration_noise)
                    actions = actions.cpu().numpy().clip(envs.single_action_space.low, envs.single_action_space.high)

            next_obs, rewards, terminated, truncated, infos = envs.step(actions)
            if "final_info" in infos:
                for info in infos["final_info"]:
                    # Sub-environments that did not finish this step have no
                    # final info entry; skip them instead of dereferencing None.
                    if info is None or "episode" not in info:
                        continue
                    print(f"global_step={global_step} episodic_return={info['episode']['r']}")
                    writer.add_scalar("charts/episodic_return", info["episode"]["r"], global_step)
                    writer.add_scalar("charts/episodic_length", info["episode"]["l"], global_step)
                    # Log at most one finished episode per step.
                    break

            # On truncation the vector env has already reset, so the true last
            # observation has to be taken from infos for the stored transition.
            real_next_obs = next_obs.copy()
            for env_idx, was_truncated in enumerate(truncated):
                if was_truncated:
                    real_next_obs[env_idx] = infos["final_observation"][env_idx]
            rb.add(obs, real_next_obs, actions, rewards, terminated, infos)

            obs = next_obs

            if global_step > learning_starts:
                data = rb.sample(batch_size)
                with torch.no_grad():
                    next_state_actions = target_actor(data.next_observations)
                    qf1_next_target = qf1_target(data.next_observations, next_state_actions)
                    next_q_value = data.rewards.flatten() + (1 - data.dones.flatten()) * gamma * (qf1_next_target).view(-1)

                qf1_a_values = qf1(data.observations, data.actions).view(-1)
                qf1_loss = F.mse_loss(qf1_a_values, next_q_value)

                q_optimizer.zero_grad()
                qf1_loss.backward()
                q_optimizer.step()

                if global_step % policy_frequency == 0:
                    actor_loss = -qf1(data.observations, actor(data.observations)).mean()
                    actor_optimizer.zero_grad()
                    actor_loss.backward()
                    actor_optimizer.step()

                    # Soft (Polyak) update of both target networks.
                    for param, target_param in zip(actor.parameters(), target_actor.parameters()):
                        target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)
                    for param, target_param in zip(qf1.parameters(), qf1_target.parameters()):
                        target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)

                if global_step % 100 == 0:
                    writer.add_scalar("losses/qf1_loss", qf1_loss.item(), global_step)
                    # actor_loss is the value from the latest policy update,
                    # which may be from an earlier step than global_step.
                    if actor_loss is not None:
                        writer.add_scalar("losses/actor_loss", actor_loss.item(), global_step)
                    writer.add_scalar("losses/qf1_values", qf1_a_values.mean().item(), global_step)
                    writer.add_scalar("charts/SPS", int(global_step / (time.time() - start_time)), global_step)
    finally:
        # Always release the environments, the event file and the W&B run.
        if envs is not None:
            envs.close()
        if writer is not None:
            writer.close()
        wandb.finish()


In [9]:
# Short aliases for the environment ids that can be passed to train().
env = {
    "hopper": "Hopper-v2",
    "humanoid": "Humanoid-v2",
    "halfCheetah": "HalfCheetah-v2",
    "ant": "Ant-v2",
}

# Run setup
seed = 1
total_timesteps = 500_000

# Optimisation
learning_rate = 3e-5
batch_size = 1024
gamma = 0.99
tau = 0.005
policy_frequency = 2

# Replay buffer and exploration
buffer_size = 100_000
learning_starts = 25_000
exploration_noise = 0.1
noise_clip = 0.5

In [10]:
# NOTE(review): the recorded output below this cell shows paths under
# "videos/Ant-v2__1__...", i.e. it was produced by an Ant-v2 run, not by the
# "hopper" selection made here. Re-run the cell to refresh the output.
train(
    env_id=env["hopper"],
    seed=seed,
    total_timesteps=total_timesteps,
    learning_rate=learning_rate,
    buffer_size=buffer_size,
    gamma=gamma,
    tau=tau,
    batch_size=batch_size,
    exploration_noise=exploration_noise,
    learning_starts=learning_starts,
    policy_frequency=policy_frequency,
    noise_clip=noise_clip,
)

  logger.deprecation(
  logger.deprecation(
  logger.warn(
  logger.warn(f"{pre} is not within the observation space.")
  logger.warn(
  logger.warn(f"{pre} is not within the observation space.")


Moviepy - Building video /notebooks/rl-algos/videos/Ant-v2__1__1700214840/rl-video-episode-0.mp4.
Moviepy - Writing video /notebooks/rl-algos/videos/Ant-v2__1__1700214840/rl-video-episode-0.mp4



                                                               

Moviepy - Done !
Moviepy - video ready /notebooks/rl-algos/videos/Ant-v2__1__1700214840/rl-video-episode-0.mp4
global_step=160 episodic_return=[42.22905]
Moviepy - Building video /notebooks/rl-algos/videos/Ant-v2__1__1700214840/rl-video-episode-1.mp4.
Moviepy - Writing video /notebooks/rl-algos/videos/Ant-v2__1__1700214840/rl-video-episode-1.mp4



                                                             

Moviepy - Done !
Moviepy - video ready /notebooks/rl-algos/videos/Ant-v2__1__1700214840/rl-video-episode-1.mp4
global_step=183 episodic_return=[-17.043615]
global_step=257 episodic_return=[-17.401993]
global_step=380 episodic_return=[-71.388084]
global_step=406 episodic_return=[-26.250313]
global_step=476 episodic_return=[-39.342514]
global_step=549 episodic_return=[-28.581144]
global_step=578 episodic_return=[-8.204443]
Moviepy - Building video /notebooks/rl-algos/videos/Ant-v2__1__1700214840/rl-video-episode-8.mp4.
Moviepy - Writing video /notebooks/rl-algos/videos/Ant-v2__1__1700214840/rl-video-episode-8.mp4



                                                               

Moviepy - Done !
Moviepy - video ready /notebooks/rl-algos/videos/Ant-v2__1__1700214840/rl-video-episode-8.mp4
global_step=698 episodic_return=[-53.115475]
global_step=717 episodic_return=[-14.89556]
global_step=791 episodic_return=[-36.954395]
global_step=969 episodic_return=[-106.31509]
global_step=1004 episodic_return=[19.458998]
global_step=1050 episodic_return=[-50.96988]
global_step=1123 episodic_return=[13.090459]
global_step=1178 episodic_return=[-21.061117]
global_step=1304 episodic_return=[-118.259186]
global_step=1358 episodic_return=[-36.142654]
global_step=2358 episodic_return=[-362.41718]
global_step=2391 episodic_return=[-9.069893]
global_step=2467 episodic_return=[-11.695028]
global_step=3467 episodic_return=[-243.68365]
global_step=3498 episodic_return=[-22.39118]
global_step=3517 episodic_return=[-4.254495]
global_step=3678 episodic_return=[-89.99716]
global_step=3703 episodic_return=[-4.8209853]
global_step=3756 episodic_return=[-56.927444]
Moviepy - Building video /

                                                             

Moviepy - Done !
Moviepy - video ready /notebooks/rl-algos/videos/Ant-v2__1__1700214840/rl-video-episode-27.mp4
global_step=3785 episodic_return=[-13.862714]
global_step=4038 episodic_return=[-58.926765]
global_step=5038 episodic_return=[-332.10446]
global_step=5079 episodic_return=[-9.97468]
global_step=5180 episodic_return=[-12.848262]
global_step=5361 episodic_return=[-155.4756]
global_step=5447 episodic_return=[-53.692616]
global_step=5468 episodic_return=[-18.544811]
global_step=5712 episodic_return=[-111.0167]
global_step=5880 episodic_return=[-77.27002]
global_step=5912 episodic_return=[-8.942126]
global_step=5940 episodic_return=[0.28478962]
global_step=5970 episodic_return=[-9.997675]
global_step=6105 episodic_return=[-51.17861]
global_step=6277 episodic_return=[-45.643555]
global_step=6291 episodic_return=[7.8121595]
global_step=6312 episodic_return=[6.445875]
global_step=7312 episodic_return=[-261.16873]
global_step=7419 episodic_return=[-53.867752]
global_step=7465 episodic

                                                               

Moviepy - Done !
Moviepy - video ready /notebooks/rl-algos/videos/Ant-v2__1__1700214840/rl-video-episode-64.mp4
global_step=10581 episodic_return=[-96.02093]
global_step=10604 episodic_return=[11.243341]
global_step=10629 episodic_return=[4.436721]
global_step=10678 episodic_return=[-2.8331797]
global_step=10719 episodic_return=[19.080238]
global_step=10766 episodic_return=[-28.182762]
global_step=10778 episodic_return=[1.1352761]
global_step=10801 episodic_return=[-33.16514]
global_step=11801 episodic_return=[-346.8129]
global_step=11818 episodic_return=[1.3003843]
global_step=12818 episodic_return=[-324.15768]
global_step=12993 episodic_return=[-73.92575]
global_step=13073 episodic_return=[-11.169964]
global_step=13159 episodic_return=[-56.500404]
global_step=13225 episodic_return=[-35.084496]
global_step=13360 episodic_return=[13.749632]
global_step=13392 episodic_return=[-27.675552]
global_step=13525 episodic_return=[-19.427258]
global_step=13599 episodic_return=[-22.068495]
global

                                                               

Moviepy - Done !
Moviepy - video ready /notebooks/rl-algos/videos/Ant-v2__1__1700214840/rl-video-episode-125.mp4
global_step=21860 episodic_return=[-73.75164]
global_step=21926 episodic_return=[-1.7979362]
global_step=21956 episodic_return=[-21.901964]
global_step=22037 episodic_return=[-66.11238]
global_step=22085 episodic_return=[-17.455462]
global_step=22109 episodic_return=[-20.132885]
global_step=22230 episodic_return=[19.55769]
global_step=22285 episodic_return=[-15.049617]
global_step=22358 episodic_return=[-18.241104]
global_step=22493 episodic_return=[-44.425224]
global_step=22793 episodic_return=[-219.90056]
global_step=22929 episodic_return=[-111.831856]
global_step=23102 episodic_return=[-81.14573]
global_step=23484 episodic_return=[-154.86003]
global_step=24015 episodic_return=[-178.98242]
global_step=24076 episodic_return=[23.21246]
global_step=24150 episodic_return=[-42.5105]
global_step=24210 episodic_return=[1.8589053]
global_step=24228 episodic_return=[-13.74565]
glob

                                                                

Moviepy - Done !
Moviepy - video ready /notebooks/rl-algos/videos/Ant-v2__1__1700214840/rl-video-episode-216.mp4
global_step=78425 episodic_return=[206.55595]
global_step=79425 episodic_return=[294.5092]
global_step=80425 episodic_return=[295.05118]
global_step=80473 episodic_return=[28.744265]
global_step=80552 episodic_return=[-17.93425]
global_step=81552 episodic_return=[306.06055]
global_step=82217 episodic_return=[142.98799]
global_step=83217 episodic_return=[252.85503]
global_step=84183 episodic_return=[-8.504642]
global_step=85183 episodic_return=[207.7507]
global_step=86008 episodic_return=[229.51973]
global_step=86105 episodic_return=[31.112091]
global_step=87105 episodic_return=[347.949]
global_step=88105 episodic_return=[143.34071]
global_step=89105 episodic_return=[304.149]
global_step=90105 episodic_return=[283.1623]
global_step=91105 episodic_return=[341.78552]
global_step=92105 episodic_return=[106.03972]
global_step=92536 episodic_return=[153.59904]
global_step=93536 ep

                                                                

Moviepy - Done !
Moviepy - video ready /notebooks/rl-algos/videos/Ant-v2__1__1700214840/rl-video-episode-343.mp4
global_step=190687 episodic_return=[713.0635]
global_step=191687 episodic_return=[770.29004]
global_step=192687 episodic_return=[612.95074]
global_step=193392 episodic_return=[735.97705]
global_step=194392 episodic_return=[788.5051]
global_step=195392 episodic_return=[717.4016]
global_step=195459 episodic_return=[80.034164]
global_step=196459 episodic_return=[691.00555]
global_step=197459 episodic_return=[853.1036]
global_step=198459 episodic_return=[891.38586]
global_step=198791 episodic_return=[346.4313]
global_step=198861 episodic_return=[73.29126]
global_step=199861 episodic_return=[712.89185]
global_step=200861 episodic_return=[710.7774]
global_step=201861 episodic_return=[864.56177]
global_step=202227 episodic_return=[432.75598]
global_step=203227 episodic_return=[640.1133]
global_step=204227 episodic_return=[1261.6677]
global_step=205227 episodic_return=[712.13055]
gl

                                                                

Moviepy - Done !
Moviepy - video ready /notebooks/rl-algos/videos/Ant-v2__1__1700214840/rl-video-episode-512.mp4
global_step=335604 episodic_return=[2134.5283]
global_step=336604 episodic_return=[762.4864]
global_step=337604 episodic_return=[815.12274]
global_step=338101 episodic_return=[1081.8219]
global_step=338113 episodic_return=[14.80005]
global_step=339091 episodic_return=[2260.2322]
global_step=339510 episodic_return=[968.3751]
global_step=340510 episodic_return=[725.63293]
global_step=341510 episodic_return=[2236.2727]
global_step=342510 episodic_return=[1294.0114]
global_step=343090 episodic_return=[1142.8048]
global_step=343713 episodic_return=[1494.1022]
global_step=344713 episodic_return=[2130.2598]
global_step=345713 episodic_return=[2048.7927]
global_step=346713 episodic_return=[2084.0037]
global_step=346772 episodic_return=[106.34516]
global_step=347772 episodic_return=[2272.9346]
global_step=348772 episodic_return=[1191.8914]
global_step=349772 episodic_return=[2153.606

                                                               

Moviepy - Done !
Moviepy - video ready /notebooks/rl-algos/videos/Ant-v2__1__1700214840/rl-video-episode-729.mp4
global_step=496031 episodic_return=[307.95984]
global_step=496077 episodic_return=[75.84446]
global_step=496665 episodic_return=[1584.9948]
global_step=496974 episodic_return=[767.0973]
global_step=497741 episodic_return=[1624.9625]
global_step=498741 episodic_return=[1108.1196]
global_step=498778 episodic_return=[66.428825]
global_step=498821 episodic_return=[123.496284]
global_step=499009 episodic_return=[513.6162]


VBox(children=(Label(value='10.941 MB of 10.941 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
charts/SPS,█▆▄▃▃▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
charts/episodic_length,▁▁▁▁▂▂▁▃▁▂▂███▄██████████▁██▁█████▄████▁
charts/episodic_return,▂▂▂▂▂▁▁▁▁▂▂▂▂▃▂▃▄▃▄▃▄▄▆▇▄▂▇▇▂▃██▄▆▅▄▅▄▅▂
global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
losses/actor_loss,██▇▆▆▆▆▆▆▆▆▆▆▅▅▅▅▅▅▅▄▄▄▃▃▃▃▃▂▂▂▂▂▂▁▁▁▁▁▁
losses/qf1_loss,▂▂▂▃▄▃▃▂▁▂▁▁▁▁▁▁▁▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▃▃▃█▃▆▄▅
losses/qf1_values,▁▁▂▃▃▃▃▃▃▃▃▃▃▄▄▄▄▄▄▄▅▅▅▆▆▆▆▆▇▇▇▇▇▇██████

0,1
charts/SPS,115.0
charts/episodic_length,188.0
charts/episodic_return,513.61621
global_step,499900.0
losses/actor_loss,-260.11719
losses/qf1_loss,9.44849
losses/qf1_values,259.00995
