In [10]:
import os
import random
import time
import wandb
from distutils.util import strtobool
from typing import Callable
import flax
import flax.linen as nn
import gymnasium as gym
import jax
import jax.numpy as jnp
import numpy as np
import optax

from flax.training.train_state import TrainState
from stable_baselines3.common.buffers import ReplayBuffer
from torch.utils.tensorboard import SummaryWriter

In [11]:
def make_env(env_id, seed, idx, capture_video, run_name):
    def thunk():
        if capture_video and idx == 0:
            env = gym.make(env_id, render_mode="rgb_array")
            env = gym.wrappers.RecordVideo(env, f"video/{run_name}")
        else:
            env = gym.make(env_id)
        env = gym.wrappers.RecordEpisodeStatistics(env)
        env.action_space.seed(seed)

        return env

    return thunk

In [12]:
class QNetwork(nn.Module):
    action_dim: int

    @nn.compact
    def __call__(self, x: jnp.ndarray):
        x = nn.Dense(120)(x)
        x = nn.relu(x)
        x = nn.Dense(84)(x)
        x = nn.relu(x)
        x = nn.Dense(self.action_dim)(x)
        # x = nn.relu(x)

        return x


In [13]:
class TrainState(TrainState):
    target_params: flax.core.FrozenDict

In [14]:
def linear_schedule(start_e: float, end_e: float, duration: int, t: int):
    slope = (end_e - start_e)/ duration
    return max(slope * t + start_e, end_e)

In [15]:
def evaluate(model_path: str,
             make_env: Callable,
             env_id: str,
             eval_episodes: int,
             run_name: str,
             Model: nn.Module,
             epsilon: float = 0.05,
             capture_video: bool = True,
             seed=1):
    envs = gym.vector.SyncVectorEnv([make_env(env_id, 0, 0, capture_video, run_name)])
    obs, _ = envs.reset()
    model = Model(action_dim=envs.single_action_space.n)
    q_key = jax.random.PRNGKey(seed)
    params = model.init(q_key, obs)
    with open(model_path, "rb") as f:
        params = flax.serialization.from_bytes(params, f.read())
    model.apply = jax.jit(model.apply)

    episodic_returns = []

    while len(episodic_returns) < eval_episodes:
        if random.random() < epsilon:
            actions = np.array([envs.single_action_space.sample() for _ in range(envs.num_envs)])
        else:
            q_values = model.apply(params,obs)
            actions = q_values.argmax(axis=-1)
            actions = jax.device_get(actions)
        next_obs, _, _, _, infos = envs.step(actions)
        if "final_info" in infos:
            for info in infos["final_info"]:
                if "episode" not in info:
                    continue
                print(f"eval_episode={len(episodic_returns)}, episodic_return={info['episode']['r']}")
                episodic_returns += [info["episode"]["r"]]

        obs = next_obs

    return episodic_returns

In [16]:
envs = {"cartPole":"CartPole-v1","acrobot":"Acrobot-v1","mountainCar":"MountainCar-v0"}
n_timesteps = 500000
learning_rate = 0.00025
buffer_size = 10000
gamma = 0.99
target_network_frequency = 500
max_grad_norm = 0.5
batch_size = 1024
start_e = 1
end_e = 0.05
exploration_fraction = 0.5
learning_starts = 10000
train_frequency = 10
seed = 1
capture_video = True
tau = 1
num_envs = 1

In [17]:
def train(env_id,timesteps,learning_rate,buffer_size,gamma,
          target_network_frequency,max_grad_norm,batch_size,
          start_e,end_e,exploration_fraction,learning_starts,
          train_frequency,seed,tau,num_envs):

    run_name = f"{env_id}__{seed}__{int(time.time())}"
    wandb.init(
        project="rl-algos-benchmark",
        config={
            "env":env_id,
            "timesteps":timesteps,
            "lr":learning_rate,
            "buffer_size":buffer_size,
            "gamma":gamma,
            "target_network_frequency":target_network_frequency,
            "batch_size":batch_size,
            "start_e":start_e,
            "end_e":end_e,
            "exploration_fraction":exploration_fraction,
            "learning_starts":learning_starts,
            "train_frequency":train_frequency,
            "seed":seed,
            "tau":tau,
            "num_envs":num_envs,
        },
        sync_tensorboard=True,
        monitor_gym=True,
        name=run_name,
    )
    
    writer = SummaryWriter(f"runs/{run_name}")

    random.seed(seed)
    np.random.seed(seed)
    key = jax.random.PRNGKey(seed)
    key, q_key = jax.random.split(key, 2)

    envs = gym.vector.SyncVectorEnv(
        [make_env(env_id, seed + i, i, True, run_name) for i in range(num_envs)]
    )

    obs, _ = envs.reset(seed=seed)

    q_network = QNetwork(action_dim=envs.single_action_space.n)

    q_state = TrainState.create(
        apply_fn=q_network.apply,
        params=q_network.init(q_key, obs),
        target_params=q_network.init(q_key,obs),
        tx=optax.adam(learning_rate=learning_rate)
    )

    q_network.apply = jax.jit(q_network.apply)
    q_state = q_state.replace(target_params=optax.incremental_update(q_state.params, q_state.target_params, 1))

    rb = ReplayBuffer(
        buffer_size,
        envs.single_observation_space,
        envs.single_action_space,
        "cpu",
        n_envs=num_envs,
        handle_timeout_termination=False,
    )

    @jax.jit
    def update(q_state, observations, actions, next_observations, rewards, dones):
        q_next_target = q_network.apply(q_state.target_params, next_observations)
        q_next_target = jnp.max(q_next_target, axis=-1)
        next_q_value = rewards + (1 - dones) * gamma * q_next_target

        def mse_loss(params):
            q_pred = q_network.apply(params, observations)
            q_pred = q_pred[jnp.arange(q_pred.shape[0]), actions.squeeze()]
            return ((q_pred - next_q_value)**2).mean(), q_pred

        (loss_value,q_pred), grads = jax.value_and_grad(mse_loss,has_aux=True)(q_state.params)
        q_state = q_state.apply_gradients(grads=grads)
        return loss_value, q_pred, q_state

    start_time = time.time()

    obs, _ = envs.reset(seed=seed)
    for global_step in range(timesteps):
        epsilon = linear_schedule(start_e, end_e, exploration_fraction * timesteps, global_step)
        if random.random() < epsilon:
            actions = np.array([envs.single_action_space.sample() for _ in range(envs.num_envs)])
        else:
            q_values = q_network.apply(q_state.params, obs)
            actions = q_values.argmax(axis=-1)
            actions = jax.device_get(actions)

        next_obs, rewards, terminated, truncated, infos = envs.step(actions)

        if "final_info" in infos:
            for info in infos["final_info"]:
                if "episode" not in info:
                    continue

                print(f"global_step={global_step}, episodic_return={info['episode']['r']}")
                writer.add_scalar("charts/episodic_return",info["episode"]["r"], global_step)
                writer.add_scalar("charts/episodic_length",info["episode"]["l"], global_step)
                writer.add_scalar("charts/epsilon",epsilon,global_step)


        real_next_obs = next_obs.copy()
        for idx,d in enumerate(truncated):
            if d and "final_observations" in infos:
                real_next_obs[idx] = infos["final_observations"][idx]
        rb.add(obs, real_next_obs, actions, rewards, terminated, infos)

        obs = next_obs

        if global_step > learning_starts:
            if global_step % train_frequency == 0:
                data = rb.sample(batch_size)

                loss, old_val, q_state = update(
                    q_state,
                    data.observations.numpy(),
                    data.actions.numpy(),
                    data.next_observations.numpy(),
                    data.rewards.flatten().numpy(),
                    data.dones.flatten().numpy(),
                )

                if global_step % 100 == 0:
                    writer.add_scalar("losses/td_loss",jax.device_get(loss),global_step)
                    writer.add_scalar("losses/q_values",jax.device_get(old_val).mean(), global_step)
                    writer.add_scalar("charts/SPS",int(global_step / (time.time() - start_time)),global_step)

            if global_step % target_network_frequency == 0:
                q_state = q_state.replace(
                    target_params=optax.incremental_update(q_state.params, q_state.target_params, tau)
                )

    model_path = f"runs/{run_name}.cleanrl_model"
    with open(model_path,"wb") as f:
        f.write(flax.serialization.to_bytes(q_state.params))

    print("Model Saved")

    episodic_returns = evaluate(
        model_path,
        make_env,
        env_id,
        run_name=f"{run_name}-eval",
        Model=QNetwork,
        epsilon=0.05,
        eval_episodes=100
    )

    for idx, episodic_return in enumerate(episodic_returns):
        writer.add_scalar("eval/episodic_return", episodic_return,idx)

    envs.close()
    writer.close()
    wandb.finish()

In [18]:
train(envs["acrobot"],n_timesteps,learning_rate,buffer_size,gamma,target_network_frequency,max_grad_norm,batch_size,start_e,end_e,exploration_fraction,learning_starts,train_frequency,seed,tau,num_envs)



Moviepy - Building video /notebooks/rl-algos/video/Acrobot-v1__1__1699875373/rl-video-episode-0.mp4.
Moviepy - Writing video /notebooks/rl-algos/video/Acrobot-v1__1__1699875373/rl-video-episode-0.mp4



                                                               

Moviepy - Done !
Moviepy - video ready /notebooks/rl-algos/video/Acrobot-v1__1__1699875373/rl-video-episode-0.mp4
global_step=499, episodic_return=[-500.]
Moviepy - Building video /notebooks/rl-algos/video/Acrobot-v1__1__1699875373/rl-video-episode-1.mp4.
Moviepy - Writing video /notebooks/rl-algos/video/Acrobot-v1__1__1699875373/rl-video-episode-1.mp4



                                                               

Moviepy - Done !
Moviepy - video ready /notebooks/rl-algos/video/Acrobot-v1__1__1699875373/rl-video-episode-1.mp4
global_step=999, episodic_return=[-500.]
global_step=1499, episodic_return=[-500.]
global_step=1999, episodic_return=[-500.]
global_step=2499, episodic_return=[-500.]
global_step=2999, episodic_return=[-500.]
global_step=3499, episodic_return=[-500.]
global_step=3999, episodic_return=[-500.]
Moviepy - Building video /notebooks/rl-algos/video/Acrobot-v1__1__1699875373/rl-video-episode-8.mp4.
Moviepy - Writing video /notebooks/rl-algos/video/Acrobot-v1__1__1699875373/rl-video-episode-8.mp4



                                                               

Moviepy - Done !
Moviepy - video ready /notebooks/rl-algos/video/Acrobot-v1__1__1699875373/rl-video-episode-8.mp4
global_step=4499, episodic_return=[-500.]
global_step=4999, episodic_return=[-500.]
global_step=5499, episodic_return=[-500.]
global_step=5999, episodic_return=[-500.]
global_step=6499, episodic_return=[-500.]
global_step=6999, episodic_return=[-500.]
global_step=7499, episodic_return=[-500.]
global_step=7999, episodic_return=[-500.]
global_step=8499, episodic_return=[-500.]
global_step=8999, episodic_return=[-500.]
global_step=9499, episodic_return=[-500.]
global_step=9999, episodic_return=[-500.]
global_step=10499, episodic_return=[-500.]
global_step=10999, episodic_return=[-500.]
global_step=11499, episodic_return=[-500.]
global_step=11999, episodic_return=[-500.]
global_step=12499, episodic_return=[-500.]
global_step=12999, episodic_return=[-500.]
global_step=13499, episodic_return=[-500.]
Moviepy - Building video /notebooks/rl-algos/video/Acrobot-v1__1__1699875373/rl-v

                                                               

Moviepy - Done !
Moviepy - video ready /notebooks/rl-algos/video/Acrobot-v1__1__1699875373/rl-video-episode-27.mp4
global_step=13999, episodic_return=[-500.]
global_step=14499, episodic_return=[-500.]
global_step=14999, episodic_return=[-500.]
global_step=15499, episodic_return=[-500.]
global_step=15999, episodic_return=[-500.]
global_step=16499, episodic_return=[-500.]
global_step=16999, episodic_return=[-500.]
global_step=17499, episodic_return=[-500.]
global_step=17999, episodic_return=[-500.]
global_step=18499, episodic_return=[-500.]
global_step=18999, episodic_return=[-500.]
global_step=19499, episodic_return=[-500.]
global_step=19999, episodic_return=[-500.]
global_step=20499, episodic_return=[-500.]
global_step=20999, episodic_return=[-500.]
global_step=21499, episodic_return=[-500.]
global_step=21999, episodic_return=[-500.]
global_step=22271, episodic_return=[-271.]
global_step=22771, episodic_return=[-500.]
global_step=23271, episodic_return=[-500.]
global_step=23771, episod

                                                               

Moviepy - Done !
Moviepy - video ready /notebooks/rl-algos/video/Acrobot-v1__1__1699875373/rl-video-episode-64.mp4
global_step=31984, episodic_return=[-266.]
global_step=32465, episodic_return=[-480.]
global_step=32965, episodic_return=[-500.]
global_step=33329, episodic_return=[-363.]
global_step=33829, episodic_return=[-500.]
global_step=34329, episodic_return=[-500.]
global_step=34629, episodic_return=[-299.]
global_step=35106, episodic_return=[-476.]
global_step=35606, episodic_return=[-500.]
global_step=35865, episodic_return=[-258.]
global_step=36365, episodic_return=[-500.]
global_step=36764, episodic_return=[-398.]
global_step=37264, episodic_return=[-500.]
global_step=37764, episodic_return=[-500.]
global_step=38103, episodic_return=[-338.]
global_step=38545, episodic_return=[-441.]
global_step=38837, episodic_return=[-291.]
global_step=39337, episodic_return=[-500.]
global_step=39837, episodic_return=[-500.]
global_step=40337, episodic_return=[-500.]
global_step=40751, episod

                                                               

Moviepy - Done !
Moviepy - video ready /notebooks/rl-algos/video/Acrobot-v1__1__1699875373/rl-video-episode-125.mp4
global_step=55469, episodic_return=[-257.]
global_step=55781, episodic_return=[-311.]
global_step=56056, episodic_return=[-274.]
global_step=56422, episodic_return=[-365.]
global_step=56757, episodic_return=[-334.]
global_step=56964, episodic_return=[-206.]
global_step=57305, episodic_return=[-340.]
global_step=57558, episodic_return=[-252.]
global_step=57872, episodic_return=[-313.]
global_step=58078, episodic_return=[-205.]
global_step=58516, episodic_return=[-437.]
global_step=58867, episodic_return=[-350.]
global_step=59168, episodic_return=[-300.]
global_step=59492, episodic_return=[-323.]
global_step=59722, episodic_return=[-229.]
global_step=59992, episodic_return=[-269.]
global_step=60341, episodic_return=[-348.]
global_step=60565, episodic_return=[-223.]
global_step=60758, episodic_return=[-192.]
global_step=61013, episodic_return=[-254.]
global_step=61240, episo

                                                               

Moviepy - Done !
Moviepy - video ready /notebooks/rl-algos/video/Acrobot-v1__1__1699875373/rl-video-episode-216.mp4
global_step=78814, episodic_return=[-161.]
global_step=79132, episodic_return=[-317.]
global_step=79354, episodic_return=[-221.]
global_step=79556, episodic_return=[-201.]
global_step=79723, episodic_return=[-166.]
global_step=79942, episodic_return=[-218.]
global_step=80131, episodic_return=[-188.]
global_step=80358, episodic_return=[-226.]
global_step=80602, episodic_return=[-243.]
global_step=80747, episodic_return=[-144.]
global_step=80975, episodic_return=[-227.]
global_step=81212, episodic_return=[-236.]
global_step=81448, episodic_return=[-235.]
global_step=81656, episodic_return=[-207.]
global_step=81875, episodic_return=[-218.]
global_step=82097, episodic_return=[-221.]
global_step=82314, episodic_return=[-216.]
global_step=82521, episodic_return=[-206.]
global_step=82679, episodic_return=[-157.]
global_step=82916, episodic_return=[-236.]
global_step=83154, episo

                                                               

Moviepy - Done !
Moviepy - video ready /notebooks/rl-algos/video/Acrobot-v1__1__1699875373/rl-video-episode-343.mp4
global_step=104774, episodic_return=[-141.]
global_step=104953, episodic_return=[-178.]
global_step=105115, episodic_return=[-161.]
global_step=105253, episodic_return=[-137.]
global_step=105407, episodic_return=[-153.]
global_step=105565, episodic_return=[-157.]
global_step=105762, episodic_return=[-196.]
global_step=105928, episodic_return=[-165.]
global_step=106134, episodic_return=[-205.]
global_step=106446, episodic_return=[-311.]
global_step=106605, episodic_return=[-158.]
global_step=106803, episodic_return=[-197.]
global_step=106948, episodic_return=[-144.]
global_step=107135, episodic_return=[-186.]
global_step=107322, episodic_return=[-186.]
global_step=107505, episodic_return=[-182.]
global_step=107704, episodic_return=[-198.]
global_step=107848, episodic_return=[-143.]
global_step=108012, episodic_return=[-163.]
global_step=108189, episodic_return=[-176.]
glob

                                                               

Moviepy - Done !
Moviepy - video ready /notebooks/rl-algos/video/Acrobot-v1__1__1699875373/rl-video-episode-512.mp4
global_step=132680, episodic_return=[-146.]
global_step=132778, episodic_return=[-97.]
global_step=132892, episodic_return=[-113.]
global_step=133105, episodic_return=[-212.]
global_step=133226, episodic_return=[-120.]
global_step=133354, episodic_return=[-127.]
global_step=133493, episodic_return=[-138.]
global_step=133659, episodic_return=[-165.]
global_step=133776, episodic_return=[-116.]
global_step=133948, episodic_return=[-171.]
global_step=134101, episodic_return=[-152.]
global_step=134231, episodic_return=[-129.]
global_step=134377, episodic_return=[-145.]
global_step=134503, episodic_return=[-125.]
global_step=134672, episodic_return=[-168.]
global_step=134828, episodic_return=[-155.]
global_step=134987, episodic_return=[-158.]
global_step=135163, episodic_return=[-175.]
global_step=135297, episodic_return=[-133.]
global_step=135510, episodic_return=[-212.]
globa

                                                               

Moviepy - Done !
Moviepy - video ready /notebooks/rl-algos/video/Acrobot-v1__1__1699875373/rl-video-episode-729.mp4
global_step=163080, episodic_return=[-118.]
global_step=163179, episodic_return=[-98.]
global_step=163284, episodic_return=[-104.]
global_step=163396, episodic_return=[-111.]
global_step=163499, episodic_return=[-102.]
global_step=163600, episodic_return=[-100.]
global_step=163746, episodic_return=[-145.]
global_step=163876, episodic_return=[-129.]
global_step=163984, episodic_return=[-107.]
global_step=164102, episodic_return=[-117.]
global_step=164204, episodic_return=[-101.]
global_step=164492, episodic_return=[-287.]
global_step=164588, episodic_return=[-95.]
global_step=164697, episodic_return=[-108.]
global_step=164827, episodic_return=[-129.]
global_step=164945, episodic_return=[-117.]
global_step=165053, episodic_return=[-107.]
global_step=165147, episodic_return=[-93.]
global_step=165259, episodic_return=[-111.]
global_step=165379, episodic_return=[-119.]
global_

                                                             

Moviepy - Done !
Moviepy - video ready /notebooks/rl-algos/video/Acrobot-v1__1__1699875373/rl-video-episode-1000.mp4
global_step=194278, episodic_return=[-89.]
global_step=194376, episodic_return=[-97.]
global_step=194479, episodic_return=[-102.]
global_step=194671, episodic_return=[-191.]
global_step=194770, episodic_return=[-98.]
global_step=194918, episodic_return=[-147.]
global_step=195051, episodic_return=[-132.]
global_step=195149, episodic_return=[-97.]
global_step=195225, episodic_return=[-75.]
global_step=195326, episodic_return=[-100.]
global_step=195440, episodic_return=[-113.]
global_step=195556, episodic_return=[-115.]
global_step=195671, episodic_return=[-114.]
global_step=195776, episodic_return=[-104.]
global_step=195893, episodic_return=[-116.]
global_step=195981, episodic_return=[-87.]
global_step=196082, episodic_return=[-100.]
global_step=196210, episodic_return=[-127.]
global_step=196300, episodic_return=[-89.]
global_step=196435, episodic_return=[-134.]
global_ste

                                                             

Moviepy - Done !
Moviepy - video ready /notebooks/rl-algos/video/Acrobot-v1__1__1699875373/rl-video-episode-2000.mp4
global_step=287538, episodic_return=[-77.]
global_step=287634, episodic_return=[-95.]
global_step=287722, episodic_return=[-87.]
global_step=287798, episodic_return=[-75.]
global_step=287875, episodic_return=[-76.]
global_step=287968, episodic_return=[-92.]
global_step=288043, episodic_return=[-74.]
global_step=288117, episodic_return=[-73.]
global_step=288208, episodic_return=[-90.]
global_step=288313, episodic_return=[-104.]
global_step=288416, episodic_return=[-102.]
global_step=288487, episodic_return=[-70.]
global_step=288580, episodic_return=[-92.]
global_step=288671, episodic_return=[-90.]
global_step=288757, episodic_return=[-85.]
global_step=288854, episodic_return=[-96.]
global_step=288930, episodic_return=[-75.]
global_step=289019, episodic_return=[-88.]
global_step=289082, episodic_return=[-62.]
global_step=289198, episodic_return=[-115.]
global_step=289260, 

                                                             

Moviepy - Done !
Moviepy - video ready /notebooks/rl-algos/video/Acrobot-v1__1__1699875373/rl-video-episode-3000.mp4
global_step=376591, episodic_return=[-81.]
global_step=376662, episodic_return=[-70.]
global_step=376725, episodic_return=[-62.]
global_step=376801, episodic_return=[-75.]
global_step=376877, episodic_return=[-75.]
global_step=376973, episodic_return=[-95.]
global_step=377067, episodic_return=[-93.]
global_step=377155, episodic_return=[-87.]
global_step=377237, episodic_return=[-81.]
global_step=377330, episodic_return=[-92.]
global_step=377407, episodic_return=[-76.]
global_step=377513, episodic_return=[-105.]
global_step=377601, episodic_return=[-87.]
global_step=377698, episodic_return=[-96.]
global_step=377778, episodic_return=[-79.]
global_step=377841, episodic_return=[-62.]
global_step=377947, episodic_return=[-105.]
global_step=378029, episodic_return=[-81.]
global_step=378103, episodic_return=[-73.]
global_step=378191, episodic_return=[-87.]
global_step=378267, e

                                                             

Moviepy - Done !
Moviepy - video ready /notebooks/rl-algos/video/Acrobot-v1__1__1699875373/rl-video-episode-4000.mp4
global_step=464886, episodic_return=[-94.]
global_step=464950, episodic_return=[-63.]
global_step=465022, episodic_return=[-71.]
global_step=465084, episodic_return=[-61.]
global_step=465154, episodic_return=[-69.]
global_step=465230, episodic_return=[-75.]
global_step=465307, episodic_return=[-76.]
global_step=465409, episodic_return=[-101.]
global_step=465494, episodic_return=[-84.]
global_step=465577, episodic_return=[-82.]
global_step=465684, episodic_return=[-106.]
global_step=465764, episodic_return=[-79.]
global_step=465828, episodic_return=[-63.]
global_step=465930, episodic_return=[-101.]
global_step=466028, episodic_return=[-97.]
global_step=466098, episodic_return=[-69.]
global_step=466258, episodic_return=[-159.]
global_step=466341, episodic_return=[-82.]
global_step=466417, episodic_return=[-75.]
global_step=466540, episodic_return=[-122.]
global_step=466625

                                                             

Moviepy - Done !
Moviepy - video ready /notebooks/rl-algos/video/Acrobot-v1__1__1699875373-eval/rl-video-episode-0.mp4
eval_episode=0, episodic_return=[-75.]
Moviepy - Building video /notebooks/rl-algos/video/Acrobot-v1__1__1699875373-eval/rl-video-episode-1.mp4.
Moviepy - Writing video /notebooks/rl-algos/video/Acrobot-v1__1__1699875373-eval/rl-video-episode-1.mp4



                                                              

Moviepy - Done !
Moviepy - video ready /notebooks/rl-algos/video/Acrobot-v1__1__1699875373-eval/rl-video-episode-1.mp4
eval_episode=1, episodic_return=[-101.]
eval_episode=2, episodic_return=[-76.]
eval_episode=3, episodic_return=[-81.]
eval_episode=4, episodic_return=[-99.]
eval_episode=5, episodic_return=[-75.]
eval_episode=6, episodic_return=[-90.]
eval_episode=7, episodic_return=[-69.]
Moviepy - Building video /notebooks/rl-algos/video/Acrobot-v1__1__1699875373-eval/rl-video-episode-8.mp4.
Moviepy - Writing video /notebooks/rl-algos/video/Acrobot-v1__1__1699875373-eval/rl-video-episode-8.mp4



                                                              

Moviepy - Done !
Moviepy - video ready /notebooks/rl-algos/video/Acrobot-v1__1__1699875373-eval/rl-video-episode-8.mp4
eval_episode=8, episodic_return=[-99.]
eval_episode=9, episodic_return=[-82.]
eval_episode=10, episodic_return=[-86.]
eval_episode=11, episodic_return=[-63.]
eval_episode=12, episodic_return=[-96.]
eval_episode=13, episodic_return=[-92.]
eval_episode=14, episodic_return=[-63.]
eval_episode=15, episodic_return=[-78.]
eval_episode=16, episodic_return=[-77.]
eval_episode=17, episodic_return=[-79.]
eval_episode=18, episodic_return=[-86.]
eval_episode=19, episodic_return=[-92.]
eval_episode=20, episodic_return=[-69.]
eval_episode=21, episodic_return=[-80.]
eval_episode=22, episodic_return=[-84.]
eval_episode=23, episodic_return=[-63.]
eval_episode=24, episodic_return=[-115.]
eval_episode=25, episodic_return=[-68.]
eval_episode=26, episodic_return=[-69.]
Moviepy - Building video /notebooks/rl-algos/video/Acrobot-v1__1__1699875373-eval/rl-video-episode-27.mp4.
Moviepy - Writi

                                                             

Moviepy - Done !
Moviepy - video ready /notebooks/rl-algos/video/Acrobot-v1__1__1699875373-eval/rl-video-episode-27.mp4
eval_episode=27, episodic_return=[-74.]
eval_episode=28, episodic_return=[-80.]
eval_episode=29, episodic_return=[-83.]
eval_episode=30, episodic_return=[-82.]
eval_episode=31, episodic_return=[-76.]
eval_episode=32, episodic_return=[-103.]
eval_episode=33, episodic_return=[-70.]
eval_episode=34, episodic_return=[-75.]
eval_episode=35, episodic_return=[-91.]
eval_episode=36, episodic_return=[-83.]
eval_episode=37, episodic_return=[-78.]
eval_episode=38, episodic_return=[-108.]
eval_episode=39, episodic_return=[-76.]
eval_episode=40, episodic_return=[-100.]
eval_episode=41, episodic_return=[-126.]
eval_episode=42, episodic_return=[-85.]
eval_episode=43, episodic_return=[-102.]
eval_episode=44, episodic_return=[-92.]
eval_episode=45, episodic_return=[-85.]
eval_episode=46, episodic_return=[-143.]
eval_episode=47, episodic_return=[-89.]
eval_episode=48, episodic_return=[

                                                             

Moviepy - Done !
Moviepy - video ready /notebooks/rl-algos/video/Acrobot-v1__1__1699875373-eval/rl-video-episode-64.mp4
eval_episode=64, episodic_return=[-82.]
eval_episode=65, episodic_return=[-79.]
eval_episode=66, episodic_return=[-86.]
eval_episode=67, episodic_return=[-87.]
eval_episode=68, episodic_return=[-94.]
eval_episode=69, episodic_return=[-87.]
eval_episode=70, episodic_return=[-82.]
eval_episode=71, episodic_return=[-75.]
eval_episode=72, episodic_return=[-64.]
eval_episode=73, episodic_return=[-75.]
eval_episode=74, episodic_return=[-74.]
eval_episode=75, episodic_return=[-71.]
eval_episode=76, episodic_return=[-118.]
eval_episode=77, episodic_return=[-75.]
eval_episode=78, episodic_return=[-90.]
eval_episode=79, episodic_return=[-69.]
eval_episode=80, episodic_return=[-69.]
eval_episode=81, episodic_return=[-101.]
eval_episode=82, episodic_return=[-95.]
eval_episode=83, episodic_return=[-72.]
eval_episode=84, episodic_return=[-69.]
eval_episode=85, episodic_return=[-121

VBox(children=(Label(value='4.383 MB of 4.383 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
charts/SPS,▁▃▅▇▇████████▇▇▇▇▇▇▇▆▆▆▆▆▆▆▆▆▆▅▅▅▅▅▅▅▅▅▅
charts/episodic_length,█▅▄▄▂▃▂▂▂▃▂▂▂▁▃▂▁▂▁▂▁▂▁▁▁▁▁▁▁▂▁▂▁▃▁▂▁▁▂▂
charts/episodic_return,▁▄▅▅▇▆▇▇▇▆▇▇▇█▆▇█▇█▇█▇███████▇█▇█▆█▇██▇▇
charts/epsilon,█▇▆▅▅▄▄▃▃▃▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
eval/episodic_return,▇▇▇█▅▄▇▆█▆█▇▆█▅▂▄▃▆▄▆▆▃▅▃▅▅▄▇▇▁▅▃▇█▅▄▆▄▅
global_step,▁▁▁▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇███
losses/q_values,█▆▃▁▁▂▃▄▄▄▃▄▃▄▃▄▃▄▄▃▄▃▃▄▃▄▄▄▃▄▄▄▃▄▃▄▄▄▄▄
losses/td_loss,▁▂▂▅█▅▄▃▂▃▂▃▃▃▃▃▃▃▃▂▃▃▃▄▃▄▄▄▃▃▃▃▃▄▄▄▄▃▃▃

0,1
charts/SPS,1173.0
charts/episodic_length,64.0
charts/episodic_return,-63.0
charts/epsilon,0.05
eval/episodic_return,-90.0
global_step,499977.0
losses/q_values,-32.39475
losses/td_loss,1.19261
