In [1]:
import os
import time
import random
import wandb
import numpy as np
import gymnasium as gym
import flax.linen as nn
from flax.training.train_state import TrainState
from flax.training.common_utils import onehot

import jax.numpy as jnp
import jax
import optax

from torch.utils.tensorboard import SummaryWriter

In [2]:
def make_env(env_id,seed,idx,capture_video,run_name):
    def thunk():
        if capture_video and idx == 0:
            env = gym.make(env_id,render_mode="rgb_array")
            env = gym.wrappers.RecordVideo(env,f"videos/{run_name}")
        else:
            env = gym.make(env_id)

        env  = gym.wrappers.RecordEpisodeStatistics(env)
        env.action_space.seed(seed)
        return env
    return thunk

In [3]:
class Actor(nn.Module):
    action_dims : int

    @nn.compact
    def __call__(self,input:jnp.ndarray):
        x = nn.Dense(16)(input)
        x = nn.relu(x)
        x = nn.Dense(16)(x)
        x = nn.relu(x)
        x = nn.Dense(self.action_dims)(x)
        probs = nn.softmax(x)
        log_probs = nn.log_softmax(x)

        return probs, log_probs

In [4]:
class Critic(nn.Module):
    hidden_size: int

    @nn.compact
    def __call__(self,input:jnp.ndarray):
        x = nn.Dense(self.hidden_size)(input)
        x = nn.relu(x)
        x = nn.Dense(self.hidden_size)(x)
        x = nn.relu(x)
        x = nn.Dense(1)(x)

        return x

In [5]:
def train(env_id,gamma,episodes,max_termination,seed,num_envs,learning_rate):
    run_name = f"{env_id}__{seed}__{int(time.time())}"
    wandb.init(
        project="a2c-classic-control-benchmark",
        config={
            "env":env_id,
            "gamma":gamma,
            "episodes":episodes,
            "max_termination":max_termination,
            "seed":seed,
            "lr":learning_rate,
        },
        sync_tensorboard=True,
        monitor_gym=True,
        name=run_name
    )
    writer = SummaryWriter(f"runs/{run_name}")
    device = jax.devices("cpu")[0]
    random.seed(seed)
    np.random.seed(seed)
    key = jax.random.PRNGKey(seed)

    env = make_env(env_id,seed,0,True,run_name)()

    obs, _ = env.reset(seed=seed)
    action_dims = env.action_space.n

    actor = Actor(action_dims=action_dims)
    critic = Critic(hidden_size=120)

    actor_state = TrainState.create(
        apply_fn=actor.apply,
        params=jax.device_put(actor.init(key, obs),device),
        tx=optax.adam(learning_rate=learning_rate)
    )

    critic_state = TrainState.create(
        apply_fn=critic.apply,
        params=jax.device_put(critic.init(key, obs),device),
        tx=optax.adam(learning_rate=learning_rate)
    )

    actor.apply = jax.jit(actor.apply)
    critic.apply = jax.jit(critic.apply)

    @jax.jit
    def update(actor_state,critic_state,states,next_states,actions,rewards,dones,gamma):
        def actor_loss_fn(actor_params,critic_params):
            pred_value = critic.apply(critic_params,states).reshape(rewards.shape)
            target_value = rewards + gamma * (critic.apply(critic_params,next_states)).reshape(rewards.shape) * (1 -dones)
            advantage = target_value - pred_value
            advantage = (advantage - advantage.mean()) / (advantage.std() + 1e-8)

            _,logprobs = actor.apply(actor_params,states)
            onehot_actions = onehot(actions,num_classes=logprobs.shape[-1]).reshape(logprobs.shape)
            selected_action_logprobs = jnp.sum(onehot_actions * logprobs,axis=-1)
            actor_loss = -jnp.sum(selected_action_logprobs * advantage)

            return actor_loss

        def critic_loss_fn(critic_params):
            pred_value = critic.apply(critic_params,states)
            target_value = rewards + gamma * critic.apply(critic_params,next_states) * (1 -dones)
            advantage = target_value - pred_value
            # advantage = (advantage - advantage.mean()) / (advantage.std() + 1e-8)
            critic_loss = (advantage**2).mean()

            return critic_loss

        actor_loss,actor_gradients = jax.value_and_grad(actor_loss_fn)(actor_state.params,critic_state.params)
        critic_loss,critic_gradients = jax.value_and_grad(critic_loss_fn)(critic_state.params)

        actor_state = actor_state.apply_gradients(grads=actor_gradients)
        critic_state = critic_state.apply_gradients(grads=critic_gradients)

        return actor_loss,critic_loss,actor_state,critic_state


    for episode in range(episodes):
        states = []
        next_states = []
        rewards = []
        actions = []
        dones = []
        done = False
        obs, _ = env.reset(seed=seed)
        for _ in range(max_termination):
            states.append(obs)
            input = jnp.array(obs)[None,...]
            prob,_ = actor.apply(actor_state.params,jax.device_put(input,device))
            prob = jax.device_get(prob)[0]
            action = np.random.choice(action_dims,p=prob)
            actions.append(action)
            next_state,reward,done,_,_ = env.step(action)
            next_states.append(next_state)
            rewards.append(reward)
            if done:
                dones.append(1)
                break
            dones.append(0)

            obs = next_state

        states = jnp.array(states)
        next_states = jnp.array(next_states)
        rewards = jnp.array(rewards)
        actions = jnp.array(actions)
        dones = jnp.array(dones)

        states = jax.device_put(states,device)
        next_states = jax.device_put(next_states,device)
        rewards = jax.device_put(rewards,device)
        actions = jax.device_put(actions,device)
        dones = jax.device_put(dones,device)

        actor_loss,critic_loss,actor_state,critic_state = update(
            actor_state,
            critic_state,
            states,
            next_states,
            actions,
            rewards,
            dones,
            gamma
        )

        print(f"Episode:{episode}   ActorLoss:{actor_loss} CriticLoss:{critic_loss}  Reward:{sum(rewards)} ")
        writer.add_scalar("loss/actor_loss",jax.device_get(actor_loss),episode)
        writer.add_scalar("loss/critic_loss:",jax.device_get(critic_loss),episode)
        writer.add_scalar("rewards",sum(jax.device_get(rewards)),episode)
        # writer.add_scalar("advantage",jax.device_get(advantage),episode)

    env.close()
    writer.close()
    wandb.finish()

    return actor_state, critic_state



In [6]:
envs = {"cartPole":"CartPole-v1","acrobot":"Acrobot-v1","mountainCar":"MountainCar-v0"}
gamma = 1.0
episodes = 10000
max_termination = 1000
seed = 0
num_envs = 1
learning_rate = 0.0003

In [9]:
train(envs["mountainCar"],gamma,episodes,max_termination,seed,num_envs,learning_rate)

VBox(children=(Label(value='4.297 MB of 4.297 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
loss/actor_loss,▄▂▁▆▇▇█▇██▇▇▇▇█▇█▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇█▇▇▇▇▇
loss/critic_loss:,▇▆▅▄▄▂▂▅▃▄▂▃▂▁▃▃▂▃▂▂▁▂▃▂▁▃▂▁▁▂▁▁▅▂▁▁▁▁▁█
rewards,▁▆▇▇▇██▆▇▇████▇▇▇▇████▇██▇▇██▇██▇▇█████▆

0,1
global_step,9999.0
loss/actor_loss,4.84908
loss/critic_loss:,0.80137
rewards,-62.0


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011113883333746345, max=1.0…



Moviepy - Building video /notebooks/rl-algos/videos/MountainCar-v0__0__1700039789/rl-video-episode-0.mp4.
Moviepy - Writing video /notebooks/rl-algos/videos/MountainCar-v0__0__1700039789/rl-video-episode-0.mp4



                                                               

Moviepy - Done !
Moviepy - video ready /notebooks/rl-algos/videos/MountainCar-v0__0__1700039789/rl-video-episode-0.mp4
Episode:0   ActorLoss:-3.4766998291015625 CriticLoss:0.9999780058860779  Reward:-1000.0 
Moviepy - Building video /notebooks/rl-algos/videos/MountainCar-v0__0__1700039789/rl-video-episode-1.mp4.
Moviepy - Writing video /notebooks/rl-algos/videos/MountainCar-v0__0__1700039789/rl-video-episode-1.mp4



                                                               

Moviepy - Done !
Moviepy - video ready /notebooks/rl-algos/videos/MountainCar-v0__0__1700039789/rl-video-episode-1.mp4
Episode:1   ActorLoss:1.723388671875 CriticLoss:1.0002377033233643  Reward:-1000.0 
Episode:2   ActorLoss:0.43910980224609375 CriticLoss:1.000009298324585  Reward:-1000.0 
Episode:3   ActorLoss:-6.335899353027344 CriticLoss:0.9999366402626038  Reward:-1000.0 
Episode:4   ActorLoss:8.066776275634766 CriticLoss:0.9998784065246582  Reward:-1000.0 
Episode:5   ActorLoss:-0.9016666412353516 CriticLoss:0.9999385476112366  Reward:-1000.0 
Episode:6   ActorLoss:-9.008056640625 CriticLoss:1.0001568794250488  Reward:-1000.0 
Episode:7   ActorLoss:-18.988494873046875 CriticLoss:1.0001188516616821  Reward:-1000.0 
Moviepy - Building video /notebooks/rl-algos/videos/MountainCar-v0__0__1700039789/rl-video-episode-8.mp4.
Moviepy - Writing video /notebooks/rl-algos/videos/MountainCar-v0__0__1700039789/rl-video-episode-8.mp4



                                                               

Moviepy - Done !
Moviepy - video ready /notebooks/rl-algos/videos/MountainCar-v0__0__1700039789/rl-video-episode-8.mp4
Episode:8   ActorLoss:-13.457311630249023 CriticLoss:1.0000417232513428  Reward:-1000.0 
Episode:9   ActorLoss:-18.58318328857422 CriticLoss:1.0000351667404175  Reward:-1000.0 
Episode:10   ActorLoss:4.3456573486328125 CriticLoss:0.9999343156814575  Reward:-1000.0 
Episode:11   ActorLoss:-0.3752098083496094 CriticLoss:1.0007034540176392  Reward:-726.0 
Episode:12   ActorLoss:-23.39531707763672 CriticLoss:0.9999529123306274  Reward:-1000.0 
Episode:13   ActorLoss:-20.40218734741211 CriticLoss:0.9999614357948303  Reward:-1000.0 
Episode:14   ActorLoss:-14.281719207763672 CriticLoss:1.0000317096710205  Reward:-1000.0 
Episode:15   ActorLoss:-29.85477066040039 CriticLoss:0.9999908208847046  Reward:-1000.0 
Episode:16   ActorLoss:-46.38102722167969 CriticLoss:0.9999642968177795  Reward:-1000.0 
Episode:17   ActorLoss:-61.16917419433594 CriticLoss:0.999997615814209  Reward:-

                                                               

Moviepy - Done !
Moviepy - video ready /notebooks/rl-algos/videos/MountainCar-v0__0__1700039789/rl-video-episode-27.mp4
Episode:27   ActorLoss:-70.8912353515625 CriticLoss:1.0000076293945312  Reward:-1000.0 
Episode:28   ActorLoss:-52.13883590698242 CriticLoss:1.0000107288360596  Reward:-1000.0 
Episode:29   ActorLoss:-60.5880126953125 CriticLoss:1.0000067949295044  Reward:-1000.0 
Episode:30   ActorLoss:-47.6165771484375 CriticLoss:1.0000091791152954  Reward:-1000.0 
Episode:31   ActorLoss:-63.603477478027344 CriticLoss:1.0000065565109253  Reward:-1000.0 
Episode:32   ActorLoss:-51.6988525390625 CriticLoss:0.9999669194221497  Reward:-1000.0 
Episode:33   ActorLoss:-45.12483215332031 CriticLoss:1.0000216960906982  Reward:-1000.0 
Episode:34   ActorLoss:-28.270092010498047 CriticLoss:0.9999560713768005  Reward:-1000.0 
Episode:35   ActorLoss:-46.01752471923828 CriticLoss:1.0000238418579102  Reward:-1000.0 
Episode:36   ActorLoss:-22.349563598632812 CriticLoss:0.9999954104423523  Reward:

                                                               

Moviepy - Done !
Moviepy - video ready /notebooks/rl-algos/videos/MountainCar-v0__0__1700039789/rl-video-episode-64.mp4
Episode:64   ActorLoss:-19.538259506225586 CriticLoss:0.9999485611915588  Reward:-1000.0 
Episode:65   ActorLoss:-12.918020248413086 CriticLoss:1.0000120401382446  Reward:-1000.0 
Episode:66   ActorLoss:-6.061237335205078 CriticLoss:0.9998967051506042  Reward:-1000.0 
Episode:67   ActorLoss:-6.164813995361328 CriticLoss:0.999983012676239  Reward:-1000.0 
Episode:68   ActorLoss:13.926589965820312 CriticLoss:0.9998974204063416  Reward:-1000.0 
Episode:69   ActorLoss:-3.0826034545898438 CriticLoss:0.9999580979347229  Reward:-1000.0 
Episode:70   ActorLoss:10.939598083496094 CriticLoss:0.9999180436134338  Reward:-1000.0 
Episode:71   ActorLoss:-0.943634033203125 CriticLoss:1.0000337362289429  Reward:-1000.0 
Episode:72   ActorLoss:5.0203704833984375 CriticLoss:0.9999318718910217  Reward:-1000.0 
Episode:73   ActorLoss:-0.96307373046875 CriticLoss:0.999792754650116  Reward

                                                               

Moviepy - Done !
Moviepy - video ready /notebooks/rl-algos/videos/MountainCar-v0__0__1700039789/rl-video-episode-125.mp4
Episode:125   ActorLoss:-1.8610382080078125 CriticLoss:0.9999690055847168  Reward:-1000.0 
Episode:126   ActorLoss:-2.0472450256347656 CriticLoss:1.0001214742660522  Reward:-1000.0 
Episode:127   ActorLoss:7.66070556640625 CriticLoss:0.9998294115066528  Reward:-1000.0 
Episode:128   ActorLoss:23.079376220703125 CriticLoss:0.9998533129692078  Reward:-1000.0 
Episode:129   ActorLoss:-4.821846008300781 CriticLoss:0.9999924302101135  Reward:-1000.0 
Episode:130   ActorLoss:3.6916732788085938 CriticLoss:1.0001879930496216  Reward:-1000.0 
Episode:131   ActorLoss:9.095115661621094 CriticLoss:0.999687910079956  Reward:-1000.0 
Episode:132   ActorLoss:22.683652877807617 CriticLoss:1.0000994205474854  Reward:-1000.0 
Episode:133   ActorLoss:-16.136154174804688 CriticLoss:1.0000633001327515  Reward:-1000.0 
Episode:134   ActorLoss:2.2039451599121094 CriticLoss:1.00007438659667

                                                               

Moviepy - Done !
Moviepy - video ready /notebooks/rl-algos/videos/MountainCar-v0__0__1700039789/rl-video-episode-216.mp4
Episode:216   ActorLoss:10.556756973266602 CriticLoss:0.9999392628669739  Reward:-1000.0 
Episode:217   ActorLoss:-0.46443986892700195 CriticLoss:0.9981387853622437  Reward:-839.0 
Episode:218   ActorLoss:0.8656082153320312 CriticLoss:1.0000271797180176  Reward:-1000.0 
Episode:219   ActorLoss:3.840902328491211 CriticLoss:1.0000330209732056  Reward:-1000.0 
Episode:220   ActorLoss:-0.6302604675292969 CriticLoss:1.0000145435333252  Reward:-1000.0 
Episode:221   ActorLoss:14.921539306640625 CriticLoss:0.9999246001243591  Reward:-1000.0 
Episode:222   ActorLoss:-1.1946830749511719 CriticLoss:0.9998180866241455  Reward:-1000.0 
Episode:223   ActorLoss:8.25566291809082 CriticLoss:0.9999668598175049  Reward:-1000.0 
Episode:224   ActorLoss:8.414905548095703 CriticLoss:1.000005841255188  Reward:-1000.0 
Episode:225   ActorLoss:-0.7228145599365234 CriticLoss:1.00004100799560

                                                               

Moviepy - Done !
Moviepy - video ready /notebooks/rl-algos/videos/MountainCar-v0__0__1700039789/rl-video-episode-343.mp4
Episode:343   ActorLoss:-35.28107833862305 CriticLoss:0.999975860118866  Reward:-1000.0 
Episode:344   ActorLoss:-17.22473907470703 CriticLoss:1.0000238418579102  Reward:-1000.0 
Episode:345   ActorLoss:-34.306278228759766 CriticLoss:0.9998108744621277  Reward:-1000.0 
Episode:346   ActorLoss:-11.240711212158203 CriticLoss:1.0000338554382324  Reward:-1000.0 
Episode:347   ActorLoss:-59.51388931274414 CriticLoss:0.9992944002151489  Reward:-1000.0 
Episode:348   ActorLoss:-55.825904846191406 CriticLoss:0.9996883869171143  Reward:-1000.0 
Episode:349   ActorLoss:-25.96421241760254 CriticLoss:0.9998571276664734  Reward:-1000.0 
Episode:350   ActorLoss:3.4058361053466797 CriticLoss:0.9999542236328125  Reward:-1000.0 
Episode:351   ActorLoss:-25.985919952392578 CriticLoss:0.9997777342796326  Reward:-1000.0 
Episode:352   ActorLoss:-41.365745544433594 CriticLoss:0.999786555

                                                               

Moviepy - Done !
Moviepy - video ready /notebooks/rl-algos/videos/MountainCar-v0__0__1700039789/rl-video-episode-512.mp4
Episode:512   ActorLoss:-38.880943298339844 CriticLoss:0.9998090863227844  Reward:-1000.0 
Episode:513   ActorLoss:-13.074246406555176 CriticLoss:0.9985834956169128  Reward:-1000.0 
Episode:514   ActorLoss:-19.641841888427734 CriticLoss:0.9985926151275635  Reward:-1000.0 
Episode:515   ActorLoss:-21.537317276000977 CriticLoss:0.9997833967208862  Reward:-1000.0 
Episode:516   ActorLoss:-40.57011032104492 CriticLoss:0.999931812286377  Reward:-1000.0 
Episode:517   ActorLoss:-23.560001373291016 CriticLoss:0.9996964931488037  Reward:-1000.0 
Episode:518   ActorLoss:-21.690189361572266 CriticLoss:0.9989565014839172  Reward:-1000.0 
Episode:519   ActorLoss:-31.752933502197266 CriticLoss:0.999932587146759  Reward:-1000.0 
Episode:520   ActorLoss:-4.576496124267578 CriticLoss:1.000203013420105  Reward:-1000.0 
Episode:521   ActorLoss:-20.394248962402344 CriticLoss:0.99930268

                                                               

Moviepy - Done !
Moviepy - video ready /notebooks/rl-algos/videos/MountainCar-v0__0__1700039789/rl-video-episode-729.mp4
Episode:729   ActorLoss:-62.44927215576172 CriticLoss:0.9990639686584473  Reward:-1000.0 
Episode:730   ActorLoss:-16.56673812866211 CriticLoss:0.9993048906326294  Reward:-1000.0 
Episode:731   ActorLoss:-23.59239959716797 CriticLoss:0.999040961265564  Reward:-1000.0 
Episode:732   ActorLoss:3.8604774475097656 CriticLoss:0.999947190284729  Reward:-1000.0 
Episode:733   ActorLoss:19.112031936645508 CriticLoss:0.9983330368995667  Reward:-1000.0 
Episode:734   ActorLoss:-68.86683654785156 CriticLoss:0.9981240630149841  Reward:-1000.0 
Episode:735   ActorLoss:-55.96338653564453 CriticLoss:0.9988962411880493  Reward:-1000.0 
Episode:736   ActorLoss:-25.553985595703125 CriticLoss:0.9987087249755859  Reward:-1000.0 
Episode:737   ActorLoss:-13.400030136108398 CriticLoss:0.9986602067947388  Reward:-1000.0 
Episode:738   ActorLoss:-35.44041442871094 CriticLoss:0.9982370138168

                                                               

Moviepy - Done !
Moviepy - video ready /notebooks/rl-algos/videos/MountainCar-v0__0__1700039789/rl-video-episode-1000.mp4
Episode:1000   ActorLoss:-59.55449295043945 CriticLoss:0.9986356496810913  Reward:-1000.0 
Episode:1001   ActorLoss:-47.336734771728516 CriticLoss:0.9999045729637146  Reward:-1000.0 
Episode:1002   ActorLoss:-79.91996765136719 CriticLoss:0.9992104172706604  Reward:-1000.0 
Episode:1003   ActorLoss:-19.341358184814453 CriticLoss:0.9844438433647156  Reward:-773.0 
Episode:1004   ActorLoss:-28.269309997558594 CriticLoss:0.9890973567962646  Reward:-909.0 
Episode:1005   ActorLoss:-81.51876831054688 CriticLoss:0.9975156188011169  Reward:-1000.0 
Episode:1006   ActorLoss:-113.58039855957031 CriticLoss:0.9986810684204102  Reward:-1000.0 
Episode:1007   ActorLoss:-74.95558166503906 CriticLoss:0.9990494847297668  Reward:-1000.0 
Episode:1008   ActorLoss:-70.77967834472656 CriticLoss:0.9999619722366333  Reward:-1000.0 
Episode:1009   ActorLoss:-2.971181869506836 CriticLoss:0.

                                                               

Moviepy - Done !
Moviepy - video ready /notebooks/rl-algos/videos/MountainCar-v0__0__1700039789/rl-video-episode-2000.mp4
Episode:2000   ActorLoss:-64.17903900146484 CriticLoss:0.9600066542625427  Reward:-606.0 
Episode:2001   ActorLoss:-70.975830078125 CriticLoss:0.9632236957550049  Reward:-640.0 
Episode:2002   ActorLoss:-2.0802910327911377 CriticLoss:0.9467821717262268  Reward:-432.0 
Episode:2003   ActorLoss:-5.815560817718506 CriticLoss:0.9644057154655457  Reward:-630.0 
Episode:2004   ActorLoss:3.774383544921875 CriticLoss:0.953609049320221  Reward:-497.0 
Episode:2005   ActorLoss:-1.255157470703125 CriticLoss:0.9435612559318542  Reward:-396.0 
Episode:2006   ActorLoss:0.5998541116714478 CriticLoss:0.9403132796287537  Reward:-355.0 
Episode:2007   ActorLoss:0.33476805686950684 CriticLoss:0.9641578197479248  Reward:-675.0 
Episode:2008   ActorLoss:-51.071502685546875 CriticLoss:0.9462294578552246  Reward:-432.0 
Episode:2009   ActorLoss:-3.5640106201171875 CriticLoss:0.94432127475

                                                               

Moviepy - Done !
Moviepy - video ready /notebooks/rl-algos/videos/MountainCar-v0__0__1700039789/rl-video-episode-3000.mp4
Episode:3000   ActorLoss:-0.09061241149902344 CriticLoss:0.9409428238868713  Reward:-371.0 
Episode:3001   ActorLoss:-1.776652216911316 CriticLoss:0.955361545085907  Reward:-537.0 
Episode:3002   ActorLoss:-0.8535889387130737 CriticLoss:0.9284461140632629  Reward:-291.0 
Episode:3003   ActorLoss:-1.285776138305664 CriticLoss:0.9262554049491882  Reward:-295.0 
Episode:3004   ActorLoss:-3.774988889694214 CriticLoss:0.9572947025299072  Reward:-542.0 
Episode:3005   ActorLoss:-2.685182571411133 CriticLoss:0.9489187598228455  Reward:-464.0 
Episode:3006   ActorLoss:-8.006889343261719 CriticLoss:0.9562374353408813  Reward:-543.0 
Episode:3007   ActorLoss:-5.247256278991699 CriticLoss:0.9454429149627686  Reward:-368.0 
Episode:3008   ActorLoss:-4.644786834716797 CriticLoss:0.9539735913276672  Reward:-483.0 
Episode:3009   ActorLoss:-9.40081787109375 CriticLoss:0.9664964079

                                                               

Moviepy - Done !
Moviepy - video ready /notebooks/rl-algos/videos/MountainCar-v0__0__1700039789/rl-video-episode-4000.mp4
Episode:4000   ActorLoss:-1.0764567852020264 CriticLoss:0.9107986688613892  Reward:-214.0 
Episode:4001   ActorLoss:-1.5217032432556152 CriticLoss:0.9175847768783569  Reward:-236.0 
Episode:4002   ActorLoss:-0.8715811967849731 CriticLoss:0.9167501926422119  Reward:-246.0 
Episode:4003   ActorLoss:-1.411940097808838 CriticLoss:0.9250324964523315  Reward:-274.0 
Episode:4004   ActorLoss:-4.88481330871582 CriticLoss:0.9461444616317749  Reward:-429.0 
Episode:4005   ActorLoss:-3.797642230987549 CriticLoss:0.9332959055900574  Reward:-302.0 
Episode:4006   ActorLoss:3.0445797443389893 CriticLoss:0.9187278747558594  Reward:-250.0 
Episode:4007   ActorLoss:-0.10047399997711182 CriticLoss:0.9326445460319519  Reward:-314.0 
Episode:4008   ActorLoss:-0.4153546392917633 CriticLoss:0.9107107520103455  Reward:-211.0 
Episode:4009   ActorLoss:-1.6373820304870605 CriticLoss:0.92465

                                                               

Moviepy - Done !
Moviepy - video ready /notebooks/rl-algos/videos/MountainCar-v0__0__1700039789/rl-video-episode-5000.mp4
Episode:5000   ActorLoss:-0.09368586540222168 CriticLoss:0.9026830196380615  Reward:-177.0 
Episode:5001   ActorLoss:-1.7542355060577393 CriticLoss:0.9129717946052551  Reward:-230.0 
Episode:5002   ActorLoss:-0.7877770662307739 CriticLoss:0.9057787656784058  Reward:-193.0 
Episode:5003   ActorLoss:-3.667980432510376 CriticLoss:0.9348154664039612  Reward:-304.0 
Episode:5004   ActorLoss:-0.07444918155670166 CriticLoss:0.8974233865737915  Reward:-173.0 
Episode:5005   ActorLoss:-3.403568744659424 CriticLoss:0.9336190819740295  Reward:-303.0 
Episode:5006   ActorLoss:-2.068891763687134 CriticLoss:0.9157416224479675  Reward:-242.0 
Episode:5007   ActorLoss:-3.112616777420044 CriticLoss:0.8946735858917236  Reward:-162.0 
Episode:5008   ActorLoss:-2.2195682525634766 CriticLoss:0.9169862866401672  Reward:-241.0 
Episode:5009   ActorLoss:-0.12224626541137695 CriticLoss:0.91

                                                               

Moviepy - Done !
Moviepy - video ready /notebooks/rl-algos/videos/MountainCar-v0__0__1700039789/rl-video-episode-6000.mp4
Episode:6000   ActorLoss:-1.5772695541381836 CriticLoss:0.88450688123703  Reward:-155.0 
Episode:6001   ActorLoss:-3.264010190963745 CriticLoss:0.8959236145019531  Reward:-169.0 
Episode:6002   ActorLoss:-3.1423234939575195 CriticLoss:0.8905232548713684  Reward:-156.0 
Episode:6003   ActorLoss:-2.1784560680389404 CriticLoss:0.9115093946456909  Reward:-183.0 
Episode:6004   ActorLoss:-1.6107889413833618 CriticLoss:0.8918569087982178  Reward:-164.0 
Episode:6005   ActorLoss:-80.54733276367188 CriticLoss:0.9193882346153259  Reward:-233.0 
Episode:6006   ActorLoss:0.8902485370635986 CriticLoss:0.9157382845878601  Reward:-219.0 
Episode:6007   ActorLoss:-0.7055087089538574 CriticLoss:0.8823320865631104  Reward:-148.0 
Episode:6008   ActorLoss:-1.366044282913208 CriticLoss:0.8941649794578552  Reward:-159.0 
Episode:6009   ActorLoss:-1.9855031967163086 CriticLoss:0.9114092

                                                               

Moviepy - Done !
Moviepy - video ready /notebooks/rl-algos/videos/MountainCar-v0__0__1700039789/rl-video-episode-7000.mp4
Episode:7000   ActorLoss:-1.1716175079345703 CriticLoss:0.8731068968772888  Reward:-152.0 
Episode:7001   ActorLoss:-2.005544900894165 CriticLoss:0.8831243515014648  Reward:-159.0 
Episode:7002   ActorLoss:-1.2266407012939453 CriticLoss:0.8890246748924255  Reward:-168.0 
Episode:7003   ActorLoss:0.06386715173721313 CriticLoss:0.8721104860305786  Reward:-145.0 
Episode:7004   ActorLoss:-0.8293002843856812 CriticLoss:0.8719105124473572  Reward:-151.0 
Episode:7005   ActorLoss:-0.7833711504936218 CriticLoss:0.8736335039138794  Reward:-153.0 
Episode:7006   ActorLoss:-0.8707468509674072 CriticLoss:0.8756109476089478  Reward:-152.0 
Episode:7007   ActorLoss:-3.0264530181884766 CriticLoss:0.8706402778625488  Reward:-147.0 
Episode:7008   ActorLoss:-1.7093327045440674 CriticLoss:0.8748108744621277  Reward:-151.0 
Episode:7009   ActorLoss:-1.5152746438980103 CriticLoss:0.87

                                                              

Moviepy - Done !
Moviepy - video ready /notebooks/rl-algos/videos/MountainCar-v0__0__1700039789/rl-video-episode-8000.mp4
Episode:8000   ActorLoss:-1.2369964122772217 CriticLoss:0.844162106513977  Reward:-108.0 
Episode:8001   ActorLoss:-0.35896700620651245 CriticLoss:0.8765950798988342  Reward:-148.0 
Episode:8002   ActorLoss:-1.8578954935073853 CriticLoss:0.8750340938568115  Reward:-154.0 
Episode:8003   ActorLoss:-2.258283853530884 CriticLoss:0.8790132999420166  Reward:-157.0 
Episode:8004   ActorLoss:-1.4558025598526 CriticLoss:0.8733274340629578  Reward:-148.0 
Episode:8005   ActorLoss:-2.346987247467041 CriticLoss:0.8709321618080139  Reward:-154.0 
Episode:8006   ActorLoss:-0.5462942123413086 CriticLoss:0.8692678809165955  Reward:-156.0 
Episode:8007   ActorLoss:-1.8943767547607422 CriticLoss:0.8717532157897949  Reward:-159.0 
Episode:8008   ActorLoss:-0.41735196113586426 CriticLoss:0.8706449866294861  Reward:-154.0 
Episode:8009   ActorLoss:-1.941080093383789 CriticLoss:0.870518

                                                               

Moviepy - Done !
Moviepy - video ready /notebooks/rl-algos/videos/MountainCar-v0__0__1700039789/rl-video-episode-9000.mp4
Episode:9000   ActorLoss:-1.4768134355545044 CriticLoss:0.8730974793434143  Reward:-149.0 
Episode:9001   ActorLoss:-1.1376954317092896 CriticLoss:0.8683357834815979  Reward:-155.0 
Episode:9002   ActorLoss:-2.3157050609588623 CriticLoss:0.8675617575645447  Reward:-159.0 
Episode:9003   ActorLoss:0.1144675612449646 CriticLoss:0.851065993309021  Reward:-118.0 
Episode:9004   ActorLoss:-0.9870502948760986 CriticLoss:0.8736968040466309  Reward:-168.0 
Episode:9005   ActorLoss:-2.5114974975585938 CriticLoss:0.8793404698371887  Reward:-149.0 
Episode:9006   ActorLoss:-0.8564256429672241 CriticLoss:0.8680811524391174  Reward:-155.0 
Episode:9007   ActorLoss:-1.3879289627075195 CriticLoss:0.8685438632965088  Reward:-157.0 
Episode:9008   ActorLoss:-0.06892868876457214 CriticLoss:0.8691335916519165  Reward:-162.0 
Episode:9009   ActorLoss:0.31972450017929077 CriticLoss:0.87

                                                  

Moviepy - Done !
Moviepy - video ready /notebooks/rl-algos/videos/MountainCar-v0__0__1700039789/rl-video-episode-10000.mp4


(TrainState(step=Array(10000, dtype=int32, weak_type=True), apply_fn=<bound method Module.apply of Actor(
     # attributes
     action_dims = 3
 )>, params=FrozenDict({
     params: {
         Dense_0: {
             bias: Array([ 0.3890526 , -0.24297777,  0.01788891, -0.01455689,  0.08531468,
                     0.4124792 ,  0.18495555, -0.06584055,  0.3103369 ,  0.274437  ,
                     0.2599745 ,  0.14618884,  0.31844646,  0.35240522,  0.17520374,
                     0.37202245], dtype=float32),
             kernel: Array([[-0.5168959 , -1.6821604 , -1.4188243 ,  0.7976215 , -0.12126786,
                      0.8608355 , -0.04409919,  0.12743327, -1.4020491 , -0.5442583 ,
                     -0.6169948 ,  0.06898312,  1.0469633 , -0.7097582 , -0.9452211 ,
                     -0.5557748 ],
                    [ 0.02562904,  0.8810868 ,  1.5217787 , -0.14143288,  2.4037855 ,
                      1.8099678 , -3.1252618 ,  0.3917418 , -0.14016785,  2.1242764 ,
           