# **Library Loading**


In [None]:
!pip install stable-baselines3[extra]
!pip install shimmy>=2.0
!pip install swig
!pip install pyvirtualdisplay
!pip install sbx-rl
!pip install optuna
!pip install "optuna>=3.3.0" "optuna-dashboard>=0.12.0"

Collecting stable-baselines3[extra]
  Downloading stable_baselines3-2.6.0-py3-none-any.whl.metadata (4.8 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<3.0,>=2.3->stable-baselines3[extra])
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch<3.0,>=2.3->stable-baselines3[extra])
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch<3.0,>=2.3->stable-baselines3[extra])
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch<3.0,>=2.3->stable-baselines3[extra])
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch<3.0,>=2.3->stable-baselines3[extra])
  Downloading nvidia_cublas_cu12-12.4.5.8-py

In [None]:
import gymnasium as gym
from gymnasium import spaces
import optuna
from optuna.pruners import MedianPruner
from optuna.samplers import TPESampler
import numpy as np
import glob
import io
import base64
from IPython.display import HTML
from pyvirtualdisplay import Display
from IPython import display as ipythondisplay
from stable_baselines3.common.utils import set_random_seed
from stable_baselines3.common.vec_env import SubprocVecEnv
from stable_baselines3.common.env_util import make_vec_env
from math import radians
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.vec_env import SubprocVecEnv
from stable_baselines3 import DQN
from stable_baselines3 import A2C
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.buffers import RolloutBuffer
import random as rand

import torch
import torch.nn as nn

from typing import Any, Dict

# **Video Loading**

In [None]:
#'eval_video/*.mp4'

def show_video(path):
    """Embed the first video matching ``path`` in the notebook output.

    :param path: (str) glob pattern locating the video, e.g. 'eval_video/*.mp4'
    :return: None. Prints "Could not find video" when nothing matches the pattern.
    """
    mp4list = glob.glob(path)
    if not mp4list:
        print("Could not find video")
        return

    # Read-only binary mode is enough here ('r+b' would also require write
    # permission), and the context manager closes the handle deterministically.
    with open(mp4list[0], 'rb') as video_file:
        encoded = base64.b64encode(video_file.read())

    # The clip is inlined as a base64 data URI, so no file server is needed.
    ipythondisplay.display(HTML(data='''<video alt="test" autoplay
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))

# **Clear Training Videos**

In [None]:
! rm -rf /content/video
! rm -rf /content/mountain_cart_tensorboard
! rm -rf /content/eval_video_dqn
! rm -rf /content/eval_video_ppo

# **Output Quantizing** [2]

In [None]:
def discretize_state_obs(obs, env, state_buckets=(20, 20)):
  """Map a continuous observation onto integer bucket indices.

  Each component is rescaled linearly from the environment's ``[low, high]``
  range to ``[0, state_buckets[i] - 1]``, rounded to the nearest integer and
  clamped to that range.

  :param obs: sequence of finite numbers, one per observation dimension
  :param env: object exposing ``observation_space.low`` and ``observation_space.high``
  :param state_buckets: number of buckets per dimension; must provide at least
      ``len(obs)`` entries
  :return: tuple of ints, one bucket index per observation dimension
  :raises AssertionError: if ``obs`` contains NaN or inf
  """
  try:
    assert not np.any(np.isnan(obs)), "NaN in obs"
    assert not np.any(np.isinf(obs)), "inf in obs"
    lower_bounds = env.observation_space.low
    upper_bounds = env.observation_space.high

    new_obs = []
    for i in range(len(obs)):
      # Offset by the lower bound itself. Using `obs + abs(low)` is only
      # equivalent while low <= 0 and mis-scales spaces with a positive low.
      fraction = (obs[i] - lower_bounds[i]) / (upper_bounds[i] - lower_bounds[i])
      bucket = int(round((state_buckets[i] - 1) * fraction))
      # Clamp so that out-of-range observations still land in a valid bucket.
      new_obs.append(min(max(0, bucket), state_buckets[i] - 1))
    return tuple(new_obs)
  except Exception as e:
    # Log the offending observation, then re-raise with the original traceback.
    print('Error in discretize_state_obs: ', e)
    print("OBS: ", obs)
    raise


In [None]:
def make_env(env_id, rank, seed=0):
    """
    Utility function for multiprocessed env.

    :param env_id: (str) the environment ID
    :param rank: (int) index of the subprocess
    :param seed: (int) the initial seed for RNG
    :return: (callable) zero-argument factory that builds the wrapped,
        already-reset environment
    """

    def _init():
        # Keep the base env under its own name. The transform lambda is
        # late-binding, so closing over a name that is later rebound to the
        # wrapper would make the discretizer read its bounds from the wrapper.
        base_env = gym.make(env_id)
        # NOTE(review): discretize_state_obs returns integer bucket indices,
        # while the wrapper still advertises the base env's observation space.
        # Confirm that this mismatch is intended.
        env = gym.wrappers.TransformObservation(
            base_env,
            lambda obs: discretize_state_obs(obs, base_env),
            base_env.observation_space,
        )
        # use a seed for reproducibility
        # Important: use a different seed for each environment
        # otherwise they would generate the same experiences
        env.reset(seed=seed + rank)
        return env

    set_random_seed(seed)
    return _init

# **Hyperparameter Tuning for A2C**

In [None]:
# Keep the base env under its own name: the transform lambda is late-binding,
# so closing over `ENV` (which is rebound to the wrapper) would make the
# discretizer read its bounds through the wrapper instead of the base env.
_BASE_ENV = gym.make("MountainCar-v0")
ENV = gym.wrappers.TransformObservation(
    _BASE_ENV,
    lambda obs: discretize_state_obs(obs, _BASE_ENV),
    _BASE_ENV.observation_space,
)

# Arguments shared by every A2C model built during the study; the sampled
# hyperparameters are merged on top of these in `objective`.
DEFAULT_HYPERPARAMS = {
    "policy": "MlpPolicy",
    "env": ENV,
    "normalize_advantage": True
}

N_TRIALS = 100  # passed to study.optimize as n_trials
N_STARTUP_TRIALS = 5  # n_startup_trials for both the TPE sampler and the median pruner
N_EVALUATIONS = 5  # number of evaluations per trial (determines EVAL_FREQ)
N_TIMESTEPS = int(5e5)  # training budget handed to model.learn for each trial
EVAL_FREQ = int(N_TIMESTEPS / N_EVALUATIONS)  # evaluation interval given to the callback
N_EVAL_EPISODES = 100  # episodes per evaluation

def sample_a2c_params(trial: optuna.Trial) -> Dict[str, Any]:
    """Sample one set of A2C hyperparameters from ``trial``.

    ``gamma`` and ``gae_lambda`` are sampled as ``1 - value`` on a log scale and
    ``n_steps`` as a power of two, so the numbers Optuna stores under those
    parameter names are NOT the values handed to A2C. The effective values are
    therefore also recorded as user attributes (``gamma_``, ``gae_lambda_``,
    ``n_steps``) so they show up next to the trial.

    :param trial: Optuna trial used to draw the suggestions
    :return: keyword arguments to merge into the ``A2C`` constructor call
    """
    gamma = 1.0 - trial.suggest_float("gamma", 0.0001, 0.1, log=True)
    max_grad_norm = trial.suggest_float("max_grad_norm", 0.3, 5.0, log=True)
    gae_lambda = 1.0 - trial.suggest_float("gae_lambda", 0.001, 0.2, log=True)
    # n_steps ranges over 2**8 .. 2**15, i.e. 256 .. 32768.
    # NOTE(review): these rollouts are long relative to N_TIMESTEPS (5e5);
    # confirm the range is intended.
    n_steps_exp = trial.suggest_int("n_steps_exp", 8, 15, log=True)
    n_steps = 2**n_steps_exp
    learning_rate = trial.suggest_float("lr", 1e-7, 1e-3, log=True)
    vf_coef = trial.suggest_float("vf_coef", 0.01, 1, log=True)
    ent_coef = trial.suggest_float("ent_coef", 0.00001, 0.1, log=True)
    rms_prop_eps = trial.suggest_float("rms_prop_eps", 1e-8, 1e-3, log=True)
    use_rms_prop = trial.suggest_categorical("use_rms_prop", [True, False])

    # Record the values actually used, not just the raw sampled quantities.
    trial.set_user_attr("gamma_", gamma)
    trial.set_user_attr("gae_lambda_", gae_lambda)
    trial.set_user_attr("n_steps", n_steps)

    return {
        "n_steps": n_steps,
        "gamma": gamma,
        "gae_lambda": gae_lambda,
        "learning_rate": learning_rate,
        "ent_coef": ent_coef,
        "vf_coef": vf_coef,
        "max_grad_norm": max_grad_norm,
        "rms_prop_eps": rms_prop_eps,
        "use_rms_prop": use_rms_prop,
    }


In [None]:
class TrialEvalCallback(EvalCallback):
    """EvalCallback variant that reports each evaluation to an Optuna trial.

    After every evaluation the latest mean reward is reported to the trial;
    when the trial asks to be pruned, ``is_pruned`` is set and training is
    stopped by returning ``False`` from ``_on_step``.
    """

    def __init__(
        self,
        eval_env: gym.Env,
        trial: optuna.Trial,
        n_eval_episodes: int = 100,
        eval_freq: int = 10000,
        deterministic: bool = True,
        verbose: int = 0,
    ):
        """
        :param eval_env: environment used for the evaluations
        :param trial: Optuna trial that receives the intermediate values
        :param n_eval_episodes: forwarded to ``EvalCallback``
        :param eval_freq: forwarded to ``EvalCallback``; evaluate every this many calls
        :param deterministic: forwarded to ``EvalCallback``
        :param verbose: forwarded to ``EvalCallback``
        """
        super().__init__(
            eval_env=eval_env,
            n_eval_episodes=n_eval_episodes,
            eval_freq=eval_freq,
            deterministic=deterministic,
            verbose=verbose,
        )
        self.is_pruned = False  # set once the trial requests pruning
        self.eval_idx = 0  # number of evaluations reported so far
        self.trial = trial

    def _on_step(self) -> bool:
        """Evaluate when due, report to the trial, and return False once pruned."""
        evaluation_due = self.eval_freq > 0 and self.n_calls % self.eval_freq == 0
        if not evaluation_due:
            return True

        # Delegate to the parent callback, which is expected to refresh
        # `self.last_mean_reward`.
        super()._on_step()
        self.eval_idx += 1
        self.trial.report(self.last_mean_reward, self.eval_idx)

        # Keep training unless the trial asks to be pruned.
        if not self.trial.should_prune():
            return True
        self.is_pruned = True
        return False

In [None]:
def _make_discretized_env(env_id: str = "MountainCar-v0") -> gym.Env:
    """Build a fresh env whose observations pass through ``discretize_state_obs``."""
    # The base env gets its own name so the late-binding lambda reads the
    # bounds from it rather than from whatever wrapper is built around it.
    base_env = gym.make(env_id)
    return gym.wrappers.TransformObservation(
        base_env,
        lambda obs: discretize_state_obs(obs, base_env),
        base_env.observation_space,
    )


def objective(trial: optuna.Trial) -> float:
    """Train and evaluate one A2C model for an Optuna trial.

    :param trial: Optuna trial supplying the hyperparameters
    :return: mean reward of the last evaluation, or NaN when training hit
        invalid values (Optuna then treats the trial as failed)
    :raises optuna.exceptions.TrialPruned: when the pruner stopped the trial early
    """
    kwargs = DEFAULT_HYPERPARAMS.copy()
    kwargs.update(sample_a2c_params(trial))
    # Train on an environment owned by this trial. The `finally` block below
    # closes the training env, so reusing the shared instance from
    # DEFAULT_HYPERPARAMS would leave every later trial with a closed env.
    kwargs["env"] = _make_discretized_env()

    # Create the RL model.
    model = A2C(**kwargs)
    eval_env = Monitor(_make_discretized_env())

    eval_callback = TrialEvalCallback(
        eval_env, trial, n_eval_episodes=N_EVAL_EPISODES, eval_freq=EVAL_FREQ, deterministic=True
    )

    nan_encountered = False
    try:
        model.learn(N_TIMESTEPS, callback=eval_callback)
    except (AssertionError, ValueError) as e:
        # Sometimes, random hyperparams can generate NaN. That surfaces either
        # as an AssertionError (e.g. the NaN check in discretize_state_obs) or
        # as a ValueError from PyTorch's distribution parameter validation.
        print(e)
        nan_encountered = True
    finally:
        # Free memory.
        model.env.close()
        eval_env.close()

    # Tell the optimizer that the trial failed.
    if nan_encountered:
        return float("nan")

    if eval_callback.is_pruned:
        raise optuna.exceptions.TrialPruned()

    return eval_callback.last_mean_reward

In [None]:
if __name__ == '__main__':
  # Run the Optuna study and print a summary of the best trial.
  torch.set_num_threads(1)

  sampler = TPESampler(n_startup_trials=N_STARTUP_TRIALS)
  pruner = MedianPruner(n_startup_trials=N_STARTUP_TRIALS, n_warmup_steps=N_EVALUATIONS // 4)

  # Trials are persisted to a local SQLite file so that optuna-dashboard can read them.
  study = optuna.create_study(sampler=sampler, storage="sqlite:///db.sqlite3", pruner=pruner, direction="maximize")

  try:
    # timeout is in seconds: stop after at most 24 hours.
    study.optimize(objective, n_trials=N_TRIALS, timeout=86400)
  except KeyboardInterrupt:
    # Interrupting the cell by hand still reports whatever finished so far.
    pass
  print("Number of finished trials: ", len(study.trials))

  try:
    trial = study.best_trial
  except ValueError:
    # Optuna raises ValueError when no trial has completed (e.g. all were
    # pruned or failed, or the run was interrupted right away).
    trial = None
    print("No trial completed successfully; there is no best trial to report.")

  if trial is not None:
    print("Best Trial:")
    print("  Value: ", trial.value)
    print("  Params: ")
    for k, v in trial.params.items():
      print("    {}: {}".format(k, v))

    print("User Attributes:")
    for k, v in trial.user_attrs.items():
      print("  {}: {}".format(k, v))

  ENV.close()



[I 2025-04-27 15:54:43,273] A new study created in RDB with name: no-name-abb6a19e-13bd-4615-9b70-ac8e70fae7f0
[I 2025-04-27 16:06:19,180] Trial 0 finished with value: -200.0 and parameters: {'gamma': 0.00012171855494882901, 'max_grad_norm': 0.4614504377134854, 'gae_lambda': 0.0025791430195552137, 'n_steps_exp': 14, 'lr': 2.850203830682807e-06, 'vf_coef': 0.04920649906995222, 'ent_coef': 1.4383752430327301e-05, 'rms_prop_eps': 5.69639110647575e-08, 'use_rms_prop': False}. Best is trial 0 with value: -200.0.
[I 2025-04-27 16:19:11,380] Trial 1 finished with value: -200.0 and parameters: {'gamma': 0.011016226132404548, 'max_grad_norm': 0.4326054811521159, 'gae_lambda': 0.03514206002313687, 'n_steps_exp': 15, 'lr': 1.3758281436290249e-05, 'vf_coef': 0.5706115078921707, 'ent_coef': 0.003818619323483286, 'rms_prop_eps': 3.0577581425137026e-08, 'use_rms_prop': False}. Best is trial 0 with value: -200.0.
[I 2025-04-27 16:31:49,182] Trial 2 finished with value: -200.0 and parameters: {'gamma':

In [None]:
# Launch optuna-dashboard against the same SQLite file the study was created with
# (storage="sqlite:///db.sqlite3"); artifacts are looked up under ./artifact.
# NOTE(review): presumably this keeps the cell running until interrupted — confirm.
! optuna-dashboard sqlite:///db.sqlite3 --artifact-dir ./artifact