In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
cd /content/drive/MyDrive/mini_project

/content/drive/MyDrive/mini_project


In [None]:
!pip install stable_baselines3[extra]



In [None]:
!pip install git+https://github.com/metadriverse/metadrive.git

Collecting git+https://github.com/metadriverse/metadrive.git
  Cloning https://github.com/metadriverse/metadrive.git to /tmp/pip-req-build-bz5p0uw1
  Running command git clone --filter=blob:none --quiet https://github.com/metadriverse/metadrive.git /tmp/pip-req-build-bz5p0uw1
  Resolved https://github.com/metadriverse/metadrive.git to commit a09bc963b067c9ce7e348586f43e7253cba55875
  Preparing metadata (setup.py) ... [?25l[?25hdone


## RL Environment

In [None]:
import copy
from metadrive.envs.safe_metadrive_env import SafeMetaDriveEnv

DEFAULT_CONFIG = {
    # The below are default configs copied from SafeMetaDriveEnv
    # Environment difficulty
    "accident_prob": 0.8,
    "traffic_density": 0.05,
    # Termination conditions
    "crash_vehicle_done": False,
    "crash_object_done": False,
    # Reward
    "success_reward": 10.0,
    "driving_reward": 1.0,
    "speed_reward": 0.1,
    # Penalty will be negated and added to reward
    "out_of_road_penalty": 5.0,
    "crash_vehicle_penalty": 1.0,
    "crash_object_penalty": 1.0,
    # Cost will be return in info["cost"] and you can do constrained optimization with it
    "crash_vehicle_cost": 1.0,
    "crash_object_cost": 1.0,
    "out_of_road_cost": 1.0,
}

# Use deepcopy to avoid modifying the DEFAULT_CONFIG
TRAINING_CONFIG = copy.deepcopy(DEFAULT_CONFIG)
TRAINING_CONFIG.update(
    {  # Environment setting
        "num_scenarios": 50,  # There are totally 50 possible maps.
        "start_seed": 100,  # We will use the map with seeds in [100, 150) as the default training environment.
    }
)


def get_training_env(extra_config=None):
    config = copy.deepcopy(TRAINING_CONFIG)
    if extra_config:
        config.update(extra_config)
    return SafeMetaDriveEnv(config)


VALIDATION_CONFIG = copy.deepcopy(DEFAULT_CONFIG)
VALIDATION_CONFIG.update(
    {  # Environment setting
        "num_scenarios": 50,  # There are totally 50 possible maps.
        "start_seed": 1000,  # We will use the map with seeds in [1000, 1050) as the default validation environment.
    }
)


def get_validation_env(extra_config=None):
    config = copy.deepcopy(VALIDATION_CONFIG)
    if extra_config:
        config.update(extra_config)
    return SafeMetaDriveEnv(config)


## Import and utilities

In [None]:
import argparse
import datetime
import logging
import os
import uuid
from collections import defaultdict
from pathlib import Path

import numpy as np
from metadrive.engine.logger import set_log_level
from stable_baselines3.common.callbacks import CallbackList, CheckpointCallback
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import SubprocVecEnv
from stable_baselines3.ppo import PPO
from stable_baselines3.ppo.policies import ActorCriticPolicy

from stable_baselines3.td3 import TD3
from stable_baselines3.td3.policies import TD3Policy

from wandb.integration.sb3 import WandbCallback

import wandb


# Remove MetaDrive's logging information when episode ends.
set_log_level(logging.ERROR)

In [None]:

def get_time_str():
    return datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")


def remove_reset_seed_and_add_monitor(make_env, trial_dir):
    """
    MetaDrive env's reset function takes a seed argument and use it to determine the map to load.
    However, in stable-baselines3, it calls reset function with a seed argument serving as the random seed,
    which is not what we want. We do a trick here to remap the random seed to map index.

    Stable-baselines3 recommends using Monitor wrapper to log training data. We add a Monitor wrapper here.
    """
    from gymnasium import Wrapper
    from stable_baselines3.common.monitor import Monitor
    class NewClass(Wrapper):
        def reset(self, seed=None, **kwargs):
            # PZH: We do a trick here to remap the seed to the map index. This can help randomize the maps.
            if seed is not None:
                new_seed = self.env.start_index + (seed % self.env.num_scenarios)
            else:
                new_seed = None
            return self.env.reset(seed=new_seed, **kwargs)

    def new_make_env():
        env = make_env()
        NewClass.__name__ = env.__class__.__name__ + "WithoutResetSeed"
        wrapped_env = NewClass(env)
        wrapped_env = Monitor(env=wrapped_env, filename=str(trial_dir))
        return wrapped_env

    return new_make_env


class CustomizedEvalCallback(EvalCallback):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.evaluations_info_buffer = defaultdict(list)

    def _log_success_callback(self, locals_, globals_):
        info = locals_["info"]

        if locals_["done"]:
            maybe_is_success = info.get("is_success")
            if maybe_is_success is not None:
                self._is_success_buffer.append(maybe_is_success)

            maybe_is_success2 = info.get("arrive_dest", None)
            if maybe_is_success2 is not None:
                self._is_success_buffer.append(maybe_is_success2)

            assert (maybe_is_success is None) or (maybe_is_success2 is None), "We cannot have two success flags!"

            for k in ["route_completion", "total_cost", "arrive_dest", "max_step", "out_of_road", "crash"]:
                if k in info:
                    self.evaluations_info_buffer[k].append(info[k])

        if "raw_action" in info:
            self.evaluations_info_buffer["raw_action"].append(info["raw_action"])

    def _on_step(self) -> bool:
        """
        PZH Note: Overall this function is copied from original EvalCallback._on_step.
        We additionally record evaluations_info_buffer to the logger.
        """

        from stable_baselines3.common.evaluation import evaluate_policy
        from stable_baselines3.common.vec_env import sync_envs_normalization

        continue_training = True

        if self.eval_freq > 0 and self.n_calls % self.eval_freq == 0:
            # Sync training and eval env if there is VecNormalize
            if self.model.get_vec_normalize_env() is not None:
                try:
                    sync_envs_normalization(self.training_env, self.eval_env)
                except AttributeError as e:
                    raise AssertionError(
                        "Training and eval env are not wrapped the same way, "
                        "see https://stable-baselines3.readthedocs.io/en/master/guide/callbacks.html#evalcallback "
                        "and warning above."
                    ) from e

            # Reset success rate buffer
            self._is_success_buffer = []

            episode_rewards, episode_lengths = evaluate_policy(
                self.model,
                self.eval_env,
                n_eval_episodes=self.n_eval_episodes,
                render=self.render,
                deterministic=self.deterministic,
                return_episode_rewards=True,
                warn=self.warn,
                callback=self._log_success_callback,
            )

            if self.log_path is not None:
                assert isinstance(episode_rewards, list)
                assert isinstance(episode_lengths, list)
                self.evaluations_timesteps.append(self.num_timesteps)
                self.evaluations_results.append(episode_rewards)
                self.evaluations_length.append(episode_lengths)

                kwargs = {}
                # Save success log if present
                if len(self._is_success_buffer) > 0:
                    self.evaluations_successes.append(self._is_success_buffer)
                    kwargs = dict(successes=self.evaluations_successes)

                # PZH: Save evaluations_info_buffer to the log file
                for k, v in self.evaluations_info_buffer.items():
                    kwargs[k] = v

                np.savez(
                    self.log_path,
                    timesteps=self.evaluations_timesteps,
                    results=self.evaluations_results,
                    ep_lengths=self.evaluations_length,
                    **kwargs,  # type: ignore[arg-type]
                )

            mean_reward, std_reward = np.mean(episode_rewards), np.std(episode_rewards)
            mean_ep_length, std_ep_length = np.mean(episode_lengths), np.std(episode_lengths)
            self.last_mean_reward = float(mean_reward)

            if self.verbose >= 1:
                print(
                    f"Eval num_timesteps={self.num_timesteps}, " f"episode_reward={mean_reward:.2f} +/- {std_reward:.2f}")
                print(f"Episode length: {mean_ep_length:.2f} +/- {std_ep_length:.2f}")
            # Add to current Logger
            self.logger.record("eval/mean_reward", float(mean_reward))
            self.logger.record("eval/mean_ep_length", mean_ep_length)

            # PZH: Add this metric.
            self.logger.record("eval/num_episodes", len(episode_rewards))

            if len(self._is_success_buffer) > 0:
                success_rate = np.mean(self._is_success_buffer)
                if self.verbose >= 1:
                    print(f"Success rate: {100 * success_rate:.2f}%")
                self.logger.record("eval/success_rate", success_rate)

            # PZH: We record evaluations_info_buffer to the logger
            for k, v in self.evaluations_info_buffer.items():
                self.logger.record("eval/{}".format(k), np.mean(np.asarray(v)))

            # Dump log so the evaluation results are printed with the correct timestep
            self.logger.record("time/total_timesteps", self.num_timesteps, exclude="tensorboard")
            self.logger.dump(self.num_timesteps)

            if mean_reward > self.best_mean_reward:
                if self.verbose >= 1:
                    print("New best mean reward!")
                if self.best_model_save_path is not None:
                    self.model.save(os.path.join(self.best_model_save_path, "best_model"))
                self.best_mean_reward = float(mean_reward)
                # Trigger callback on new best model, if needed
                if self.callback_on_new_best is not None:
                    continue_training = self.callback_on_new_best.on_step()

            # Trigger callback after every evaluation, if needed
            if self.callback is not None:
                continue_training = continue_training and self._on_event()

        return continue_training


## Setup PPO trainer


In [None]:

# ===== Set up some arguments =====
exp_name = "ppo_metadrive"
use_wandb = True

experiment_batch_name = "{}".format(exp_name)
trial_name = "{}_{}_{}".format(experiment_batch_name, get_time_str(), uuid.uuid4().hex[:8])
experiment_dir = Path("runs") / experiment_batch_name
trial_dir = experiment_dir / trial_name
os.makedirs(experiment_dir, exist_ok=True)
os.makedirs(trial_dir, exist_ok=True)
print(f"We start logging training data into {trial_dir}")


We start logging training data into runs/ppo_metadrive/ppo_metadrive_2025-03-17_00-19-44_726dad05


In [None]:
# ===== Setup environment =====
num_train_envs = 10
num_eval_envs = 5
train_env = make_vec_env(remove_reset_seed_and_add_monitor(get_training_env, trial_dir), n_envs=num_train_envs,
                            vec_env_cls=SubprocVecEnv)
eval_env = make_vec_env(remove_reset_seed_and_add_monitor(get_validation_env, trial_dir), n_envs=num_eval_envs,
                        vec_env_cls=SubprocVecEnv)

In [None]:
# ===== Setup evaluation, checkpointing, and wandb =====
save_freq = 10_000  # Number of steps per model checkpoint
eval_freq = 10_000  # Number of steps per evaluation

wandb_save_freq = 10_000  # Number of steps per evaluation

num_eval_episodes = 5

checkpoint_callback = CheckpointCallback(
    name_prefix="rl_model",
    verbose=2,
    save_freq=save_freq,
    save_path=str(trial_dir / "models")
)
eval_callback = CustomizedEvalCallback(
    eval_env,
    best_model_save_path=str(trial_dir / "eval"),
    log_path=str(trial_dir / "eval"),
    eval_freq=max(eval_freq // num_train_envs, 1),
    n_eval_episodes=num_eval_episodes,
)
callbacks = [checkpoint_callback, eval_callback]
if use_wandb:
    wandb.init(
        project="cs260r",
        id=trial_name,
        name=experiment_batch_name,
        sync_tensorboard=True,
        dir=str(trial_dir),
    )
    callbacks.append(WandbCallback(model_save_path=str(trial_dir / "wandb_models"), model_save_freq=wandb_save_freq))
callbacks = CallbackList(callbacks)

  return LooseVersion(v) >= LooseVersion(check)
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mcoltonrowe[0m ([33mcoltonrowe-ucla[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [None]:

# ===== Setup the training algorithm =====
# model = TD3(
#     env=train_env,
#     policy=TD3Policy,
#     learning_rate=5e-5,
#     buffer_size=1_000_000,
#     learning_starts=100,
#     batch_size=256,
#     tau=0.005,
#     gamma = 0.99,
#     train_freq=1,
#     gradient_steps=1,
#     action_noise=None,
#     replay_buffer_class=None,
#     replay_buffer_kwargs=None,
#     optimize_memory_usage=False,
#     policy_delay=2,
#     target_policy_noise=0.2,
#     target_noise_clip=0.5,
#     stats_window_size=100,
#     tensorboard_log=None,
#     policy_kwargs=None,
#     verbose=2,
#     seed=None,
#     device='auto',
#     _init_setup_model=True
#     )
model = PPO(
    env=train_env,
    policy=ActorCriticPolicy,
    n_steps=500,  # n_steps * n_envs = total_batch_size
    n_epochs=20,
    learning_rate=5e-5,
    batch_size=256,
    clip_range=0.1,
    vf_coef=0.5,
    ent_coef=0.0,
    max_grad_norm=10.0,
    tensorboard_log=str(trial_dir),
    verbose=2,
    device="auto",
)


Using cpu device


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=500 and n_envs=10)


In [None]:
ckpt = None
if ckpt:
    ckpt = Path(ckpt)
    print(f"Loading checkpoint from {ckpt}!")
    from stable_baselines3.common.save_util import load_from_zip_file
    data, params, pytorch_variables = load_from_zip_file(ckpt, device=model.device, print_system_info=False)
    model.set_parameters(params, exact_match=True, device=model.device)


In [None]:
# ===== Launch training =====
total_timesteps = 1_000_000  # 1M steps
model.learn(
    total_timesteps=total_timesteps,
    callback=callbacks,
    reset_num_timesteps=True,
    tb_log_name=experiment_batch_name,
    log_interval=1,
    progress_bar=True,
)

Logging to runs/ppo_metadrive/ppo_metadrive_2025-03-17_00-19-44_726dad05/ppo_metadrive_1


Output()

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 369      |
|    ep_rew_mean     | -0.553   |
| time/              |          |
|    fps             | 838      |
|    iterations      | 1        |
|    time_elapsed    | 5        |
|    total_timesteps | 5000     |
---------------------------------


-----------------------------------------
| eval/                   |             |
|    arrive_dest          | 0           |
|    crash                | 0           |
|    max_step             | 0           |
|    mean_ep_length       | 192         |
|    mean_reward          | 41.6        |
|    num_episodes         | 5           |
|    out_of_road          | 1           |
|    raw_action           | 0.03362635  |
|    route_completion     | 0.127       |
|    success_rate         | 0           |
|    total_cost           | 1           |
| time/                   |             |
|    total_timesteps      | 10000       |
| train/                  |             |
|    approx_kl            | 0.002651404 |
|    clip_fraction        | 0.17        |
|    clip_range           | 0.1         |
|    entropy_loss         | -2.84       |
|    explained_variance   | -0.00145    |
|    learning_rate        | 5e-05       |
|    loss                 | 0.0101      |
|    n_updates            | 20    

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 369      |
|    ep_rew_mean     | -0.553   |
| time/              |          |
|    fps             | 639      |
|    iterations      | 2        |
|    time_elapsed    | 15       |
|    total_timesteps | 10000    |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 369          |
|    ep_rew_mean          | -0.553       |
| time/                   |              |
|    fps                  | 675          |
|    iterations           | 3            |
|    time_elapsed         | 22           |
|    total_timesteps      | 15000        |
| train/                  |              |
|    approx_kl            | 0.0042068968 |
|    clip_fraction        | 0.249        |
|    clip_range           | 0.1          |
|    entropy_loss         | -2.83        |
|    explained_variance   | 0.0403       |
|    learning_r

-----------------------------------------
| eval/                   |             |
|    arrive_dest          | 0           |
|    crash                | 0           |
|    max_step             | 0           |
|    mean_ep_length       | 106         |
|    mean_reward          | 42.4        |
|    num_episodes         | 5           |
|    out_of_road          | 1           |
|    raw_action           | 0.06419066  |
|    route_completion     | 0.123       |
|    success_rate         | 0           |
|    total_cost           | 1           |
| time/                   |             |
|    total_timesteps      | 20000       |
| train/                  |             |
|    approx_kl            | 0.003851429 |
|    clip_fraction        | 0.269       |
|    clip_range           | 0.1         |
|    entropy_loss         | -2.81       |
|    explained_variance   | 0.0683      |
|    learning_rate        | 5e-05       |
|    loss                 | 0.0104      |
|    n_updates            | 60    

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1.2e+03  |
|    ep_rew_mean     | 12.9     |
| time/              |          |
|    fps             | 632      |
|    iterations      | 4        |
|    time_elapsed    | 31       |
|    total_timesteps | 20000    |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 1.32e+03     |
|    ep_rew_mean          | 16.3         |
| time/                   |              |
|    fps                  | 655          |
|    iterations           | 5            |
|    time_elapsed         | 38           |
|    total_timesteps      | 25000        |
| train/                  |              |
|    approx_kl            | 0.0031472477 |
|    clip_fraction        | 0.184        |
|    clip_range           | 0.1          |
|    entropy_loss         | -2.8         |
|    explained_variance   | 0.0133       |
|    learning_r

-----------------------------------------
| eval/                   |             |
|    arrive_dest          | 0           |
|    crash                | 0           |
|    max_step             | 0           |
|    mean_ep_length       | 60.6        |
|    mean_reward          | 17.9        |
|    num_episodes         | 5           |
|    out_of_road          | 1           |
|    raw_action           | 0.08423468  |
|    route_completion     | 0.114       |
|    success_rate         | 0           |
|    total_cost           | 1           |
| time/                   |             |
|    total_timesteps      | 30000       |
| train/                  |             |
|    approx_kl            | 0.003858048 |
|    clip_fraction        | 0.258       |
|    clip_range           | 0.1         |
|    entropy_loss         | -2.78       |
|    explained_variance   | 0.000143    |
|    learning_rate        | 5e-05       |
|    loss                 | 0.0418      |
|    n_updates            | 100   

------------------------------------------
| eval/                   |              |
|    arrive_dest          | 0            |
|    crash                | 0            |
|    max_step             | 0            |
|    mean_ep_length       | 62.2         |
|    mean_reward          | 28.4         |
|    num_episodes         | 5            |
|    out_of_road          | 1            |
|    raw_action           | 0.11059802   |
|    route_completion     | 0.116        |
|    success_rate         | 0            |
|    total_cost           | 1            |
| time/                   |              |
|    total_timesteps      | 40000        |
| train/                  |              |
|    approx_kl            | 0.0025350326 |
|    clip_fraction        | 0.146        |
|    clip_range           | 0.1          |
|    entropy_loss         | -2.76        |
|    explained_variance   | 0.0359       |
|    learning_rate        | 5e-05        |
|    loss                 | 0.345        |
|    n_upda

------------------------------------------
| eval/                   |              |
|    arrive_dest          | 0            |
|    crash                | 0            |
|    max_step             | 0            |
|    mean_ep_length       | 142          |
|    mean_reward          | 97.8         |
|    num_episodes         | 5            |
|    out_of_road          | 1            |
|    raw_action           | 0.16267653   |
|    route_completion     | 0.175        |
|    success_rate         | 0            |
|    total_cost           | 4.8          |
| time/                   |              |
|    total_timesteps      | 50000        |
| train/                  |              |
|    approx_kl            | 0.0016656378 |
|    clip_fraction        | 0.0638       |
|    clip_range           | 0.1          |
|    entropy_loss         | -2.73        |
|    explained_variance   | 0.0904       |
|    learning_rate        | 5e-05        |
|    loss                 | 0.596        |
|    n_upda

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 850      |
|    ep_rew_mean     | 17.6     |
| time/              |          |
|    fps             | 554      |
|    iterations      | 10       |
|    time_elapsed    | 90       |
|    total_timesteps | 50000    |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 752          |
|    ep_rew_mean          | 17.7         |
| time/                   |              |
|    fps                  | 544          |
|    iterations           | 11           |
|    time_elapsed         | 101          |
|    total_timesteps      | 55000        |
| train/                  |              |
|    approx_kl            | 0.0023093007 |
|    clip_fraction        | 0.146        |
|    clip_range           | 0.1          |
|    entropy_loss         | -2.72        |
|    explained_variance   | 0.0762       |
|    learning_r

------------------------------------------
| eval/                   |              |
|    arrive_dest          | 0            |
|    crash                | 0.0333       |
|    max_step             | 0            |
|    mean_ep_length       | 67           |
|    mean_reward          | 43.5         |
|    num_episodes         | 5            |
|    out_of_road          | 1            |
|    raw_action           | 0.18630944   |
|    route_completion     | 0.17         |
|    success_rate         | 0            |
|    total_cost           | 4.17         |
| time/                   |              |
|    total_timesteps      | 60000        |
| train/                  |              |
|    approx_kl            | 0.0012999474 |
|    clip_fraction        | 0.0513       |
|    clip_range           | 0.1          |
|    entropy_loss         | -2.71        |
|    explained_variance   | -0.00672     |
|    learning_rate        | 5e-05        |
|    loss                 | 0.89         |
|    n_upda

------------------------------------------
| eval/                   |              |
|    arrive_dest          | 0            |
|    crash                | 0.0286       |
|    max_step             | 0            |
|    mean_ep_length       | 60.6         |
|    mean_reward          | 37.9         |
|    num_episodes         | 5            |
|    out_of_road          | 1            |
|    raw_action           | 0.20788956   |
|    route_completion     | 0.165        |
|    success_rate         | 0            |
|    total_cost           | 3.71         |
| time/                   |              |
|    total_timesteps      | 70000        |
| train/                  |              |
|    approx_kl            | 0.0016013015 |
|    clip_fraction        | 0.0599       |
|    clip_range           | 0.1          |
|    entropy_loss         | -2.7         |
|    explained_variance   | -0.0303      |
|    learning_rate        | 5e-05        |
|    loss                 | 0.967        |
|    n_upda

------------------------------------------
| eval/                   |              |
|    arrive_dest          | 0            |
|    crash                | 0.05         |
|    max_step             | 0            |
|    mean_ep_length       | 48.8         |
|    mean_reward          | 26.2         |
|    num_episodes         | 5            |
|    out_of_road          | 1            |
|    raw_action           | 0.2233046    |
|    route_completion     | 0.157        |
|    success_rate         | 0            |
|    total_cost           | 3.38         |
| time/                   |              |
|    total_timesteps      | 80000        |
| train/                  |              |
|    approx_kl            | 0.0014195004 |
|    clip_fraction        | 0.0398       |
|    clip_range           | 0.1          |
|    entropy_loss         | -2.68        |
|    explained_variance   | 0.0509       |
|    learning_rate        | 5e-05        |
|    loss                 | 2.26         |
|    n_upda

------------------------------------------
| eval/                   |              |
|    arrive_dest          | 0            |
|    crash                | 0.0889       |
|    max_step             | 0            |
|    mean_ep_length       | 94.8         |
|    mean_reward          | 111          |
|    num_episodes         | 5            |
|    out_of_road          | 1            |
|    raw_action           | 0.25366578   |
|    route_completion     | 0.177        |
|    success_rate         | 0            |
|    total_cost           | 3.18         |
| time/                   |              |
|    total_timesteps      | 90000        |
| train/                  |              |
|    approx_kl            | 0.0015916002 |
|    clip_fraction        | 0.0762       |
|    clip_range           | 0.1          |
|    entropy_loss         | -2.66        |
|    explained_variance   | 0.0685       |
|    learning_rate        | 5e-05        |
|    loss                 | 2.11         |
|    n_upda

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 241      |
|    ep_rew_mean     | 16.5     |
| time/              |          |
|    fps             | 492      |
|    iterations      | 18       |
|    time_elapsed    | 182      |
|    total_timesteps | 90000    |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 233          |
|    ep_rew_mean          | 17.8         |
| time/                   |              |
|    fps                  | 488          |
|    iterations           | 19           |
|    time_elapsed         | 194          |
|    total_timesteps      | 95000        |
| train/                  |              |
|    approx_kl            | 0.0016711758 |
|    clip_fraction        | 0.0434       |
|    clip_range           | 0.1          |
|    entropy_loss         | -2.65        |
|    explained_variance   | 0.0496       |
|    learning_r

------------------------------------------
| eval/                   |              |
|    arrive_dest          | 0            |
|    crash                | 0.1          |
|    max_step             | 0            |
|    mean_ep_length       | 81.4         |
|    mean_reward          | 81.5         |
|    num_episodes         | 5            |
|    out_of_road          | 1            |
|    raw_action           | 0.27489084   |
|    route_completion     | 0.183        |
|    success_rate         | 0            |
|    total_cost           | 3.16         |
| time/                   |              |
|    total_timesteps      | 100000       |
| train/                  |              |
|    approx_kl            | 0.0010825213 |
|    clip_fraction        | 0.0324       |
|    clip_range           | 0.1          |
|    entropy_loss         | -2.63        |
|    explained_variance   | -0.0155      |
|    learning_rate        | 5e-05        |
|    loss                 | 2.15         |
|    n_upda

OSError: [Errno 95] Operation not supported: '/content/drive/MyDrive/mini_project/runs/ppo_metadrive/ppo_metadrive_2025-03-17_00-19-44_726dad05/wandb_models/model.zip' -> 'runs/ppo_metadrive/ppo_metadrive_2025-03-17_00-19-44_726dad05/wandb/run-20250317_002006-ppo_metadrive_2025-03-17_00-19-44_726dad05/files/model.zip'