In [1]:
SCENARIOS = 5000
VALIDATION_SCENARIOS = 50

## RL Environment

In [2]:
import copy
from metadrive.envs.safe_metadrive_env import SafeMetaDriveEnv

DEFAULT_CONFIG = {
    # The below are default configs copied from SafeMetaDriveEnv
    # Environment difficulty
    "accident_prob": 0.8,
    "traffic_density": 0.05,
    # Termination conditions
    "crash_vehicle_done": False,
    "crash_object_done": False,
    # Reward
    "success_reward": 10.0,
    "driving_reward": 1.0,
    "speed_reward": 0.1,
    # Penalty will be negated and added to reward
    "out_of_road_penalty": 2.0,
    "crash_vehicle_penalty": 0.5,
    "crash_object_penalty": 0.5,
    # Cost will be return in info["cost"] and you can do constrained optimization with it
    "crash_vehicle_cost": 1.0,
    "crash_object_cost": 1.0,
    "out_of_road_cost": 1.0,
}

# Use deepcopy to avoid modifying the DEFAULT_CONFIG
TRAINING_CONFIG = copy.deepcopy(DEFAULT_CONFIG)
TRAINING_CONFIG.update(
    {  # Environment setting
        "num_scenarios": SCENARIOS,  # There are totally 50 possible maps.
        "start_seed": 100,  # We will use the map with seeds in [100, 150) as the default training environment.
    }
)


def get_training_env(extra_config=None):
    config = copy.deepcopy(TRAINING_CONFIG)
    if extra_config:
        config.update(extra_config)
    return SafeMetaDriveEnv(config)


VALIDATION_CONFIG = copy.deepcopy(DEFAULT_CONFIG)
VALIDATION_CONFIG.update(
    {  # Environment setting
        "num_scenarios": VALIDATION_SCENARIOS,  # There are totally 50 possible maps.
        "start_seed": SCENARIOS * 2,  # We will use the map with seeds in [1000, 1050) as the default validation environment.
    }
)


def get_validation_env(extra_config=None):
    config = copy.deepcopy(VALIDATION_CONFIG)
    if extra_config:
        config.update(extra_config)
    return SafeMetaDriveEnv(config)


## Import and utilities

In [3]:
import argparse
import datetime
import logging
import os
import uuid
from collections import defaultdict
from pathlib import Path

import numpy as np
from metadrive.engine.logger import set_log_level
from stable_baselines3.common.callbacks import CallbackList, CheckpointCallback
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import SubprocVecEnv
from stable_baselines3.ppo import PPO
from stable_baselines3.ppo.policies import ActorCriticPolicy
from stable_baselines3.ppo import MultiInputPolicy

from stable_baselines3.td3 import TD3
from stable_baselines3.td3.policies import TD3Policy

from stable_baselines3.sac import SAC
from stable_baselines3.sac.policies import SACPolicy

from wandb.integration.sb3 import WandbCallback

import wandb


# Remove MetaDrive's logging information when episode ends.
set_log_level(logging.ERROR)

In [4]:

def get_time_str():
    return datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")


def remove_reset_seed_and_add_monitor(make_env, trial_dir):
    """
    MetaDrive env's reset function takes a seed argument and use it to determine the map to load.
    However, in stable-baselines3, it calls reset function with a seed argument serving as the random seed,
    which is not what we want. We do a trick here to remap the random seed to map index.

    Stable-baselines3 recommends using Monitor wrapper to log training data. We add a Monitor wrapper here.
    """
    from gymnasium import Wrapper
    from stable_baselines3.common.monitor import Monitor
    class NewClass(Wrapper):
        def reset(self, seed=None, **kwargs):
            # PZH: We do a trick here to remap the seed to the map index. This can help randomize the maps.
            if seed is not None:
                new_seed = self.env.start_index + (seed % self.env.num_scenarios)
            else:
                new_seed = None
            return self.env.reset(seed=new_seed, **kwargs)

    def new_make_env():
        env = make_env()
        NewClass.__name__ = env.__class__.__name__ + "WithoutResetSeed"
        wrapped_env = NewClass(env)
        wrapped_env = Monitor(env=wrapped_env, filename=str(trial_dir))
        return wrapped_env

    return new_make_env


class CustomizedEvalCallback(EvalCallback):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.evaluations_info_buffer = defaultdict(list)
        self.training_info_buffer = defaultdict(list)
        self.train_timesteps = list()
        self.train_results = list()
        self.train_length = list()

    def _log_success_callback(self, locals_, globals_):
        info = locals_["info"]

        if locals_["done"]:
            maybe_is_success = info.get("is_success")
            if maybe_is_success is not None:
                self._is_success_buffer.append(maybe_is_success)

            maybe_is_success2 = info.get("arrive_dest", None)
            if maybe_is_success2 is not None:
                self._is_success_buffer.append(maybe_is_success2)

            assert (maybe_is_success is None) or (maybe_is_success2 is None), "We cannot have two success flags!"

            for k in ["route_completion", "total_cost", "arrive_dest", "max_step", "out_of_road", "crash"]:
                if k in info:
                    self.evaluations_info_buffer[k].append(info[k])
    
    def _log_train_success_callback(self, locals_, globals_):
        info = locals_["info"]

        if locals_["done"]:
            maybe_is_success = info.get("is_success")
            if maybe_is_success is not None:
                self._is_success_buffer.append(maybe_is_success)

            maybe_is_success2 = info.get("arrive_dest", None)
            if maybe_is_success2 is not None:
                self._is_success_buffer.append(maybe_is_success2)

            assert (maybe_is_success is None) or (maybe_is_success2 is None), "We cannot have two success flags!"

            for k in ["route_completion", "total_cost", "arrive_dest", "max_step", "out_of_road", "crash"]:
                if k in info:
                    self.training_info_buffer[k].append(info[k])

        if "raw_action" in info:
            self.evaluations_info_buffer["raw_action"].append(info["raw_action"])

    def _on_step(self) -> bool:
        """
        PZH Note: Overall this function is copied from original EvalCallback._on_step.
        We additionally record evaluations_info_buffer to the logger.
        """

        from stable_baselines3.common.evaluation import evaluate_policy
        from stable_baselines3.common.vec_env import sync_envs_normalization

        continue_training = True

        if self.eval_freq > 0 and self.n_calls % self.eval_freq == 0:
            # Sync training and eval env if there is VecNormalize
            if self.model.get_vec_normalize_env() is not None:
                try:
                    sync_envs_normalization(self.training_env, self.eval_env)
                except AttributeError as e:
                    raise AssertionError(
                        "Training and eval env are not wrapped the same way, "
                        "see https://stable-baselines3.readthedocs.io/en/master/guide/callbacks.html#evalcallback "
                        "and warning above."
                    ) from e

            # Reset success rate buffer
            self._is_success_buffer = []

            episode_rewards, episode_lengths = evaluate_policy(
                self.model,
                self.eval_env,
                n_eval_episodes=self.n_eval_episodes,
                render=self.render,
                deterministic=self.deterministic,
                return_episode_rewards=True,
                warn=self.warn,
                callback=self._log_success_callback,
            )

            train_episode_rewards, train_episode_lengths = evaluate_policy(
                self.model,
                self.training_env,
                n_eval_episodes=self.n_eval_episodes,
                render=self.render,
                deterministic=self.deterministic,
                return_episode_rewards=True,
                warn=self.warn,
                callback=self._log_train_success_callback,
            )

            if self.log_path is not None:
                assert isinstance(episode_rewards, list)
                assert isinstance(episode_lengths, list)
                assert isinstance(train_episode_rewards, list)
                assert isinstance(train_episode_lengths, list)
                self.evaluations_timesteps.append(self.num_timesteps)
                self.evaluations_results.append(episode_rewards)
                self.evaluations_length.append(episode_lengths)

                kwargs = {}
                # Save success log if present
                if len(self._is_success_buffer) > 0:
                    self.evaluations_successes.append(self._is_success_buffer)
                    kwargs = dict(successes=self.evaluations_successes)

                # PZH: Save evaluations_info_buffer to the log file
                for k, v in self.evaluations_info_buffer.items():
                    kwargs[k] = v

                np.savez(
                    self.log_path,
                    timesteps=self.evaluations_timesteps,
                    results=self.evaluations_results,
                    ep_lengths=self.evaluations_length,
                    **kwargs,  # type: ignore[arg-type]
                )

            mean_reward, std_reward = np.mean(episode_rewards), np.std(episode_rewards)
            mean_ep_length, std_ep_length = np.mean(episode_lengths), np.std(episode_lengths)
            self.last_mean_reward = float(mean_reward)

            if self.verbose >= 1:
                print(
                    f"Eval num_timesteps={self.num_timesteps}, " f"episode_reward={mean_reward:.2f} +/- {std_reward:.2f}")
                print(f"Episode length: {mean_ep_length:.2f} +/- {std_ep_length:.2f}")
            # Add to current Logger
            self.logger.record("eval/mean_reward", float(mean_reward))
            self.logger.record("eval/mean_ep_length", mean_ep_length)

            # PZH: Add this metric.
            self.logger.record("eval/num_episodes", len(episode_rewards))

            if len(self._is_success_buffer) > 0:
                success_rate = np.mean(self._is_success_buffer)
                if self.verbose >= 1:
                    print(f"Success rate: {100 * success_rate:.2f}%")
                self.logger.record("eval/success_rate", success_rate)

            # PZH: We record evaluations_info_buffer to the logger
            for k, v in self.evaluations_info_buffer.items():
                self.logger.record("eval/{}".format(k), np.mean(np.asarray(v)))

            for k, v in self.training_info_buffer.items():
                self.logger.record("train/{}".format(k), np.mean(np.asarray(v)))

            # Dump log so the evaluation results are printed with the correct timestep
            self.logger.record("time/total_timesteps", self.num_timesteps, exclude="tensorboard")
            self.logger.dump(self.num_timesteps)

            if mean_reward > self.best_mean_reward:
                if self.verbose >= 1:
                    print("New best mean reward!")
                if self.best_model_save_path is not None:
                    self.model.save(os.path.join(self.best_model_save_path, "best_model"))
                self.best_mean_reward = float(mean_reward)
                # Trigger callback on new best model, if needed
                if self.callback_on_new_best is not None:
                    continue_training = self.callback_on_new_best.on_step()

            # Trigger callback after every evaluation, if needed
            if self.callback is not None:
                continue_training = continue_training and self._on_event()

        return continue_training


## Setup PPO trainer


In [5]:

# ===== Set up some arguments =====
exp_name = "ppo_metadrive" + "_new_reward_" + str(SCENARIOS)
use_wandb = True

experiment_batch_name = "{}".format(exp_name)
trial_name = "{}_{}_{}".format(experiment_batch_name, get_time_str(), uuid.uuid4().hex[:8])
experiment_dir = Path("runs") / experiment_batch_name
trial_dir = experiment_dir / trial_name
os.makedirs(experiment_dir, exist_ok=True)
os.makedirs(trial_dir, exist_ok=True)
print(f"We start logging training data into {trial_dir}")


We start logging training data into runs\ppo_metadrive_new_reward_5000\ppo_metadrive_new_reward_5000_2025-03-20_10-44-32_97412f1e


In [6]:
# ===== Setup environment =====
num_train_envs = 10
num_eval_envs = 5
train_env = make_vec_env(remove_reset_seed_and_add_monitor(get_training_env, trial_dir), n_envs=num_train_envs,
                            vec_env_cls=SubprocVecEnv)
eval_env = make_vec_env(remove_reset_seed_and_add_monitor(get_validation_env, trial_dir), n_envs=num_eval_envs,
                        vec_env_cls=SubprocVecEnv)

In [7]:
# ===== Setup evaluation, checkpointing, and wandb =====
save_freq = 10_000  # Number of steps per model checkpoint
eval_freq = 10_000  # Number of steps per evaluation

wandb_save_freq = 10_000  # Number of steps per evaluation

num_eval_episodes = 5

checkpoint_callback = CheckpointCallback(
    name_prefix="rl_model",
    verbose=2,
    save_freq=save_freq,
    save_path=str(trial_dir / "models")
)
eval_callback = CustomizedEvalCallback(
    eval_env,
    best_model_save_path=str(trial_dir / "eval"),
    log_path=str(trial_dir / "eval"),
    eval_freq=max(eval_freq // num_train_envs, 1),
    n_eval_episodes=num_eval_episodes,
)
callbacks = [checkpoint_callback, eval_callback]
if use_wandb:
    wandb.init(
        project="cs260r",
        id=trial_name,
        name=experiment_batch_name,
        sync_tensorboard=True,
        dir=str(trial_dir),
    )
    callbacks.append(WandbCallback(model_save_path=str(trial_dir / "wandb_models"), model_save_freq=wandb_save_freq))
callbacks = CallbackList(callbacks)

wandb: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
wandb: Currently logged in as: coltonrowe (coltonrowe-ucla) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin


In [8]:
model = PPO(
    env=train_env,
    policy=ActorCriticPolicy,
    n_steps=512,  # n_steps * n_envs = total_batch_size
    n_epochs=20,
    learning_rate=5e-5,
    batch_size=256,
    gamma = 0.995,
    clip_range=0.1,
    vf_coef=0.5,
    ent_coef=0.0,
    max_grad_norm=10.0,
    tensorboard_log=str(trial_dir),
    verbose=2,
    device="auto",
)


Using cpu device


In [None]:
ckpt = r"C:\Users\Colton\Documents\GitHub\cs260r-assignment-2025winter\mini_project\runs\ppo_metadrive_new_reward_5000\ppo_metadrive_new_reward_5000_2025-03-20_10-44-32_97412f1e\models\rl_model_500000_steps.zip"
if ckpt:
    ckpt = Path(ckpt)
    print(f"Loading checkpoint from {ckpt}!")
    from stable_baselines3.common.save_util import load_from_zip_file
    data, params, pytorch_variables = load_from_zip_file(ckpt, device=model.device, print_system_info=False)
    model.set_parameters(params, exact_match=True, device=model.device)


Loading checkpoint from C:\Users\Colton\Documents\GitHub\cs260r-assignment-2025winter\mini_project\runs\ppo_metadrive_new_reward_5000\ppo_metadrive_new_reward_5000_2025-03-20_09-12-37_679e3d6f\models\rl_model_1300000_steps.zip!


In [None]:
# ===== Launch training =====
total_timesteps = 2_000_000  # 1M steps
model.learn(
    total_timesteps=total_timesteps,
    callback=callbacks,
    reset_num_timesteps=True,
    tb_log_name=experiment_batch_name,
    log_interval=1,
    progress_bar=True,
)

Logging to runs\ppo_metadrive_new_reward_5000\ppo_metadrive_new_reward_5000_2025-03-20_10-44-32_97412f1e\ppo_metadrive_new_reward_5000_1


---------------------------------
| rollout/           |          |
|    ep_len_mean     | 370      |
|    ep_rew_mean     | 325      |
| time/              |          |
|    fps             | 772      |
|    iterations      | 1        |
|    time_elapsed    | 6        |
|    total_timesteps | 5120     |
---------------------------------


------------------------------------------
| eval/                   |              |
|    arrive_dest          | 0            |
|    crash                | 0.4          |
|    max_step             | 0            |
|    mean_ep_length       | 197          |
|    mean_reward          | 179          |
|    num_episodes         | 5            |
|    out_of_road          | 1            |
|    raw_action           | 0.44354254   |
|    route_completion     | 0.55         |
|    success_rate         | 0.1          |
|    total_cost           | 30.6         |
| time/                   |              |
|    total_timesteps      | 10000        |
| train/                  |              |
|    approx_kl            | 0.0011008839 |
|    arrive_dest          | 0.2          |
|    clip_fraction        | 0.135        |
|    clip_range           | 0.1          |
|    crash                | 0.4          |
|    entropy_loss         | -1.91        |
|    explained_variance   | 0.776        |
|    learni

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 362      |
|    ep_rew_mean     | 321      |
| time/              |          |
|    fps             | 391      |
|    iterations      | 2        |
|    time_elapsed    | 26       |
|    total_timesteps | 10240    |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 353         |
|    ep_rew_mean          | 316         |
| time/                   |             |
|    fps                  | 459         |
|    iterations           | 3           |
|    time_elapsed         | 33          |
|    total_timesteps      | 15360       |
| train/                  |             |
|    approx_kl            | 0.017673183 |
|    clip_fraction        | 0.132       |
|    clip_range           | 0.1         |
|    entropy_loss         | -1.91       |
|    explained_variance   | 0.755       |
|    learning_rate        | 5e

------------------------------------------
| eval/                   |              |
|    arrive_dest          | 0.2          |
|    crash                | 0.2          |
|    max_step             | 0            |
|    mean_ep_length       | 164          |
|    mean_reward          | 238          |
|    num_episodes         | 5            |
|    out_of_road          | 0.8          |
|    raw_action           | 0.45574346   |
|    route_completion     | 0.607        |
|    success_rate         | 0.2          |
|    total_cost           | 16.6         |
| time/                   |              |
|    total_timesteps      | 20000        |
| train/                  |              |
|    approx_kl            | 0.0016996687 |
|    arrive_dest          | 0.1          |
|    clip_fraction        | 0.135        |
|    clip_range           | 0.1          |
|    crash                | 0.4          |
|    entropy_loss         | -1.91        |
|    explained_variance   | 0.698        |
|    learni

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 352      |
|    ep_rew_mean     | 314      |
| time/              |          |
|    fps             | 428      |
|    iterations      | 4        |
|    time_elapsed    | 47       |
|    total_timesteps | 20480    |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 339          |
|    ep_rew_mean          | 306          |
| time/                   |              |
|    fps                  | 455          |
|    iterations           | 5            |
|    time_elapsed         | 56           |
|    total_timesteps      | 25600        |
| train/                  |              |
|    approx_kl            | 0.0013729624 |
|    clip_fraction        | 0.119        |
|    clip_range           | 0.1          |
|    entropy_loss         | -1.91        |
|    explained_variance   | 0.759        |
|    learning_r

------------------------------------------
| eval/                   |              |
|    arrive_dest          | 0.133        |
|    crash                | 0.2          |
|    max_step             | 0            |
|    mean_ep_length       | 126          |
|    mean_reward          | 157          |
|    num_episodes         | 5            |
|    out_of_road          | 0.867        |
|    raw_action           | 0.48023555   |
|    route_completion     | 0.558        |
|    success_rate         | 0            |
|    total_cost           | 13.3         |
| time/                   |              |
|    total_timesteps      | 30000        |
| train/                  |              |
|    approx_kl            | 0.0012976391 |
|    arrive_dest          | 0.0667       |
|    clip_fraction        | 0.128        |
|    clip_range           | 0.1          |
|    crash                | 0.467        |
|    entropy_loss         | -1.9         |
|    explained_variance   | 0.626        |
|    learni

------------------------------------------
| eval/                   |              |
|    arrive_dest          | 0.15         |
|    crash                | 0.2          |
|    max_step             | 0            |
|    mean_ep_length       | 139          |
|    mean_reward          | 192          |
|    num_episodes         | 5            |
|    out_of_road          | 0.85         |
|    raw_action           | 0.4722455    |
|    route_completion     | 0.556        |
|    success_rate         | 0.4          |
|    total_cost           | 11.1         |
| time/                   |              |
|    total_timesteps      | 40000        |
| train/                  |              |
|    approx_kl            | 0.0015635375 |
|    arrive_dest          | 0.2          |
|    clip_fraction        | 0.0725       |
|    clip_range           | 0.1          |
|    crash                | 0.5          |
|    entropy_loss         | -1.9         |
|    explained_variance   | 0.662        |
|    learni

-----------------------------------------
| eval/                   |             |
|    arrive_dest          | 0.2         |
|    crash                | 0.24        |
|    max_step             | 0           |
|    mean_ep_length       | 178         |
|    mean_reward          | 213         |
|    num_episodes         | 5           |
|    out_of_road          | 0.8         |
|    raw_action           | 0.4777479   |
|    route_completion     | 0.568       |
|    success_rate         | 0.2         |
|    total_cost           | 12.3        |
| time/                   |             |
|    total_timesteps      | 50000       |
| train/                  |             |
|    approx_kl            | 0.003964329 |
|    arrive_dest          | 0.16        |
|    clip_fraction        | 0.146       |
|    clip_range           | 0.1         |
|    crash                | 0.48        |
|    entropy_loss         | -1.89       |
|    explained_variance   | 0.687       |
|    learning_rate        | 5e-05 

-----------------------------------------
| eval/                   |             |
|    arrive_dest          | 0.2         |
|    crash                | 0.267       |
|    max_step             | 0           |
|    mean_ep_length       | 166         |
|    mean_reward          | 147         |
|    num_episodes         | 5           |
|    out_of_road          | 0.8         |
|    raw_action           | 0.47977427  |
|    route_completion     | 0.541       |
|    success_rate         | 0.1         |
|    total_cost           | 14.4        |
| time/                   |             |
|    total_timesteps      | 60000       |
| train/                  |             |
|    approx_kl            | 0.009700579 |
|    arrive_dest          | 0.133       |
|    clip_fraction        | 0.171       |
|    clip_range           | 0.1         |
|    crash                | 0.433       |
|    entropy_loss         | -1.89       |
|    explained_variance   | 0.659       |
|    learning_rate        | 5e-05 

-----------------------------------------
| eval/                   |             |
|    arrive_dest          | 0.2         |
|    crash                | 0.286       |
|    max_step             | 0           |
|    mean_ep_length       | 149         |
|    mean_reward          | 190         |
|    num_episodes         | 5           |
|    out_of_road          | 0.8         |
|    raw_action           | 0.47800848  |
|    route_completion     | 0.531       |
|    success_rate         | 0.1         |
|    total_cost           | 13.6        |
| time/                   |             |
|    total_timesteps      | 70000       |
| train/                  |             |
|    approx_kl            | 0.001163894 |
|    arrive_dest          | 0.114       |
|    clip_fraction        | 0.14        |
|    clip_range           | 0.1         |
|    crash                | 0.429       |
|    entropy_loss         | -1.88       |
|    explained_variance   | 0.588       |
|    learning_rate        | 5e-05 

-----------------------------------------
| eval/                   |             |
|    arrive_dest          | 0.225       |
|    crash                | 0.25        |
|    max_step             | 0           |
|    mean_ep_length       | 165         |
|    mean_reward          | 180         |
|    num_episodes         | 5           |
|    out_of_road          | 0.775       |
|    raw_action           | 0.488241    |
|    route_completion     | 0.538       |
|    success_rate         | 0.3         |
|    total_cost           | 14          |
| time/                   |             |
|    total_timesteps      | 80000       |
| train/                  |             |
|    approx_kl            | 0.005062279 |
|    arrive_dest          | 0.125       |
|    clip_fraction        | 0.14        |
|    clip_range           | 0.1         |
|    crash                | 0.425       |
|    entropy_loss         | -1.87       |
|    explained_variance   | 0.787       |
|    learning_rate        | 5e-05 

------------------------------------------
| eval/                   |              |
|    arrive_dest          | 0.2          |
|    crash                | 0.244        |
|    max_step             | 0            |
|    mean_ep_length       | 124          |
|    mean_reward          | 108          |
|    num_episodes         | 5            |
|    out_of_road          | 0.8          |
|    raw_action           | 0.49509937   |
|    route_completion     | 0.523        |
|    success_rate         | 0            |
|    total_cost           | 14.5         |
| time/                   |              |
|    total_timesteps      | 90000        |
| train/                  |              |
|    approx_kl            | 0.0036336407 |
|    arrive_dest          | 0.111        |
|    clip_fraction        | 0.156        |
|    clip_range           | 0.1          |
|    crash                | 0.422        |
|    entropy_loss         | -1.86        |
|    explained_variance   | 0.765        |
|    learni

-----------------------------------------
| eval/                   |             |
|    arrive_dest          | 0.2         |
|    crash                | 0.26        |
|    max_step             | 0           |
|    mean_ep_length       | 174         |
|    mean_reward          | 154         |
|    num_episodes         | 5           |
|    out_of_road          | 0.8         |
|    raw_action           | 0.4971272   |
|    route_completion     | 0.521       |
|    success_rate         | 0.2         |
|    total_cost           | 15.9        |
| time/                   |             |
|    total_timesteps      | 100000      |
| train/                  |             |
|    approx_kl            | 0.005645705 |
|    arrive_dest          | 0.12        |
|    clip_fraction        | 0.134       |
|    clip_range           | 0.1         |
|    crash                | 0.44        |
|    entropy_loss         | -1.86       |
|    explained_variance   | 0.762       |
|    learning_rate        | 5e-05 

------------------------------------------
| eval/                   |              |
|    arrive_dest          | 0.218        |
|    crash                | 0.255        |
|    max_step             | 0            |
|    mean_ep_length       | 192          |
|    mean_reward          | 283          |
|    num_episodes         | 5            |
|    out_of_road          | 0.782        |
|    raw_action           | 0.49764657   |
|    route_completion     | 0.547        |
|    success_rate         | 0.3          |
|    total_cost           | 15.3         |
| time/                   |              |
|    total_timesteps      | 110000       |
| train/                  |              |
|    approx_kl            | 0.0019727172 |
|    arrive_dest          | 0.127        |
|    clip_fraction        | 0.144        |
|    clip_range           | 0.1          |
|    crash                | 0.4          |
|    entropy_loss         | -1.85        |
|    explained_variance   | 0.747        |
|    learni

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 318      |
|    ep_rew_mean     | 278      |
| time/              |          |
|    fps             | 424      |
|    iterations      | 22       |
|    time_elapsed    | 265      |
|    total_timesteps | 112640   |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 321          |
|    ep_rew_mean          | 282          |
| time/                   |              |
|    fps                  | 431          |
|    iterations           | 23           |
|    time_elapsed         | 272          |
|    total_timesteps      | 117760       |
| train/                  |              |
|    approx_kl            | 0.0025173412 |
|    clip_fraction        | 0.166        |
|    clip_range           | 0.1          |
|    entropy_loss         | -1.85        |
|    explained_variance   | 0.666        |
|    learning_r

------------------------------------------
| eval/                   |              |
|    arrive_dest          | 0.2          |
|    crash                | 0.25         |
|    max_step             | 0            |
|    mean_ep_length       | 123          |
|    mean_reward          | 152          |
|    num_episodes         | 5            |
|    out_of_road          | 0.8          |
|    raw_action           | 0.48189306   |
|    route_completion     | 0.543        |
|    success_rate         | 0.1          |
|    total_cost           | 14.5         |
| time/                   |              |
|    total_timesteps      | 120000       |
| train/                  |              |
|    approx_kl            | 0.0036262642 |
|    arrive_dest          | 0.133        |
|    clip_fraction        | 0.177        |
|    clip_range           | 0.1          |
|    crash                | 0.367        |
|    entropy_loss         | -1.85        |
|    explained_variance   | 0.734        |
|    learni

-----------------------------------------
| eval/                   |             |
|    arrive_dest          | 0.231       |
|    crash                | 0.231       |
|    max_step             | 0           |
|    mean_ep_length       | 251         |
|    mean_reward          | 298         |
|    num_episodes         | 5           |
|    out_of_road          | 0.769       |
|    raw_action           | 0.47500363  |
|    route_completion     | 0.565       |
|    success_rate         | 0.4         |
|    total_cost           | 15.4        |
| time/                   |             |
|    total_timesteps      | 130000      |
| train/                  |             |
|    approx_kl            | 0.001286828 |
|    arrive_dest          | 0.138       |
|    clip_fraction        | 0.249       |
|    clip_range           | 0.1         |
|    crash                | 0.354       |
|    entropy_loss         | -1.85       |
|    explained_variance   | 0.744       |
|    learning_rate        | 5e-05 

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 325      |
|    ep_rew_mean     | 280      |
| time/              |          |
|    fps             | 411      |
|    iterations      | 26       |
|    time_elapsed    | 323      |
|    total_timesteps | 133120   |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 341          |
|    ep_rew_mean          | 289          |
| time/                   |              |
|    fps                  | 416          |
|    iterations           | 27           |
|    time_elapsed         | 331          |
|    total_timesteps      | 138240       |
| train/                  |              |
|    approx_kl            | 0.0022764057 |
|    clip_fraction        | 0.238        |
|    clip_range           | 0.1          |
|    entropy_loss         | -1.85        |
|    explained_variance   | 0.672        |
|    learning_r

-------------------------------------------
| eval/                   |               |
|    arrive_dest          | 0.257         |
|    crash                | 0.243         |
|    max_step             | 0             |
|    mean_ep_length       | 224           |
|    mean_reward          | 288           |
|    num_episodes         | 5             |
|    out_of_road          | 0.743         |
|    raw_action           | 0.47673443    |
|    route_completion     | 0.575         |
|    success_rate         | 0.3           |
|    total_cost           | 15.6          |
| time/                   |               |
|    total_timesteps      | 140000        |
| train/                  |               |
|    approx_kl            | 0.00091250084 |
|    arrive_dest          | 0.129         |
|    clip_fraction        | 0.145         |
|    clip_range           | 0.1           |
|    crash                | 0.329         |
|    entropy_loss         | -1.85         |
|    explained_variance   | 0.77

------------------------------------------
| eval/                   |              |
|    arrive_dest          | 0.28         |
|    crash                | 0.24         |
|    max_step             | 0            |
|    mean_ep_length       | 209          |
|    mean_reward          | 200          |
|    num_episodes         | 5            |
|    out_of_road          | 0.72         |
|    raw_action           | 0.47494453   |
|    route_completion     | 0.587        |
|    success_rate         | 0.5          |
|    total_cost           | 17.3         |
| time/                   |              |
|    total_timesteps      | 150000       |
| train/                  |              |
|    approx_kl            | 0.0012269138 |
|    arrive_dest          | 0.147        |
|    clip_fraction        | 0.142        |
|    clip_range           | 0.1          |
|    crash                | 0.32         |
|    entropy_loss         | -1.84        |
|    explained_variance   | 0.717        |
|    learni

------------------------------------------
| eval/                   |              |
|    arrive_dest          | 0.275        |
|    crash                | 0.275        |
|    max_step             | 0            |
|    mean_ep_length       | 213          |
|    mean_reward          | 218          |
|    num_episodes         | 5            |
|    out_of_road          | 0.725        |
|    raw_action           | 0.47713265   |
|    route_completion     | 0.595        |
|    success_rate         | 0.1          |
|    total_cost           | 17.8         |
| time/                   |              |
|    total_timesteps      | 160000       |
| train/                  |              |
|    approx_kl            | 0.0065000714 |
|    arrive_dest          | 0.138        |
|    clip_fraction        | 0.206        |
|    clip_range           | 0.1          |
|    crash                | 0.312        |
|    entropy_loss         | -1.83        |
|    explained_variance   | 0.745        |
|    learni

------------------------------------------
| eval/                   |              |
|    arrive_dest          | 0.259        |
|    crash                | 0.282        |
|    max_step             | 0            |
|    mean_ep_length       | 125          |
|    mean_reward          | 174          |
|    num_episodes         | 5            |
|    out_of_road          | 0.741        |
|    raw_action           | 0.4766919    |
|    route_completion     | 0.59         |
|    success_rate         | 0.1          |
|    total_cost           | 16.9         |
| time/                   |              |
|    total_timesteps      | 170000       |
| train/                  |              |
|    approx_kl            | 0.0066787787 |
|    arrive_dest          | 0.141        |
|    clip_fraction        | 0.103        |
|    clip_range           | 0.1          |
|    crash                | 0.306        |
|    entropy_loss         | -1.83        |
|    explained_variance   | 0.659        |
|    learni

----------------------------------------
| eval/                   |            |
|    arrive_dest          | 0.256      |
|    crash                | 0.278      |
|    max_step             | 0          |
|    mean_ep_length       | 171        |
|    mean_reward          | 221        |
|    num_episodes         | 5          |
|    out_of_road          | 0.744      |
|    raw_action           | 0.47883412 |
|    route_completion     | 0.587      |
|    success_rate         | 0.2        |
|    total_cost           | 16.5       |
| time/                   |            |
|    total_timesteps      | 180000     |
| train/                  |            |
|    approx_kl            | 0.02924499 |
|    arrive_dest          | 0.144      |
|    clip_fraction        | 0.158      |
|    clip_range           | 0.1        |
|    crash                | 0.311      |
|    entropy_loss         | -1.83      |
|    explained_variance   | 0.74       |
|    learning_rate        | 5e-05      |
|    loss       

-----------------------------------------
| eval/                   |             |
|    arrive_dest          | 0.263       |
|    crash                | 0.295       |
|    max_step             | 0           |
|    mean_ep_length       | 196         |
|    mean_reward          | 293         |
|    num_episodes         | 5           |
|    out_of_road          | 0.737       |
|    raw_action           | 0.47689342  |
|    route_completion     | 0.595       |
|    success_rate         | 0.3         |
|    total_cost           | 16.1        |
| time/                   |             |
|    total_timesteps      | 190000      |
| train/                  |             |
|    approx_kl            | 0.002075826 |
|    arrive_dest          | 0.147       |
|    clip_fraction        | 0.177       |
|    clip_range           | 0.1         |
|    crash                | 0.305       |
|    entropy_loss         | -1.83       |
|    explained_variance   | 0.764       |
|    learning_rate        | 5e-05 

------------------------------------------
| eval/                   |              |
|    arrive_dest          | 0.26         |
|    crash                | 0.29         |
|    max_step             | 0            |
|    mean_ep_length       | 153          |
|    mean_reward          | 186          |
|    num_episodes         | 5            |
|    out_of_road          | 0.74         |
|    raw_action           | 0.4751262    |
|    route_completion     | 0.593        |
|    success_rate         | 0.2          |
|    total_cost           | 15.7         |
| time/                   |              |
|    total_timesteps      | 200000       |
| train/                  |              |
|    approx_kl            | 0.0026646722 |
|    arrive_dest          | 0.15         |
|    clip_fraction        | 0.292        |
|    clip_range           | 0.1          |
|    crash                | 0.3          |
|    entropy_loss         | -1.83        |
|    explained_variance   | 0.878        |
|    learni

----------------------------------------
| eval/                   |            |
|    arrive_dest          | 0.276      |
|    crash                | 0.286      |
|    max_step             | 0          |
|    mean_ep_length       | 214        |
|    mean_reward          | 298        |
|    num_episodes         | 5          |
|    out_of_road          | 0.724      |
|    raw_action           | 0.4769191  |
|    route_completion     | 0.604      |
|    success_rate         | 0.4        |
|    total_cost           | 15.8       |
| time/                   |            |
|    total_timesteps      | 210000     |
| train/                  |            |
|    approx_kl            | 0.00337044 |
|    arrive_dest          | 0.152      |
|    clip_fraction        | 0.211      |
|    clip_range           | 0.1        |
|    crash                | 0.305      |
|    entropy_loss         | -1.83      |
|    explained_variance   | 0.822      |
|    learning_rate        | 5e-05      |
|    loss       

------------------------------------------
| eval/                   |              |
|    arrive_dest          | 0.282        |
|    crash                | 0.282        |
|    max_step             | 0            |
|    mean_ep_length       | 273          |
|    mean_reward          | 288          |
|    num_episodes         | 5            |
|    out_of_road          | 0.718        |
|    raw_action           | 0.47824928   |
|    route_completion     | 0.615        |
|    success_rate         | 0.3          |
|    total_cost           | 16.7         |
| time/                   |              |
|    total_timesteps      | 220000       |
| train/                  |              |
|    approx_kl            | 0.0015342102 |
|    arrive_dest          | 0.155        |
|    clip_fraction        | 0.0728       |
|    clip_range           | 0.1          |
|    crash                | 0.309        |
|    entropy_loss         | -1.83        |
|    explained_variance   | 0.721        |
|    learni

------------------------------------------
| eval/                   |              |
|    arrive_dest          | 0.278        |
|    crash                | 0.287        |
|    max_step             | 0            |
|    mean_ep_length       | 171          |
|    mean_reward          | 227          |
|    num_episodes         | 5            |
|    out_of_road          | 0.722        |
|    raw_action           | 0.4776758    |
|    route_completion     | 0.613        |
|    success_rate         | 0.1          |
|    total_cost           | 16.3         |
| time/                   |              |
|    total_timesteps      | 230000       |
| train/                  |              |
|    approx_kl            | 0.0022064976 |
|    arrive_dest          | 0.148        |
|    clip_fraction        | 0.166        |
|    clip_range           | 0.1          |
|    crash                | 0.304        |
|    entropy_loss         | -1.83        |
|    explained_variance   | 0.73         |
|    learni

------------------------------------------
| eval/                   |              |
|    arrive_dest          | 0.283        |
|    crash                | 0.283        |
|    max_step             | 0            |
|    mean_ep_length       | 170          |
|    mean_reward          | 224          |
|    num_episodes         | 5            |
|    out_of_road          | 0.717        |
|    raw_action           | 0.47423217   |
|    route_completion     | 0.614        |
|    success_rate         | 0.3          |
|    total_cost           | 16           |
| time/                   |              |
|    total_timesteps      | 240000       |
| train/                  |              |
|    approx_kl            | 0.0016437477 |
|    arrive_dest          | 0.15         |
|    clip_fraction        | 0.255        |
|    clip_range           | 0.1          |
|    crash                | 0.3          |
|    entropy_loss         | -1.83        |
|    explained_variance   | 0.762        |
|    learni

------------------------------------------
| eval/                   |              |
|    arrive_dest          | 0.28         |
|    crash                | 0.28         |
|    max_step             | 0            |
|    mean_ep_length       | 170          |
|    mean_reward          | 210          |
|    num_episodes         | 5            |
|    out_of_road          | 0.72         |
|    raw_action           | 0.47393602   |
|    route_completion     | 0.616        |
|    success_rate         | 0.1          |
|    total_cost           | 16.1         |
| time/                   |              |
|    total_timesteps      | 250000       |
| train/                  |              |
|    approx_kl            | 0.0024006295 |
|    arrive_dest          | 0.144        |
|    clip_fraction        | 0.116        |
|    clip_range           | 0.1          |
|    crash                | 0.296        |
|    entropy_loss         | -1.83        |
|    explained_variance   | 0.7          |
|    learni

----------------------------------------
| eval/                   |            |
|    arrive_dest          | 0.269      |
|    crash                | 0.277      |
|    max_step             | 0          |
|    mean_ep_length       | 137        |
|    mean_reward          | 190        |
|    num_episodes         | 5          |
|    out_of_road          | 0.731      |
|    raw_action           | 0.4737059  |
|    route_completion     | 0.617      |
|    success_rate         | 0.1        |
|    total_cost           | 15.6       |
| time/                   |            |
|    total_timesteps      | 260000     |
| train/                  |            |
|    approx_kl            | 0.00538031 |
|    arrive_dest          | 0.146      |
|    clip_fraction        | 0.16       |
|    clip_range           | 0.1        |
|    crash                | 0.3        |
|    entropy_loss         | -1.82      |
|    explained_variance   | 0.71       |
|    learning_rate        | 5e-05      |
|    loss       

-----------------------------------------
| eval/                   |             |
|    arrive_dest          | 0.274       |
|    crash                | 0.274       |
|    max_step             | 0           |
|    mean_ep_length       | 230         |
|    mean_reward          | 243         |
|    num_episodes         | 5           |
|    out_of_road          | 0.726       |
|    raw_action           | 0.472062    |
|    route_completion     | 0.622       |
|    success_rate         | 0.5         |
|    total_cost           | 16.2        |
| time/                   |             |
|    total_timesteps      | 270000      |
| train/                  |             |
|    approx_kl            | 0.003555112 |
|    arrive_dest          | 0.163       |
|    clip_fraction        | 0.133       |
|    clip_range           | 0.1         |
|    crash                | 0.296       |
|    entropy_loss         | -1.82       |
|    explained_variance   | 0.729       |
|    learning_rate        | 5e-05 

-----------------------------------------
| eval/                   |             |
|    arrive_dest          | 0.286       |
|    crash                | 0.271       |
|    max_step             | 0           |
|    mean_ep_length       | 247         |
|    mean_reward          | 289         |
|    num_episodes         | 5           |
|    out_of_road          | 0.714       |
|    raw_action           | 0.47067106  |
|    route_completion     | 0.624       |
|    success_rate         | 0.3         |
|    total_cost           | 17          |
| time/                   |             |
|    total_timesteps      | 280000      |
| train/                  |             |
|    approx_kl            | 0.002657501 |
|    arrive_dest          | 0.157       |
|    clip_fraction        | 0.126       |
|    clip_range           | 0.1         |
|    crash                | 0.293       |
|    entropy_loss         | -1.82       |
|    explained_variance   | 0.746       |
|    learning_rate        | 5e-05 

------------------------------------------
| eval/                   |              |
|    arrive_dest          | 0.283        |
|    crash                | 0.269        |
|    max_step             | 0            |
|    mean_ep_length       | 154          |
|    mean_reward          | 225          |
|    num_episodes         | 5            |
|    out_of_road          | 0.717        |
|    raw_action           | 0.47039354   |
|    route_completion     | 0.622        |
|    success_rate         | 0.3          |
|    total_cost           | 16.5         |
| time/                   |              |
|    total_timesteps      | 290000       |
| train/                  |              |
|    approx_kl            | 0.0030865634 |
|    arrive_dest          | 0.166        |
|    clip_fraction        | 0.208        |
|    clip_range           | 0.1          |
|    crash                | 0.29         |
|    entropy_loss         | -1.83        |
|    explained_variance   | 0.835        |
|    learni

----------------------------------------
| eval/                   |            |
|    arrive_dest          | 0.28       |
|    crash                | 0.26       |
|    max_step             | 0          |
|    mean_ep_length       | 167        |
|    mean_reward          | 212        |
|    num_episodes         | 5          |
|    out_of_road          | 0.72       |
|    raw_action           | 0.47068194 |
|    route_completion     | 0.622      |
|    success_rate         | 0.1        |
|    total_cost           | 16.5       |
| time/                   |            |
|    total_timesteps      | 300000     |
| train/                  |            |
|    approx_kl            | 0.03281691 |
|    arrive_dest          | 0.16       |
|    clip_fraction        | 0.162      |
|    clip_range           | 0.1        |
|    crash                | 0.28       |
|    entropy_loss         | -1.82      |
|    explained_variance   | 0.772      |
|    learning_rate        | 5e-05      |
|    loss       

------------------------------------------
| eval/                   |              |
|    arrive_dest          | 0.277        |
|    crash                | 0.277        |
|    max_step             | 0            |
|    mean_ep_length       | 118          |
|    mean_reward          | 153          |
|    num_episodes         | 5            |
|    out_of_road          | 0.723        |
|    raw_action           | 0.47339454   |
|    route_completion     | 0.619        |
|    success_rate         | 0.1          |
|    total_cost           | 16.1         |
| time/                   |              |
|    total_timesteps      | 310000       |
| train/                  |              |
|    approx_kl            | 0.0019080732 |
|    arrive_dest          | 0.155        |
|    clip_fraction        | 0.108        |
|    clip_range           | 0.1          |
|    crash                | 0.277        |
|    entropy_loss         | -1.82        |
|    explained_variance   | 0.744        |
|    learni

------------------------------------------
| eval/                   |              |
|    arrive_dest          | 0.275        |
|    crash                | 0.275        |
|    max_step             | 0            |
|    mean_ep_length       | 145          |
|    mean_reward          | 172          |
|    num_episodes         | 5            |
|    out_of_road          | 0.725        |
|    raw_action           | 0.47195148   |
|    route_completion     | 0.617        |
|    success_rate         | 0.3          |
|    total_cost           | 15.9         |
| time/                   |              |
|    total_timesteps      | 320000       |
| train/                  |              |
|    approx_kl            | 0.0043355552 |
|    arrive_dest          | 0.163        |
|    clip_fraction        | 0.217        |
|    clip_range           | 0.1          |
|    crash                | 0.275        |
|    entropy_loss         | -1.82        |
|    explained_variance   | 0.846        |
|    learni

----------------------------------------
| eval/                   |            |
|    arrive_dest          | 0.273      |
|    crash                | 0.279      |
|    max_step             | 0          |
|    mean_ep_length       | 139        |
|    mean_reward          | 209        |
|    num_episodes         | 5          |
|    out_of_road          | 0.727      |
|    raw_action           | 0.47176835 |
|    route_completion     | 0.615      |
|    success_rate         | 0.1        |
|    total_cost           | 15.5       |
| time/                   |            |
|    total_timesteps      | 330000     |
| train/                  |            |
|    approx_kl            | 0.01937558 |
|    arrive_dest          | 0.158      |
|    clip_fraction        | 0.152      |
|    clip_range           | 0.1        |
|    crash                | 0.273      |
|    entropy_loss         | -1.81      |
|    explained_variance   | 0.765      |
|    learning_rate        | 5e-05      |
|    loss       

-----------------------------------------
| eval/                   |             |
|    arrive_dest          | 0.276       |
|    crash                | 0.271       |
|    max_step             | 0           |
|    mean_ep_length       | 218         |
|    mean_reward          | 258         |
|    num_episodes         | 5           |
|    out_of_road          | 0.724       |
|    raw_action           | 0.4716397   |
|    route_completion     | 0.615       |
|    success_rate         | 0.2         |
|    total_cost           | 15.8        |
| time/                   |             |
|    total_timesteps      | 340000      |
| train/                  |             |
|    approx_kl            | 0.008300016 |
|    arrive_dest          | 0.153       |
|    clip_fraction        | 0.222       |
|    clip_range           | 0.1         |
|    crash                | 0.288       |
|    entropy_loss         | -1.81       |
|    explained_variance   | 0.707       |
|    learning_rate        | 5e-05 

------------------------------------------
| eval/                   |              |
|    arrive_dest          | 0.274        |
|    crash                | 0.263        |
|    max_step             | 0            |
|    mean_ep_length       | 185          |
|    mean_reward          | 185          |
|    num_episodes         | 5            |
|    out_of_road          | 0.726        |
|    raw_action           | 0.47059762   |
|    route_completion     | 0.614        |
|    success_rate         | 0.1          |
|    total_cost           | 15.7         |
| time/                   |              |
|    total_timesteps      | 350000       |
| train/                  |              |
|    approx_kl            | 0.0031379648 |
|    arrive_dest          | 0.149        |
|    clip_fraction        | 0.247        |
|    clip_range           | 0.1          |
|    crash                | 0.291        |
|    entropy_loss         | -1.81        |
|    explained_variance   | 0.742        |
|    learni

------------------------------------------
| eval/                   |              |
|    arrive_dest          | 0.272        |
|    crash                | 0.267        |
|    max_step             | 0            |
|    mean_ep_length       | 159          |
|    mean_reward          | 235          |
|    num_episodes         | 5            |
|    out_of_road          | 0.728        |
|    raw_action           | 0.4706432    |
|    route_completion     | 0.613        |
|    success_rate         | 0.2          |
|    total_cost           | 15.3         |
| time/                   |              |
|    total_timesteps      | 360000       |
| train/                  |              |
|    approx_kl            | 0.0018677076 |
|    arrive_dest          | 0.15         |
|    clip_fraction        | 0.124        |
|    clip_range           | 0.1          |
|    crash                | 0.289        |
|    entropy_loss         | -1.8         |
|    explained_variance   | 0.82         |
|    learni

------------------------------------------
| eval/                   |              |
|    arrive_dest          | 0.27         |
|    crash                | 0.265        |
|    max_step             | 0            |
|    mean_ep_length       | 171          |
|    mean_reward          | 192          |
|    num_episodes         | 5            |
|    out_of_road          | 0.73         |
|    raw_action           | 0.4712652    |
|    route_completion     | 0.611        |
|    success_rate         | 0.1          |
|    total_cost           | 15.2         |
| time/                   |              |
|    total_timesteps      | 370000       |
| train/                  |              |
|    approx_kl            | 0.0018459832 |
|    arrive_dest          | 0.146        |
|    clip_fraction        | 0.124        |
|    clip_range           | 0.1          |
|    crash                | 0.297        |
|    entropy_loss         | -1.8         |
|    explained_variance   | 0.745        |
|    learni

-----------------------------------------
| eval/                   |             |
|    arrive_dest          | 0.274       |
|    crash                | 0.258       |
|    max_step             | 0           |
|    mean_ep_length       | 165         |
|    mean_reward          | 235         |
|    num_episodes         | 5           |
|    out_of_road          | 0.726       |
|    raw_action           | 0.47149524  |
|    route_completion     | 0.613       |
|    success_rate         | 0.4         |
|    total_cost           | 15          |
| time/                   |             |
|    total_timesteps      | 380000      |
| train/                  |             |
|    approx_kl            | 0.005401003 |
|    arrive_dest          | 0.153       |
|    clip_fraction        | 0.119       |
|    clip_range           | 0.1         |
|    crash                | 0.3         |
|    entropy_loss         | -1.79       |
|    explained_variance   | 0.714       |
|    learning_rate        | 5e-05 

------------------------------------------
| eval/                   |              |
|    arrive_dest          | 0.277        |
|    crash                | 0.251        |
|    max_step             | 0            |
|    mean_ep_length       | 219          |
|    mean_reward          | 267          |
|    num_episodes         | 5            |
|    out_of_road          | 0.723        |
|    raw_action           | 0.46902418   |
|    route_completion     | 0.618        |
|    success_rate         | 0.3          |
|    total_cost           | 15.1         |
| time/                   |              |
|    total_timesteps      | 390000       |
| train/                  |              |
|    approx_kl            | 0.0051685125 |
|    arrive_dest          | 0.154        |
|    clip_fraction        | 0.169        |
|    clip_range           | 0.1          |
|    crash                | 0.297        |
|    entropy_loss         | -1.79        |
|    explained_variance   | 0.761        |
|    learni

------------------------------------------
| eval/                   |              |
|    arrive_dest          | 0.285        |
|    crash                | 0.25         |
|    max_step             | 0            |
|    mean_ep_length       | 209          |
|    mean_reward          | 286          |
|    num_episodes         | 5            |
|    out_of_road          | 0.715        |
|    raw_action           | 0.46950218   |
|    route_completion     | 0.622        |
|    success_rate         | 0.3          |
|    total_cost           | 15           |
| time/                   |              |
|    total_timesteps      | 400000       |
| train/                  |              |
|    approx_kl            | 0.0017913686 |
|    arrive_dest          | 0.15         |
|    clip_fraction        | 0.147        |
|    clip_range           | 0.1          |
|    crash                | 0.295        |
|    entropy_loss         | -1.79        |
|    explained_variance   | 0.781        |
|    learni

-----------------------------------------
| eval/                   |             |
|    arrive_dest          | 0.288       |
|    crash                | 0.244       |
|    max_step             | 0           |
|    mean_ep_length       | 204         |
|    mean_reward          | 252         |
|    num_episodes         | 5           |
|    out_of_road          | 0.712       |
|    raw_action           | 0.47030258  |
|    route_completion     | 0.623       |
|    success_rate         | 0.4         |
|    total_cost           | 15          |
| time/                   |             |
|    total_timesteps      | 410000      |
| train/                  |             |
|    approx_kl            | 0.010846978 |
|    arrive_dest          | 0.156       |
|    clip_fraction        | 0.139       |
|    clip_range           | 0.1         |
|    crash                | 0.293       |
|    entropy_loss         | -1.79       |
|    explained_variance   | 0.761       |
|    learning_rate        | 5e-05 

------------------------------------------
| eval/                   |              |
|    arrive_dest          | 0.281        |
|    crash                | 0.238        |
|    max_step             | 0            |
|    mean_ep_length       | 120          |
|    mean_reward          | 164          |
|    num_episodes         | 5            |
|    out_of_road          | 0.719        |
|    raw_action           | 0.46998504   |
|    route_completion     | 0.619        |
|    success_rate         | 0.1          |
|    total_cost           | 14.7         |
| time/                   |              |
|    total_timesteps      | 420000       |
| train/                  |              |
|    approx_kl            | 0.0023419536 |
|    arrive_dest          | 0.157        |
|    clip_fraction        | 0.151        |
|    clip_range           | 0.1          |
|    crash                | 0.29         |
|    entropy_loss         | -1.8         |
|    explained_variance   | 0.724        |
|    learni

-----------------------------------------
| eval/                   |             |
|    arrive_dest          | 0.279       |
|    crash                | 0.247       |
|    max_step             | 0           |
|    mean_ep_length       | 113         |
|    mean_reward          | 141         |
|    num_episodes         | 5           |
|    out_of_road          | 0.721       |
|    raw_action           | 0.46923736  |
|    route_completion     | 0.614       |
|    success_rate         | 0.4         |
|    total_cost           | 14.4        |
| time/                   |             |
|    total_timesteps      | 430000      |
| train/                  |             |
|    approx_kl            | 0.009174183 |
|    arrive_dest          | 0.167       |
|    clip_fraction        | 0.249       |
|    clip_range           | 0.1         |
|    crash                | 0.288       |
|    entropy_loss         | -1.8        |
|    explained_variance   | 0.866       |
|    learning_rate        | 5e-05 

----------------------------------------
| eval/                   |            |
|    arrive_dest          | 0.277      |
|    crash                | 0.245      |
|    max_step             | 0          |
|    mean_ep_length       | 179        |
|    mean_reward          | 244        |
|    num_episodes         | 5          |
|    out_of_road          | 0.723      |
|    raw_action           | 0.46964824 |
|    route_completion     | 0.614      |
|    success_rate         | 0.2        |
|    total_cost           | 14.3       |
| time/                   |            |
|    total_timesteps      | 440000     |
| train/                  |            |
|    approx_kl            | 0.00512914 |
|    arrive_dest          | 0.168      |
|    clip_fraction        | 0.148      |
|    clip_range           | 0.1        |
|    crash                | 0.282      |
|    entropy_loss         | -1.79      |
|    explained_variance   | 0.721      |
|    learning_rate        | 5e-05      |
|    loss       

------------------------------------------
| eval/                   |              |
|    arrive_dest          | 0.276        |
|    crash                | 0.244        |
|    max_step             | 0            |
|    mean_ep_length       | 162          |
|    mean_reward          | 227          |
|    num_episodes         | 5            |
|    out_of_road          | 0.724        |
|    raw_action           | 0.46956268   |
|    route_completion     | 0.614        |
|    success_rate         | 0.1          |
|    total_cost           | 14.1         |
| time/                   |              |
|    total_timesteps      | 450000       |
| train/                  |              |
|    approx_kl            | 0.0022484353 |
|    arrive_dest          | 0.164        |
|    clip_fraction        | 0.158        |
|    clip_range           | 0.1          |
|    crash                | 0.28         |
|    entropy_loss         | -1.79        |
|    explained_variance   | 0.735        |
|    learni

------------------------------------------
| eval/                   |              |
|    arrive_dest          | 0.274        |
|    crash                | 0.239        |
|    max_step             | 0            |
|    mean_ep_length       | 139          |
|    mean_reward          | 165          |
|    num_episodes         | 5            |
|    out_of_road          | 0.726        |
|    raw_action           | 0.4689889    |
|    route_completion     | 0.611        |
|    success_rate         | 0.2          |
|    total_cost           | 13.8         |
| time/                   |              |
|    total_timesteps      | 460000       |
| train/                  |              |
|    approx_kl            | 0.0024810757 |
|    arrive_dest          | 0.165        |
|    clip_fraction        | 0.13         |
|    clip_range           | 0.1          |
|    crash                | 0.278        |
|    entropy_loss         | -1.78        |
|    explained_variance   | 0.771        |
|    learni

------------------------------------------
| eval/                   |              |
|    arrive_dest          | 0.277        |
|    crash                | 0.234        |
|    max_step             | 0            |
|    mean_ep_length       | 245          |
|    mean_reward          | 310          |
|    num_episodes         | 5            |
|    out_of_road          | 0.723        |
|    raw_action           | 0.4689677    |
|    route_completion     | 0.616        |
|    success_rate         | 0.4          |
|    total_cost           | 14.1         |
| time/                   |              |
|    total_timesteps      | 470000       |
| train/                  |              |
|    approx_kl            | 0.0028213665 |
|    arrive_dest          | 0.17         |
|    clip_fraction        | 0.303        |
|    clip_range           | 0.1          |
|    crash                | 0.272        |
|    entropy_loss         | -1.78        |
|    explained_variance   | 0.836        |
|    learni

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 343      |
|    ep_rew_mean     | 312      |
| time/              |          |
|    fps             | 419      |
|    iterations      | 92       |
|    time_elapsed    | 1123     |
|    total_timesteps | 471040   |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 357         |
|    ep_rew_mean          | 319         |
| time/                   |             |
|    fps                  | 420         |
|    iterations           | 93          |
|    time_elapsed         | 1132        |
|    total_timesteps      | 476160      |
| train/                  |             |
|    approx_kl            | 0.002065826 |
|    clip_fraction        | 0.251       |
|    clip_range           | 0.1         |
|    entropy_loss         | -1.77       |
|    explained_variance   | 0.83        |
|    learning_rate        | 5e

------------------------------------------
| eval/                   |              |
|    arrive_dest          | 0.271        |
|    crash                | 0.229        |
|    max_step             | 0            |
|    mean_ep_length       | 146          |
|    mean_reward          | 207          |
|    num_episodes         | 5            |
|    out_of_road          | 0.729        |
|    raw_action           | 0.46868563   |
|    route_completion     | 0.613        |
|    success_rate         | 0.2          |
|    total_cost           | 13.8         |
| time/                   |              |
|    total_timesteps      | 480000       |
| train/                  |              |
|    approx_kl            | 0.0038056772 |
|    arrive_dest          | 0.175        |
|    clip_fraction        | 0.127        |
|    clip_range           | 0.1          |
|    crash                | 0.271        |
|    entropy_loss         | -1.77        |
|    explained_variance   | 0.78         |
|    learni

------------------------------------------
| eval/                   |              |
|    arrive_dest          | 0.278        |
|    crash                | 0.224        |
|    max_step             | 0            |
|    mean_ep_length       | 308          |
|    mean_reward          | 349          |
|    num_episodes         | 5            |
|    out_of_road          | 0.722        |
|    raw_action           | 0.4676172    |
|    route_completion     | 0.619        |
|    success_rate         | 0.4          |
|    total_cost           | 14.4         |
| time/                   |              |
|    total_timesteps      | 490000       |
| train/                  |              |
|    approx_kl            | 0.0019659437 |
|    arrive_dest          | 0.176        |
|    clip_fraction        | 0.118        |
|    clip_range           | 0.1          |
|    crash                | 0.265        |
|    entropy_loss         | -1.77        |
|    explained_variance   | 0.839        |
|    learni

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 353      |
|    ep_rew_mean     | 312      |
| time/              |          |
|    fps             | 416      |
|    iterations      | 96       |
|    time_elapsed    | 1180     |
|    total_timesteps | 491520   |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 351          |
|    ep_rew_mean          | 303          |
| time/                   |              |
|    fps                  | 417          |
|    iterations           | 97           |
|    time_elapsed         | 1188         |
|    total_timesteps      | 496640       |
| train/                  |              |
|    approx_kl            | 0.0016720811 |
|    clip_fraction        | 0.11         |
|    clip_range           | 0.1          |
|    entropy_loss         | -1.77        |
|    explained_variance   | 0.685        |
|    learning_r

-----------------------------------------
| eval/                   |             |
|    arrive_dest          | 0.276       |
|    crash                | 0.232       |
|    max_step             | 0           |
|    mean_ep_length       | 174         |
|    mean_reward          | 222         |
|    num_episodes         | 5           |
|    out_of_road          | 0.724       |
|    raw_action           | 0.46830213  |
|    route_completion     | 0.618       |
|    success_rate         | 0.2         |
|    total_cost           | 14.2        |
| time/                   |             |
|    total_timesteps      | 500000      |
| train/                  |             |
|    approx_kl            | 0.006918856 |
|    arrive_dest          | 0.176       |
|    clip_fraction        | 0.155       |
|    clip_range           | 0.1         |
|    crash                | 0.264       |
|    entropy_loss         | -1.77       |
|    explained_variance   | 0.779       |
|    learning_rate        | 5e-05 

------------------------------------------
| eval/                   |              |
|    arrive_dest          | 0.275        |
|    crash                | 0.231        |
|    max_step             | 0            |
|    mean_ep_length       | 167          |
|    mean_reward          | 271          |
|    num_episodes         | 5            |
|    out_of_road          | 0.725        |
|    raw_action           | 0.46827397   |
|    route_completion     | 0.619        |
|    success_rate         | 0.2          |
|    total_cost           | 14           |
| time/                   |              |
|    total_timesteps      | 510000       |
| train/                  |              |
|    approx_kl            | 0.0016225384 |
|    arrive_dest          | 0.176        |
|    clip_fraction        | 0.168        |
|    clip_range           | 0.1          |
|    crash                | 0.271        |
|    entropy_loss         | -1.76        |
|    explained_variance   | 0.722        |
|    learni

------------------------------------------
| eval/                   |              |
|    arrive_dest          | 0.281        |
|    crash                | 0.227        |
|    max_step             | 0            |
|    mean_ep_length       | 194          |
|    mean_reward          | 328          |
|    num_episodes         | 5            |
|    out_of_road          | 0.719        |
|    raw_action           | 0.46826023   |
|    route_completion     | 0.622        |
|    success_rate         | 0.4          |
|    total_cost           | 13.8         |
| time/                   |              |
|    total_timesteps      | 520000       |
| train/                  |              |
|    approx_kl            | 0.0026812104 |
|    arrive_dest          | 0.177        |
|    clip_fraction        | 0.184        |
|    clip_range           | 0.1          |
|    crash                | 0.269        |
|    entropy_loss         | -1.75        |
|    explained_variance   | 0.866        |
|    learni

In [None]:
if use_wandb:
        wandb.finish()

train_env.close()
eval_env.close()
del train_env
del eval_env
del model

0,1
eval/arrive_dest,▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▅▅▅▆▆▇▇▇▇▇▇███████████▇▇
eval/crash,▁▅▆▇█▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇
eval/max_step,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
eval/mean_ep_length,▃▁▂▃▂▅▃▆▂▃▆▆▆▄▄▅▃▅▄▃▃▆▆▃▆▃▃▄▃▅▇▃▅▃▄▅▃█▂▃
eval/mean_reward,▁▁▄▃▃▃▄▅▃▆▆▃▇▆▅▄▄▄▆▇▅▄▄▄▇▃▄▅▄▄▆▇▄▄██▆▄▃▃
eval/num_episodes,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
eval/out_of_road,████▇▇▇▇▆▆▆▅▅▃▃▃▃▃▂▃▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▂
eval/raw_action,▁▂▂▃▅▇▇▇████████████████████████████████
eval/route_completion,▁▂▃▃▃▄▄▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇███████████████
eval/success_rate,▁▁▁▁▁▃▁▁▁▁▃▆▅▁▅▃▁▆▃▃▃▁▃▅▅▅█▅▆▃▅▃▃▁▁▃▅▅▁▆

0,1
eval/arrive_dest,0.084
eval/crash,0.237
eval/max_step,0.0
eval/mean_ep_length,239.2
eval/mean_reward,252.39171
eval/num_episodes,5.0
eval/out_of_road,0.916
eval/raw_action,0.46668
eval/route_completion,0.4564
eval/success_rate,0.1


KeyboardInterrupt: 

In [None]:
import gc
gc.collect()

2042