# Training demo: Highway

## Installation

Set up the Stable-Baselines3 environment:

```

# using pip
pip install swig cmake ffmpeg
pip install -r requirements.txt

# using Conda
conda create --name <env_name> python=3.9
conda activate <env_name>
pip install swig cmake ffmpeg
pip install -r requirements.txt
```

In [1]:
import gym
import highway_env
import numpy as np
import torch as th
from stable_baselines3 import DQN, DQN_ME, ResidualSoftDQN
from highway_env.vehicle.controller import ControlledVehicle
import warnings
warnings.filterwarnings('ignore')

## Define the evaluation

In [2]:
def evaluation(model, n_steps=400, success_len=40,
               env_id="highway-ME-basic-AddRightRewardALL-v0"):
    """Roll out ``model`` deterministically and print per-episode statistics.

    A fresh environment is created from ``env_id`` and stepped ``n_steps``
    times in total; it is reset every time an episode ends. An episode that is
    still running when the step budget is exhausted is not included in the
    printed statistics.

    Args:
        model: Object exposing ``predict(obs, deterministic=True)`` returning
            an ``(action, state)`` pair.
        n_steps: Total number of environment steps to simulate.
        success_len: An episode is counted as a success when it ends after
            exactly this many steps (presumably the environment's time limit,
            i.e. the episode was not cut short -- confirm against the env).
        env_id: Gym id of the environment used for evaluation.

    Returns:
        None. All results are printed.
    """

    def _mean_std(values):
        # Shared "mean +/- std" formatting used by every summary line.
        return f"{np.mean(values):.2f} +/- {np.std(values):.2f}"

    successes = []
    episode_rewards, episode_basic_rewards, episode_added_rewards = [], [], []
    episode_lengths, episode_speeds, episode_lanes = [], [], []

    # Running accumulators for the episode currently being played.
    episode_reward = basic_reward = added_reward = 0.0
    episode_speed = episode_lane = 0.0
    ep_len = 0

    env = gym.make(env_id)
    try:
        obs = env.reset()
        for _ in range(n_steps):
            action, _state = model.predict(obs, deterministic=True)
            obs, reward, done, _info = env.step(int(action))

            vehicle = env.vehicle
            neighbours = env.road.network.all_side_lanes(vehicle.lane_index)
            if isinstance(vehicle, ControlledVehicle):
                lane = vehicle.target_lane_index[2]
            else:
                lane = vehicle.lane_index[2]
            # Lane index scaled by (number of side lanes - 1), guarding
            # against a division by zero on single-lane roads.
            episode_lane += lane / max(len(neighbours) - 1, 1)
            # Speed component along the heading's x direction.
            episode_speed += vehicle.speed * np.cos(vehicle.heading)

            basic_reward += env.basic_reward
            added_reward += env.added_reward
            episode_reward += reward
            ep_len += 1

            if not done:
                continue

            # Episode finished: record its statistics once, regardless of
            # whether it was a success.
            successes.append(1 if ep_len == success_len else 0)
            episode_speeds.append(episode_speed / ep_len)
            episode_lanes.append(episode_lane / ep_len)
            episode_rewards.append(episode_reward)
            episode_added_rewards.append(added_reward)
            episode_basic_rewards.append(basic_reward)
            episode_lengths.append(ep_len)

            episode_reward = basic_reward = added_reward = 0.0
            episode_speed = episode_lane = 0.0
            ep_len = 0
            obs = env.reset()
    finally:
        # The environment is created here, so it is released here as well.
        env.close()

    if not successes:
        # Averaging empty lists would only print NaNs.
        print("No episode finished within the evaluation budget.")
        print("________________________________________________")
        return

    print(f"Success rate: {100 * np.mean(successes):.2f}%")
    print(f"{len(successes)} Episodes")
    print(f"Mean reward: {_mean_std(episode_rewards)}")
    print(f"Mean basic reward: {_mean_std(episode_basic_rewards)}")
    print(f"Mean added reward: {_mean_std(episode_added_rewards)}")
    print(f"Mean episode length: {_mean_std(episode_lengths)}")
    print(f"Mean lane: {_mean_std(episode_lanes)}")
    print(f"Mean speed: {_mean_std(episode_speeds)}")
    print("________________________________________________")

## Train RL prior policy

In [3]:
# Environment on which the prior policy is learned.
env = gym.make("highway-ME-basic-v0")

# Soft Q model (DQN_ME) using an MlpPolicy with net_arch [256, 256].
model = DQN_ME(
    policy="MlpPolicy",
    env=env,
    learning_rate=1e-4,
    buffer_size=15000,
    batch_size=32,
    gamma=0.8,
    train_freq=1,
    gradient_steps=1,
    target_update_interval=50,
    exploration_fraction=0.7,
    policy_kwargs={"net_arch": [256, 256]},
)

# Train for 1e5 steps. For better performance, 5e5 steps are recommended.
model.learn(100_000)

# Store the trained prior model under ./logs.
model.save("./logs/highway-ME-basic-v0_Example")

In [None]:
# Print evaluation statistics for the prior model trained in the previous cell.
evaluation(model)

## Train customized policy

In [4]:
# Environment on which the customized policy is learned.
env = gym.make("highway-ME-basic-AddRightReward-v0")

# Residual soft Q model, built on top of a previously saved prior model.
# NOTE(review): the prior-training cell saves to
# ./logs/highway-ME-basic-v0_Example, whereas this path ends in
# "_Example_Pretrained" -- presumably a checkpoint provided separately;
# confirm the file exists or point this at the freshly trained prior.
model_cust = ResidualSoftDQN(
    policy="MlpPolicy",
    env=env,
    prior_model_path="./logs/highway-ME-basic-v0_Example_Pretrained",
    learning_rate=1e-4,
    buffer_size=15000,
    batch_size=32,
    gamma=0.8,
    train_freq=1,
    gradient_steps=1,
    target_update_interval=50,
    exploration_fraction=0.7,
    policy_kwargs={"net_arch": [256, 256]},
)

# Train for 1e5 steps. For better performance, 5e5 steps are recommended.
model_cust.learn(100_000)

# Store the trained customized model under ./logs.
model_cust.save("./logs/highway-ME-basic-AddRightReward-v0_Example")

In [None]:
# Print evaluation statistics for the customized (residual) model trained above.
evaluation(model_cust)

## Residual MCTS


## Define the evaluation

In [None]:
import gym
import highway_env
import numpy as np
import torch as th
import matplotlib.pyplot as plt
import warnings
from highway_env.vehicle.controller import ControlledVehicle
from stable_baselines3 import DQN, DQN_ME, ResidualSoftDQN
import datetime
import os
from pathlib import Path
import json
from itertools import product
from multiprocessing.pool import Pool
from rl_agents.agents.common.factory import load_agent, load_environment
from highway_env.vehicle.controller import ControlledVehicle
from highway_env import utils
warnings.filterwarnings('ignore')

def evaluation(agent, env, evalsteps, success_len=40):
    """Run a planning agent in ``env`` and print per-episode statistics.

    The agent is queried with ``agent.plan(obs)`` at every step and the first
    planned action is executed. Besides the reward returned by ``env.step``,
    a second reward is recomputed by hand at every step: ``-0.5`` when the
    vehicle has crashed, plus ``0.4`` times the forward speed mapped from
    ``env.config["reward_speed_range"]`` onto ``[0, 1]``, plus ``1``; it is
    forced to ``0`` while the vehicle is off the road. A summary is printed
    after every finished episode and once more at the end. An episode still
    running when ``evalsteps`` is reached is not included in the statistics.

    Args:
        agent: Object exposing ``plan(obs)`` that returns a sequence of
            actions.
        env: Environment to simulate; it is reset by this function and is not
            closed afterwards.
        evalsteps: Total number of environment steps to simulate.
        success_len: An episode is counted as a success when it ends after
            exactly this many steps (presumably the environment's time limit
            -- confirm against the env configuration).

    Returns:
        None. All results are printed.
    """

    def _mean_std(values):
        # Shared "mean +/- std" formatting used by every summary line.
        return f"{np.mean(values):.2f} +/- {np.std(values):.2f}"

    successes = []
    episode_rewards, episode_total_rewards = [], []
    episode_lengths, episode_speeds, episode_lanes = [], [], []

    # Running accumulators for the episode currently being played.
    episode_reward = total_reward = 0.0
    episode_speed = episode_lane = 0.0
    ep_len = 0

    obs = env.reset()
    for timestep in range(evalsteps):
        actions = agent.plan(obs)
        action = actions[0]
        obs, reward, done, _info = env.step(int(action))
        print("time : ", timestep, "; action:", action)

        vehicle = env.vehicle
        neighbours = env.road.network.all_side_lanes(vehicle.lane_index)
        if isinstance(vehicle, ControlledVehicle):
            lane = vehicle.target_lane_index[2]
        else:
            lane = vehicle.lane_index[2]
        # Lane index scaled by (number of side lanes - 1), guarding against
        # a division by zero on single-lane roads.
        lane = lane / max(len(neighbours) - 1, 1)
        forward_speed = vehicle.speed * np.cos(vehicle.heading)

        episode_lane += lane
        episode_speed += forward_speed
        episode_reward += reward
        ep_len += 1

        # Hand-computed reward, kept in its own variable so the reward
        # returned by the environment is not overwritten.
        scaled_speed = utils.lmap(forward_speed, env.config["reward_speed_range"], [0, 1])
        step_reward = (
            (-0.5) * vehicle.crashed
            + 0.0 * lane  # the lane term carries zero weight here
            + 0.4 * np.clip(scaled_speed, 0, 1)
            + 1
        )
        if not vehicle.on_road:
            step_reward = 0
        total_reward += step_reward

        if not done:
            continue

        # Episode finished: record its statistics once, regardless of whether
        # it was a success.
        successes.append(1 if ep_len == success_len else 0)
        episode_speeds.append(episode_speed / ep_len)
        episode_lanes.append(episode_lane / ep_len)
        episode_rewards.append(episode_reward)
        episode_total_rewards.append(total_reward)
        episode_lengths.append(ep_len)

        episode_reward = total_reward = 0.0
        episode_speed = episode_lane = 0.0
        ep_len = 0
        obs = env.reset()

        # Intermediate summary over all episodes finished so far.
        print(f"Success rate: {100 * np.mean(successes):.2f}%")
        print(f"{len(successes)} Episodes")
        print(f"Mean added reward: {_mean_std(episode_rewards)}")
        print(f"Mean basic reward: {_mean_std(episode_total_rewards)}")
        print(f"Mean episode length: {_mean_std(episode_lengths)}")
        print(f"Mean lane: {_mean_std(episode_lanes)}")
        print(f"Mean speed: {_mean_std(episode_speeds)}")
        print("-----------")

    print("________________________________________________")
    if not successes:
        # Averaging empty lists would only print NaNs.
        print("No episode finished within the evaluation budget.")
    else:
        print(f"Success rate: {100 * np.mean(successes):.2f}%")
        print(f"{len(successes)} Episodes")
        print(f"Mean reward: {_mean_std(episode_rewards)}")
        print(f"Mean episode length: {_mean_std(episode_lengths)}")
        print(f"Mean lane: {_mean_std(episode_lanes)}")
        print(f"Mean speed: {_mean_std(episode_speeds)}")
    print("________________________________________________")

## Running online MCTS

In [None]:
# Create the environment the MCTS agent will be run in.
env = gym.make("highway-ME-basic-AddRightReward-v0")
# NOTE(review): this observation is not used below; evaluation() resets the
# environment again before stepping.
obs = env.reset()

# Load the agent from its JSON configuration. To use a different prior model,
# edit this configuration file.
agent_config = "./configs/HighwayEnv/agents/MCTSAgent/baselineMEPreRL.json"
agent = load_agent(agent_config, env)

# Simulate for a fixed number of environment steps and print the statistics.
evalsteps = 40
evaluation(agent,env, evalsteps)