### Agent planning and coordination performance

See waymax table 3

In [1]:
from eval_policy import evaluate_policy

from utils.policies import load_policy
from utils.config import load_config_nb
from utils.sb3.reg_ppo import RegularizedPPO
import pandas as pd

TOTAL_STEPS = 1000

env_config = load_config_nb("env_config")
env_config.data_path = "../data_full/train/"
env_config.val_data_path = "../data_full/valid/"

In [2]:
# Imitation learning: load human policy
IL_BASE_PATH = "../models/il/S1000"
BC_POLICY_NAME = 'human_policy_S1000_01_12_11_11'

human_policy = load_policy(
    data_path=IL_BASE_PATH,
    file_name=BC_POLICY_NAME, 
)

# Load RL policy
HR_RL_BASE_PATH = "../models/hr_rl/S1000"
PPO_MODEL = "policy_L0.0_S1000_I606"
rl_policy = RegularizedPPO.load(f"{HR_RL_BASE_PATH}/{PPO_MODEL}");

INFO:root:No regularization weight specified, using default PPO.


In [6]:
goal_rate, off_road, coll_rate, total_samples = evaluate_policy(
    env_config=env_config, 
    mode='rl_policy', 
    policy=rl_policy,
    controlled_agents=1, 
    total_steps=TOTAL_STEPS
)

  0%|          | 0/1000 [00:00<?, ?it/s]

100%|██████████| 1000/1000 [00:02<00:00, 420.40it/s]


In [7]:
goal_rate, off_road, coll_rate, total_samples

(81.25, 0.0, 18.75, 32)

In [6]:
df_res = pd.DataFrame(
    columns=['goal_rate', 'off_road', 'collision', 'total_samples'],
)

policy_names = ['policy_L0.0_S1000_I606']
agent_namens = ['PPO MA',] #'PPO SA', 'HR-PPO SA', 'HR-PPO MA']

for model_name, type in zip(policy_names, agent_namens):
    
    # Load RL policy
    policy = RegularizedPPO.load(f"{HR_RL_BASE_PATH}/{model_name}");
    
    # Evaluate
    goal_rate, off_road, coll_rate, total_samples = evaluate_policy(
        env_config=env_config, 
        mode='policy', 
        policy=rl_policy,
        controlled_agents=1, 
        total_steps=TOTAL_STEPS
    )
    
    df_rl_pol = pd.DataFrame(
        {
            'goal_rate': goal_rate, 
            'off_road': off_road, 
            'collision': coll_rate, 
            'total_samples': total_samples,
            'cat': type,
        },
    )    

    # Store
    df_res = pd.concat([df_res, df_rl_pol], ignore_index=True)

INFO:root:No regularization weight specified, using default PPO.
  0%|          | 0/1000 [00:00<?, ?it/s]

100%|██████████| 1000/1000 [00:01<00:00, 983.77it/s]


ValueError: If using all scalar values, you must pass an index

In [None]:
# Evaluate human policy
evaluate_policy(
    env_config=env_config, 
    mode='il_policy', 
    policy=human_policy,
    controlled_agents=1, 
    total_steps=TOTAL_STEPS
)

# Expert replay
evaluate_policy(
    env_config=env_config, 
    mode='expert_replay', 
    policy=None,
    controlled_agents=1, 
    total_steps=TOTAL_STEPS
)
