## Evaluation
Please provide a list of directories `result_dirs` to be evalutated. Use the [table generating section](#results-table) and [plot generating section](#results-plots) to obtain a table and plots respectively. The result metrics for the same hyperparameter settings will be averaged over multiple seed runs (if multiple exist), and the plots will be shown for all seeds.


In [1]:
import torch
import numpy as np
import argparse
import json
import os
import sys
from collections import defaultdict
from tqdm import tqdm
from main import set_seed
from environment import Environment
from pathlib import Path
from constraints import ForwardConstraints
from trainer import Trainer
from reward import od_utility

from constants import device

result_dirs = ["example-runs/diagonal_5x5_20220820_14_32_40.304876"]

for r in result_dirs:
    assert os.path.exists(r), f"Result dir list contains at least one directory that does not exist: {r}"

def load_state(resdir):
    """Load the model and corresponding environment to be evaluated. Also returns the arguments.
       Args:
       ----
       resdir (str): the path to the trained model directory."""
    with open(os.path.join(resdir, "args.txt")) as argfile:
        args_dict = json.load(argfile)
    args = argparse.Namespace(**args_dict)
    if args.seed:
        set_seed(args.seed)
    environment = Environment(Path(f"./environments/{args.environment}"), 
                              groups_file=args.groups_file, 
                              reward_scaling_fn=args.cf_reward_scaling, 
                              efficient_station_fn=args.cf_efficient_station,  # Not used
                              dmin=args.cf_dmin,  # Not used
                              dmax=args.cf_dmax)  # Not used
    constraints = ForwardConstraints(environment.grid_x_size, environment.grid_y_size, environment.existing_lines_full, environment.grid_to_vector)
    trainer = Trainer(environment, constraints, args)
    trainer.actor.eval()

    return args, environment, trainer

def evaluate_model(args, environment, trainer):
    """Evaluates a given model on a given environment."""

    # Setup the initial static and dynamic states.
    static = environment.static
    dynamic = torch.zeros((1, args.dynamic_size, environment.grid_size),
                              device=device).float()  # size with batch
    
    with torch.no_grad():
        # Since the line generation process is deterministic due to the greedy sampling
        # approach, we only need one line to determine the reward
        generated_line, _ = trainer.actor(static, dynamic, args.station_num_lim, decoder_input=None, last_hh=None)
        satisfied_od = od_utility(generated_line, environment, args.constraint_free)
    
    return satisfied_od, generated_line


In [2]:
args = []
seeds = defaultdict(list)
evaluation = defaultdict(list)

# Evaluate all models using their own reward function
for resdir in tqdm(result_dirs):
    arg, environment, trainer = load_state(resdir)
    satisfied_od, generated_line = evaluate_model(arg, environment, trainer)
    arg = vars(arg)
    seed = arg.pop("seed")

    # if the exact arguments (without seed) are already in the list set arg_id to that index
    if arg in args:
        arg_id = args.index(arg)
    # else, make a new arg_id and append the argument settings
    else:
        arg_id = len(args)
        args.append(arg)
    seeds[arg_id].append(seed)
    evaluation[arg_id].append((satisfied_od, generated_line))

print("#"*30, "\nAll models evaluated, please note the argument keys below.")
print(list(args[0].keys()))


  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]
[W NNPACK.cpp:79] Could not initialize NNPACK! Reason: Unsupported hardware.
  grid_x = (vector_idx // self.grid_y_size)
100%|██████████| 1/1 [00:00<00:00, 16.85it/s]

No groups file provided. Trying to use the default groups file.
Number of trainable parameters actor-critic: 182912 / 139181
############################## 
All models evaluated, please note the argument keys below.
['hidden_size', 'static_size', 'dynamic_size', 'num_layers', 'dropout', 'checkpoint', 'test', 'epoch_max', 'train_size', 'line_unit_price', 'station_price', 'result_path', 'actor_lr', 'critic_lr', 'actor_mlp_layers', 'critic_mlp_layers', 'station_num_lim', 'budget', 'max_grad_norm', 'environment', 'reward', 'ses_weight', 'var_lambda', 'ggi_weight', 'groups_file', 'arch', 'no_log', 'use_abs', 'early_stopping', 'constraint_free', 'cf_reward_scaling', 'cf_efficient_station', 'cf_station_density', 'cf_dmin', 'cf_dmax', 'plot_every']





## Results table
Set the column names (`TABLE_COLUMNS`) based on the ouput of the previous cell. The mean od reward and od reward standard deviation are always used. Furthermore, set the desired reward decimal expansion using `rounding`, and the desired column separator using `sep`. The `seed` argument cannot be selected as a column name, since it is used for averaging.

In [3]:
TABLE_COLUMNS = ["arch", "actor_mlp_layers", "critic_mlp_layers", "constraint_free", "cf_reward_scaling"]
rounding = 3
sep = '    '

table_header = f"{sep.join(TABLE_COLUMNS)}{sep}mean reward{sep}std. reward"
table_column_len = [len(c) for c in table_header.split(sep)]
print(table_header)

for arg_id, arg_setting in enumerate(args):
    # Get the rewards
    if len(evaluation[arg_id][0]) == 1:
        mean_reward = round(evaluation[arg_id][0], rounding)
        std_reward = "n.a."
    else:
        reward_list = np.array([r for r, _ in evaluation[arg_id]])
        mean_reward = round(np.mean(reward_list), rounding)
        std_reward = round(np.std(reward_list), rounding)
    
    # Make a list with the table values
    entries = [str(arg_setting[val]) for val in TABLE_COLUMNS]
    entries.append(str(mean_reward))
    entries.append(str(std_reward))

    # Truncate the values to prevent cells with too much with, and right pad values that are too short
    entries = [s[:l] for s, l in zip(entries, table_column_len)]
    entries = [s.rjust(l, ' ') for s, l in zip(entries, table_column_len)]
    print(f"{sep.join(entries)}")


arch    actor_mlp_layers    critic_mlp_layers    constraint_free    cf_reward_scaling    mean reward    std. reward
poin                   5                    4              False               linear          0.417            0.0


## Results plots
Todo