## Comparing best agent's performance with classical baseline

### Three Steps

1. Identify the checkpoint with the best evaluation performance.
2. Restore the agent from the identified checkpoint.
3. Run the restored agent for 100000 episodes with the same seed that was used to compute the baseline.

### Steps 1 and 2: Identifying the best checkpoint and restoring the agent from it

In [1]:
from pathlib import Path

from ray.rllib.agents.ppo import PPOTrainer
from ray.tune import ExperimentAnalysis
from ray.tune.registry import register_env

from inventory_env.env_creator import inventory_env_creator

# Register the environment creator with RLlib under the "inventory_env" key
# (presumably the env name stored in the trial configs -- confirm).
register_env("inventory_env", inventory_env_creator)

# The experiment results are resolved relative to the parent of the current
# working directory.
path_to_results_dir = Path().absolute().parent.joinpath(
    "inventory_management_rl",
    "experiments",
    "experiment_results",
    "experiment_wrappers",
)

# Step 1: find the trial, and then the checkpoint of that trial, that
# maximises the mean evaluation episode reward.
analysis = ExperimentAnalysis(
    path_to_results_dir,
    default_metric="evaluation/episode_reward_mean",
    default_mode="max",
)
best_trial = analysis.get_best_trial(scope="all")
best_checkpoint = analysis.get_best_checkpoint(best_trial)

# Step 2: rebuild a PPO trainer from the best trial's own config and load the
# selected checkpoint into it.
config = best_trial.config
agent = PPOTrainer(config=config)
agent.restore(best_checkpoint)

[2m[36m(RolloutWorker pid=1732)[0m   logger.warn(
  logger.warn(
2023-04-27 17:09:53,237	INFO trainable.py:495 -- Restored on 192.168.0.178 from checkpoint: /home/dibya/Dropbox/rl_course/real_world_deep_rl/inventory_management_rl/experiments/experiment_results/experiment_wrappers/PPO_inventory_env_b7bbd_00003_3_obs_filter=my_normalize,reward_filter=gym_scale_rewards,obs_filter=my_normalize_2023-03-30_14-31-15/checkpoint_003000/checkpoint-3000
2023-04-27 17:09:53,238	INFO trainable.py:503 -- Current state after restoring: {'_iteration': 3000, '_timesteps_total': 12000000, '_time_total': 32642.497244119644, '_episodes_total': 133333}


### Step 3: Run restored agent for 100000 episodes with seed 0

In [2]:
import numpy as np

# Build a fresh evaluation environment with seed 0.
# "my_normalize" matches the obs_filter shown in the restored trial's name in
# the restore log above. The reward filter is switched off, presumably so that
# episode returns are reported in raw, unscaled units comparable with the
# classical baseline -- confirm against the baseline computation.
env = inventory_env_creator(
    {
        "obs_filter": "my_normalize",
        "reward_filter": None,
    },
    seed=0,
)
num_episodes = 100000
all_r = []  # undiscounted return of every evaluation episode
for _ in range(num_episodes):
    obs = env.reset()
    ep_r = 0
    done = False
    while not done:
        # `compute_action` is a deprecated alias in RLlib; call the supported
        # `compute_single_action` directly with the same arguments.
        action = agent.compute_single_action(obs, unsquash_action=True)
        # The policy output is rounded to the nearest integer before it is
        # passed to the environment.
        obs, r, done, _ = env.step(np.around(action))
        ep_r += r
    all_r.append(ep_r)
# NOTE(review): despite its name, `baseline` holds the restored agent's mean
# episode return, i.e. the number to compare against the classical baseline.
# The name is kept so that any later cell referring to it keeps working.
baseline = sum(all_r) / num_episodes
print(baseline)



175694.70564031467
