# RL Taxi Driver 
One of the tutorials recommended in the OpenAI Gym docs. The instructions can be found here: https://www.learndatasci.com/tutorials/reinforcement-q-learning-scratch-python-openai-gym/

In [1]:
import gym
from IPython.display import clear_output
from time import sleep
import numpy as np
import random
from IPython.display import clear_output

In [2]:
def print_frames(frames, delay=.1):
    """Replay recorded environment frames as an in-place text animation.

    Parameters
    ----------
    frames : iterable of dict
        Each entry must provide the keys 'frame' (the text that is printed
        for the step), 'state', 'action' and 'reward'.
    delay : float, optional
        Seconds to pause after each frame. Defaults to 0.1, the pause that
        was previously hard-coded; pass 0 to print without waiting.
    """
    for i, frame in enumerate(frames):
        # Clear the previous frame so the output area shows one frame at a time.
        clear_output(wait=True)
        print(frame['frame'])
        # Timesteps are reported 1-based.
        print(f"Timestep: {i + 1}")
        print(f"State: {frame['state']}")
        print(f"Action: {frame['action']}")
        print(f"Reward: {frame['reward']}")
        sleep(delay)

Random walk to taxi

In [3]:
# Random agent: keep sampling actions until the environment reports `done`,
# recording every step so it can be replayed with print_frames afterwards.
env = gym.make("Taxi-v3").env
env.s = 328  # start from the state used in the tutorial's illustration
env.render()

frames = []  # one dict per step, consumed by print_frames
epochs = penalties = reward = 0
done = False

while True:
    action = env.action_space.sample()
    state, reward, done, info = env.step(action)

    # Every step that yields a reward of -10 is counted as a penalty.
    penalties += 1 if reward == -10 else 0

    # Keep the rendered text together with the transition that produced it.
    frames.append(dict(
        frame=env.render(mode='ansi'),
        state=state,
        action=action,
        reward=reward,
    ))
    epochs += 1

    if done:
        break

print_frames(frames)
print("Timesteps taken: {}".format(epochs))
print("Penalties incurred: {}".format(penalties))

+---------+
|[35m[34;1m[43mR[0m[0m[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (Dropoff)

Timestep: 3191
State: 0
Action: 5
Reward: 20
Timesteps taken: 3191
Penalties incurred: 1023


Q Learning

In [4]:
%%time
"""Training the agent"""

q_table = np.zeros([env.observation_space.n, env.action_space.n])

# Hyperparameters
alpha = 0.1
gamma = 0.6
epsilon = 0.1

# For plotting metrics
all_epochs = []
all_penalties = []

for i in range(1, 100001):
    state = env.reset()

    epochs, penalties, reward, = 0, 0, 0
    done = False
    
    while not done:
        if random.uniform(0, 1) < epsilon:
            action = env.action_space.sample() # Explore action space
        else:
            action = np.argmax(q_table[state]) # Exploit learned values

        next_state, reward, done, info = env.step(action) 
        
        old_value = q_table[state, action]
        next_max = np.max(q_table[next_state])
        
        new_value = (1 - alpha) * old_value + alpha * (reward + gamma * next_max)
        q_table[state, action] = new_value

        if reward == -10:
            penalties += 1

        state = next_state
        epochs += 1
        
    if i % 100 == 0:
        clear_output(wait=True)
        print(f"Episode: {i}")

print("Training finished.\n")

Episode: 100000
Training finished.

CPU times: user 47 s, sys: 8.62 s, total: 55.6 s
Wall time: 46.2 s


In [5]:
# Show the row of q_table for state 328 (the state assigned to env.s in the
# random-walk cell); the row holds one value per action.
q_table[328]

array([ -2.40416292,  -2.27325184,  -2.39922046,  -2.35773077,
       -10.52651192, -10.21776502])

In [6]:
"""Evaluate agent's performance after Q-learning"""

# Run a fixed number of episodes, always taking the action with the highest
# Q-value, and report the average episode length and penalty count.
episodes = 100
total_epochs = total_penalties = 0

for _ in range(episodes):
    state = env.reset()
    epochs = penalties = reward = 0
    done = False

    while not done:
        action = np.argmax(q_table[state])
        state, reward, done, info = env.step(action)

        # A reward of -10 counts as one penalty.
        penalties += int(reward == -10)
        epochs += 1

    total_epochs += epochs
    total_penalties += penalties

print(f"Results after {episodes} episodes:")
print(f"Average timesteps per episode: {total_epochs / episodes}")
print(f"Average penalties per episode: {total_penalties / episodes}")

Results after 100 episodes:
Average timesteps per episode: 12.41
Average penalties per episode: 0.0


In [10]:
import ray
from ray.rllib.agents.ppo import PPOTrainer

In [None]:
import os

# Configure the algorithm.
config = {
    # Environment (RLlib understands openAI gym registered strings).
    "env": "Taxi-v3",
    # Directory for episode recordings. This path used to be passed as
    # `render_env`, which RLlib treats as an on/off switch for rendering;
    # `record_env` is the setting that accepts a directory. "~" is expanded
    # here so the path does not depend on a later component resolving it.
    "record_env": os.path.expanduser("~/Documents/ExploringRL/Recordings"),
    # Use 4 environment workers (aka "rollout workers") that parallelly
    # collect samples from their own environment clone(s).
    "num_workers": 4,
    "horizon": 10000,
    # Change this to "framework: tf", if you are using TensorFlow.
    # Also, use "framework: tf2" for tf2.x eager execution.
    "framework": "torch",
    # Since learning is most of the time done on the local worker,
    # it may help to provide one or more GPUs to that worker via the num_gpus setting
    "num_gpus": 1,
    # Tweak the default model provided automatically by RLlib,
    # given the environment's observation- and action spaces.
    "model": {
        "fcnet_hiddens": [64, 64],
        "fcnet_activation": "relu",
    },
    # Set up a separate evaluation worker set for the
    # `trainer.evaluate()` call after training (see below).
    "evaluation_num_workers": 1,
    # Only for evaluation runs, render the env.
    "evaluation_config": {
        "render_env": True,
    },
}

# Create our RLlib Trainer.
trainer = PPOTrainer(config=config)

# Run it for n training iterations. A training iteration includes
# parallel sample collection by the environment workers as well as
# loss calculation on the collected batch and a model update.
for _ in range(10):
    print(trainer.train())

# Evaluate the trained Trainer (and render each timestep to the shell's
# output).
trainer.evaluate()

In [26]:
import pickle

# Persist the PPO trainer built by `PPOTrainer(config=config)`. The cell used
# to dump `agent`, a name that is only bound further down the notebook, so it
# raised NameError when the notebook was run top to bottom.
# NOTE(review): RLlib also offers `trainer.save()` for checkpoints; confirm
# that a pickled trainer can actually be loaded back before relying on it.
with open("trainer.pkl", 'wb') as f:
    pickle.dump(trainer, f)

In [None]:
"""Evaluate PPO agent's performance after learning"""

# Roll out the PPO trainer's policy for a fixed number of episodes and report
# the average episode length and the average number of -10 rewards.
episodes = 100
total_epochs, total_penalties = 0, 0

for _ in range(episodes):
    state, done = env.reset(), False
    epochs = penalties = reward = 0

    while True:
        action = trainer.compute_single_action(state)
        state, reward, done, info = env.step(action)

        if reward == -10:
            penalties = penalties + 1
        epochs = epochs + 1

        if done:
            break

    total_epochs, total_penalties = total_epochs + epochs, total_penalties + penalties

print(f"Results after {episodes} episodes:")
print(f"Average timesteps per episode: {total_epochs / episodes}")
print(f"Average penalties per episode: {total_penalties / episodes}")

In [None]:
# Rebind `env` to a newly created Taxi-v3 environment; `.env` takes the inner
# environment object off whatever gym.make returns.
# NOTE(review): presumably this strips gym's outer wrapper(s) - confirm.
env = gym.make("Taxi-v3").env
# Print the environment's `s` attribute (the attribute the random-walk cell
# assigns 328 to), then render the environment.
print(env.s)
env.render()

In [None]:
# Reset the environment; as the cell's last expression, the value returned by
# reset() is what the notebook displays.
env.reset()

In [6]:
import pandas as pd
import json
import os
import shutil
import sys
import ray
import ray.rllib.agents.ppo as ppo


In [1]:
from ray.tune.logger import DEFAULT_LOGGERS
from ray.tune.integration.wandb import WandbLogger

In [None]:
# Start Ray and keep whatever ray.init returns.
# NOTE(review): ignore_reinit_error=True presumably makes re-running this cell
# harmless when Ray is already up - confirm against the ray.init docs.
# NOTE: this rebinds `info`, the name earlier cells use for the fourth value
# returned by env.step().
info = ray.init(ignore_reinit_error=True)

In [8]:
# Directory that receives the PPO checkpoints. Anything left there by an
# earlier run is removed; a missing directory is not an error.
checkpoint_root = "tmp/ppo/taxi"
shutil.rmtree(checkpoint_root, ignore_errors=True)

In [9]:
SELECT_ENV = "Taxi-v3"
N_ITER = 20

# Start from a copy of the PPO default config and override three settings.
config = ppo.DEFAULT_CONFIG.copy()
config.update({"log_level": "WARN", "framework": "torch", "num_gpus": 4})
# config["callbacks"] = WandbLoggerCallback(api_key_file="/home/sem22h2/.netrc", project="taxi-v3")

agent = ppo.PPOTrainer(config, env=SELECT_ENV)
results, episode_data, episode_json = [], [], []

for n in range(N_ITER):
    result = agent.train()
    results.append(result)

    # Keep only the summary statistics of this iteration, tagged with its index.
    episode = {'n': n}
    for key in ('episode_reward_min', 'episode_reward_mean',
                'episode_reward_max', 'episode_len_mean'):
        episode[key] = result[key]

    episode_data.append(episode)
    episode_json.append(json.dumps(episode))

    # Save a checkpoint after every iteration and report where it went.
    file_name = agent.save(checkpoint_root)

    print(f'{n+1:3d}: Min/Mean/Max reward: {result["episode_reward_min"]:8.4f}/{result["episode_reward_mean"]:8.4f}/{result["episode_reward_max"]:8.4f}, len mean: {result["episode_len_mean"]:8.4f}. Checkpoint saved to {file_name}')

ValueError: `callbacks` must be a callable method that returns a subclass of DefaultCallbacks, got <ray.tune.integration.wandb.WandbLoggerCallback object at 0x7fc5c6d16d30>!

In [21]:
import pprint

# Fetch the policy held by `agent` and the model attached to that policy.
policy = agent.get_policy()
model = policy.model

# Pretty-print the model's variables, then the result of value_function().
# NOTE(review): value_function() may only be meaningful after the model has
# processed a batch - confirm against the RLlib model API before relying on
# the second printout.
pprint.pprint(model.variables())
pprint.pprint(model.value_function())


[Parameter containing:
tensor([[ 0.0484,  0.0386, -0.0780,  ..., -0.0539, -0.0186, -0.0189],
        [-0.0605, -0.0826, -0.0100,  ..., -0.0543, -0.0494, -0.0333],
        [ 0.0751,  0.0094,  0.0455,  ...,  0.1059,  0.0092, -0.0261],
        [-0.0117, -0.0598,  0.0818,  ...,  0.0364, -0.0492, -0.0255],
        [-0.0592,  0.0431, -0.0732,  ..., -0.0676,  0.0676,  0.0304],
        [ 0.0083,  0.0732,  0.0018,  ...,  0.0290,  0.0547,  0.0975]],
       device='cuda:0', requires_grad=True),
 Parameter containing:
tensor([ 0.0043,  0.0148,  0.0079,  0.0059, -0.0152, -0.0172], device='cuda:0',
       requires_grad=True),
 Parameter containing:
tensor([[ 0.0708,  0.0101, -0.0280,  ..., -0.0074, -0.0139, -0.0591],
        [ 0.0276,  0.0846, -0.0561,  ...,  0.0012, -0.0517,  0.1160],
        [-0.0172,  0.0208, -0.0893,  ..., -0.0526, -0.0121, -0.0697],
        ...,
        [-0.0116, -0.0449, -0.0796,  ..., -0.0394,  0.0614,  0.0321],
        [ 0.0129, -0.0071,  0.0489,  ...,  0.0340, -0.0889,  0.0

AttributeError: 'FullyConnectedNetwork' object has no attribute 'base_model'

In [1]:
import ray
from ray.tune.integration.wandb import WandbLoggerCallback

In [17]:
import os

SELECT_ENV = "Taxi-v3"
N_ITER = 20

# Ray may already have been started by an earlier cell; without
# ignore_reinit_error a second plain ray.init() raises.
ray.init(ignore_reinit_error=True)

# SECURITY: the Weights & Biases API key must not live in the notebook. It is
# read from the WANDB_API_KEY environment variable instead; when the variable
# is unset, None is passed and the callback has to rely on an existing
# `wandb login`. The key that used to be committed in this cell should be
# treated as leaked and revoked.
ray.tune.run(
    "PPO",
    stop={"training_iteration": 15},
    config={
        "env": SELECT_ENV,
        "record_env": True,
        "framework": "torch",
        "num_cpus_per_worker": 2,
        "num_gpus": 2,
        "num_workers": 4,
    },
    local_dir="logs",
    callbacks=[
        WandbLoggerCallback(
            api_key=os.environ.get("WANDB_API_KEY"),
            project="Taxi_v3",
        )
    ],
)


[34m[1mwandb[0m: Currently logged in as: [33mdanky[0m. Use [1m`wandb login --relogin`[0m to force relogin


[2m[36m(PPOTrainer pid=39908)[0m 2022-07-08 07:50:26,739	INFO ppo.py:414 -- In multi-agent mode, policies will be optimized sequentially by the multi-GPU optimizer. Consider setting simple_optimizer=True if this doesn't work for you.
[2m[36m(PPOTrainer pid=39908)[0m 2022-07-08 07:50:26,739	INFO trainer.py:903 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.


[2m[36m(RolloutWorker pid=40009)[0m Setting the path for recording to /home/sem22h2/Documents/ExploringRL/logs/PPO/PPO_Taxi-v3_d9332_00000_0_2022-07-08_07-50-19/
[2m[36m(RolloutWorker pid=40010)[0m Setting the path for recording to /home/sem22h2/Documents/ExploringRL/logs/PPO/PPO_Taxi-v3_d9332_00000_0_2022-07-08_07-50-19/
[2m[36m(RolloutWorker pid=40008)[0m Setting the path for recording to /home/sem22h2/Documents/ExploringRL/logs/PPO/PPO_Taxi-v3_d9332_00000_0_2022-07-08_07-50-19/
[2m[36m(RolloutWorker pid=40007)[0m Setting the path for recording to /home/sem22h2/Documents/ExploringRL/logs/PPO/PPO_Taxi-v3_d9332_00000_0_2022-07-08_07-50-19/


Trial name,status,loc
PPO_Taxi-v3_d9332_00000,RUNNING,129.132.4.155:39908




Trial name,status,loc
PPO_Taxi-v3_d9332_00000,RUNNING,129.132.4.155:39908


Trial name,status,loc
PPO_Taxi-v3_d9332_00000,RUNNING,129.132.4.155:39908


Trial name,status,loc
PPO_Taxi-v3_d9332_00000,RUNNING,129.132.4.155:39908


Trial name,status,loc
PPO_Taxi-v3_d9332_00000,RUNNING,129.132.4.155:39908


Result for PPO_Taxi-v3_d9332_00000:
  agent_timesteps_total: 4000
  counters:
    num_agent_steps_sampled: 4000
    num_agent_steps_trained: 4000
    num_env_steps_sampled: 4000
    num_env_steps_trained: 4000
  custom_metrics: {}
  date: 2022-07-08_07-50-55
  done: false
  episode_len_mean: 192.35
  episode_media: {}
  episode_reward_max: -161.0
  episode_reward_mean: -718.25
  episode_reward_min: -911.0
  episodes_this_iter: 20
  episodes_total: 20
  experiment_id: 7b251ced4d844a0fb8570d8f93aed46f
  hostname: sassauna2.ee.ethz.ch
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.20000000000000004
          cur_lr: 5.0000000000000016e-05
          entropy: 1.7863648080056713
          entropy_coeff: 0.0
          kl: 0.005373008478970527
          policy_loss: -0.007753264146947092
          total_loss: 9.930198399738599
          vf_explained_var: -0.00038039838114092427
          vf_loss: 9.936877073267455
        

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_Taxi-v3_d9332_00000,RUNNING,129.132.4.155:39908,1,20.833,4000,-718.25,-161,-911,192.35


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_Taxi-v3_d9332_00000,RUNNING,129.132.4.155:39908,1,20.833,4000,-718.25,-161,-911,192.35


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_Taxi-v3_d9332_00000,RUNNING,129.132.4.155:39908,1,20.833,4000,-718.25,-161,-911,192.35


Result for PPO_Taxi-v3_d9332_00000:
  agent_timesteps_total: 8000
  counters:
    num_agent_steps_sampled: 8000
    num_agent_steps_trained: 8000
    num_env_steps_sampled: 8000
    num_env_steps_trained: 8000
  custom_metrics: {}
  date: 2022-07-08_07-51-12
  done: false
  episode_len_mean: 195.825
  episode_media: {}
  episode_reward_max: -161.0
  episode_reward_mean: -747.375
  episode_reward_min: -992.0
  episodes_this_iter: 20
  episodes_total: 40
  experiment_id: 7b251ced4d844a0fb8570d8f93aed46f
  hostname: sassauna2.ee.ethz.ch
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.20000000000000004
          cur_lr: 5.0000000000000016e-05
          entropy: 1.7641091427495403
          entropy_coeff: 0.0
          kl: 0.011139419508396909
          policy_loss: -0.02964115735864447
          total_loss: 9.86916392234064
          vf_explained_var: -0.00023465028373144007
          vf_loss: 9.8965771859692
        mo

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_Taxi-v3_d9332_00000,RUNNING,129.132.4.155:39908,2,37.7589,8000,-747.375,-161,-992,195.825


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_Taxi-v3_d9332_00000,RUNNING,129.132.4.155:39908,2,37.7589,8000,-747.375,-161,-992,195.825


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_Taxi-v3_d9332_00000,RUNNING,129.132.4.155:39908,2,37.7589,8000,-747.375,-161,-992,195.825


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_Taxi-v3_d9332_00000,RUNNING,129.132.4.155:39908,2,37.7589,8000,-747.375,-161,-992,195.825


Result for PPO_Taxi-v3_d9332_00000:
  agent_timesteps_total: 12000
  counters:
    num_agent_steps_sampled: 12000
    num_agent_steps_trained: 12000
    num_env_steps_sampled: 12000
    num_env_steps_trained: 12000
  custom_metrics: {}
  date: 2022-07-08_07-51-33
  done: false
  episode_len_mean: 197.21666666666667
  episode_media: {}
  episode_reward_max: -161.0
  episode_reward_mean: -750.6166666666667
  episode_reward_min: -992.0
  episodes_this_iter: 20
  episodes_total: 60
  experiment_id: 7b251ced4d844a0fb8570d8f93aed46f
  hostname: sassauna2.ee.ethz.ch
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.20000000000000004
          cur_lr: 5.0000000000000016e-05
          entropy: 1.7331176734739735
          entropy_coeff: 0.0
          kl: 0.012614530745003521
          policy_loss: -0.035756791350982524
          total_loss: 9.860579955193304
          vf_explained_var: 0.0026598346489731983
          vf_loss: 

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_Taxi-v3_d9332_00000,RUNNING,129.132.4.155:39908,3,58.1432,12000,-750.617,-161,-992,197.217


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_Taxi-v3_d9332_00000,RUNNING,129.132.4.155:39908,3,58.1432,12000,-750.617,-161,-992,197.217


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_Taxi-v3_d9332_00000,RUNNING,129.132.4.155:39908,3,58.1432,12000,-750.617,-161,-992,197.217


Result for PPO_Taxi-v3_d9332_00000:
  agent_timesteps_total: 16000
  counters:
    num_agent_steps_sampled: 16000
    num_agent_steps_trained: 16000
    num_env_steps_sampled: 16000
    num_env_steps_trained: 16000
  custom_metrics: {}
  date: 2022-07-08_07-51-51
  done: false
  episode_len_mean: 195.4125
  episode_media: {}
  episode_reward_max: -161.0
  episode_reward_mean: -734.625
  episode_reward_min: -992.0
  episodes_this_iter: 20
  episodes_total: 80
  experiment_id: 7b251ced4d844a0fb8570d8f93aed46f
  hostname: sassauna2.ee.ethz.ch
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.20000000000000004
          cur_lr: 5.0000000000000016e-05
          entropy: 1.697235063199074
          entropy_coeff: 0.0
          kl: 0.015385766949908463
          policy_loss: -0.04312300187765911
          total_loss: 9.803589567574122
          vf_explained_var: 0.0005163910247946298
          vf_loss: 9.843635450383669
    

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_Taxi-v3_d9332_00000,RUNNING,129.132.4.155:39908,4,76.9114,16000,-734.625,-161,-992,195.412


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_Taxi-v3_d9332_00000,RUNNING,129.132.4.155:39908,4,76.9114,16000,-734.625,-161,-992,195.412


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_Taxi-v3_d9332_00000,RUNNING,129.132.4.155:39908,4,76.9114,16000,-734.625,-161,-992,195.412


Result for PPO_Taxi-v3_d9332_00000:
  agent_timesteps_total: 20000
  counters:
    num_agent_steps_sampled: 20000
    num_agent_steps_trained: 20000
    num_env_steps_sampled: 20000
    num_env_steps_trained: 20000
  custom_metrics: {}
  date: 2022-07-08_07-52-12
  done: false
  episode_len_mean: 192.24
  episode_media: {}
  episode_reward_max: 1.0
  episode_reward_mean: -707.4
  episode_reward_min: -992.0
  episodes_this_iter: 23
  episodes_total: 103
  experiment_id: 7b251ced4d844a0fb8570d8f93aed46f
  hostname: sassauna2.ee.ethz.ch
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.20000000000000004
          cur_lr: 5.0000000000000016e-05
          entropy: 1.6548978668387218
          entropy_coeff: 0.0
          kl: 0.017177758069235376
          policy_loss: -0.05036356307325825
          total_loss: 9.775694449229906
          vf_explained_var: -0.005549960046686152
          vf_loss: 9.822622440707299
        m

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_Taxi-v3_d9332_00000,RUNNING,129.132.4.155:39908,5,96.9814,20000,-707.4,1,-992,192.24


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_Taxi-v3_d9332_00000,RUNNING,129.132.4.155:39908,5,96.9814,20000,-707.4,1,-992,192.24


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_Taxi-v3_d9332_00000,RUNNING,129.132.4.155:39908,5,96.9814,20000,-707.4,1,-992,192.24


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_Taxi-v3_d9332_00000,RUNNING,129.132.4.155:39908,5,96.9814,20000,-707.4,1,-992,192.24


Result for PPO_Taxi-v3_d9332_00000:
  agent_timesteps_total: 24000
  counters:
    num_agent_steps_sampled: 24000
    num_agent_steps_trained: 24000
    num_env_steps_sampled: 24000
    num_env_steps_trained: 24000
  custom_metrics: {}
  date: 2022-07-08_07-52-31
  done: false
  episode_len_mean: 193.84
  episode_media: {}
  episode_reward_max: 1.0
  episode_reward_mean: -699.85
  episode_reward_min: -992.0
  episodes_this_iter: 20
  episodes_total: 123
  experiment_id: 7b251ced4d844a0fb8570d8f93aed46f
  hostname: sassauna2.ee.ethz.ch
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.20000000000000004
          cur_lr: 5.0000000000000016e-05
          entropy: 1.6063096974485664
          entropy_coeff: 0.0
          kl: 0.01739487789018194
          policy_loss: -0.0452695393033566
          total_loss: 9.819673211087462
          vf_explained_var: -0.006531480275174623
          vf_loss: 9.861463797989712
        mo

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_Taxi-v3_d9332_00000,RUNNING,129.132.4.155:39908,6,115.918,24000,-699.85,1,-992,193.84


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_Taxi-v3_d9332_00000,RUNNING,129.132.4.155:39908,6,115.918,24000,-699.85,1,-992,193.84


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_Taxi-v3_d9332_00000,RUNNING,129.132.4.155:39908,6,115.918,24000,-699.85,1,-992,193.84


Result for PPO_Taxi-v3_d9332_00000:
  agent_timesteps_total: 28000
  counters:
    num_agent_steps_sampled: 28000
    num_agent_steps_trained: 28000
    num_env_steps_sampled: 28000
    num_env_steps_trained: 28000
  custom_metrics: {}
  date: 2022-07-08_07-52-51
  done: false
  episode_len_mean: 192.36
  episode_media: {}
  episode_reward_max: 1.0
  episode_reward_mean: -672.06
  episode_reward_min: -947.0
  episodes_this_iter: 21
  episodes_total: 144
  experiment_id: 7b251ced4d844a0fb8570d8f93aed46f
  hostname: sassauna2.ee.ethz.ch
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.20000000000000004
          cur_lr: 5.0000000000000016e-05
          entropy: 1.5517668376686753
          entropy_coeff: 0.0
          kl: 0.015291083890931617
          policy_loss: -0.03673599322717036
          total_loss: 9.803154233706895
          vf_explained_var: 0.0011765173365992883
          vf_loss: 9.836832031126946
        

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_Taxi-v3_d9332_00000,RUNNING,129.132.4.155:39908,7,135.958,28000,-672.06,1,-947,192.36


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_Taxi-v3_d9332_00000,RUNNING,129.132.4.155:39908,7,135.958,28000,-672.06,1,-947,192.36


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_Taxi-v3_d9332_00000,RUNNING,129.132.4.155:39908,7,135.958,28000,-672.06,1,-947,192.36


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_Taxi-v3_d9332_00000,RUNNING,129.132.4.155:39908,7,135.958,28000,-672.06,1,-947,192.36


Result for PPO_Taxi-v3_d9332_00000:
  agent_timesteps_total: 32000
  counters:
    num_agent_steps_sampled: 32000
    num_agent_steps_trained: 32000
    num_env_steps_sampled: 32000
    num_env_steps_trained: 32000
  custom_metrics: {}
  date: 2022-07-08_07-53-09
  done: false
  episode_len_mean: 190.69
  episode_media: {}
  episode_reward_max: 1.0
  episode_reward_mean: -638.74
  episode_reward_min: -947.0
  episodes_this_iter: 20
  episodes_total: 164
  experiment_id: 7b251ced4d844a0fb8570d8f93aed46f
  hostname: sassauna2.ee.ethz.ch
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.20000000000000004
          cur_lr: 5.0000000000000016e-05
          entropy: 1.5699470486692202
          entropy_coeff: 0.0
          kl: 0.019756655226010573
          policy_loss: -0.050471311515217186
          total_loss: 9.828002715367143
          vf_explained_var: -0.0049945534557424565
          vf_loss: 9.874522669597338
      

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_Taxi-v3_d9332_00000,RUNNING,129.132.4.155:39908,8,153.754,32000,-638.74,1,-947,190.69


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_Taxi-v3_d9332_00000,RUNNING,129.132.4.155:39908,8,153.754,32000,-638.74,1,-947,190.69


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_Taxi-v3_d9332_00000,RUNNING,129.132.4.155:39908,8,153.754,32000,-638.74,1,-947,190.69


Result for PPO_Taxi-v3_d9332_00000:
  agent_timesteps_total: 36000
  counters:
    num_agent_steps_sampled: 36000
    num_agent_steps_trained: 36000
    num_env_steps_sampled: 36000
    num_env_steps_trained: 36000
  custom_metrics: {}
  date: 2022-07-08_07-53-29
  done: false
  episode_len_mean: 190.54
  episode_media: {}
  episode_reward_max: -103.0
  episode_reward_mean: -610.57
  episode_reward_min: -947.0
  episodes_this_iter: 22
  episodes_total: 186
  experiment_id: 7b251ced4d844a0fb8570d8f93aed46f
  hostname: sassauna2.ee.ethz.ch
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.20000000000000004
          cur_lr: 5.0000000000000016e-05
          entropy: 1.5202861071914755
          entropy_coeff: 0.0
          kl: 0.016428695617064674
          policy_loss: -0.04578051389225068
          total_loss: 9.744190662137923
          vf_explained_var: -0.00655690130367074
          vf_loss: 9.786685432926301
      

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_Taxi-v3_d9332_00000,RUNNING,129.132.4.155:39908,9,173.61,36000,-610.57,-103,-947,190.54


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_Taxi-v3_d9332_00000,RUNNING,129.132.4.155:39908,9,173.61,36000,-610.57,-103,-947,190.54


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_Taxi-v3_d9332_00000,RUNNING,129.132.4.155:39908,9,173.61,36000,-610.57,-103,-947,190.54


Result for PPO_Taxi-v3_d9332_00000:
  agent_timesteps_total: 40000
  counters:
    num_agent_steps_sampled: 40000
    num_agent_steps_trained: 40000
    num_env_steps_sampled: 40000
    num_env_steps_trained: 40000
  custom_metrics: {}
  date: 2022-07-08_07-53-44
  done: false
  episode_len_mean: 188.77
  episode_media: {}
  episode_reward_max: -103.0
  episode_reward_mean: -582.16
  episode_reward_min: -947.0
  episodes_this_iter: 21
  episodes_total: 207
  experiment_id: 7b251ced4d844a0fb8570d8f93aed46f
  hostname: sassauna2.ee.ethz.ch
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.20000000000000004
          cur_lr: 5.0000000000000016e-05
          entropy: 1.441570834703343
          entropy_coeff: 0.0
          kl: 0.021028203042513547
          policy_loss: -0.04576854933333653
          total_loss: 9.721857324210546
          vf_explained_var: -0.01445419551223837
          vf_loss: 9.763420242391607
       

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_Taxi-v3_d9332_00000,RUNNING,129.132.4.155:39908,10,189.464,40000,-582.16,-103,-947,188.77


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_Taxi-v3_d9332_00000,RUNNING,129.132.4.155:39908,10,189.464,40000,-582.16,-103,-947,188.77


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_Taxi-v3_d9332_00000,RUNNING,129.132.4.155:39908,10,189.464,40000,-582.16,-103,-947,188.77


Result for PPO_Taxi-v3_d9332_00000:
  agent_timesteps_total: 44000
  counters:
    num_agent_steps_sampled: 44000
    num_agent_steps_trained: 44000
    num_env_steps_sampled: 44000
    num_env_steps_trained: 44000
  custom_metrics: {}
  date: 2022-07-08_07-54-04
  done: false
  episode_len_mean: 181.52
  episode_media: {}
  episode_reward_max: -7.0
  episode_reward_mean: -530.0
  episode_reward_min: -875.0
  episodes_this_iter: 24
  episodes_total: 231
  experiment_id: 7b251ced4d844a0fb8570d8f93aed46f
  hostname: sassauna2.ee.ethz.ch
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.3
          cur_lr: 5.0000000000000016e-05
          entropy: 1.4191789801402759
          entropy_coeff: 0.0
          kl: 0.01774508064150573
          policy_loss: -0.043029183958486844
          total_loss: 9.706453352077032
          vf_explained_var: -0.0009574598202141383
          vf_loss: 9.744159033990675
        model: {}
    n

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_Taxi-v3_d9332_00000,RUNNING,129.132.4.155:39908,11,208.877,44000,-530,-7,-875,181.52


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_Taxi-v3_d9332_00000,RUNNING,129.132.4.155:39908,11,208.877,44000,-530,-7,-875,181.52


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_Taxi-v3_d9332_00000,RUNNING,129.132.4.155:39908,11,208.877,44000,-530,-7,-875,181.52


Result for PPO_Taxi-v3_d9332_00000:
  agent_timesteps_total: 48000
  counters:
    num_agent_steps_sampled: 48000
    num_agent_steps_trained: 48000
    num_env_steps_sampled: 48000
    num_env_steps_trained: 48000
  custom_metrics: {}
  date: 2022-07-08_07-54-23
  done: false
  episode_len_mean: 175.69
  episode_media: {}
  episode_reward_max: -7.0
  episode_reward_mean: -486.01
  episode_reward_min: -812.0
  episodes_this_iter: 25
  episodes_total: 256
  experiment_id: 7b251ced4d844a0fb8570d8f93aed46f
  hostname: sassauna2.ee.ethz.ch
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.3
          cur_lr: 5.0000000000000016e-05
          entropy: 1.3725370958928138
          entropy_coeff: 0.0
          kl: 0.01842762184769469
          policy_loss: -0.046458250524536254
          total_loss: 9.666787348511399
          vf_explained_var: -0.007567577528697188
          vf_loss: 9.70771726690313
        model: {}
    nu

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_Taxi-v3_d9332_00000,RUNNING,129.132.4.155:39908,12,228.1,48000,-486.01,-7,-812,175.69


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_Taxi-v3_d9332_00000,RUNNING,129.132.4.155:39908,12,228.1,48000,-486.01,-7,-812,175.69


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_Taxi-v3_d9332_00000,RUNNING,129.132.4.155:39908,12,228.1,48000,-486.01,-7,-812,175.69


Result for PPO_Taxi-v3_d9332_00000:
  agent_timesteps_total: 52000
  counters:
    num_agent_steps_sampled: 52000
    num_agent_steps_trained: 52000
    num_env_steps_sampled: 52000
    num_env_steps_trained: 52000
  custom_metrics: {}
  date: 2022-07-08_07-54-43
  done: false
  episode_len_mean: 179.91
  episode_media: {}
  episode_reward_max: -7.0
  episode_reward_mean: -482.13
  episode_reward_min: -812.0
  episodes_this_iter: 20
  episodes_total: 276
  experiment_id: 7b251ced4d844a0fb8570d8f93aed46f
  hostname: sassauna2.ee.ethz.ch
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.3
          cur_lr: 5.0000000000000016e-05
          entropy: 1.293533303083912
          entropy_coeff: 0.0
          kl: 0.017980076153063324
          policy_loss: -0.03469065641683917
          total_loss: 9.721818778335408
          vf_explained_var: -0.034611613199275026
          vf_loss: 9.751115405174994
        model: {}
    nu

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_Taxi-v3_d9332_00000,RUNNING,129.132.4.155:39908,13,247.956,52000,-482.13,-7,-812,179.91


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_Taxi-v3_d9332_00000,RUNNING,129.132.4.155:39908,13,247.956,52000,-482.13,-7,-812,179.91


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_Taxi-v3_d9332_00000,RUNNING,129.132.4.155:39908,13,247.956,52000,-482.13,-7,-812,179.91


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_Taxi-v3_d9332_00000,RUNNING,129.132.4.155:39908,13,247.956,52000,-482.13,-7,-812,179.91


Result for PPO_Taxi-v3_d9332_00000:
  agent_timesteps_total: 56000
  counters:
    num_agent_steps_sampled: 56000
    num_agent_steps_trained: 56000
    num_env_steps_sampled: 56000
    num_env_steps_trained: 56000
  custom_metrics: {}
  date: 2022-07-08_07-55-04
  done: false
  episode_len_mean: 177.14
  episode_media: {}
  episode_reward_max: -7.0
  episode_reward_mean: -435.8
  episode_reward_min: -812.0
  episodes_this_iter: 23
  episodes_total: 299
  experiment_id: 7b251ced4d844a0fb8570d8f93aed46f
  hostname: sassauna2.ee.ethz.ch
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.3
          cur_lr: 5.0000000000000016e-05
          entropy: 1.1546392670241736
          entropy_coeff: 0.0
          kl: 0.015396402288773566
          policy_loss: -0.03685002359411409
          total_loss: 9.64412277590844
          vf_explained_var: -0.030009642435658363
          vf_loss: 9.676353892459664
        model: {}
    num

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_Taxi-v3_d9332_00000,RUNNING,129.132.4.155:39908,14,268.241,56000,-435.8,-7,-812,177.14


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_Taxi-v3_d9332_00000,RUNNING,129.132.4.155:39908,14,268.241,56000,-435.8,-7,-812,177.14


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_Taxi-v3_d9332_00000,RUNNING,129.132.4.155:39908,14,268.241,56000,-435.8,-7,-812,177.14


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_Taxi-v3_d9332_00000,RUNNING,129.132.4.155:39908,14,268.241,56000,-435.8,-7,-812,177.14


Result for PPO_Taxi-v3_d9332_00000:
  agent_timesteps_total: 60000
  counters:
    num_agent_steps_sampled: 60000
    num_agent_steps_trained: 60000
    num_env_steps_sampled: 60000
    num_env_steps_trained: 60000
  custom_metrics: {}
  date: 2022-07-08_07-55-24
  done: true
  episode_len_mean: 177.92
  episode_media: {}
  episode_reward_max: -7.0
  episode_reward_mean: -404.75
  episode_reward_min: -776.0
  episodes_this_iter: 21
  episodes_total: 320
  experiment_id: 7b251ced4d844a0fb8570d8f93aed46f
  hostname: sassauna2.ee.ethz.ch
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.3
          cur_lr: 5.0000000000000016e-05
          entropy: 1.2126518983353851
          entropy_coeff: 0.0
          kl: 0.01616111890676137
          policy_loss: -0.040561869820599915
          total_loss: 9.644420728375835
          vf_explained_var: -0.04814397872135203
          vf_loss: 9.680134253348074
        model: {}
    num

0,1
agent_timesteps_total,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
counters/num_agent_steps_sampled,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
counters/num_agent_steps_trained,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
counters/num_env_steps_sampled,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
counters/num_env_steps_trained,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
episode_len_mean,▆██▇▆▇▆▆▆▅▃▁▂▁▂
episode_reward_max,▁▁▁▁████▄▄█████
episode_reward_mean,▂▁▁▁▂▂▃▃▄▄▅▆▆▇█
episode_reward_min,▄▁▁▁▁▁▂▂▂▂▅▇▇▇█
episodes_this_iter,▁▁▁▁▅▁▂▁▄▂▇█▁▅▂

0,1
agent_timesteps_total,60000.0
counters/num_agent_steps_sampled,60000.0
counters/num_agent_steps_trained,60000.0
counters/num_env_steps_sampled,60000.0
counters/num_env_steps_trained,60000.0
episode_len_mean,177.92
episode_reward_max,-7.0
episode_reward_mean,-404.75
episode_reward_min,-776.0
episodes_this_iter,21.0


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_Taxi-v3_d9332_00000,TERMINATED,129.132.4.155:39908,15,288.313,60000,-404.75,-7,-776,177.92


[2m[36m(RolloutWorker pid=40009)[0m E0708 07:55:28.858038367   40113 chttp2_transport.cc:1111]   Received a GOAWAY with error code ENHANCE_YOUR_CALM and debug data equal to "too_many_pings"
[2m[36m(RolloutWorker pid=40008)[0m E0708 07:55:28.856797478   40093 chttp2_transport.cc:1111]   Received a GOAWAY with error code ENHANCE_YOUR_CALM and debug data equal to "too_many_pings"
2022-07-08 07:55:28,966	INFO tune.py:747 -- Total run time: 310.04 seconds (309.48 seconds for the tuning loop).


<ray.tune.analysis.experiment_analysis.ExperimentAnalysis at 0x7fea179ad130>

In [None]:
# NOTE(review): assumes `analysis` is the ExperimentAnalysis returned by the
# Tune run above — confirm that cell actually assigns it to this name.

# Pick the best trial by mean episode reward. `mode="max"` is passed
# explicitly so the lookup does not depend on a default mode having been
# configured on the Tune run (higher reward is better).
best_trial = analysis.get_best_trial("episode_reward_mean", mode="max")

# List of lists: one entry per checkpoint of the best trial; each entry holds
# 1st the checkpoint path, 2nd the metric value.
checkpoints = analysis.get_trial_checkpoints_paths(
    trial=best_trial,
    metric="episode_reward_mean")

# Last checkpoint (highest "training_iteration") of the trial that is best
# according to the given metric. Metric and mode are spelled out so this also
# works when there are multiple trials or no defaults were set on the run.
last_checkpoint = analysis.get_last_checkpoint(
    metric="episode_reward_mean", mode="max"
)

In [16]:
# Tear down the Ray runtime now that the Tune trial above has terminated.
ray.shutdown()

In [None]:
Create NAS Gym env
