In [1]:
%load_ext autoreload
from ddls.devices.processors.gpus.A100 import A100
from ddls.plotting.plotting import plot_computation_graph
from ddls.environments.ramp_job_placement_shaping.ramp_job_placement_shaping_environment import RampJobPlacementShapingEnvironment
from ddls.demands.jobs.job import Job
from ddls.distributions.uniform import Uniform
from ddls.utils import seed_stochastic_modules_globally

from ddls.ml_models.policies import GNNPolicy
from ddls.plotting.plotting import plot_line

from ddls.loops.rllib_epoch_loop import RLlibEpochLoop
from ddls.launchers.launcher import Launcher

import ray
from ray.tune.registry import register_env
from ray.tune.logger import pretty_print
ray.shutdown()
ray.init()

from ray.rllib.models import ModelCatalog
from ray.rllib.agents import ppo

import glob
import numpy as np
from collections import defaultdict

2022-06-23 12:56:17,686	INFO services.py:1338 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8269[39m[22m
2022-06-23 12:56:24,346	INFO services.py:1338 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8269[39m[22m


In [2]:
%autoreload
register_env('ramp_job_placement_shaping_environment', lambda env_config: RampJobPlacementShapingEnvironment(**env_config))

In [3]:
%autoreload
ModelCatalog.register_custom_model('my_model', GNNPolicy)

In [4]:
%autoreload

node_config = {'type_1':
                  {
                      'num_nodes': 16, # 8 16
                      'workers_config': 
                          [
                              {
                               'num_workers': 1, # NEED 1 WORKER PER SERVER FOR RAMP
                               'worker': A100
                              }
                          ]
                  }
              }

topology_config = {'type':
                      'ramp',
                   'kwargs':
                      {
                          'num_communication_groups': 2,
                          'num_racks_per_communication_group': 2,
                          'num_servers_per_rack': 4, # 2 4
                          'num_channels': 2
                      }
                  }

jobs_config = {'path_to_files': '/scratch/datasets/ddls/jobs/pipedream_graphs/image_classification/profiles/alexnet/',
               'job_interarrival_time_dist': Uniform(min_val=1, max_val=1000),
               'max_files': 20,
               'job_sampling_mode': 'remove',
               # 'job_sampling_mode': 'remove_and_replace',
               }

In [5]:
# Keyword arguments for the environment; the registered env creator unpacks
# this dict straight into the environment constructor.
env_config = {
    'node_config': node_config,
    'topology_config': topology_config,
    'jobs_config': jobs_config,
    # 'max_simulation_run_time': 1e4,
    'max_simulation_run_time': float('inf'),
    'job_queue_capacity': 100,
    'reward_function': 'lookahead_job_completion_time',
    'pad_obs_kwargs': {'max_nodes': 200},
}

# Settings handed to the custom model through 'custom_model_config' below.
model_config = {
    'in_features_node': 5,
    'in_features_edge': 1,
    'out_features_msg': 8,
    'out_features_hidden': 16,
    'out_features': 4,
    'in_features_graph': 34,  # CHANGE 130
    'out_features_graph': 4,
    'num_layers': 1,
    'aggregator_type': 'mean',
    'action_space_type': 'discrete',  # 'discrete' 'continuous'
}

# Overrides applied on top of RLlib's default PPO configuration.
rllib_config = {
    'seed': 0,

    # must match the name the environment was registered under
    'env': 'ramp_job_placement_shaping_environment',
    'env_config': env_config,

    'batch_mode': 'complete_episodes',
    'train_batch_size': 4,  # 1 32 128
    'sgd_minibatch_size': 4,  # 1 32 128

    'model': {
        'fcnet_hiddens': [8],
        'fcnet_activation': 'relu',
        'custom_model': 'my_model',
        'custom_model_config': model_config,
    },

    'num_workers': 4,
    'num_gpus': 1,

    'framework': 'torch',
}

print(rllib_config)

{'seed': 0, 'env': 'ramp_job_placement_shaping_environment', 'env_config': {'node_config': {'type_1': {'num_nodes': 16, 'workers_config': [{'num_workers': 1, 'worker': <class 'ddls.devices.processors.gpus.A100.A100'>}]}}, 'topology_config': {'type': 'ramp', 'kwargs': {'num_communication_groups': 2, 'num_racks_per_communication_group': 2, 'num_servers_per_rack': 4, 'num_channels': 2}}, 'jobs_config': {'path_to_files': '/scratch/datasets/ddls/jobs/pipedream_graphs/image_classification/profiles/alexnet/', 'job_interarrival_time_dist': <ddls.distributions.uniform.Uniform object at 0x7efb574474f0>, 'max_files': 20, 'job_sampling_mode': 'remove'}, 'max_simulation_run_time': inf, 'job_queue_capacity': 100, 'reward_function': 'lookahead_job_completion_time', 'pad_obs_kwargs': {'max_nodes': 200}}, 'batch_mode': 'complete_episodes', 'train_batch_size': 4, 'sgd_minibatch_size': 4, 'model': {'fcnet_hiddens': [8], 'fcnet_activation': 'relu', 'custom_model': 'my_model', 'custom_model_config': {'in_f

In [6]:
%autoreload

# load default PPO config and update with custom config params
ppo_config = ppo.DEFAULT_CONFIG.copy()
ppo_config.update(rllib_config)
print(f'Config:\n{ppo_config}')

# initialise rllib trainer
epoch_loop = ppo.PPOTrainer(config=ppo_config)
print('\nInitialised trainer.')

2022-06-23 12:56:26,511	INFO ppo.py:166 -- In multi-agent mode, policies will be optimized sequentially by the multi-GPU optimizer. Consider setting simple_optimizer=True if this doesn't work for you.
2022-06-23 12:56:26,512	INFO trainer.py:743 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.


Config:
{'num_workers': 4, 'num_envs_per_worker': 1, 'create_env_on_driver': False, 'rollout_fragment_length': 200, 'batch_mode': 'complete_episodes', 'gamma': 0.99, 'lr': 5e-05, 'train_batch_size': 4, 'model': {'fcnet_hiddens': [8], 'fcnet_activation': 'relu', 'custom_model': 'my_model', 'custom_model_config': {'in_features_node': 5, 'in_features_edge': 1, 'out_features_msg': 8, 'out_features_hidden': 16, 'out_features': 4, 'in_features_graph': 34, 'out_features_graph': 4, 'num_layers': 1, 'aggregator_type': 'mean', 'action_space_type': 'discrete'}}, 'optimizer': {}, 'horizon': None, 'soft_horizon': False, 'no_done_at_end': False, 'env': 'ramp_job_placement_shaping_environment', 'observation_space': None, 'action_space': None, 'env_config': {'node_config': {'type_1': {'num_nodes': 16, 'workers_config': [{'num_workers': 1, 'worker': <class 'ddls.devices.processors.gpus.A100.A100'>}]}}, 'topology_config': {'type': 'ramp', 'kwargs': {'num_communication_groups': 2, 'num_racks_per_communic

2022-06-23 12:56:41,770	INFO trainable.py:124 -- Trainable.setup took 15.265 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.



Initialised trainer.


In [None]:
%autoreload

agent_name = 'PPO'
num_epochs = 500
rl_training_stats = defaultdict(lambda: [])
for epoch in range(num_epochs):
    print(f'\n------- Epoch {epoch+1} of {num_epochs} -------')
    result = epoch_loop.train()
    
    # print epoch data
    # print(pretty_print(result))
    
    # save epoch data
    for key, val in result['hist_stats'].items():
        rl_training_stats[key].extend(val)
    for _ in range(len(val)):
        rl_training_stats['seed'].append(result['config']['seed'])
        rl_training_stats['agent'].append(agent_name)
        rl_training_stats['epoch'].append(epoch)
        
# display(pd.DataFrame(rl_training_stats))


------- Epoch 1 of 500 -------


[2m[36m(RolloutWorker pid=2905509)[0m   ret = umr_sum(arr, axis, dtype, out, keepdims, where=where)
  return np.nanmean(tower_data)
  lambda *s: None if s[0] is None else np.nanmean(s, axis=0),


agent_timesteps_total: 4
custom_metrics: {}
date: 2022-06-23_12-56-43
done: false
episode_len_mean: 1.0
episode_media: {}
episode_reward_max: 0.0013693919704467286
episode_reward_mean: 0.0013693919650025483
episode_reward_min: 0.001369391959558368
episodes_this_iter: 4
episodes_total: 4
experiment_id: 07575fbcc2df4014a1a7692320f0bfdf
hostname: mammoth.ee.ucl.ac.uk
info:
  learner:
    default_policy:
      learner_stats:
        cur_kl_coeff: 0.20000000000000007
        cur_lr: 4.999999999999999e-05
        entropy: 2.639056841532389
        entropy_coeff: 0.0
        kl: -1.4651812515846056e-07
        policy_loss: 0.0
        total_loss: 1.1700356541410884e-07
        vf_explained_var: .nan
        vf_loss: 1.4630718965143638e-07
  num_agent_steps_sampled: 4
  num_agent_steps_trained: 4
  num_steps_sampled: 4
  num_steps_trained: 4
iterations_since_restore: 1
node_ip: 128.40.41.23
num_healthy_workers: 4
off_policy_estimator: {}
perf:
  cpu_util_percent: 24.624999999999996
  ram_util_

[2m[36m(RolloutWorker pid=2905512)[0m   return _methods._mean(a, axis=axis, dtype=dtype,
[2m[36m(RolloutWorker pid=2905512)[0m   ret = ret.dtype.type(ret / rcount)


agent_timesteps_total: 12
custom_metrics: {}
date: 2022-06-23_12-56-47
done: false
episode_len_mean: 1.0
episode_media: {}
episode_reward_max: 0.0013693919704467286
episode_reward_mean: -0.08208194611832222
episode_reward_min: -1.0
episodes_this_iter: 4
episodes_total: 12
experiment_id: 07575fbcc2df4014a1a7692320f0bfdf
hostname: mammoth.ee.ucl.ac.uk
info:
  learner:
    default_policy:
      learner_stats:
        cur_kl_coeff: 0.05000000000000002
        cur_lr: 4.999999999999999e-05
        entropy: 2.639053948720296
        entropy_coeff: 0.0
        kl: 1.1332590171756843e-06
        policy_loss: -0.0014623085657755534
        total_loss: 0.24834418892860413
        vf_explained_var: 2.7815500895182292e-08
        vf_loss: 0.24980644782384237
  num_agent_steps_sampled: 12
  num_agent_steps_trained: 12
  num_steps_sampled: 12
  num_steps_trained: 12
  num_steps_trained_this_iter: 0
iterations_since_restore: 3
node_ip: 128.40.41.23
num_healthy_workers: 4
off_policy_estimator: {}
perf

[2m[36m(RolloutWorker pid=2905512)[0m   return _methods._mean(a, axis=axis, dtype=dtype,
[2m[36m(RolloutWorker pid=2905512)[0m   ret = ret.dtype.type(ret / rcount)


agent_timesteps_total: 28
custom_metrics: {}
date: 2022-06-23_12-56-55
done: false
episode_len_mean: 1.0
episode_media: {}
episode_reward_max: 0.0013693919704467286
episode_reward_mean: -0.07016025772227394
episode_reward_min: -1.0
episodes_this_iter: 4
episodes_total: 28
experiment_id: 07575fbcc2df4014a1a7692320f0bfdf
hostname: mammoth.ee.ucl.ac.uk
info:
  learner:
    default_policy:
      learner_stats:
        cur_kl_coeff: 0.003125000000000001
        cur_lr: 4.999999999999999e-05
        entropy: 2.6390395005544027
        entropy_coeff: 0.0
        kl: 9.39330069134788e-07
        policy_loss: -0.0014851073424021403
        total_loss: 0.24607057273387908
        vf_explained_var: -3.9736429850260414e-08
        vf_loss: 0.247555677096049
  num_agent_steps_sampled: 28
  num_agent_steps_trained: 28
  num_steps_sampled: 28
  num_steps_trained: 28
  num_steps_trained_this_iter: 0
iterations_since_restore: 7
node_ip: 128.40.41.23
num_healthy_workers: 4
off_policy_estimator: {}
perf:

[2m[36m(RolloutWorker pid=2905510)[0m   return _methods._mean(a, axis=axis, dtype=dtype,
[2m[36m(RolloutWorker pid=2905510)[0m   ret = ret.dtype.type(ret / rcount)


agent_timesteps_total: 32
custom_metrics: {}
date: 2022-06-23_12-56-57
done: false
episode_len_mean: 1.0
episode_media: {}
episode_reward_max: 0.0013693919704467286
episode_reward_mean: -0.09251212360618402
episode_reward_min: -1.0
episodes_this_iter: 4
episodes_total: 32
experiment_id: 07575fbcc2df4014a1a7692320f0bfdf
hostname: mammoth.ee.ucl.ac.uk
info:
  learner:
    default_policy:
      learner_stats:
        cur_kl_coeff: 0.0015625000000000005
        cur_lr: 4.999999999999999e-05
        entropy: 2.639030679066976
        entropy_coeff: 0.0
        kl: 2.2243018975132143e-06
        policy_loss: -0.0023518572251001995
        total_loss: 0.2430181622505188
        vf_explained_var: 0.0
        vf_loss: 0.24537003089984258
  num_agent_steps_sampled: 32
  num_agent_steps_trained: 32
  num_steps_sampled: 32
  num_steps_trained: 32
  num_steps_trained_this_iter: 0
iterations_since_restore: 8
node_ip: 128.40.41.23
num_healthy_workers: 4
off_policy_estimator: {}
perf:
  cpu_util_perce

[2m[36m(RolloutWorker pid=2905512)[0m   return _methods._mean(a, axis=axis, dtype=dtype,
[2m[36m(RolloutWorker pid=2905512)[0m   ret = ret.dtype.type(ret / rcount)


agent_timesteps_total: 68
custom_metrics: {}
date: 2022-06-23_12-57-16
done: false
episode_len_mean: 1.0
episode_media: {}
episode_reward_max: 0.0013693919704467286
episode_reward_mean: -0.057537539245176045
episode_reward_min: -1.0
episodes_this_iter: 4
episodes_total: 68
experiment_id: 07575fbcc2df4014a1a7692320f0bfdf
hostname: mammoth.ee.ucl.ac.uk
info:
  learner:
    default_policy:
      learner_stats:
        cur_kl_coeff: 3.051757812500001e-06
        cur_lr: 4.999999999999999e-05
        entropy: 2.638969810803731
        entropy_coeff: 0.0
        kl: 3.909493928707282e-06
        policy_loss: -0.0036184996366500854
        total_loss: 0.24017160038153332
        vf_explained_var: 3.9736429850260414e-08
        vf_loss: 0.2437900980313619
  num_agent_steps_sampled: 68
  num_agent_steps_trained: 68
  num_steps_sampled: 68
  num_steps_trained: 68
  num_steps_trained_this_iter: 0
iterations_since_restore: 17
node_ip: 128.40.41.23
num_healthy_workers: 4
off_policy_estimator: {}
pe

[2m[36m(RolloutWorker pid=2905512)[0m   return _methods._mean(a, axis=axis, dtype=dtype,
[2m[36m(RolloutWorker pid=2905512)[0m   ret = ret.dtype.type(ret / rcount)


agent_timesteps_total: 76
custom_metrics: {}
date: 2022-06-23_12-57-20
done: false
episode_len_mean: 1.0
episode_media: {}
episode_reward_max: 0.0013693919704467286
episode_reward_mean: -0.06451284755785597
episode_reward_min: -1.0
episodes_this_iter: 4
episodes_total: 76
experiment_id: 07575fbcc2df4014a1a7692320f0bfdf
hostname: mammoth.ee.ucl.ac.uk
info:
  learner:
    default_policy:
      learner_stats:
        cur_kl_coeff: 7.629394531250003e-07
        cur_lr: 4.999999999999999e-05
        entropy: 2.6389034509658815
        entropy_coeff: 0.0
        kl: 3.3632414366972324e-06
        policy_loss: -0.0030986666679382324
        total_loss: 0.2375253717104594
        vf_explained_var: 6.357828776041667e-08
        vf_loss: 0.24062402695417404
  num_agent_steps_sampled: 76
  num_agent_steps_trained: 76
  num_steps_sampled: 76
  num_steps_trained: 76
  num_steps_trained_this_iter: 0
iterations_since_restore: 19
node_ip: 128.40.41.23
num_healthy_workers: 4
off_policy_estimator: {}
pe

[2m[36m(RolloutWorker pid=2905512)[0m   return _methods._mean(a, axis=axis, dtype=dtype,
[2m[36m(RolloutWorker pid=2905512)[0m   ret = ret.dtype.type(ret / rcount)


agent_timesteps_total: 80
custom_metrics: {}
date: 2022-06-23_12-57-22
done: false
episode_len_mean: 1.0
episode_media: {}
episode_reward_max: 0.0013693919704467286
episode_reward_mean: -0.07373597379762088
episode_reward_min: -1.0
episodes_this_iter: 4
episodes_total: 80
experiment_id: 07575fbcc2df4014a1a7692320f0bfdf
hostname: mammoth.ee.ucl.ac.uk
info:
  learner:
    default_policy:
      learner_stats:
        cur_kl_coeff: 3.8146972656250013e-07
        cur_lr: 4.999999999999999e-05
        entropy: 2.6388290087382
        entropy_coeff: 0.0
        kl: 4.922533056136065e-06
        policy_loss: -0.004236912727355957
        total_loss: 0.23322179516156513
        vf_explained_var: 6.75519307454427e-08
        vf_loss: 0.237458698451519
  num_agent_steps_sampled: 80
  num_agent_steps_trained: 80
  num_steps_sampled: 80
  num_steps_trained: 80
  num_steps_trained_this_iter: 0
iterations_since_restore: 20
node_ip: 128.40.41.23
num_healthy_workers: 4
off_policy_estimator: {}
perf:
  

[2m[36m(RolloutWorker pid=2905511)[0m   return _methods._mean(a, axis=axis, dtype=dtype,
[2m[36m(RolloutWorker pid=2905511)[0m   ret = ret.dtype.type(ret / rcount)


agent_timesteps_total: 92
custom_metrics: {}
date: 2022-06-23_12-57-28
done: false
episode_len_mean: 1.0
episode_media: {}
episode_reward_max: 0.0013693919704467286
episode_reward_mean: -0.0748245645170548
episode_reward_min: -1.0
episodes_this_iter: 4
episodes_total: 92
experiment_id: 07575fbcc2df4014a1a7692320f0bfdf
hostname: mammoth.ee.ucl.ac.uk
info:
  learner:
    default_policy:
      learner_stats:
        cur_kl_coeff: 4.7683715820312516e-08
        cur_lr: 4.999999999999999e-05
        entropy: 2.638635190327962
        entropy_coeff: 0.0
        kl: 6.085219229134964e-06
        policy_loss: -0.004083748658498128
        total_loss: 0.2304176062345505
        vf_explained_var: -2.384185791015625e-08
        vf_loss: 0.2345013494292895
  num_agent_steps_sampled: 92
  num_agent_steps_trained: 92
  num_steps_sampled: 92
  num_steps_trained: 92
  num_steps_trained_this_iter: 0
iterations_since_restore: 23
node_ip: 128.40.41.23
num_healthy_workers: 4
off_policy_estimator: {}
perf:

[2m[36m(RolloutWorker pid=2905512)[0m   return _methods._mean(a, axis=axis, dtype=dtype,
[2m[36m(RolloutWorker pid=2905512)[0m   ret = ret.dtype.type(ret / rcount)


agent_timesteps_total: 100
custom_metrics: {}
date: 2022-06-23_12-57-32
done: false
episode_len_mean: 1.0
episode_media: {}
episode_reward_max: 0.0013693919704467286
episode_reward_mean: -0.07874293121519628
episode_reward_min: -1.0
episodes_this_iter: 4
episodes_total: 100
experiment_id: 07575fbcc2df4014a1a7692320f0bfdf
hostname: mammoth.ee.ucl.ac.uk
info:
  learner:
    default_policy:
      learner_stats:
        cur_kl_coeff: 1.1920928955078129e-08
        cur_lr: 4.999999999999999e-05
        entropy: 2.6384035031000774
        entropy_coeff: 0.0
        kl: 5.2300449169706555e-06
        policy_loss: -0.004398266474405925
        total_loss: 0.22684779067834218
        vf_explained_var: -5.960464477539063e-08
        vf_loss: 0.23124605218569438
  num_agent_steps_sampled: 100
  num_agent_steps_trained: 100
  num_steps_sampled: 100
  num_steps_trained: 100
  num_steps_trained_this_iter: 0
iterations_since_restore: 25
node_ip: 128.40.41.23
num_healthy_workers: 4
off_policy_estimato

  1. The dashboard might not display correct information on this node.
  2. Metrics on this node won't be reported.
  3. runtime_env APIs won't work.
Check out the `dashboard_agent.log` to see the detailed failure messages.


agent_timesteps_total: 112
custom_metrics: {}
date: 2022-06-23_12-57-38
done: false
episode_len_mean: 1.0
episode_media: {}
episode_reward_max: 0.0013693919704467286
episode_reward_mean: -0.06872923820075028
episode_reward_min: -1.0
episodes_this_iter: 4
episodes_total: 112
experiment_id: 07575fbcc2df4014a1a7692320f0bfdf
hostname: mammoth.ee.ucl.ac.uk
info:
  learner:
    default_policy:
      learner_stats:
        cur_kl_coeff: 1.4901161193847661e-09
        cur_lr: 4.999999999999999e-05
        entropy: 2.6382069031397504
        entropy_coeff: 0.0
        kl: 2.4711568282024623e-07
        policy_loss: -4.4607526312271754e-05
        total_loss: 0.001974727027118206
        vf_explained_var: 0.00012424389521280925
        vf_loss: 0.0020193347590975463
  num_agent_steps_sampled: 112
  num_agent_steps_trained: 112
  num_steps_sampled: 112
  num_steps_trained: 112
  num_steps_trained_this_iter: 0
iterations_since_restore: 28
node_ip: 128.40.41.23
num_healthy_workers: 4
off_policy_est

[2m[36m(RolloutWorker pid=2905511)[0m   return _methods._mean(a, axis=axis, dtype=dtype,
[2m[36m(RolloutWorker pid=2905511)[0m   ret = ret.dtype.type(ret / rcount)


agent_timesteps_total: 128
custom_metrics: {}
date: 2022-06-23_12-57-47
done: false
episode_len_mean: 1.0
episode_media: {}
episode_reward_max: 0.0013693919704467286
episode_reward_mean: -0.0687290624722871
episode_reward_min: -1.0
episodes_this_iter: 4
episodes_total: 128
experiment_id: 07575fbcc2df4014a1a7692320f0bfdf
hostname: mammoth.ee.ucl.ac.uk
info:
  learner:
    default_policy:
      learner_stats:
        cur_kl_coeff: 9.313225746154788e-11
        cur_lr: 4.999999999999999e-05
        entropy: 2.6381874879201255
        entropy_coeff: 0.0
        kl: 4.780304433360527e-06
        policy_loss: -0.004159031311670939
        total_loss: 0.2277791331211726
        vf_explained_var: 7.152557373046875e-08
        vf_loss: 0.23193816244602203
  num_agent_steps_sampled: 128
  num_agent_steps_trained: 128
  num_steps_sampled: 128
  num_steps_trained: 128
  num_steps_trained_this_iter: 0
iterations_since_restore: 32
node_ip: 128.40.41.23
num_healthy_workers: 4
off_policy_estimator: {}

[2m[36m(RolloutWorker pid=2905510)[0m   return _methods._mean(a, axis=axis, dtype=dtype,
[2m[36m(RolloutWorker pid=2905510)[0m   ret = ret.dtype.type(ret / rcount)


agent_timesteps_total: 132
custom_metrics: {}
date: 2022-06-23_12-57-49
done: false
episode_len_mean: 1.0
episode_media: {}
episode_reward_max: 0.0013693919704467286
episode_reward_mean: -0.06872907416258293
episode_reward_min: -1.0
episodes_this_iter: 4
episodes_total: 132
experiment_id: 07575fbcc2df4014a1a7692320f0bfdf
hostname: mammoth.ee.ucl.ac.uk
info:
  learner:
    default_policy:
      learner_stats:
        cur_kl_coeff: 4.656612873077394e-11
        cur_lr: 4.999999999999999e-05
        entropy: 2.638048799832662
        entropy_coeff: 0.0
        kl: 1.0000395498839983e-05
        policy_loss: -0.007182978590329488
        total_loss: 0.22102899849414825
        vf_explained_var: -4.76837158203125e-08
        vf_loss: 0.228211976091067
  num_agent_steps_sampled: 132
  num_agent_steps_trained: 132
  num_steps_sampled: 132
  num_steps_trained: 132
  num_steps_trained_this_iter: 0
iterations_since_restore: 33
node_ip: 128.40.41.23
num_healthy_workers: 4
off_policy_estimator: {}

[2m[36m(RolloutWorker pid=2905509)[0m   return _methods._mean(a, axis=axis, dtype=dtype,
[2m[36m(RolloutWorker pid=2905509)[0m   ret = ret.dtype.type(ret / rcount)


agent_timesteps_total: 136
custom_metrics: {}
date: 2022-06-23_12-57-51
done: false
episode_len_mean: 1.0
episode_media: {}
episode_reward_max: 0.0013693919704467286
episode_reward_mean: -0.07874285775960517
episode_reward_min: -1.0
episodes_this_iter: 4
episodes_total: 136
experiment_id: 07575fbcc2df4014a1a7692320f0bfdf
hostname: mammoth.ee.ucl.ac.uk
info:
  learner:
    default_policy:
      learner_stats:
        cur_kl_coeff: 2.328306436538697e-11
        cur_lr: 4.999999999999999e-05
        entropy: 2.6377917925516763
        entropy_coeff: 0.0
        kl: 1.3762514997021451e-05
        policy_loss: -0.007322574655214946
        total_loss: 0.21650578280289967
        vf_explained_var: -1.9868214925130207e-08
        vf_loss: 0.22382834901412327
  num_agent_steps_sampled: 136
  num_agent_steps_trained: 136
  num_steps_sampled: 136
  num_steps_trained: 136
  num_steps_trained_this_iter: 0
iterations_since_restore: 34
node_ip: 128.40.41.23
num_healthy_workers: 4
off_policy_estimato

[2m[36m(RolloutWorker pid=2905509)[0m   return _methods._mean(a, axis=axis, dtype=dtype,
[2m[36m(RolloutWorker pid=2905509)[0m   ret = ret.dtype.type(ret / rcount)


agent_timesteps_total: 168
custom_metrics: {}
date: 2022-06-23_12-58-11
done: false
episode_len_mean: 1.0
episode_media: {}
episode_reward_max: 0.0013693919704467286
episode_reward_mean: -0.07874277005196592
episode_reward_min: -1.0
episodes_this_iter: 4
episodes_total: 168
experiment_id: 07575fbcc2df4014a1a7692320f0bfdf
hostname: mammoth.ee.ucl.ac.uk
info:
  learner:
    default_policy:
      learner_stats:
        cur_kl_coeff: 9.094947017729285e-14
        cur_lr: 4.999999999999999e-05
        entropy: 2.6373133579889934
        entropy_coeff: 0.0
        kl: 1.3848242997482884e-05
        policy_loss: -0.007804243763287862
        total_loss: 0.2195387581984202
        vf_explained_var: -4.3710072835286455e-08
        vf_loss: 0.2273430054386457
  num_agent_steps_sampled: 168
  num_agent_steps_trained: 168
  num_steps_sampled: 168
  num_steps_trained: 168
  num_steps_trained_this_iter: 0
iterations_since_restore: 42
node_ip: 128.40.41.23
num_healthy_workers: 4
off_policy_estimator:

[2m[36m(RolloutWorker pid=2905511)[0m   return _methods._mean(a, axis=axis, dtype=dtype,
[2m[36m(RolloutWorker pid=2905511)[0m   ret = ret.dtype.type(ret / rcount)


agent_timesteps_total: 180
custom_metrics: {}
date: 2022-06-23_12-58-19
done: false
episode_len_mean: 1.0
episode_media: {}
episode_reward_max: 0.0013693919704467286
episode_reward_mean: -0.06872907267580682
episode_reward_min: -1.0
episodes_this_iter: 4
episodes_total: 180
experiment_id: 07575fbcc2df4014a1a7692320f0bfdf
hostname: mammoth.ee.ucl.ac.uk
info:
  learner:
    default_policy:
      learner_stats:
        cur_kl_coeff: 1.1368683772161607e-14
        cur_lr: 4.999999999999999e-05
        entropy: 2.6366271575291953
        entropy_coeff: 0.0
        kl: 1.666192035238249e-05
        policy_loss: -0.008640975753466288
        total_loss: 0.21638192137082418
        vf_explained_var: 0.0
        vf_loss: 0.2250228946407636
  num_agent_steps_sampled: 180
  num_agent_steps_trained: 180
  num_steps_sampled: 180
  num_steps_trained: 180
  num_steps_trained_this_iter: 0
iterations_since_restore: 45
node_ip: 128.40.41.23
num_healthy_workers: 4
off_policy_estimator: {}
perf:
  cpu_uti

[2m[36m(RolloutWorker pid=2905509)[0m   return _methods._mean(a, axis=axis, dtype=dtype,
[2m[36m(RolloutWorker pid=2905509)[0m   ret = ret.dtype.type(ret / rcount)


agent_timesteps_total: 252
custom_metrics: {}
date: 2022-06-23_12-58-59
done: false
episode_len_mean: 1.0
episode_media: {}
episode_reward_max: 0.0013693919704467286
episode_reward_mean: -0.02867380955310577
episode_reward_min: -1.0
episodes_this_iter: 4
episodes_total: 252
experiment_id: 07575fbcc2df4014a1a7692320f0bfdf
hostname: mammoth.ee.ucl.ac.uk
info:
  learner:
    default_policy:
      learner_stats:
        cur_kl_coeff: 4.336808689942019e-20
        cur_lr: 4.999999999999999e-05
        entropy: 2.6350320180257163
        entropy_coeff: 0.0
        kl: 3.0800846811137227e-05
        policy_loss: -0.01115752359231313
        total_loss: 0.22511454224586486
        vf_explained_var: 4.76837158203125e-08
        vf_loss: 0.23627206732829412
  num_agent_steps_sampled: 252
  num_agent_steps_trained: 252
  num_steps_sampled: 252
  num_steps_trained: 252
  num_steps_trained_this_iter: 0
iterations_since_restore: 63
node_ip: 128.40.41.23
num_healthy_workers: 4
off_policy_estimator: {

[2m[36m(RolloutWorker pid=2905509)[0m   return _methods._mean(a, axis=axis, dtype=dtype,
[2m[36m(RolloutWorker pid=2905509)[0m   ret = ret.dtype.type(ret / rcount)
[2m[36m(RolloutWorker pid=2905512)[0m   return _methods._mean(a, axis=axis, dtype=dtype,
[2m[36m(RolloutWorker pid=2905512)[0m   ret = ret.dtype.type(ret / rcount)


agent_timesteps_total: 256
custom_metrics: {}
date: 2022-06-23_12-59-02
done: false
episode_len_mean: 1.0
episode_media: {}
episode_reward_max: 0.0013693919704467286
episode_reward_mean: -0.048701102514835615
episode_reward_min: -1.0
episodes_this_iter: 4
episodes_total: 256
experiment_id: 07575fbcc2df4014a1a7692320f0bfdf
hostname: mammoth.ee.ucl.ac.uk
info:
  learner:
    default_policy:
      learner_stats:
        cur_kl_coeff: 2.1684043449710096e-20
        cur_lr: 4.999999999999999e-05
        entropy: 2.6335107644399005
        entropy_coeff: 0.0
        kl: 7.054387751850299e-05
        policy_loss: -0.02083012064297994
        total_loss: 0.4373426040013631
        vf_explained_var: -2.384185791015625e-08
        vf_loss: 0.4581727276245753
  num_agent_steps_sampled: 256
  num_agent_steps_trained: 256
  num_steps_sampled: 256
  num_steps_trained: 256
  num_steps_trained_this_iter: 0
iterations_since_restore: 64
node_ip: 128.40.41.23
num_healthy_workers: 4
off_policy_estimator: 

[2m[36m(RolloutWorker pid=2905512)[0m   return _methods._mean(a, axis=axis, dtype=dtype,
[2m[36m(RolloutWorker pid=2905512)[0m   ret = ret.dtype.type(ret / rcount)


agent_timesteps_total: 320
custom_metrics: {}
date: 2022-06-23_12-59-37
done: false
episode_len_mean: 1.0
episode_media: {}
episode_reward_max: 0.0013693919704467286
episode_reward_mean: -0.03868723648619357
episode_reward_min: -1.0
episodes_this_iter: 4
episodes_total: 320
experiment_id: 07575fbcc2df4014a1a7692320f0bfdf
hostname: mammoth.ee.ucl.ac.uk
info:
  learner:
    default_policy:
      learner_stats:
        cur_kl_coeff: 3.308722450212112e-25
        cur_lr: 4.999999999999999e-05
        entropy: 2.6304284811019896
        entropy_coeff: 0.0
        kl: 3.926964373211679e-05
        policy_loss: -0.01252899666627248
        total_loss: 0.2225481390953064
        vf_explained_var: -7.947285970052083e-09
        vf_loss: 0.23507712930440902
  num_agent_steps_sampled: 320
  num_agent_steps_trained: 320
  num_steps_sampled: 320
  num_steps_trained: 320
  num_steps_trained_this_iter: 0
iterations_since_restore: 80
node_ip: 128.40.41.23
num_healthy_workers: 4
off_policy_estimator: {

[2m[36m(RolloutWorker pid=2905511)[0m   return _methods._mean(a, axis=axis, dtype=dtype,
[2m[36m(RolloutWorker pid=2905511)[0m   ret = ret.dtype.type(ret / rcount)
[2m[36m(RolloutWorker pid=2905510)[0m   return _methods._mean(a, axis=axis, dtype=dtype,
[2m[36m(RolloutWorker pid=2905510)[0m   ret = ret.dtype.type(ret / rcount)


agent_timesteps_total: 340
custom_metrics: {}
date: 2022-06-23_12-59-49
done: false
episode_len_mean: 1.0
episode_media: {}
episode_reward_max: 0.0013693919704467286
episode_reward_mean: -0.05871416741536325
episode_reward_min: -1.0
episodes_this_iter: 4
episodes_total: 340
experiment_id: 07575fbcc2df4014a1a7692320f0bfdf
hostname: mammoth.ee.ucl.ac.uk
info:
  learner:
    default_policy:
      learner_stats:
        cur_kl_coeff: 1.033975765691285e-26
        cur_lr: 4.999999999999999e-05
        entropy: 2.627041451136271
        entropy_coeff: 0.0
        kl: 8.077020684188104e-05
        policy_loss: -0.022202320893605552
        total_loss: 0.44372531871000925
        vf_explained_var: -1.5894571940104166e-08
        vf_loss: 0.46592764258384706
  num_agent_steps_sampled: 340
  num_agent_steps_trained: 340
  num_steps_sampled: 340
  num_steps_trained: 340
  num_steps_trained_this_iter: 0
iterations_since_restore: 85
node_ip: 128.40.41.23
num_healthy_workers: 4
off_policy_estimator:

[2m[36m(RolloutWorker pid=2905510)[0m   return _methods._mean(a, axis=axis, dtype=dtype,
[2m[36m(RolloutWorker pid=2905510)[0m   ret = ret.dtype.type(ret / rcount)


agent_timesteps_total: 344
custom_metrics: {}
date: 2022-06-23_12-59-51
done: false
episode_len_mean: 1.0
episode_media: {}
episode_reward_max: 0.0013693919704467286
episode_reward_mean: -0.068727765223492
episode_reward_min: -1.0
episodes_this_iter: 4
episodes_total: 344
experiment_id: 07575fbcc2df4014a1a7692320f0bfdf
hostname: mammoth.ee.ucl.ac.uk
info:
  learner:
    default_policy:
      learner_stats:
        cur_kl_coeff: 5.169878828456425e-27
        cur_lr: 4.999999999999999e-05
        entropy: 2.6232065757115683
        entropy_coeff: 0.0
        kl: 0.0001369085512124002
        policy_loss: -0.02689856042464574
        total_loss: 0.19809827307860056
        vf_explained_var: -2.384185791015625e-08
        vf_loss: 0.22499683499336243
  num_agent_steps_sampled: 344
  num_agent_steps_trained: 344
  num_steps_sampled: 344
  num_steps_trained: 344
  num_steps_trained_this_iter: 0
iterations_since_restore: 86
node_ip: 128.40.41.23
num_healthy_workers: 4
off_policy_estimator: {}

[2m[36m(RolloutWorker pid=2905511)[0m   return _methods._mean(a, axis=axis, dtype=dtype,
[2m[36m(RolloutWorker pid=2905511)[0m   ret = ret.dtype.type(ret / rcount)


agent_timesteps_total: 364
custom_metrics: {}
date: 2022-06-23_13-00-03
done: false
episode_len_mean: 1.0
episode_media: {}
episode_reward_max: 0.0013693919704467286
episode_reward_mean: -0.04870045092327166
episode_reward_min: -1.0
episodes_this_iter: 4
episodes_total: 364
experiment_id: 07575fbcc2df4014a1a7692320f0bfdf
hostname: mammoth.ee.ucl.ac.uk
info:
  learner:
    default_policy:
      learner_stats:
        cur_kl_coeff: 1.6155871338926327e-28
        cur_lr: 4.999999999999999e-05
        entropy: 2.617017658551534
        entropy_coeff: 0.0
        kl: 9.460012282449525e-05
        policy_loss: -0.01884345312913259
        total_loss: 0.20622047086556752
        vf_explained_var: -3.178914388020833e-08
        vf_loss: 0.22506392995516458
  num_agent_steps_sampled: 364
  num_agent_steps_trained: 364
  num_steps_sampled: 364
  num_steps_trained: 364
  num_steps_trained_this_iter: 0
iterations_since_restore: 91
node_ip: 128.40.41.23
num_healthy_workers: 4
off_policy_estimator: 

[2m[36m(RolloutWorker pid=2905512)[0m   return _methods._mean(a, axis=axis, dtype=dtype,
[2m[36m(RolloutWorker pid=2905512)[0m   ret = ret.dtype.type(ret / rcount)


agent_timesteps_total: 376
custom_metrics: {}
date: 2022-06-23_13-00-10
done: false
episode_len_mean: 1.0
episode_media: {}
episode_reward_max: 0.0013693919704467286
episode_reward_mean: -0.05871403469947869
episode_reward_min: -1.0
episodes_this_iter: 4
episodes_total: 376
experiment_id: 07575fbcc2df4014a1a7692320f0bfdf
hostname: mammoth.ee.ucl.ac.uk
info:
  learner:
    default_policy:
      learner_stats:
        cur_kl_coeff: 2.019483917365791e-29
        cur_lr: 4.999999999999999e-05
        entropy: 2.610150424639384
        entropy_coeff: 0.0
        kl: 9.809704760431378e-05
        policy_loss: -0.019456152121225992
        total_loss: 0.20336321691672007
        vf_explained_var: 3.5762786865234374e-08
        vf_loss: 0.2228193720181783
  num_agent_steps_sampled: 376
  num_agent_steps_trained: 376
  num_steps_sampled: 376
  num_steps_trained: 376
  num_steps_trained_this_iter: 0
iterations_since_restore: 94
node_ip: 128.40.41.23
num_healthy_workers: 4
off_policy_estimator: {

[2m[36m(RolloutWorker pid=2905509)[0m   return _methods._mean(a, axis=axis, dtype=dtype,
[2m[36m(RolloutWorker pid=2905509)[0m   ret = ret.dtype.type(ret / rcount)


agent_timesteps_total: 416
custom_metrics: {}
date: 2022-06-23_13-00-31
done: false
episode_len_mean: 1.0
episode_media: {}
episode_reward_max: 0.0013693919704467286
episode_reward_mean: -0.06872782069457087
episode_reward_min: -1.0
episodes_this_iter: 4
episodes_total: 416
experiment_id: 07575fbcc2df4014a1a7692320f0bfdf
hostname: mammoth.ee.ucl.ac.uk
info:
  learner:
    default_policy:
      learner_stats:
        cur_kl_coeff: 1.9721522630525302e-32
        cur_lr: 4.999999999999999e-05
        entropy: 2.5991150617599486
        entropy_coeff: 0.0
        kl: 0.00010450263730354218
        policy_loss: -0.02168356776237488
        total_loss: 0.21092533071835837
        vf_explained_var: -3.9736429850260414e-08
        vf_loss: 0.23260890146096547
  num_agent_steps_sampled: 416
  num_agent_steps_trained: 416
  num_steps_sampled: 416
  num_steps_trained: 416
  num_steps_trained_this_iter: 0
iterations_since_restore: 104
node_ip: 128.40.41.23
num_healthy_workers: 4
off_policy_estimat

[2m[36m(RolloutWorker pid=2905509)[0m   return _methods._mean(a, axis=axis, dtype=dtype,
[2m[36m(RolloutWorker pid=2905509)[0m   ret = ret.dtype.type(ret / rcount)


agent_timesteps_total: 456
custom_metrics: {}
date: 2022-06-23_13-00-53
done: false
episode_len_mean: 1.0
episode_media: {}
episode_reward_max: 0.0013693919704467286
episode_reward_mean: -0.03868694343539023
episode_reward_min: -1.0
episodes_this_iter: 4
episodes_total: 456
experiment_id: 07575fbcc2df4014a1a7692320f0bfdf
hostname: mammoth.ee.ucl.ac.uk
info:
  learner:
    default_policy:
      learner_stats:
        cur_kl_coeff: 1.9259299443872365e-35
        cur_lr: 4.999999999999999e-05
        entropy: 2.5865396579106648
        entropy_coeff: 0.0
        kl: 0.00012971680698683485
        policy_loss: -0.02570931166410446
        total_loss: 0.21219313144683838
        vf_explained_var: -7.152557373046875e-08
        vf_loss: 0.23790244509776434
  num_agent_steps_sampled: 456
  num_agent_steps_trained: 456
  num_steps_sampled: 456
  num_steps_trained: 456
  num_steps_trained_this_iter: 0
iterations_since_restore: 114
node_ip: 128.40.41.23
num_healthy_workers: 4
off_policy_estimato

[2m[36m(RolloutWorker pid=2905512)[0m   return _methods._mean(a, axis=axis, dtype=dtype,
[2m[36m(RolloutWorker pid=2905512)[0m   ret = ret.dtype.type(ret / rcount)


agent_timesteps_total: 460
custom_metrics: {}
date: 2022-06-23_13-00-55
done: false
episode_len_mean: 1.0
episode_media: {}
episode_reward_max: 0.0013693919704467286
episode_reward_mean: -0.048700647030322335
episode_reward_min: -1.0
episodes_this_iter: 4
episodes_total: 460
experiment_id: 07575fbcc2df4014a1a7692320f0bfdf
hostname: mammoth.ee.ucl.ac.uk
info:
  learner:
    default_policy:
      learner_stats:
        cur_kl_coeff: 9.629649721936182e-36
        cur_lr: 4.999999999999999e-05
        entropy: 2.5779552380243937
        entropy_coeff: 0.0
        kl: 0.0002563496275494496
        policy_loss: -0.038134388128916424
        total_loss: 0.19116338690121967
        vf_explained_var: 7.947285970052083e-09
        vf_loss: 0.22929777006308238
  num_agent_steps_sampled: 460
  num_agent_steps_trained: 460
  num_steps_sampled: 460
  num_steps_trained: 460
  num_steps_trained_this_iter: 0
iterations_since_restore: 115
node_ip: 128.40.41.23
num_healthy_workers: 4
off_policy_estimator

[2m[36m(RolloutWorker pid=2905509)[0m   return _methods._mean(a, axis=axis, dtype=dtype,
[2m[36m(RolloutWorker pid=2905509)[0m   ret = ret.dtype.type(ret / rcount)


agent_timesteps_total: 544
custom_metrics: {}
date: 2022-06-23_13-01-35
done: false
episode_len_mean: 1.0
episode_media: {}
episode_reward_max: 0.0013693919704467286
episode_reward_mean: -0.02867315540948528
episode_reward_min: -1.0
episodes_this_iter: 4
episodes_total: 544
experiment_id: 07575fbcc2df4014a1a7692320f0bfdf
hostname: mammoth.ee.ucl.ac.uk
info:
  learner:
    default_policy:
      learner_stats:
        cur_kl_coeff: 4.591774807899562e-42
        cur_lr: 4.999999999999999e-05
        entropy: 2.552209663391113
        entropy_coeff: 0.0
        kl: 0.0004128053562453715
        policy_loss: -0.04450285981098811
        total_loss: 0.199954949816068
        vf_explained_var: 5.5631001790364584e-08
        vf_loss: 0.24445781807104747
  num_agent_steps_sampled: 544
  num_agent_steps_trained: 544
  num_steps_sampled: 544
  num_steps_trained: 544
  num_steps_trained_this_iter: 0
iterations_since_restore: 136
node_ip: 128.40.41.23
num_healthy_workers: 4
off_policy_estimator: {}

[2m[36m(RolloutWorker pid=2905509)[0m   return _methods._mean(a, axis=axis, dtype=dtype,
[2m[36m(RolloutWorker pid=2905509)[0m   ret = ret.dtype.type(ret / rcount)


agent_timesteps_total: 548
custom_metrics: {}
date: 2022-06-23_13-01-37
done: false
episode_len_mean: 1.0
episode_media: {}
episode_reward_max: 0.0013693919704467286
episode_reward_mean: -0.038686849329026426
episode_reward_min: -1.0
episodes_this_iter: 4
episodes_total: 548
experiment_id: 07575fbcc2df4014a1a7692320f0bfdf
hostname: mammoth.ee.ucl.ac.uk
info:
  learner:
    default_policy:
      learner_stats:
        cur_kl_coeff: 2.295887403949781e-42
        cur_lr: 4.999999999999999e-05
        entropy: 2.5335166613260904
        entropy_coeff: 0.0
        kl: 0.0005620851514355726
        policy_loss: -0.058222938080628714
        total_loss: 0.17377656896909077
        vf_explained_var: 4.76837158203125e-08
        vf_loss: 0.2319995159904162
  num_agent_steps_sampled: 548
  num_agent_steps_trained: 548
  num_steps_sampled: 548
  num_steps_trained: 548
  num_steps_trained_this_iter: 0
iterations_since_restore: 137
node_ip: 128.40.41.23
num_healthy_workers: 4
off_policy_estimator: 

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/scratch/zciccwf/py36/envs/ddls/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3457, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-7-b2c08f591d47>", line 8, in <module>
    result = epoch_loop.train()
  File "/scratch/zciccwf/py36/envs/ddls/lib/python3.9/site-packages/ray/tune/trainable.py", line 314, in train
    result = self.step()
  File "/scratch/zciccwf/py36/envs/ddls/lib/python3.9/site-packages/ray/rllib/agents/trainer.py", line 867, in step
    result = self.step_attempt()
  File "/scratch/zciccwf/py36/envs/ddls/lib/python3.9/site-packages/ray/rllib/agents/trainer.py", line 920, in step_attempt
    step_results = next(self.train_exec_impl)
  File "/scratch/zciccwf/py36/envs/ddls/lib/python3.9/site-packages/ray/util/iter.py", line 756, in __next__
    return next(self.built_iterator)
  File "/scratch/zciccwf/py36/envs/ddls/lib/python3.9/site-packages/ray/util/iter.py", line

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/scratch/zciccwf/py36/envs/ddls/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3457, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-7-b2c08f591d47>", line 8, in <module>
    result = epoch_loop.train()
  File "/scratch/zciccwf/py36/envs/ddls/lib/python3.9/site-packages/ray/tune/trainable.py", line 314, in train
    result = self.step()
  File "/scratch/zciccwf/py36/envs/ddls/lib/python3.9/site-packages/ray/rllib/agents/trainer.py", line 867, in step
    result = self.step_attempt()
  File "/scratch/zciccwf/py36/envs/ddls/lib/python3.9/site-packages/ray/rllib/agents/trainer.py", line 920, in step_attempt
    step_results = next(self.train_exec_impl)
  File "/scratch/zciccwf/py36/envs/ddls/lib/python3.9/site-packages/ray/util/iter.py", line 756, in __next__
    return next(self.built_iterator)
  File "/scratch/zciccwf/py36/envs/ddls/lib/python3.9/site-packages/ray/util/iter.py", line

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Plot each collected training metric against the training epoch, one figure per metric.
# NOTE(review): rl_training_stats is expected to have been populated by an earlier
# cell (not visible here) with 'epoch', 'agent' and per-metric columns -- confirm.
x = 'epoch'
scaling_factor = 1
# Ordered tuple rather than a set: iterating a set of strings follows Python's
# per-process hash seed, so the figure order could change between kernel restarts.
metrics_to_plot = ('episode_reward', 'episode_lengths')

for metric in metrics_to_plot:
    print(f'Plotting metric {metric}')
    # NOTE(review): a fresh figure is opened before plot_line is called and `fig` is
    # then rebound to plot_line's return value; presumably plot_line draws onto the
    # current figure -- confirm, otherwise this leaves an empty figure behind.
    fig = plt.figure()
    fig = plot_line(pd.DataFrame(rl_training_stats), 
                    x=x, 
                    y=metric, 
                    hue='agent', 
                    xlabel=x, 
                    ylabel=metric, 
                    err_style='band', # 'band' 'bars'
                    ci=68, # 95 68
                    scaling_factor=scaling_factor,
                    show_fig=False)
#     plt.axhline(y=np.mean(random_baseline_stats[metric]), linestyle='--', color='#a84a32', label='Random')
    # Legend is placed above the axes so it does not cover the plotted lines.
    plt.legend(loc='upper center', bbox_to_anchor=(0.5, 1.3), ncol=2)
    plt.show()

Plotting metric episode_lengths
ERROR! Session/line number was not unique in database. History logging moved to new session 2944


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/scratch/zciccwf/py36/envs/ddls/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3457, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-7-a9472bd64616>", line 11, in <module>
    fig = plot_line(pd.DataFrame(rl_training_stats),
  File "/home/zciccwf/phd_project/projects/ddls/ddls/plotting/plotting.py", line 314, in plot_line
    g = sns.lineplot(data=df,
  File "/scratch/zciccwf/py36/envs/ddls/lib/python3.9/site-packages/seaborn/_decorators.py", line 46, in inner_f
    return f(**kwargs)
  File "/scratch/zciccwf/py36/envs/ddls/lib/python3.9/site-packages/seaborn/relational.py", line 710, in lineplot
    p.plot(ax, kwargs)
  File "/scratch/zciccwf/py36/envs/ddls/lib/python3.9/site-packages/seaborn/relational.py", line 499, in plot
    x, y, y_ci = self.aggregate(y, x, u)
  File "/scratch/zciccwf/py36/envs/ddls/lib/python3.9/site-packages/seaborn/relational.py", line 414, in aggregate
  

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/scratch/zciccwf/py36/envs/ddls/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3457, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-7-a9472bd64616>", line 11, in <module>
    fig = plot_line(pd.DataFrame(rl_training_stats),
  File "/home/zciccwf/phd_project/projects/ddls/ddls/plotting/plotting.py", line 314, in plot_line
    g = sns.lineplot(data=df,
  File "/scratch/zciccwf/py36/envs/ddls/lib/python3.9/site-packages/seaborn/_decorators.py", line 46, in inner_f
    return f(**kwargs)
  File "/scratch/zciccwf/py36/envs/ddls/lib/python3.9/site-packages/seaborn/relational.py", line 710, in lineplot
    p.plot(ax, kwargs)
  File "/scratch/zciccwf/py36/envs/ddls/lib/python3.9/site-packages/seaborn/relational.py", line 499, in plot
    x, y, y_ci = self.aggregate(y, x, u)
  File "/scratch/zciccwf/py36/envs/ddls/lib/python3.9/site-packages/seaborn/relational.py", line 414, in aggregate
  

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/scratch/zciccwf/py36/envs/ddls/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3457, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-7-a9472bd64616>", line 11, in <module>
    fig = plot_line(pd.DataFrame(rl_training_stats),
  File "/home/zciccwf/phd_project/projects/ddls/ddls/plotting/plotting.py", line 314, in plot_line
    g = sns.lineplot(data=df,
  File "/scratch/zciccwf/py36/envs/ddls/lib/python3.9/site-packages/seaborn/_decorators.py", line 46, in inner_f
    return f(**kwargs)
  File "/scratch/zciccwf/py36/envs/ddls/lib/python3.9/site-packages/seaborn/relational.py", line 710, in lineplot
    p.plot(ax, kwargs)
  File "/scratch/zciccwf/py36/envs/ddls/lib/python3.9/site-packages/seaborn/relational.py", line 499, in plot
    x, y, y_ci = self.aggregate(y, x, u)
  File "/scratch/zciccwf/py36/envs/ddls/lib/python3.9/site-packages/seaborn/relational.py", line 414, in aggregate
  