# SmartCab

## Setup

In [1]:
# Colab
!rm -r smart-cities-drl
!git clone https://github.com/eescriba/smart-cities-drl
!cd smart-cities-drl/ && pip install -e .

import sys
sys.path.insert(0,'./smart-cities-drl/src/')

# Local
# !pip install -e ..
# import sys
# sys.path.insert(0,'../src/')

Cloning into 'smart-cities-drl'...
remote: Enumerating objects: 829, done.[K
remote: Counting objects: 100% (829/829), done.[K
remote: Compressing objects: 100% (578/578), done.[K
remote: Total 829 (delta 367), reused 657 (delta 214), pack-reused 0[K
Receiving objects: 100% (829/829), 10.80 MiB | 7.83 MiB/s, done.
Resolving deltas: 100% (367/367), done.
Obtaining file:///content/smart-cities-drl
Installing collected packages: smart-cities-drl
  Found existing installation: smart-cities-drl 0.1.0
    Can't uninstall 'smart-cities-drl'. No files were found to uninstall.
  Running setup.py develop for smart-cities-drl
Successfully installed smart-cities-drl


In [2]:
import json
import shutil
import random

import gym
from gym.spaces import Box, Discrete, Tuple
import ray
from ray.tune import run, choice, function
from ray.rllib.agents.ppo import DEFAULT_CONFIG
from core.rl import PPOAgent
from core.pbt import PbtOptimizer
from smartcab.env import SmartCabEnv, HierarchicalSmartCabEnv



Instructions for updating:
experimental_compile is deprecated, use jit_compile instead


## Proximal Policy Optimization (PPO)

### Tune hyperparameters

In [3]:
# Work on a copy so that the library-wide DEFAULT_CONFIG dict is not mutated
# when keys are assigned on `config` further down in this cell.
config = DEFAULT_CONFIG.copy()
# Driver-side environment instance (constructed with a `None` config); its
# observation/action spaces are read when the per-policy specs are declared.
env = HierarchicalSmartCabEnv(None)

def policy_mapping_fn(agent_id):
    """Return the name of the policy that controls ``agent_id``.

    Agent ids beginning with ``"low_level_"`` are routed to
    ``"low_level_policy"``; every other id is routed to
    ``"high_level_policy"``.
    """
    is_low_level = agent_id.startswith("low_level_")
    return "low_level_policy" if is_low_level else "high_level_policy"

def explore(config):
    """Repair a perturbed config so that it remains valid (intended for PBT).

    The dict is modified in place and also returned:

    * ``train_batch_size`` is raised to at least twice
      ``sgd_minibatch_size`` so enough timesteps are collected to do SGD.
    * ``num_sgd_iter`` is raised to at least 1 so at least one SGD
      iteration runs.
    """
    min_train_batch = config["sgd_minibatch_size"] * 2
    config["train_batch_size"] = max(config["train_batch_size"], min_train_batch)
    config["num_sgd_iter"] = max(config["num_sgd_iter"], 1)
    return config

# Multi-agent section of the trainer config for the hierarchical environment.
# Each policy spec is a 4-tuple; presumably (policy class, observation space,
# action space, per-policy config) as in RLlib's multi-agent API — confirm
# against the installed Ray version.
multiagent = {
    "policies": {
        # The high-level policy observes the flat environment's observation.
        "high_level_policy": (
            None,
            env.flat_env.observation_space,
            env.high_action_space,
            {},
        ),
        # The low-level policy observes the flat observation paired with a
        # value from the high-level action space.
        "low_level_policy": (
            None,
            Tuple([env.flat_env.observation_space, env.high_action_space]),
            env.low_action_space,
            {},
        ),
    },
    "policy_mapping_fn": function(policy_mapping_fn),
    # Only the high-level policy is listed for training.
    "policies_to_train": ["high_level_policy"],
    "count_steps_by": "env_steps",
    # NOTE(review): "observation_filter" is normally a top-level trainer
    # option; confirm it has any effect inside the multiagent sub-dict.
    "observation_filter": "MeanStdFilter",
    "observation_fn": None,
}
config["multiagent"] = multiagent

ppo = PPOAgent("smartcab_ppo_tune", HierarchicalSmartCabEnv, config)




[(2, 3), (2, 12), (4, 7), (6, 1), (11, 13), (13, 9)]
Action Space:  Discrete(6)
Observation Space:  Tuple(Box(0.0, 14.0, (2,), float32), Discrete(7), Discrete(6))


2021-06-30 11:39:26,740	INFO services.py:1274 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m
  if np.isnan(value):
2021-06-30 11:39:29,153	INFO trainer.py:671 -- Tip: set framework=tfe or the --eager flag to enable TensorFlow eager execution
2021-06-30 11:39:29,158	INFO trainer.py:698 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=3116)[0m Instructions for updating:
[2m[36m(pid=3116)[0m experimental_compile is deprecated, use jit_compile instead
[2m[36m(pid=3115)[0m Instructions for updating:
[2m[36m(pid=3115)[0m experimental_compile is deprecated, use jit_compile instead


[2m[36m(pid=3116)[0m [(2, 3), (2, 12), (4, 7), (6, 1), (11, 13), (13, 9)]
[2m[36m(pid=3116)[0m Action Space:  Discrete(6)
[2m[36m(pid=3116)[0m Observation Space:  Tuple(Box(0.0, 14.0, (2,), float32), Discrete(7), Discrete(6))
[2m[36m(pid=3115)[0m [(2, 3), (2, 12), (4, 7), (6, 1), (11, 13), (13, 9)]
[2m[36m(pid=3115)[0m Action Space:  Discrete(6)
[2m[36m(pid=3115)[0m Observation Space:  Tuple(Box(0.0, 14.0, (2,), float32), Discrete(7), Discrete(6))


2021-06-30 11:39:42,360	INFO trainable.py:104 -- Trainable.setup took 13.441 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.


In [4]:
# Trainer config handed to the tuning run: fixed PPO settings plus three
# hyperparameters whose starting values are drawn from discrete choices.
tune_config = {
    "env": HierarchicalSmartCabEnv,
    "seed": 123,
    "horizon": 100,
    "num_gpus": 1,
    "num_workers": 1,
    "observation_filter": "MeanStdFilter",
    "lambda": 0.9,
    "clip_param": 0.3,
    "lr": 5e-5,
    "num_sgd_iter": choice([10, 20, 30]),
    "sgd_minibatch_size": choice([128, 256, 512]),
    "train_batch_size": choice([8000, 16000, 32000]),
    "multiagent": multiagent,
}

# Stopping condition passed to the tuning run: 2M total timesteps.
stop_criteria = {"timesteps_total": 2_000_000}

# Values the PBT optimizer may resample: callables draw a fresh value, the
# list enumerates the allowed learning rates.
# NOTE(review): the sampled ranges allow sgd_minibatch_size to exceed
# train_batch_size, and the `explore` helper defined earlier is not passed to
# PbtOptimizer here — confirm whether it is applied elsewhere.
# NOTE(review): these draws use the global `random` state, which is not seeded
# in the visible cells.
hyperparam_mutations = {
    "lambda": lambda: random.uniform(0.7, 1.0),
    "clip_param": lambda: random.uniform(0.1, 0.5),
    "lr": [1e-3, 5e-4, 1e-4, 5e-5, 1e-5],
    "num_sgd_iter": lambda: random.randint(1, 30),
    "sgd_minibatch_size": lambda: random.randint(128, 16384),
    "train_batch_size": lambda: random.randint(2000, 160000),
}
pbt = PbtOptimizer(hyperparam_mutations)

In [None]:
# Restart the agent wrapper (presumably re-initialising Ray — confirm in
# core.rl), then launch the search driven by the PBT scheduler.
ppo.restart()
analysis = ppo.tune(tune_config, stop_criteria, scheduler=pbt.scheduler)

# Pick the config of the trial that maximises the mean episode reward.
best_config = analysis.get_best_config(mode="max", metric="episode_reward_mean")
print("Best hyperparameters found: ", best_config)

2021-06-30 11:39:45,188	INFO services.py:1274 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m
  if np.isnan(value):


Trial name,status,loc,num_sgd_iter,sgd_minibatch_size,train_batch_size
PPO_HierarchicalSmartCabEnv_df600_00000,PENDING,,30,512,8000
PPO_HierarchicalSmartCabEnv_df600_00001,PENDING,,10,256,16000
PPO_HierarchicalSmartCabEnv_df600_00002,PENDING,,30,512,32000
PPO_HierarchicalSmartCabEnv_df600_00003,PENDING,,10,512,16000
PPO_HierarchicalSmartCabEnv_df600_00004,PENDING,,30,256,8000
PPO_HierarchicalSmartCabEnv_df600_00005,PENDING,,20,256,32000
PPO_HierarchicalSmartCabEnv_df600_00006,PENDING,,20,128,32000
PPO_HierarchicalSmartCabEnv_df600_00007,PENDING,,30,512,16000


[2m[36m(pid=3307)[0m Instructions for updating:
[2m[36m(pid=3307)[0m experimental_compile is deprecated, use jit_compile instead
[2m[36m(pid=3307)[0m 2021-06-30 11:39:51,039	INFO trainer.py:671 -- Tip: set framework=tfe or the --eager flag to enable TensorFlow eager execution
[2m[36m(pid=3307)[0m 2021-06-30 11:39:51,040	INFO trainer.py:698 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.


Trial name,status,loc,num_sgd_iter,sgd_minibatch_size,train_batch_size
PPO_HierarchicalSmartCabEnv_df600_00000,RUNNING,,30,512,8000
PPO_HierarchicalSmartCabEnv_df600_00001,PENDING,,10,256,16000
PPO_HierarchicalSmartCabEnv_df600_00002,PENDING,,30,512,32000
PPO_HierarchicalSmartCabEnv_df600_00003,PENDING,,10,512,16000
PPO_HierarchicalSmartCabEnv_df600_00004,PENDING,,30,256,8000
PPO_HierarchicalSmartCabEnv_df600_00005,PENDING,,20,256,32000
PPO_HierarchicalSmartCabEnv_df600_00006,PENDING,,20,128,32000
PPO_HierarchicalSmartCabEnv_df600_00007,PENDING,,30,512,16000


[2m[36m(pid=3308)[0m Instructions for updating:
[2m[36m(pid=3308)[0m experimental_compile is deprecated, use jit_compile instead


[2m[36m(pid=3308)[0m [(2, 3), (2, 12), (4, 7), (6, 1), (11, 13), (13, 9)]
[2m[36m(pid=3308)[0m Action Space:  Discrete(6)
[2m[36m(pid=3308)[0m Observation Space:  Tuple(Box(0.0, 14.0, (2,), float32), Discrete(7), Discrete(6))


Trial name,status,loc,num_sgd_iter,sgd_minibatch_size,train_batch_size
PPO_HierarchicalSmartCabEnv_df600_00000,RUNNING,,30,512,8000
PPO_HierarchicalSmartCabEnv_df600_00001,PENDING,,10,256,16000
PPO_HierarchicalSmartCabEnv_df600_00002,PENDING,,30,512,32000
PPO_HierarchicalSmartCabEnv_df600_00003,PENDING,,10,512,16000
PPO_HierarchicalSmartCabEnv_df600_00004,PENDING,,30,256,8000
PPO_HierarchicalSmartCabEnv_df600_00005,PENDING,,20,256,32000
PPO_HierarchicalSmartCabEnv_df600_00006,PENDING,,20,128,32000
PPO_HierarchicalSmartCabEnv_df600_00007,PENDING,,30,512,16000




Result for PPO_HierarchicalSmartCabEnv_df600_00000:
  agent_timesteps_total: 7920
  custom_metrics: {}
  date: 2021-06-30_11-40-09
  done: false
  episode_len_mean: 100.0
  episode_media: {}
  episode_reward_max: -1915.0
  episode_reward_mean: -2232.5625
  episode_reward_min: -3088.0
  episodes_this_iter: 80
  episodes_total: 80
  experiment_id: 04ed6812b8194e1b96307b2ca853f968
  hostname: c2f6a5589ba4
  info:
    learner:
      high_level_policy:
        learner_stats:
          cur_kl_coeff: 0.20000000298023224
          cur_lr: 4.999999873689376e-05
          entropy: 0.6912553906440735
          entropy_coeff: 0.0
          kl: 0.0018854527734220028
          model: {}
          policy_loss: -0.026443367823958397
          total_loss: 521843.5
          vf_explained_var: 0.00019735097885131836
          vf_loss: 521843.5
    num_agent_steps_sampled: 7920
    num_agent_steps_trained: 88
    num_steps_sampled: 8000
    num_steps_trained: 8000
  iterations_since_restore: 1
  node_ip: 

Trial name,status,loc,num_sgd_iter,sgd_minibatch_size,train_batch_size,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_HierarchicalSmartCabEnv_df600_00000,RUNNING,172.28.0.2:3307,30,512,8000,1.0,8.12716,8000.0,-2232.56,-1915.0,-3088.0,100.0
PPO_HierarchicalSmartCabEnv_df600_00001,PENDING,,10,256,16000,,,,,,,
PPO_HierarchicalSmartCabEnv_df600_00002,PENDING,,30,512,32000,,,,,,,
PPO_HierarchicalSmartCabEnv_df600_00003,PENDING,,10,512,16000,,,,,,,
PPO_HierarchicalSmartCabEnv_df600_00004,PENDING,,30,256,8000,,,,,,,
PPO_HierarchicalSmartCabEnv_df600_00005,PENDING,,20,256,32000,,,,,,,
PPO_HierarchicalSmartCabEnv_df600_00006,PENDING,,20,128,32000,,,,,,,
PPO_HierarchicalSmartCabEnv_df600_00007,PENDING,,30,512,16000,,,,,,,


In [None]:
# Hard-coded hyperparameters. Running this cell replaces whatever
# `best_config` the tuning cell produced.
best_config = {
    "observation_filter": "MeanStdFilter",
    # NOTE(review): free_log_std is documented for Gaussian (continuous)
    # action distributions, while the printed action space is Discrete(6) —
    # confirm this setting is intended.
    "model": {"free_log_std": True},
    "num_sgd_iter": 10,
    "sgd_minibatch_size": 128,
    "lambda": 0.731396,
    "clip_param": 0.317651,
    "lr": 5e-5,
    "train_batch_size": 18812,
}

### Training

In [None]:
# Build a fresh agent with the selected hyperparameters and train it for 200
# iterations. Arguments follow the (name, env class, config) order used when
# the tuning agent was created; SmartCabEnv is used because WasteNetEnv is not
# imported in the visible cells of this notebook.
# NOTE(review): tuning ran on HierarchicalSmartCabEnv. To train the
# hierarchical variant instead, swap the env class and add the "multiagent"
# section to best_config.
ppo = PPOAgent("smartcab_ppo_train", SmartCabEnv, best_config)
ppo.train(num_iter=200)

In [None]:
# Inspect the network behind the trained agent's policy.
policy = ppo.agent.get_policy()
model = policy.model
# Assumes `base_model` is a Keras model: `summary()` prints the layer table
# itself and returns None, so wrapping it in print() would emit a stray "None".
model.base_model.summary()

### Testing

In [None]:
# Optional: evaluate a saved checkpoint instead of the agent currently held in `ppo`.
# NOTE(review): the constructor call below uses a different argument order
# from the one in the tuning cell — confirm before uncommenting.
# ppo = PPOAgent("smartcab_ppo_test", best_config, SmartCabEnv, {})
# ppo.load("checkpoints/checkpoint-best")
# Evaluate the current agent over 1000 episodes.
ppo.test(num_episodes=1000)

### Visualization

In [None]:
# Archive the results directory into a single zip file.
# NOTE(review): assumes results were written to /content/ray_results —
# confirm the output directory used by the agent wrapper.
!zip -r /content/ray_results.zip /content/ray_results

In [None]:
# Load the TensorBoard notebook extension and point it at the results directory.
%load_ext tensorboard 
%tensorboard --logdir="/content/ray_results/"