In [1]:
!pip install ray[rllib]

Collecting ray[rllib]
  Downloading ray-1.2.0-cp38-cp38-manylinux2014_x86_64.whl (47.3 MB)
[K     |████████████████████████████████| 47.3 MB 47 kB/s 
Collecting msgpack<2.0.0,>=1.0.0
  Downloading msgpack-1.0.2-cp38-cp38-manylinux1_x86_64.whl (302 kB)
[K     |████████████████████████████████| 302 kB 574 kB/s 
[?25hCollecting opencv-python-headless<=4.3.0.36
  Downloading opencv_python_headless-4.3.0.36-cp38-cp38-manylinux2014_x86_64.whl (36.4 MB)
[K     |████████████████████████████████| 36.4 MB 42 kB/s 
[?25hCollecting py-spy>=0.2.0
  Downloading py_spy-0.3.5-py2.py3-none-manylinux1_x86_64.whl (3.1 MB)
[K     |████████████████████████████████| 3.1 MB 800 kB/s 
[?25hCollecting redis>=3.5.0
  Downloading redis-3.5.3-py2.py3-none-any.whl (72 kB)
[K     |████████████████████████████████| 72 kB 261 kB/s 
[?25hCollecting aiohttp
  Downloading aiohttp-3.7.4.post0-cp38-cp38-manylinux2014_x86_64.whl (1.5 MB)
[K     |████████████████████████████████| 1.5 MB 2.0 MB/s 
Collecting async-

In [2]:
import ray
import ray.rllib.agents.ppo as ppo

# Stop any Ray runtime left over from an earlier execution of this cell, then
# start a new one. ignore_reinit_error=True additionally asks Ray not to fail
# if it is already initialised. The value returned by ray.init() is the last
# expression of the cell, so it is rendered as the cell output.
ray.shutdown()
ray.init(ignore_reinit_error=True)

Instructions for updating:
non-resource variables are not supported in the long term
2021-04-05 12:07:44,996	INFO services.py:1172 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m


{'node_ip_address': '192.168.43.113',
 'raylet_ip_address': '192.168.43.113',
 'redis_address': '192.168.43.113:6379',
 'object_store_address': '/tmp/ray/session_2021-04-05_12-07-44_472602_12344/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2021-04-05_12-07-44_472602_12344/sockets/raylet',
 'webui_url': '127.0.0.1:8265',
 'session_dir': '/tmp/ray/session_2021-04-05_12-07-44_472602_12344',
 'metrics_export_port': 57831,
 'node_id': 'b3b4826fd7b54bb232ef16c1c7c7a55a27adfb42dd84d0321f807bef'}

In [None]:
import os
import random
import argparse
import pandas as pd
from datetime import datetime

from ray.tune import run, sample_from
from ray.tune.schedulers import PopulationBasedTraining

In [None]:
def explore(config):
    """Repair a perturbed config so it is still valid, then return it.

    Passed to PopulationBasedTraining as ``custom_explore_fn``. The dict is
    modified in place and the same object is returned.

    Guarantees after the call:
      * ``train_batch_size`` is at least ``2 * sgd_minibatch_size`` so enough
        timesteps are collected to run SGD;
      * ``num_sgd_iter`` is at least 1 so at least one SGD pass is run.
    """
    smallest_train_batch = config["sgd_minibatch_size"] * 2
    config["train_batch_size"] = max(config["train_batch_size"], smallest_train_batch)
    config["num_sgd_iter"] = max(config["num_sgd_iter"], 1)
    return config

# Population Based Training scheduler; it is passed as `scheduler=pbt` to the
# tune run() call in a later cell.
pbt = PopulationBasedTraining(
    # Perturbation interval is measured against the "time_total_s" result
    # field, i.e. every 120 units of that attribute (presumably seconds).
    time_attr="time_total_s",
    perturbation_interval=120,
    # Per the Ray Tune docs: chance of drawing a fresh value from the mutation
    # spec below instead of perturbing the current one.
    resample_probability=0.25,
    # Trials are ranked by mean episode reward, higher is better.
    metric="episode_reward_mean",
    mode="max",
    # Search space used when mutating: callables are sampled, the list for
    # "lr" enumerates the allowed values.
    hyperparam_mutations={
        "lambda": lambda: random.uniform(0.9, 1.0),
        "clip_param": lambda: random.uniform(0.01, 0.5),
        "lr": [1e-3, 5e-4, 1e-4, 5e-5, 1e-5],
        "num_sgd_iter": lambda: random.randint(1, 30),
        "sgd_minibatch_size": lambda: random.randint(128, 16384),
        "train_batch_size": lambda: random.randint(2000, 160000),
    },
    # explore() re-validates batch sizes / SGD iterations after each mutation.
    custom_explore_fn=explore)

In [None]:
# Run a population of 8 PPO trials on Taxi-v3 under the PBT scheduler.
# Fixed settings are given literally; values wrapped in sample_from() are
# drawn independently for each trial when it is created.
analysis = run(
    "PPO",
    name="prueba_ppo",
    scheduler=pbt,
    num_samples=8,
    config={
        "env": "Taxi-v3",
        "seed": 123,
        "kl_coeff": 1.0,
        "num_gpus": 1,
        "num_workers": 1,
        "horizon": 99,
        "observation_filter": "MeanStdFilter",
        "model": {
            # "fcnet_hiddens": [
            #     32,
            #     32
            # ],
            # NOTE(review): free_log_std is documented for continuous
            # (Gaussian) action distributions, while Taxi-v3 has a discrete
            # action space -- confirm this flag is intended here.
            "free_log_std": True
        },
        "num_sgd_iter": 10,
        "sgd_minibatch_size": 128,
        "lambda": sample_from(lambda spec: random.uniform(0.9, 1.0)),
        "clip_param": sample_from(lambda spec: random.uniform(0.1, 0.5)),
        # Bounds written low -> high; random.uniform covers the same range
        # either way, this is only for readability.
        "lr": sample_from(lambda spec: random.uniform(1e-5, 1e-3)),
        "train_batch_size": sample_from(
            lambda spec: random.randint(1000, 60000)),
    },
)

# metric/mode are configured on the scheduler, not on run(), so the returned
# analysis object has no default metric and the `best_config` property raises
# ValueError. Ask for the best config explicitly, using the same metric and
# mode the scheduler optimises.
print(
    "best hyperparameters: ",
    analysis.get_best_config(metric="episode_reward_mean", mode="max"),
)

In [None]:
# json.dumps() is used below but `json` is not imported by any earlier visible
# cell, so import it here to keep the cell runnable on a fresh kernel.
import json

# NOTE(review): `agent` and `CHECKPOINT_ROOT` are not defined in any visible
# cell -- confirm the cell that creates them is present and runs before this.

# Train for N_ITER iterations, recording per-iteration reward statistics and
# saving a checkpoint after every iteration.
N_ITER = 30
results = []        # raw result dict returned by each agent.train() call
episode_data = []   # compact per-iteration summary dicts
episode_json = []   # the same summaries serialised as JSON strings

for n in range(N_ITER):
    result = agent.train()
    results.append(result)

    episode = {
        'n': n,
        'episode_reward_min': result['episode_reward_min'],
        'episode_reward_mean': result['episode_reward_mean'],
        'episode_reward_max': result['episode_reward_max'],
        'episode_len_mean': result['episode_len_mean'],
    }

    episode_data.append(episode)
    episode_json.append(json.dumps(episode))
    file_name = agent.save(CHECKPOINT_ROOT)

    # Same text as a single long f-string, split only for readability.
    print(
        f'{n+1:3d}: Min/Mean/Max reward: '
        f'{result["episode_reward_min"]:8.4f}/'
        f'{result["episode_reward_mean"]:8.4f}/'
        f'{result["episode_reward_max"]:8.4f}, '
        f'len mean: {result["episode_len_mean"]:8.4f}. '
        f'Checkpoint saved to {file_name}'
    )

In [None]:
# Show the layer summary of the network behind the agent's policy.
# NOTE(review): `agent` is not defined in any visible cell -- confirm it is
# created before this cell runs. Assumes `base_model` is a Keras model.
policy = agent.get_policy()
model = policy.model
# Keras' summary() prints the table itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
model.base_model.summary()

In [None]:
# Replay a saved PPO checkpoint with the `rllib rollout` CLI for 2000 steps on
# Taxi-v3. The checkpoint path is relative to the current working directory.
# NOTE(review): this points at checkpoint_77 while the training cell uses
# N_ITER = 30 -- confirm this checkpoint exists, or update the path.
! rllib rollout \
    tmp/ppo/taxi/checkpoint_77/checkpoint-77 \
    --config "{\"env\": \"Taxi-v3\"}" \
    --run PPO \
    --steps 2000

In [None]:
# Start TensorBoard on the results directory under $HOME/ray_results/.
# NOTE(review): run via `!`, this presumably keeps the cell busy until the
# server is interrupted -- confirm that is acceptable for "Run All".
!tensorboard --logdir=$HOME/ray_results/

In [None]:
# Step the trained agent through the environment for a fixed number of steps,
# printing every transition and the cumulative reward of each finished episode.
# NOTE(review): `env` and `agent` are not defined in any visible cell --
# confirm they are created before this cell runs.
obs = env.reset()
sum_reward = 0
n_step = 20
for step in range(n_step):
    action = agent.compute_action(obs)
    print(action)
    # Feed the *new* observation back into `obs`. Previously it was stored in
    # a separate `state` variable, so the agent was queried with the initial
    # observation on every step.
    obs, reward, done, info = env.step(action)
    print(obs, reward, done, info)
    sum_reward += reward
    if done:
        print("cumulative reward", sum_reward)
        # Start the next episode from a fresh observation.
        obs = env.reset()
        sum_reward = 0