# Cart Pole Example
An old classic RL beginner problem.
https://www.anyscale.com/blog/an-introduction-to-reinforcement-learning-with-openai-gym-rllib-and-google

The intent is to see exactly how RLlib fits into a dead-simple RL problem.

In [1]:
import torch
import gymnasium as gym
import ray
from gym.wrappers.monitoring.video_recorder import VideoRecorder
from ray.rllib.algorithms.ppo import PPOConfig
from ray.rllib.algorithms.dqn.dqn import DQNConfig
from ray.tune.logger import pretty_print

In [2]:
if torch.cuda.is_available():
    device = torch.device("cuda")
    !nvidia-smi
else:
    device = torch.device("cpu")
device

device(type='cpu')

In [3]:
# Sanity-check the environment with a purely random policy before any training.
# (Swap the id for "LunarLander-v2" to try a different task.)
N_RANDOM_STEPS = 100

env = gym.make("CartPole-v1", render_mode="human")
try:
    observation, info = env.reset()

    for _ in range(N_RANDOM_STEPS):
        # No agent yet: draw a random action; the observation is not consulted.
        action = env.action_space.sample()
        observation, reward, terminated, truncated, info = env.step(action)

        # Start a fresh episode once the current one has ended for either reason.
        if terminated or truncated:
            observation, info = env.reset()
finally:
    # Release the environment even if the loop raises or the kernel is interrupted.
    env.close()

In [6]:
from ray.rllib.algorithms.ppo import PPOConfig
from ray.tune.logger import pretty_print


# Assemble the PPO configuration one setting at a time, then build the algorithm.
ppo_config = PPOConfig()
ppo_config = ppo_config.framework('torch')
ppo_config = ppo_config.rollouts(num_rollout_workers=1)
ppo_config = ppo_config.resources(num_gpus=0)
ppo_config = ppo_config.environment(env="CartPole-v1")
algo = ppo_config.build()

# Run the training iterations (currently a single one) and report each result.
for iteration in range(1):
    result = algo.train()
    print(pretty_print(result))

    # `iteration % 1` is always 0, so a checkpoint is written after every iteration.
    if not iteration % 1:
        checkpoint_dir = algo.save(checkpoint_dir="./checkpoints")
        print(f"Checkpoint saved in directory {checkpoint_dir}")

2023-06-14 22:53:32,818	INFO algorithm.py:501 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.


agent_timesteps_total: 4000
counters:
  num_agent_steps_sampled: 4000
  num_agent_steps_trained: 4000
  num_env_steps_sampled: 4000
  num_env_steps_trained: 4000
custom_metrics: {}
date: 2023-06-14_22-54-02
done: false
episode_len_mean: 22.732558139534884
episode_media: {}
episode_reward_max: 66.0
episode_reward_mean: 22.732558139534884
episode_reward_min: 9.0
episodes_this_iter: 172
episodes_total: 172
experiment_id: b27048a724b6493a9a831e6111018312
hostname: furman
info:
  learner:
    default_policy:
      custom_metrics: {}
      diff_num_grad_updates_vs_sampler_policy: 464.5
      learner_stats:
        allreduce_latency: 0.0
        cur_kl_coeff: 0.20000000000000004
        cur_lr: 5.0000000000000016e-05
        entropy: 0.6661100400391445
        entropy_coeff: 0.0
        grad_gnorm: 1.5103502790133159
        kl: 0.028344952762677365
        policy_loss: -0.04215431610342636
        total_loss: 8.93649986636254
        vf_explained_var: 0.0038692155832885412
        vf_loss: 8

In [19]:
import ray
from ray import air, tune
# Restart Ray so the sweep runs on a fresh local instance.
ray.shutdown()
ray.init()

# Learning-rate grid search for PPO.
# The config has to name the environment: without `.environment(...)` the trials
# have nothing to roll out in.
## NOTE: a `tensorflow_probability` ModuleNotFoundError was also hit in this notebook
## even though the torch framework is selected; the cause is not confirmed.
## TODO: check whether installing `tensorflow_probability` (or removing a partial
## TensorFlow install) resolves it before concluding that `tuner.fit()` is unusable.
config = (
    PPOConfig()
    .environment(env="CartPole-v1")
    .framework(framework="torch")
    .training(lr=tune.grid_search([0.01, 0.001, 0.0001]))
)

tuner = tune.Tuner(
    "PPO",
    run_config=air.RunConfig(
        # Stop criterion on the reported mean episode reward.
        stop={"episode_reward_mean": 150},
    ),
    param_space=config,
)

tuner.fit()

2023-06-15 02:06:06,123	INFO worker.py:1529 -- Started a local Ray instance. View the dashboard at [1m[32m127.0.0.1:8265 [39m[22m


TuneError: The Ray Tune run failed. Please inspect the previous error messages for a cause. After fixing the issue, you can restart the run from scratch or continue this run. To continue this run, you can use `tuner = Tuner.restore("/home/furman/ray_results/PPO")`.

In [20]:
# Settings handed to Tune: environment id, DL framework and the model layout.
model_settings = {
    "fcnet_hiddens": [32],
    "fcnet_activation": "linear",
}
config = {
    "env": "CartPole-v1",
    # Set "framework" to "tf" to use TensorFlow instead.
    "framework": "torch",
    "model": model_settings,
}
# Stop criterion on the reported mean episode reward.
stop = {"episode_reward_mean": 295}

# Bring up a fresh local Ray instance with the options used for this run.
ray_init_options = dict(
    num_cpus=3,
    include_dashboard=False,
    ignore_reinit_error=True,
    log_to_driver=False,
)
ray.shutdown()
ray.init(**ray_init_options)

# Execute training; a checkpoint is requested at the end of the run.
analysis = ray.tune.run(
    "PPO",
    config=config,
    stop=stop,
    local_dir="./checkpoints",
    checkpoint_at_end=True,
)

2023-06-15 02:08:12,626	INFO worker.py:1529 -- Started a local Ray instance. View the dashboard at [1m[32m127.0.0.1:8265 [39m[22m


ModuleNotFoundError: No module named 'tensorflow_probability'

In [14]:
import ray
from ray import air, tune
# Restart Ray so the sweep runs on a fresh local instance.
ray.shutdown()
ray.init()

# Learning-rate grid search for PPO. The environment is set explicitly because
# a config without `.environment(...)` gives the trials nothing to roll out in.
config = (
    PPOConfig()
    .environment(env="CartPole-v1")
    .training(lr=tune.grid_search([0.01, 0.001, 0.0001]))
    .framework('torch')
)
# ``Tuner.fit()`` allows setting a custom log directory (other than ``~/ray-results``)
tuner = ray.tune.Tuner(
    "PPO",
    param_space=config,
    run_config=air.RunConfig(
        stop={"episode_reward_mean": 150},
        # Request a checkpoint at the end of each trial so one exists to pick up below.
        checkpoint_config=air.CheckpointConfig(checkpoint_at_end=True),
    ),
)

results = tuner.fit()

# Get the best result based on a particular metric.
best_result = results.get_best_result(metric="episode_reward_mean", mode="max")

# Get the best checkpoint corresponding to the best result.
best_checkpoint = best_result.checkpoint

2023-06-14 22:47:19,622	INFO worker.py:1529 -- Started a local Ray instance. View the dashboard at [1m[32m127.0.0.1:8265 [39m[22m


TuneError: The Ray Tune run failed. Please inspect the previous error messages for a cause. After fixing the issue, you can restart the run from scratch or continue this run. To continue this run, you can use `tuner = Tuner.restore("/home/furman/ray_results/PPO")`.

In [43]:
# Roll out the trained policy (`algo`) for a few episodes and record them to an mp4.
N_EVAL_EPISODES = 5

# The recorder collects frames from `env.render()`, so the environment is created
# with an image-returning render mode rather than "human".
# NOTE(review): confirm against the installed gym/gymnasium version that the
# recorder requires "rgb_array"; this trades the live window for an actual video.
env = gym.make("CartPole-v1", render_mode="rgb_array")
after_training = "after_training.mp4"
after_video = VideoRecorder(env, after_training)
try:
    for _ in range(N_EVAL_EPISODES):
        observation, info = env.reset()
        # Reset both end-of-episode flags here so the cell does not rely on values
        # left behind by an earlier cell.
        terminated = truncated = False
        while not (terminated or truncated):
            after_video.capture_frame()
            # Act with the policy's own choice (exploration off) and step the
            # environment with that action, not with a random sample.
            action = algo.compute_single_action(observation=observation, explore=False)
            observation, reward, terminated, truncated, info = env.step(action)
finally:
    # Finalise the video file and release the environment even on errors.
    after_video.close()
    env.close()

In [44]:
# Display the current value of `observation` (whatever the most recently run rollout cell left behind).
observation

array([ 0.24730873,  0.62726694, -0.21656461, -0.978816  ], dtype=float32)

In [47]:
# Ask `algo` for an action given the current `observation`; the call's return value is the cell output.
algo.compute_single_action(observation=observation)

0