In [1]:
#!pip install numpy
#!pip install "ray[rllib]"
#!pip install "ray[tune]"
#!pip install torch==1.10.2+cu113 torchvision==0.11.3+cu113 torchaudio===0.10.2+cu113 -f https://download.pytorch.org/whl/cu113/torch_stable.html
#!pip install redis

#!pip install Box2D
#!pip install box2d-py
#!pip install gym[all]
#!pip install gym[Box_2D]

import gym
from ray.rllib.agents.ppo import PPOTrainer


In [2]:
from gym import envs
# Gather the id of every environment spec currently registered with gym
# and print the ids in sorted order.
all_envs = envs.registry.all()
env_ids = list(map(lambda env_spec: env_spec.id, all_envs))
env_ids_sorted = sorted(env_ids)
print(env_ids_sorted)

['Acrobot-v1', 'Ant-v2', 'Ant-v3', 'BipedalWalker-v3', 'BipedalWalkerHardcore-v3', 'Blackjack-v1', 'CarRacing-v0', 'CartPole-v0', 'CartPole-v1', 'CliffWalking-v0', 'CubeCrash-v0', 'CubeCrashScreenBecomesBlack-v0', 'CubeCrashSparse-v0', 'FetchPickAndPlace-v1', 'FetchPickAndPlaceDense-v1', 'FetchPush-v1', 'FetchPushDense-v1', 'FetchReach-v1', 'FetchReachDense-v1', 'FetchSlide-v1', 'FetchSlideDense-v1', 'FrozenLake-v1', 'FrozenLake8x8-v1', 'HalfCheetah-v2', 'HalfCheetah-v3', 'HandManipulateBlock-v0', 'HandManipulateBlockDense-v0', 'HandManipulateBlockFull-v0', 'HandManipulateBlockFullDense-v0', 'HandManipulateBlockRotateParallel-v0', 'HandManipulateBlockRotateParallelDense-v0', 'HandManipulateBlockRotateParallelTouchSensors-v0', 'HandManipulateBlockRotateParallelTouchSensors-v1', 'HandManipulateBlockRotateParallelTouchSensorsDense-v0', 'HandManipulateBlockRotateParallelTouchSensorsDense-v1', 'HandManipulateBlockRotateXYZ-v0', 'HandManipulateBlockRotateXYZDense-v0', 'HandManipulateBlockRot

In [None]:
# Instantiate LunarLander and print one randomly drawn element of its
# observation space.
env = gym.make('LunarLander-v2')
observation_space = env.observation_space
sample = observation_space.sample()
print(sample)

In [None]:
# Drive LunarLander with randomly sampled actions for a fixed number of
# steps, rendering every frame and printing each action and reward.
env = gym.make('LunarLander-v2')

try:
    env.reset()
    env.render()

    for _ in range(1000):
        # Draw an action from the action space and apply it.
        action = env.action_space.sample()
        print(action)
        observation, reward, done, info = env.step(action)
        print(reward)
        env.render()

        # Start a fresh episode as soon as the current one has finished.
        if done:
            env.reset()
finally:
    # Always release the environment (and its render window), even when the
    # loop is interrupted from the notebook or a step raises.
    env.close()

In [None]:
# Tweaks to the default model that RLlib provides automatically, given the
# environment's observation and action spaces.
model_config = {
    "fcnet_hiddens": [128] * 4,
    "fcnet_activation": "relu",
}

# Settings that apply to evaluation runs only: render the env.
evaluation_config = {"render_env": True}

# Configure the algorithm.
config = {
    # Environment (RLlib understands OpenAI gym registered strings).
    "env": "LunarLander-v2",
    # Use 4 environment workers (aka "rollout workers") that collect
    # samples in parallel from their own environment clone(s).
    "num_workers": 4,
    # Deep-learning framework: PyTorch is selected here
    # ("tf2" would select tf2.x eager execution instead).
    "framework": "torch",
    "model": model_config,
    # Set up a separate evaluation worker set for the
    # `trainer.evaluate()` call made after training.
    "evaluation_num_workers": 1,
    "evaluation_config": evaluation_config,
}



In [None]:

# Instantiate the RLlib PPO trainer from the `config` dictionary.
trainer = PPOTrainer(config=config)

# Number of training iterations to run. One iteration covers parallel
# sample collection by the environment workers, the loss calculation on
# the collected batch, and a model update.
num_training_iterations = 10

for _ in range(num_training_iterations):
    iteration_result = trainer.train()
    print(iteration_result)

In [None]:
# Evaluate the trained Trainer. `config` sets "render_env": True under
# "evaluation_config", so the env is rendered during this evaluation run
# (each timestep goes to the shell's output).
trainer.evaluate()

In [None]:
import ray
from ray import tune
from ray.rllib.agents.ppo import PPOTrainer

# How many time steps to run the experiment for.
time_steps_total = 1_000_000

# Where the trained agents and the logs will end up.
local_dir = "tune_runs_lunarlander"

# Run the experiment.
# The trainable is the `PPOTrainer` class imported at the top of this cell.
# (`agents` is not imported until a later cell, so referring to
# `agents.ppo.PPOTrainer` here fails with a NameError on a fresh kernel.)
results = tune.run(
    PPOTrainer,
    config=config,
    # Trials are ranked by the highest mean episode reward.
    metric="episode_reward_mean",
    mode="max",
    # Stop once the configured number of environment time steps is reached.
    stop={"timesteps_total": time_steps_total},
    # Checkpoint every 10 training iterations and once more at the end.
    checkpoint_at_end=True,
    checkpoint_freq=10,
    # keep_checkpoints_num=10,
    local_dir=local_dir,
    #restore=restore_checkpoint
    #        name="PPOTrainer_2022-03-23_15-47-45",
    #resume=AUTO
)

# Look up the best trial by mean episode reward, then fetch its checkpoints.
best_trial = results.get_best_trial("episode_reward_mean")
checkpoints = results.get_trial_checkpoints_paths(
    trial=best_trial,
    metric="episode_reward_mean",
)

# Print the path stored as the first element of every checkpoint entry.
for checkpoint in checkpoints:
    checkpoint_path = checkpoint[0]
    print("Checkpoint path:", checkpoint_path)




In [None]:
import os
import glob

def _child_directories(parent):
    """Return the directories directly inside `parent` that match `*`.

    `parent` is escaped first so that glob metacharacters in it (for
    example `[` and `]`) are matched literally instead of being read as
    a pattern.
    """
    pattern = os.path.join(glob.escape(parent), "*")
    return [path for path in glob.glob(pattern) if os.path.isdir(path)]


# Clean up: strip "[" and "]" from the names of the second-level folders
# below `local_dir` and turn ", " into "-".
for subfolder in _child_directories(local_dir):
    for subsubfolder in _child_directories(subfolder):
        # Only the folder's own name is rewritten. Rewriting the whole path
        # would point the rename at a non-existent parent directory if any
        # parent happened to contain one of the replaced characters.
        parent_dir, folder_name = os.path.split(subsubfolder)
        folder_name_cleaned = (
            folder_name.replace("[", "").replace("]", "").replace(", ", "-")
        )
        if folder_name_cleaned != folder_name:
            subsubfolder_cleaned = os.path.join(parent_dir, folder_name_cleaned)
            os.rename(subsubfolder, subsubfolder_cleaned)
            print(f"Renamed {subsubfolder} to {subsubfolder_cleaned}")

# Sound an alarm using unix escape sequence.
print("Done.")
print("\a")

In [None]:
import json
import os
import glob
import ray
from ray import tune
from ray.rllib import agents
# Where the trained agents and the logs will end up.
local_dir = "tune_runs_lunarlander"


def find_checkpoint_pairs(root_dir):
    """Collect (params.json path, checkpoint file path) pairs below `root_dir`.

    Every `params.json` found anywhere below `root_dir` is paired with each
    regular file below its own directory whose name starts with
    "checkpoint-" and whose path does not end with ".tune_metadata".

    Directory names are escaped before they are used in a glob pattern, so
    folders whose names contain glob metacharacters such as "[" or "]" are
    searched correctly instead of silently yielding no checkpoints.

    Args:
        root_dir: Directory to search recursively.

    Returns:
        A list of `(params_path, checkpoint_path)` tuples, ordered by
        params path and then by checkpoint path. Empty if nothing is found.
    """
    params_pattern = os.path.join(glob.escape(root_dir), "**", "params.json")
    found_pairs = []
    for params_path in sorted(glob.glob(params_pattern, recursive=True)):
        # Search everything below the directory that holds this params.json.
        trial_dir = os.path.dirname(params_path)
        search_path = os.path.join(glob.escape(trial_dir), "**")
        checkpoint_paths = sorted(
            candidate
            for candidate in glob.glob(search_path, recursive=True)
            if not os.path.isdir(candidate)
            and os.path.basename(candidate).startswith("checkpoint-")
            and not candidate.endswith(".tune_metadata")
        )
        found_pairs += [
            (params_path, checkpoint_path) for checkpoint_path in checkpoint_paths
        ]
    return found_pairs


# Find all the occurrences of params.json in the run directory, together
# with all the checkpoints per params.json.
pairs = find_checkpoint_pairs(local_dir)

# Get the user input.
if not pairs:
    # Fail with a clear message here; otherwise the lookup below would hit
    # an undefined (or stale, from an earlier run) `user_index`.
    raise FileNotFoundError("No checkpoints found.")

if len(pairs) == 1:
    # Nothing to choose from: take the only checkpoint.
    user_index = 0
else:
    print("Select a checkpoint:")
    for index, (_, subfolder) in enumerate(pairs):
        print(f"{index: >2}: {subfolder}")
    user_index = int(input("Enter the index of the checkpoint: "))
    # Reject anything outside the list that was just printed.
    if not 0 <= user_index < len(pairs):
        raise IndexError(
            f"Checkpoint index {user_index} is out of range (0-{len(pairs) - 1})."
        )

# Get config path and checkpoint path.
config_path, checkpoint_path = pairs[user_index]
assert os.path.exists(config_path)
assert os.path.exists(checkpoint_path)
print("Config path:", config_path)
print("Checkpoint path:", checkpoint_path)

# Load the parameters stored in the selected params.json.
with open(config_path, "r") as file:
    stored_config = json.load(file)

# Keep every stored setting except the ones listed here.
excluded_keys = ["num_gpus", "num_workers"]
enjoy_config = {
    key: value
    for key, value in stored_config.items()
    if key not in excluded_keys
}

print(enjoy_config, config_path)

# Load the agent: build a PPO trainer from the stored config and restore
# its state from the selected checkpoint.
print("Loading agent...")
agent = agents.ppo.PPOTrainer(config=enjoy_config)
agent.restore(checkpoint_path)
print("Agent loaded.")

# Create the environment.
print("Creating the environment.")
environment = gym.make('LunarLander-v2')

try:
    # Play a single episode with the restored agent.
    observation = environment.reset()
    done = False
    while not done:
        # Ask the agent for an action for the current observation.
        # `compute_single_action` replaces the deprecated `compute_action`.
        action = agent.compute_single_action(observation)

        # Execute the action.
        observation, reward, done, info = environment.step(action)

        # Render the environment.
        environment.render()
finally:
    # Always release the environment (and its render window), even when the
    # episode is interrupted or a step raises.
    environment.close()