In [None]:
from PPO_train_test import PPO_train, PPO_train_udr, PPO_train_adr, PPO_test

# Random seed passed as `seed=` to every training call and embedded in the
# saved model names so runs with different seeds do not overwrite each other.
SEED = 999
# Training budget passed as `steps=` to every training call in this notebook.
TIMESTEPS = 1_500_000

We want to deploy our agent in the real world, which is represented by the Target environment, but we only have access to a simulator (the Source environment) that has a systematic error with respect to the real world.

Since we want to close this sim2real gap, we will experiment with some techniques to address this problem.

To begin with, we will try simple Uniform Domain Randomization (UDR). Then our focus will be on implementing an Automatic Domain Randomization (ADR) technique inspired by the original OpenAI paper, where ADR was formally introduced to the RL world.

In the following experiments PPO, a policy-based RL algorithm, will be employed.

# Hopper Environment

Training on SOURCE

In [None]:
# Baseline: train PPO on the Hopper source (simulator) environment with no
# domain randomization. The model name carries the seed.
PPO_train(
    seed=SEED,
    steps=TIMESTEPS,
    lr=3e-4,
    model_name=f"hopper_source_{SEED}",
    train_env_id="CustomHopper-source-v0",
)

SOURCE -> SOURCE

In [None]:
# SOURCE -> SOURCE: evaluate the source-trained Hopper model on the same
# environment it was trained on.
PPO_test(
    model_name=f"hopper_source_{SEED}",
    test_env_id="CustomHopper-source-v0",
)

SOURCE -> TARGET

In [None]:
# SOURCE -> TARGET: evaluate the source-trained Hopper model on the target
# environment (transfer without any randomization).
PPO_test(
    model_name=f"hopper_source_{SEED}",
    test_env_id="CustomHopper-target-v0",
)

Training on TARGET

In [None]:
# Train PPO directly on the Hopper target environment, using the same
# learning rate, step budget and seed as the source run.
PPO_train(
    seed=SEED,
    steps=TIMESTEPS,
    lr=3e-4,
    model_name=f"hopper_target_{SEED}",
    train_env_id="CustomHopper-target-v0",
)

TARGET -> TARGET

In [None]:
# TARGET -> TARGET: evaluate the target-trained Hopper model on the target
# environment.
PPO_test(
    model_name=f"hopper_target_{SEED}",
    test_env_id="CustomHopper-target-v0",
)

## UDR

In [None]:
# Name shared by the Hopper UDR training and test cells; it encodes the
# scheduler type ("constant"), UDR range (50), network size and seed.
model_name = f"hopper_source_udr_constant_50_medium_{SEED}"

In [None]:
# Train PPO on the Hopper source environment with uniform domain
# randomization (UDR).
PPO_train_udr(
    seed=SEED,
    steps=TIMESTEPS,
    lr=3e-4,
    lr_scheduler_type="constant",
    # NOTE(review): presumably the relative half-width of the randomization
    # interval around the nominal parameters; confirm in PPO_train_udr.
    udr_range=0.5,
    # Options: small / medium / large -> 64 / 128 / 256.
    net_size="medium",
    model_name=model_name,
    train_env_id="CustomHopper-source-v0",
)

In [None]:
# Evaluate the model currently referenced by `model_name` on the Hopper
# target environment.
PPO_test(
    model_name=model_name,
    test_env_id="CustomHopper-target-v0",
)

## ADR (OpenAI Style)

In [None]:
# Name shared by the Hopper ADR training, test and visualization cells.
# NOTE(review): the "70" token does not match objective_adr_range=0.5 used by
# the Hopper ADR training call (the Walker ADR name uses "50" for that same
# range). Confirm whether the name or the range is the intended value.
model_name = f"hopper_source_adr_constant_70_1300_40000_medium_{SEED}"

In [None]:
# Train PPO on the Hopper source environment with automatic domain
# randomization (ADR).
# NOTE(review): presumably the randomization range starts at
# starting_adr_range and grows by increase_rate toward objective_adr_range
# whenever the reward reaches reward_to_check, evaluated every
# check_frequency steps; confirm against PPO_train_adr.
PPO_train_adr(
    seed=SEED,
    steps=TIMESTEPS,
    lr=3e-4,
    lr_scheduler_type="constant",
    starting_adr_range=0.05,
    increase_rate=0.05,
    objective_adr_range=0.5,
    reward_to_check=1300,
    check_frequency=40_000,
    net_size="medium",
    model_name=model_name,
    train_env_id="CustomHopper-source-v0",
)

In [None]:
# Evaluate the model currently referenced by `model_name` on the Hopper
# target environment.
PPO_test(
    model_name=model_name,
    test_env_id="CustomHopper-target-v0",
)

## Hopper Visualization

In [None]:
from utils.visualize_agent import visualize

# Render the agent on the Hopper target environment. This uses whichever
# `model_name` was assigned last; when the cells run top to bottom that is
# the Hopper ADR model.
visualize(
    env_id="CustomHopper-target-v0",
    model_path=f"models/{model_name}",
)

# Walker2D Environment

Training on SOURCE

In [None]:
# Baseline: train PPO on the Walker source (simulator) environment with no
# domain randomization. The model name carries the seed.
PPO_train(
    seed=SEED,
    steps=TIMESTEPS,
    lr=3e-4,
    model_name=f"walker_source_{SEED}",
    train_env_id="CustomWalker-source-v0",
)

SOURCE -> SOURCE

In [None]:
# SOURCE -> SOURCE: evaluate the source-trained Walker model on the same
# environment it was trained on.
PPO_test(
    model_name=f"walker_source_{SEED}",
    test_env_id="CustomWalker-source-v0",
)

SOURCE -> TARGET

In [None]:
# SOURCE -> TARGET: evaluate the source-trained Walker model on the target
# environment (transfer without any randomization).
PPO_test(
    model_name=f"walker_source_{SEED}",
    test_env_id="CustomWalker-target-v0",
)

Training on TARGET

In [None]:
# Train PPO directly on the Walker target environment, using the same
# learning rate, step budget and seed as the source run.
PPO_train(
    seed=SEED,
    steps=TIMESTEPS,
    lr=3e-4,
    model_name=f"walker_target_{SEED}",
    train_env_id="CustomWalker-target-v0",
)

TARGET -> TARGET

In [None]:
# TARGET -> TARGET: evaluate the target-trained Walker model on the target
# environment.
PPO_test(
    model_name=f"walker_target_{SEED}",
    test_env_id="CustomWalker-target-v0",
)

## UDR

In [None]:
# Name shared by the Walker UDR training and test cells; it encodes the
# scheduler type ("constant"), UDR range (50), network size and seed.
model_name = f"walker_source_udr_constant_50_medium_{SEED}"

In [None]:
# Train PPO on the Walker source environment with uniform domain
# randomization (UDR).
PPO_train_udr(
    seed=SEED,
    steps=TIMESTEPS,
    lr=3e-4,
    lr_scheduler_type="constant",
    # NOTE(review): presumably the relative half-width of the randomization
    # interval around the nominal parameters; confirm in PPO_train_udr.
    udr_range=0.5,
    # Options: small / medium / large -> 64 / 128 / 256.
    net_size="medium",
    model_name=model_name,
    train_env_id="CustomWalker-source-v0",
)

In [None]:
# Evaluate the model currently referenced by `model_name` on the Walker
# target environment.
PPO_test(
    model_name=model_name,
    test_env_id="CustomWalker-target-v0",
)

## ADR (OpenAI Style)

In [None]:
# Name shared by the Walker ADR training, test and visualization cells; it
# encodes the scheduler type, objective range (50), reward threshold (1500),
# check frequency (40000), network size and seed.
model_name = f"walker_source_adr_constant_50_1500_40000_medium_{SEED}"

In [None]:
# Train PPO on the Walker source environment with automatic domain
# randomization (ADR).
# NOTE(review): presumably the randomization range starts at
# starting_adr_range and grows by increase_rate toward objective_adr_range
# whenever the reward reaches reward_to_check, evaluated every
# check_frequency steps; confirm against PPO_train_adr.
PPO_train_adr(
    seed=SEED,
    steps=TIMESTEPS,
    lr=3e-4,
    lr_scheduler_type="constant",
    starting_adr_range=0.05,
    increase_rate=0.05,
    objective_adr_range=0.5,
    reward_to_check=1500,
    check_frequency=40_000,
    net_size="medium",
    model_name=model_name,
    train_env_id="CustomWalker-source-v0",
)

In [None]:
# Evaluate the model currently referenced by `model_name` on the Walker
# target environment.
PPO_test(
    model_name=model_name,
    test_env_id="CustomWalker-target-v0",
)

## Walker Visualization

In [None]:
from utils.visualize_agent import visualize

# Render the agent on the Walker target environment. This uses whichever
# `model_name` was assigned last; when the cells run top to bottom that is
# the Walker ADR model.
visualize(
    env_id="CustomWalker-target-v0",
    model_path=f"models/{model_name}",
)