In [1]:
import numpy as np
import gymnasium as gym
from imitation.policies.serialize import load_policy
from imitation.util.util import make_vec_env
from imitation.data.wrappers import RolloutInfoWrapper

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Build the vectorized seals CartPole environment. The generator handed to
# `make_vec_env` is seeded with a fixed value so that a fresh
# "Restart Kernel -> Run All" starts from the same random state every time
# (previously an unseeded generator made every run different).
env = make_vec_env(
    "seals:seals/CartPole-v0",
    rng=np.random.default_rng(0),
    post_wrappers=[
        lambda env, _: RolloutInfoWrapper(env)
    ],  # needed for computing rollouts later
)
# Load the pretrained PPO expert for this environment through the
# "ppo-huggingface" loader (HumanCompatibleAI organization).
expert = load_policy(
    "ppo-huggingface",
    organization="HumanCompatibleAI",
    env_name="seals/CartPole-v0",
    venv=env,
)


Exception: code() argument 13 must be str, not int
Exception: code() argument 13 must be str, not int
Exception: code() argument 13 must be str, not int


In [3]:
from stable_baselines3.common.evaluation import evaluate_policy

# Sanity-check the loaded expert: evaluate it for 10 episodes on `env` and
# keep only the first returned value (the second one is discarded).
reward, _ = evaluate_policy(
    expert,
    env,
    n_eval_episodes=10,
)
print(reward)

500.0


In [4]:
from imitation.data import rollout

# Fixed seed: this generator drives rollout collection here and is also handed
# to `bc.BC` further down, so leaving it unseeded made both the demonstrations
# and the BC training run irreproducible.
rng = np.random.default_rng(0)

# Roll out the expert until at least 50 episodes have been collected
# (no minimum on the number of timesteps).
rollouts = rollout.rollout(
    expert,
    env,
    rollout.make_sample_until(min_timesteps=None, min_episodes=50),
    rng=rng,
)
# Flatten the list of trajectories into a single transitions object, which is
# what the BC trainer receives as `demonstrations`.
transitions = rollout.flatten_trajectories(rollouts)

In [5]:
# Summarize the collected data: number and type of trajectories, the type and
# size of the flattened transitions object, and the names of its fields.
# (A stray closing double quote that used to be printed at the end of the last
# sentence has been removed.)
print(
    f"""The `rollout` function generated a list of {len(rollouts)} {type(rollouts[0])}.
After flattening, this list is turned into a {type(transitions)} object containing {len(transitions)} transitions.
The transitions object contains arrays for: {', '.join(transitions.__dict__.keys())}.
"""
)

The `rollout` function generated a list of 56 <class 'imitation.data.types.TrajectoryWithRew'>.
After flattening, this list is turned into a <class 'imitation.data.types.Transitions'> object containing 28000 transitions.
The transitions object contains arrays for: obs, acts, infos, next_obs, dones."



In [21]:
# Inspect the first collected trajectory (the cell output is its full repr).
rollouts[0]

TrajectoryWithRew(obs=array([[ 0.03917111,  0.00851629, -0.00286903,  0.0273277 ],
       [ 0.03934143,  0.20367928, -0.00232248, -0.26625904],
       [ 0.04341502,  0.00859054, -0.00764766,  0.02569044],
       ...,
       [ 0.2711443 ,  0.00901818, -0.00818093,  0.01625393],
       [ 0.27132466,  0.2042565 , -0.00785585, -0.2789989 ],
       [ 0.2754098 ,  0.00924749, -0.01343583,  0.01119598]],
      dtype=float32), acts=array([1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1,
       0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0,
       0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
       1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1,
       1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1,
       1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
       1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1,
       0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 

In [14]:
# Inspect the first flattened transition; it is displayed as a dict with the
# keys obs, acts, infos, next_obs and dones.
transitions[0]

{'obs': array([ 0.03917111,  0.00851629, -0.00286903,  0.0273277 ], dtype=float32),
 'acts': 1,
 'infos': {},
 'next_obs': array([ 0.03934143,  0.20367928, -0.00232248, -0.26625904], dtype=float32),
 'dones': False}

In [10]:
from imitation.algorithms import bc

# Behavioural cloning trainer: learns from the flattened expert transitions,
# using the spaces of the environment the expert was rolled out in. Training
# is pinned to the CPU.
bc_trainer = bc.BC(
    demonstrations=transitions,
    observation_space=env.observation_space,
    action_space=env.action_space,
    device="cpu",
    rng=rng,
)

In [11]:
# Baseline: evaluate the BC policy for 10 episodes before any training step
# has been taken, so it can be compared with the post-training result.
reward_before_training, _ = evaluate_policy(
    bc_trainer.policy,
    env,
    n_eval_episodes=10,
)
print(f"Reward before training: {reward_before_training}")

Reward before training: 9.0


In [12]:
# Train for a single epoch over the demonstrations ...
bc_trainer.train(n_epochs=1)

# ... then re-evaluate the same policy for 10 episodes.
reward_after_training, _ = evaluate_policy(
    bc_trainer.policy,
    env,
    n_eval_episodes=10,
)
print(f"Reward after training: {reward_after_training}")

0batch [00:00, ?batch/s]

---------------------------------
| batch_size        | 32        |
| bc/               |           |
|    batch          | 0         |
|    ent_loss       | -0.000693 |
|    entropy        | 0.693     |
|    epoch          | 0         |
|    l2_loss        | 0         |
|    l2_norm        | 72.5      |
|    loss           | 0.693     |
|    neglogp        | 0.693     |
|    prob_true_act  | 0.5       |
|    samples_so_far | 32        |
---------------------------------


493batch [00:02, 188.83batch/s]

---------------------------------
| batch_size        | 32        |
| bc/               |           |
|    batch          | 500       |
|    ent_loss       | -0.000376 |
|    entropy        | 0.376     |
|    epoch          | 0         |
|    l2_loss        | 0         |
|    l2_norm        | 95.1      |
|    loss           | 0.406     |
|    neglogp        | 0.407     |
|    prob_true_act  | 0.745     |
|    samples_so_far | 16032     |
---------------------------------


875batch [00:05, 173.90batch/s]


Reward after training: 500.0


In [9]:
# Environment probe: report whether PyTorch can see a CUDA device.
import torch
torch.cuda.is_available()

True

In [1]:
import random
from imitation.algorithms import preference_comparisons
from imitation.rewards.reward_nets import BasicRewardNet
from imitation.util.networks import RunningNorm
from imitation.util.util import make_vec_env
from imitation.policies.base import FeedForward32Policy, NormalizeFeaturesExtractor
import gymnasium as gym
from stable_baselines3 import PPO
import numpy as np

# One seeded generator is shared by every component built in this cell.
rng = np.random.default_rng(seed=0)

# Vectorized Pendulum environment used for preference-comparison training.
venv = make_vec_env(
    "Pendulum-v1",
    rng=rng,
)

# Reward network over the environment's observation and action spaces, with a
# RunningNorm layer normalizing its inputs.
reward_net = BasicRewardNet(
    venv.observation_space,
    venv.action_space,
    normalize_input_layer=RunningNorm,
)

# Components of the preference-comparison pipeline. They all draw from the
# shared `rng`.
fragmenter = preference_comparisons.RandomFragmenter(rng=rng, warning_threshold=0)
gatherer = preference_comparisons.SyntheticGatherer(rng=rng)

# The preference model wraps the reward network; the reward trainer fits it
# with a cross-entropy loss for 3 epochs per call.
preference_model = preference_comparisons.PreferenceModel(reward_net)
reward_trainer = preference_comparisons.BasicRewardTrainer(
    rng=rng,
    epochs=3,
    loss=preference_comparisons.CrossEntropyRewardLoss(),
    preference_model=preference_model,
)


# The hyperparameters used in this example (reward_epochs, ppo_clip_range,
# ppo_ent_coef, ppo_gae_lambda, ppo_n_epochs, discount_factor, use_sde,
# sde_sample_freq, ppo_lr, exploration_frac, num_iterations,
# initial_comparison_frac, initial_epoch_multiplier, query_schedule) have been
# approximately fine-tuned to reach a reasonable level of performance.
agent = PPO(
    policy=FeedForward32Policy,
    policy_kwargs={
        "features_extractor_class": NormalizeFeaturesExtractor,
        "features_extractor_kwargs": {"normalize_class": RunningNorm},
    },
    env=venv,
    seed=0,
    # Rollout / optimisation schedule: 2048 steps in total per update, split
    # evenly across the vectorized environments.
    n_steps=2048 // venv.num_envs,
    batch_size=64,
    n_epochs=10,
    learning_rate=2e-3,
    # PPO objective settings.
    clip_range=0.1,
    ent_coef=0.01,
    gae_lambda=0.95,
    gamma=0.97,
)

# Wrap the PPO agent so that it is trained against `reward_net` on `venv`,
# with an exploration fraction of 0.05.
trajectory_generator = preference_comparisons.AgentTrainer(
    venv=venv,
    algorithm=agent,
    reward_fn=reward_net,
    rng=rng,
    exploration_frac=0.05,
)

# Assemble the full preference-comparison training loop from the pieces built
# above: trajectory generator, reward network, fragmenter, preference gatherer
# and reward trainer.
pref_comparisons = preference_comparisons.PreferenceComparisons(
    trajectory_generator,
    reward_net,
    # Pipeline components.
    fragmenter=fragmenter,
    preference_gatherer=gatherer,
    reward_trainer=reward_trainer,
    # Schedule.
    num_iterations=5,  # Set to 60 for better performance
    query_schedule="hyperbolic",
    initial_comparison_frac=0.1,
    initial_epoch_multiplier=4,
    # Fragment sampling.
    fragment_length=100,
    transition_oversampling=1,
    allow_variable_horizon=False,
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Run preference-comparison training with a budget of 5,000 agent timesteps
# and 200 comparisons. The call is left as the last expression so that its
# return value (a dict with reward_loss and reward_accuracy) is displayed.
pref_comparisons.train(total_timesteps=5_000, total_comparisons=200)

Query schedule: [20, 51, 41, 34, 29, 25]
Collecting 40 fragments (4000 transitions)
Requested 3800 transitions but only 0 in buffer. Sampling 3800 additional transitions.
Sampling 200 exploratory transitions.
Creating fragment pairs
Gathering preferences
Dataset now contains 20 comparisons


Training reward model: 100%|██████████| 12/12 [00:00<00:00, 16.36it/s]

Training agent for 1000 timesteps





----------------------------------------------------
| raw/                                 |           |
|    agent/rollout/ep_len_mean         | 200       |
|    agent/rollout/ep_rew_mean         | -1.26e+03 |
|    agent/rollout/ep_rew_wrapped_mean | -37       |
|    agent/time/fps                    | 2052      |
|    agent/time/iterations             | 1         |
|    agent/time/time_elapsed           | 0         |
|    agent/time/total_timesteps        | 2048      |
----------------------------------------------------
-------------------------------------------------------
| mean/                                   |           |
|    agent/rollout/ep_len_mean            | 200       |
|    agent/rollout/ep_rew_mean            | -1.26e+03 |
|    agent/rollout/ep_rew_wrapped_mean    | -37       |
|    agent/time/fps                       | 2.05e+03  |
|    agent/time/iterations                | 1         |
|    agent/time/time_elapsed              | 0         |
|    agent/time/total_

Training reward model: 100%|██████████| 3/3 [00:00<00:00,  5.40it/s]

Training agent for 1000 timesteps





-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 200          |
|    agent/rollout/ep_rew_mean         | -1.17e+03    |
|    agent/rollout/ep_rew_wrapped_mean | -20          |
|    agent/time/fps                    | 2486         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 4096         |
|    agent/train/approx_kl             | 0.0012476778 |
|    agent/train/clip_fraction         | 0.053        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -1.41        |
|    agent/train/explained_variance    | -0.381       |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0701       |
|    agent/train/n_updates             | 10           |
|    agent/train/policy_gradient_loss  | -0.0017

Training reward model: 100%|██████████| 3/3 [00:00<00:00,  3.60it/s]

Training agent for 1000 timesteps





-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 200          |
|    agent/rollout/ep_rew_mean         | -1.16e+03    |
|    agent/rollout/ep_rew_wrapped_mean | -8.32        |
|    agent/time/fps                    | 2375         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 6144         |
|    agent/train/approx_kl             | 0.0019020251 |
|    agent/train/clip_fraction         | 0.0988       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -1.41        |
|    agent/train/explained_variance    | 0.662        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0565       |
|    agent/train/n_updates             | 20           |
|    agent/train/policy_gradient_loss  | -0.0049

Training reward model: 100%|██████████| 3/3 [00:01<00:00,  2.50it/s]

Training agent for 1000 timesteps





-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 200          |
|    agent/rollout/ep_rew_mean         | -1.17e+03    |
|    agent/rollout/ep_rew_wrapped_mean | -3.91        |
|    agent/time/fps                    | 2387         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 8192         |
|    agent/train/approx_kl             | 0.0021893103 |
|    agent/train/clip_fraction         | 0.104        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -1.41        |
|    agent/train/explained_variance    | 0.894        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0185       |
|    agent/train/n_updates             | 30           |
|    agent/train/policy_gradient_loss  | -0.0054

Training reward model: 100%|██████████| 3/3 [00:01<00:00,  2.03it/s]

Training agent for 1000 timesteps





-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 200          |
|    agent/rollout/ep_rew_mean         | -1.18e+03    |
|    agent/rollout/ep_rew_wrapped_mean | -4.17        |
|    agent/time/fps                    | 1613         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 1            |
|    agent/time/total_timesteps        | 10240        |
|    agent/train/approx_kl             | 0.0023718174 |
|    agent/train/clip_fraction         | 0.136        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -1.41        |
|    agent/train/explained_variance    | 0.93         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0397       |
|    agent/train/n_updates             | 40           |
|    agent/train/policy_gradient_loss  | -0.0076

Training reward model: 100%|██████████| 3/3 [00:01<00:00,  1.95it/s]

Training agent for 1000 timesteps





-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 200          |
|    agent/rollout/ep_rew_mean         | -1.19e+03    |
|    agent/rollout/ep_rew_wrapped_mean | -5.74        |
|    agent/time/fps                    | 2505         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 12288        |
|    agent/train/approx_kl             | 0.0041706255 |
|    agent/train/clip_fraction         | 0.194        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -1.39        |
|    agent/train/explained_variance    | 0.967        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00442     |
|    agent/train/n_updates             | 50           |
|    agent/train/policy_gradient_loss  | -0.0122

{'reward_loss': 0.0971360150059419, 'reward_accuracy': 0.96875}

In [4]:
from imitation.rewards.reward_wrapper import RewardVecEnvWrapper

# Wrap the environment together with the trained reward network's
# `predict_processed` function, so a fresh agent can be trained on the
# learned reward.
learned_reward_venv = RewardVecEnvWrapper(
    venv=venv,
    reward_fn=reward_net.predict_processed,
)

In [5]:
# Fresh PPO learner with the same hyperparameters as the agent used during
# preference comparisons, but trained on the learned-reward environment.
learner = PPO(
    policy=FeedForward32Policy,
    policy_kwargs={
        "features_extractor_class": NormalizeFeaturesExtractor,
        "features_extractor_kwargs": {"normalize_class": RunningNorm},
    },
    env=learned_reward_venv,
    seed=0,
    n_steps=2048 // learned_reward_venv.num_envs,
    batch_size=64,
    n_epochs=10,
    learning_rate=2e-3,
    clip_range=0.1,
    ent_coef=0.01,
    gae_lambda=0.95,
    gamma=0.97,
)
learner.learn(total_timesteps=1_000)  # Note: set to 100_000 to train a proficient expert

<stable_baselines3.ppo.ppo.PPO at 0x7a2f0c7752d0>

In [6]:
from stable_baselines3.common.evaluation import evaluate_policy

# Evaluate the learner on the original `venv` (not `learned_reward_venv`) and
# report the mean return with its standard error over the evaluation episodes.
n_eval_episodes = 10
reward_mean, reward_std = evaluate_policy(
    learner.policy,
    venv,
    n_eval_episodes=n_eval_episodes,
)
# Standard error of the mean = standard deviation / sqrt(number of episodes).
reward_stderr = reward_std / np.sqrt(n_eval_episodes)
print(f"Reward: {reward_mean:.0f} +/- {reward_stderr:.0f}")

Reward: -1307 +/- 125
