In [None]:
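# Optional one-time setup: uncomment the lines below to install stable-baselines3 and imitation from source.
# !git clone https://github.com/DLR-RM/stable-baselines3.git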
# %cd stable-baselines3/
# !python setup.py install
# %cd ..
# !git clone https://github.com/HumanCompatibleAI/imitation.git
# %cd imitation
# !python setup.py install
# %cd ..

In [None]:
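# If you ran the install commands above, restart the runtime before continuing
# so that the newly installed packages are picked up.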

Imitation Learning experiments:

Conclusion: Behavior Cloning (BC) works, but DAgger performs better.

In [None]:
import gym
import numpy as np
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.ppo import MlpPolicy

from imitation.algorithms import bc
from imitation.data import rollout
from imitation.data.wrappers import RolloutInfoWrapper

In [None]:
# Shared setup: a NumPy random generator with a fixed seed and the CartPole-v1
# environment. The two statements are independent of each other.
rng = np.random.default_rng(seed=0)
env = gym.make("CartPole-v1")

In [None]:
## Behavior Cloning (BC)

In [None]:
# Train the PPO "expert" on CartPole-v1; its behaviour is what the BC learner
# imitates later in the notebook.
#
# Both the NumPy generator and PPO receive a fixed seed. Previously only `rng`
# was seeded, so the expert (and every rollout / cloned policy derived from it)
# changed on each run of the notebook.
rng = np.random.default_rng(0)
env = gym.make("CartPole-v1")
expert = PPO(policy=MlpPolicy, env=env, seed=0)
expert.learn(3000)

In [None]:
# Estimate the expert's mean episode reward as a baseline for the imitation
# learners. The mean is taken over 50 episodes rather than a single one: one
# CartPole episode is a very noisy estimate, and 50 is the episode count used
# when the BC policy is evaluated, so the two numbers are directly comparable.
# The second return value (discarded) is the standard deviation of the reward.
reward, _ = evaluate_policy(
    expert.policy,  # type: ignore[arg-type]
    env,
    n_eval_episodes=50,
    render=False,
)
print(f"Reward of expert: {reward}")

In [None]:
# Collect expert demonstrations. The environment is wrapped in
# RolloutInfoWrapper and vectorised, and sampling continues until the
# `min_episodes=50` stopping condition is met (no timestep minimum is set).
rollout_venv = DummyVecEnv([lambda: RolloutInfoWrapper(env)])
sample_until = rollout.make_sample_until(min_timesteps=None, min_episodes=50)
rollouts = rollout.rollout(expert, rollout_venv, sample_until, rng=rng)

In [None]:
# Flatten the collected expert rollouts into `transitions`, the demonstration
# data that is handed to the BC trainer.
transitions = rollout.flatten_trajectories(rollouts)

In [None]:
# Build the behaviour-cloning trainer: it needs the environment's observation
# and action spaces, the expert transitions to learn from, and the shared RNG.
obs_space = env.observation_space
act_space = env.action_space
bc_trainer = bc.BC(
    observation_space=obs_space,
    action_space=act_space,
    demonstrations=transitions,
    rng=rng,
)

In [None]:
# Fit the BC policy to the expert demonstrations for 10 epochs.
bc_trainer.train(n_epochs=10)

In [None]:
# Mean episode reward of the cloned policy over 50 evaluation episodes; the
# second return value (the standard deviation) is discarded.
reward, _ = evaluate_policy(
    bc_trainer.policy,
    env,
    n_eval_episodes=50,
)
print("Reward:", reward)

In [None]:
### DAgger (Dataset Aggregation) algorithm

In [None]:
import tempfile
from imitation.algorithms.dagger import SimpleDAggerTrainer

In [None]:
# Set up the DAgger experiment: a freshly seeded generator, a PPO expert, the
# vectorised environment that is passed to the DAgger trainer, and a BC trainer
# created without demonstrations.
#
# PPO is seeded as well as `rng`. Previously only `rng` was seeded, so the
# expert, and therefore the DAgger result, differed on every run.
rng = np.random.default_rng(0)
env = gym.make("CartPole-v1")
expert = PPO(policy=MlpPolicy, env=env, seed=0)
expert.learn(1000)
venv = DummyVecEnv([lambda: gym.make("CartPole-v1")])

# NOTE: this rebinds `bc_trainer`, replacing the trainer built in the
# Behavior Cloning section; re-run that section before re-evaluating plain BC.
bc_trainer = bc.BC(
    observation_space=env.observation_space,
    action_space=env.action_space,
    rng=rng,
)

In [None]:
# Train with DAgger, using `expert` as the expert policy and `bc_trainer` as the
# learner. The scratch directory is temporary: it is removed when the `with`
# block exits, while `dagger_trainer` itself stays bound afterwards.
with tempfile.TemporaryDirectory(prefix="dagger_example_") as tmpdir:
    print(tmpdir)
    dagger_trainer = SimpleDAggerTrainer(
        venv=venv, scratch_dir=tmpdir, expert_policy=expert, bc_trainer=bc_trainer, rng=rng
    )
    dagger_trainer.train(total_timesteps=2000)

In [None]:
# Mean episode reward of the DAgger-trained policy over 10 evaluation episodes;
# the second return value (the standard deviation) is discarded.
reward, _ = evaluate_policy(
    dagger_trainer.policy,
    env,
    n_eval_episodes=10,
)
print("Reward:", reward)