In [1]:
%matplotlib inline


# Behavioral cloning with PyTorch


We present here how to perform behavioral cloning on a Minari dataset using [PyTorch](https://pytorch.org/).
We will start generating the dataset of the expert policy for the [CartPole-v1](https://gymnasium.farama.org/environments/classic_control/cart_pole/) environment, which is a classic control problem.
The objective is to balance the pole on the cart, and we receive a reward of +1 for each successful timestep.



## Imports
For this tutorial you will need the [RL Baselines3 Zoo](https://github.com/DLR-RM/rl-baselines3-zoo) library, which you can install with `pip install rl_zoo3`.
Let's then import all the required packages and set the random seed for reproducibility:



In [7]:

import os
import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from gymnasium import spaces
from stable_baselines3 import PPO
from torch.utils.data import DataLoader
from tqdm.auto import tqdm

import minari
from minari import DataCollector

import sys
from rl_zoo3.train import train

torch.manual_seed(42)

<torch._C.Generator at 0x1158d6fd0>

## Policy training
Now we can train the expert policy using RL Baselines3 Zoo.
We train a PPO agent on the environment:



In [37]:
import sys
from rl_zoo3.train import train

sys.argv = [
    "python", 
    "--algo", "ppo",
    "--env", "LunarLanderContinuous-v3",
    "--n-timesteps", "1000000",
    "--track",
    "--wandb-project-name", "FRL",
    "--wandb-entity", "frankcholula",
    "--tensorboard-log", "runs",
    "--hyperparams",
    "n_envs:16",
    "n_steps:1024",
    "batch_size:64",
    "n_epochs:4",
    "gamma:0.999",
    "gae_lambda:0.98",
    "ent_coef:0.01"
]

train()

Seed: 1897547941


[34m[1mwandb[0m: Currently logged in as: [33mtsufanglu[0m ([33mfrankcholula[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Loading hyperparameters from: /Users/frankcholula/Workspace/school/FRL-playground/.frl/lib/python3.10/site-packages/rl_zoo3/hyperparams/ppo.yml
Default hyperparameters for environment (ones being tuned will be overridden):
OrderedDict([('batch_size', 64),
             ('ent_coef', 0.01),
             ('gae_lambda', 0.98),
             ('gamma', 0.999),
             ('n_envs', 16),
             ('n_epochs', 4),
             ('n_steps', 1024),
             ('n_timesteps', 1000000.0),
             ('policy', 'MlpPolicy')])
Using 16 environments
Overwriting n_timesteps with n=1000000
Creating test environment
Using cpu device
Log path: logs/ppo/LunarLanderContinuous-v3_4
Logging to runs/LunarLanderContinuous-v3__ppo__1897547941__1751888953/LunarLanderContinuous-v3/PPO_1
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 107      |
|    ep_rew_mean     | -268     |
| time/              |          |
|    fps             | 22817    |
|    iterations    

This will generate a new folder named `log` with the expert policy.



## Dataset generation
Now let's generate the dataset using the [DataCollector](https://minari.farama.org/api/data_collector/) wrapper:




In [38]:
env = DataCollector(gym.make("LunarLander-v3", continuous=True))
# path = os.path.abspath('') + '/logs/ppo/LunarLanderContinuous-v3_1/best_model.zip'
path = os.path.abspath('')+ "/code/models/ppo-LunarLander-v3/model.zip"
agent = PPO.load(path)

total_episodes = 1_000
for i in tqdm(range(total_episodes)):
    obs, _ = env.reset(seed=42)
    while True:
        action, _ = agent.predict(obs)
        obs, rew, terminated, truncated, info = env.step(action)

        if terminated or truncated:
            break

100%|██████████| 1000/1000 [02:07<00:00,  7.86it/s]


In [39]:
dataset = env.create_dataset(
    dataset_id="LunarLanderContinuous-v3/ppo-1000-v1",
    algorithm_name="ppo",
    code_permalink="https://github.com/frankcholula/FRL-playground/blob/main/code/behavioral_cloning.py",
    author="Frank Lu",
    author_email="lu.phrank@gmail.com",
    description="Behavioral cloning dataset for LunarLanderContinuous-v3 using PPO",
    eval_env="LunarLanderContinuous-v3"
)

In [None]:
# sanity check
dataset = minari.load_dataset(dataset_id="LunarLanderContinuous-v3/ppo-1000-v1")
env = dataset.recover_environment(eval_env = True, render_mode="human")
episode = dataset[110]

obs, _ = env.reset(seed=42)
for action in episode.actions:
    obs, rew, terminated, truncated, info = env.step(action)
    print(f"Action: {action}, Reward: {rew}")
    env.render()
    if terminated or truncated:
        break


Action: [-1. -1.], Reward: 2.080468461985818
Action: [-0.9139328  -0.84831226], Reward: 2.0691645157514507
Action: [-1.         -0.21528655], Reward: 1.6409476155531877
Action: [-0.8679179 -1.       ], Reward: 1.6760357860007968
Action: [-1.        -0.9008479], Reward: 1.645494362158638
Action: [-1. -1.], Reward: 1.5719021784162817
Action: [-1.  1.], Reward: -0.03407746207409332
Action: [-1.         -0.22359085], Reward: 0.6510353826908499
Action: [-0.30729365  1.        ], Reward: -0.2297617949425603
Action: [-1. -1.], Reward: 1.2353997514459831
Action: [-1.         -0.14613046], Reward: -0.06626395721497147
Action: [-0.7597282   0.48485947], Reward: -0.3657382003323164
Action: [-1. -1.], Reward: -0.05989879865518219
Action: [-0.7734264  0.7703807], Reward: -1.7086247477080883
Action: [-0.49912763 -0.16292202], Reward: -1.2862138396458818
Action: [-0.28232288  1.        ], Reward: -2.142116784980401
Action: [-0.50673634 -1.        ], Reward: -0.8093648874207406
Action: [-1.         0.

: 

Once executing the script, the dataset will be saved on your disk. You can display the list of datasets with ``minari list local`` command.



## Behavioral cloning with PyTorch
Now we can use PyTorch to learn the policy from the offline dataset.
Let's define the policy network:



In [14]:
class PolicyNetwork(nn.Module):
    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, 256)
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, output_dim)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        return x


In this scenario, the output dimension will be two, as previously mentioned. As for the input dimension, it will be four, corresponding to the observation space of ``CartPole-v1``.
Our next step is to load the dataset and set up the training loop. The ``MinariDataset`` is compatible with the PyTorch Dataset API, allowing us to load it directly using [PyTorch DataLoader](https://pytorch.org/docs/stable/data.html).
However, since each episode can have a varying length, we need to pad them.
To achieve this, we can utilize the [collate_fn](https://pytorch.org/docs/stable/data.html#working-with-collate-fn) feature of PyTorch DataLoader. Let's create the ``collate_fn`` function:



In [15]:
def collate_fn(batch):
    return {
        "id": torch.Tensor([x.id for x in batch]),
        # "seed": torch.Tensor([x.seed for x in batch]),
        # "total_timesteps": torch.Tensor([x.total_timesteps for x in batch]),
        "observations": torch.nn.utils.rnn.pad_sequence(
            [torch.as_tensor(x.observations) for x in batch],
            batch_first=True
        ),
        "actions": torch.nn.utils.rnn.pad_sequence(
            [torch.as_tensor(x.actions) for x in batch],
            batch_first=True
        ),
        "rewards": torch.nn.utils.rnn.pad_sequence(
            [torch.as_tensor(x.rewards) for x in batch],
            batch_first=True
        ),
        "terminations": torch.nn.utils.rnn.pad_sequence(
            [torch.as_tensor(x.terminations) for x in batch],
            batch_first=True
        ),
        "truncations": torch.nn.utils.rnn.pad_sequence(
            [torch.as_tensor(x.truncations) for x in batch],
            batch_first=True
        )
    }

We can now proceed to load the data and create the training loop.
To begin, let's initialize the DataLoader, neural network, optimizer, and loss.



In [None]:
minari_dataset = minari.load_dataset("LunarLanderContinuous-v3/ppo-1000-v1")
dataloader = DataLoader(minari_dataset, batch_size=256, shuffle=True, collate_fn=collate_fn)

env = minari_dataset.recover_environment()
observation_space = env.observation_space
action_space = env.action_space
# assert isinstance(observation_space, spaces.Box)
# assert isinstance(action_space, spaces.Discrete)

policy_net = PolicyNetwork(np.prod(observation_space.shape), action_space.n)
optimizer = torch.optim.Adam(policy_net.parameters())
loss_fn = nn.CrossEntropyLoss()

AttributeError: 'Box' object has no attribute 'n'

In [None]:
episode = minari_dataset[0]
print(episode)
print(episode.__dict__.keys())

We use the cross-entropy loss like a classic classification task, as the action space is discrete.
We then train the policy to predict the actions:



In [None]:
num_epochs = 32

for epoch in range(num_epochs):
    for batch in dataloader:
        a_pred = policy_net(batch['observations'][:, :-1])
        a_hat = F.one_hot(batch["actions"]).type(torch.float32)
        loss = loss_fn(a_pred, a_hat)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f"Epoch: {epoch}/{num_epochs}, Loss: {loss.item()}")

And now, we can evaluate if the policy learned from the expert!



In [None]:
env = gym.make("CartPole-v1", render_mode="human")
obs, _ = env.reset(seed=42)
done = False
accumulated_rew = 0
while not done:
    action = policy_net(torch.Tensor(obs)).argmax()
    obs, rew, ter, tru, _ = env.step(action.numpy())
    done = ter or tru
    accumulated_rew += rew

env.close()
print("Accumulated rew: ", accumulated_rew)

We can visually observe that the learned policy aces this simple control task, and we get the maximum reward 500, as the episode is truncated after 500 steps.


