Gymnasium's Pusher task: SAC TorchRL tutorial

In [1]:
!pip3 install torchrl
!pip3 install gymnasium[mujoco]
!pip3 install tqdm

Collecting gymnasium[mujoco]
  Downloading gymnasium-0.29.1-py3-none-any.whl.metadata (10 kB)
Collecting farama-notifications>=0.0.1 (from gymnasium[mujoco])
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl.metadata (558 bytes)
Collecting mujoco>=2.3.3 (from gymnasium[mujoco])
  Downloading mujoco-3.1.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m957.7 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting imageio>=2.14.1 (from gymnasium[mujoco])
  Downloading imageio-2.34.0-py3-none-any.whl.metadata (4.9 kB)
Collecting absl-py (from mujoco>=2.3.3->gymnasium[mujoco])
  Downloading absl_py-2.1.0-py3-none-any.whl.metadata (2.3 kB)
Collecting etils[epath] (from mujoco>=2.3.3->gymnasium[mujoco])
  Downloading etils-1.7.0-py3-none-any.whl.metadata (6.4 kB)
Collecting glfw (from mujoco>=2.3.3->gymnasium[mujoco])
  Downloading glfw-2.6.5-py2.py27.py3.py30.py3

In [27]:
# Torch
import torch
from tensordict.nn import TensorDictModule, InteractionType
from tensordict.nn.distributions import NormalParamExtractor

# Tensordict modules
from torch import multiprocessing

# Data collection
from torchrl.collectors import SyncDataCollector
from torchrl.data.replay_buffers import ReplayBuffer
from torchrl.data.replay_buffers.samplers import SamplerWithoutReplacement
from torchrl.data.replay_buffers.storages import LazyTensorStorage

# Env
from torchrl.envs.libs.gym import GymEnv
import gymnasium, gym
from torchrl.envs.utils import check_env_specs

# Model and policy
from torchrl.modules import MLP, ProbabilisticActor, TanhNormal
# Loss
from torchrl.objectives import SoftUpdate
from torchrl.objectives.sac import SACLoss

# Utils
torch.manual_seed(0)
from matplotlib import pyplot as plt
from tqdm import tqdm
import time

In [28]:
# Devices
# Use the first CUDA device only when CUDA is available and the multiprocessing
# start method is not "fork"; otherwise run on the CPU.
is_fork = multiprocessing.get_start_method() == "fork"
device = (
    torch.device(0)
    if torch.cuda.is_available() and not is_fork
    else torch.device("cpu")
)
gym_device = device  # Device handed to GymEnv below

# Sampling
frames_per_batch = 6_000  # Number of frames collected per training iteration
n_iters = 10  # Number of sampling and training iterations
total_frames = frames_per_batch * n_iters

# Training
num_epochs = 20  # Epochs per training iteration (each runs frames_per_batch // minibatch_size optimization steps)
minibatch_size = 400  # Size of the mini-batches in each optimization step
lr = 3e-4  # Learning rate

# SAC
# Distance loss forwarded to SACLoss(loss_function=...).
# NOTE(review): the original comment also listed 'mse' as an option — confirm it
# is an accepted value before switching to it.
value_loss='smooth_l1'
gamma = 0.9  # discount factor
polyak = 0.002  # Soft (Polyak) target-network update coefficient, consumed by make_loss_module

# Model
# NOTE(review): 258 looks like a typo for 256 — confirm before changing.
layers_config = [256, 258, 64, 32]  # Number of units per layer in the network

# GYM
max_steps = 100  # Assumed episode length; in this cell it is only used to derive num_vmas_envs
num_vmas_envs = (
    frames_per_batch // max_steps
)  # Number of max_steps-long episodes that fit in one batch (not used to build the env in this cell)
scenario_name = "Pusher-v4"

env = GymEnv(
    env_name=scenario_name,
    device=gym_device,
)

# Sanity-check the environment specs and run a short rollout.
check_env_specs(env)
n_rollout_steps = 5
rollout = env.rollout(n_rollout_steps)
# Keep the timestamp under its own name: assigning it to `time` would replace
# the imported `time` module and break this cell on re-run.
start_time = time.time()
print(start_time)

  logger.warn(
  logger.warn(
2024-02-23 22:59:06,826 [torchrl][INFO] check_env_specs succeeded!


1708729146.8301754


In [29]:
# Show the observation/action spaces and the flat observation size.
for label, value in (
    ("observation_space", env.observation_space),
    ("action_space", env.action_space),
    ("observation size", env.observation_space.shape[0]),
):
    print(label, value)

observation_space Box(-inf, inf, (23,), float64)
action_space Box(-2.0, 2.0, (7,), float32)
observation size 23


In [74]:
def make_policy_module(env, layers_config, device):
    """Build the stochastic actor used by SAC.

    An MLP maps the observation to ``2 * action_dim`` values, which
    ``NormalParamExtractor`` splits into a ``loc`` and a ``scale``. Those two
    entries parameterise a ``TanhNormal`` distribution whose bounds are read
    from ``env.action_spec``; the action is drawn from that distribution.

    Args:
        env: Environment exposing ``observation_space``, ``action_space`` and
            ``action_spec``.
        layers_config: Hidden-layer widths; its length is passed as the MLP depth.
        device: Device on which the MLP is created.

    Returns:
        A ``ProbabilisticActor`` that reads ``"observation"`` and writes
        ``"action"``; the wrapped parameter module is cast to double precision.
    """
    obs_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]

    # Two outputs per action dimension: one for loc, one for scale.
    backbone = MLP(
        in_features=obs_dim,
        out_features=action_dim * 2,
        device=device,
        depth=len(layers_config),
        num_cells=layers_config,
        activation_class=torch.nn.LeakyReLU,
    )

    # The extractor splits the last dimension into the (loc, scale) pair.
    loc_scale_net = torch.nn.Sequential(backbone, NormalParamExtractor())

    param_module = TensorDictModule(
        loc_scale_net,
        in_keys=["observation"],
        out_keys=["loc", "scale"],
    )

    bounds = env.action_spec.space
    return ProbabilisticActor(
        module=param_module.double(),
        in_keys=["loc", "scale"],
        out_keys=["action"],
        spec=env.action_spec,
        distribution_class=TanhNormal,
        distribution_kwargs={
            "min": bounds.minimum,
            "max": bounds.maximum,
            "tanh_loc": False,
        },
        default_interaction_type=InteractionType.RANDOM,
        return_log_prob=False,
    )

def make_Qval_module(env, layers_config, device):
    """Build the Q-value (critic) module used by SAC.

    The critic is an MLP with a single scalar output. Its input width is the
    observation size plus the action size, and it is fed the ``"observation"``
    and ``"action"`` entries.
    NOTE(review): this relies on the MLP joining its two inputs along the last
    dimension — confirm against the torchrl ``MLP`` documentation.

    Args:
        env: Environment exposing ``observation_space`` and ``action_space``.
        layers_config: Hidden-layer widths; its length is passed as the MLP depth.
        device: Device on which the MLP is created.

    Returns:
        A ``TensorDictModule`` cast to double precision that writes
        ``"state_action_value"``.
    """
    critic_in_features = env.observation_space.shape[0] + env.action_space.shape[0]

    critic_net = MLP(
        in_features=critic_in_features,
        out_features=1,  # one scalar value per (observation, action) pair
        device=device,
        depth=len(layers_config),
        num_cells=layers_config,
        activation_class=torch.nn.LeakyReLU,
    )

    critic = TensorDictModule(
        module=critic_net,
        in_keys=["observation", "action"],
        out_keys=["state_action_value"],
    )
    return critic.double()

In [75]:
def make_collector(env, policy, device, storing_device, frames_per_batch, init_random_frames, total_frames):
    """Create the ``SyncDataCollector`` that gathers experience from ``env``.

    Every argument is forwarded unchanged: ``env`` and ``policy`` positionally,
    the remaining ones as keyword arguments of the same name.

    Returns:
        The constructed ``SyncDataCollector``.
    """
    collector_options = {
        "device": device,
        "storing_device": storing_device,
        "frames_per_batch": frames_per_batch,
        "init_random_frames": init_random_frames,
        "total_frames": total_frames,
    }
    return SyncDataCollector(env, policy, **collector_options)

def make_replay_buffer(frames_per_batch, minibatch_size, device):
    """Create the replay buffer holding the frames of one collection batch.

    Args:
        frames_per_batch: Value given to ``LazyTensorStorage`` as its first argument.
        minibatch_size: Batch size returned by each ``sample()`` call.
        device: Device on which the storage keeps its data.

    Returns:
        A ``ReplayBuffer`` using a ``SamplerWithoutReplacement`` sampler.
    """
    # Storage sized for the frames collected at each iteration.
    storage = LazyTensorStorage(frames_per_batch, device=device)
    sampler = SamplerWithoutReplacement()
    return ReplayBuffer(
        storage=storage,
        sampler=sampler,
        batch_size=minibatch_size,
    )

In [76]:
def make_loss_module(env, policy, Qval, value_loss, gamma, polyak, lr):
    """Create the SAC loss, its target-network updater and the optimizer.

    Args:
        env: Environment; only ``env.action_spec`` is read.
        policy: Actor network given to ``SACLoss``.
        Qval: Q-value network given to ``SACLoss``.
        value_loss: Name of the distance loss, forwarded as ``loss_function``.
        gamma: Discount factor for the value estimator.
        polyak: Soft-update step size (Polyak ``tau``): the fraction of the
            online parameters blended into the target network at each update.
            Small values (e.g. 0.002) give a slowly moving target.
        lr: Learning rate of the Adam optimizer.

    Returns:
        Tuple ``(loss_module, target_net_updater, optim)``.
    """
    loss_module = SACLoss(
        actor_network=policy,
        qvalue_network=Qval,
        loss_function=value_loss,
        action_spec=env.action_spec,
    )
    loss_module.make_value_estimator(gamma=gamma)

    # SoftUpdate's `eps` is the weight kept on the previous target parameters
    # (tau = 1 - eps). Passing `polyak` directly as `eps` would retain only
    # `polyak` of the old target, i.e. an almost hard copy at every step, so
    # convert the step size into the retention weight.
    target_net_updater = SoftUpdate(loss_module, eps=1.0 - polyak)

    optim = torch.optim.Adam(loss_module.parameters(), lr=lr)

    return loss_module, target_net_updater, optim

In [77]:
def training_loop(
    env,
    loss_module,
    target_net_updater,
    optim,
    collector,
    replay_buffer,
    n_iters,
    num_epochs,
    frames_per_batch,
    minibatch_size):
    """Run the collect / optimize loop and return the per-iteration reward log.

    For every batch yielded by ``collector`` the batch is flattened and pushed
    to ``replay_buffer``; then ``num_epochs`` epochs of
    ``frames_per_batch // minibatch_size`` optimization steps are run. Each
    step sums the actor, Q-value and alpha losses, takes one optimizer step and
    one target-network update.

    Args:
        env: Unused; kept for interface compatibility.
        loss_module: Callable returning ``"loss_actor"``, ``"loss_qvalue"`` and
            ``"loss_alpha"`` for a sampled mini-batch.
        target_net_updater: Object whose ``step()`` updates the target network.
        optim: Optimizer over the loss module's parameters.
        collector: Iterable of collected batches; also provides
            ``update_policy_weights_()`` and ``shutdown()``.
        replay_buffer: Buffer providing ``extend()`` and ``sample()``.
        n_iters: Total shown on the progress bar.
        num_epochs: Epochs per collected batch.
        frames_per_batch: Frames per collected batch.
        minibatch_size: Mini-batch size; with ``frames_per_batch`` it sets the
            number of optimization steps per epoch.

    Returns:
        List with one value per collected batch: the mean of the rewards at the
        steps flagged ``done``.
    """
    episode_reward_mean_list = []
    steps_per_epoch = frames_per_batch // minibatch_size

    pbar = tqdm(total=n_iters, desc="episode_reward_mean = 0")

    try:
        for tensordict_data in collector:
            # Flatten the batch dimensions so the buffer stores individual frames.
            data_view = tensordict_data.reshape(-1)
            replay_buffer.extend(data_view)

            for _ in range(num_epochs):
                for _ in range(steps_per_epoch):
                    subdata = replay_buffer.sample()

                    loss_vals = loss_module(subdata)

                    loss_value = (
                        loss_vals["loss_actor"]
                        + loss_vals["loss_qvalue"]
                        + loss_vals["loss_alpha"]
                    )

                    optim.zero_grad()
                    loss_value.backward()
                    optim.step()

                    # update qnet_target params
                    target_net_updater.step()

            # update weights of the inference policy
            collector.update_policy_weights_()

            # Logging: mean reward over the steps where `done` is set.
            done = tensordict_data.get(("next", "done"))
            episode_reward_mean = (
                tensordict_data.get(("next", "reward"))[done].mean().item()
            )
            episode_reward_mean_list.append(episode_reward_mean)
            pbar.set_description(f"episode_reward_mean = {episode_reward_mean}", refresh=False)
            pbar.update()
    finally:
        # Release resources even when an iteration raises, so a failed run does
        # not leave the progress bar open or the collector alive.
        pbar.close()
        collector.shutdown()
        # NOTE(review): not every replay-buffer object exposes shutdown(); call
        # it only when present so a missing hook cannot discard a finished run.
        shutdown_buffer = getattr(replay_buffer, "shutdown", None)
        if callable(shutdown_buffer):
            shutdown_buffer()

    return episode_reward_mean_list

In [78]:
# Build the actor and critic, wire up data collection and the loss, then train.
policy = make_policy_module(env=env, layers_config=layers_config, device=device)
Qval = make_Qval_module(env=env, layers_config=layers_config, device=device)
collector = make_collector(
    env=env,
    policy=policy,
    device=device,
    storing_device=device,
    frames_per_batch=frames_per_batch,
    init_random_frames=0,
    total_frames=total_frames,
)
replay_buffer = make_replay_buffer(
    frames_per_batch=frames_per_batch,
    minibatch_size=minibatch_size,
    device=device,
)
loss_module, target_net_updater, optim = make_loss_module(
    env=env,
    policy=policy,
    Qval=Qval,
    value_loss=value_loss,
    gamma=gamma,
    polyak=polyak,
    lr=lr,
)
episode_reward_mean_list = training_loop(
    env=env,
    loss_module=loss_module,
    target_net_updater=target_net_updater,
    optim=optim,
    collector=collector,
    replay_buffer=replay_buffer,
    n_iters=n_iters,
    num_epochs=num_epochs,
    frames_per_batch=frames_per_batch,
    minibatch_size=minibatch_size,
)


[A


episode_reward_mean = 0:   0%|          | 0/10 [04:54<?, ?it/s]

[A
[A
[A
[A
[A
[A

RuntimeError: Converting a tensordict to boolean value is not permitted