# SAC smoke test on HalfCheetah-v5

Minimal setup that instantiates the SAC agent with a short HalfCheetah-v5 training run and a single `predict()` call on a fresh evaluation environment, so we can sanity-check the wiring before running long experiments.


In [None]:
import gymnasium as gym
import torch
from pathlib import Path

from rlopt.agent.sac import SAC, SACRLOptConfig
from rlopt.config_base import NetworkConfig
from rlopt.env_utils import env_maker


def build_halfcheetah_config(total_frames: int = 1024) -> SACRLOptConfig:
    """Return a minimally tuned SAC config for HalfCheetah-v5 smoke tests.

    Args:
        total_frames: Total number of environment frames the collector is
            configured to gather (stored in ``cfg.collector.total_frames``).

    Returns:
        A ``SACRLOptConfig`` targeting CPU with environment, collector,
        replay-buffer, optimizer, logger, policy and Q-function fields set.
    """

    cfg = SACRLOptConfig()
    cfg.seed = 7
    cfg.device = "cpu"

    # Environment + collector knobs -----------------------------------------
    cfg.env.env_name = "HalfCheetah-v5"
    cfg.env.library = "gymnasium"
    cfg.env.num_envs = 8

    cfg.collector.frames_per_batch = 256
    cfg.collector.total_frames = total_frames
    # Cap the random warm-up at half of the frame budget. With a fixed 25_000
    # and the default total_frames=1024 the warm-up would exceed the entire
    # run (presumably leaving the learned policy unused -- confirm against
    # rlopt's collector handling). Budgets >= 50_000 keep the usual 25_000.
    cfg.collector.init_random_frames = min(25_000, max(total_frames, 0) // 2)
    cfg.collector.prefetch = 1

    # Replay + optimization -------------------------------------------------
    cfg.loss.mini_batch_size = 256
    cfg.replay_buffer.size = 1_000_000
    cfg.replay_buffer.prefetch = 1
    cfg.optim.lr = 3e-4
    cfg.optim.scheduler = None
    cfg.optim.target_update_polyak = 0.995
    cfg.sac.utd_ratio = 1.0

    # Lightweight logging so the notebook runs without external services ----
    log_dir = Path.cwd() / "notebook_logs"
    cfg.logger.backend = ""
    cfg.logger.log_to_file = True
    cfg.logger.log_dir = str(log_dir)
    cfg.logger.exp_name = "sac_halfcheetah_smoketest"
    cfg.logger.python_level = "info"

    # Network dimensions depend on env specs --------------------------------
    # A throwaway env is created only to read the space shapes; close it even
    # if reading the shapes fails so the simulator is not leaked.
    dummy_env = gym.make(cfg.env.env_name)
    try:
        obs_dim = dummy_env.observation_space.shape[0]
        action_dim = dummy_env.action_space.shape[0]
    finally:
        dummy_env.close()

    cfg.policy.input_dim = obs_dim
    cfg.policy.num_cells = [256, 256]
    cfg.policy.activation_fn = "relu"

    # The Q-function consumes the observation and the action together, so its
    # input width is the sum of both dimensions.
    cfg.q_function = NetworkConfig(
        num_cells=[256, 256],
        input_dim=obs_dim + action_dim,
        input_keys=["observation", "action"],
        activation_fn="relu",
    )

    return cfg


# ---------------------------------------------------------------------------
# Training run: build the config, create the training env, fit the agent.
# NOTE(review): 1_000_000 frames is a long run for something described as a
# smoke test -- confirm this budget is intentional.
cfg = build_halfcheetah_config(total_frames=1_000_000)
train_env = env_maker(cfg, device=cfg.device)
agent = SAC(config=cfg, env=train_env)
agent.train()

# Sanity check of predict(): reset a freshly built eval env, move the reset
# output to the agent's device, and ask the agent for one action.
eval_env = env_maker(cfg, device=cfg.device)
with torch.no_grad():
    td = eval_env.reset()
    td = td.to(agent.device)
    # predict() receives a clone, so `td` itself is left untouched.
    action = agent.predict(td.clone())
print(
    f"Deterministic action sample (shape={tuple(action.shape)}):\n{action}"
)



  r = torch._C._cuda_getDeviceCount() if nvml_count < 0 else nvml_count
Model Overview:
<rlopt.agent.sac.sac.SAC object at 0x7f95f8a9a4d0>
Policy Network:
ProbabilisticActor(
    module=ModuleList(
      (0): TensorDictModule(
          module=Sequential(
            (0): MLP(
              (0): Linear(in_features=17, out_features=256, bias=True)
              (1): ReLU()
              (2): Linear(in_features=256, out_features=256, bias=True)
              (3): ReLU()
              (4): Linear(in_features=256, out_features=12, bias=True)
            )
            (1): NormalParamExtractor(
              (scale_mapping): biased_softplus()
            )
          ),
          device=cpu,
          in_keys=['observation'],
          out_keys=['loc', 'scale'])
      (1): SafeProbabilisticModule(
          in_keys=['loc', 'scale'],
          out_keys=['action'],
          distribution_class=<class 'torchrl.modules.distributions.continuous.TanhNormal'>, 
          distribution_kwargs={'low':

RuntimeError: Converting a tensordict to boolean value is not permitted

  8%|▊         | 75264/1000000 [09:40<2:48:25, 91.51it/s, r_step=-0.50, r_ep=-289.0, n_ep=75, π_loss=57.408, α=0.000]