In [None]:
# This is a demonstration notebook showing the SAC and PPO agents, along with SignalTesterEnv registration.

# rl_env.py: The SignalTesterEnv environment used by the agents
# This file now exposes a helper to register a gym id so cleanRL scripts (which expect an env id) can call gym.make("SignalTester-v0")
import gymnasium as gym

from rl_env import SignalTesterEnv, register_cleanrl_env

# Example registration: real data is needed before gym.make can run.
# The registered id "SignalTester-v0" is what would be passed as env_id to sac_continuous_action.py
# and ppo_continuous_action.py, in the same way env_id is passed in the examples below.
# register_cleanrl_env(env_id="SignalTester-v0", data=df, news_documents=news)
# env = gym.make("SignalTester-v0")

In [None]:
# Monkeypatch gymnasium.vector.SyncVectorEnv to handle missing final_observation
# This fixes the KeyError: 'final_observation' when using older gymnasium versions or specific wrappers
import gymnasium as gym
from gymnasium.vector import SyncVectorEnv

# Recover the pristine method when this cell is re-run: an already patched step
# carries it in the `_unpatched_step` attribute set further down, so repeated
# executions never stack one wrapper on top of another.
original_step = getattr(SyncVectorEnv.step, "_unpatched_step", SyncVectorEnv.step)


def patched_step(self, actions):
    """Step the vector env and make sure ``infos["final_observation"]`` exists.

    The call is delegated to the unpatched ``SyncVectorEnv.step``. When the
    returned ``infos`` has no ``"final_observation"`` entry, one is added:

    * from ``infos["final_obs"]`` if that key is present (the name used by
      gymnasium 1.x for the same data), otherwise
    * from a copy of ``next_obs`` if any sub-environment was truncated.

    The ``next_obs`` fallback only prevents a ``KeyError`` in callers that index
    ``infos["final_observation"]``; depending on the autoreset behaviour of the
    installed gymnasium, ``next_obs`` may be the post-reset observation rather
    than the true terminal one.

    Args:
        actions: Batch of actions, forwarded unchanged to the original ``step``.

    Returns:
        The ``(next_obs, rewards, terminations, truncations, infos)`` tuple
        produced by the original ``step``, with ``infos`` possibly extended.
    """
    next_obs, rewards, terminations, truncations, infos = original_step(self, actions)
    if "final_observation" not in infos:
        if "final_obs" in infos:
            # Prefer the real terminal observations when they are available
            # under the renamed key instead of substituting next_obs.
            infos["final_observation"] = infos["final_obs"]
        elif any(truncations):
            # Workaround to prevent crashing: no terminal observation was
            # reported at all, so stand in next_obs for the truncated slots.
            infos["final_observation"] = next_obs.copy()
    return next_obs, rewards, terminations, truncations, infos


# Marker read by the getattr() above on subsequent runs of this cell.
patched_step._unpatched_step = original_step
SyncVectorEnv.step = patched_step
print("Applied SyncVectorEnv monkeypatch for final_observation")

In [None]:
# Example 1: SAC (Soft Actor-Critic) Training
from CleanRL_API.sac_continuous_action import train as train_sac, Args as SACArgs

# SAC settings, grouped by role. Pendulum-v1 is a standard gym environment used
# only for this demo; the main CleanRL.example.ipynb notebook uses
# SignalTesterEnv instead.
sac_args = SACArgs(
    # -- run setup --
    run_name="api_demo_sac",
    env_id="Pendulum-v1",
    seed=42,
    total_timesteps=5000,  # kept short for the demo
    # -- buffer and batching --
    buffer_size=10000,
    batch_size=256,
    learning_starts=1000,
    # -- learning rates and update schedule --
    policy_lr=3e-4,
    q_lr=1e-3,
    gamma=0.99,
    tau=0.005,
    policy_frequency=2,
    target_network_frequency=1,
    # -- alpha settings --
    alpha=0.2,
    autotune=True,
    # -- network size --
    hidden_size=64,
)

print("Starting SAC Training...")
sac_agent = train_sac(sac_args)  # train() hands back the actor
print("SAC Training Complete!")

In [None]:
# Example 2: PPO (Proximal Policy Optimization) Training
from CleanRL_API.ppo_continuous_action import train as train_ppo, Args as PPOArgs

# PPO settings, grouped by role. Same demo environment and seed as the SAC
# example above in this cell's sibling; values are passed straight to PPOArgs.
ppo_args = PPOArgs(
    # -- run setup --
    run_name="api_demo_ppo",
    env_id="Pendulum-v1",
    seed=42,
    total_timesteps=5000,
    # -- rollout collection --
    num_envs=4,  # PPO runs on vectorized environments
    num_steps=128,
    gamma=0.99,
    gae_lambda=0.95,
    # -- optimisation schedule --
    learning_rate=3e-4,
    anneal_lr=True,
    num_minibatches=4,
    update_epochs=4,
    max_grad_norm=0.5,
    # -- loss terms --
    norm_adv=True,
    clip_coef=0.2,
    clip_vloss=True,
    ent_coef=0.01,
    vf_coef=0.5,
    target_kl=None,
    # -- network size --
    hidden_size=64,
)

print("Starting PPO Training...")
ppo_agent = train_ppo(ppo_args)  # train() hands back the Agent (ActorCritic module)
print("PPO Training Complete!")