In [18]:
import os
import sys
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '..')))

import gymnasium as gym
from stable_baselines3 import PPO, SAC
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor

import stlrom
import numpy as np
from rlrom.wrappers import STLWrapper
from rlrom.envs import *

In [19]:
# Pick the 'Pendulum' entry out of cfg_envs (presumably supplied by the
# `from rlrom.envs import *` in the import cell -- confirm).
cfg = cfg_envs["Pendulum"]

In [25]:
# Show the supported environment names, preceded by an empty first entry.
print(["", *supported_envs])

['', 'LunarLander', 'highway-env', 'MountainCarContinuous', 'CartPole', 'Pendulum', 'MountainCar', 'LunarLanderContinuous', 'BipedalWalker', 'CarRacing', 'BipedalWalkerHardcore', 'Acrobot']


## Training

In [None]:
# Instantiate the environment whose id is stored under cfg['env_name'].
env = gym.make(cfg["env_name"])


In [None]:
import gymnasium as gym
import numpy as np

# --- Parameters ---
ENV_ID = "Pendulum-v1"
# Fixed seed handed to SAC so that re-running this cell starts training from
# the same random state. The evaluation environment below is NOT seeded.
SEED = 42
# Hyperparameters inspired by RL Baselines3 Zoo for SAC on Pendulum-v1
# (Note: Zoo often uses more advanced setups like VecNormalize, but this provides the core SAC params)
tuned_params = {
    'learning_rate': 7.3e-4, # Often represented as linear_schedule(7.3e-4) in the zoo, but constant works too
    'buffer_size': 50000,
    'learning_starts': 100,
    'batch_size': 256,
    'tau': 0.005,
    'gamma': 0.99,
    'train_freq': 1,
    'gradient_steps': 1,
    'ent_coef': 'auto', # Automatically tune entropy coefficient
    'target_update_interval': 1,
    'target_entropy': 'auto',
    'use_sde': False, # State Dependent Exploration - can sometimes help, but often not needed for Pendulum
    'sde_sample_freq': -1,
    'policy_kwargs': dict(net_arch=[64, 64]) # Neural network architecture
}
TOTAL_TIMESTEPS = 50_000 # Adjust as needed, 50k-100k is often enough for Pendulum
N_EVAL_EPISODES = 10
MODEL_SAVE_PATH = "./sac_pendulum_tuned"

# --- Environment Setup ---
# Training environment, wrapped with Monitor for logging
env = Monitor(gym.make(ENV_ID))

# Separate environment used only for the final evaluation
eval_env = Monitor(gym.make(ENV_ID)) # Wrap eval env too if you want stats from evaluation

# Everything that uses the environments runs inside try/finally so that both
# are closed even when training fails or the kernel is interrupted.
try:
    # --- Model Definition ---
    # Use the tuned hyperparameters
    model = SAC(
        "MlpPolicy",
        env,
        verbose=1, # Print training progress
        seed=SEED,
        **tuned_params
    )

    print(f"--- Training SAC on {ENV_ID} for {TOTAL_TIMESTEPS} timesteps ---")
    print(f"Hyperparameters: {tuned_params}")

    # --- Training ---
    # The learn() method handles the training loop.
    # For off-policy algorithms such as SAC, log_interval counts completed
    # episodes (not gradient updates): a log line is emitted every 10 episodes.
    model.learn(total_timesteps=TOTAL_TIMESTEPS, log_interval=10)

    print("--- Training Finished ---")

    # --- Saving the Model ---
    model.save(MODEL_SAVE_PATH)
    print(f"Model saved to {MODEL_SAVE_PATH}.zip")

    # --- Evaluation ---
    print("--- Evaluating Trained Agent ---")
    mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=N_EVAL_EPISODES)
    print(f"Evaluation over {N_EVAL_EPISODES} episodes:")
    print(f"Mean reward: {mean_reward:.2f} +/- {std_reward:.2f}")

    # --- Optional: Load and Run Trained Agent ---
    # del model # Remove trained model from memory
    # loaded_model = SAC.load(MODEL_SAVE_PATH, env=eval_env)
    # print("--- Running Loaded Agent ---")
    # obs, _ = eval_env.reset()
    # total_reward = 0
    # terminated = False
    # truncated = False
    # while not terminated and not truncated:
    #     action, _states = loaded_model.predict(obs, deterministic=True) # Use deterministic actions for evaluation
    #     obs, reward, terminated, truncated, info = eval_env.step(action)
    #     total_reward += reward
    #     # Optional: render the environment
    #     # eval_env.render()
    # print(f"Reward in one episode run: {total_reward}")
finally:
    eval_env.close()
    env.close()

## Testing with STLWrapper only


In [None]:
# Re-create the configured environment with on-screen ('human') rendering and
# load the SAC checkpoint saved as "sac_pendulum_tuned" against it.
env = gym.make(cfg["env_name"], render_mode="human")
# Alternative checkpoint:
# loaded_model = PPO.load("ppo_pendulum_optimized", env=env)
loaded_model = SAC.load("sac_pendulum_tuned", env=env)

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [14]:
# Build an STL driver, feed it the specification text stored under
# cfg['specs'], then wrap the environment with it (cfg doubles as signals_map).
# NOTE: `env` is rebound to the wrapper, so running this cell a second time
# wraps the already-wrapped environment again.
driver = stlrom.STLDriver()
driver.parse_string(cfg["specs"])
env = STLWrapper(env, driver, signals_map=cfg)

In [15]:
# Play one episode with the loaded policy (deterministic actions) and sum up
# the rewards until the episode either terminates or is truncated.
total_reward = 0
terminated = truncated = False
obs, _ = env.reset()
while not (terminated or truncated):
    action, _states = loaded_model.predict(obs, deterministic=True)
    obs, reward, terminated, truncated, info = env.step(action)
    total_reward += reward
    # NOTE(review): the env was created with render_mode='human'; this explicit
    # call may be redundant -- confirm before removing.
    env.render()

print(f"Total reward during test: {total_reward}")
env.close()

Total reward during test: -239.28462995359527


# Testing with RLModelTester

In [17]:
from rlrom.testers import RLModelTester

In [None]:

# Environment id and saved-model file name, followed by a fresh RLModelTester
# (presumably consumed by later statements or cells -- not visible here).
env_name = cfg["env_name"]
model_name = "sac_pendulum_tuned.zip"
tester = RLModelTester()