In [None]:
import numpy as np
import time
import os

import gymnasium as gym
import stable_baselines3 as sb

from stable_baselines3.common.evaluation import evaluate_policy

In [None]:
# Render the model acting in its environment to watch its learning progress
def show_progress(model, time_steps = 1000, deterministic = True):
    """
    Roll the model's policy forward in its own environment, rendering each step.

    :param model: object exposing get_env() and predict(obs, deterministic=...)
    :param time_steps: (int) number of environment steps to play
    :param deterministic: (bool) forwarded to model.predict()
    :return: None
    """
    vec_env = model.get_env()
    observation = vec_env.reset()
    for _ in range(time_steps):
        # predict() yields (action, state); the state is not needed here
        chosen_action, _unused_state = model.predict(observation, deterministic = deterministic)
        # Only the next observation is kept; reward, done flag and info are discarded
        observation, _, _, _ = vec_env.step(chosen_action)
        vec_env.render("human")

# Save & Load

Guardar el modelo permite tener redes preentrenadas para no arrancar de cero cada vez si se quiere probar algo

Además, se pueden cambiar los parámetros del modelo luego de cargarlo. En este caso se cambian `gamma` y `verbose`.

In [None]:
# Create the save directory (original author's note: it ends up on the C: drive)
save_dir = "/tmp/gym/"
os.makedirs(save_dir, exist_ok=True)
# Build the path once with os.path.join: save_dir already ends with a slash, so
# plain string concatenation would produce "/tmp/gym//A2C_tutorial"
model_path = os.path.join(save_dir, "A2C_tutorial")

model = sb.A2C('MlpPolicy', 'Pendulum-v1', verbose=0, gamma=0.9, n_steps=20).learn(8000)
# The model will be saved under A2C_tutorial.zip
model.save(model_path)

del model # delete trained model to demonstrate loading

# Load the model; the extra keyword arguments override the stored values
# (here verbose is set to 1 and gamma to 0.8)
loaded_model = sb.A2C.load(model_path, verbose = 1, gamma = .8)

# Show the hyperparameters of the loaded model (gamma reflects the override above)
print("loaded:", "gamma =", loaded_model.gamma, "n_steps =", loaded_model.n_steps)

# As the environment is not serializable, we need to set a new instance of the environment
loaded_model.set_env(sb.common.vec_env.DummyVecEnv([lambda: gym.make('Pendulum-v1')]))
# and continue training
loaded_model.learn(8000)

# Wrappers

A gym wrapper follows the gym interface: it has a reset() and step() method.

The wrapped environment is available as `self.env`, which makes it easy to interact with it without modifying the original env.

In [None]:
# Basic anatomy of a wrapper (template, gymnasium-style API)
# class CustomWrapper(gym.Wrapper):
#   """
#   :param env: (gym.Env) Gym environment that will be wrapped
#   """
#   def __init__(self, env):
#     # Call the parent constructor, so we can access self.env later
#     super().__init__(env)
  
#   def reset(self, **kwargs):
#     """
#     Reset the environment
#     :return: (np.ndarray, dict) initial observation, additional information
#     """
#     obs, info = self.env.reset(**kwargs)
#     return obs, info

#   def step(self, action):
#     """
#     :param action: ([float] or int) Action taken by the agent
#     :return: (np.ndarray, float, bool, bool, dict) observation, reward, has the episode terminated?, was it truncated?, additional information
#     """
#     obs, reward, done, truncated, info = self.env.step(action)
#     return obs, reward, done, truncated, info

### Adding/Changing the TimeLimit per Episode

In [34]:
# Let's make a modification:
#   limit the number of time steps per episode
class CustomWrapper(gym.Wrapper):
  """
  Wrapper that truncates every episode after a fixed number of steps.

  :param env: (gym.Env) Gym environment that will be wrapped
  :param max_steps: (int) Max number of steps per episode
  """
  def __init__(self, env, max_steps = 100):
    # Call the parent constructor, so we can access self.env later.
    # Zero-argument super() does not look the class up by name, so it keeps
    # working even if the name CustomWrapper is rebound later in the notebook.
    super().__init__(env)
    self.max_steps = max_steps
    # Counter of steps per episode
    self.current_step = 0

  def reset(self, **kwargs):
    """
    Reset the environment and the per-episode step counter.

    :param kwargs: keyword arguments (e.g. seed, options) forwarded unchanged
      to the wrapped environment's reset()
    :return: whatever the wrapped environment's reset() returns
    """
    # Reset the counter
    self.current_step = 0
    return self.env.reset(**kwargs)

  def step(self, action):
    """
    Step the wrapped environment and flag truncation once max_steps is reached.

    :param action: ([float] or int) Action taken by the agent
    :return: (np.ndarray, float, bool, bool, dict) observation, reward,
      done flag of the wrapped env, truncated flag, additional information
    """
    self.current_step += 1
    obs, reward, done, truncated, info = self.env.step(action)
    # Overwrite the truncated signal when the step budget has been used up
    if self.current_step >= self.max_steps:
      truncated = True
      # Update the info dict to signal that the limit was exceeded
      info['time_limit_reached'] = True
    return obs, reward, done, truncated, info

In [None]:
from gymnasium.envs.classic_control.pendulum import PendulumEnv

# PendulumEnv is instantiated directly because gym.make() would already wrap
# the environment in a TimeLimit wrapper
env = PendulumEnv()
env = CustomWrapper(env, max_steps=100)

# reset() returns an (observation, info) pair; unpacking it keeps obs from being
# the whole tuple and guarantees info is defined for the final print
obs, info = env.reset()
done, truncated = False, False
n_steps = 0
while not (done or truncated):
  # Take random actions
  random_action = env.action_space.sample()
  obs, reward, done, truncated, info = env.step(random_action)
  n_steps += 1

print(n_steps, info)

100 {'time_limit_reached': True}


### Rescaling Actions

It's a good idea to work with normalized observations and actions, i.e. values in the [-1, 1] range.

In [38]:
class CustomWrapper(gym.Wrapper):
  """
  Wrapper that exposes a normalized [-1, 1] action space and rescales every
  action back to the wrapped environment's original [low, high] bounds.

  :param env: (gym.Env) Gym environment that will be wrapped
  """
  def __init__(self, env):
    # Retrieve the action space
    action_space = env.action_space
    assert isinstance(action_space, gym.spaces.Box), "This wrapper only works with continuous action space (spaces.Box)"

    # Call the parent constructor, so we can access self.env later.
    # Zero-argument super() keeps working even if the name CustomWrapper is
    # rebound later in the notebook.
    super().__init__(env)

    # Retrieve the max/min values of the original action space
    self.low, self.high = action_space.low, action_space.high

    # Override the action space on the wrapper only (the wrapped env keeps its
    # own space untouched), so all actions will lie in [-1, 1]
    self.action_space = gym.spaces.Box(low=-1.0, high=1.0, shape=action_space.shape, dtype=np.float32)

  def rescale_action(self, scaled_action):
    """
    Rescale the action from [-1, 1] to [low, high]
    (no need for symmetric action space)

    :param scaled_action: (np.ndarray)
    :return: (np.ndarray)
    """
    return self.low + (0.5 * (scaled_action + 1.0) * (self.high - self.low))

  def reset(self, **kwargs):
    """
    Reset the environment.

    :param kwargs: keyword arguments (e.g. seed, options) forwarded unchanged
      to the wrapped environment's reset()
    :return: whatever the wrapped environment's reset() returns
    """
    return self.env.reset(**kwargs)

  def step(self, action):
    """
    Rescale the normalized action and step the wrapped environment with it.

    :param action: ([float] or int) Action taken by the agent, in [-1, 1]
    :return: (np.ndarray, float, bool, bool, dict) observation, reward,
      done flag of the wrapped env, truncated flag, additional information
    """
    # Rescale action from [-1, 1] to original [low, high] interval
    rescaled_action = self.rescale_action(action)
    # The wrapped env's step() yields five values (same shape as the
    # time-limit wrapper earlier in this notebook relies on)
    obs, reward, done, truncated, info = self.env.step(rescaled_action)
    return obs, reward, done, truncated, info


In [None]:
# One untouched Pendulum env and one wrapped by CustomWrapper, for comparison
original_env = gym.make("Pendulum-v1")
env = CustomWrapper(gym.make("Pendulum-v1"))

# Report the bounds of both action spaces
for label, space in (
  ("Original action Range: ", original_env.action_space),
  ("Rescaled action Range: ", env.action_space),
):
  print(label, space.low, space.high)

# Draw ten random actions from each space and show them side by side
for _ in range(10):
  original_sample = original_env.action_space.sample()
  rescaled_sample = env.action_space.sample()
  print(original_sample, "\t", rescaled_sample)

Original action Range:  [-2.] [2.]
Rescaled action Range:  [-1.] [1.]
[1.9145433] 	 [-0.82647973]
[-1.8798988] 	 [-0.15079382]
[0.06876159] 	 [0.795015]
[-0.10251061] 	 [0.32899192]
[1.640153] 	 [0.38537577]
[0.2384393] 	 [0.66193366]
[-1.522139] 	 [0.24981137]
[1.8707141] 	 [0.26488578]
[-0.8182553] 	 [-0.724749]
[1.6064415] 	 [-0.57995605]
Original action Range:  [-2.] [2.]
Rescaled action Range:  [-1.] [1.]


# Vec Wrappers