In [None]:
import numpy as np
import time

import gymnasium as gym
import stable_baselines3 as sb

from stable_baselines3.common.evaluation import evaluate_policy

In [42]:
# Evaluate the agent's performance.
# `deterministic` switches off the stochastic part of the policy that drives
# exploration; the model does not need to explore while it is being evaluated.
def evaluate(model, num_episodes=100, deterministic = True):
    """
    Evaluate a RL agent by running complete episodes on its own environment.

    :param model: (BaseRLModel object) the RL Agent; must expose ``get_env()``
        and ``predict(obs, deterministic=...)``
    :param num_episodes: (int) number of episodes to evaluate it
    :param deterministic: (bool) forwarded unchanged to ``model.predict``
    :return: (float) Mean total reward over the ``num_episodes`` episodes
    :raises ValueError: if the model's environment reports more than one
        sub-environment
    """
    env = model.get_env()

    # The episode loop below tracks a single `done` flag, so it can only
    # follow one environment. Fail up front with a readable message instead of
    # NumPy's "truth value of an array is ambiguous" error.
    num_envs = getattr(env, "num_envs", 1)
    if num_envs != 1:
        raise ValueError(
            "evaluate() only supports a single environment, "
            f"but the model's env has {num_envs}"
        )

    all_episode_rewards = []
    for _ in range(num_episodes):
        episode_reward = 0.0
        done = False
        obs = env.reset()
        while not done:
            # _states are only useful when using LSTM policies
            action, _states = model.predict(obs, deterministic = deterministic)
            # The env is vectorized, so rewards and dones come back as arrays;
            # take the single entry as a plain scalar so the totals (and the
            # returned mean) are built-in floats rather than 1-element arrays.
            obs, rewards, dones, _info = env.step(action)
            episode_reward += float(np.ravel(rewards)[0])
            done = bool(np.ravel(dones)[0])

        all_episode_rewards.append(episode_reward)

    mean_episode_reward = float(np.mean(all_episode_rewards))
    print("Mean reward:", mean_episode_reward, "Num episodes:", num_episodes)

    return mean_episode_reward

# Render the model acting in its environment to watch its learning progress
def show_progress(model, time_steps = 1000, deterministic = True):
    """
    Step the model's environment a fixed number of times, rendering each step.

    :param model: agent exposing ``get_env()`` and
        ``predict(obs, deterministic=...)``
    :param time_steps: (int) number of environment steps to run and render
    :param deterministic: (bool) forwarded unchanged to ``model.predict``
    :return: None
    """
    vec_env = model.get_env()
    observation = vec_env.reset()
    for _ in range(time_steps):
        action, _states = model.predict(observation, deterministic = deterministic)
        # Only the next observation is needed; the other step outputs are unused.
        observation, _rewards, _dones, _infos = vec_env.step(action)
        vec_env.render("human")

# PPO

In [None]:
# Alternative environment:
# env = gym.make("Pendulum-v1", render_mode="rgb_array")
env = gym.make("CartPole-v1", render_mode="rgb_array")

model = sb.PPO("MlpPolicy", env, verbose=0)

# Watch the untrained agent act
show_progress(model)

# Baseline score from the hand-written evaluation helper
mean_reward_before_train = evaluate(model, num_episodes=100)
# Same measurement with the evaluator built into stable-baselines3
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=100)
print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")

In [None]:
# Train the agent for 10000 steps
model.learn(total_timesteps=10000)

# Watch the trained agent act
show_progress(model)

# Score from the hand-written evaluation helper. It is stored under its own
# name so the pre-training baseline kept in `mean_reward_before_train` is not
# overwritten by a post-training result.
mean_reward_after_train = evaluate(model, num_episodes=100)
# Same measurement with the evaluator built into stable-baselines3
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=100)
print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")

# DQN

In [None]:
# SB3 ships vanilla DQN by default.

# DQN's MlpPolicy is not the same one PPO uses, but stable-baselines picks the
# right one automatically when the policy is passed as a string.
dqn_model = sb.DQN("MlpPolicy", "CartPole-v1", verbose=0)

# Score of the random agent, before any training
mean_reward_before_train = evaluate(dqn_model, num_episodes=100)

In [None]:
for i in range(10):
    # Train the agent for another 100000 steps
    # NOTE(review): learn() runs with its default `reset_num_timesteps`;
    # confirm whether the timestep counter should carry over between rounds
    # (reset_num_timesteps=False) for this repeated-training loop.
    dqn_model.learn(total_timesteps=100000, progress_bar = True)
    show_progress(dqn_model)
    # Evaluate the trained agent on the DQN model's own environment. The
    # global `env` belongs to the PPO cell and may be switched to a different
    # task there, which would silently score DQN on the wrong environment.
    mean_reward, std_reward = evaluate_policy(dqn_model, dqn_model.get_env(), n_eval_episodes=100)
    print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")