In [7]:
%load_ext autoreload
%autoreload 2

import os
import gym
import numpy as np
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from tqdm import tqdm
from stable_baselines3.common.vec_env import DummyVecEnv
from envs.escape_room_continuous_space_env import EscapeRoomEnv
from stable_baselines3.common.callbacks import BaseCallback, EvalCallback, CallbackList
from stable_baselines3.common.monitor import Monitor
import matplotlib.pyplot as plt

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [8]:
class CustomEnvWrapper(gym.Wrapper):
    """Adapt an env with a 2-tuple ``reset`` / 5-tuple ``step`` to the 4-tuple API.

    * ``reset`` returns only the (processed) observation; the ``info`` value
      produced by the wrapped env is discarded.
    * ``step`` returns ``(observation, reward, done, info)`` where
      ``done = terminated or truncated``.
    * Tuple observations are flattened into one array with ``np.concatenate``.

    Because ``done`` alone cannot tell a real terminal state from a time-limit
    cut-off, ``step`` also stores ``info["TimeLimit.truncated"]``. That is the
    key Stable-Baselines3 looks up to decide whether to bootstrap the value of
    the last observation instead of treating it as terminal.
    """

    def reset(self, **kwargs):
        """Reset the wrapped env and return the processed initial observation."""
        observation, _info = self.env.reset(**kwargs)
        return self._process_observation(observation)

    def step(self, action):
        """Advance the wrapped env by one step.

        Returns:
            tuple: ``(observation, reward, done, info)``. ``info`` is a shallow
            copy of the wrapped env's info with ``"TimeLimit.truncated"`` added.
        """
        observation, reward, terminated, truncated, info = self.env.step(action)
        done = terminated or truncated

        # Copy so the dict owned by the wrapped env is not mutated behind its back.
        info = dict(info) if info else {}
        # A step that is both terminated and truncated counts as a true termination.
        info["TimeLimit.truncated"] = bool(truncated and not terminated)

        return self._process_observation(observation), reward, done, info

    def _process_observation(self, observation):
        """Concatenate tuple observations into one array; pass anything else through."""
        if isinstance(observation, tuple):
            return np.concatenate(observation)
        return observation


In [9]:

# Custom Callback for collecting data
class DataCollectorCallback(BaseCallback):
    """Callback that accumulates per-step training statistics in plain lists.

    Attributes written by the callback:
        rewards: one entry per call, the mean of the training env's
            ``episode_rewards`` attribute.
        losses: the most recently logged ``train/loss`` value, appended
            whenever the logger currently holds one.
        episode_lengths: one entry per call, the mean of the training env's
            ``episode_lengths`` attribute.
    """

    def __init__(self, verbose=0):
        super().__init__(verbose)
        self.rewards = []
        self.losses = []
        self.episode_lengths = []

    def _on_step(self):
        """Record the current statistics; always returns True so training continues."""
        # NOTE(review): assumes every training env exposes `episode_rewards` and
        # `episode_lengths`. SB3's Monitor wrapper calls the former
        # `episode_returns` -- confirm which attribute EscapeRoomEnv provides.
        self.rewards.append(np.mean(self.training_env.get_attr('episode_rewards')))
        self.episode_lengths.append(np.mean(self.training_env.get_attr('episode_lengths')))

        # SB3's Logger keeps pending key/value pairs in `name_to_value` until the
        # next dump. The previous lookup (`self.logger.Logger.CURRENT...`) is the
        # stable-baselines v2 API and raises AttributeError on an SB3 Logger.
        # The value persists between dumps, so consecutive steps may record the
        # same loss.
        logged_values = self.logger.name_to_value
        if 'train/loss' in logged_values:
            self.losses.append(logged_values['train/loss'])
        return True

In [10]:
# Initialize the environment and wrap it
env = CustomEnvWrapper(EscapeRoomEnv(max_steps_per_episode=3000, goal=(550, 450), delta=15))

# Number of outer training iterations (reported as "episodes" in the output)
total_episodes = 500

# Path for the rolling checkpoint; the same file is overwritten on every save
model_path = "./tmp/ppo/ppo_escape_room_checkpoint.zip"

# Checkpoint every ~10% of the run. max(1, ...) keeps the modulo below from
# dividing by zero when total_episodes < 10.
checkpoint_interval = max(1, total_episodes // 10)

try:
    # Initialize the PPO model
    model = PPO("MlpPolicy", env, verbose=0)

    # The context manager closes the bar even if training raises or is interrupted.
    with tqdm(total=total_episodes, desc="Training Progress", leave=False) as progress_bar:
        for episode in range(total_episodes):
            # Perform a training step
            model.learn(total_timesteps=1000)  # Adjust total_timesteps as needed

            # Evaluate the policy every 10 iterations. This runs on the training
            # env itself (model.get_env()), not on a separate evaluation env.
            if episode % 10 == 0:
                mean_reward, std_reward = evaluate_policy(model, model.get_env(), n_eval_episodes=10)
                # write() prints above the bar instead of fighting it for the same line
                progress_bar.write(f"Episode: {episode}, Mean reward: {mean_reward}, Std reward: {std_reward}")

            # Save the model every 10% of the episodes, overwriting the same file
            if (episode + 1) % checkpoint_interval == 0:
                model.save(model_path)

            progress_bar.update(1)

    # Save the final model
    model.save("./tmp/ppo/ppo_escape_room_final.zip")
finally:
    # Close the environment even when training fails part-way through
    env.close()

print("Training completed and model saved.")

  logger.warn(f"Box bound precision lowered by casting to {self.dtype}")
                                                             

Episode: 0, Mean reward: -13219.462023, Std reward: 0.0



Episode: 10, Mean reward: -34.759566, Std reward: 0.0



Episode: 20, Mean reward: -159.16718399999996, Std reward: 2.842170943040401e-14



Episode: 30, Mean reward: 136.44583400000002, Std reward: 2.842170943040401e-14



Episode: 40, Mean reward: 103.189574, Std reward: 0.0



Episode: 50, Mean reward: 37.06732, Std reward: 0.0



Episode: 60, Mean reward: 46.71934900000001, Std reward: 7.105427357601002e-15



Goal 'G' reached in 483 steps with cumulative reward 11976.570104072105 for this episode.




Episode: 80, Mean reward: 45.010089, Std reward: 0.0



In [None]:
# Evaluation settings
n_eval_episodes = 20    # episodes used for the mean/std reward estimate
n_render_episodes = 5   # episodes rendered for visual inspection

# Initialize the environment and wrap it
env = CustomEnvWrapper(EscapeRoomEnv(max_steps_per_episode=3000, goal=(550, 450), delta=15))

# Everything that uses the env lives inside the try block, so the env is closed
# even if loading or evaluating the model fails.
try:
    # Load the trained model
    model_path = "./tmp/ppo/ppo_escape_room_final.zip"
    model = PPO.load(model_path, env=env)

    # Evaluate the model
    mean_reward, std_reward = evaluate_policy(
        model, env, n_eval_episodes=n_eval_episodes, deterministic=True
    )

    print(f"Evaluated model on {n_eval_episodes} episodes: Mean reward = {mean_reward}, Std reward = {std_reward}")

    # Optionally: visualize the agent's performance
    for _ in range(n_render_episodes):
        obs = env.reset()
        done = False
        while not done:
            action, _states = model.predict(obs, deterministic=True)
            obs, reward, done, info = env.step(action)
            env.render()  # Render the environment to visualize the agent's behavior
finally:
    env.close()  # Ensure the environment is closed properly

print("Evaluation and visualization completed.")

In [None]:

def plot_rewards(rewards, rolling_window=10):
    """Plot the per-episode rewards together with a trailing rolling average.

    The rolling average at episode ``i`` is the mean of the ``rolling_window``
    rewards ending at ``i``, so its curve starts at x = ``rolling_window - 1``
    and lines up with the raw reward curve. When there are fewer rewards than
    ``rolling_window`` (including none), only the raw curve is drawn.

    Args:
        rewards (list): List of rewards obtained per episode.
        rolling_window (int): Window size for the rolling average; must be >= 1.

    Raises:
        ValueError: If ``rolling_window`` is smaller than 1.
    """
    window = int(rolling_window)
    if window < 1:
        raise ValueError("rolling_window must be a positive integer")

    reward_values = np.asarray(rewards, dtype=float)

    _, ax = plt.subplots(figsize=(10, 5))
    ax.plot(reward_values, label='Reward per Episode')

    # np.convolve(mode='valid') silently swaps its operands when the kernel is
    # longer than the data, which would draw a meaningless curve -- skip it.
    if reward_values.size >= window:
        rolling_mean = np.convolve(reward_values, np.ones(window) / window, mode='valid')
        # 'valid' output k averages episodes k .. k+window-1; anchor it at the
        # last episode of each window so both curves share the same x-axis.
        episode_index = np.arange(window - 1, reward_values.size)
        ax.plot(episode_index, rolling_mean,
                label=f'{rolling_window}-Episode Rolling Average')

    ax.set_title('Rewards Trend')
    ax.set_xlabel('Episodes')
    ax.set_ylabel('Reward')
    ax.legend()
    ax.grid(True)
    plt.show()


In [None]:
def plot_losses(losses):
    """Draw the recorded training losses as a single line chart.

    Args:
        losses (list): Loss values collected during training, in the order
            they were recorded.
    """
    _, axes = plt.subplots(figsize=(10, 5))

    axes.plot(losses, label='Training Loss per Step')

    # Titles and axis labels so the figure stands on its own.
    axes.set_title('Loss Trend')
    axes.set_xlabel('Training Steps')
    axes.set_ylabel('Loss')

    axes.legend()
    axes.grid(True)

    plt.show()


In [None]:
def plot_episode_lengths(episode_lengths):
    """Draw how long each episode lasted as a single line chart.

    Args:
        episode_lengths (list): Episode lengths, one entry per episode in
            chronological order.
    """
    _, axes = plt.subplots(figsize=(10, 5))

    axes.plot(episode_lengths, label='Episode Length')

    # Titles and axis labels so the figure stands on its own.
    axes.set_title('Episode Length Trend')
    axes.set_xlabel('Episodes')
    axes.set_ylabel('Length')

    axes.legend()
    axes.grid(True)

    plt.show()
