In [None]:
%load_ext autoreload
%autoreload 2

import os
import gym
import numpy as np
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from tqdm.auto import tqdm
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.vec_env import DummyVecEnv
from envs.escape_room_continuous_space_env import EscapeRoomEnv
import imageio
from IPython.display import Image, display

In [None]:
class CustomEnvWrapper(gym.Wrapper):
    """Adapt an env with ``(obs, info)`` resets and 5-tuple steps to the 4-tuple ``done`` API.

    * ``reset`` returns only the (processed) observation; the ``info`` value
      produced by the wrapped env is discarded.
    * ``step`` returns ``(observation, reward, done, info)`` where
      ``done = terminated or truncated``.
    * Tuple observations are flattened with ``np.concatenate``; any other
      observation is passed through unchanged.
    """

    def __init__(self, env):
        super().__init__(env)

    def reset(self, **kwargs):
        """Reset the wrapped env and return just the processed observation."""
        observation, _info = self.env.reset(**kwargs)
        return self._process_observation(observation)

    def step(self, action):
        """Step the wrapped env and fold ``terminated``/``truncated`` into ``done``.

        When the episode ends, ``info["TimeLimit.truncated"]`` is set to
        ``True`` only if the episode was truncated without also terminating.
        That is the conventional key used by Gym's step-API compatibility
        layer, and Stable-Baselines3 consults it to tell a time-limit cut-off
        apart from a true terminal state. Without it the distinction would be
        lost once the two flags are merged into ``done``.
        """
        observation, reward, terminated, truncated, info = self.env.step(action)
        done = terminated or truncated
        if done:
            # Work on a copy so the wrapped env's own dict is never mutated.
            info = dict(info or {})
            info["TimeLimit.truncated"] = bool(truncated and not terminated)
        return self._process_observation(observation), reward, done, info

    def _process_observation(self, observation):
        """Concatenate tuple observations into one array; return others as-is."""
        if isinstance(observation, tuple):
            return np.concatenate(observation)
        return observation

In [None]:
from stable_baselines3.common.callbacks import BaseCallback

class LossAndRewardLoggingCallback(BaseCallback):
    """Collect sampled step rewards and training losses during ``model.learn``.

    Attributes:
        check_freq: a reward sample is taken every ``check_freq`` callback calls.
        rewards: list of the ``rewards`` values sampled in ``_on_step``.
        losses: list of loss values (see ``_record_logged_loss``).

    Args:
        check_freq: sampling period, in callback calls, for the reward log.
        verbose: verbosity level forwarded to ``BaseCallback``.
    """

    # Key under which the algorithm records its training loss in the logger.
    # NOTE(review): "train/loss" is the key PPO uses; other algorithms may
    # record their losses under different keys - confirm before reusing.
    _LOSS_KEY = "train/loss"

    def __init__(self, check_freq, verbose=1):
        super().__init__(verbose)
        self.check_freq = check_freq
        self.rewards = []
        self.losses = []
        # Last value taken from the logger, used to avoid recording the same
        # logged loss twice when no training update happened in between.
        self._last_logged_loss = None

    def _on_step(self):
        """Sample the current step rewards every ``check_freq`` calls.

        Returns:
            Always ``True`` so that training continues.
        """
        if self.n_calls % self.check_freq == 0:
            # Copy so the stored sample cannot be altered by later in-place
            # changes to the array held in ``self.locals``.
            self.rewards.append(np.copy(self.locals['rewards']))
            # Kept for backward compatibility: during rollout collection the
            # locals normally do not contain a 'loss' entry, which is why the
            # loss is additionally read from the logger in the hooks below.
            if 'loss' in self.locals:
                self.losses.append(self.locals['loss'])
        return True

    def _on_rollout_end(self):
        """Pick up the loss logged by the training update before this rollout."""
        self._record_logged_loss()

    def _on_training_end(self):
        """Pick up the loss logged by the final training update of ``learn``."""
        self._record_logged_loss()

    def _record_logged_loss(self):
        """Append the most recently logged training loss, if there is a new one."""
        logger = getattr(getattr(self, "model", None), "logger", None)
        logged_values = getattr(logger, "name_to_value", None)
        # Membership test (not indexing) so no key is created in the logger.
        if not logged_values or self._LOSS_KEY not in logged_values:
            return
        loss = logged_values[self._LOSS_KEY]
        if loss == self._last_logged_loss:
            # Same value still sitting in the logger: nothing new to record.
            return
        self._last_logged_loss = loss
        self.losses.append(loss)


In [None]:
# Initialize the environment and wrap it
env = CustomEnvWrapper(EscapeRoomEnv(max_steps_per_episode=3000, goal=(550, 350), delta=15))

# Initialize the PPO model
model = PPO("MlpPolicy", env, verbose=0)

# Setup the callback for logging losses and rewards
callback = LossAndRewardLoggingCallback(check_freq=1000)  # Frequency to collect data

# Output locations. The directory is created *before* training so that the
# model checkpoint and the .npy logs are both written into an existing folder
# (previously the directory was only created after the model had been saved).
data_dir = "./tmp/ppo/"
model_path = os.path.join(data_dir, "ppo_escape_room_final.zip")
os.makedirs(data_dir, exist_ok=True)

# Training schedule
total_episodes = 500        # number of model.learn() calls (one progress-bar tick each)
timesteps_per_call = 1000   # total_timesteps passed to every model.learn() call
eval_every = 10             # evaluate the policy every `eval_every` calls
n_eval_episodes = 10        # episodes per evaluation

# NOTE(review): learn() is called with its default arguments each time, so
# every call presumably restarts the timestep counter, and PPO's default
# rollout length may exceed `timesteps_per_call` - confirm this is intended.
with tqdm(total=total_episodes, desc="Training Progress", leave=True) as progress_bar:
    for episode in range(total_episodes):
        model.learn(total_timesteps=timesteps_per_call, callback=callback)

        # Update progress bar with current episode count
        progress_bar.update(1)

        # Periodically evaluate the policy on the model's own (vectorized) env
        if episode % eval_every == 0:
            mean_reward, std_reward = evaluate_policy(
                model, model.get_env(), n_eval_episodes=n_eval_episodes
            )
            print(f"Episode: {episode}, Mean reward: {mean_reward:.2f}, Std reward: {std_reward:.2f}")

# Save the final model
model.save(model_path)

# Close the environment
env.close()

# Save collected loss and reward data for later analysis or plotting
np.save(os.path.join(data_dir, "losses.npy"), np.array(callback.losses))
np.save(os.path.join(data_dir, "rewards.npy"), np.array(callback.rewards))

print("Training completed and model saved.")

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import os

# Locate the arrays written by the training cell
data_dir = "./tmp/ppo/"
losses_path = os.path.join(data_dir, "losses.npy")
rewards_path = os.path.join(data_dir, "rewards.npy")

# A missing file falls back to an empty array so the figure can still be drawn
if os.path.exists(losses_path):
    losses = np.load(losses_path, allow_pickle=True)
else:
    losses = np.array([])

if os.path.exists(rewards_path):
    rewards = np.load(rewards_path, allow_pickle=True)
else:
    rewards = np.array([])

# 1-based sample indices used as x coordinates
episode_indices_rewards = np.arange(1, len(rewards) + 1)
episode_indices_losses = np.arange(1, len(losses) + 1)

# One row per series: rewards on top, losses below
fig, axs = plt.subplots(2, 1, figsize=(12, 10))

# (axis, x values, y values, title, series name, line colour, placeholder text)
panels = (
    (axs[0], episode_indices_rewards, rewards,
     'Rewards Over Time', 'Rewards', 'blue', 'No reward data available'),
    (axs[1], episode_indices_losses, losses,
     'Losses Over Time', 'Losses', 'red', 'No loss data available'),
)

for ax, x_values, y_values, title, series_name, colour, placeholder in panels:
    has_data = y_values.size > 0
    if has_data:
        ax.plot(x_values, y_values, label=series_name, color=colour)
    else:
        # Centre a notice in the otherwise empty axes
        ax.text(0.5, 0.5, placeholder, horizontalalignment='center', verticalalignment='center', transform=ax.transAxes)
    ax.set_title(title)
    ax.set_xlabel('Episode')
    ax.set_ylabel(series_name)
    ax.grid(True)
    if has_data:
        # A legend is only drawn when a labelled line exists
        ax.legend()

plt.tight_layout()
plt.show()


In [None]:
def test_render(env):
    env.reset()
    frame = env.render(mode='rgb_array')
    if frame is None:
        print("Render mode 'rgb_array' is not supported.")
    else:
        print("Render mode 'rgb_array' works correctly.")
        plt.imshow(frame)
        plt.show()

# NOTE(review): `env` here is the instance on which the training cell already
# called env.close(); confirm it can still be reset and rendered at this point.
test_render(env) 

In [None]:
def display_gif(path):
    """Show the image file at ``path`` inline in the notebook.

    The file is read in binary mode and its bytes are wrapped in an IPython
    ``Image`` that is passed to ``display``.

    Args:
        path: location of the image file to read.
    """
    with open(path, 'rb') as gif_file:
        raw_bytes = gif_file.read()
    display(Image(raw_bytes))

In [None]:
# Initialize the environment and wrap it
import imageio


# NOTE(review): the goal here is (550, 450) while the training cell uses
# (550, 350) - confirm that evaluating on a different goal is intentional.
env = CustomEnvWrapper(EscapeRoomEnv(max_steps_per_episode=3000, goal=(550, 450), delta=15))

# Load the trained model
model_path = "./tmp/ppo/ppo_escape_room_final.zip"
model = PPO.load(model_path, env=env)

# Evaluate the model
n_eval_episodes = 20
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=n_eval_episodes, deterministic=True)
print(f"Evaluated model on {n_eval_episodes} episodes: Mean reward = {mean_reward:.2f}, Std reward = {std_reward:.2f}")

# Generate GIF of agent's performance
frames = []  # List to store frames
n_episodes = 5  # Number of episodes to simulate for GIF
max_steps_per_episode = 300  # Per-episode cap on recorded steps; adjust as needed
gif_path = "./ppo_simulation.gif"  # Path to save the GIF

# Collect frames for the GIF
try:
    for _ in range(n_episodes):
        obs = env.reset()
        done = False
        steps = 0
        while not done and steps < max_steps_per_episode:
            frame = env.render(mode='rgb_array')  # Capture the frame
            if frame is not None:
                frames.append(frame)  # Append the frame to the list
            action, _states = model.predict(obs, deterministic=True)
            obs, reward, done, info = env.step(action)
            steps += 1
finally:
    env.close()  # Ensure the environment is closed properly
    # Still write whatever was captured (even after an error), but never call
    # mimsave with an empty list: that would raise here and could hide the
    # original exception.
    if frames:
        imageio.mimsave(gif_path, frames, fps=30)  # Save as a GIF

print("Evaluation and visualization completed.")
if frames:
    display_gif(gif_path)
else:
    # Nothing was rendered, so no GIF was written and there is nothing to show.
    print("No frames were captured; skipping GIF export and display.")