In [1]:
%load_ext autoreload
%autoreload 2

from stable_baselines3 import TD3
from stable_baselines3.td3.policies import MlpPolicy
from stable_baselines3.common.noise import NormalActionNoise
from stable_baselines3.common.evaluation import evaluate_policy
import numpy as np
import gym
from tqdm.auto import tqdm
from envs.escape_room_continuous_space_env import EscapeRoomEnv


In [2]:
class CustomEnvWrapper(gym.Wrapper):
    """Expose a five-value-``step`` environment through the four-value step API.

    The wrapped environment is expected to return ``(observation, info)`` from
    ``reset`` and ``(observation, reward, terminated, truncated, info)`` from
    ``step``. This wrapper returns only the observation from ``reset`` and
    ``(observation, reward, done, info)`` from ``step``, and flattens tuple
    observations into a single array.
    """

    def __init__(self, env):
        """Wrap ``env``; no additional state is kept."""
        super().__init__(env)

    def reset(self, **kwargs):
        """Reset the wrapped environment and return only the processed observation."""
        observation, _info = self.env.reset(**kwargs)
        return self._process_observation(observation)

    def step(self, action):
        """Step the wrapped environment and merge ``terminated``/``truncated`` into ``done``.

        Returns:
            ``(observation, reward, done, info)`` where ``done`` is a plain
            ``bool`` and ``info`` is a copy of the environment's info dict with
            a ``"TimeLimit.truncated"`` entry.
        """
        observation, reward, terminated, truncated, info = self.env.step(action)
        done = bool(terminated or truncated)

        # Merging the two flags would otherwise hide *why* the episode ended.
        # Stable-Baselines3 looks for "TimeLimit.truncated" in `info` so that
        # off-policy algorithms do not treat a time-limit cut-off as a true
        # terminal state. Work on a copy so the environment's own dict is not
        # mutated, and do not override a value the environment already set.
        info = dict(info) if info else {}
        info.setdefault("TimeLimit.truncated", bool(truncated) and not bool(terminated))

        return self._process_observation(observation), reward, done, info

    def _process_observation(self, observation):
        """Concatenate tuple observations into one array; pass anything else through."""
        if isinstance(observation, tuple):
            return np.concatenate(observation)
        return observation


In [3]:
# Train a TD3 agent on the escape-room environment, evaluating periodically
# and checkpointing along the way.

# --- Settings -----------------------------------------------------------
# Keyword arguments shared by the training and the evaluation environment.
env_kwargs = dict(max_steps_per_episode=3000, goal=(550, 450), delta=15)

total_episodes = 500            # number of learn() calls (training iterations)
timesteps_per_iteration = 1000  # environment steps requested per learn() call
eval_every = 10                 # evaluate every N iterations
n_eval_episodes = 10            # episodes per evaluation

# Checkpoint roughly every 10% of the run; max(1, ...) avoids a modulo by
# zero when total_episodes is smaller than 10.
checkpoint_every = max(1, total_episodes // 10)

# Path for saving the model (checkpoints overwrite the same file)
model_path = "./tmp/td3/td3_escape_room_checkpoint.zip"
final_model_path = "./tmp/td3/td3_escape_room_final.zip"

# --- Environments -------------------------------------------------------
# Initialize the environment and wrap it
env = CustomEnvWrapper(EscapeRoomEnv(**env_kwargs))
# Evaluate on a separate instance so that evaluation rollouts never disturb
# the state of the environment the model is collecting training data from.
eval_env = CustomEnvWrapper(EscapeRoomEnv(**env_kwargs))

# --- Model --------------------------------------------------------------
# Set up action noise for exploration
n_actions = env.action_space.shape[-1]
action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))

# Initialize the TD3 model
model = TD3("MlpPolicy", env, action_noise=action_noise, verbose=0)

# --- Training loop ------------------------------------------------------
progress_bar = tqdm(total=total_episodes, desc="Training Progress", leave=False)
try:
    for episode in range(total_episodes):
        # reset_num_timesteps=False continues the same run across calls.
        # With the default (True) every call would zero the timestep counter,
        # which re-triggers the random-action warm-up (`learning_starts`)
        # at the start of each iteration.
        model.learn(total_timesteps=timesteps_per_iteration, reset_num_timesteps=False)

        # Periodic evaluation; the latest result is shown on the progress bar
        # instead of being printed with a trailing '\r', which left a partly
        # overwritten line in the cell output.
        if episode % eval_every == 0:
            mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=n_eval_episodes)
            progress_bar.set_postfix_str(
                f"Episode: {episode}, Mean reward: {mean_reward}, Std reward: {std_reward}"
            )

        # Save a checkpoint, overwriting the same file
        if (episode + 1) % checkpoint_every == 0:
            model.save(model_path)

        # Update progress bar
        progress_bar.update(1)

    # Save the final model
    model.save(final_model_path)
finally:
    # Release the progress bar and both environments even if training fails.
    progress_bar.close()
    eval_env.close()
    env.close()

print("Training completed and model saved.")

  logger.warn(f"Box bound precision lowered by casting to {self.dtype}")


Training Progress:   0%|          | 0/10 [00:00<?, ?it/s]

Training completed and model saved., Std reward: 0.0


In [7]:
# Load the final TD3 model, report its mean reward, then render a few episodes.

n_eval_episodes = 20   # episodes used for the reported mean/std reward
n_render_episodes = 5  # episodes rendered for visual inspection

# Initialize the environment and wrap it
env = CustomEnvWrapper(EscapeRoomEnv(max_steps_per_episode=3000, goal=(550, 450), delta=15))

# Everything that uses the environment lives inside the try block so the
# environment is closed even if loading or evaluation raises.
try:
    # Load the trained model
    model_path = "./tmp/td3/td3_escape_room_final.zip"
    model = TD3.load(model_path, env=env)

    # Evaluate the model
    mean_reward, std_reward = evaluate_policy(
        model, env, n_eval_episodes=n_eval_episodes, deterministic=True
    )
    print(f"Evaluated model on {n_eval_episodes} episodes: Mean reward = {mean_reward}, Std reward = {std_reward}")

    # Optionally: Visualize the agent's performance
    for _ in range(n_render_episodes):
        obs = env.reset()
        done = False
        while not done:
            action, _states = model.predict(obs, deterministic=True)
            obs, reward, done, info = env.step(action)
            env.render()  # Render the environment to visualize the agent's behavior
finally:
    env.close()  # Ensure the environment is closed properly

print("Evaluation and visualization completed.")

Evaluated model on 20 episodes: Mean reward = -70.78408718109131, Std reward = 0.0
Evaluation and visualization completed.
