# Notebook permettant d'entraîner nos agents RL

In [None]:
# Librairies
import os
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import SubprocVecEnv
from gym_env_v2 import LabyrinthEnv
from callbacks import SaveModelCallback, ExplorationExploitationCallback
from stable_baselines3.common.callbacks import EvalCallback, CallbackList
from stable_baselines3.common.monitor import Monitor
import numpy as np

In [2]:
# Output locations used by the rest of the notebook: one folder for logs,
# one for saved models.
log_dir = "./logs/"
model_dir = "./modeles/"

# Make sure both folders exist; exist_ok=True keeps this cell safe to re-run.
for output_dir in (log_dir, model_dir):
    os.makedirs(output_dir, exist_ok=True)

# Create vectorized environment
def make_env(rank):
    """Build a zero-argument factory for one monitored labyrinth environment.

    Nothing is instantiated when ``make_env`` itself is called; the environment
    is only created when the returned callable is invoked.

    Args:
        rank: Identifier of the environment. It is used to name the Monitor
            log target (``agent_<rank>`` appended to ``log_dir``).

    Returns:
        A callable that creates a ``LabyrinthEnv`` wrapped in ``Monitor``.
    """
    def _build():
        base_env = LabyrinthEnv()
        # Each environment gets its own Monitor log target, derived from its rank.
        return Monitor(base_env, log_dir + f"agent_{rank}")
    return _build

# Build one environment factory per rank (4 in total) and hand them to
# SubprocVecEnv so the environments run in parallel.
num_envs = 4
env_factories = list(map(make_env, range(num_envs)))
env = SubprocVecEnv(env_factories)

In [3]:
# Create evaluation environment.
# Monitor resolves a target that does not end in "monitor.csv" differently
# depending on whether it is an existing directory. Creating the directory
# first makes the log always land in <eval_log_dir>/monitor.csv, instead of a
# hidden ".monitor.csv" on the very first run.
eval_log_dir = os.path.join(log_dir, "eval")
os.makedirs(eval_log_dir, exist_ok=True)
eval_env = Monitor(LabyrinthEnv(), eval_log_dir)

# Callbacks
eval_callback = EvalCallback(
    eval_env,
    # os.path.join avoids the doubled separator of f"{model_dir}/best/".
    best_model_save_path=os.path.join(model_dir, "best"),
    log_path=log_dir,
    # NOTE(review): eval_freq is counted in callback calls, not timesteps.
    # With several parallel training envs each call advances that many
    # timesteps, so evaluation happens every eval_freq * n_envs timesteps.
    # Use max(2500 // num_envs, 1) if "every 2500 timesteps" is intended.
    eval_freq=2500,
    deterministic=True,
    render=False
)

exploration_callback = ExplorationExploitationCallback()
checkpoint_callback = SaveModelCallback(save_freq=2500, save_path=model_dir)

# Combine all callbacks so they can be passed to learn() as a single object.
callback = CallbackList([
    eval_callback,
    exploration_callback,
    checkpoint_callback
])

In [4]:
# PPO hyperparameters, grouped so they can be read and tuned in one place.
ppo_hyperparams = dict(
    learning_rate=3e-4,
    n_steps=2048,
    batch_size=64,
    n_epochs=10,
    gamma=0.99,
    gae_lambda=0.95,
    clip_range=0.2,
)

# Create the model. "MultiInputPolicy" is the policy for Dict observation spaces.
model = PPO(
    "MultiInputPolicy",
    env,
    verbose=1,
    tensorboard_log=log_dir,
    **ppo_hyperparams,
)

# Train the model, reporting to the combined callbacks and showing a progress bar.
model.learn(total_timesteps=150_000, callback=callback, progress_bar=True)

Using cpu device
Logging to ./logs/PPO_1




EOFError: 

In [None]:
# Test the trained model: roll out the deterministic policy for 1000 steps in a
# human-rendered environment, resetting whenever an episode ends.
test_env = LabyrinthEnv(render_mode="human")
try:
    obs, _ = test_env.reset()

    for _step in range(1000):
        action, _states = model.predict(obs, deterministic=True)
        obs, reward, terminated, truncated, info = test_env.step(action)
        test_env.render()

        # Start a new episode as soon as the current one terminates or is truncated.
        if terminated or truncated:
            obs, _ = test_env.reset()
finally:
    # Always release the environment, even if prediction, stepping or
    # rendering raises part-way through the rollout.
    test_env.close()

In [None]:
# Save the final model.
# os.path.join avoids the doubled separator that f"{model_dir}/..." produces
# when model_dir already ends with "/", and also works if it does not.
model.save(os.path.join(model_dir, "labyrinth_final_model"))