In [None]:
from __future__ import annotations

from time import time

import gymnasium as gym
import numpy as np
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env

from main import wrap_env, SimpleEnv, eval_agent, uniform_distribution

In [None]:
# Constructor settings shared by both evaluation environments; only the goal
# placement differs between them.
_eval_env_kwargs = dict(size=5, agent_start_pos=None, render_mode='rgb_array')

# General evaluation environment: no fixed goal is passed (goal_pos=None).
# NOTE(review): presumably SimpleEnv then picks a goal per episode, as the
# variable name suggests -- confirm in main.SimpleEnv.
random_goal_env = wrap_env(SimpleEnv(goal_pos=None, **_eval_env_kwargs))

# "Bottom right" evaluation environment: the goal is pinned to (-2, -2).
# NOTE(review): presumably negative coordinates count from the far edge of the
# grid -- confirm in main.SimpleEnv.
br_env = wrap_env(SimpleEnv(goal_pos=(-2, -2), **_eval_env_kwargs))

# Both environments go through the same wrap_env call, so only one is inspected.
print("Action space:", random_goal_env.action_space)
print("Observation space:", random_goal_env.observation_space)

In [None]:
# print("Policy:", model.policy)
# print("Model size:", sum(p.numel() for p in model.policy.parameters()))

In [None]:
def get_agent(
    bottom_right_odds: int,
    steps: int = 50_000,
    general_env: gym.Env = random_goal_env,
    base_env: gym.Env = br_env,
    seed: int | None = None,
):
    """Train a PPO agent on a goal-biased SimpleEnv, evaluate it, save it and return it.

    The training environment draws its goal from ``goal_distrib``, a
    distribution built by ``uniform_distribution((4, 4))`` whose ``[3, 3]``
    entry is overwritten with ``bottom_right_odds * 8``.

    Args:
        bottom_right_odds: Multiplier for the weight of the ``[3, 3]`` goal
            cell; the weight stored is ``bottom_right_odds * 8``.
        steps: Total number of environment timesteps passed to ``PPO.learn``.
        general_env: Environment used for the general success-rate evaluation.
            Note that the default is bound when this function is defined, so
            re-creating ``random_goal_env`` later does not change the default.
        base_env: Environment used for the "bottom right" success-rate
            evaluation. Same default-binding caveat as ``general_env``.
        seed: Optional seed forwarded to ``PPO``. ``None`` (the default) keeps
            the previous unseeded behaviour. Whether runs become fully
            reproducible also depends on how SimpleEnv draws its randomness,
            and the evaluation environments are not seeded here.

    Returns:
        The trained ``PPO`` model (also saved to disk under ``agents/``).
    """
    n_envs = 1

    # Build the goal distribution for training.
    goal_distrib = uniform_distribution((4, 4))
    # There are 8 other positions, so odds are 8:br_odds*8 <=> 1:br_odds
    # NOTE(review): this assumes every other goal cell has weight 1 in the
    # array returned by uniform_distribution -- confirm in main.
    goal_distrib[3, 3] = bottom_right_odds * 8

    def make_training_env():
        # No render_mode here: training does not need rendered frames.
        return wrap_env(SimpleEnv(
            size=5,
            goal_pos=goal_distrib,
            agent_start_pos=None,
        ))

    env = make_vec_env(make_training_env, n_envs=n_envs)

    # `model` is the PPO algorithm object; the actual network is `model.policy`.
    model = PPO(
        "MlpPolicy",
        env,
        verbose=1,
        # SB3 calls the schedule with the remaining training progress (1 -> 0),
        # so the learning rate decays linearly from 0.01 to 0.
        learning_rate=lambda f: 0.01 * f,
        policy_kwargs=dict(net_arch=[30, 10]),
        # Keep the total rollout size at 2000 transitions regardless of n_envs.
        n_steps=2000 // n_envs,
        batch_size=100,
        n_epochs=40,
        # Undiscounted returns.
        gamma=1,
        tensorboard_log="run_logs",
        device="cpu",
        seed=seed,
    )

    # Train the agent
    model.learn(total_timesteps=steps)

    # Evaluate the agent on both the bottom-right-only and the general task.
    # NOTE(review): the third argument is presumably the number of evaluation
    # episodes and the result a success fraction in [0, 1] -- confirm in main.
    br_success_rate = eval_agent(model, base_env, 1000)
    success_rate = eval_agent(model, general_env, 1000)
    print("Bottom right success rate:", br_success_rate)
    print("Success rate:", success_rate)

    # Save the agent; the file name records steps, both success rates
    # (multiplied by 1000) and the odds used for training.
    name = f"agents/ppo_{steps}steps_{success_rate*1000:03.0f}gen_{br_success_rate*1000:03.0f}br_{bottom_right_odds}odds"
    model.save(name)
    print(f"Saved model to {name}")

    return model

In [None]:
# Number of agents to train when this cell runs.
N_RUNS = 1
# Odds used for every run. Set to None to draw fresh odds uniformly from
# 1..9 (np.random.randint's upper bound is exclusive) for each run instead.
FIXED_ODDS = 2

for i in range(N_RUNS):
    # Only touch the random generator when no fixed value is requested, rather
    # than drawing a value and immediately overwriting it.
    odds = FIXED_ODDS if FIXED_ODDS is not None else np.random.randint(1, 10)
    get_agent(odds, 50_000)