# Import

In [4]:
import gymnasium as gym
import kidpuzzles

# Smoke test: open the puzzle in a visible window and drive it with a few
# random actions to confirm the environment is registered and steps cleanly.
env = gym.make('kidpuzzles/DigitsPuzzleEnv-v0', render_mode = 'human')

try:
    # Gymnasium's reset() returns an (observation, info) pair, not a bare
    # observation, so unpack both.
    observations, info = env.reset()

    for _ in range(3):
        action = env.action_space.sample()
        observations, reward, terminated, truncated, info = env.step(action)
        env.render()

        # An episode is over when either flag is set (natural end or time
        # limit); the environment must be reset before it is stepped again.
        if terminated or truncated:
            observations, info = env.reset()
finally:
    # Close the environment even if a step raises, so the render window
    # does not stay open.
    env.close()

2025-02-21 04:46:59.950 python[5316:191207] +[IMKClient subclass]: chose IMKClient_Modern
2025-02-21 04:46:59.950 python[5316:191207] +[IMKInputSession subclass]: chose IMKInputSession_Modern


In [1]:
import sys 
import math
import gymnasium as gym
from gymnasium.wrappers.record_video import RecordVideo
from gymnasium.wrappers.record_episode_statistics import RecordEpisodeStatistics
import kidpuzzles
from stable_baselines3 import A2C

# Training
## Learning rate schedule

In [2]:
def linear_scheduler(initial_value):
    """Build a learning-rate schedule proportional to the remaining progress.

    Args:
        initial_value: Value returned when ``progress_remaining`` is 1.

    Returns:
        A one-argument callable mapping ``progress_remaining`` to
        ``initial_value * progress_remaining``.
    """
    def schedule(progress_remaining):
        # Presumably progress_remaining runs from 1 (start) to 0 (end), as in
        # Stable-Baselines3 schedules; the product is linear in it either way.
        current_value = initial_value * progress_remaining
        return current_value

    return schedule

def step_scheduler(initial_value, drop_interval, drop_factor):
    """Build a piecewise-constant schedule that decays in discrete steps.

    Args:
        initial_value: Value returned before any drop has happened.
        drop_interval: Amount of consumed progress (``1 - progress_remaining``)
            between two consecutive drops.
        drop_factor: Multiplier applied once per completed drop interval.

    Returns:
        A one-argument callable mapping ``progress_remaining`` to
        ``initial_value * drop_factor ** k``, where ``k`` is the number of
        whole drop intervals contained in the consumed progress.
    """
    def schedule(progress_remaining):
        progress_done = 1 - progress_remaining
        # int() truncates, so only fully completed intervals count as drops.
        completed_drops = int(progress_done / drop_interval)
        return initial_value * (drop_factor ** completed_drops)

    return schedule

def cosine_annealing_scheduler(initial_value, T_max, total_timesteps=None):
    """Build a cosine-annealing schedule that decays from ``initial_value`` to 0.

    The returned callable receives ``progress_remaining``, a fraction that
    goes from 1 at the start of training to 0 at the end (the
    Stable-Baselines3 schedule convention). It is converted to an elapsed
    step count before the cosine formula is applied. Dividing the fraction
    by a step count directly would keep the cosine argument near zero and
    leave the rate effectively constant.

    Args:
        initial_value: Value returned at the start of training.
        T_max: Number of timesteps over which the value is annealed to 0.
        total_timesteps: Length of the whole training run in timesteps.
            Defaults to ``T_max``, i.e. annealing spans the entire run. If it
            is larger than ``T_max``, the value stays at 0 after ``T_max``
            steps have elapsed.

    Returns:
        A one-argument callable mapping ``progress_remaining`` to
        ``initial_value * (1 + cos(pi * t / T_max)) / 2``, where ``t`` is the
        elapsed step count clamped to ``[0, T_max]``.

    Raises:
        ValueError: If ``T_max`` is not positive.
    """
    if T_max <= 0:
        raise ValueError("T_max must be positive")

    run_length = T_max if total_timesteps is None else total_timesteps

    def func(progress_remaining):
        elapsed_steps = (1.0 - progress_remaining) * run_length
        # Clamp so values outside the annealing window hold the end points
        # instead of letting the cosine wrap around and rise again.
        annealed_fraction = min(max(elapsed_steps / T_max, 0.0), 1.0)
        return initial_value * (1 + math.cos(math.pi * annealed_fraction)) / 2

    return func

## Env

In [3]:

# Puzzle size passed to the environment (presumably the number of digit pieces).
n_digits = 10

# Training-run hyperparameters.
total_timesteps = 5_000_000
initial_lr = 7e-4
record_freq = 200  # the training cell records every `record_freq`-th episode

# Reward-shaping overrides are currently disabled, so the environment's own
# defaults apply. To experiment, define the values below and pass them to
# gym.make as keyword arguments of the same names:
#   reward_clipped = -0.02
#   reward_enter_target_area = 0.01
#   reward_exit_target_area = -0.02
env = gym.make(
    "kidpuzzles/DigitsPuzzleEnv-v0",
    n_digits=n_digits,
    render_mode="rgb_array",
)

# Candidate learning-rate schedules, all starting from `initial_lr`.
lin_lr_scheduler = linear_scheduler(initial_lr)
step_lr_scheduler = step_scheduler(
    initial_lr,
    drop_interval=0.33,
    drop_factor=0.5,
)
cos_lr_scheduler = cosine_annealing_scheduler(initial_lr, T_max=total_timesteps)

In [None]:
# Choose the schedule for this run. `lr_name` is the label that the training
# cell embeds in the video folder and TensorBoard log directory names.
lr_name = "step_f0.5_i0.33"
lr_scheduler = step_lr_scheduler

## Training the agent

In [None]:
    
# Name this run's artifacts after the puzzle size and the chosen LR schedule.
video_folder = f"DigitsPuzzle-{n_digits}-{lr_name}"
tensorboard_dir = f"./logs/a2c_digitspuzzle_nd{n_digits}_{lr_name}/"


def _record_this_episode(episode_index):
    """Return True for every `record_freq`-th episode, starting with episode 0."""
    return episode_index % record_freq == 0


# Wrap the environment so the selected training episodes are saved as videos.
env = RecordVideo(
    env,
    video_folder=f"videos/{video_folder}",
    name_prefix="training",
    episode_trigger=_record_this_episode,
)

# A2C agent with a multi-input policy, trained with the selected LR schedule.
model = A2C(
    "MultiInputPolicy",
    env,
    learning_rate=lr_scheduler,
    n_steps=64,
    ent_coef=0.01,
    tensorboard_log=tensorboard_dir,
    device="mps",
    verbose=1,
)
model.learn(total_timesteps=total_timesteps)

  logger.warn(


Using mps device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Logging to ./logs/a2c_digitspuzzle_nd10_step_f0.5_i0.33/A2C_2
MoviePy - Building video /Users/bapa/Codes/KidsPuzzles/videos/DigitsPuzzle-10-step_f0.5_i0.33/training-episode-0.mp4.
MoviePy - Writing video /Users/bapa/Codes/KidsPuzzles/videos/DigitsPuzzle-10-step_f0.5_i0.33/training-episode-0.mp4



                                                                          

MoviePy - Done !
MoviePy - video ready /Users/bapa/Codes/KidsPuzzles/videos/DigitsPuzzle-10-step_f0.5_i0.33/training-episode-0.mp4




# Testing the agent

In [None]:
import time

# Roll the trained policy forward for a few steps and watch it play.
# model.get_env() returns the vectorized environment the model was trained on.
vec_env = model.get_env()
observations = vec_env.reset()
reward_sum = 0
for _ in range(10):
    action, _state = model.predict(observations)
    # A vectorized env's step() returns (observations, rewards, dones, infos),
    # each batched over its sub-environments.
    observations, reward, dones, info = vec_env.step(action)
    vec_env.render("human")
    time.sleep(5)  # pause so each rendered frame can be inspected

    # `reward` is an array, so the total is printed as an array as well.
    reward_sum += reward
    # No manual reset on `dones`: a vectorized env resets a finished
    # sub-environment by itself, and `observations` already holds the first
    # observation of the new episode. Resetting again would throw that
    # episode away and start yet another one.

print("Total reward: ", reward_sum)

2025-02-09 12:48:27.298 python[40735:2934412] +[IMKClient subclass]: chose IMKClient_Modern
2025-02-09 12:48:27.298 python[40735:2934412] +[IMKInputSession subclass]: chose IMKInputSession_Modern


Total reward:  [2.5222225]


In [None]:
# Shut down the environment once training and evaluation are finished. If the
# training cell ran, `env` is the RecordVideo-wrapped environment.
env.close()