# Curriculum Learning - Self-Contained

Automatic curriculum learning for MiniGrid environments.

In [1]:
import os, warnings
os.environ["PYTHONWARNINGS"] = "ignore"
warnings.filterwarnings("ignore")

from typing import List, Dict
import numpy as np
import time
import torch

from stable_baselines3 import PPO
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.vec_env import SubprocVecEnv

from src.environment import make_fixed_mixed_vec_env
from src.evaluation import evaluate
from src.cnn import get_policy_kwargs
from src.filemanager import FileManager

# Result directory

In [2]:
# Shared experiment file manager; the cells below use it to write evaluation
# CSV rows, episode visualizations and model checkpoints.
fm: FileManager = FileManager("auto_curriculum_learning_frontier_capped", output_dir="results")

Experiment directory initialized: results/auto_curriculum_learning_frontier_capped_20251031_200801


## Curriculum Teacher


In [3]:
class AutoCurriculumTeacher:
    """
    Simple frontier curriculum over an ordered list of stages (easiest first).

    - The frontier is the first stage whose recent mean success rate is
      below ``target`` (a stage with no evaluations counts as unmet).
    - Every stage keeps one env for refresh; about 70% of the remaining envs
      go to the frontier and the rest to the stage just before it.
    - No stage may hold more than ``(n_envs - 1) // 2`` envs; overflow is
      moved to the nearest stage with spare room.
    - Once every stage is mastered the envs are split uniformly.
    """

    def __init__(self, stages: List[str], n_envs: int = 8, window: int = 3, target: float = 0.90) -> None:
        """
        Args:
            stages: Stage identifiers ordered from easiest to hardest.
            n_envs: Total number of parallel environments to distribute.
            window: Number of most recent evaluations averaged per stage.
            target: Mean success rate at which a stage counts as mastered.
        """
        self.stages = stages
        self.n_envs = n_envs
        self.window = window
        self.target = target
        # Per-stage history of recorded success rates, oldest first.
        self.performance: Dict[str, List[float]] = {s: [] for s in stages}
        # Nominal per-stage cap: strictly less than half of the envs.
        self._cap = (n_envs - 1) // 2

    def _cap_fix(self, counts: np.ndarray) -> np.ndarray:
        """
        Enforce the per-stage cap in place while preserving ``counts.sum()``.

        Overflow from a stage is handed to the nearest stage that still has
        room, previous stages first and then following ones. If the nominal
        cap cannot hold the total at all (too few stages), it is relaxed to
        the smallest feasible value so the allocation still sums to the same
        total and the method always terminates.

        Args:
            counts: Integer env count per stage; modified in place.

        Returns:
            The same ``counts`` array.
        """
        n_stages = len(counts)
        if n_stages == 0:
            return counts

        total = int(counts.sum())
        # ceil(total / n_stages) is the smallest cap that can fit everything.
        cap = max(self._cap, -(-total // n_stages))

        for i in range(n_stages):
            excess = int(counts[i]) - cap
            if excess <= 0:
                continue
            # Nearest previous stage first, then nearest following stage.
            receivers = list(range(i - 1, -1, -1)) + list(range(i + 1, n_stages))
            for j in receivers:
                room = cap - int(counts[j])
                if room <= 0:
                    continue
                moved = min(room, excess)
                counts[i] -= moved
                counts[j] += moved
                excess -= moved
                if excess == 0:
                    break
        return counts

    # logging
    def record_stage(self, stage: str, success_rate: float) -> None:
        """Append ``success_rate`` to the stage history, keeping it bounded."""
        history = self.performance[stage]
        history.append(success_rate)
        # Bound memory: once past 2000 entries keep only the latest 1000.
        if len(history) > 2000:
            self.performance[stage] = history[-1000:]

    # helper
    def _mean_recent(self, s: str) -> float:
        """Mean of the last ``window`` success rates of stage ``s`` (0.0 if none)."""
        history = self.performance[s]
        if not history:
            return 0.0
        w = min(self.window, len(history))
        return float(np.mean(history[-w:]))

    def _frontier_index(self) -> int:
        """Return index of the first (easiest) stage with mean < target; -1 if all mastered."""
        for i, s in enumerate(self.stages):
            # No data yet -> treat the stage as unmet regardless of target.
            if not self.performance[s] or self._mean_recent(s) < self.target:
                return i
        return -1  # all mastered

    # allocation
    def get_env_composition(self) -> Dict[str, int]:
        """
        Compute how many envs each stage receives for the next training phase.

        Returns:
            Mapping from stage name to env count, in ``self.stages`` order.
        """
        K = len(self.stages)
        # 1) refresh: 1 env everywhere
        counts = np.ones(K, dtype=int)

        # 2) choose frontier (first unmet)
        f = self._frontier_index()

        if f == -1:
            # all mastered -> uniform split; the first `r` stages absorb the remainder
            q = self.n_envs // K
            r = self.n_envs - q * K
            counts[:] = q
            counts[:r] += 1
        else:
            # envs left after the one-per-stage refresh allocation
            R = self.n_envs - counts.sum()
            if R > 0:
                prev_idx = f - 1 if f - 1 >= 0 else None
                # Without a previous stage the frontier takes everything.
                to_frontier = R if prev_idx is None else int(round(0.7 * R))
                to_prev = 0 if prev_idx is None else (R - to_frontier)

                counts[f] += to_frontier
                if prev_idx is not None:
                    counts[prev_idx] += to_prev

                # exact-sum safety (should be 0)
                diff = self.n_envs - counts.sum()
                if diff != 0:
                    counts[f] += diff

        counts = self._cap_fix(counts)

        # Keep the hardest stage at 2 envs when it lands on exactly 3.
        # NOTE(review): the literal 3 equals the cap only for n_envs in {7, 8};
        # for other sizes this rule does not track the cap - confirm intent.
        hardest = len(counts) - 1
        if counts[hardest] == 3:
            for j in range(len(counts)):  # first other stage still under the cap
                if j != hardest and counts[j] < self._cap:
                    counts[hardest] -= 1
                    counts[j] += 1
                    break

        return {s: int(c) for s, c in zip(self.stages, counts)}

    # stopping
    def has_converged_hardest(self, thresh: float = 0.95, evals: int = 3) -> bool:
        """True when the hardest stage's last ``evals`` results average at least ``thresh``."""
        history = self.performance[self.stages[-1]]
        if len(history) < evals:
            return False
        return float(np.mean(history[-evals:])) >= thresh

    def plateaued(self, patience: int = 8, tol: float = 0.02) -> bool:
        """True when the hardest stage's last ``patience`` results span less than ``tol``."""
        history = self.performance[self.stages[-1]]
        if len(history) < patience:
            return False
        recent = history[-patience:]
        return (max(recent) - min(recent)) < tol


## Step Callback


In [4]:
class StepCallback(BaseCallback):
    """
    Periodic evaluation on all stages (mixed training).

    Every ``eval_freq`` environment timesteps the current model is evaluated
    on each stage; results are fed to the teacher, printed, written to CSV
    and optionally visualized.
    """

    def __init__(
        self,
        teacher: AutoCurriculumTeacher,
        stages: List[str],
        eval_freq: int = 5_000,
        n_eval: int = 30,
        visualize: bool = True,
    ) -> None:
        """
        Args:
            teacher: Curriculum teacher that receives per-stage success rates.
            stages: Stage identifiers to evaluate on.
            eval_freq: Environment timesteps between evaluations.
            n_eval: Value passed to ``evaluate`` as the evaluation size.
            visualize: Whether to render one evaluation episode per stage.
        """
        super().__init__()
        self.teacher = teacher
        self.stages = stages
        self.eval_freq = eval_freq
        self.n_eval = n_eval
        self.visualize = visualize

        # Environment timesteps seen so far (mirrors the model's counter).
        self.total_steps: int = 0
        # Timestep count at which the last evaluation ran.
        self._last_eval_step: int = 0
        self.training_start_time: float = time.time()

        # Track performance on each stage separately
        self.eval_history: Dict[str, List[float]] = {s: [] for s in stages}

        # Current env allocation, logged alongside every evaluation row.
        self.allocation: Dict[str, int] = {}

    def set_allocation(self, allocation: Dict[str, int]) -> None:
        """Store the env allocation that is currently being trained on."""
        self.allocation = allocation

    def _on_step(self) -> bool:
        """Run an evaluation round when ``eval_freq`` timesteps have elapsed."""
        # _on_step fires once per vectorised step, which advances the model by
        # one timestep per parallel env. Counting calls would under-count by a
        # factor of n_envs, so mirror the timestep counter BaseCallback
        # maintains; this keeps eval_freq and total_steps in the same unit as
        # the `total_timesteps` passed to `learn`.
        self.total_steps = self.num_timesteps

        if self.total_steps - self._last_eval_step < self.eval_freq:
            return True
        self._last_eval_step = self.total_steps

        assert isinstance(self.model, PPO)

        # Evaluate on ALL stages (not just current training stage)
        print(f"\n  Eval @ {self.total_steps:,}")

        for stage in self.stages:
            episode_batch = evaluate(self.model, stage, self.n_eval)
            self.eval_history[stage].append(episode_batch.success_rate)
            self.teacher.record_stage(stage, episode_batch.success_rate)
            # self.teacher.record_entropy(stage, episode_batch.mean_entropy)

            # Log each stage
            print(
                f"    {stage}: {episode_batch.success_rate:.1%} | "
                f"Reward: {episode_batch.mean_reward:.2f} | "
                f"Ent: {episode_batch.mean_entropy:.3f}"
            )

            # Save to CSV
            fm.dump_eval_to_csv(
                total_step=self.total_steps,
                stage=stage,
                stage_step=self.total_steps,
                batch=episode_batch,
                model=self.model,
                allocation=self.allocation
            )

            # Visualize the first evaluation episode of this stage
            if self.visualize:
                from src.episode_visualization import visualize_eval_episode
                visualize_eval_episode(
                    model=self.model,
                    episode=episode_batch.episodes[0],
                    timestep=self.total_steps,
                    output_dir=fm.get_visualization_dir()
                )

        elapsed = time.time() - self.training_start_time
        print(f"  Total time: {int(elapsed//60):02d}:{int(elapsed%60):02d}")

        # Returning True tells the training loop to keep going.
        return True

    def get_all_evals_summary(self) -> str:
        """Summary of all stages' performance (latest value and mean of last 5)."""
        lines = ["Evaluation History (all stages):"]

        for stage in self.stages:
            history = self.eval_history[stage]
            if not history:
                lines.append(f"  {stage}: No evals")
            else:
                latest = history[-1]
                lines.append(f"  {stage}: latest={latest:.1%}, mean={np.mean(history[-5:]):.1%}")

        return "\n".join(lines)

## Training


In [5]:
# Rollout geometry: one PPO rollout collects N_STEPS transitions from each
# of the N_ENVS parallel environments.
N_ENVS: int = 8
N_STEPS: int = 128
STEPS_PER_ROLLOUT = N_ENVS * N_STEPS

# Evaluation cadence: once every 3 rollouts; N_EVALS is handed to the
# evaluation callback as its `n_eval` argument.
EVAL_FREQ: int = 3 * STEPS_PER_ROLLOUT
N_EVALS: int = 15

# Prefer the GPU when one is visible to torch, otherwise run on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"  # type: ignore
print(f"Using device: {device}")

# Curriculum: number of recent evaluations the teacher averages per stage.
WINDOW: int = 1

# Stage identifiers, easiest first.
# https://minigrid.farama.org/environments/minigrid/
STAGES: List[str] = [
    "MiniGrid-DoorKey-5x5-v0",
    "MiniGrid-DoorKey-6x6-v0",
    "MiniGrid-DoorKey-8x8-v0",
    "MiniGrid-DoorKey-16x16-v0",
]

# Step budget checked by the training loop.
TOTAL_STEPS: int = 200_000

# PPO
def make_model(env: SubprocVecEnv) -> PPO:
    """Build a CNN-policy PPO learner bound to ``env`` with this experiment's hyperparameters."""
    hyperparams = dict(
        policy_kwargs=get_policy_kwargs(),
        learning_rate=3e-4,
        n_steps=N_STEPS,
        batch_size=64,
        n_epochs=10,
        gamma=0.99,
        gae_lambda=0.95,
        clip_range=0.2,
        ent_coef=0.02,
        verbose=0,
        device=device,
    )
    return PPO("CnnPolicy", env, **hyperparams)

Using device: cuda


In [6]:
def train_curriculum() -> None:
    """
    Train with continuous auto-curriculum (dynamic env composition).

    Alternates between training for ``EVAL_FREQ`` timesteps and asking the
    teacher for a new env allocation. Whenever the allocation changes, the
    vectorised env is rebuilt, attached to the same model, and a checkpoint
    is saved. Training stops when the step budget is spent, the hardest
    stage has converged, or its success rate has plateaued.

    The vec env is always closed, including when training raises, so worker
    processes are not left behind.
    """
    teacher = AutoCurriculumTeacher(
        STAGES,
        window=WINDOW,
        n_envs=N_ENVS
    )

    callback = StepCallback(
        teacher,
        stages=STAGES,
        eval_freq=EVAL_FREQ,
        n_eval=N_EVALS,
        visualize=True,
    )

    # Initial allocation + initial env/model
    allocation = teacher.get_env_composition()
    env = make_fixed_mixed_vec_env(allocation)

    try:
        model = make_model(env)
        callback.set_allocation(allocation)

        print(f"\n=== Initial Allocation: {allocation} ===")

        while (
            callback.total_steps < TOTAL_STEPS
            and not teacher.has_converged_hardest(thresh=0.9, evals=5)
            and not teacher.plateaued()
        ):

            # Train for eval_freq steps (callback does evaluations).
            # reset_num_timesteps=False keeps the model's step counter
            # running across successive learn() calls.
            model.learn( # type: ignore
                total_timesteps=EVAL_FREQ,
                callback=callback,
                reset_num_timesteps=False
            )

            # Recompute curriculum allocation
            new_alloc = teacher.get_env_composition()

            # If allocation changed -> recompose VecEnv
            if new_alloc != allocation:
                print(f"\n>>> Curriculum shift detected")
                print(f"    Old: {allocation}")
                print(f"    New: {new_alloc}\n")

                # Close the old workers before spawning the new set.
                env.close()
                env = make_fixed_mixed_vec_env(new_alloc)
                model.set_env(env) # type: ignore
                allocation = new_alloc
                callback.set_allocation(new_alloc)

                fm.save_checkpoint(
                    model=model,
                    stage=f"auto_stage",
                    total_step=callback.total_steps
                )
    finally:
        # Runs on success and on error alike so env workers never leak.
        env.close()

    total_time = time.time() - callback.training_start_time
    print(f"\n=== TRAINING COMPLETE ===")
    print(f"Time: {int(total_time//60):02d}:{int(total_time%60):02d}")
    print(f"Steps: {callback.total_steps:,}")

In [7]:
# Run the curriculum training loop; progress is printed as it goes.
train_curriculum()


=== Initial Allocation: {'MiniGrid-DoorKey-5x5-v0': 3, 'MiniGrid-DoorKey-6x6-v0': 3, 'MiniGrid-DoorKey-8x8-v0': 1, 'MiniGrid-DoorKey-16x16-v0': 1} ===

  Eval @ 3,072
    MiniGrid-DoorKey-5x5-v0: 13.3% | Reward: 0.09 | Ent: 1.823
    → Evaluation saved to: results/auto_curriculum_learning_frontier_capped_20251031_200801/evaluations.csv
    → Saved visualization: results/auto_curriculum_learning_frontier_capped_20251031_200801/visualizations/eval_3072_MiniGrid_DoorKey_5x5_v0.png
    MiniGrid-DoorKey-6x6-v0: 6.7% | Reward: 0.01 | Ent: 1.805
    → Evaluation saved to: results/auto_curriculum_learning_frontier_capped_20251031_200801/evaluations.csv
    → Saved visualization: results/auto_curriculum_learning_frontier_capped_20251031_200801/visualizations/eval_3072_MiniGrid_DoorKey_6x6_v0.png
    MiniGrid-DoorKey-8x8-v0: 0.0% | Reward: 0.00 | Ent: 1.774
    → Evaluation saved to: results/auto_curriculum_learning_frontier_capped_20251031_200801/evaluations.csv
    → Saved visualization: resu

# Run

In [8]:
# from src.environment import run_episode
# model_path = "results/curriculum_learning_20251027_134503/checkpoints/MiniGrid-KeyCorridorS3R2-v0_step_475136.zip"
# model = PPO.load(model_path) # type: ignore
# episode_data = run_episode(
#     model=model, 
#     env_name="MiniGrid-KeyCorridorS3R2-v0", 
#     seed=42, 
#     render_mode="human", 
#     deterministic=True
# )

# print(episode_data)

In [9]:
# from src.episode_visualization import visualize_eval_episode
# visualize_eval_episode(
#     model=model,
#     episode=episode_data,
#     timestep=-1,
#     output_dir="./"
# )