In [1]:

from collections import defaultdict
import typing
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

import gym
import gym_minigrid
from gym_minigrid.wrappers import ImgObsWrapper
from gym_minigrid.envs.numbertasks import NumberTaskType

from stable_baselines3 import A2C, PPO
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold


In [None]:
def _make_env(id, **kwargs):
    """Create one monitored environment.

    Args:
        id: Environment id forwarded to ``gym.make``.
        **kwargs: Extra keyword arguments forwarded unchanged to ``gym.make``
            (the training loop in this notebook passes ``seed`` and ``task``).

    Returns:
        The environment created by ``gym.make``, wrapped first in
        ``ImgObsWrapper`` and then in ``Monitor``.
    """
    # Wrapping order matters: Monitor must be the outermost wrapper.
    image_env = ImgObsWrapper(gym.make(id, **kwargs))
    return Monitor(image_env)


def _make_n_envs(id, n, seed, **kwargs):
    """Create a ``DummyVecEnv`` of ``n`` environments with distinct seeds.

    Args:
        id: Environment id forwarded to ``_make_env``.
        n: Number of environments to create.
        seed: Base seed; environment ``i`` is built with ``seed + i``.
        **kwargs: Extra keyword arguments forwarded to every ``_make_env`` call.

    Returns:
        A ``DummyVecEnv`` built from ``n`` zero-argument environment factories.
    """
    def _env_factory(env_seed):
        # Bind the seed at factory-creation time. A bare ``lambda`` inside the
        # comprehension would look up the loop variable only when called, so
        # every environment would end up with the last seed (seed + n - 1).
        return lambda: _make_env(id, seed=env_seed, **kwargs)

    return DummyVecEnv([_env_factory(seed + i) for i in range(n)])

In [2]:
# --- Experiment configuration -------------------------------------------------
# Number of training environments; a value of 1 skips the vectorised wrapper.
N_ENVS = 10
# Used to derive the evaluation frequency (N_BASE_STEPS / N_EVALS).
N_EVALS = 20
# Step budget per environment; the total budget scales with N_ENVS.
N_BASE_STEPS = 100000
N_TOTAL_STEPS = N_BASE_STEPS * N_ENVS
# Episodes run by the evaluation callback at each evaluation.
N_EVAL_EPISODES = 10

# Environment id and the seeds used for the training and evaluation envs.
ENV_ID = 'MiniGrid-NumberTasksNosePoke-v0'
TRAIN_SEED = 100
EVAL_SEED = 200

# Iterated over by the training loop; each member is passed as `task=` and its
# `.name` is used in the log path (presumably an Enum -- confirm in gym_minigrid).
TASKS = NumberTaskType

# Agent classes to train, and keyword arguments passed to each constructor.
AGENT_TYPES = [A2C, PPO]
AGENT_KWARGS = dict(learning_rate=2e-3)




# Train every agent type on every task, evaluating periodically and stopping a
# run early once the evaluation reward reaches the threshold.
for task in TASKS:
    for agent_type in AGENT_TYPES:
        # Pass ENV_ID explicitly: the bare name `id` would resolve to the
        # Python builtin function, not the environment id string.
        if N_ENVS == 1:
            env = _make_env(ENV_ID, seed=TRAIN_SEED, task=task)
        else:
            env = _make_n_envs(ENV_ID, n=N_ENVS, seed=TRAIN_SEED, task=task)

        # Separate, differently seeded environment used only for evaluation.
        eval_env = _make_env(ENV_ID, seed=EVAL_SEED, task=task)
        stop_callback = StopTrainingOnRewardThreshold(reward_threshold=0.85, verbose=1)
        eval_callback = EvalCallback(
            eval_env,
            n_eval_episodes=N_EVAL_EPISODES,
            # Integer division keeps the evaluation period an int.
            eval_freq=N_BASE_STEPS // N_EVALS,
            callback_on_new_best=stop_callback,
            # stable_baselines3's EvalCallback takes the log directory as
            # `log_path`; it has no `eval_log_dir` keyword.
            log_path=f'./logs/{agent_type.__name__}_{task.name}',
            verbose=1,
        )

        model = agent_type('MlpPolicy', env, verbose=1, **AGENT_KWARGS)
        try:
            model.learn(total_timesteps=N_TOTAL_STEPS, callback=eval_callback)
        finally:
            # Release the environments so they do not pile up across the
            # task x agent combinations, even if training raises.
            env.close()
            eval_env.close()




NameError: name 'NumberTaskType' is not defined