In [1]:
%load_ext autoreload
%autoreload 2

In [2]:

from collections import defaultdict
import typing
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

import gym
import gym_minigrid
from gym_minigrid.wrappers import ImgObsWrapper
from gym_minigrid.envs.numbertasks import NumberTaskType

from stable_baselines3 import A2C, PPO, DQN
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_checker import check_env


In [6]:
def _make_env(env_id: str, seed: int, **kwargs):
    """Create one environment and wrap it for image observations and monitoring.

    Args:
        env_id: Id passed to ``gym.make``.
        seed: Seed forwarded to ``gym.make`` as the ``seed`` keyword.
        **kwargs: Any further keyword arguments forwarded to ``gym.make``.

    Returns:
        The new environment wrapped first in ``ImgObsWrapper`` and then in
        ``Monitor``.
    """
    base_env = gym.make(env_id, seed=seed, **kwargs)
    image_env = ImgObsWrapper(base_env)
    return Monitor(image_env)


def _make_n_envs(env_id: str, n: int, seed: int, normalize: bool = True, **kwargs):
    """Build a ``DummyVecEnv`` of ``n`` monitored environments with distinct seeds.

    Args:
        env_id: Environment id forwarded to ``_make_env``.
        n: Number of sub-environments to create.
        seed: Base seed; sub-environment ``i`` is created with ``seed + i``.
        normalize: When true, wrap the vectorized env in ``VecNormalize``.
        **kwargs: Extra keyword arguments forwarded to ``_make_env``.

    Returns:
        The ``VecNormalize``-wrapped env when ``normalize`` is true, otherwise
        the bare ``DummyVecEnv``.
    """
    # Bind the loop index as a default argument. A plain ``lambda:`` closes over
    # the variable ``i`` itself, and the factories are only called after the
    # comprehension has finished, so every factory would see the final value
    # (``n - 1``) and all sub-environments would share a single seed.
    env_fns = [
        lambda i=i: _make_env(env_id, seed=seed + i, **kwargs)
        for i in range(n)
    ]
    env = DummyVecEnv(env_fns)
    return VecNormalize(env) if normalize else env


def _make_single_env(env_id: str, seed: int, **kwargs):
    """Convenience wrapper: a vectorized env holding exactly one sub-environment.

    Args:
        env_id: Environment id forwarded to ``_make_n_envs``.
        seed: Seed forwarded to ``_make_n_envs``.
        **kwargs: Forwarded unchanged to ``_make_n_envs`` (which also accepts
            ``normalize``).

    Returns:
        Whatever ``_make_n_envs`` returns for ``n=1``.
    """
    single_vec_env = _make_n_envs(env_id, n=1, seed=seed, **kwargs)
    return single_vec_env

In [7]:
# --- Experiment configuration ------------------------------------------------
N_ENVS = 1                 # number of training environments
N_EVALS = 20               # divides N_BASE_STEPS to give the evaluation interval
N_BASE_STEPS = 200_000     # training steps per environment
N_TOTAL_STEPS = N_ENVS * N_BASE_STEPS
N_EVAL_EPISODES = 50       # episodes per evaluation round

ENV_ID = 'MiniGrid-NumberTasksNosePoke-v0'
TRAIN_SEED, EVAL_SEED = 100, 200

# Set ``TASKS = NumberTaskType`` instead to iterate over every task type.
TASKS = [NumberTaskType.color]

# Agent classes to train (PPO can be appended) and keyword arguments passed to
# each agent constructor, e.g. {'learning_rate': 1e-3}.
AGENT_TYPES = [DQN]
AGENT_KWARGS = {}

# Extra keyword arguments for environment creation, e.g.
# {'shuffle_task_locations': True, 'min_agent_view_size': 5, 'step_reward': -0.01}.
ENV_KWARGS = {}

# Results are stored as all_agent_results[task_name][agent_name].
all_agent_results = defaultdict(dict)


# Train every agent type on every task, evaluating periodically on a separately
# seeded environment, and keep the per-evaluation reward statistics.
for task in TASKS:
    for agent_type in AGENT_TYPES:
        # ``_make_single_env`` is just ``_make_n_envs`` with n=1, so a single
        # call covers both the one-env and the multi-env case.
        env = _make_n_envs(ENV_ID, n=N_ENVS, seed=TRAIN_SEED, task=task, **ENV_KWARGS)
        eval_env = _make_single_env(ENV_ID, seed=EVAL_SEED, task=task, **ENV_KWARGS)

        # Stop training early once an evaluation sets a new best mean reward
        # that reaches the threshold.
        stop_callback = StopTrainingOnRewardThreshold(reward_threshold=0.85, verbose=1)

        # ``eval_freq`` must be a whole number of steps: the callback compares
        # its step counter against it with a modulo, so the fractional result
        # of true division would never match when N_EVALS does not divide
        # N_BASE_STEPS. Clamp to 1 so evaluation is never silently disabled.
        eval_freq = max(1, N_BASE_STEPS // N_EVALS)
        eval_callback = EvalCallback(
            eval_env,
            n_eval_episodes=N_EVAL_EPISODES,
            eval_freq=eval_freq,
            callback_on_new_best=stop_callback,
            log_path=f'./logs/{agent_type.__name__}_{task.name}',
            verbose=1,
        )

        model = agent_type('MlpPolicy', env, **AGENT_KWARGS)
        model.learn(total_timesteps=N_TOTAL_STEPS, callback=eval_callback)

        # One entry per evaluation round; reduce each to its mean and std.
        rewards = eval_callback.evaluations_results
        means = [np.mean(r) for r in rewards]
        stds = [np.std(r) for r in rewards]

        all_agent_results[task.name][agent_type.__name__] = means, stds

Eval num_timesteps=10000, episode_reward=0.00 +/- 0.00
Episode length: 36.00 +/- 0.00
New best mean reward!
Eval num_timesteps=20000, episode_reward=0.00 +/- 0.00
Episode length: 36.00 +/- 0.00
Eval num_timesteps=30000, episode_reward=0.00 +/- 0.00
Episode length: 36.00 +/- 0.00
Eval num_timesteps=40000, episode_reward=0.00 +/- 0.00
Episode length: 36.00 +/- 0.00
Eval num_timesteps=50000, episode_reward=0.00 +/- 0.00
Episode length: 36.00 +/- 0.00
Eval num_timesteps=60000, episode_reward=0.00 +/- 0.00
Episode length: 36.00 +/- 0.00
Eval num_timesteps=70000, episode_reward=0.00 +/- 0.00
Episode length: 36.00 +/- 0.00
Eval num_timesteps=80000, episode_reward=0.57 +/- 0.47
Episode length: 4.04 +/- 8.07
New best mean reward!
Eval num_timesteps=90000, episode_reward=0.46 +/- 0.47
Episode length: 2.00 +/- 0.00
Eval num_timesteps=100000, episode_reward=0.47 +/- 0.48
Episode length: 2.00 +/- 0.00
Eval num_timesteps=110000, episode_reward=0.47 +/- 0.47
Episode length: 2.00 +/- 0.00
Eval num_tim

In [58]:
# Re-evaluate the most recently trained `model` on `eval_env` using twice the
# per-round episode count; the cell displays the returned 2-tuple
# (presumably mean and std of the episode reward — confirm against SB3 docs).
evaluate_policy(model, eval_env, n_eval_episodes=N_EVAL_EPISODES * 2)

(0.57, 0.4654030511288038)

In [10]:
# Inspect the normalization wrapper the model reports for its training env.
# NOTE(review): no output was recorded for this cell.
model.get_vec_normalize_env()

In [9]:
# Print the per-evaluation mean rewards stored for every (task, agent) pair.
for task_name, agent_results in all_agent_results.items():
    for agent, (means, stds) in agent_results.items():
        print(task_name, agent, means)

color A2C [0.0, 0.5700000000000001, 0.9200000000000002]
color PPO [0.665, 0.9499999999999998]
magnitude A2C [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.19, 0.0, 0.095, 0.095, 0.0, 0.0, 0.0, 0.0, 0.095, 0.0, 0.0, 0.0, 0.0, 0.0]
magnitude PPO [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
parity A2C [0.38, 0.475, 0.47000000000000003, 0.475, 0.655, 0.38, 0.665, 0.665, 0.38, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
parity PPO [0.38, 0.5700000000000001, 0.475, 0.38, 0.38, 0.5700000000000001, 0.665, 0.5700000000000001, 0.475, 0.38, 0.19, 0.19, 0.285, 0.475, 0.5700000000000001, 0.38, 0.285, 0.0, 0.19, 0.095]


In [48]:
# Build a single (non-vectorized) env for manual inspection and display the
# observation returned by the first reset. The task is given here as the
# string 'color' rather than a NumberTaskType member.
env = _make_env(ENV_ID, seed=TRAIN_SEED, task='color')
obs = env.reset()
obs

array([[[ 2,  5,  0],
        [ 2,  5,  0],
        [ 8,  1,  0]],

       [[ 2,  5,  0],
        [11,  1,  5],
        [ 1,  0,  0]],

       [[ 2,  5,  0],
        [ 2,  5,  0],
        [ 8,  1,  0]]], dtype=uint8)

In [49]:
# Take one `right` action and display the step result
# (observation, reward, done flag, info dict).
env.step(env.actions.right)

(array([[[ 2,  5,  0],
         [ 2,  5,  0],
         [11,  1,  5]],
 
        [[ 6,  2,  0],
         [ 8,  1,  0],
         [ 1,  0,  0]],
 
        [[ 2,  5,  0],
         [ 2,  5,  0],
         [ 2,  5,  0]]], dtype=uint8),
 0,
 False,
 {})

In [21]:
# Take two consecutive `left` actions; only the second step's result is
# displayed as the cell output.
env.step(env.actions.left)
env.step(env.actions.left)

(array([[[ 2,  5,  0],
         [ 2,  5,  0],
         [ 2,  5,  0]],
 
        [[11,  5,  0],
         [ 8,  1,  0],
         [ 1,  0,  0]],
 
        [[ 2,  5,  0],
         [ 2,  5,  0],
         [11,  1,  5]]], dtype=uint8),
 0,
 False,
 {})

In [47]:
# Sanity check for the 'parity' task using a hand-written policy: read the
# value at the centre of the initial view, turn left once to read the marker
# now in view, turn twice more if they disagree, then step forward and record
# the reward. Every episode is expected to end on that forward step.
N_ITER = 10

env = _make_env(ENV_ID, seed=TRAIN_SEED, task='parity')
rewards = []

for _ in range(N_ITER):
    obs = env.reset()
    digit = obs[1, 1, -1]  # last channel of view cell [1, 1]

    turn_obs = env.step(env.actions.left)[0]
    task_marker = turn_obs[1, 0, -1]  # last channel of view cell [1, 0] after turning

    # On a mismatch take two more `left` actions (presumably leaving the agent
    # facing the opposite side) before moving.
    mismatch = (digit % 2) != task_marker
    for _ in range(2 if mismatch else 0):
        env.step(env.actions.left)

    _, r, done, _ = env.step(env.actions.forward)
    if not done:
        raise ValueError('Expected done')
    rewards.append(r)

print(rewards)
    


[0.9, 0.9, 0.95, 0.95, 0.9, 0.9, 0.9, 0.95, 0.9, 0.95]


In [46]:
# Sanity check for the 'magnitude' task using a hand-written policy: read the
# value at the centre of the initial view, turn left once to read the marker
# now in view, turn twice more if "value >= 5" and "marker == 11" disagree,
# then step forward and record the reward. Every episode is expected to end
# on that forward step.
N_ITER = 10

env = _make_env(ENV_ID, seed=TRAIN_SEED, task='magnitude')
rewards = []

for _ in range(N_ITER):
    obs = env.reset()
    digit = obs[1, 1, -1]  # last channel of view cell [1, 1]

    turn_obs = env.step(env.actions.left)[0]
    task_marker = turn_obs[1, 0, -1]  # last channel of view cell [1, 0] after turning

    # On a mismatch take two more `left` actions (presumably leaving the agent
    # facing the opposite side) before moving.
    mismatch = (digit >= 5) != (task_marker == 11)
    for _ in range(2 if mismatch else 0):
        env.step(env.actions.left)

    _, r, done, _ = env.step(env.actions.forward)
    if not done:
        raise ValueError('Expected done')
    rewards.append(r)

print(rewards)
    


[0.95, 0.9, 0.9, 0.9, 0.95, 0.95, 0.9, 0.9, 0.9, 0.9]


In [50]:
# Sanity check for the 'color' task using a hand-written policy: read channel 1
# of the centre of the initial view, turn left once to read channel 1 of the
# marker now in view, turn twice more if the two differ, then step forward and
# record the reward. Every episode is expected to end on that forward step.
N_ITER = 10

env = _make_env(ENV_ID, seed=TRAIN_SEED, task='color')
rewards = []

for _ in range(N_ITER):
    obs = env.reset()
    digit_color = obs[1, 1, 1]  # channel 1 of view cell [1, 1]

    turn_obs = env.step(env.actions.left)[0]
    task_color = turn_obs[1, 0, 1]  # channel 1 of view cell [1, 0] after turning

    # On a mismatch take two more `left` actions (presumably leaving the agent
    # facing the opposite side) before moving.
    mismatch = digit_color != task_color
    for _ in range(2 if mismatch else 0):
        env.step(env.actions.left)

    _, r, done, _ = env.step(env.actions.forward)
    if not done:
        raise ValueError('Expected done')
    rewards.append(r)

print(rewards)
    


[0.95, 0.95, 0.9, 0.9, 0.9, 0.9, 0.95, 0.95, 0.95, 0.95]
