# Minigrid navigation stable baseline

In [17]:
import sys

sys.path.append("..")

import numpy as np
import gym

import torch
import torch.nn as nn

import train_worker
from environments import NavigationGoalWrapper

from stable_baselines3 import DQN
from rllr_stable_baselines3 import GoalStateExtenderWrapper, ExtendedStateFeatureExtractor, StateExtenderWrapper

In [18]:
worker_env_config = {
        "env_type": "gym_minigrid",
        "env_task": "MiniGrid-Empty",
        "grid_size": 8,
        "action_size": 3,
        "rgb_image": False,
        "goal_achieving_criterion": "position",
        "goal_type": "random",
        "video_path": "outputs/video/"
    }

worker_env = train_worker.gen_navigation_env(worker_env_config)
worker_env = GoalStateExtenderWrapper(worker_env)

policy_kwargs = dict(
    features_extractor_class=ExtendedStateFeatureExtractor,
    features_extractor_kwargs=dict(features_dim=64, hidden=128),
    net_arch=[128, 128]
)

dqn_kwargs = {
    "learning_rate": 1e-3,
    "buffer_size": int(1e5),
    "batch_size": 128,
    "learning_starts": 1000,
    "exploration_fraction": 0.995,
    "gamma": 0.9,
    "tau": 0.999,
    "seed": 42,
    "policy_kwargs": policy_kwargs,
    "verbose": 1
}

worker = DQN("MlpPolicy", worker_env, **dqn_kwargs)

worker.learn(10000, log_interval=10)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 147      |
|    ep_rew_mean      | -13.8    |
|    exploration rate | 0.86     |
| time/               |          |
|    episodes         | 10       |
|    fps              | 1057     |
|    time_elapsed     | 1        |
|    total timesteps  | 1469     |
| train/              |          |
|    learning_rate    | 0.001    |
|    loss             | 0.000402 |
|    n_updates        | 117      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 123      |
|    ep_rew_mean      | -11.4    |
|    exploration rate | 0.765    |
| time/               |          |
|    episodes         | 20       |
|    fps              | 768      |
|    time_elapsed     | 3        |
|    total timesteps  | 2466     |
| train/              |        

<stable_baselines3.dqn.dqn.DQN at 0x7fde332dfa90>

In [19]:
master_env_config = {
    "env_type": "gym_minigrid",
    "env_task": "MiniGrid-Empty",
    "grid_size": 8,
    "action_size": 3,
    "rgb_image": False,
    "video_path": "outputs/video/"
}

master_env = minigrid_envs.gen_wrapped_env(master_env_config)
master_env = StateExtenderWrapper(master_env)
worker.set_env(master_env)

for param in worker.policy.q_net.features_extractor.state_encoder.parameters():
    param.requires_gradient = False

for param in worker.policy.q_net.q_net.parameters():
    param.requires_gradient = False

worker.learn(30000, log_interval=10)


Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 236      |
|    ep_rew_mean      | 0.109    |
|    exploration rate | 0.925    |
| time/               |          |
|    episodes         | 10       |
|    fps              | 770      |
|    time_elapsed     | 3        |
|    total timesteps  | 2363     |
| train/              |          |
|    learning_rate    | 0.001    |
|    loss             | 0.000483 |
|    n_updates        | 2590     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 205      |
|    ep_rew_mean      | 0.23     |
|    exploration rate | 0.87     |
| time/               |          |
|    episodes         | 20       |
|    fps              | 660      |
|    time_elapsed     | 6        |
|    total timesteps  | 4096     |
| train/              |          |
|    learning

<stable_baselines3.dqn.dqn.DQN at 0x7fde332dfa90>