In [1]:
# Inject CSS that stretches the notebook's `.container` element to 90% of the
# window width.
# `display` and `HTML` come from the public `IPython.display` module; importing
# them from `IPython.core.display` is deprecated (since IPython 7.14).
from IPython.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

In [2]:
import matplotlib.pyplot as plt
from pprint import pprint

%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

plt.style.use('ggplot')

%load_ext autoreload
%autoreload 2

In [3]:
import sys
# Add the parent directory to the module search path. This has to run before
# the project-local imports below (navigation_policy, rewards, utils,
# expected_steps) — presumably those modules live one level up; confirm.
sys.path.append('..')

import random
import torch
import numpy as np

from gym_minigrid_navigation.utils import show_video
from navigation_policy import gen_env, get_agent, run_episode, run_episodes
from rewards import get_reward_function
from utils import init_logger, switch_reproducibility_on, display_stats
from expected_steps import ExpectedStepsAmountLeaner
from rewards import ExpectedStepsAmountReward

import logging
# Logger for code executed directly in this notebook.
logger = logging.getLogger(__name__)

# Call init_logger for the notebook's own logger name and for each project
# logger name, in the same order as before. The project names match the logger
# names seen in the log lines of the cell outputs.
for logger_name in (
    '__main__',
    'dqn',
    'expected_steps',
    'navigation_policy',
    'gym_minigrid_navigation.environments',
):
    init_logger(logger_name)

### config 

In [4]:
from pyhocon import ConfigFactory

# Load the experiment configuration from the HOCON file.
config = ConfigFactory.parse_file('../conf/minigrid_dqn_navigation_draft.hocon')
# Override the video output path; the same value is read back later and passed
# to show_video.
config['env']['video_path'] = '../outputs/video/'

# Pass the configured seed to the project's reproducibility helper.
# Presumably it seeds the random / numpy / torch RNGs — confirm in utils.
switch_reproducibility_on(config['seed'])

### agent and steps amount model 

In [5]:
# Learner for the expected-steps model, built from its own config section.
# Presumably the model predicts how many steps remain to the goal — confirm in
# expected_steps.
expected_steps_learner = ExpectedStepsAmountLeaner(config['expected_steps_params'])
# Navigation agent built from the full config.
agent = get_agent(config)

2021-02-15 18:14:46,927 INFO    dqn                    : Running on device: cuda:0


### steps amount model training

In [6]:
# Environment whose reward function ignores its arguments and always returns 0.
env = gen_env(config['env'], reward_function=lambda *args: 0)
# Fill the bias of the last layer in model.fc with 20 before fitting.
# NOTE(review): 20 looks like an initial guess for the predicted step count —
# confirm, and consider moving it into the config.
expected_steps_learner.model.fc[-1].bias.data.fill_(20.)

# Reset the learner's buffer, collect episodes with the agent in this
# environment, then fit the model. verbose=True produces the L1Loss log line
# shown in the output.
expected_steps_learner.reset_buffer()
expected_steps_learner.collect_episodes(env, agent)
expected_steps_learner.learn(verbose=True)

2021-02-15 18:51:33,712 INFO    expected_steps         : Expected steps L1Loss:  8.341548559979023


### agent training

In [7]:
# Reward function built from the fitted expected-steps model.
reward_function = ExpectedStepsAmountReward(expected_steps_learner.model)
# Re-create the environment so it uses this reward instead of the constant 0.
env = gen_env(config['env'], reward_function=reward_function)

# Run the configured number of episodes. The two returned values are kept as
# `scores` and `steps`; the log output reports average score and average steps.
scores, steps = run_episodes(env, agent, n_episodes=config['training.n_episodes'], verbose=config['training.verbose'])

2021-02-15 18:53:13,230 INFO    navigation_policy      : Episode: 100. Average score: 15.88916485786438. Average steps: 123.77
2021-02-15 18:54:22,128 INFO    navigation_policy      : Episode: 200. Average score: 18.5242711353302. Average steps: 83.65
2021-02-15 18:55:09,234 INFO    navigation_policy      : Episode: 300. Average score: 18.094962930679323. Average steps: 56.47
2021-02-15 18:55:45,063 INFO    navigation_policy      : Episode: 400. Average score: 18.609642791748048. Average steps: 42.33
2021-02-15 18:56:18,961 INFO    navigation_policy      : Episode: 500. Average score: 19.514019021987917. Average steps: 39.52
2021-02-15 18:56:48,246 INFO    navigation_policy      : Episode: 600. Average score: 19.65490288734436. Average steps: 33.83
2021-02-15 18:57:14,212 INFO    navigation_policy      : Episode: 700. Average score: 20.21767728805542. Average steps: 30.16
2021-02-15 18:57:34,435 INFO    navigation_policy      : Episode: 800. Average score: 18.857579832077025. Average s

In [15]:
# Re-create the environment with verbose=True. Here reward_function is passed
# positionally, whereas the earlier gen_env calls pass it by keyword.
# NOTE(review): this cell's execution count (15) does not follow the previous
# cell (7); check that the notebook still works with Restart & Run All.
env = gen_env(config['env'], reward_function, verbose=True)
# Run one episode and print what run_episode returns (a 2-tuple in the output).
# NOTE(review): train_mode=True is used in what looks like a demo/evaluation
# episode — confirm this is intended.
print(run_episode(env, agent, train_mode=True))

# Show the video found under the configured video path.
show_video(config['env.video_path'])

2021-02-15 19:08:01,126 INFO    gym_minigrid_navigation.environments   : From [2 3] to [1 3]


(16.416171073913574, 18)
