In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

In [2]:
import matplotlib.pyplot as plt
from pprint import pprint

%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

plt.style.use('ggplot')

%load_ext autoreload
%autoreload 2

In [3]:
import sys
sys.path.append('..')

from gym_minigrid_navigation.utils import show_video
from navigation_policy import gen_env, get_agent, run_episode, run_episodes

from rewards import get_reward_function
from utils import init_logger, switch_reproducibility_on, display_stats

init_logger('dqn')
init_logger('navigation_policy')
init_logger('gym_minigrid_navigation.environments')

### config 

In [4]:
from pyhocon import ConfigFactory

config = ConfigFactory.parse_file('../conf/minigrid_dqn_navigation_rgb_cnn.hocon')
config['env']['video_path'] = '../outputs/video/'

In [5]:
switch_reproducibility_on(config['seed'])

### environment 

In [6]:
reward_functions = get_reward_function(config)
env = gen_env(config['env'], reward_functions)

### agent 

In [7]:
from utils import init_logger

agent = get_agent(env, config)

2021-02-05 17:09:55,483 INFO    dqn                    : Running on device: cuda:0


### trainings 

In [None]:
scores, steps = run_episodes(env, agent, n_episodes=config['training.n_episodes'], verbose=config['training.verbose'])

display_stats(scores, steps)

### visualisation 

In [12]:
env = gen_env(config['env'], reward_functions, verbose=True)
print(run_episode(env, agent, train_mode=False))

show_video(config['env.video_path'])

2021-02-05 13:46:35,990 INFO    gym_minigrid_navigation.environments   : From [4 4] to [2 3]


(-25.600000000000094, 256)


In [15]:
state[12:16, 12:16, 0]

array([[ 28,   0,   0,   0],
       [170, 198, 113,   0],
       [170, 198, 113,   0],
       [ 28,   0,   0,   0]], dtype=uint8)

In [8]:
res_state = env.reset()
state = env.reset()
states = agent._vstack([state] * 6)
states.shape

torch.Size([6, 32, 32, 3])

In [19]:
import numpy as np
goal_state = np.copy(state)

goal_state[12:16, 12:16] = np.swapaxes(state[12:16, 12:16],0,1)

reward_functions(res_state, state, goal_state)

1.7011259


-0.1

In [20]:
import numpy as np
goal_state = np.copy(state)

goal_state[8:12, 8:12, :] = state[12:16, 12:16]
goal_state[12:16, 12:16] = 0

reward_functions(res_state, state, goal_state)

1.6498365


-0.1

In [21]:
import numpy as np
goal_state = np.copy(state)

goal_state[16:20, 12:16] = np.swapaxes(state[12:16, 12:16],0,1)
goal_state[12:16, 12:16] = 0

reward_functions(res_state, state, goal_state)

1.4874552


1

In [17]:
goal_state[12:16, 12:16, 0]

array([[ 28, 170, 170,  28],
       [  0, 198, 198,   0],
       [  0, 113, 113,   0],
       [  0,   0,   0,   0]], dtype=uint8)

In [36]:
[:, :, 0]

array([[ 28, 170, 170,  28],
       [  0, 198, 198,   0],
       [  0, 113, 113,   0],
       [  0,   0,   0,   0]], dtype=uint8)

In [None]:
agent.qnetwork_target.master

In [9]:
model = agent.qnetwork_target.master
model.output_size

400

torch.Size([6, 32, 32, 3])

In [11]:
model(states).shape

torch.Size([6, 400])

-0.1

In [46]:
np.linalg.norm(state - goal_state) / 255

2.251745446267544

In [82]:
state / np.linalg.norm(state)

array([[[0.0261321, 0.0261321, 0.0261321],
        [0.0261321, 0.0261321, 0.0261321],
        [0.0261321, 0.0261321, 0.0261321],
        ...,
        [0.0261321, 0.0261321, 0.0261321],
        [0.0261321, 0.0261321, 0.0261321],
        [0.0261321, 0.0261321, 0.0261321]],

       [[0.0261321, 0.0261321, 0.0261321],
        [0.0261321, 0.0261321, 0.0261321],
        [0.0261321, 0.0261321, 0.0261321],
        ...,
        [0.0261321, 0.0261321, 0.0261321],
        [0.0261321, 0.0261321, 0.0261321],
        [0.0261321, 0.0261321, 0.0261321]],

       [[0.0261321, 0.0261321, 0.0261321],
        [0.0261321, 0.0261321, 0.0261321],
        [0.0261321, 0.0261321, 0.0261321],
        ...,
        [0.0261321, 0.0261321, 0.0261321],
        [0.0261321, 0.0261321, 0.0261321],
        [0.0261321, 0.0261321, 0.0261321]],

       ...,

       [[0.0261321, 0.0261321, 0.0261321],
        [0.0261321, 0.0261321, 0.0261321],
        [0.0261321, 0.0261321, 0.0261321],
        ...,
        [0.0261321, 0.0261

In [81]:
np.linalg.norm(state)

3826.7105978895243

In [None]:
state / 