In [1]:
# Widen the notebook's main container to 90% of the browser window.
# `IPython.display` is the public import location; importing `display` from
# `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

In [2]:
import matplotlib.pyplot as plt
from pprint import pprint

%matplotlib inline
# Plot defaults applied in one go, followed by the ggplot style sheet.
plt.rcParams.update({
    'figure.figsize': (10.0, 8.0),  # default size of plots
    'image.interpolation': 'nearest',
    'image.cmap': 'gray',
})

plt.style.use('ggplot')

%load_ext autoreload
%autoreload 2

In [3]:
import sys
sys.path.append('..')

from gym_minigrid_navigation.utils import show_video
from train_worker import gen_navigation_env, get_agent, run_episode, run_episodes

from rewards import get_reward_function
from utils import init_logger, switch_reproducibility_on, display_stats

# Initialise every project logger this notebook reads from, in a fixed order.
for _logger_name in (
    'dqn',
    'train_worker',
    'environments',
    'gym_minigrid_navigation.environments',
):
    init_logger(_logger_name)

### config 

In [4]:
from pyhocon import ConfigFactory

# Parse the HOCON experiment config, then set `video_path` in its `env` section
# (the visualisation cell further down passes this value to the visualisation wrapper).
config = ConfigFactory.parse_file('../conf/minigrid_first_step.hocon')
config['env']['video_path'] = './video/'

In [5]:
# Call the project helper from `utils` with the configured seed
# (presumably seeds the RNGs in use — confirm in utils.switch_reproducibility_on).
switch_reproducibility_on(config['seed'])

### environment 

In [6]:
# Override the `path` entry of the env's state_distance_network_params with a file under ../outputs/models.
config['env.state_distance_network_params']['path'] = '../outputs/models/state_distance_encoder.p'

In [7]:
# Build the navigation environment from the `env` section of the config.
env = gen_navigation_env(config['env'])

### agent 

In [8]:
from utils import init_logger

# Build the agent from the full config (get_agent is imported from train_worker).
agent = get_agent(config)

2021-03-26 14:40:57,145 INFO    dqn                    : Running on device: cuda:0


### training

In [None]:
# Run the training episodes. The episode count and verbosity come from the
# `training` section of the config; `max_steps` falls back to 100_000 when the
# config does not define it. The two returned values are kept as `scores` and `steps`.
scores, steps = run_episodes(
    env=env,
    worker_agent=agent,
    n_episodes=config['training.n_episodes'],
    verbose=config['training.verbose'],
    max_steps=config['training'].get('max_steps', 100_000),
)

In [None]:
# display_stats(scores, steps)

In [None]:
# Evaluate the agent on a freshly built env with goal_type set to 'random':
# 100 episodes with train_mode=False; the returned values are discarded.
# This changes config['env'] in place, so later cells that read it also see 'random'.
config['env']['goal_type'] = 'random'
env_ = gen_navigation_env(config['env'])
_, _ = run_episodes(
    env=env_,
    worker_agent=agent,
    train_mode=False,
    n_episodes=100,
    verbose=config['training.verbose']
)

### visualisation 

In [None]:
import environments

# Build a verbose env and wrap it with the project's visualisation wrapper,
# handing it the configured video path.
env_ = gen_navigation_env(config['env'], verbose = True)
env_ = environments.visualisation_wrapper(env_, config['env.video_path'])

# Run a single episode and print whatever run_episode returns.
# NOTE(review): train_mode=True here, whereas the evaluation cell uses train_mode=False — confirm this run is meant to train.
print(run_episode(env_, agent, train_mode=True))

In [None]:
# Display the recorded video via the helper from gym_minigrid_navigation.utils.
show_video()

In [10]:
# Number of entries currently held in env.buffer.
len(env.buffer)

10000

In [28]:
import numpy as np
from collections import Counter

# For each step count 0..9, tabulate how many buffered states sit on each grid
# cell. Positions are shifted by one so that coordinate 1 lands in row/column 0
# of the 6x6 table.
for complexity in range(10):
    print(complexity)
    arr = np.zeros((6, 6))

    cell_counts = Counter(
        (state['position'][0], state['position'][1])
        for n_steps, state in env.buffer
        if n_steps == complexity
    )
    for (px, py), count in cell_counts.items():
        arr[px - 1, py - 1] = count

    print(arr)

40
[[2. 3. 1. 0. 2. 3.]
 [3. 0. 2. 0. 0. 1.]
 [4. 3. 1. 0. 1. 2.]
 [0. 2. 0. 1. 0. 1.]
 [1. 1. 0. 0. 2. 0.]
 [1. 0. 1. 0. 1. 0.]]
50
[[1. 0. 2. 2. 0. 0.]
 [5. 1. 1. 0. 1. 3.]
 [1. 1. 0. 0. 0. 1.]
 [3. 1. 1. 1. 1. 2.]
 [1. 0. 1. 0. 1. 1.]
 [2. 2. 0. 2. 1. 0.]]
60
[[4. 1. 3. 1. 0. 0.]
 [1. 1. 0. 1. 1. 1.]
 [2. 2. 1. 1. 1. 0.]
 [0. 0. 0. 0. 1. 0.]
 [1. 2. 0. 2. 1. 2.]
 [2. 4. 1. 0. 0. 2.]]
70
[[5. 2. 1. 1. 0. 2.]
 [2. 0. 0. 2. 1. 0.]
 [0. 2. 0. 0. 0. 0.]
 [1. 1. 1. 1. 2. 0.]
 [1. 1. 1. 0. 2. 2.]
 [3. 2. 0. 0. 1. 2.]]
80
[[5. 0. 1. 2. 1. 2.]
 [2. 1. 2. 0. 0. 1.]
 [2. 0. 0. 0. 0. 0.]
 [0. 1. 1. 3. 0. 2.]
 [1. 0. 1. 0. 1. 0.]
 [3. 0. 2. 1. 1. 3.]]
90
[[1. 2. 0. 2. 1. 1.]
 [2. 2. 1. 1. 2. 2.]
 [2. 1. 1. 1. 0. 0.]
 [1. 2. 0. 1. 1. 0.]
 [1. 1. 0. 0. 0. 1.]
 [1. 0. 0. 2. 4. 2.]]
100
[[3. 1. 1. 2. 2. 1.]
 [4. 2. 0. 0. 2. 1.]
 [1. 0. 0. 1. 1. 1.]
 [1. 1. 1. 0. 0. 1.]
 [0. 1. 0. 0. 0. 0.]
 [2. 2. 3. 0. 1. 3.]]
110
[[6. 0. 3. 0. 0. 0.]
 [3. 1. 1. 0. 0. 0.]
 [5. 0. 0. 1. 2. 2.]
 [0. 0. 0. 1. 0. 1.]
 

In [10]:
import numpy as np
from collections import Counter

# Empirical distribution of env.buffer_random_choice() over grid cells, for
# complexity = 1, 6, ..., 96 at a fixed scale of 5 (1000 draws per setting).
# NOTE: env.complexity and env.scale keep the last values assigned here once
# the loop has finished.
for c in range(1, 100, 5):
    s = 5
    env.complexity, env.scale = c, s

    m = 1000

    print(c, s)
    choices = [env.buffer_random_choice() for _ in range(m)]
    arr = np.zeros((6, 6))

    visit_counts = Counter(
        (state['position'][0], state['position'][1]) for state in choices
    )
    for (px, py), hits in visit_counts.items():
        # Store the relative frequency; coordinate 1 maps to row/column 0.
        arr[px - 1, py - 1] = hits / m

    print(arr)

1 5
[[0.513 0.076 0.038 0.009 0.002 0.002]
 [0.142 0.027 0.02  0.003 0.001 0.   ]
 [0.081 0.03  0.013 0.005 0.    0.   ]
 [0.023 0.002 0.004 0.003 0.    0.   ]
 [0.    0.    0.006 0.    0.    0.   ]
 [0.    0.    0.    0.    0.    0.   ]]
6 5
[[0.416 0.114 0.04  0.017 0.001 0.006]
 [0.135 0.034 0.027 0.005 0.002 0.   ]
 [0.075 0.049 0.031 0.008 0.    0.   ]
 [0.013 0.001 0.008 0.003 0.    0.   ]
 [0.002 0.002 0.007 0.002 0.002 0.   ]
 [0.    0.    0.    0.    0.    0.   ]]
11 5
[[0.319 0.158 0.036 0.028 0.008 0.008]
 [0.123 0.036 0.044 0.01  0.011 0.002]
 [0.061 0.034 0.041 0.016 0.    0.   ]
 [0.005 0.003 0.004 0.01  0.001 0.   ]
 [0.004 0.01  0.018 0.008 0.001 0.   ]
 [0.    0.    0.    0.    0.    0.001]]
16 5
[[0.284 0.162 0.024 0.024 0.026 0.008]
 [0.111 0.028 0.033 0.027 0.018 0.012]
 [0.039 0.027 0.055 0.022 0.    0.   ]
 [0.008 0.001 0.007 0.015 0.003 0.001]
 [0.007 0.015 0.013 0.015 0.015 0.   ]
 [0.    0.    0.    0.    0.    0.   ]]
21 5
[[0.253 0.139 0.038 0.02  0.046 0.021

In [17]:
# Scratch check: converting a small numpy array to a plain Python list.
np.array([1, 1]).tolist()

[1, 1]

In [12]:
import numpy as np
from scipy.stats import norm

# Manual weighted draw from the buffer
# (presumably mirrors what env.buffer_random_choice does — TODO confirm).
# First element of every buffer entry, collected into one array.
steps_array = np.array([x for x, _ in env.buffer])
# Weight each entry by a normal pdf centred on env.complexity with spread env.scale.
p = norm.pdf(steps_array, loc=env.complexity, scale=env.scale)
p /= p.sum()  # normalise so the weights sum to 1, as np.random.choice requires
choice = np.random.choice(np.arange(len(steps_array)), p=p)
i, s = env.buffer[choice]

# Show the sampled entry: its first element, plus the state's position and direction.
i, s['position'], s['direction']

(10, array([4, 1]), 0)

In [16]:
# First element of every buffer entry, as an array.
np.array([x for x, _ in env.buffer])

array([12, 13, 14, ..., 38, 39, 40])

In [11]:
# Last 40 buffer entries as (first element, position, direction) triples.
[(x, s['position'], s['direction']) for x, s in env.buffer][-40:]

[(233, array([6, 1]), 0),
 (234, array([6, 1]), 0),
 (235, array([6, 1]), 0),
 (236, array([6, 1]), 3),
 (237, array([6, 1]), 0),
 (238, array([6, 1]), 0),
 (239, array([6, 1]), 3),
 (240, array([6, 1]), 0),
 (241, array([6, 1]), 0),
 (242, array([6, 1]), 0),
 (243, array([6, 1]), 3),
 (244, array([6, 1]), 3),
 (245, array([6, 1]), 2),
 (246, array([6, 1]), 3),
 (247, array([6, 1]), 2),
 (248, array([6, 1]), 3),
 (249, array([6, 1]), 3),
 (250, array([6, 1]), 0),
 (251, array([6, 1]), 1),
 (252, array([6, 2]), 1),
 (253, array([6, 2]), 2),
 (254, array([5, 2]), 2),
 (255, array([5, 2]), 3),
 (256, array([5, 2]), 2),
 (1, (1, 1), 1),
 (2, (1, 1), 2),
 (3, (1, 1), 2),
 (4, (1, 1), 1),
 (5, (1, 1), 0),
 (6, array([2, 1]), 0),
 (7, array([2, 1]), 1),
 (8, array([2, 2]), 1),
 (9, array([2, 2]), 1),
 (10, array([2, 3]), 1),
 (11, array([2, 3]), 2),
 (12, array([2, 3]), 1),
 (13, array([2, 3]), 1),
 (14, array([2, 4]), 1),
 (15, array([2, 5]), 1),
 (16, array([2, 5]), 1)]

1

In [12]:
# Current value of env.complexity.
env.complexity

9

In [21]:
# Number of entries currently held in env.buffer.
len(env.buffer)

2276

In [15]:
# Pin env.complexity to 9, then draw 20 states with env.buffer_random_choice()
# and list each one's position and direction.
env.complexity = 9

[(state['position'], state['direction']) for state in [env.buffer_random_choice() for _ in range(20)]]

[(array([2, 1]), 3),
 (array([2, 1]), 0),
 (array([2, 1]), 3),
 (array([5, 1]), 1),
 (array([3, 3]), 2),
 ((1, 1), 0),
 ((1, 1), 1),
 (array([2, 3]), 1),
 (array([2, 1]), 1),
 (array([2, 4]), 1),
 (array([2, 2]), 1),
 ((1, 1), 0),
 (array([6, 1]), 0),
 ((1, 1), 3),
 (array([1, 1]), 3),
 (array([2, 1]), 3),
 (array([2, 1]), 3),
 (array([2, 2]), 3),
 ((1, 1), 2),
 (array([1, 3]), 1)]

In [None]:
# Reset the env and keep the 'position' entry of the returned state.
a = env.reset()['position']

In [None]:
import numpy as np
from gym_minigrid_navigation.environments import gen_wrapped_env

def random_grid_goal_generator(conf, verbose=False):
    """Yield an endless stream of goal states taken at random grid positions.

    A wrapped environment is built once from ``conf``. On every iteration a
    goal position different from the start cell ``(1, 1)`` is drawn, the agent
    is placed on it, and the observation generated from there is yielded.

    Args:
        conf: environment configuration, passed straight to ``gen_wrapped_env``.
        verbose: when true, log every sampled goal position at INFO level.

    Yields:
        The value returned by ``env.observation`` for the observation generated
        with the agent standing on the sampled goal position.
    """
    import logging

    # No module-level `logger` exists in this notebook, so resolve one here;
    # otherwise verbose=True fails with NameError. 'environments' is one of the
    # names this notebook passes to init_logger(...).
    logger = logging.getLogger('environments')

    env = gen_wrapped_env(conf)
    grid_size = env.unwrapped.grid.encode().shape[0]
    init_pos = np.array([1, 1])

    while True:
        goal_pos = None
        # Rejection sampling: redraw until the goal differs from the start cell.
        while goal_pos is None or (init_pos == goal_pos).all():
            # NOTE(review): randint's upper bound is exclusive, so coordinate
            # grid_size - 2 is never drawn — confirm this is intended.
            goal_pos = np.random.randint(1, grid_size - 2, 2)
        if verbose:
            logger.info(f"Random goal: {goal_pos}")

        env.unwrapped.agent_pos = goal_pos
        goal_state = env.observation(env.unwrapped.gen_obs())
        yield goal_state
        
# Build a generator from the env config and show its first goal state.
next(random_grid_goal_generator(config['env']))

In [None]:
# Set the env's goal_achieving_criterion to 'position' (changes config in place).
config['env']['goal_achieving_criterion'] = 'position'

In [None]:
# Rebind `env` to a wrapped env built by gen_wrapped_env; this replaces the navigation env created earlier.
env = gen_wrapped_env(config['env'])

In [None]:
# Check which type the 'position' entry has right after a reset.
type(env.reset()['position'])

In [None]:
import gym

conf = config['env']

# Only these two task families are handled; anything else is rejected up front.
if conf['env_task'] not in ('MiniGrid-Empty', 'MiniGrid-Dynamic-Obstacles'):
    raise AttributeError(f"unknown env_task '{conf['env_task']}'")
env_name = f"{conf['env_task']}-{conf['grid_size']}x{conf['grid_size']}-v0"

env = gym.make(env_name)

env.reset()
print(env.unwrapped.agent_pos, env.unwrapped.agent_dir)

# Apply actions 2, 0 and 1 in turn, printing the agent's position and direction
# after each step and keeping every returned observation.
observations = []
for action in (2, 0, 1):
    observation, _, _, _ = env.step(action)
    print(env.unwrapped.agent_pos, env.unwrapped.agent_dir)
    observations.append(observation)
state1, state2, state3 = observations

In [None]:
# Observation returned by the first env.step call of the stepping cell.
state1

In [None]:
# Show the repr of the current `env` object.
env

In [None]:
# Initialise the 'environments' logger again (the setup cell already calls this once).
init_logger('environments')

In [None]:
# Scratch check: comparing a tuple with a numpy array is elementwise, so .all() is available on the result.
((1, 1) == np.array((1, 1))).all()

In [None]:
# Show the `threshold` attribute of the current `env`.
env.threshold

In [None]:
# NOTE(review): gen_env, goal_achieving_criterion and reward_functions are not
# defined or imported in any visible cell of this notebook — this cell fails on
# a fresh kernel unless they come from elsewhere.
env = gen_env(config['env'], goal_achieving_criterion, reward_functions)

In [None]:
# Replace `config` with a different HOCON file; the in-place overrides made to the previous config object are not carried over.
config = ConfigFactory.parse_file('../conf/minigrid_dqn_navigation_mlp.hocon')

In [None]:
from gym_minigrid_navigation import environments as minigrid_envs

In [None]:
from gym_minigrid_navigation.environments import gen_wrapped_env

In [None]:
import numpy as np

def random_grid_goal_generator(conf, verbose=False):
    """Endlessly yield observations taken at randomly drawn goal positions.

    One wrapped environment is created from ``conf``. Each iteration draws a
    position that differs from the start cell ``(1, 1)``, puts the agent there
    and yields ``env.observation`` of the observation generated at that spot.

    With ``verbose`` set, a line is printed only when the drawn goal equals
    ``[1, 1]``. The redraw loop already rejects that position, so any printed
    line would mean the rejection is not working.
    """
    wrapped_env = gen_wrapped_env(conf)
    side = wrapped_env.unwrapped.grid.encode().shape[0]
    start = np.array([1, 1])

    while True:
        # Draw once, then keep redrawing for as long as the goal is the start cell.
        goal_pos = np.random.randint(1, side - 2, 2)
        while (start == goal_pos).all():
            goal_pos = np.random.randint(1, side - 2, 2)

        hit_start = (goal_pos == np.array([1, 1])).all() if verbose else False
        if hit_start:
            print(f"Goal: {goal_pos}")

        wrapped_env.unwrapped.agent_pos = goal_pos
        yield wrapped_env.observation(wrapped_env.unwrapped.gen_obs())

In [None]:
# Create a goal generator with verbose=True and keep it in `a`.
a = random_grid_goal_generator(config['env'], True)

In [None]:
# Draw 1000 goals; with verbose=True the generator prints only when a goal equals [1, 1],
# so no output here means the start cell was never picked.
for x in range(1000):
    next(a)

In [None]:
# Scratch check of np.random.randint: two integers drawn from [1, 5), upper bound exclusive.
np.random.randint(1, 5, 2)

In [None]:
# Rebind `env` to a wrapped env built from the current config.
env = gen_wrapped_env(config['env'])

In [None]:
# Take the next goal state from the generator and list its keys.
s = next(a)
s.keys()

In [None]:
# 'position' entry of the goal state fetched from the generator.
s['position']