In [1]:
# Widen the notebook's `.container` element to 90% of the page width.
# `display` is imported from the public `IPython.display` module: importing it
# from `IPython.core.display` is deprecated and emits a DeprecationWarning.
from IPython.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

In [2]:
import matplotlib.pyplot as plt
from pprint import pprint

%matplotlib inline
# Notebook-wide plotting defaults, applied in one batch.
plt.rcParams.update({
    'figure.figsize': (10.0, 8.0),        # default size of plots
    'image.interpolation': 'nearest',
    'image.cmap': 'gray',
})

# The ggplot style sheet is applied after the defaults above, as before.
plt.style.use('ggplot')

%load_ext autoreload
%autoreload 2

In [3]:
import sys
sys.path.append('..')

from gym_minigrid_navigation.utils import show_video
from navigation_policy import gen_env, get_agent, run_episode, run_episodes

from rewards import get_reward_function
from utils import init_logger, switch_reproducibility_on, display_stats


import logging
# Logger used by the cells of this notebook (they call `logger.info(...)`).
logger = logging.getLogger(__name__)

# Set up the '__main__' logger through the project helper `init_logger`.
init_logger('__main__')

# Set up the loggers of the project modules used below; the captured output
# of later cells shows INFO lines from 'dqn' and 'navigation_policy'.
init_logger('dqn')
init_logger('expected_steps')
init_logger('navigation_policy')
init_logger('gym_minigrid_navigation.environments')

### config 

In [4]:
from pyhocon import ConfigFactory

# Parse the experiment configuration from the HOCON file.
config = ConfigFactory.parse_file('../conf/minigrid_dqn_draft.hocon')
# Set the video directory for this notebook; it is read back later through
# config['env.video_path'] when calling show_video.
config['env']['video_path'] = '../outputs/video/'

# Pass the configured seed to the project's reproducibility helper.
switch_reproducibility_on(config['seed'])

### agent and steps amount model

In [5]:
agent = get_agent(config)

2021-03-17 14:34:38,718 INFO    dqn                    : Running on device: cuda:0


In [6]:
config['env']

ConfigTree([('env_type', 'gym_minigrid'),
            ('env_task', 'MiniGrid-Empty'),
            ('grid_size', 8),
            ('action_size', 3),
            ('rgb_image', True),
            ('tile_size', 4),
            ('goal_type', 'from_buffer'),
            ('buffer_size', 1000),
            ('task_complexity', 10),
            ('video_path', '../outputs/video/')])

In [7]:
# Build the reward function from the config, then create the environment
# from the 'env' section of the config with that reward function.
reward_function = get_reward_function(config)
env = gen_env(config['env'], reward_function=reward_function)

In [8]:
# Run the agent in `env` for `training.n_episodes` episodes and keep the
# returned `scores` and `steps`. `max_steps` falls back to 100_000 when the
# 'training' section of the config has no 'max_steps' entry.
scores, steps = run_episodes(
    env=env,
    agent=agent,
    n_episodes=config['training.n_episodes'],
    verbose=config['training.verbose'],
    max_steps=config['training'].get('max_steps', 100_000)
)

2021-03-17 14:36:06,006 INFO    navigation_policy      : Episode: 100. scores: -8.04, steps: 89.75
2021-03-17 14:37:13,016 INFO    navigation_policy      : Episode: 200. scores: -5.10, steps: 60.83
2021-03-17 14:37:52,743 INFO    navigation_policy      : Episode: 300. scores: -2.99, steps: 39.71
2021-03-17 14:38:15,043 INFO    navigation_policy      : Episode: 400. scores: -1.06, steps: 21.20
2021-03-17 14:38:49,053 INFO    navigation_policy      : Episode: 500. scores: -1.47, steps: 24.88
2021-03-17 14:39:15,510 INFO    navigation_policy      : Episode: 600. scores: -0.99, steps: 20.19
2021-03-17 14:39:26,138 INFO    navigation_policy      : Episode: 700. scores: -0.07, steps: 11.38
2021-03-17 14:39:37,411 INFO    navigation_policy      : Episode: 800. scores: -0.32, steps: 13.87
2021-03-17 14:39:47,495 INFO    navigation_policy      : Episode: 900. scores: -0.10, steps: 11.77
2021-03-17 14:39:55,486 INFO    navigation_policy      : Episode: 1000. scores: 0.09, steps: 9.86
2021-03-17 

## check

In [9]:
# Change the goal type from 'from_buffer' (see the config dump above) to 'random'.
# This mutates the shared `config` object, so every later cell that reads
# config['env'] sees 'random' until the config is re-parsed.
config['env']['goal_type'] = 'random'
# Recreate the environment from the updated env config.
env = gen_env(config['env'], reward_function=reward_function)

In [10]:
# Run 100 episodes on the random-goal environment.
# NOTE(review): `train_mode=True` under a "check" heading suggests the agent
# may keep learning during what reads as an evaluation — confirm this is
# intended. `max_steps` is not passed here, unlike the training call above.
scores, steps = run_episodes(
    env=env,
    agent=agent,
    train_mode=True,
    n_episodes=100,
    verbose=config['training.verbose']
)

2021-03-17 14:46:06,251 INFO    navigation_policy      : Episode: 100. scores: -7.71, steps: 85.23


In [31]:
# env = gen_env(config['env'], reward_function, verbose=True)
# print(run_episode(env, agent, train_mode=False))

show_video(config['env.video_path'])

In [32]:
import luigi

ModuleNotFoundError: No module named 'luigi'

In [24]:
import numpy as np

In [30]:
# (low, high) bounds for the random draw below.
a = (10, 20)

# One integer drawn from [a[0], a[1]) — low inclusive, high exclusive.
np.random.randint(a[0], a[1])

16

In [124]:
type(8.) == int

False

In [65]:
import sys
sys.path.append('..')

In [66]:
from gym_minigrid_navigation.encoders import *

In [120]:
config = ConfigFactory.parse_file('../conf/minigrid_dqn_draft.hocon')

In [121]:
CNNAllLayers(16, config['worker']).output_size

400

In [119]:
SimpleCNN(16, config['worker']).output_size

144

In [101]:
env = gen_env(config['env'], reward_function=lambda *args: 0)

In [102]:
a = torch.from_numpy(env.reset()['image']).unsqueeze(0).float()

In [103]:
a.shape

torch.Size([1, 64, 64, 3])

In [104]:
net(a).shape

torch.Size([1, 1792])

### Steps amount model training

In [6]:
# NOTE(review): the class is spelled `ExpectedStepsAmountLeaner` here, while a
# later cell imports `ExpectedStepsAmountLearner` from `expected_steps`, and no
# import of this spelling is visible in the notebook — confirm which name the
# module actually exports.
expected_steps_learner = ExpectedStepsAmountLeaner(config['expected_steps_params'])

In [7]:
config['env']['goal_type'] = 'random'

In [8]:
env = gen_env(config['env'], reward_function=lambda *args: 0)

In [None]:
# Set the learner's 'update_step' to a very large value before the warm-up;
# presumably this keeps its periodic update from triggering — TODO confirm.
expected_steps_learner.config['update_step'] = 10_000_000

# Warm-up run: the steps learner is attached and the agent is run with
# `agent_train_mode=False`. Episode count and `max_steps` come from the
# 'expected_steps_params' section of the config.
scores, steps = run_episodes(
    env=env,
    agent=agent,
    steps_learner=expected_steps_learner,
    n_episodes=config['expected_steps_params.warm_up'],
    agent_train_mode=False,
    verbose=config['training.verbose'],
    max_steps=config['expected_steps_params.warm_up_max_steps']
)

2021-02-18 12:04:33,962 INFO    navigation_policy      : Episode: 100. scores: 0.00, steps: 34.51, L1loss: 7.68
2021-02-18 12:04:50,874 INFO    navigation_policy      : Episode: 200. scores: 0.00, steps: 37.01, L1loss: 7.15


In [None]:
agent.eps

### agent training 

In [None]:
# Use the learner's model as the reward function and rebuild the environment.
# NOTE(review): no import of `ExpectedStepsAmountReward` is visible in this
# notebook; later cells import `ExpectedStepsAmount` from `rewards` instead —
# confirm the intended class name.
reward_function = ExpectedStepsAmountReward(expected_steps_learner.model)
env = gen_env(config['env'], reward_function=reward_function)

In [None]:
# Run `training.n_episodes` episodes with the steps learner attached,
# using the environment built around the learned reward.
scores, steps = run_episodes(
    env=env,
    agent=agent,
    steps_learner=expected_steps_learner,
    n_episodes=config['training.n_episodes'],
    verbose=config['training.verbose']
)

In [None]:
display_stats(scores, steps)

In [None]:
# Rebuild the environment with verbose=True, run a single episode with
# train_mode=True and print whatever `run_episode` returns.
env = gen_env(config['env'], reward_function, verbose=True)
print(run_episode(env, agent, train_mode=True))

# Show the video from the configured video path.
show_video(config['env.video_path'])

In [None]:
# Scratch check: dividing None by a number raises TypeError, so this cell
# stops a "Run All" at this point.
None / 3

In [None]:
import numpy as np
np.nan / 3

In [None]:
# NaN propagates through addition: 0 + NaN is NaN.
a = 0 + np.nan
# NaN never compares equal to anything (itself included), so this shows False.
a == np.nan

In [None]:
from pyhocon import ConfigFactory

# Re-parse the configuration, this time from the navigation draft file.
# This replaces the `config` object used by the earlier cells.
config = ConfigFactory.parse_file('../conf/minigrid_dqn_navigation_draft.hocon')
config['env']['video_path'] = '../outputs/video/'

# Pass the configured seed to the project's reproducibility helper.
switch_reproducibility_on(config['seed'])

### Training

In [None]:
# The reward is selected explicitly as 'explicit_pos_reward' instead of being
# read from the config (the config-driven call is kept commented out).
# NOTE(review): a plain dict is passed here where other cells pass the pyhocon
# config — verify get_reward_function accepts both.
# reward_function = get_reward_function(config)
reward_function = get_reward_function({'training.reward': 'explicit_pos_reward'})

# Fresh agent and environment, then a full run with the configured episode count.
agent = get_agent(config)
env = gen_env(config['env'], reward_function)
scores, steps = run_episodes(env, agent, n_episodes=config['training.n_episodes'], verbose=config['training.verbose'])

# Summarise the collected scores and steps with the project helper.
display_stats(scores, steps)

### visualisation 

In [None]:
# env = gen_env(config['env'], reward_function, verbose=True)
# print(run_episode(env, agent, train_mode=False))

# show_video(config['env.video_path'])

In [None]:
import random
import torch
import numpy as np

from fast_tensor_data_loader import FastTensorDataLoader
from expected_steps import ExpectedStepsAmountModel
from rewards import ExpectedStepsAmount

In [None]:
grid_size = config['env.grid_size'] * config['env'].get('tile_size', 1)

In [None]:
# Create the expected-steps object from the grid size and the
# 'training.reward_params' config section, and a fresh agent.
# Later cells use `.model`, `.buffer`, `.reset_buffer()`, `.collect_episodes()`
# and `.learn()` on `expected_steps_learner`.
expected_steps_learner = ExpectedStepsAmountModel(grid_size, config['training.reward_params'])
agent = get_agent(config)

In [None]:
# Environment with a constant zero reward (the lambda ignores its arguments).
env = gen_env(config['env'], reward_function=lambda *args: 0)
# Override `max_steps` on the object reached through `env.env.env`.
env.env.env.max_steps = 40  # TODO: move to config
# Fill the bias of the last entry of the model's `fc` with 20.
expected_steps_learner.model.fc[-1].bias.data.fill_(20.)

# Reset the learner's buffer, collect episodes with the current agent, then fit.
expected_steps_learner.reset_buffer()
expected_steps_learner.collect_episodes(env, agent)
expected_steps_learner.learn(verbose=True)

# Log the 0.0, 0.1, ..., 0.9 quantiles of the third element of every buffer entry.
arr = np.array([x for _, _, x in expected_steps_learner.buffer])
logger.info(f'{np.quantile(arr, np.arange(0, 1, 0.1))}')    

In [None]:
# agent = get_agent(config)

In [None]:
# Wrap the learner's model as the reward function and rebuild the environment.
expected_reward_function = ExpectedStepsAmount(expected_steps_learner.model)
env = gen_env(config['env'], reward_function=expected_reward_function)
# env.env.env.max_steps = 50  # TOBD: config

# The agent's buffer is NOT reset here (call kept commented out); the episode
# count and verbosity are hard-coded rather than read from the config.
# agent.reset_buffer()
scores, steps = run_episodes(env, agent, n_episodes=3000, verbose=300)

In [None]:
# Rebuild the environment with verbose=True around the learned reward,
# run one episode with train_mode=True and print what `run_episode` returns.
env = gen_env(config['env'], expected_reward_function, verbose=True)
print(run_episode(env, agent, train_mode=True))

# Show the video from the configured video path.
show_video(config['env.video_path'])

In [None]:
# Second round of fitting; this cell repeats the earlier fit cell verbatim,
# now collecting episodes with the agent trained in between.
# Environment with a constant zero reward (the lambda ignores its arguments).
env = gen_env(config['env'], reward_function=lambda *args: 0)
env.env.env.max_steps = 40  # TODO: move to config
# Fill the bias of the last entry of the model's `fc` with 20 again.
expected_steps_learner.model.fc[-1].bias.data.fill_(20.)

# Reset the learner's buffer, collect episodes with the current agent, then fit.
expected_steps_learner.reset_buffer()
expected_steps_learner.collect_episodes(env, agent)
expected_steps_learner.learn(verbose=True)

# Log the 0.0, 0.1, ..., 0.9 quantiles of the third element of every buffer entry.
arr = np.array([x for _, _, x in expected_steps_learner.buffer])
logger.info(f'{np.quantile(arr, np.arange(0, 1, 0.1))}')  

In [None]:
# Rebuild the reward function and the environment from the refitted model.
expected_reward_function = ExpectedStepsAmount(expected_steps_learner.model)
env = gen_env(config['env'], reward_function=expected_reward_function)
# env.env.env.max_steps = 50  # TOBD: config

# Unlike the first training round, the agent's buffer is reset before running.
agent.reset_buffer()
scores, steps = run_episodes(env, agent, n_episodes=3000, verbose=300)

In [None]:
agent.eps

In [None]:
config = ConfigFactory.parse_file('../conf/minigrid_dqn_draft.hocon')
config['env']

In [None]:
# NOTE(review): `stop` is not defined in any visible cell, so this raises
# NameError; presumably a deliberate barrier that halts "Run All" here — confirm.
stop

In [None]:
import torch

def metric(model, buffer, device=None, y_max=20):
    """Mean absolute error (L1) of `model` over all samples in `buffer`.

    Parameters
    ----------
    model : torch.nn.Module
        Called as ``model(states, goal_states)`` on batches of up to 1000 samples.
    buffer : iterable of (state, goal_state, y) triples
        Each element is stacked along a new leading axis and converted to a
        float32 tensor. Must not be empty.
    device : torch.device, optional
        Device the batches are moved to before the forward pass. When omitted,
        the device of the model's first parameter is used (CPU if the model
        has no parameters), so the function also works on CPU-only machines.
    y_max : float or None, optional
        Upper clamp applied to the targets (the original hard-coded "hot fix"
        value of 20 is the default). Pass None to disable clamping.

    Returns
    -------
    float
        Sample-weighted mean of the per-batch L1 losses, i.e. every sample
        contributes equally even when the last batch is smaller than the rest.

    Raises
    ------
    ValueError
        If `buffer` is empty.

    Notes
    -----
    The model is switched to eval mode for the measurement and put back into
    whatever mode it was in before the call, so calling this between training
    steps does not silently leave the model in eval mode.
    """
    def _vstack(arr):
        # Stack the per-sample arrays along a new leading axis -> float32 tensor.
        arr = np.vstack([np.expand_dims(x, axis=0) for x in arr])
        return torch.from_numpy(arr).float()

    samples = list(buffer)
    if not samples:
        raise ValueError("metric() requires a non-empty buffer")

    if device is None:
        first_param = next(iter(model.parameters()), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    states, goal_states, y = map(_vstack, zip(*samples))
    y = y.squeeze()
    if y.dim() == 0:
        # A single-sample buffer squeezes down to a scalar; keep the batch axis.
        y = y.unsqueeze(0)
    if y_max is not None:
        y = torch.clamp(y, max=y_max)  # !!! hot fix

    loss_fn = torch.nn.L1Loss()
    loader = FastTensorDataLoader(states, goal_states, y, batch_size=1000)

    was_training = model.training
    model.eval()
    abs_error_sum = 0.0
    n_samples = 0
    try:
        with torch.no_grad():
            for batch_state, batch_goal_state, batch_y in loader:
                output = model(batch_state.to(device), batch_goal_state.to(device))
                batch_loss = loss_fn(output, batch_y.to(device))
                # L1Loss averages within the batch; weight it by the batch size
                # so a short final batch does not count as much as a full one.
                batch_size = batch_y.shape[0]
                abs_error_sum += batch_loss.item() * batch_size
                n_samples += batch_size
    finally:
        # Restore the mode the caller had (train or eval).
        model.train(was_training)
    return abs_error_sum / n_samples

In [None]:
import random
import torch
import numpy as np

from navigation_policy import *
from expected_steps import ExpectedStepsAmountLearner
from rewards import ExpectedStepsAmount
from fast_tensor_data_loader import FastTensorDataLoader

In [None]:
# Re-parse the navigation draft config and set the video directory.
config = ConfigFactory.parse_file('../conf/minigrid_dqn_navigation_draft.hocon')
config['env']['video_path'] = '../outputs/video/'

# Grid size multiplied by the tile size (tile size defaults to 1 when the
# 'env' section has no 'tile_size' entry).
grid_size = config['env.grid_size'] * config['env'].get('tile_size', 1)

In [None]:
# Fresh learner built from the grid size and the reward parameters.
expected_steps_learner = ExpectedStepsAmountLearner(grid_size, config['training.reward_params'])

# Zero-reward environment with `max_steps` overridden on `env.env.env`.
env = gen_env(config['env'], reward_function=lambda *args: 0)
env.env.env.max_steps = 20  # TODO: move to config
# expected_steps_learner.model.fc[-1].bias.data.fill_(10.)
    
# Collect episodes with `agent` (which must already exist from an earlier
# cell) and keep the resulting buffer as the validation set.
# NOTE(review): `valid_buffer` is bound to the learner's own buffer object, not
# a copy. If reset_buffer()/collect_episodes() mutate that object in place
# rather than rebinding it, the validation set changes with it — confirm.
expected_steps_learner.collect_episodes(env, agent)
valid_buffer = expected_steps_learner.buffer

print(len(valid_buffer))

In [None]:
# 20 rounds of: reset the buffer, collect new episodes, fit, then report the
# L1 metric on the held-out buffer and on the freshly collected one.
for i in range(20):
    expected_steps_learner.reset_buffer()
    expected_steps_learner.collect_episodes(env, agent)

    expected_steps_learner.learn()

    # `metric` is the helper defined in an earlier cell of this notebook.
    metric_valid = metric(expected_steps_learner.model, valid_buffer)
    metric_train = metric(expected_steps_learner.model, expected_steps_learner.buffer)
    # Mean of the third element of the buffer entries.
    mean_steps = sum([x for _, _, x in expected_steps_learner.buffer]) / len(expected_steps_learner.buffer)
    print(f"epochs {i}: metric_train = {metric_train :.2f}, metric_valid = {metric_valid :.2f}, mean_steps: {mean_steps}")

In [None]:
# Override the episode count in the shared config (in-place mutation).
config['training']['n_episodes'] = 3000

# Environment rewarded by the fitted expected-steps model.
expected_reward_function = ExpectedStepsAmount(expected_steps_learner.model)
env = gen_env(config['env'], reward_function=expected_reward_function)

# Train a separate, freshly created agent so `agent` stays untouched.
new_agent = get_agent(config)
scores, steps = run_episodes(env, new_agent, n_episodes=config['training.n_episodes'], verbose=config['training.verbose'])

display_stats(scores, steps)

In [None]:
#

In [None]:
# Run one episode with train_mode=False for `new_agent` and print the result.
# NOTE(review): the environment is built with `reward_function`, whereas
# `new_agent` was trained against `expected_reward_function` in the previous
# cell — confirm that using the older reward here is intended.
env = gen_env(config['env'], reward_function, verbose=True)
print(run_episode(env, new_agent, train_mode=False))

show_video(config['env.video_path'])

In [None]:
# Log the 0.0, 0.1, ..., 0.9 quantiles of the last 300 entries of `steps`.
arr = np.array(steps[-300:])
logger.info(f'{np.quantile(arr, np.arange(0, 1, 0.1))}')

In [None]:
config['training.reward_params']['buffer_size'] = 5_000

In [None]:
expected_steps_learner = ExpectedStepsAmountLearner(grid_size, config['training.reward_params'])
agent = get_agent(config)

In [None]:
# Zero-reward environment with `max_steps` overridden to 4 on `env.env.env`.
env = gen_env(config['env'], reward_function=lambda *args: 0, verbose=False)
env.env.env.max_steps = 4  # TODO: move to config
# Fill the bias of the last entry of the model's `fc` with 2.
expected_steps_learner.model.fc[-1].bias.data.fill_(2.)

# Reset the learner's buffer, collect episodes with the current agent, then fit.
expected_steps_learner.reset_buffer()
expected_steps_learner.collect_episodes(env, agent)
expected_steps_learner.learn(verbose=True)

# Log the 0.0, 0.1, ..., 0.9 quantiles of the third element of every buffer entry.
arr = np.array([x for _, _, x in expected_steps_learner.buffer])
logger.info(f'{np.quantile(arr, np.arange(0, 1, 0.1))}')

# Reward built from the fitted model; the meaning of the second positional
# argument (2.5) is not visible from this notebook — TODO document it.
expected_reward_function = ExpectedStepsAmount(expected_steps_learner.model, 2.5)

In [None]:
# Reset a zero-reward environment and grab the initial state and the goal
# state stored on the object at `env.env.env`.
env = gen_env(config['env'], reward_function=lambda *args: 0, verbose=True)
state = env.reset()
goal_state = env.env.env.goal_state

import numpy as np

# NOTE(review): `start_pos` and `ens_pos` are not read by any visible cell;
# `ens_pos` looks like a typo for `end_pos` — confirm before renaming.
start_pos = (5, 3)
ens_pos = (2, 3)
# Pristine copy of the initial state, used as the starting point of the
# manual step-by-step cells that follow.
state_ = np.copy(state)

In [None]:
# Start the manual walk: `state` and `next_state` are both fresh copies of
# the saved initial state, so this evaluates the reward for "no movement".
# `to_coords` is defined in a LATER cell of this notebook; that cell must
# have been run before this one.
next_state = np.copy(state_)
state = np.copy(state_)
expected_reward_function(state, next_state, goal_state), to_coords(state), to_coords(next_state), to_coords(goal_state)

In [None]:
# Manual step: the previous `next_state` becomes `state`; then the content of
# cell (5, 2) is copied to (4, 2) and (5, 2) is overwritten with [1, 0, 0].
state = np.copy(next_state)
next_state[4, 2] = next_state[5, 2]
next_state[5, 2] = [1, 0, 0]

# Reward for this transition, plus the coordinates reported by `to_coords`.
expected_reward_function(state, next_state, goal_state), to_coords(state), to_coords(next_state), to_coords(goal_state)

In [None]:
# Manual step: the previous `next_state` becomes `state`; then the content of
# cell (4, 2) is copied to (3, 2) and (4, 2) is overwritten with [1, 0, 0].
state = np.copy(next_state)
next_state[3, 2] = next_state[4, 2]
next_state[4, 2] = [1, 0, 0]

# Reward for this transition, plus the coordinates reported by `to_coords`.
expected_reward_function(state, next_state, goal_state), to_coords(state), to_coords(next_state), to_coords(goal_state)

In [None]:
# Manual step: the previous `next_state` becomes `state`; then the content of
# cell (3, 2) is copied to (3, 3) and (3, 2) is overwritten with [1, 0, 0].
state = np.copy(next_state)
next_state[3, 3] = next_state[3, 2]
next_state[3, 2] = [1, 0, 0]

# Reward for this transition, plus the coordinates reported by `to_coords`.
expected_reward_function(state, next_state, goal_state), to_coords(state), to_coords(next_state), to_coords(goal_state)

In [None]:
# Manual step: the previous `next_state` becomes `state`; then the content of
# cell (3, 3) is copied to (3, 4) and (3, 3) is overwritten with [1, 0, 0].
state = np.copy(next_state)
next_state[3, 4] = next_state[3, 3]
next_state[3, 3] = [1, 0, 0]

# Reward for this transition, plus the coordinates reported by `to_coords`.
expected_reward_function(state, next_state, goal_state), to_coords(state), to_coords(next_state), to_coords(goal_state)

In [None]:
# Reward built from the fitted model (second argument 2.5, as in the fit cell)
# and an environment using it.
expected_reward_function = ExpectedStepsAmount(expected_steps_learner.model, 2.5)
env = gen_env(config['env'], reward_function=expected_reward_function)
# env.env.env.max_steps = 50  # TOBD: config

# Fresh agent with its buffer reset.
agent = get_agent(config)
agent.reset_buffer()

# 40 chunks of 25 episodes each; after every chunk, log the 0.0 ... 0.9
# quantiles of that chunk's `steps`. The loop variable `x` is unused, and
# `scores`/`steps` only keep the last chunk once the loop finishes.
for x in range(40):
    scores, steps = run_episodes(env, agent, n_episodes=25, verbose=25)
    arr = np.array(steps)
    logger.info(f'{np.quantile(arr, np.arange(0, 1, 0.1))}')

In [None]:
def to_coords(state):
    """Return the first two indices of the single entry of `state` equal to 10.

    Parameters
    ----------
    state : numpy.ndarray
        Array with at least two dimensions. Other cells of this notebook index
        it as ``state[row, col]`` holding a 3-element vector, and write
        ``[10, 0, 0]`` into a cell to place the tracked object; presumably 10
        is that object's code in the encoded grid — TODO confirm.

    Returns
    -------
    tuple of (int, int)
        Index along axis 0 and index along axis 1 of the matching entry.

    Raises
    ------
    ValueError
        If zero or more than one entry of `state` equals 10. The original
        surfaced this as an opaque ``.item()`` size error.
    """
    obj_pos = (state == 10).nonzero()
    n_matches = len(obj_pos[0])
    if n_matches != 1:
        raise ValueError(
            f"to_coords expects exactly one entry equal to 10 in state, found {n_matches}"
        )
    return obj_pos[0].item(), obj_pos[1].item()

In [None]:
# Call the reward function's underlying model directly on the current state
# and the goal state, converting both with the reward's `_to_torch` helper.
# Note: this rebinds the notebook-global name `model`.
model = expected_reward_function.model
dist = model(expected_reward_function._to_torch(state), expected_reward_function._to_torch(goal_state))

# Show both coordinate pairs next to the model output.
to_coords(state), to_coords(goal_state), dist

In [None]:
expected_reward_function = ExpectedStepsAmount(expected_steps_learner.model)

In [None]:
state[:,:,0], next_state[:,:,0]

In [None]:
state.shape

In [None]:
# Build a hand-edited successor state from a copy of `state`.
next_state = np.copy(state)

# Zero out cell (1, 3) and write [10, 0, 0] into cell (5, 3), then evaluate
# the reward for the transition state -> next_state.
next_state[1, 3] = 0
next_state[5, 3] = np.array([10, 0, 0])
expected_reward_function(state, next_state, goal_state)

In [None]:
next_state = np.copy(state)
expected_reward_function(state, next_state, goal_state)

In [None]:
goal_state[12:16, 12:16, 0]

In [None]:
# NOTE(review): incomplete expression — the array being sliced is missing, so
# this cell is a SyntaxError as written.
[:, :, 0]

In [None]:
agent.qnetwork_target.master

In [None]:
model = agent.qnetwork_target.master
model.output_size

In [None]:
model(states).shape

In [None]:
np.linalg.norm(state - goal_state) / 255

In [None]:
state / np.linalg.norm(state)

In [None]:
np.linalg.norm(state)

In [None]:
# NOTE(review): incomplete expression — the right-hand operand is missing, so
# this cell is a SyntaxError as written.
state / 