In [1]:
# `display` and `HTML` are public names of `IPython.display`; importing them
#  from `IPython.core.display` is deprecated
from IPython.display import display, HTML

# widen the notebook container to 90% of the browser window
display(HTML("<style>.container { width:90% !important; }</style>"))

In [2]:
import matplotlib.pyplot as plt
from pprint import pprint

%load_ext autoreload
%autoreload 2

In [3]:
from pyhocon import ConfigFactory

# parse the experiment configuration; the path is relative, so it is resolved
#  against the working directory of the kernel
config = ConfigFactory.parse_file('../experiments/conf/nle_draft.hocon')
# show the parsed tree as the output of the cell
config

ConfigTree([('env',
             ConfigTree([('env_type', 'nle'),
                         ('video_path', 'artifacts/video/'),
                         ('goal_achieving_criterion', 'position'),
                         ('goal_type', 'from_current_episode'),
                         ('from_buffer_choice_params',
                          ConfigTree([('buffer_size', 100000),
                                      ('warmup_steps', 10)])),
                         ('action_size', 23)])),
            ('worker',
             ConfigTree([('state_encoder_type', 'net_hack_encoder'),
                         ('use_lstm', False),
                         ('head', ConfigTree([('hidden_size', 64)]))])),
            ('master',
             ConfigTree([('state_encoder_type', 'net_hack_encoder'),
                         ('use_lstm', False),
                         ('head', ConfigTree([('hidden_size', [])])),
                         ('emb_size', 64)])),
            ('seed', 42),
            ('outputs

In [4]:
import sys

# `numpy` is used by the post-processing and plotting cells but is never
#  imported explicitly in this notebook, and `torch` is imported only in a
#  later cell than the `EncoderActor` class that uses it
import numpy
import torch

# make the experiment scripts importable without growing `sys.path` each time
#  the cell is re-run
if '../experiments' not in sys.path:
    sys.path.append('../experiments')

# NOTE(review): the wildcard import is kept, because other cells may rely on
#  the names it leaks (e.g. `switch_reproducibility_on`) -- replace it with
#  explicit imports once the full list of required names is confirmed
from train_worker import *
from train_worker import get_encoders, get_master_worker_net

# build the state and goal encoders, and the master-worker net on top of them
state_encoder, goal_state_encoder = get_encoders(config)
net = get_master_worker_net(state_encoder, goal_state_encoder, config)

In [5]:
from rlplay.engine import BaseActorModule

In [6]:
class EncoderActor(BaseActorModule):
    """Epsilon-greedy actor that reads action values off an encoder.

    Parameters
    ----------
    state_encoder : callable
        Called as `state_encoder(obs)`. Its output is used as the q-values,
        with the actions laid out along the last dimension.

    epsilon : float, default=0.1
        Probability of replacing the greedy action by a uniformly random one
        when the module is in training mode.
    """

    def __init__(self, state_encoder, epsilon=0.1):
        super().__init__()
        self.encoder = state_encoder

        # kept as a buffer, for updating the exploration epsilon in the clones
        self.register_buffer('epsilon', torch.tensor(epsilon))

    def forward(self, obs, act, rew, fin, *, hx=None, stepno=None, virtual=False):
        """Pick actions for `obs`; only `obs` is used, the rest is ignored.

        Returns
        -------
        actions : tensor
            The greedy actions in eval mode, and their epsilon-greedy
            perturbation in training mode.

        hx : tuple
            Always an empty tuple.

        info : dict
            The q-values under `q` and their maximum over actions as `value`.
        """
        q_values = self.encoder(obs)
        greedy_value, greedy_action = q_values.max(dim=-1)
        info = {'q': q_values, 'value': greedy_value}

        # no exploration outside of the training mode
        if not self.training:
            return greedy_action, (), info

        *batch_shape, n_actions = q_values.shape
        device = self.epsilon.device

        # draw the keep-mask first and the random actions second
        exploit = torch.rand(batch_shape, device=device).gt(self.epsilon)
        random_action = torch.randint(
            n_actions, size=batch_shape, device=device)

        return torch.where(exploit, greedy_action, random_action), (), info

### D-DQN loss

Service functions for the algorithms

In [7]:
from rlplay.engine.utils.plyr import apply, suply, xgetitem


def timeshift(state, *, shift=1):
    """Get the current and the shifted slices of a nested object.

    Parameters
    ----------
    state : nested object
        Nested container traversed by `suply`. Each leaf is either `None`,
        which is passed through as is, or is sliced along its leading
        dimension.

    shift : int, default=1
        Non-negative number of steps between the two returned slices.

    Returns
    -------
    curr : nested object
        The leaves of `state` without their last `shift` steps.

    next : nested object
        The leaves of `state` without their first `shift` steps, so that
        `next[t]` is `shift` steps ahead of `curr[t]`.

    Raises
    ------
    ValueError
        If `shift` is negative.
    """
    if shift < 0:
        raise ValueError(f'`shift` must be non-negative, got {shift}.')

    # `-0 == 0`, so `slice(None, -shift)` would be EMPTY for a zero shift:
    #  use an open-ended slice in that case to keep every step
    stop = -shift if shift > 0 else None

    # use xgetitem to let None through
    # XXX `curr[t]` = (x_t, a_{t-1}, r_t, d_t), t=0..T-shift
    curr = suply(xgetitem, state, index=slice(None, stop))

    # XXX `next[t]` = (x_{t+shift}, a_{t+shift-1}, r_{t+shift}, d_{t+shift}),
    #  t=0..T-shift (a local name that does not shadow the builtin `next`)
    ahead = suply(xgetitem, state, index=slice(shift, None))

    return curr, ahead

Double DQN loss for contiguous trajectory fragments.

* `.state[t+1].rew` -- $r_{t+1}$
* `.state[t+1].fin` -- $d_{t+1}$
* `.state[t+1].act` -- $a_t$ which caused $
    (s_t, z_t, a_t) \longrightarrow (s_{t+1}, z_{t+1}, r_{t+1}, d_{t+1})
$
* `_, _, fragment.actor[t] = actor(.state[t])` -- $
    q(z_t, h_t, \cdot; \theta_{\text{old}})
$ -- used for rollout collection
* `_, _, info_module[t] = module(.state[t])` -- $
    q(z_t, h_t, \cdot; \theta)
$ -- the current q-function, producing $(h_t)_{t=0}^{T+1}$
* `_, _, info_target[t] = target(.state[t])` -- $
    q(z_t, h_t, \cdot; \theta_-)
$ -- the q-target with $(h^-_t)_{t=0}^{T+1}$

The current q-network minimizes the $\mathrm{TD}(0)$-error

$$
\delta_t(\theta)
    = \bigl(
        r_{t+1}
        + \gamma 1_{\{\neg d_{t+1}\}} v^*(z_{t+1})
    \bigr) - q(z_t, h_t, a_t; \theta)
    \,, $$

where the approximate state value estimate $
    v^*(z_{t+1})
$ is one of
* `Q-learning`: $
    \max_a q(z_{t+1}, h_{t+1}, a; \theta)
$
* `DQN`: $
    \max_a q(z_{t+1}, h_{t+1}, a; \theta_-)
$
* `double DQN`: $
    q(z_{t+1}, h_{t+1}, \hat{a}_{t+1}; \theta_-)
$ for $
    \hat{a}_{t+1} = \arg \max_a q(z_{t+1}, h_{t+1}, a; \theta)
$

One of the works related to DQN learning of recurrent agents is [Kapturowski et al. (2018)](https://openreview.net/forum?id=r1lyTjAqYX), who propose to use a burn-in period in
the contiguous trajectory fragment in order to compensate for the representation drift,
due to different RNN parameters $\theta_-$ and $\theta$.

There is no clear-cut evidence suggesting that the hidden recurrent sequences $h_t$
and $h^-_t$ yield significantly different results.

In [8]:
import torch.nn.functional as F
import torch

# @torch.enable_grad()
def ddq_learn(fragment, module, *, gamma=0.95, target=None, double=False):
    r"""Sum of squared one-step TD-errors over a _contiguous_ trajectory fragment.

    Parameters
    ----------
    fragment : object
        Must provide `.hx`, the initial recurrent state, and `.state` with
        the `.obs`, `.act`, `.rew`, `.fin` and `.stepno` fields. The record
        `.state[t+1]` holds the action $a_t$, the reward $r_{t+1}$ and the
        termination flag $d_{t+1}$ of the transition made from `.state[t]`.

    module : callable
        The current Q-network $\theta$, called as
        `module(obs, act, rew, fin, hx=..., stepno=...)`. The third element
        of its output must have the q-values under the key `q`, with the
        actions along the last dimension.

    gamma : float, default=0.95
        The discount factor $\gamma$.

    target : callable, default=None
        The frozen target Q-network $\theta^-$ with the same call signature
        as `module`. If `None`, then the next state value is bootstrapped
        with `module` itself.

    double : bool, default=False
        Whether to use the Double DQN value estimate. Has an effect only if
        `target` is not `None`.

    Returns
    -------
    loss : scalar tensor
        The squared TD-errors, summed over all elements (`reduction='sum'`).

    Details
    -------
    In Q-learning the action value function minimizes the TD-error

    $$
        r_{t+1}
            + \gamma 1_{\neg d_{t+1}} v^*(z_{t+1})
            - q(z_t, a_t; \theta)
        \,, $$

    w.r.t. the parameters $\theta$, where $z_t$ is the actionable state and
    $r_{t+1}$ is the reward for the $s_t \to s_{t+1}$ transition. The state
    $z_t$ includes the current observation $x_t$, the recurrent state $h_t$,
    the last action $a_{t-1}$, the last reward $r_t$, and the termination
    flag $d_t$. The bootstrapped part of the error is computed without
    gradient tracking, and the estimate $v^*(z_{t+1})$ is one of:

    * classic Q-learning (`target is None`), which has no target network:

      $$
          v^*(z_{t+1})
              \approx \max_a q(z_{t+1}, a; \theta)
          \,. $$

    * DQN (`target` is given, `double=False`), proposed by

          [Mnih et al. (2013)](https://arxiv.org/abs/1312.5602),

      which evaluates the next state with the frozen parameters $\theta^-$
      of a secondary Q-network:

      $$
          v^*(z_{t+1})
              \approx \max_a q(z_{t+1}, a; \theta^-)
          \,. $$

    * Double DQN (`target` is given, `double=True`) of

          [van Hasselt et al. (2015)](https://arxiv.org/abs/1509.06461),

      which unravels the $\max$ operator as $
          \max_k u_k \equiv u_{\arg \max_k u_k}
      $, takes the outer $u$ from the target Q-network, and computes the
      inner $u$ (inside the $\arg\max$) with the current Q-network:

      $$
          v^*(z_{t+1})
              \approx q(z_{t+1}, \hat{a}_{t+1}; \theta^-)
              \,,
              \hat{a}_{t+1}
                  = \arg \max_a q(z_{t+1}, a; \theta)
          \,. $$

    Recurrent DQN
    -------------
    The key problem with the recurrent state $h_t$ in $z_t$ is its
    representation drift: the states recorded during the rollout are produced
    by an actor with stale parameters $\theta_{\text{old}}$, and thus might
    be far from the recurrent states produced by the current Q-network
    $\theta$ or the target $\theta^-$. To mitigate this

        [Kapturowski et al. (2018)](https://openreview.net/forum?id=r1lyTjAqYX)

    proposed to spend a burn-in slice of the recorded trajectory on aligning
    the recurrent representation. This function has NO burn-in: it launches
    both $q(\cdot; \theta)$ and $q(\cdot; \theta^-)$ from the same initial
    `fragment.hx` and uses every step of the fragment in the loss.
    """

    steps = fragment.state
    inputs = steps.obs, steps.act, steps.rew, steps.fin
    context = dict(hx=fragment.hx, stepno=steps.stepno)

    # $q(z_t, h_t, \cdot; \theta)$ from the current q-network for all t=0..T
    _, _, online = module(*inputs, **context)

    # `now` is $q(z_t, h_t, \cdot; \theta)$, and `nxt` is
    #  $q(z_{t+1}, h_{t+1}, \cdot; \theta)$, both for t=0..T-1
    online_now, online_nxt = timeshift(online)

    # $a_t$, $r_{t+1}$ and $d_{t+1}$ are stored in the next `state[t+1]`
    upcoming = suply(xgetitem, steps, index=slice(1, None))

    # $q(z_t, h_t, a_t; \theta)$, the value of the action actually taken
    q_replay = online_now['q'].gather(-1, upcoming.act.unsqueeze(-1))

    # the bootstrapped target is treated as a constant by the optimizer
    with torch.no_grad():
        if target is not None:
            # $q(z_t, h^-_t, \cdot; \theta^-)$ launched from the same `hx`
            _, _, frozen = target(*inputs, **context)
            frozen_nxt = suply(xgetitem, frozen, index=slice(1, None))

            if double:
                # $\hat{a}_{t+1} = \arg\max_a q(z_{t+1}, h_{t+1}, a; \theta)$
                greedy = online_nxt['q'].max(dim=-1, keepdim=True).indices

                # $q(z_{t+1}, h^-_{t+1}, \hat{a}_{t+1}; \theta^-)$
                q_value = frozen_nxt['q'].gather(-1, greedy)

            else:
                # $\max_a q(z_{t+1}, h^-_{t+1}, a; \theta^-)$
                q_value = frozen_nxt['q'].max(dim=-1, keepdim=True).values

        else:
            # $\max_a q(z_{t+1}, h_{t+1}, a; \theta)$
            q_value = online_nxt['q'].max(dim=-1, keepdim=True).values

        # $r_{t+1} + \gamma 1_{\neg d_{t+1}} \hat{v}(z_{t+1})$: zero the value
        #  of terminal next states, then discount and add the reward inplace
        q_value.masked_fill_(upcoming.fin.unsqueeze(-1), 0.)
        q_value.mul_(gamma).add_(upcoming.rew.unsqueeze(-1))

    # the ell-2 loss on the td-error
    return F.mse_loss(q_replay, q_value, reduction='sum')

<br>

### Run!

Set the discount factor and pick the bootstrapping variant (target network, double DQN)

In [9]:
# the discount factor passed to `ddq_learn`
gamma = 0.6
# whether the training loop freezes a copy of the learner as the q-target
use_target = False
# whether `ddq_learn` uses the double DQN estimate (needs the q-target)
use_double = False

# `target` does not work for some reason at all with taxi, maybe the freeze schedule is off?
#  or contiguous fragments work to the detriment of learning

# `duelling` also fails for both target and double, and sorta for ordinary q

Initialize the learner and the environment factories

In [10]:
from functools import partial

from train_worker import gen_navigation_env, gen_env

# NOTE(review): the evaluation and the training factories are built from the
#  same function and the same config, i.e. they are identical
factory_eval = partial(gen_navigation_env, conf=config['env'])
factory = partial(gen_navigation_env, conf=config['env'])

# the actor is created with its default exploration `epsilon=0.1`
learner = EncoderActor(net)

# the training mode enables the epsilon-greedy branch of `EncoderActor.forward`
learner.train()
device_ = torch.device('cpu')  #  torch.device('cuda:0')
learner.to(device=device_)

# plain SGD over all parameters of the learner
optim = torch.optim.SGD(learner.parameters(), lr=1e-1)

Initialize the sampler

In [11]:
T, B = 25, 8

In [12]:
from rlplay.engine.rollout import same

# NOTE(review): presumably a generator of trajectory fragments, `n_steps`
#  long each, collected by `learner` in `B` freshly created environments --
#  confirm against `rlplay.engine.rollout.same`
batchit = same.rollout(
    [factory() for _ in range(B)],
    learner,
    n_steps=T,
    sticky=False,
    device=device_,
)

A generator of evaluation rewards

In [13]:
from rlplay.engine.rollout.evaluate import evaluate

# the training loop draws one item from this generator per epoch
# NOTE(review): `start_method='fork'` suggests that the evaluation runs in
#  a forked child process -- confirm against `rlplay.engine.rollout.evaluate`
test_it = evaluate(factory_eval, learner, n_envs=4, n_steps=500,
                   clone=False, device=device_, start_method='fork')

Implement your favourite training method

In [14]:
import tqdm
import copy
from math import log, exp
from torch.nn.utils import clip_grad_norm_

torch.set_num_threads(1)

# the training loop
# NOTE(review): `batchit` and `test_it` are closed only in a later cell, so
#  they stay open if this loop is interrupted or raises
losses, rewards = [], []
# `exp(decay)` is the per-epoch multiplier, which halves epsilon in 50 epochs
decay = -log(2) / 50  # exploration epsilon halflife
for epoch in tqdm.tqdm(range(400)):
    # freeze the target for q
    target = copy.deepcopy(learner) if use_target else None

    # `range` goes first in the `zip`, so that the loop stops after 100
    #  batches WITHOUT drawing and discarding an extra one from `batchit`
    for j, batch in zip(range(100), batchit):
        loss = ddq_learn(batch, learner, target=target,
                         gamma=gamma, double=use_double)


        optim.zero_grad()
        loss.backward()
        # `clip_grad_norm_` returns the total norm of the gradients
        grad = clip_grad_norm_(learner.parameters(), max_norm=1e3)
        optim.step()

        # convert to floats to avoid keeping the tensors around
        losses.append(dict(
            loss=float(loss), grad=float(grad),
        ))

    # NOTE(review): the learner was created with the default `epsilon=0.1`,
    #  which is also the lower clip bound here, so unless epsilon is set
    #  elsewhere this decay leaves it at 0.1
    learner.epsilon.mul_(exp(decay)).clip_(0.1, 1.0)

    # fetch the evaluation results lagged by one inner loop!
    rewards.append(next(test_it))

  4%|▎         | 14/400 [28:59<13:32:52, 126.35s/it]Process ForkProcess-1:
  4%|▎         | 14/400 [30:53<14:11:41, 132.39s/it]Traceback (most recent call last):
  File "/usr/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/usr/lib/python3.8/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/home/nikita/ML/work_repo/rlplay/rlplay/engine/rollout/evaluate.py", line 102, in p_evaluate
    ctrl.alpha.rx.recv()
  File "/usr/lib/python3.8/multiprocessing/connection.py", line 250, in recv
    buf = self._recv_bytes()
  File "/usr/lib/python3.8/multiprocessing/connection.py", line 414, in _recv_bytes
    buf = self._recv(4)
  File "/usr/lib/python3.8/multiprocessing/connection.py", line 379, in _recv
    chunk = read(handle, remaining)
KeyboardInterrupt



KeyboardInterrupt: 

In [None]:
# close the generators
batchit.close()
test_it.close()

<br>

In [None]:
def collate(records):
    """Collate identically keyed dicts into a dict of lists.

    Parameters
    ----------
    records : iterable of dict
        The records to collate.

    Returns
    -------
    out : dict of list
        Maps each key to the list of its values, in the order in which the
        records were seen. Empty, if there are no records. A key missing from
        some of the records gets a shorter list.
    """
    out = {}
    for record in records:
        for key, value in record.items():
            out.setdefault(key, []).append(value)

    return out


data = {k: numpy.array(v) for k, v in collate(losses).items()}

In [None]:
plt.semilogy(data['loss'])

In [None]:
plt.semilogy(data['grad'])

In [None]:
rewards = numpy.stack(rewards, axis=0)

In [None]:
rewards

In [None]:
m, s = numpy.median(rewards, axis=-1), rewards.std(axis=-1)

In [None]:
fi, ax = plt.subplots(1, 1, figsize=(4, 2), dpi=300)

# one curve per statistic of the evaluation rewards, taken over the last axis;
#  the leading axis has one entry per training epoch
ax.plot(numpy.mean(rewards, axis=-1), label='mean')
ax.plot(numpy.median(rewards, axis=-1), label='median')
ax.plot(numpy.min(rewards, axis=-1), label='min')
ax.plot(numpy.std(rewards, axis=-1), label='std')

# label the axes and the curves, so that the figure is readable on its own
ax.set_xlabel('epoch')
ax.set_ylabel('evaluation reward')
ax.legend(fontsize='x-small')

plt.show()

<br>

The ultimate evaluation run

In [None]:
from rlplay.engine import core

# run the learner in a single fresh environment with rendering
with factory_eval() as env:
    # the eval mode switches off the epsilon-greedy branch of the actor
    # NOTE(review): the learner is left in the eval mode after this cell
    learner.eval()
    # NOTE(review): `n_steps` is passed as the float `1e2` -- confirm that
    #  `core.evaluate` does not require an integer number of steps
    eval_rewards, info = core.evaluate([
        env
    ], learner, render=True, n_steps=1e2, device=device_)

# the total reward collected during the evaluation run
print(sum(eval_rewards))

In [None]:
import sys
sys.path.append('../experiments')

from train_worker import gen_env

In [None]:
switch_reproducibility_on(config['seed'])

In [None]:
env = gen_navigation_env(config['env'])

In [None]:
_ = env.reset()

In [None]:
for x in range(20):
    state, _, _, _ = env.step(1)

In [None]:
state.keys()

In [None]:
from rllr.utils import convert_to_torch

In [None]:
state = convert_to_torch([state, state])

In [None]:
state = convert_to_torch([state['state']] * 2)

In [None]:
state_encoder(state)

In [None]:
# torch.Size([2, 25]) torch.Size([2, 21, 79])

In [None]:
state = convert_to_torch([state['state'], state['goal_state']])

In [None]:
state['goal_state']['position']

In [None]:
env.last_observation

In [None]:
env.buffer_random_choice().keys()

In [None]:
import gym
import nle
env_ = gym.make("NetHackScore-v0", observation_keys=("glyphs", "blstats"))

In [None]:
state = env.reset()

In [None]:
print(env_.last_observation)

In [None]:
env

In [None]:
state.keys()

In [None]:
state['goal_state']

In [None]:
from collections import Counter

Counter(state['glyphs'].reshape(-1))

In [None]:
glyphs = state["glyphs"]
T, B, *_ = glyphs.shape

blstats = state["blstats"]
blstats.view(T * B, -1).float()

In [None]:
env.observation

In [None]:
env.step_count

In [None]:

states0 = env.reset()  # each reset generates a new dungeon
env.render()

In [None]:
states0['blstats'][:2]

In [None]:
states0['glyphs'][6, 8]

In [None]:
states1, _, _, _ = env.step(1)  # move agent '@' north
env.render()

In [None]:
states1['blstats'][:2]

In [None]:
states1['glyphs'][5, 8]