In [1]:
import sys
import logging

import numpy as np
import matplotlib.pyplot as plt
from joblib import hash, dump, load
import os

from deer.default_parser import process_args
from deer.agent import NeuralAgent
from deer.learning_algos.CRAR_torch import CRAR
from figure8_env import MyEnv as figure8_env
import deer.experiment.base_controllers as bc

from deer.policies import EpsilonGreedyPolicy, FixedFigure8Policy

In [2]:
figure8_give_rewards = True
nn_yaml = 'network_noconv.yaml'
higher_dim_obs = False
internal_dim = 2

In [3]:
class Defaults:
    # ----------------------
    # Experiment Parameters
    # ----------------------
    steps_per_epoch = 5000
    epochs = 50
    steps_per_test = 1000
    period_btw_summary_perfs = 1

    # ----------------------
    # Temporal Processing Parameters
    # ----------------------
    nstep = 15
    recurrent = False
    
    # ----------------------
    # Environment Parameters
    # ----------------------
    frame_skip = 2
    show_rewards = False

    # ----------------------
    # DQN Agent parameters:
    # ----------------------
    update_rule = 'rmsprop'
    learning_rate = 1 * 1E-4 # 1E-4
    learning_rate_decay = 0.9
    discount = 0.9
    discount_inc = 1
    discount_max = 0.99
    rms_decay = 0.9
    rms_epsilon = 0.0001
    momentum = 0
    clip_norm = 1.0
    epsilon_start = 1.0
    epsilon_min = 1.0
    epsilon_decay = 10000
    update_frequency = 1
    replay_memory_size = 1000000 #replacing with 200000 will works just fine (in case you dont have 18gb of memory)
    batch_size = 32
    freeze_interval = 1000
    deterministic = False
    
    # ----------------------
    # Learning algo parameters
    # ----------------------
    loss_weights = [1E-2, 1E-3, 1E-3, 1E-3, 1E-3, 1E-3, 1.]

In [4]:
parameters = Defaults()
if parameters.deterministic:
    rng = np.random.RandomState(123456)
else:
    rng = np.random.RandomState()

# --- Instantiate environment ---
env = figure8_env(
    give_rewards=figure8_give_rewards,
    intern_dim=internal_dim,
    higher_dim_obs=higher_dim_obs,
    show_rewards=parameters.show_rewards,
    nstep=parameters.nstep
    )

# --- Instantiate learning_algo ---
learning_algo = CRAR(
    env,
    parameters.rms_decay,
    parameters.rms_epsilon,
    parameters.momentum,
    parameters.clip_norm,
    parameters.freeze_interval,
    parameters.batch_size,
    parameters.update_rule,
    rng,
    high_int_dim=False,
    internal_dim=internal_dim, lr=parameters.learning_rate,
    nn_yaml=nn_yaml, double_Q=True,
    loss_weights=parameters.loss_weights,
    nstep=parameters.nstep,
    recurrent=parameters.recurrent
    )

if figure8_give_rewards:
    train_policy = EpsilonGreedyPolicy(
        learning_algo, env.nActions(), rng, 0.2,
        consider_valid_transitions=False
        )
    test_policy = EpsilonGreedyPolicy(
        learning_algo, env.nActions(), rng, 0.
        )
else:
    train_policy = FixedFigure8Policy.FixedFigure8Policy(
        learning_algo, env.nActions(), rng, epsilon=0.2,
        height=env.HEIGHT, width=env.WIDTH
        )
    test_policy = FixedFigure8Policy.FixedFigure8Policy(
        learning_algo, env.nActions(), rng,
        height=env.HEIGHT, width=env.WIDTH
        )

# --- Instantiate agent ---
agent = NeuralAgent(
    env,
    learning_algo,
    parameters.replay_memory_size,
    1,
    parameters.batch_size,
    rng,
    train_policy=train_policy,
    test_policy=test_policy)

# --- Create unique filename for FindBestController ---
h = hash(vars(parameters), hash_name="sha1")
fname = "test_" + h
print("The parameters hash is: {}".format(h))
print("The parameters are: {}".format(parameters))

# As for the discount factor and the learning rate, one can update periodically the parameter of the epsilon-greedy
# policy implemented by the agent. This controllers has a bit more capabilities, as it allows one to choose more
# precisely when to update epsilon: after every X action, episode or epoch. This parameter can also be reset every
# episode or epoch (or never, hence the resetEvery='none').
agent.attach(bc.EpsilonController(
    initial_e=parameters.epsilon_start,
    e_decays=parameters.epsilon_decay,
    e_min=parameters.epsilon_min,
    evaluate_on='episode',
    periodicity=1,
    reset_every='none'))

agent.run(10, 500)
print("end gathering data")

# --- Bind controllers to the agent ---
# Before every training epoch (periodicity=1), we want to print a summary of the agent's epsilon, discount and 
# learning rate as well as the training epoch number.
agent.attach(bc.VerboseController(
    evaluate_on='epoch', 
    periodicity=1))

# Every epoch end, one has the possibility to modify the learning rate using a LearningRateController. Here we 
# wish to update the learning rate after every training epoch (periodicity=1), according to the parameters given.
agent.attach(bc.LearningRateController(
    initial_learning_rate=parameters.learning_rate, 
    learning_rate_decay=parameters.learning_rate_decay,
    periodicity=1))

# Same for the discount factor.
agent.attach(bc.DiscountFactorController(
    initial_discount_factor=parameters.discount, 
    discount_factor_growth=parameters.discount_inc, 
    discount_factor_max=parameters.discount_max,
    periodicity=1))

# During training epochs, we want to train the agent after every [parameters.update_frequency] action it takes.
# Plus, we also want to display after each training episode (!= than after every training) the average bellman
# residual and the average of the V values obtained during the last episode, hence the two last arguments.
agent.attach(bc.TrainerController(
    evaluate_on='action', 
    periodicity=parameters.update_frequency, 
    show_episode_avg_V_value=True, 
    show_avg_Bellman_residual=True))

# We wish to discover, among all versions of our neural network (i.e., after every training epoch), which one 
# has the highest validation score.
# To achieve this goal, one can use the FindBestController along with an InterleavedTestEpochControllers. It is 
# important that the validationID is the same than the id argument of the InterleavedTestEpochController.
# The FindBestController will dump on disk the validation scores for each and every network, as well as the 
# structure of the neural network having the best validation score. These dumps can then used to plot the evolution 
# of the validation and test scores (see below) or simply recover the resulting neural network for your 
# application.
agent.attach(bc.FindBestController(
    validationID=figure8_env.VALIDATION_MODE,
    testID=None,
    unique_fname=fname))

# All previous controllers control the agent during the epochs it goes through. However, we want to interleave a 
# "validation epoch" between each training epoch. For each validation epoch, we want also to display the sum of all 
# rewards obtained, hence the showScore=True. Finally, we want to call the summarizePerformance method of ALE_env 
# every [parameters.period_btw_summary_perfs] *validation* epochs.
agent.attach(bc.InterleavedTestEpochController(
    id=figure8_env.VALIDATION_MODE, 
    epoch_length=parameters.steps_per_test,
    periodicity=1,
    show_score=True,
    summarize_every=1))

The parameters hash is: 62977be8e45d8a56a5537c11dfd5d2fd8dda69e0
The parameters are: <__main__.Defaults object at 0x124d1fbb0>
end gathering data


In [5]:
try:
    os.mkdir("params")
except Exception:
    pass
dump(vars(parameters), "params/" + fname + ".jldump")
#agent.gathering_data=False
agent.run(parameters.epochs, parameters.steps_per_epoch)

# --- Show results ---
basename = "scores/" + fname
scores = load(basename + "_scores.jldump")
print(scores)

Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.2334, -0.0143]) tensor([0.5849, 0.1674]) tensor([0.2188, 0.0185])
R[0]
tensor([-0.0825], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.01104453498334624 0.01070014631824597 0.07545845989463851 0.01419798760942649 0.8061926189661026 0.0 0.3474926076829433
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.3873, -0.1978]) tensor([ 0.3846, -0.1995]) tensor([ 0.3657, -0.2250])
R[0]
tensor([0.0152], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.0037904320370871574 0.012158394954618415 



Testing score per episode (id: 0) is 0.0 (average over 1 episode(s))
== Mean score per episode is 0.0 over 1 episodes ==


  plt.show()
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([0.2521, 0.3956]) tensor([0.2887, 0.4275]) tensor([0.3661, 0.5091])
R[0]
tensor([0.0432], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.026326301941648124 0.012023141971963924 0.012768864651807234 0.012416279860306531 0.5483728458285332 0.007371815621852875 0.21411279591917992
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([0.3384, 0.5602]) tensor([0.3679, 0.5797]) tensor([0.4349, 0.6423])
R[0]
tensor([0.0265], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.02471170519012958 0.01180069

  plt.show()
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([0.5339, 0.2874]) tensor([0.5420, 0.3198]) tensor([0.6252, 0.5795])
R[0]
tensor([0.0520], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.024418396648019553 0.012561231231869897 0.009356364128019778 0.01661758646881208 0.57176496976614 0.0121889438778162 0.23008953416347502
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([0.4592, 0.6119]) tensor([0.4799, 0.6134]) tensor([0.6889, 0.7237])
R[0]
tensor([-0.0106], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.023128273741342128 0.01303215179

  plt.show()
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([0.9310, 0.8998]) tensor([0.9058, 0.8876]) tensor([0.8758, 0.8426])
R[0]
tensor([-0.0152], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.019138910461217164 0.013637180394609458 0.008768227619111713 0.018236979353707283 0.6031555152535438 0.003958982639014721 0.2370158165693283
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([0.4224, 0.4531]) tensor([0.4479, 0.4727]) tensor([0.4368, 0.4508])
R[0]
tensor([0.0389], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.018241334727499635 0.0121485

  plt.show()
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([0.9170, 0.9307]) tensor([0.9016, 0.9058]) tensor([0.9025, 0.9082])
R[0]
tensor([0.0002], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.015751947396434842 0.011798032365564723 0.006926304009888554 0.01723188784066588 0.6184972515106201 0.0020303176790475845 0.23571173533797263
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([0.8794, 0.7265]) tensor([0.8514, 0.7201]) tensor([0.7957, 0.7452])
R[0]
tensor([0.0062], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.016088233693502842 0.0135527

  plt.show()
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 1.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([0.6443, 0.0961]) tensor([0.5966, 0.1679]) tensor([0.7687, 0.7778])
R[0]
tensor([0.1182], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.0168585600964725 0.012397700776607963 0.006736498765509168 0.017370630567427724 0.6145730403065681 0.001282649539411068 0.23164668099582195
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([0.3612, 0.0942]) tensor([0.3647, 0.1521]) tensor([0.4137, 0.0841])
R[0]
tensor([0.0971], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.016352456118445845 0.011912190

  plt.show()
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([0.8620, 0.8342]) tensor([0.8464, 0.8076]) tensor([0.8413, 0.8255])
R[0]
tensor([-0.0117], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.017466826546005905 0.013030266828893218 0.0044685646994585115 0.01888530354667455 0.6162555629014969 0.001931519478559494 0.22353161764144897
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([0.3891, 0.3376]) tensor([0.3945, 0.3281]) tensor([0.4978, 0.4227])
R[0]
tensor([-0.0101], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.017184390576556324 0.01327

  plt.show()
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([1.0294, 0.8949]) tensor([1.0109, 0.8965]) tensor([1.0610, 0.9397])
R[0]
tensor([-0.0117], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.015625141235999764 0.01285052976058796 0.0051325881355041925 0.016996528825722636 0.6240837422609329 0.0028928418904542925 0.21896067740023137
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([0.9756, 0.9575]) tensor([0.9666, 0.9333]) tensor([0.9827, 0.9622])
R[0]
tensor([-0.0056], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.016645597793161868 0.0129

  plt.show()
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([1.0138, 0.9677]) tensor([1.0046, 0.9718]) tensor([0.9941, 0.9828])
R[0]
tensor([-0.0074], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.01602203391585499 0.012843351928371704 0.005229424704615667 0.017347290227655323 0.6209842606186867 0.0027830098867416384 0.2132491347193718
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([0.8877, 0.8466]) tensor([0.8673, 0.8299]) tensor([0.9180, 0.8460])
R[0]
tensor([0.0158], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.015495899505913258 0.0117758

  plt.show()
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([0.3456, 0.3363]) tensor([0.3432, 0.3162]) tensor([0.4043, 0.4437])
R[0]
tensor([0.0080], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.015981801332905887 0.013373523977614241 0.004289819459352657 0.017591508427169174 0.622831092953682 0.0022536427974700927 0.20920969589054583
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([0.4842, 0.4031]) tensor([0.4839, 0.4059]) tensor([0.5004, 0.4509])
R[0]
tensor([0.0198], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.015627212908118963 0.0137656

  plt.show()
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([0.9384, 0.8806]) tensor([0.9077, 0.8607]) tensor([0.7427, 0.7584])
R[0]
tensor([0.0081], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.015620908722281456 0.011535757038996963 0.0037988597227486026 0.01702904667519033 0.623139391362667 0.0007247366085648537 0.20944791522622108
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([0.4666, 0.3979]) tensor([0.4523, 0.3834]) tensor([0.4599, 0.4148])
R[0]
tensor([0.0079], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.01567616871651262 0.01291848

  plt.show()
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([0.4180, 0.3362]) tensor([0.4050, 0.3278]) tensor([0.4323, 0.2424])
R[0]
tensor([0.0151], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.014262225795537233 0.012484287043640506 0.00482571738709521 0.016574598556384445 0.6327699041366577 0.0006916213408112526 0.21936820735037327
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([0.5721, 0.6247]) tensor([0.5628, 0.5988]) tensor([0.1269, 0.2341])
R[0]
tensor([-0.0033], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.014897064411081374 0.013045

  plt.show()
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([0.3306, 0.3606]) tensor([0.3284, 0.3423]) tensor([0.2247, 0.2781])
R[0]
tensor([0.0047], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.014728657400701195 0.013851031253885595 0.004932931650975661 0.016661236498970538 0.6323191702365876 0.0013658534362912178 0.21536604806780815
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([0.5057, 0.5303]) tensor([0.5169, 0.5234]) tensor([0.4985, 0.5174])
R[0]
tensor([0.0080], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.015209269892424345 0.013349

  plt.show()
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.0262, -0.5464]) tensor([ 0.1116, -0.3275]) tensor([ 0.1803, -0.4048])
R[0]
tensor([0.2291], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.014023937688209117 0.013060911175387446 0.0037464207998273197 0.017006015583407134 0.6369335741400719 0.001708110898733139 0.21873274111747742
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.1102, -0.2162]) tensor([ 0.1564, -0.1021]) tensor([ 0.2003, -0.0536])
R[0]
tensor([0.1372], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.014549918230623

  plt.show()
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([0.3122, 0.2531]) tensor([0.3028, 0.2534]) tensor([0.3342, 0.2679])
R[0]
tensor([-8.2143e-05], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.013398310326971113 0.012002515762025724 0.0044807854748960384 0.01568453655997291 0.6366506033539772 0.000937036469578743 0.2185753322839737
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([0.4786, 0.5005]) tensor([0.4823, 0.4973]) tensor([0.5039, 0.5875])
R[0]
tensor([0.0106], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.014485205699689686 0.014

  plt.show()
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([0.7548, 0.7026]) tensor([0.7401, 0.6974]) tensor([0.8754, 0.7817])
R[0]
tensor([-0.0033], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.01369666236359626 0.012597231261606794 0.003951498488810103 0.01542572309449315 0.6398187952041626 0.001499851755797863 0.2238489276021719
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.1615, -0.2915]) tensor([ 0.1477, -0.2256]) tensor([ 0.2321, -0.3424])
R[0]
tensor([0.1095], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.013753892054315656 0.012

  plt.show()
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([0.8218, 0.8455]) tensor([0.8133, 0.8335]) tensor([0.8709, 0.8830])
R[0]
tensor([0.0073], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.014961523212492465 0.013268760910985293 0.003760749058345027 0.016213512759190054 0.6295388470888138 0.0014177080243825912 0.21241477705538273
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([0.5294, 0.5170]) tensor([0.5118, 0.5043]) tensor([0.6069, 0.6077])
R[0]
tensor([0.0012], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.014874479457736015 0.013284

  plt.show()
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([0.3470, 0.2940]) tensor([0.3487, 0.3075]) tensor([0.2478, 0.0544])
R[0]
tensor([0.0320], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.01509045921638608 0.011954066097525355 0.0038071420833202863 0.015478071350138635 0.6310588955879212 0.0015943510234355926 0.208065113350749
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([0.7726, 0.7799]) tensor([0.7557, 0.7627]) tensor([0.7540, 0.7803])
R[0]
tensor([0.0009], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.014630714364349841 0.01270536

  plt.show()
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([0.8530, 0.8417]) tensor([0.8349, 0.8252]) tensor([0.8304, 0.8251])
R[0]
tensor([0.0066], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.014999310670420528 0.012876959479108337 0.003259138803010501 0.016343696902506054 0.6283231112957001 0.0015539903789758682 0.20580589154362677
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([0.7794, 0.7604]) tensor([0.7603, 0.7399]) tensor([0.6408, 0.6438])
R[0]
tensor([0.0105], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.014939651838969439 0.012777

  plt.show()
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([0.5548, 0.5255]) tensor([0.5525, 0.5251]) tensor([0.5183, 0.4628])
R[0]
tensor([0.0055], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.013835825404617936 0.012567950649870908 0.003480311128220819 0.01498961861524731 0.6378995316028595 0.0017252931445837021 0.21373987555503846
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([0.7263, 0.7058]) tensor([0.7074, 0.6950]) tensor([0.8315, 0.7869])
R[0]
tensor([0.0020], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.014246781543362886 0.0120340

  plt.show()
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([0.1217, 0.1574]) tensor([0.1257, 0.1516]) tensor([-0.0348,  0.0050])
R[0]
tensor([0.0139], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.01413204526156187 0.012774906764781917 0.0045021228231235 0.01569440131634474 0.6340397011637687 0.001970743767917156 0.2072145841419697
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([0.6721, 0.6217]) tensor([0.6626, 0.6205]) tensor([0.5136, 0.5595])
R[0]
tensor([0.0164], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.013896104381419718 0.0120029314

  plt.show()
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([0.2388, 0.1679]) tensor([0.2273, 0.1696]) tensor([0.2331, 0.2253])
R[0]
tensor([0.0203], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.013686711899470539 0.012397361736446328 0.0028189116117664525 0.015458682556403801 0.6381903882622719 0.001572417639195919 0.21068260112404824
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([1.0079, 0.9576]) tensor([0.9883, 0.9427]) tensor([1.0074, 0.9568])
R[0]
tensor([0.0118], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.0133178818305023 0.01196066

  plt.show()
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([0.5694, 0.5879]) tensor([0.5518, 0.5711]) tensor([0.5542, 0.5820])
R[0]
tensor([-0.0026], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.013354573416523635 0.012131101188540924 0.004008184757116396 0.015173086945666001 0.6423766050338745 0.001436386026442051 0.20826283486187458
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([0.7287, 0.7102]) tensor([0.7222, 0.7089]) tensor([0.7278, 0.7081])
R[0]
tensor([-0.0055], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.013403619422577322 0.01218

  plt.show()
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.0581, -0.1376]) tensor([ 0.0556, -0.0888]) tensor([0.1051, 0.1200])
R[0]
tensor([0.0033], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.013298733818344772 0.011868981441075448 0.0033039524743471702 0.015452567532658577 0.6394016124606132 0.0009517104551196098 0.21560908633470535
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([0.5255, 0.5320]) tensor([0.5091, 0.5210]) tensor([0.5461, 0.5554])
R[0]
tensor([0.0029], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.013742207414936274 0.0

  plt.show()
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([0.9412, 0.9116]) tensor([0.9185, 0.8922]) tensor([0.8161, 0.7823])
R[0]
tensor([0.0147], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.013476131534203887 0.012768312212516321 0.003364896242072064 0.015253752022515983 0.637455983042717 0.0008264574259519577 0.20919353938102722
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([0.4753, 0.4353]) tensor([0.4679, 0.4437]) tensor([0.3308, 0.3064])
R[0]
tensor([0.0307], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.013435852704569698 0.0127589

  plt.show()
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.0428, -0.0750]) tensor([-0.0294, -0.0537]) tensor([ 0.0413, -0.0161])
R[0]
tensor([0.0426], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.012784922391176224 0.01097021554870298 0.0034009687011002823 0.01441718903905712 0.6384305952191353 0.0011905659213662149 0.20739193162322045
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([0.5276, 0.5472]) tensor([0.5088, 0.5335]) tensor([0.6859, 0.6556])
R[0]
tensor([-2.4065e-05], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.01317187494412064

  plt.show()
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([0.5196, 0.5939]) tensor([0.5074, 0.5763]) tensor([0.5081, 0.5810])
R[0]
tensor([0.0066], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.013104585941880941 0.013395394735242008 0.003607301201969676 0.015400833391351626 0.6382677931785583 0.001255634345114231 0.20566268721222877
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([0.4845, 0.5247]) tensor([0.4830, 0.5230]) tensor([0.4359, 0.4968])
R[0]
tensor([0.0175], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.012882929420098662 0.0125352

  plt.show()
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([0.7235, 0.8295]) tensor([0.7131, 0.8108]) tensor([0.7450, 0.8404])
R[0]
tensor([0.0040], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.012893983595073223 0.013010706211120123 0.003210313935647719 0.014953966949600727 0.6429995820522308 0.0013766983300447463 0.20473691406846045
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([0.5102, 0.6661]) tensor([0.5182, 0.6539]) tensor([0.6339, 0.7663])
R[0]
tensor([-0.0084], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.012705971424467862 0.01143

  plt.show()
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([0.6632, 0.7706]) tensor([0.6442, 0.7470]) tensor([0.6279, 0.7493])
R[0]
tensor([0.0017], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.013325051452498883 0.012178543130125036 0.002527923331690545 0.014387883256189525 0.6395898575782776 0.001380286894738674 0.2020542972534895
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([0.2327, 0.1520]) tensor([0.2161, 0.1564]) tensor([0.2494, 0.1858])
R[0]
tensor([0.0331], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.014026436455082149 0.01343435

  plt.show()
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([0.4756, 0.5819]) tensor([0.4801, 0.5764]) tensor([0.4517, 0.6055])
R[0]
tensor([0.0052], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.013351411276962608 0.012394893345277524 0.0028254482942575124 0.01462070468114689 0.6395040767192841 0.0017258830443024635 0.20305458511412144
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([0.3022, 0.1134]) tensor([0.2786, 0.1253]) tensor([0.4676, 0.2300])
R[0]
tensor([0.0435], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.01332582284975797 0.0117729

  plt.show()
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.0715, -0.0456]) tensor([-0.0559, -0.0289]) tensor([-0.0344, -0.0206])
R[0]
tensor([0.0314], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.013430412975139916 0.013985226599426824 0.0037525278655557485 0.015639516222290695 0.6382153388261795 0.0014672747254371643 0.20081318861246109
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([0.3823, 0.4840]) tensor([0.3717, 0.4662]) tensor([0.2638, 0.3682])
R[0]
tensor([0.0015], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.01378022905997932 0.


KeyboardInterrupt



## Visualize performance

In [None]:
agent.setNetwork(fname, nEpoch=4)

In [None]:
agent._in_episode = True
agent._mode = 0 # Testing mode with plan_depth=0
initState = env.reset(agent._mode)
inputDims = env.inputDimensions()

for i in range(len(inputDims)):
    if inputDims[i][0] > 1:
        agent._state[i][1:] = initState[i][1:]
agent._Vs_on_last_episode = []
is_terminal = False
reward = 0

### Frame by frame

In [None]:
%matplotlib inline

for i in range(100):
    obs = env.observe()
    _obs = obs[0].reshape((env.WIDTH, env.HEIGHT))
    plt.figure()
    plt.imshow(np.flip(_obs.squeeze()))
    plt.show()
    for i in range(len(obs)):
        agent._state[i][0:-1] = agent._state[i][1:]
        agent._state[i][-1] = obs[i]
    V, action, reward, _ = agent._step()
    print(action)
    agent._Vs_on_last_episode.append(V)
    is_terminal = env.inTerminalState()
    if is_terminal: break

### As animation

In [None]:
%matplotlib notebook

import numpy as np 
import matplotlib.pyplot as plt
from IPython.display import HTML
import matplotlib.animation as animation

fig, ax = plt.subplots(1,1)
obs = env.observe()
_obs = obs[0].reshape((env.WIDTH, env.HEIGHT))
_obs = np.flip(_obs.squeeze())
ax.set_xticks([])
ax.set_yticks([])
im = ax.imshow(np.zeros(_obs.shape))

def init():
    plt.cla()
    im = ax.imshow(_obs)
    return [im]

def animate(i, *args, **kwargs):
    plt.cla()
    obs = env.observe()
    _obs = obs[0].reshape((env.WIDTH, env.HEIGHT))
    _obs = np.flip(_obs.squeeze())
    im = ax.imshow(_obs)
    for i in range(len(obs)):
        agent._state[i][0:-1] = agent._state[i][1:]
        agent._state[i][-1] = obs[i]
        V, action, reward, _ = agent._step()
        agent._Vs_on_last_episode.append(V)
    return [im]

ani = animation.FuncAnimation(fig, animate, init_func=init, 
     frames=100, blit=False, repeat=True)
ani.save('behavior.gif', writer="ffmpeg", fps = 15)
fig.show()