In [1]:
import sys
import logging

import numpy as np
import matplotlib.pyplot as plt
from joblib import hash, dump, load
import os

from deer.default_parser import process_args
from deer.agent import NeuralAgent
from deer.learning_algos.CRAR_torch import CRAR
from figure8_env import MyEnv as figure8_env
import deer.experiment.base_controllers as bc

from deer.policies import EpsilonGreedyPolicy, FixedFigure8Policy

In [2]:
figure8_give_rewards = True
nn_yaml = 'network_noconv.yaml'
higher_dim_obs = False
internal_dim = 10

In [3]:
class Defaults:
    # ----------------------
    # Experiment Parameters
    # ----------------------
    steps_per_epoch = 5000
    epochs = 50
    steps_per_test = 1000
    period_btw_summary_perfs = 1

    # ----------------------
    # Temporal Processing Parameters
    # ----------------------
    nstep = 20
    nstep_decay = 0.8
    encoder_type = 'variational'
    
    # ----------------------
    # Environment Parameters
    # ----------------------
    frame_skip = 2
    show_rewards = False

    # ----------------------
    # DQN Agent parameters:
    # ----------------------
    update_rule = 'rmsprop'
    learning_rate = 1 * 1E-4
    learning_rate_decay = 0.9
    discount = 0.9
    discount_inc = 1
    discount_max = 0.99
    rms_decay = 0.9
    rms_epsilon = 0.0001
    momentum = 0
    clip_norm = 1.0
    epsilon_start = 1.0
    epsilon_min = 1.0
    epsilon_decay = 1000
    update_frequency = 1
    replay_memory_size = 100000 #50000 #replacing with 200000 will works just fine (in case you dont have 18gb of memory)
    batch_size = 64 #32
    freeze_interval = 1000
    deterministic = False
    
    # ----------------------
    # Learning algo parameters
    # ----------------------
    #loss_weights = [5E-3, 1E-3, 5E-3, 5E-3, 5E-3, 5E-3, 1.]
    #loss_weights = [0, 0, 0, 0, 0, 0, 1.]
    loss_weights = [5E-3, 5E-3, 5E-3, 0, 5E-3, 1E-3, 1., 1E-6]

In [4]:
parameters = Defaults()
if parameters.deterministic:
    rng = np.random.RandomState(123456)
else:
    rng = np.random.RandomState()

# --- Instantiate environment ---
env = figure8_env(
    give_rewards=figure8_give_rewards,
    intern_dim=internal_dim,
    higher_dim_obs=higher_dim_obs,
    show_rewards=parameters.show_rewards,
    nstep=parameters.nstep, nstep_decay=parameters.nstep_decay
    )

# --- Instantiate learning_algo ---
learning_algo = CRAR(
    env,
    parameters.rms_decay,
    parameters.rms_epsilon,
    parameters.momentum,
    parameters.clip_norm,
    parameters.freeze_interval,
    parameters.batch_size,
    parameters.update_rule,
    rng,
    high_int_dim=False,
    internal_dim=internal_dim, lr=parameters.learning_rate,
    nn_yaml=nn_yaml, double_Q=True,
    loss_weights=parameters.loss_weights,
    nstep=parameters.nstep, nstep_decay=parameters.nstep_decay,
    encoder_type=parameters.encoder_type
    )

if figure8_give_rewards:
    train_policy = EpsilonGreedyPolicy(
        learning_algo, env.nActions(), rng, 0.2,
        consider_valid_transitions=False
        )
    test_policy = EpsilonGreedyPolicy(
        learning_algo, env.nActions(), rng, 0.
        )
else:
    train_policy = FixedFigure8Policy.FixedFigure8Policy(
        learning_algo, env.nActions(), rng, epsilon=0.2,
        height=env.HEIGHT, width=env.WIDTH
        )
    test_policy = FixedFigure8Policy.FixedFigure8Policy(
        learning_algo, env.nActions(), rng,
        height=env.HEIGHT, width=env.WIDTH
        )

# --- Instantiate agent ---
agent = NeuralAgent(
    env,
    learning_algo,
    parameters.replay_memory_size,
    1,
    parameters.batch_size,
    rng,
    train_policy=train_policy,
    test_policy=test_policy)

agent.setNetwork('saved')

# --- Create unique filename for FindBestController ---
h = hash(vars(parameters), hash_name="sha1")
fname = "test_" + h
print("The parameters hash is: {}".format(h))
print("The parameters are: {}".format(parameters))

# As for the discount factor and the learning rate, one can update periodically the parameter of the epsilon-greedy
# policy implemented by the agent. This controllers has a bit more capabilities, as it allows one to choose more
# precisely when to update epsilon: after every X action, episode or epoch. This parameter can also be reset every
# episode or epoch (or never, hence the resetEvery='none').
agent.attach(bc.EpsilonController(
    initial_e=parameters.epsilon_start,
    e_decays=parameters.epsilon_decay,
    e_min=parameters.epsilon_min,
    evaluate_on='episode',
    periodicity=1,
    reset_every='none'))

agent.run(10, 500)
print("end gathering data")

# --- Bind controllers to the agent ---
# Before every training epoch (periodicity=1), we want to print a summary of the agent's epsilon, discount and 
# learning rate as well as the training epoch number.
agent.attach(bc.VerboseController(
    evaluate_on='epoch', 
    periodicity=1))

# Every epoch end, one has the possibility to modify the learning rate using a LearningRateController. Here we 
# wish to update the learning rate after every training epoch (periodicity=1), according to the parameters given.
agent.attach(bc.LearningRateController(
    initial_learning_rate=parameters.learning_rate, 
    learning_rate_decay=parameters.learning_rate_decay,
    periodicity=1))

# Same for the discount factor.
agent.attach(bc.DiscountFactorController(
    initial_discount_factor=parameters.discount, 
    discount_factor_growth=parameters.discount_inc, 
    discount_factor_max=parameters.discount_max,
    periodicity=1))

# During training epochs, we want to train the agent after every [parameters.update_frequency] action it takes.
# Plus, we also want to display after each training episode (!= than after every training) the average bellman
# residual and the average of the V values obtained during the last episode, hence the two last arguments.
agent.attach(bc.TrainerController(
    evaluate_on='action', 
    periodicity=parameters.update_frequency, 
    show_episode_avg_V_value=True, 
    show_avg_Bellman_residual=True))

# We wish to discover, among all versions of our neural network (i.e., after every training epoch), which one 
# has the highest validation score.
# To achieve this goal, one can use the FindBestController along with an InterleavedTestEpochControllers. It is 
# important that the validationID is the same than the id argument of the InterleavedTestEpochController.
# The FindBestController will dump on disk the validation scores for each and every network, as well as the 
# structure of the neural network having the best validation score. These dumps can then used to plot the evolution 
# of the validation and test scores (see below) or simply recover the resulting neural network for your 
# application.
agent.attach(bc.FindBestController(
    validationID=figure8_env.VALIDATION_MODE,
    testID=None,
    unique_fname=fname))

# All previous controllers control the agent during the epochs it goes through. However, we want to interleave a 
# "validation epoch" between each training epoch. For each validation epoch, we want also to display the sum of all 
# rewards obtained, hence the showScore=True. Finally, we want to call the summarizePerformance method of ALE_env 
# every [parameters.period_btw_summary_perfs] *validation* epochs.
agent.attach(bc.InterleavedTestEpochController(
    id=figure8_env.VALIDATION_MODE, 
    epoch_length=parameters.steps_per_test,
    periodicity=1,
    show_score=True,
    summarize_every=1))

The parameters hash is: 62977be8e45d8a56a5537c11dfd5d2fd8dda69e0
The parameters are: <__main__.Defaults object at 0x13a4bb9d0>
end gathering data


In [5]:
try:
    os.mkdir("params")
except Exception:
    pass
dump(vars(parameters), "params/" + fname + ".jldump")
#agent.gathering_data=False
agent.setNetwork('saved')
agent.run(parameters.epochs, parameters.steps_per_epoch)

# --- Show results ---
basename = "scores/" + fname
scores = load(basename + "_scores.jldump")
print(scores)

Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.7025, -0.4656,  1.6450,  1.3993, -0.7933, -0.1965, -0.0157, -0.6956,
         0.4271,  0.1160]) tensor([-0.1674, -0.5055,  1.6059,  1.3400, -0.7545, -0.1851, -0.0030, -0.7231,
         0.7229,  0.1217]) tensor([-0.4830, -0.4513,  1.6309,  1.3870, -0.8058, -0.2255,  0.0095, -0.6919,
         0.7994,  0.1073])
R[0]
tensor([-0.0048], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2, self.loss_VAE
0.030352426677942277 0.004225414296251983 0.027583783397218214 0.011717409119009972 0.07549799115210772 0.9459268748760223 0.015321112579666079 53.36420378112793
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.0076, -1.0156,  1.1520, 



Testing score per episode (id: 0) is 2.0 (average over 1 episode(s))
== Mean score per episode is 1.9998000199980002 over 1 episodes ==


  abs_states[i:i+1], torch.as_tensor([action_encoding])
  plt.show()


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.0442, -0.1661,  1.4172,  1.4498, -0.7383, -0.2926, -0.0364, -0.3251,
         0.3037,  0.1873]) tensor([-0.1036, -0.2485,  1.4099,  1.4063, -0.7471, -0.3041,  0.0411, -0.4222,
         0.4491,  0.0308]) tensor([-0.4967, -0.3210,  1.5247,  1.5180, -0.7102, -0.0712, -0.2419, -0.4681,
         0.4894,  0.0689])
R[0]
tensor([0.0022], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2, self.loss_VAE
0.05199839198589325 0.002694318709329309 0.00655076961251325 0.00753559243073687 0.03532428617961705 0.678129638671875 0.009760236552916466 49.87619654083252
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.1997, -0.5711,  1.0916,  1.21

Testing score per episode (id: 0) is 2.0 (average over 1 episode(s))
== Mean score per episode is 1.9998000199980002 over 1 episodes ==


  plt.show()


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.4606, -0.4164,  1.3281,  1.0229, -0.7756, -0.4201,  0.5831, -0.9080,
         0.5238,  0.3308]) tensor([ 0.0046, -0.4263,  1.3256,  1.0482, -0.8122, -0.3993,  0.5256, -0.9492,
         0.3322, -0.0997]) tensor([ 0.2888, -0.4453,  1.3543,  1.0515, -0.7943, -0.4023,  0.5392, -0.9313,
        -0.0319, -0.1637])
R[0]
tensor([0.0036], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2, self.loss_VAE
0.05351440098136664 0.0030754895334466708 0.00546531065783347 0.0077868165918625895 0.03474021206796169 0.8323102221488953 0.009228115730918944 48.86151324462891
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.1716, -0.5537,  1.5512,  

Testing score per episode (id: 0) is 14.0 (average over 1 episode(s))
== Mean score per episode is 13.998600139986001 over 1 episodes ==


  plt.show()


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.2590, -0.6452,  1.3061,  1.0378, -0.6222, -0.1865,  0.1096, -0.6731,
         0.1829,  0.1592]) tensor([-0.1039, -0.6589,  1.2860,  1.0579, -0.6451, -0.1841,  0.1111, -0.7102,
         0.4185, -0.0035]) tensor([-0.1297, -0.6326,  1.2572,  1.0285, -0.6151, -0.2158,  0.1566, -0.6490,
         0.1639,  0.1455])
R[0]
tensor([-0.0048], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2, self.loss_VAE
0.052606833927333355 0.003407142484680662 0.004325878029674641 0.007192716043908149 0.03525588376075029 0.762849146604538 0.009110903750173748 47.89295104217529
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.6630, -0.9089,  0.9464,  

Testing score per episode (id: 0) is 7.0 (average over 1 episode(s))
== Mean score per episode is 6.999300069993001 over 1 episodes ==


  plt.show()


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.4001, -0.5041,  1.3490,  1.2146, -0.7346, -0.2152,  0.0597, -0.6470,
         1.0009,  0.8121]) tensor([-0.0354, -0.5415,  1.3426,  1.1619, -0.7121, -0.2231,  0.1092, -0.6788,
         0.3722,  0.0912]) tensor([ 0.2104, -0.4843,  1.3150,  1.1872, -0.7400, -0.2574,  0.1142, -0.6314,
        -0.1334,  0.1354])
R[0]
tensor([-0.0037], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2, self.loss_VAE
0.05344015694409609 0.004110423419091603 0.0033335831182339463 0.006944776912219822 0.034880387097597124 0.7053435447216034 0.010222952811513097 47.23030869293213
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.0309, -0.7432,  1.2116,

Testing score per episode (id: 0) is 50.0 (average over 1 episode(s))
== Mean score per episode is 49.99500049995 over 1 episodes ==


  ax.scatter(
  plt.show()


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.2369, -0.6599,  1.1558,  1.0261, -0.6580, -0.1943,  0.1802, -0.7442,
         0.3511, -0.2240]) tensor([-0.0607, -0.6814,  1.1873,  1.0426, -0.6716, -0.1923,  0.1869, -0.7772,
         0.2349,  0.0075]) tensor([ 0.9743, -0.6304,  1.0475,  0.9908, -0.6159, -0.2370,  0.2831, -0.7068,
         0.1296,  0.1150])
R[0]
tensor([0.0021], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2, self.loss_VAE
0.05465754283964634 0.004287948152003083 0.0037573764743865467 0.007231499769957736 0.03370983035117388 0.6834487912654876 0.010145923571661115 46.65123282623291
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.6985, -0.6738,  1.1544,  

Testing score per episode (id: 0) is 26.0 (average over 1 episode(s))
== Mean score per episode is 25.997400259974004 over 1 episodes ==


  plt.show()


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.4906, -1.2077,  1.7338,  0.4944, -0.2838,  0.0784,  0.2049, -1.2376,
         1.4229,  0.4229]) tensor([-0.0206, -1.1535,  1.6416,  0.5212, -0.2790,  0.1162,  0.1825, -1.2231,
         0.7576,  0.2808]) tensor([ 0.0746, -1.3307,  1.7914,  0.3676, -0.1924,  0.1205,  0.1970, -1.3048,
         0.7278,  0.2200])
R[0]
tensor([-5.0046e-05], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2, self.loss_VAE
0.05499792963266373 0.0036458955029775096 0.003639038102242921 0.0060910001888405535 0.033907878648489716 0.6472731496095657 0.008565362791065127 45.902900154113766
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.0698, -0.7018,  1

Testing score per episode (id: 0) is 64.0 (average over 1 episode(s))
== Mean score per episode is 63.993600639936005 over 1 episodes ==


  ax.scatter(
  plt.show()


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.2755, -1.1637,  1.0123,  0.0260,  0.0697, -0.0471,  0.4730, -0.9172,
         0.4305,  0.4304]) tensor([-2.4340e-01, -1.1130e+00,  1.0956e+00,  2.2806e-01, -4.5176e-02,
        -8.8833e-05,  3.6179e-01, -9.4774e-01,  2.2437e-01,  6.2476e-02]) tensor([-0.7514, -0.9706,  1.0495,  0.4186, -0.2382, -0.0628,  0.3256, -0.9042,
         0.0371,  0.4421])
R[0]
tensor([0.0072], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2, self.loss_VAE
0.053540505811572076 0.0035099637223102035 0.003254573781901854 0.00573328065732494 0.03417489617317915 0.5404889825582504 0.007992826852481812 43.574907363891604
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 0.0
Es[0], TEs[0], Esp

Testing score per episode (id: 0) is 99.0 (average over 1 episode(s))
== Mean score per episode is 98.99010098990101 over 1 episodes ==


  ax.scatter(
  plt.show()


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.1173, -0.5101,  1.1152,  0.9552, -0.8195, -0.3738,  0.5918, -1.0237,
        -0.2215, -0.2401]) tensor([ 0.0513, -0.5074,  1.0951,  0.9644, -0.8324, -0.3520,  0.5506, -1.0487,
         0.0575, -0.0457]) tensor([-0.2568, -0.5252,  1.1294,  0.9361, -0.8130, -0.3781,  0.5827, -1.0254,
        -0.2140, -0.5821])
R[0]
tensor([-0.0001], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2, self.loss_VAE
0.05360113993287086 0.00485722878407978 0.003229791264217056 0.006360047461465001 0.03435185860656202 0.4906564502120018 0.00850651432806626 42.96613729858399
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.4613, -0.4012,  0.9047,  0.

Testing score per episode (id: 0) is 98.0 (average over 1 episode(s))
== Mean score per episode is 97.99020097990201 over 1 episodes ==


  ax.scatter(
  plt.show()


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.6876, -0.5815,  1.1421,  0.9636, -0.8039, -0.3142,  0.5644, -1.1189,
        -0.1134, -0.1040]) tensor([ 0.0582, -0.5575,  1.1537,  0.9685, -0.7836, -0.2859,  0.5251, -1.0888,
         0.1034, -0.0257]) tensor([-0.3876, -0.5974,  1.1368,  0.9496, -0.8155, -0.3124,  0.5441, -1.1137,
         0.0883,  0.5951])
R[0]
tensor([0.0040], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2, self.loss_VAE
0.05317619341611862 0.0037479404478235664 0.0025616503708770326 0.005620019535999745 0.033678467635065316 0.4786956864595413 0.008528361968230456 42.67402359008789
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.3452, -0.5030,  1.1317,

Testing score per episode (id: 0) is 100.0 (average over 1 episode(s))
== Mean score per episode is 99.9900009999 over 1 episodes ==


  ax.scatter(
  plt.show()


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.3048, -0.4181,  1.0582,  1.1651, -0.6074, -0.2141,  0.1262, -0.5159,
         0.1245,  0.2211]) tensor([-0.0346, -0.4570,  1.0466,  1.1399, -0.6244, -0.2120,  0.1208, -0.5804,
         0.0237, -0.0727]) tensor([-0.0188, -0.4320,  1.1481,  1.2576, -0.5908, -0.0903, -0.0245, -0.5233,
        -0.5285,  0.1376])
R[0]
tensor([0.0021], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2, self.loss_VAE
0.05330820053070784 0.0039051887722598624 0.002637557925834699 0.006043092942563817 0.03415632300451398 0.47979560953378675 0.00783381005981937 42.544046989440915
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.0296, -0.6058,  1.3013, 

Testing score per episode (id: 0) is 106.0 (average over 1 episode(s))
== Mean score per episode is 105.989401059894 over 1 episodes ==


  ax.scatter(
  plt.show()


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.6872, -0.7166,  1.2176,  0.9909, -0.8686, -0.2411,  0.4329, -1.2201,
         0.3123, -0.5866]) tensor([ 0.0187, -0.6913,  1.2068,  0.9527, -0.8189, -0.2222,  0.4035, -1.1654,
         0.1115, -0.0659]) tensor([-0.3719, -0.6521,  1.1594,  0.9494, -0.8356, -0.2856,  0.5118, -1.1713,
        -0.3560, -0.5001])
R[0]
tensor([-0.0007], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2, self.loss_VAE
0.0531440357118845 0.004003672052149341 0.0024178203909687 0.005884997563669458 0.03409904981032014 0.4593898646235466 0.007890491175930946 42.58304334259033
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.2889, -0.5606,  0.9525,  1.0

Testing score per episode (id: 0) is 99.0 (average over 1 episode(s))
== Mean score per episode is 98.99010098990101 over 1 episodes ==


  ax.scatter(
  plt.show()


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.1096, -0.5228,  0.9910,  1.0960, -0.5555, -0.1084,  0.1164, -0.5947,
        -0.0893,  0.1258]) tensor([-0.1114, -0.5397,  1.0241,  1.0973, -0.5516, -0.0876,  0.1034, -0.6172,
         0.0921, -0.0449]) tensor([-0.0346, -0.5663,  1.1026,  1.0702, -0.5325, -0.1007,  0.1140, -0.5993,
         0.1355,  0.2046])
R[0]
tensor([-0.0021], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2, self.loss_VAE
0.05298052956163883 0.0035044425317828426 0.002718408272417946 0.0059213942750357094 0.034073834359645845 0.41216207218170164 0.007837328062858433 42.23785514068604
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.0965, -0.5064,  0.701

Testing score per episode (id: 0) is 117.0 (average over 1 episode(s))
== Mean score per episode is 116.98830116988302 over 1 episodes ==


  ax.scatter(
  plt.show()


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.2903, -0.6950,  0.9543,  1.0203, -0.6343, -0.1161,  0.0942, -0.7525,
        -0.2375,  0.1820]) tensor([-0.1292, -0.6818,  0.9763,  1.0014, -0.5784, -0.0969,  0.0755, -0.7008,
         0.1087, -0.0297]) tensor([-0.0249, -0.6898,  1.0978,  1.1349, -0.6602, -0.0438, -0.0772, -0.7334,
         0.0590,  0.1374])
R[0]
tensor([-0.0009], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2, self.loss_VAE
0.05274410558491945 0.004193886797856976 0.002989927728642215 0.005952146195573732 0.033697807051241396 0.43663395911455155 0.007839851914905011 42.5223217086792
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.1911, -0.4913,  0.8276, 

Testing score per episode (id: 0) is 118.0 (average over 1 episode(s))
== Mean score per episode is 117.98820117988201 over 1 episodes ==


  ax.scatter(
  plt.show()


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.0343, -0.0126,  0.8052,  0.6582, -0.4413,  0.0560,  0.1217, -1.1737,
        -0.5784,  0.6910]) tensor([ 0.0177, -0.0627,  0.8514,  0.6608, -0.4930, -0.0424,  0.2351, -1.1827,
        -0.1154, -0.0847]) tensor([-0.0285, -0.0895,  0.7484,  0.5644, -0.4687,  0.1680, -0.0196, -1.2444,
         0.1280, -0.2383])
R[0]
tensor([0.0632], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2, self.loss_VAE
0.05273013079166412 0.003973735508734535 0.002972322411093046 0.005777182741323486 0.033593815930187704 0.42357069182395934 0.008085406223777682 42.522126007080075
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.2442, -0.7618,  0.6682,

Testing score per episode (id: 0) is 87.0 (average over 1 episode(s))
== Mean score per episode is 86.991300869913 over 1 episodes ==


  ax.scatter(
  plt.show()


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.6344, -0.6843,  1.2302,  0.9551, -0.8792, -0.2490,  0.3894, -1.1994,
        -0.0752,  0.4887]) tensor([ 0.0106, -0.6661,  1.2011,  0.9346, -0.8487, -0.2620,  0.4139, -1.1577,
         0.1217, -0.0296]) tensor([-0.4426, -0.6863,  1.2780,  0.9891, -0.8852, -0.2507,  0.3841, -1.2010,
         0.7343, -0.4926])
R[0]
tensor([0.0010], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2, self.loss_VAE
0.053249672934412955 0.0037855795121540725 0.002327739889755321 0.005690590754151344 0.03411541969701648 0.4192601398229599 0.007967819712590427 42.3043949584961
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.3887, -0.5679,  1.1116,  

Testing score per episode (id: 0) is 115.0 (average over 1 episode(s))
== Mean score per episode is 114.98850114988501 over 1 episodes ==


  ax.scatter(
  plt.show()


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.3708, -0.4810,  1.0971,  1.1804, -0.5633, -0.1146,  0.0344, -0.5041,
        -0.0545, -0.6487]) tensor([-0.1090, -0.5264,  1.0987,  1.1275, -0.5679, -0.1447,  0.0868, -0.5391,
         0.1463, -0.0180]) tensor([ 0.5050, -0.6115,  1.1649,  1.0546, -0.6676, -0.1706,  0.1039, -0.7083,
         0.9197, -0.5106])
R[0]
tensor([-0.0050], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2, self.loss_VAE
0.05296795478463173 0.0036850109001134116 0.002504532242734058 0.005705471157794818 0.03302979025617242 0.4250154520273209 0.007966527359560133 42.41981079101563
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.1961, -0.6321,  0.8430, 

Testing score per episode (id: 0) is 88.0 (average over 1 episode(s))
== Mean score per episode is 87.99120087991201 over 1 episodes ==


  ax.scatter(
  plt.show()


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.9345, -0.8907,  1.0331,  0.5010, -0.0684,  0.1917,  0.0703, -0.7125,
         0.4468,  0.6868]) tensor([-0.2295, -0.9008,  0.9679,  0.4772, -0.0992,  0.1890,  0.0602, -0.7765,
         0.2226,  0.0582]) tensor([-0.3876, -1.1580,  1.1573,  0.1113,  0.0756,  0.1635,  0.0729, -0.8255,
         0.7021,  0.4461])
R[0]
tensor([0.0120], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2, self.loss_VAE
0.05266468677669764 0.0037693392027231313 0.002496048041186441 0.0060253155259415506 0.03388444599509239 0.4302035987377167 0.0074961930485442285 42.5674253616333
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.0995, -0.4784,  1.1279, 

Testing score per episode (id: 0) is 113.0 (average over 1 episode(s))
== Mean score per episode is 112.98870112988702 over 1 episodes ==


  ax.scatter(
  plt.show()


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.1088, -1.1098,  1.2063,  0.1213, -0.0341, -0.1148, -0.0280, -0.6137,
         0.5521,  0.4824]) tensor([-0.2840, -1.0626,  1.1946,  0.1795, -0.0059, -0.0760, -0.0347, -0.5838,
         0.3216,  0.1281]) tensor([-0.0145, -1.0176,  1.1237,  0.3721, -0.2360, -0.1280, -0.0199, -0.6734,
         0.2947,  0.2304])
R[0]
tensor([0.0023], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2, self.loss_VAE
0.05238526930660009 0.004003838790607915 0.0025625396906598327 0.005970996242482215 0.033522969208657744 0.42753854048252105 0.007531527810264379 42.6003258895874
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.0011, -0.7054,  1.0743, 

Testing score per episode (id: 0) is 85.0 (average over 1 episode(s))
== Mean score per episode is 84.99150084991501 over 1 episodes ==


  ax.scatter(
  plt.show()


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.1818, -1.4656,  0.9672, -0.3937,  0.5020,  0.2026,  0.7346, -1.1934,
         0.1068,  0.3233]) tensor([-0.3907, -1.3882,  0.9745, -0.2688,  0.3999,  0.2534,  0.5589, -1.1908,
         0.3287,  0.1149]) tensor([-0.6279, -1.0561,  0.7817,  0.3227, -0.0793,  0.0818,  0.6205, -1.1234,
        -0.1078,  0.1223])
R[0]
tensor([-0.0367], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2, self.loss_VAE
0.052468593217432496 0.0037550591855560925 0.0024337966691000473 0.005829234872246161 0.033465767480432985 0.42938923358917236 0.007619267662521452 42.465772804260254
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.2599, -0.7768,  0.8

Testing score per episode (id: 0) is 111.0 (average over 1 episode(s))
== Mean score per episode is 110.98890110988901 over 1 episodes ==


  ax.scatter(
  plt.show()


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.0045, -0.5186,  1.1328,  1.1792, -0.6193, -0.0988, -0.0398, -0.5705,
         0.6420,  0.2344]) tensor([-0.0457, -0.5361,  1.0948,  1.1468, -0.6211, -0.1067, -0.0075, -0.6120,
         0.0936, -0.0205]) tensor([ 0.1793, -0.5585,  1.0519,  1.1291, -0.6262, -0.1619,  0.0676, -0.5838,
        -0.4446,  0.3503])
R[0]
tensor([-0.0007], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2, self.loss_VAE
0.05260105015337467 0.004076021988248612 0.0022193772265181907 0.005983002248452976 0.03376371479034424 0.4270752133727074 0.007574804403819144 42.473583679199216
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.1314, -0.1683,  0.5247,

Testing score per episode (id: 0) is 99.0 (average over 1 episode(s))
== Mean score per episode is 98.99010098990101 over 1 episodes ==


  ax.scatter(
  plt.show()


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.5028, -0.5208,  1.0408,  0.9500, -0.7864, -0.3795,  0.6299, -0.9941,
         0.0359, -1.0937]) tensor([-0.0038, -0.5219,  1.0498,  0.9223, -0.7676, -0.3627,  0.6049, -0.9845,
         0.0068, -0.1870]) tensor([-0.5562, -0.5572,  1.0601,  0.9213, -0.7484, -0.3318,  0.6535, -1.0799,
         0.5796, -0.3187])
R[0]
tensor([-0.0003], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2, self.loss_VAE
0.05291664337366819 0.004051132584550942 0.002293280171873448 0.006039519598009065 0.03339914213865995 0.4240256954431534 0.007395779558923095 42.29952210235596
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.1447, -0.8183,  1.0042,  

Testing score per episode (id: 0) is 91.0 (average over 1 episode(s))
== Mean score per episode is 90.99090090990902 over 1 episodes ==


  ax.scatter(
  plt.show()


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.4618, -1.0203,  1.1338,  0.1507,  0.0599, -0.1443,  0.1912, -0.5320,
         0.2163, -0.1645]) tensor([-0.2819, -0.9884,  1.1647,  0.2365,  0.0440, -0.1070,  0.1620, -0.5280,
         0.2596,  0.1456]) tensor([-0.4571, -1.0266,  1.1262,  0.1577,  0.0641, -0.1616,  0.2306, -0.5515,
        -0.2068, -0.0288])
R[0]
tensor([-0.0028], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2, self.loss_VAE
0.052457638770341875 0.00443154367000534 0.0021569085937917405 0.006549619593424722 0.03423248647153378 0.43766048389673234 0.007342720545828342 42.519056617736815
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.0044, -0.8161,  0.7774

Testing score per episode (id: 0) is 117.0 (average over 1 episode(s))
== Mean score per episode is 116.98830116988302 over 1 episodes ==


  ax.scatter(
  plt.show()


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.1036, -0.2125,  0.7297,  0.7093, -0.2260,  0.4632, -0.0295, -1.4263,
        -0.0169,  0.2397]) tensor([-0.0287, -0.2571,  0.7898,  0.7085, -0.3041,  0.2511,  0.1578, -1.3020,
        -0.0160, -0.0255]) tensor([ 0.3031, -0.6126,  0.9471,  0.9188, -0.7818, -0.3002,  0.6019, -1.1377,
         0.4892,  0.0771])
R[0]
tensor([0.3797], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2, self.loss_VAE
0.05258479810506105 0.004643085019113641 0.0023468521308750495 0.0063810566685860975 0.033204464487731455 0.42326321750879287 0.007450483383610844 42.436827224731445
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.1047, -0.5221,  1.016

Testing score per episode (id: 0) is 109.0 (average over 1 episode(s))
== Mean score per episode is 108.98910108989101 over 1 episodes ==


  ax.scatter(
  plt.show()


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.0901, -0.8010,  0.8741,  0.3850, -0.1469, -0.2379,  0.1907, -0.3694,
        -0.2001,  0.0432]) tensor([-0.2057, -0.8275,  0.8674,  0.4156, -0.1891, -0.1769,  0.1351, -0.5111,
         0.0673,  0.0315]) tensor([-0.5714, -1.2379,  1.2318,  0.4286, -0.2356,  0.0026,  0.2712, -1.1260,
         0.4514,  0.4364])
R[0]
tensor([0.0162], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2, self.loss_VAE
0.05243603336066008 0.0038857715215854112 0.0021498797364965867 0.005889462770661339 0.033757249083369974 0.4138888111114502 0.007582979323808104 42.39364839172363
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.1523, -0.5930,  1.1982,

Testing score per episode (id: 0) is 102.0 (average over 1 episode(s))
== Mean score per episode is 101.98980101989801 over 1 episodes ==


  ax.scatter(
  plt.show()


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.1302, -0.6265,  1.0555,  1.0037, -0.7606, -0.2377,  0.1999, -0.8277,
        -0.0683,  0.4786]) tensor([-0.0303, -0.6324,  1.1026,  1.0156, -0.7274, -0.2048,  0.1803, -0.8203,
         0.1281, -0.0193]) tensor([-0.2642, -0.7730,  1.3017,  1.0554, -0.8006, -0.0596, -0.0089, -1.0362,
         0.5510, -0.6746])
R[0]
tensor([0.0019], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2, self.loss_VAE
0.052929519139230254 0.004282393519193647 0.0020569291982901632 0.005974638394545764 0.03331246193870902 0.42293008568882945 0.007165010320488364 42.45098928070068
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.5578, -0.6249,  1.0849,

Testing score per episode (id: 0) is 129.0 (average over 1 episode(s))
== Mean score per episode is 128.98710128987102 over 1 episodes ==


  ax.scatter(
  plt.show()


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.2003, -0.8862,  1.3182,  1.1076, -0.9056, -0.0085, -0.2435, -1.0705,
         0.5845, -0.2436]) tensor([-0.0241, -0.8692,  1.3171,  1.0881, -0.8125,  0.0124, -0.2306, -1.0039,
         0.3105,  0.1044]) tensor([ 0.1780, -0.8285,  1.4465,  1.1672, -0.8712,  0.0685, -0.3547, -1.0361,
         0.7120, -0.3729])
R[0]
tensor([0.0031], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2, self.loss_VAE
0.05229800739139318 0.0038481271659074993 0.0022093022774961357 0.005604055622359738 0.034237861104309555 0.41125810754299164 0.007368549291975796 42.484813262939454
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.2102, -0.7385,  1.127

Testing score per episode (id: 0) is 96.0 (average over 1 episode(s))
== Mean score per episode is 95.99040095990401 over 1 episodes ==


  ax.scatter(
  plt.show()


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.7828, -0.8298,  0.9706,  0.3588, -0.0598, -0.1760,  0.2166, -0.4725,
         0.6056, -0.3107]) tensor([-0.2364, -0.8162,  0.9962,  0.4505, -0.1299, -0.1673,  0.1934, -0.5105,
         0.0969,  0.0685]) tensor([-0.2793, -0.8564,  0.9881,  0.3374, -0.0490, -0.1978,  0.2835, -0.4766,
         0.9477, -0.6087])
R[0]
tensor([0.0041], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2, self.loss_VAE
0.052394131377339365 0.003983947041469946 0.0021471598008320145 0.005962467807810754 0.03385777585580945 0.4096962563991547 0.007584244304802269 42.44449405670166
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.3492, -0.4047,  0.8671, 

Testing score per episode (id: 0) is 107.0 (average over 1 episode(s))
== Mean score per episode is 106.98930106989302 over 1 episodes ==


  ax.scatter(
  plt.show()


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.3356, -0.7838,  1.1545,  0.9605, -0.8675, -0.1391,  0.1145, -1.1212,
        -0.4876,  0.5444]) tensor([-0.0051, -0.7665,  1.1308,  0.9641, -0.8417, -0.1628,  0.1577, -1.0751,
         0.1672,  0.0201]) tensor([ 0.2953, -0.7218,  1.1389,  0.9618, -0.8377, -0.2126,  0.2738, -1.1124,
         0.3459, -0.6393])
R[0]
tensor([-0.0036], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2, self.loss_VAE
0.052655362166464326 0.0042216123369635174 0.0025144584079625928 0.00604132378147915 0.033625356800854204 0.41744732770323756 0.007673738304525614 42.54315280151367
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.1392, -0.5161,  0.937

Testing score per episode (id: 0) is 129.0 (average over 1 episode(s))
== Mean score per episode is 128.98710128987102 over 1 episodes ==


  ax.scatter(
  plt.show()


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.4090, -0.5140,  0.9851,  0.8879, -0.7180, -0.3388,  0.6589, -0.9887,
         0.3344, -0.7890]) tensor([-0.0016, -0.5101,  0.9751,  0.8919, -0.7512, -0.3351,  0.6240, -1.0121,
         0.0296, -0.0849]) tensor([-0.2755, -0.5451,  1.0315,  0.8529, -0.6451, -0.2989,  0.7207, -1.0394,
         0.6311, -0.3585])
R[0]
tensor([0.0025], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2, self.loss_VAE
0.0530200654566288 0.003982096736866879 0.0018956092577145681 0.00585015779430978 0.033652182295918466 0.40886970442533493 0.007405484125483781 42.382374855041505
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.4966, -0.6629,  1.3377, 

Testing score per episode (id: 0) is 98.0 (average over 1 episode(s))
== Mean score per episode is 97.99020097990201 over 1 episodes ==


  ax.scatter(
  plt.show()


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.5861, -0.8929,  1.0283,  0.4105, -0.2144, -0.2549,  0.2987, -0.5966,
         0.1963,  0.4634]) tensor([-0.1689, -0.8862,  0.9692,  0.3896, -0.2105, -0.2033,  0.2478, -0.6757,
         0.0781,  0.0451]) tensor([-0.2808, -0.7846,  1.0248,  0.6173, -0.3459, -0.2539,  0.3194, -0.6123,
         0.1860,  0.1156])
R[0]
tensor([-0.0045], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2, self.loss_VAE
0.05238779255747795 0.004397968792147367 0.001980773262805087 0.006213240211363882 0.03364926026389003 0.4251742123961449 0.007155030610971153 42.59016069793701
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.1656, -0.8060,  0.8177,  

Testing score per episode (id: 0) is 97.0 (average over 1 episode(s))
== Mean score per episode is 96.99030096990302 over 1 episodes ==


  ax.scatter(
  plt.show()


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.6096, -0.3589,  0.9109,  1.3421, -0.3893,  0.0592, -0.1058, -0.3052,
         0.2567, -0.0271]) tensor([-0.0810, -0.4033,  0.9419,  1.3085, -0.3970,  0.0738, -0.1208, -0.3654,
         0.0982,  0.0348]) tensor([ 0.2447, -0.3257,  0.8690,  1.4026, -0.3499,  0.0963, -0.1312, -0.2671,
         0.0112, -0.0731])
R[0]
tensor([-0.0015], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2, self.loss_VAE
0.052661598727107045 0.004231201931523174 0.001961664189815565 0.00579851605091244 0.034257917379960416 0.4376180119514465 0.007314007812179625 42.69823840332031
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-2.7061e-05, -5.8059e-01,  

Testing score per episode (id: 0) is 119.0 (average over 1 episode(s))
== Mean score per episode is 118.98810118988101 over 1 episodes ==


  ax.scatter(
  plt.show()


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.2345, -0.6884,  1.0687,  1.0236, -0.6377, -0.1412,  0.0441, -0.6842,
        -0.3606,  1.0845]) tensor([-0.0527, -0.7072,  1.0621,  1.0246, -0.6337, -0.1445,  0.0552, -0.7191,
         0.0794,  0.0795]) tensor([-0.2966, -0.6599,  0.9241,  0.9718, -0.6211, -0.1738,  0.1514, -0.6599,
        -0.1497,  0.0028])
R[0]
tensor([-0.0052], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2, self.loss_VAE
0.05269985917210579 0.004439254051345415 0.00269641890818275 0.006124531554058194 0.03383085222542286 0.44498284697532653 0.007318381372839212 42.80504772949219
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.3787, -0.7272,  0.7569,  

Testing score per episode (id: 0) is 132.0 (average over 1 episode(s))
== Mean score per episode is 131.98680131986802 over 1 episodes ==


  ax.scatter(
  plt.show()


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.0308, -0.8422,  1.5999,  1.2415, -0.6511,  0.2910, -0.6903, -0.8513,
         0.5419,  0.2892]) tensor([-0.0402, -0.8151,  1.4477,  1.1707, -0.6067,  0.2275, -0.5253, -0.8207,
         0.4224,  0.1672]) tensor([-0.2082, -0.9245,  1.4608,  1.0577, -0.5798,  0.2391, -0.6083, -0.8573,
         0.4466,  0.1951])
R[0]
tensor([0.0153], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2, self.loss_VAE
0.052592651888728144 0.004800075347236998 0.0021669550417918798 0.006108349109417759 0.034139351760968564 0.4455396302640438 0.007137235651724041 42.773993080139164
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.0194, -0.5801,  1.1753

Testing score per episode (id: 0) is 132.0 (average over 1 episode(s))
== Mean score per episode is 131.98680131986802 over 1 episodes ==


  ax.scatter(
  plt.show()


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.2145, -0.5560,  0.8891,  0.8021, -0.6776, -0.2074,  0.5989, -1.1553,
        -0.4340, -0.1116]) tensor([-0.0029, -0.5506,  0.8810,  0.8161, -0.7136, -0.2054,  0.5655, -1.1782,
         0.0381, -0.0560]) tensor([-0.1571, -0.6933,  1.0691,  0.8674, -0.7620, -0.1566,  0.4740, -1.3013,
         0.6588,  0.4329])
R[0]
tensor([-0.0011], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2, self.loss_VAE
0.05244971693307161 0.004544041225264664 0.0020068374371630853 0.0060474482463905585 0.03389741912856698 0.426663200199604 0.00724793943297118 42.61693579864502
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.4147, -0.8070,  1.3925,  

Testing score per episode (id: 0) is 132.0 (average over 1 episode(s))
== Mean score per episode is 131.98680131986802 over 1 episodes ==


  ax.scatter(
  plt.show()


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.2090, -0.6284,  1.0136,  0.8358, -0.7003, -0.1653,  0.5175, -1.2678,
        -0.1348,  0.0955]) tensor([ 0.0053, -0.6025,  0.9923,  0.8044, -0.6622, -0.1529,  0.4940, -1.2203,
         0.0937, -0.0171]) tensor([ 0.1801, -0.7124,  0.9805,  0.8805, -0.7370, -0.1241,  0.4533, -1.3299,
         0.3709,  0.1904])
R[0]
tensor([-0.0002], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2, self.loss_VAE
0.052856932081282136 0.004689643746032744 0.001740953503505807 0.006118977289414033 0.033902277156710625 0.42260562247037886 0.007394261364359409 42.695368034362794
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.2842, -0.7162,  0.695

Testing score per episode (id: 0) is 132.0 (average over 1 episode(s))
== Mean score per episode is 131.98680131986802 over 1 episodes ==


  ax.scatter(
  plt.show()


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.1623, -0.6259,  1.1736,  1.1168, -0.5964, -0.0752, -0.0585, -0.6143,
        -0.3361, -0.0724]) tensor([-0.0822, -0.6274,  1.1569,  1.0859, -0.5565, -0.0568, -0.0477, -0.6182,
         0.2054,  0.0517]) tensor([-0.4671, -0.6309,  0.9855,  0.9712, -0.6024, -0.1940,  0.1445, -0.6349,
         0.1512,  0.4265])
R[0]
tensor([-0.0001], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2, self.loss_VAE
0.052825918555259706 0.004116744812246907 0.0019128811362988927 0.005819176183547824 0.033868471700698136 0.4197612495422363 0.0071894048852846025 42.663539726257326
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.3059, -0.6045,  1.11

Testing score per episode (id: 0) is 124.0 (average over 1 episode(s))
== Mean score per episode is 123.98760123987601 over 1 episodes ==


  ax.scatter(
  plt.show()


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.3697, -0.6100,  0.9598,  1.0888, -0.6471, -0.1351, -0.0435, -0.5803,
         0.1782, -0.0154]) tensor([-0.0620, -0.6327,  1.0245,  1.0959, -0.6094, -0.1037, -0.0538, -0.5817,
         0.1059,  0.0331]) tensor([-0.4847, -0.5727,  1.0786,  1.1430, -0.5589, -0.0379, -0.1511, -0.5269,
        -0.0602,  0.3043])
R[0]
tensor([0.0007], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2, self.loss_VAE
0.05276254904270172 0.004523670692657106 0.0019633494716554197 0.005676254858379253 0.034732470978051426 0.4097990151643753 0.007495580712333322 42.56948078155518
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.4086, -1.1392,  1.1065, 

Testing score per episode (id: 0) is 124.0 (average over 1 episode(s))
== Mean score per episode is 123.98760123987601 over 1 episodes ==


  ax.scatter(
  plt.show()


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.2101, -0.5930,  0.8494,  0.7786, -0.7322, -0.0717,  0.3085, -1.3263,
        -0.1159, -0.0872]) tensor([-0.0032, -0.5935,  0.9006,  0.7720, -0.6815, -0.0498,  0.2785, -1.2754,
         0.0644, -0.0042]) tensor([ 0.5060, -0.5795,  0.8018,  0.7145, -0.6622, -0.0770,  0.4183, -1.3147,
        -0.1387, -0.6178])
R[0]
tensor([-0.0006], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2, self.loss_VAE
0.052653663851320746 0.004942065833360175 0.001747807772066153 0.005900198440998793 0.03386859978735447 0.4181776105761528 0.0072314839260652665 42.61050804901123
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-1.0563, -1.2158,  0.9323,

Testing score per episode (id: 0) is 132.0 (average over 1 episode(s))
== Mean score per episode is 131.98680131986802 over 1 episodes ==


  ax.scatter(
  plt.show()


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.3673, -0.5096,  0.6562,  0.8445, -0.6373, -0.3373,  0.3740, -0.6292,
        -0.5344,  0.0531]) tensor([-0.0508, -0.5636,  0.7791,  0.8930, -0.6630, -0.3199,  0.3398, -0.6703,
        -0.0583, -0.0757]) tensor([-0.6162, -0.5420,  0.7506,  0.8951, -0.6379, -0.3165,  0.3192, -0.6129,
        -0.3939,  0.1067])
R[0]
tensor([0.0049], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2, self.loss_VAE
0.05275624990463257 0.004329152084386806 0.0015255139800483448 0.005626686625182628 0.03402067817933858 0.4160116692781448 0.007865904303267598 42.6309701461792
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.2975, -0.9261,  1.1231,  0

Testing score per episode (id: 0) is 132.0 (average over 1 episode(s))
== Mean score per episode is 131.98680131986802 over 1 episodes ==


  ax.scatter(
  plt.show()


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.3198, -0.7562,  1.1978,  1.1153, -0.6327,  0.0553, -0.2752, -0.7283,
         0.8475,  0.1252]) tensor([-0.0836, -0.7548,  1.1818,  1.0750, -0.5637,  0.0727, -0.2610, -0.7057,
         0.2262,  0.0913]) tensor([-0.3190, -0.6454,  1.1113,  1.1684, -0.5980,  0.0163, -0.1647, -0.6291,
        -0.2424,  0.0899])
R[0]
tensor([0.0016], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2, self.loss_VAE
0.05282614380121231 0.004642024677445079 0.0019011136401823024 0.005991770030464977 0.034322577103972435 0.4120712466239929 0.00762473339214921 42.591102584838865
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.7811, -1.1145,  1.2190, 

Testing score per episode (id: 0) is 134.0 (average over 1 episode(s))
== Mean score per episode is 133.986601339866 over 1 episodes ==


  ax.scatter(
  plt.show()


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.0016, -0.7964,  1.4901,  1.1633, -0.8284,  0.0454, -0.2429, -1.0337,
         0.1913,  0.1899]) tensor([-0.0387, -0.7638,  1.4169,  1.1270, -0.7647,  0.0517, -0.1980, -0.9989,
         0.3667,  0.1145]) tensor([-0.3851, -0.6199,  1.4047,  1.1627, -0.6801,  0.0414, -0.2106, -0.8045,
         0.8514,  0.4861])
R[0]
tensor([0.0011], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2, self.loss_VAE
0.052954287827014924 0.004721677374562205 0.0019430650932208665 0.006269546497613192 0.03383141314983368 0.41284500309824945 0.007323110060300678 42.58208409118652
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.1830, -0.6549,  1.1130,

Testing score per episode (id: 0) is 132.0 (average over 1 episode(s))
== Mean score per episode is 131.98680131986802 over 1 episodes ==


  ax.scatter(
  plt.show()


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.5474, -0.7715,  1.3367,  1.1043, -0.6340,  0.0504, -0.2367, -0.7697,
         0.2643, -0.8406]) tensor([-0.1035, -0.7726,  1.3022,  1.0403, -0.5774,  0.0508, -0.2067, -0.7477,
         0.2868,  0.0792]) tensor([-0.1993, -0.7061,  1.1618,  1.0850, -0.6351, -0.0606, -0.0750, -0.6981,
        -0.1087, -0.0158])
R[0]
tensor([-0.0007], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2, self.loss_VAE
0.053125540636479854 0.004190984825068881 0.0013398867484602306 0.0055831608483567835 0.03414855994842946 0.41381361919641496 0.007429423925932497 42.56330660247803
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.2481, -0.6813,  1.055

Testing score per episode (id: 0) is 135.0 (average over 1 episode(s))
== Mean score per episode is 134.98650134986502 over 1 episodes ==


  ax.scatter(
  plt.show()


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.2058, -0.4600,  1.0707,  1.2337, -0.4607, -0.0076, -0.0490, -0.4502,
        -0.1251, -0.7685]) tensor([-9.0623e-02, -4.8363e-01,  1.0569e+00,  1.1847e+00, -4.4161e-01,
         9.0895e-04, -4.4079e-02, -4.8487e-01,  1.0658e-01,  3.5771e-02]) tensor([-0.9437, -0.4449,  1.0445,  1.2210, -0.4753, -0.0456, -0.0061, -0.4425,
         0.5619, -0.4102])
R[0]
tensor([-0.0018], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2, self.loss_VAE
0.05323827987164259 0.004489199343512155 0.001642631443467053 0.0059961037589237095 0.034006964832544326 0.4129493629336357 0.0073108244179748 42.51241926574707
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 0.0
Es[0], TEs[0], Esp_

Testing score per episode (id: 0) is 135.0 (average over 1 episode(s))
== Mean score per episode is 134.98650134986502 over 1 episodes ==


  ax.scatter(
  plt.show()


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.0604,  0.0899,  0.7700,  1.1369, -0.2872,  0.5623, -0.6406, -0.8865,
         0.0935,  0.4265]) tensor([ 0.0108, -0.0225,  0.8153,  1.0911, -0.3415,  0.3996, -0.4457, -0.8932,
        -0.0150, -0.0112]) tensor([ 0.4560, -0.2760,  0.9309,  1.0561, -0.2376,  1.0297, -0.8842, -1.4423,
         0.6849, -0.3974])
R[0]
tensor([0.1208], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2, self.loss_VAE
0.05278044382482767 0.004466256522424374 0.0015483521135774936 0.00606248141406104 0.03401126246154308 0.41625371962785723 0.007211745561566204 42.5304190826416
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.2131, -0.7507,  1.0600,  0

Testing score per episode (id: 0) is 133.0 (average over 1 episode(s))
== Mean score per episode is 132.986701329867 over 1 episodes ==


  ax.scatter(
  plt.show()


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.1918, -0.5682,  0.7998,  0.8387, -0.7002, -0.1556,  0.4772, -1.2153,
         0.1411,  0.1953]) tensor([ 0.0093, -0.5734,  0.8609,  0.8380, -0.6714, -0.1320,  0.4306, -1.1801,
         0.0032, -0.0332]) tensor([ 0.5638, -0.6748,  1.0391,  0.9677, -0.8260, -0.1699,  0.3271, -1.2234,
        -0.0773,  0.7164])
R[0]
tensor([-0.0007], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2, self.loss_VAE
0.053347492776811126 0.00472445832457197 0.0017119886771433813 0.006163692146190442 0.03420457236096263 0.4212376393675804 0.007421267750207335 42.43948818206787
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.4648, -0.8179,  1.3986, 

Testing score per episode (id: 0) is 125.0 (average over 1 episode(s))
== Mean score per episode is 124.98750124987501 over 1 episodes ==


  ax.scatter(
  plt.show()


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.4723, -0.5035,  0.9398,  0.8901, -0.7240, -0.3360,  0.6487, -0.9730,
        -0.0924, -0.2300]) tensor([-0.0356, -0.5269,  0.9663,  0.8832, -0.7525, -0.3441,  0.6370, -0.9961,
         0.0223, -0.0351]) tensor([-0.3312, -0.5282,  0.9636,  0.8885, -0.7314, -0.3528,  0.6677, -1.0030,
        -0.1547, -0.0635])
R[0]
tensor([-0.0026], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2, self.loss_VAE
0.05285697884857655 0.004124284572124452 0.0018193283318769318 0.005732052551116794 0.03368167556449771 0.4271892120242119 0.007377067970111966 42.45393675994873
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.0159, -0.4872,  0.9205, 

Testing score per episode (id: 0) is 129.0 (average over 1 episode(s))
== Mean score per episode is 128.98710128987102 over 1 episodes ==


  ax.scatter(
  plt.show()


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.2898, -0.9217,  1.0437,  0.4152, -0.2290, -0.1192,  0.2811, -0.7423,
         0.0973, -0.4996]) tensor([-0.1729, -0.9191,  0.9817,  0.3792, -0.2331, -0.0843,  0.2349, -0.8145,
         0.1460,  0.0980]) tensor([-0.4213, -0.9293,  0.9217,  0.3168, -0.0962, -0.0876,  0.4454, -0.7710,
         0.0115,  0.0321])
R[0]
tensor([0.0064], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2, self.loss_VAE
0.0528000638410449 0.004080717799999548 0.0017217382321141486 0.00565961909876205 0.03404497668519616 0.42007965648174284 0.0074181656530126925 42.30434118652344
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.0879, -0.5568,  0.9766,  

Testing score per episode (id: 0) is 127.0 (average over 1 episode(s))
== Mean score per episode is 126.98730126987302 over 1 episodes ==


  ax.scatter(
  plt.show()


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.4413, -1.1693,  1.0453,  0.1261, -0.0941, -0.0071,  0.1890, -0.9104,
         0.3848,  0.6814]) tensor([-0.1814, -1.1125,  0.9109,  0.1236, -0.1012,  0.0465,  0.1419, -0.9818,
         0.2054,  0.1112]) tensor([-0.2615, -1.1050,  1.0002, -0.0201,  0.0514, -0.0129,  0.1658, -0.8083,
         0.7352,  0.5348])
R[0]
tensor([0.0416], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2, self.loss_VAE
0.052596515700221065 0.004301929554749222 0.0014796795093880064 0.005417072794400155 0.03447517057880759 0.4188016831278801 0.007495489842724055 42.28589402770996
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.2418, -0.0124,  0.9388, 

Testing score per episode (id: 0) is 126.0 (average over 1 episode(s))
== Mean score per episode is 125.98740125987402 over 1 episodes ==


  ax.scatter(
  plt.show()


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.4266, -0.6201,  1.1499,  1.0926, -0.5943, -0.1082, -0.0070, -0.5875,
         0.2006,  0.3858]) tensor([-0.0515, -0.6246,  1.1077,  1.0779, -0.6023, -0.1113,  0.0173, -0.6295,
         0.1376,  0.0276]) tensor([ 0.8512, -0.5768,  1.2807,  1.1408, -0.6125, -0.0793, -0.1022, -0.5931,
        -0.3059,  0.2428])
R[0]
tensor([0.0043], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2, self.loss_VAE
0.053029097661376 0.0046811138080702225 0.0018029542921526626 0.005690684226690792 0.033766132716089484 0.42409024673700335 0.0075583585347048935 42.321840385437014
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.1671, -0.6929,  1.1444

Testing score per episode (id: 0) is 127.0 (average over 1 episode(s))
== Mean score per episode is 126.98730126987302 over 1 episodes ==


  ax.scatter(
  plt.show()


Best neural net obtained after 43 epochs, with validation score 135.0
{'vs': [2.0, 2.0, 14.0, 7.0, 50.0, 26.0, 64.0, 99.0, 98.0, 100.0, 106.0, 99.0, 117.0, 118.0, 87.0, 115.0, 88.0, 113.0, 85.0, 111.0, 99.0, 91.0, 117.0, 109.0, 102.0, 129.0, 96.0, 107.0, 129.0, 98.0, 97.0, 119.0, 132.0, 132.0, 132.0, 132.0, 124.0, 124.0, 132.0, 132.0, 134.0, 132.0, 135.0, 135.0, 133.0, 125.0, 129.0, 127.0, 126.0, 127.0], 'ts': []}


## Visualize performance

In [None]:
agent.setNetwork(fname, nEpoch=33)

In [None]:
agent._in_episode = True
agent._mode = 0 # Testing mode with plan_depth=0
initState = env.reset(agent._mode)
inputDims = env.inputDimensions()

for i in range(len(inputDims)):
    if inputDims[i][0] > 1:
        agent._state[i][1:] = initState[i][1:]
agent._Vs_on_last_episode = []
is_terminal = False
reward = 0

### Frame by frame

In [None]:
%matplotlib inline

for i in range(100):
    obs = env.observe()
    _obs = obs[0].reshape((env.WIDTH, env.HEIGHT))
    plt.figure()
    plt.imshow(np.flip(_obs.squeeze()))
    plt.show()
    for i in range(len(obs)):
        agent._state[i][0:-1] = agent._state[i][1:]
        agent._state[i][-1] = obs[i]
    V, action, reward, _ = agent._step()
    print(action)
    agent._Vs_on_last_episode.append(V)
    is_terminal = env.inTerminalState()
    if is_terminal: break

### As animation

In [None]:
%matplotlib notebook

import numpy as np 
import matplotlib.pyplot as plt
from IPython.display import HTML
import matplotlib.animation as animation

fig, ax = plt.subplots(1,1)
obs = env.observe()
_obs = obs[0].reshape((env.WIDTH, env.HEIGHT))
_obs = np.flip(_obs.squeeze())
ax.set_xticks([])
ax.set_yticks([])
im = ax.imshow(np.zeros(_obs.shape))

def init():
    plt.cla()
    im = ax.imshow(_obs)
    return [im]

def animate(i, *args, **kwargs):
    plt.cla()
    obs = env.observe()
    _obs = obs[0].reshape((env.WIDTH, env.HEIGHT))
    _obs = np.flip(_obs.squeeze())
    im = ax.imshow(_obs)
    for i in range(len(obs)):
        agent._state[i][0:-1] = agent._state[i][1:]
        agent._state[i][-1] = obs[i]
        V, action, reward, _ = agent._step()
        agent._Vs_on_last_episode.append(V)
    return [im]

ani = animation.FuncAnimation(fig, animate, init_func=init, 
     frames=100, blit=False, repeat=True)
ani.save('behavior.gif', writer="ffmpeg", fps = 15)
fig.show()