In [1]:
import sys
import logging

import numpy as np
import matplotlib.pyplot as plt
from joblib import hash, dump, load
import os

from deer.default_parser import process_args
from deer.agent import NeuralAgent
from deer.learning_algos.CRAR_torch import CRAR
from figure8_env import MyEnv as figure8_env
import deer.experiment.base_controllers as bc

from deer.policies import EpsilonGreedyPolicy, FixedFigure8Policy

In [2]:
figure8_give_rewards = True
nn_yaml = 'network_noconv.yaml'
higher_dim_obs = False
internal_dim = 10

In [3]:
class Defaults:
    # ----------------------
    # Experiment Parameters
    # ----------------------
    steps_per_epoch = 5000
    epochs = 50
    steps_per_test = 1000
    period_btw_summary_perfs = 1

    # ----------------------
    # Temporal Processing Parameters
    # ----------------------
    nstep = 15
    recurrent = False
    
    # ----------------------
    # Environment Parameters
    # ----------------------
    frame_skip = 2
    show_rewards = False

    # ----------------------
    # DQN Agent parameters:
    # ----------------------
    update_rule = 'rmsprop'
    learning_rate = 1 * 1E-4 # 1E-4
    learning_rate_decay = 0.9
    discount = 0.9
    discount_inc = 1
    discount_max = 0.99
    rms_decay = 0.9
    rms_epsilon = 0.0001
    momentum = 0
    clip_norm = 1.0
    epsilon_start = 1.0
    epsilon_min = 1.0
    epsilon_decay = 10000
    update_frequency = 1
    replay_memory_size = 50000 #1000000 #replacing with 200000 will works just fine (in case you dont have 18gb of memory)
    batch_size = 32
    freeze_interval = 1000
    deterministic = False
    
    # ----------------------
    # Learning algo parameters
    # ----------------------
    loss_weights = [5E-3, 1E-3, 5E-3, 5E-3, 5E-3, 5E-3, 1.] #[0, 0, 0, 0, 0, 0, 1.] #[1E-3, 1E-3, 1E-3, 1E-3, 1E-3, 1E-3, 1.]

In [4]:
parameters = Defaults()
if parameters.deterministic:
    rng = np.random.RandomState(123456)
else:
    rng = np.random.RandomState()

# --- Instantiate environment ---
env = figure8_env(
    give_rewards=figure8_give_rewards,
    intern_dim=internal_dim,
    higher_dim_obs=higher_dim_obs,
    show_rewards=parameters.show_rewards,
    nstep=parameters.nstep
    )

# --- Instantiate learning_algo ---
learning_algo = CRAR(
    env,
    parameters.rms_decay,
    parameters.rms_epsilon,
    parameters.momentum,
    parameters.clip_norm,
    parameters.freeze_interval,
    parameters.batch_size,
    parameters.update_rule,
    rng,
    high_int_dim=False,
    internal_dim=internal_dim, lr=parameters.learning_rate,
    nn_yaml=nn_yaml, double_Q=True,
    loss_weights=parameters.loss_weights,
    nstep=parameters.nstep,
    recurrent=parameters.recurrent
    )

if figure8_give_rewards:
    train_policy = EpsilonGreedyPolicy(
        learning_algo, env.nActions(), rng, 0.2,
        consider_valid_transitions=False
        )
    test_policy = EpsilonGreedyPolicy(
        learning_algo, env.nActions(), rng, 0.
        )
else:
    train_policy = FixedFigure8Policy.FixedFigure8Policy(
        learning_algo, env.nActions(), rng, epsilon=0.2,
        height=env.HEIGHT, width=env.WIDTH
        )
    test_policy = FixedFigure8Policy.FixedFigure8Policy(
        learning_algo, env.nActions(), rng,
        height=env.HEIGHT, width=env.WIDTH
        )

# --- Instantiate agent ---
agent = NeuralAgent(
    env,
    learning_algo,
    parameters.replay_memory_size,
    1,
    parameters.batch_size,
    rng,
    train_policy=train_policy,
    test_policy=test_policy)

# --- Create unique filename for FindBestController ---
h = hash(vars(parameters), hash_name="sha1")
fname = "test_" + h
print("The parameters hash is: {}".format(h))
print("The parameters are: {}".format(parameters))

# As for the discount factor and the learning rate, one can update periodically the parameter of the epsilon-greedy
# policy implemented by the agent. This controllers has a bit more capabilities, as it allows one to choose more
# precisely when to update epsilon: after every X action, episode or epoch. This parameter can also be reset every
# episode or epoch (or never, hence the resetEvery='none').
agent.attach(bc.EpsilonController(
    initial_e=parameters.epsilon_start,
    e_decays=parameters.epsilon_decay,
    e_min=parameters.epsilon_min,
    evaluate_on='episode',
    periodicity=1,
    reset_every='none'))

agent.run(10, 500)
print("end gathering data")

# --- Bind controllers to the agent ---
# Before every training epoch (periodicity=1), we want to print a summary of the agent's epsilon, discount and 
# learning rate as well as the training epoch number.
agent.attach(bc.VerboseController(
    evaluate_on='epoch', 
    periodicity=1))

# Every epoch end, one has the possibility to modify the learning rate using a LearningRateController. Here we 
# wish to update the learning rate after every training epoch (periodicity=1), according to the parameters given.
agent.attach(bc.LearningRateController(
    initial_learning_rate=parameters.learning_rate, 
    learning_rate_decay=parameters.learning_rate_decay,
    periodicity=1))

# Same for the discount factor.
agent.attach(bc.DiscountFactorController(
    initial_discount_factor=parameters.discount, 
    discount_factor_growth=parameters.discount_inc, 
    discount_factor_max=parameters.discount_max,
    periodicity=1))

# During training epochs, we want to train the agent after every [parameters.update_frequency] action it takes.
# Plus, we also want to display after each training episode (!= than after every training) the average bellman
# residual and the average of the V values obtained during the last episode, hence the two last arguments.
agent.attach(bc.TrainerController(
    evaluate_on='action', 
    periodicity=parameters.update_frequency, 
    show_episode_avg_V_value=True, 
    show_avg_Bellman_residual=True))

# We wish to discover, among all versions of our neural network (i.e., after every training epoch), which one 
# has the highest validation score.
# To achieve this goal, one can use the FindBestController along with an InterleavedTestEpochControllers. It is 
# important that the validationID is the same than the id argument of the InterleavedTestEpochController.
# The FindBestController will dump on disk the validation scores for each and every network, as well as the 
# structure of the neural network having the best validation score. These dumps can then used to plot the evolution 
# of the validation and test scores (see below) or simply recover the resulting neural network for your 
# application.
agent.attach(bc.FindBestController(
    validationID=figure8_env.VALIDATION_MODE,
    testID=None,
    unique_fname=fname))

# All previous controllers control the agent during the epochs it goes through. However, we want to interleave a 
# "validation epoch" between each training epoch. For each validation epoch, we want also to display the sum of all 
# rewards obtained, hence the showScore=True. Finally, we want to call the summarizePerformance method of ALE_env 
# every [parameters.period_btw_summary_perfs] *validation* epochs.
agent.attach(bc.InterleavedTestEpochController(
    id=figure8_env.VALIDATION_MODE, 
    epoch_length=parameters.steps_per_test,
    periodicity=1,
    show_score=True,
    summarize_every=1))

The parameters hash is: 62977be8e45d8a56a5537c11dfd5d2fd8dda69e0
The parameters are: <__main__.Defaults object at 0x127b732e0>
end gathering data


In [5]:
try:
    os.mkdir("params")
except Exception:
    pass
dump(vars(parameters), "params/" + fname + ".jldump")
#agent.gathering_data=False
agent.run(parameters.epochs, parameters.steps_per_epoch)

# --- Show results ---
basename = "scores/" + fname
scores = load(basename + "_scores.jldump")
print(scores)

Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.2475, -0.0807,  0.0903, -0.1572, -0.1948,  0.2499, -0.0362,  0.1996,
        -0.1272, -0.0909]) tensor([ 0.2406,  0.0717,  0.2921, -0.1544, -0.3494,  0.5412, -0.2794,  0.3914,
        -0.1751, -0.2439]) tensor([ 0.2336, -0.1109,  0.0662, -0.1431, -0.2142,  0.2458, -0.0254,  0.1770,
        -0.1266, -0.0876])
R[0]
tensor([-0.1373], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.010113606852479279 0.006552207931585144 0.16744829382840545 0.01458500341582112 0.5067190005779266 0.0 0.08160869422927498
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.2221, -0.1266, -0.0896, -0.2552, -0.6228,  0.1906, -0.1427,  0.1059,
       



Testing score per episode (id: 0) is 0.0 (average over 1 episode(s))
== Mean score per episode is 0.0 over 1 episodes ==


  plt.show()
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.1235,  0.2706,  0.6636, -0.5422, -0.3906,  0.0878, -0.1146,  0.1714,
         0.0328,  0.6091]) tensor([-0.1721,  0.2198,  0.6275, -0.5283, -0.4505,  0.0300, -0.1098,  0.2076,
         0.0047,  0.6564]) tensor([-0.0155,  0.4316,  0.6882, -0.5752, -0.2154,  0.2133, -0.2109,  0.0824,
        -0.0292,  0.5623])
R[0]
tensor([-0.0159], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.01888653507269919 0.006016331738283043 0.013699142336408841 0.004048267596226651 0.24155641874670983 0.0003440786972641945 0.01581633604085073
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.2068,  0.0041,  0.5866, -0.5133, -0.6962, -0.2230,  0.10

== Mean score per episode is 0.9999000099990001 over 1 episodes ==


  plt.show()
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.3919,  0.6285,  0.4498, -0.5310,  0.4432,  0.8347,  0.0618, -0.6564,
         0.2804, -0.5141]) tensor([ 0.3797,  0.5799,  0.4498, -0.5169,  0.3985,  0.8046,  0.0672, -0.5990,
         0.2742, -0.4904]) tensor([ 0.2523,  0.3845,  0.3068, -0.3952,  0.2944,  0.7529,  0.1204, -0.6649,
         0.0955, -0.2446])
R[0]
tensor([-0.0011], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.022767915157601237 0.006401070468040416 0.008704128814002616 0.005574549739714712 0.20806913766264915 0.0002929259464144707 0.009912539151555393
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.3322,  0.0296, -0.0415, -0.2586, -0.2073,  0.3244,  0.

Testing score per episode (id: 0) is 102.0 (average over 1 episode(s))
== Mean score per episode is 101.98980101989801 over 1 episodes ==


  plt.show()
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.3315,  0.5105,  0.8647, -0.6727, -0.1465,  0.0479,  0.2394,  0.5562,
         0.7680, -0.8919]) tensor([ 0.3663,  0.5730,  0.9178, -0.7015, -0.0498,  0.1964,  0.2066,  0.4849,
         0.7672, -0.9238]) tensor([ 0.3994,  0.6111,  0.9069, -0.7121, -0.0239,  0.1647,  0.1944,  0.4440,
         0.7552, -0.9270])
R[0]
tensor([0.0051], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.022073978301137685 0.006662368233824964 0.0084300954264545 0.005489678370533511 0.2002567837834358 0.00047574901580810545 0.009394837500411085
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.0644,  0.1885,  0.5491, -0.4945, -0.4568, -0.2347,  0.381

== Mean score per episode is 90.99090090990902 over 1 episodes ==


  plt.show()
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.3381, -0.1300, -0.4213, -0.0687, -0.1598,  0.5157,  0.2164, -0.6635,
        -0.4072,  0.0177]) tensor([-0.3018, -0.0691, -0.3624, -0.1135, -0.1118,  0.5373,  0.1844, -0.6761,
        -0.3980,  0.0149]) tensor([-0.2518, -0.1321, -0.4952, -0.0195, -0.0923,  0.5284,  0.2535, -0.7246,
        -0.3711, -0.1737])
R[0]
tensor([-0.0141], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.021590746676549314 0.005959771543704847 0.00662613893240632 0.005014899510890246 0.1929120756983757 0.00029295361787080767 0.010077033950132317
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.1545, -0.3308,  0.3961, -0.1890, -0.5856,  0.1336, -0.2

== Mean score per episode is 1.9998000199980002 over 1 episodes ==


  plt.show()
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.2507,  0.3150,  0.7500, -0.5169, -0.3224, -0.1308,  0.2707,  0.6674,
         0.6872, -0.7926]) tensor([ 0.2368,  0.3632,  0.7436, -0.5436, -0.2682, -0.0744,  0.2598,  0.5751,
         0.6756, -0.8155]) tensor([ 0.3265,  0.4700,  0.8515, -0.6059, -0.1882, -0.0102,  0.2041,  0.5900,
         0.6903, -0.8349])
R[0]
tensor([0.0027], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.022223267963156105 0.005571522228849062 0.006605032914798357 0.005899808969348669 0.18985865093767643 0.00038509376347064975 0.008870028614881448
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 1.0
Es[0], TEs[0], Esp_[0]
tensor([-0.1615, -0.3767, -0.4542,  0.0459, -0.1506,  0.5047,  0.

  plt.show()
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.3732,  0.1252,  0.1274, -0.0852,  0.0199,  0.6329, -0.1493, -0.4243,
        -0.0545, -0.1179]) tensor([ 0.3494,  0.0809,  0.1122, -0.0679, -0.0056,  0.6308, -0.1546, -0.3973,
        -0.1005, -0.0443]) tensor([ 0.3497,  0.1896,  0.2015, -0.1628,  0.0274,  0.6764, -0.1597, -0.4467,
        -0.0035, -0.1081])
R[0]
tensor([-0.0186], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.021833924777805803 0.005643741392181255 0.004951068176633271 0.0056184358574682845 0.18896091303229332 0.000394541896879673 0.009096604125399609
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.0754,  0.0256,  0.7323, -0.4625, -0.4717, -0.1431,  0.

== Mean score per episode is 1.9998000199980002 over 1 episodes ==


  plt.show()
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.0964,  0.3251,  0.3107, -0.4246,  0.1082,  0.6482, -0.0262, -0.3493,
        -0.1183,  0.0455]) tensor([ 0.0955,  0.2962,  0.3101, -0.4087,  0.0711,  0.5980, -0.0027, -0.2921,
        -0.0944,  0.0242]) tensor([ 0.0027,  0.2540,  0.0190, -0.3233,  0.1536,  0.7541,  0.0118, -0.5790,
        -0.2817, -0.0488])
R[0]
tensor([-0.0048], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.02170901247113943 0.006195935702649876 0.003874226060357614 0.005374644065392203 0.18644451463222503 0.0005340586453676224 0.008721186243521516
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.0259, -0.4782, -0.2462, -0.0870, -0.5843,  0.6752, -0.4

== Mean score per episode is 1.9998000199980002 over 1 episodes ==


  plt.show()
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 1.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.4069, -0.3916, -0.5042,  0.0998, -0.2394,  0.5343,  0.0141,  0.0510,
        -0.1718, -0.6849]) tensor([ 0.3402, -0.3262, -0.3790,  0.0187, -0.2757,  0.4975,  0.0447,  0.0864,
        -0.0996, -0.6345]) tensor([-0.0291, -0.3362, -0.3606,  0.1217, -0.3141, -0.0366,  0.4303, -0.1330,
        -0.0094, -0.6136])
R[0]
tensor([0.1039], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.02234029170498252 0.005519960496530985 0.004221850222464127 0.005840006146812812 0.1848248127102852 0.00035403257608413694 0.009443174437968991
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.0190,  0.0294, -0.1031, -0.1568,  0.0965,  0.7213,  0.02

Testing score per episode (id: 0) is 2.0 (average over 1 episode(s))
== Mean score per episode is 1.9998000199980002 over 1 episodes ==


  plt.show()
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.5414, -0.6121,  0.0626, -0.0043, -0.9960, -0.5895,  0.6000,  0.5635,
         0.1415,  0.2283]) tensor([-0.4277, -0.5397,  0.0845, -0.0187, -0.9091, -0.5435,  0.5666,  0.5552,
         0.1675,  0.1072]) tensor([-0.5273, -0.6427, -0.1112,  0.0709, -0.9263, -0.5301,  0.6747,  0.4198,
         0.1557, -0.0544])
R[0]
tensor([-0.0073], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.022466423155739903 0.005509516130579868 0.004688504290203128 0.006181547747575678 0.1788155740350485 0.00027895911782979964 0.009086083060479724
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.0348, -0.1896,  0.4734, -0.2344, -0.7758, -0.4241,  0.

== Mean score per episode is 1.9998000199980002 over 1 episodes ==


  plt.show()
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.0247,  0.3588,  0.6015, -0.5753, -0.1206,  0.3244, -0.0485, -0.1904,
         0.1303,  0.2586]) tensor([-0.0444,  0.3050,  0.5473, -0.5369, -0.1592,  0.3004, -0.0214, -0.1737,
         0.1257,  0.2274]) tensor([-0.0217,  0.3653,  0.6042, -0.5792, -0.1156,  0.3279, -0.0504, -0.1910,
         0.1311,  0.2537])
R[0]
tensor([0.0044], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.022345048636198045 0.006034122462602681 0.001977261825448295 0.00613837550021708 0.18270260606706143 0.00047258005291223525 0.008300914993626066
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.1469,  0.2778,  0.7953, -0.5657, -0.2341, -0.0536,  0.1

== Mean score per episode is 1.9998000199980002 over 1 episodes ==


  plt.show()
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.1574,  0.3067,  0.9060, -0.5970, -0.1681,  0.2490, -0.0076,  0.2805,
         0.2209,  0.3653]) tensor([ 0.1482,  0.2934,  0.8527, -0.5787, -0.1662,  0.2715, -0.0031,  0.2482,
         0.2115,  0.3128]) tensor([ 0.0347,  0.2371,  0.9424, -0.6163, -0.2647,  0.1921,  0.0069,  0.3845,
         0.1675,  0.6012])
R[0]
tensor([-0.0066], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.0218908657040447 0.005852792206183949 0.0017513459711899486 0.005730856928159483 0.18376985137164592 0.00052733463793993 0.009365263532148674
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.4200, -0.4195, -0.3347,  0.0297, -0.2393,  0.5667,  0.104

== Mean score per episode is 1.9998000199980002 over 1 episodes ==


  plt.show()
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.2523,  0.1817, -0.2144, -0.2584, -0.0750,  0.6134, -0.0022, -0.6192,
        -0.2925, -0.1041]) tensor([-0.2075,  0.1983, -0.1630, -0.2723, -0.0700,  0.5789,  0.0048, -0.5616,
        -0.2506, -0.1405]) tensor([-0.2760,  0.1897,  0.0007, -0.3526, -0.1411,  0.5438, -0.0418, -0.4438,
        -0.3055,  0.2006])
R[0]
tensor([-0.0053], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.021780310628935696 0.006276743900350994 0.0016605839549993107 0.005136372101609595 0.18546766990423202 0.00035242697596549985 0.00900325253570918
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.4355, -0.6887, -0.5692,  0.1924, -0.8582,  0.0563,  0

== Mean score per episode is 1.9998000199980002 over 1 episodes ==


  plt.show()
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.2809, -0.1139, -0.0990, -0.1271, -0.1690,  0.5988, -0.0758, -0.0459,
        -0.2276, -0.1939]) tensor([ 0.3046, -0.0983, -0.0647, -0.1380, -0.1459,  0.6027, -0.0676, -0.0307,
        -0.2097, -0.2036]) tensor([ 0.1398, -0.0481,  0.0039, -0.2254, -0.2452,  0.6135, -0.1585,  0.0019,
        -0.3096,  0.0715])
R[0]
tensor([-0.0026], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.021719031885266304 0.005762523027602583 0.001302655806799521 0.005170955971814692 0.18361528761684895 0.0004826270490884781 0.009481437228503637
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.4457, -0.4125, -0.1902, -0.0346, -0.7600,  0.1126,  0.

== Mean score per episode is 1.9998000199980002 over 1 episodes ==


  plt.show()
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.6473,  0.9562,  0.5974, -0.6771,  0.6534,  0.9350, -0.2071, -0.3911,
         0.2030, -0.7258]) tensor([ 0.5003,  0.6769,  0.5055, -0.5622,  0.3815,  0.6449, -0.0393, -0.1632,
         0.2577, -0.6625]) tensor([ 0.5171,  0.7261,  0.7156, -0.6440,  0.4106,  0.7104, -0.1104, -0.1226,
         0.1831, -0.3532])
R[0]
tensor([0.0008], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.02175390463322401 0.006388009273498028 0.0008948371018996113 0.004943962254677899 0.18079099641740323 0.0003729144260287285 0.009107013491040561
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.1625, -0.0815,  0.4597, -0.2189, -0.4340, -0.2514,  0.5

== Mean score per episode is 1.9998000199980002 over 1 episodes ==


  plt.show()
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.7656,  0.9826,  0.8325, -0.7166,  0.5983,  0.8726, -0.2311, -0.2173,
         0.3520, -0.7708]) tensor([ 0.7620,  0.9791,  0.8381, -0.7184,  0.6047,  0.8475, -0.2084, -0.2115,
         0.3701, -0.7910]) tensor([ 0.7688,  0.9906,  0.8275, -0.7212,  0.6086,  0.8906, -0.2388, -0.2338,
         0.3491, -0.7704])
R[0]
tensor([0.0012], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.021311094967648388 0.005003722605259099 0.0011379112805425394 0.004459344425064046 0.17907493448257447 0.0003100198283791542 0.008918409514939413
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.4485,  0.2750,  0.7609, -0.4133, -0.1547,  0.0249,  0.

== Mean score per episode is 1.9998000199980002 over 1 episodes ==


  plt.show()
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.3336,  0.2779,  0.7073, -0.4793, -0.0291,  0.2389,  0.1683,  0.2850,
         0.3636, -0.1896]) tensor([ 0.3475,  0.3284,  0.6891, -0.4999,  0.0286,  0.3104,  0.1275,  0.2009,
         0.3406, -0.2177]) tensor([ 0.2588,  0.1764,  0.7637, -0.4675, -0.1700,  0.0771,  0.2261,  0.4490,
         0.3986, -0.0670])
R[0]
tensor([-0.0046], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.021711725322529674 0.00635126769314229 0.0007819106202150578 0.004735687180655077 0.18164812514185905 0.00041497774422168734 0.008982731656578835
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 1.0
Es[0], TEs[0], Esp_[0]
tensor([-0.4293, -0.1952, -0.3465, -0.1763, -0.3511,  0.3621,  0

== Mean score per episode is 1.9998000199980002 over 1 episodes ==


  plt.show()
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 5.3411e-01,  4.4252e-01,  8.4614e-01, -4.9275e-01, -5.9533e-04,
         1.6563e-01,  1.5512e-01,  4.5992e-01,  5.7073e-01, -7.6512e-01]) tensor([ 0.5211,  0.5284,  0.8451, -0.5345,  0.0808,  0.2792,  0.1010,  0.3080,
         0.5364, -0.7394]) tensor([ 0.2716,  0.0386,  0.5683, -0.2912, -0.3442, -0.1413,  0.4155,  0.6152,
         0.5598, -0.6945])
R[0]
tensor([0.0016], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.021340465558692814 0.005399854369799869 0.00035374849034542423 0.004634816635982133 0.17808181236684323 0.00024307353049516677 0.009037724308436737
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.6296, -0.454

== Mean score per episode is 1.9998000199980002 over 1 episodes ==


  plt.show()
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.3566,  0.6577,  0.1504, -0.4357,  0.4619,  0.8371, -0.0294, -0.6870,
         0.1089, -0.8032]) tensor([ 0.2793,  0.5211,  0.1229, -0.3817,  0.3265,  0.6941,  0.0567, -0.5812,
         0.1436, -0.7445]) tensor([ 0.3412,  0.6632,  0.2467, -0.4937,  0.4645,  0.9000, -0.1018, -0.6317,
         0.0076, -0.5313])
R[0]
tensor([0.0078], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.021085604820400478 0.004818155172590196 0.00011277735544445022 0.004264434093318414 0.18219856087863445 0.00017717768996953963 0.008979438859736547
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.1568,  0.0653, -0.2289, -0.1749,  0.0722,  0.7449,  

== Mean score per episode is 1.9998000199980002 over 1 episodes ==


  plt.show()
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.0915,  0.0981, -0.4195, -0.1405,  0.0609,  0.5401,  0.3141, -0.7796,
        -0.0275, -0.7610]) tensor([-0.0481,  0.1034, -0.4077, -0.1322,  0.0802,  0.5808,  0.2470, -0.7443,
        -0.0802, -0.7031]) tensor([-0.1878,  0.0494, -0.4302, -0.0849, -0.0158,  0.5325,  0.2219, -0.8589,
        -0.1425, -0.4596])
R[0]
tensor([0.0177], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.021381977213546635 0.00498493081554625 8.943240819462517e-06 0.004285284431418404 0.17911752672493458 0.0002755168005824089 0.008808014591573737
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.1642,  0.1637,  0.7077, -0.4771, -0.1850,  0.0396,  0.3

== Mean score per episode is 1.9998000199980002 over 1 episodes ==


  plt.show()
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.3485,  0.0773,  0.3866, -0.5057, -0.5574,  0.1383, -0.0766,  0.1068,
         0.0396,  0.5426]) tensor([-0.3381,  0.0787,  0.3645, -0.4923, -0.5447,  0.1474, -0.0673,  0.0952,
         0.0403,  0.4973]) tensor([-0.1803,  0.2357,  0.6440, -0.5299, -0.3705,  0.1076, -0.1572,  0.2238,
        -0.0743,  0.7727])
R[0]
tensor([0.0025], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.021230316113680602 0.005191589572306839 4.844496088253436e-06 0.004213343029899988 0.1782010730057955 0.0003155739679932594 0.00893574873590842
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.4299,  0.3174,  0.5222, -0.3647, -0.1275,  0.7407, -0.76

== Mean score per episode is 1.9998000199980002 over 1 episodes ==


  plt.show()
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.5895, -0.5457,  0.1635, -0.0854, -0.9887, -0.6484,  0.6345,  0.6199,
         0.2585,  0.3068]) tensor([-0.5528, -0.4409,  0.1768, -0.1405, -0.8958, -0.4672,  0.4856,  0.4797,
         0.1653,  0.4007]) tensor([-0.5489, -0.4787,  0.3180, -0.1586, -0.9810, -0.6911,  0.5667,  0.7424,
         0.2569,  0.4832])
R[0]
tensor([-0.0019], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.020698201002553106 0.0059303517973530685 2.621763389925036e-06 0.004431465760048013 0.17827118861675262 0.00014498617500066757 0.009544510149455164
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.3055,  0.3195,  0.5115, -0.4423, -0.1286,  0.7839, 

== Mean score per episode is 1.9998000199980002 over 1 episodes ==


  plt.show()
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.5463,  0.5236,  0.9796, -0.5871,  0.0960,  0.1928,  0.1883,  0.4419,
         0.6720, -0.6878]) tensor([ 0.6377,  0.6799,  0.9846, -0.6436,  0.2606,  0.4124,  0.0471,  0.2398,
         0.6055, -0.7281]) tensor([ 0.6282,  0.6775,  0.9812, -0.6427,  0.2581,  0.3873,  0.0737,  0.2697,
         0.6156, -0.7421])
R[0]
tensor([-0.0044], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.0204582677744329 0.005030193912149116 1.7092777657126135e-06 0.003986681417096407 0.17816969573497773 0.00026260879635810853 0.009640740356291644
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.4688,  0.7609,  0.2266, -0.5132,  0.5484,  0.8818, -0

== Mean score per episode is 1.9998000199980002 over 1 episodes ==


  plt.show()
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.2876,  0.1015,  0.6768, -0.3529, -0.3226, -0.1964,  0.4078,  0.7119,
         0.6270, -0.6525]) tensor([ 0.2559,  0.1015,  0.6485, -0.3506, -0.3147, -0.1863,  0.4142,  0.6551,
         0.6140, -0.6269]) tensor([ 0.4094,  0.2617,  0.8065, -0.4394, -0.1715, -0.0330,  0.3083,  0.6244,
         0.6527, -0.6955])
R[0]
tensor([0.0030], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.020615144222974776 0.004750421282355092 1.3136068553762925e-06 0.0038459943251218645 0.1787773876339197 0.0003439649343490601 0.009619399934192188
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.7412, -0.5818, -0.5372, -0.0126, -0.9276,  0.0466,  0

  plt.show()
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.6395,  0.5870,  0.9348, -0.6320,  0.1477,  0.7158, -0.5132,  0.3831,
        -0.0874,  0.3162]) tensor([ 0.6432,  0.6800,  0.9844, -0.6851,  0.2100,  0.7709, -0.4972,  0.3021,
        -0.0142,  0.2157]) tensor([ 0.5094,  0.4414,  0.8965, -0.5404, -0.0212,  0.6566, -0.5608,  0.4099,
        -0.2033,  0.6063])
R[0]
tensor([0.0781], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.020437697146087886 0.005615922473436513 1.009191107073093e-06 0.004539308115199674 0.18201895190775394 0.00023810716718435288 0.008955705019528977
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.6416,  0.7782,  0.9150, -0.6607,  0.3626,  0.5205, -0

  plt.show()
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.5216,  0.4956,  0.8177, -0.5069,  0.0930,  0.2468,  0.1579,  0.3186,
         0.5788, -0.7513]) tensor([ 0.5293,  0.5823,  0.8125, -0.5461,  0.1866,  0.3604,  0.1017,  0.1670,
         0.5476, -0.7463]) tensor([ 0.6089,  0.6758,  0.8680, -0.5946,  0.2694,  0.4424,  0.0271,  0.1699,
         0.5161, -0.7577])
R[0]
tensor([6.8791e-05], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.020768966117873787 0.004904968631533848 8.478984986766136e-07 0.004247062470181845 0.17756326116621493 0.00028310129791498187 0.009004237942863256
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.3873, -0.1198, -0.1499, -0.1833, -0.5230,  0.3178

  plt.show()
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.3627, -0.0550,  0.5825, -0.4064, -0.7301, -0.3404,  0.1624,  0.7103,
         0.1601,  0.5788]) tensor([-0.3532, -0.0481,  0.5552, -0.3928, -0.6991, -0.3133,  0.1754,  0.6640,
         0.1536,  0.5455]) tensor([-0.4372, -0.1626,  0.6023, -0.3831, -0.8320, -0.4905,  0.2596,  0.8049,
         0.2158,  0.6945])
R[0]
tensor([-0.0005], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.020786355152726175 0.005226587873963581 7.214952757976789e-07 0.004454405789088923 0.17833160711824894 0.0003126829192042351 0.009421050586155616
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.5032,  0.4126,  0.8751, -0.5024, -0.0191,  0.1276,  0

== Mean score per episode is 1.9998000199980002 over 1 episodes ==


  plt.show()
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.1917,  0.0743, -0.0606, -0.1953, -0.0375,  0.5383,  0.0881, -0.4848,
        -0.4108,  0.1214]) tensor([-0.1522,  0.0924, -0.0235, -0.2089, -0.0364,  0.5089,  0.0867, -0.4180,
        -0.3816,  0.0654]) tensor([-0.1358,  0.1975,  0.2282, -0.3520, -0.0246,  0.5343,  0.0108, -0.2193,
        -0.4037,  0.3114])
R[0]
tensor([-0.0014], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.021248863495886325 0.0062333558619211546 6.156101933925128e-07 0.0046420185858733025 0.1742890004068613 0.0002549061551690102 0.008539317138318438
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.0169,  0.3848,  0.3860, -0.4055, -0.1022,  0.4276, -

== Mean score per episode is 1.9998000199980002 over 1 episodes ==


  plt.show()
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.2829,  0.2739,  0.5804, -0.3944,  0.0707,  0.6012, -0.1223,  0.0204,
        -0.1566,  0.3470]) tensor([ 0.2723,  0.2271,  0.5186, -0.3575,  0.0580,  0.5891, -0.1064,  0.0136,
        -0.1802,  0.3205]) tensor([ 0.3934,  0.2938,  0.4955, -0.3100,  0.1216,  0.7638, -0.2868, -0.2109,
        -0.2202,  0.3296])
R[0]
tensor([-0.0167], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.02099984231777489 0.0052075250774651066 5.240429943853541e-07 0.004395008831867017 0.17469936536252498 0.0003069174811244011 0.008943678411189466
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.5848, -0.4667, -0.3428,  0.0412, -0.6634, -0.1823,  0

== Mean score per episode is 1.9998000199980002 over 1 episodes ==


  plt.show()
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.1527,  0.3190,  0.9537, -0.6385, -0.1744,  0.1382,  0.0869,  0.4547,
         0.3215,  0.2548]) tensor([ 0.2449,  0.3991,  0.9080, -0.6526, -0.0403,  0.3007,  0.0048,  0.2862,
         0.2679,  0.1730]) tensor([ 0.3473,  0.5002,  0.9953, -0.6986,  0.0399,  0.3502, -0.0124,  0.3091,
         0.3272,  0.0742])
R[0]
tensor([0.0001], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.021338286539539696 0.005675116219004849 4.730840350646304e-07 0.004123318390804343 0.17194282133877278 0.0003883655965328217 0.008610083907376975
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.1477, -0.2628,  0.5066, -0.2522, -0.8020, -0.5474,  0.

  plt.show()
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.5385, -0.6232, -0.4529,  0.2120, -0.7231, -0.4036,  0.8534,  0.0359,
         0.2775, -0.6676]) tensor([-0.6236, -0.6883, -0.4227,  0.2218, -0.8641, -0.4616,  0.8127,  0.1564,
         0.2053, -0.4468]) tensor([-0.5443, -0.6464, -0.2156,  0.1462, -0.8509, -0.5350,  0.7856,  0.3052,
         0.2652, -0.2688])
R[0]
tensor([-0.0044], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.021435345083475114 0.005628592457138438 4.5181921552739367e-07 0.004488409996964037 0.1720065757930279 0.00020337864011526108 0.008511525929381605
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.5741, -0.1641,  0.1963, -0.2950, -0.6465, -0.0943,  

== Mean score per episode is 1.9998000199980002 over 1 episodes ==


  plt.show()
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.0060,  0.4322,  0.4705, -0.5845,  0.0292,  0.4682, -0.0828, -0.3760,
         0.0279,  0.2194]) tensor([-4.4686e-04,  4.0550e-01,  4.5083e-01, -5.5925e-01,  1.4977e-02,
         4.5922e-01, -6.8720e-02, -3.8795e-01,  5.5246e-02,  1.8408e-01]) tensor([-0.0046,  0.4372,  0.4805, -0.5894,  0.0318,  0.4635, -0.0799, -0.3664,
         0.0306,  0.2198])
R[0]
tensor([0.0037], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.02138231598958373 0.006147845909094031 4.1053345410091423e-07 0.004990368156461045 0.1733270131200552 0.0003429588079452515 0.008855600067181512
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.6953, -0.6039, 

  plt.show()
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.3864, -0.4997, -0.3610,  0.1167, -0.5887, -0.2846,  0.5498, -0.0381,
        -0.0784, -0.2200]) tensor([-0.2546, -0.4081, -0.3044,  0.0791, -0.4828, -0.1851,  0.4676, -0.0393,
        -0.0740, -0.2810]) tensor([-0.4050, -0.5795, -0.5320,  0.2460, -0.5891, -0.2819,  0.5732, -0.1716,
        -0.1355, -0.3343])
R[0]
tensor([0.0062], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.021179248161613942 0.005770264723490982 3.8148290170170183e-07 0.004452051926695276 0.17187625753879548 0.0002493270859122276 0.00920741125906352
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.7928, -0.6958, -0.3634,  0.1544, -0.9328, -0.4577,  0.

Testing score per episode (id: 0) is 2.0 (average over 1 episode(s))
== Mean score per episode is 1.9998000199980002 over 1 episodes ==


  plt.show()
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.1160, -0.6028, -0.6843,  0.1685, -0.3591,  0.3763,  0.2563, -0.3331,
        -0.3641, -0.0866]) tensor([-0.0964, -0.5761, -0.6084,  0.1287, -0.3612,  0.3704,  0.2282, -0.2811,
        -0.3506, -0.0251]) tensor([-0.2698, -0.5050, -0.4289,  0.0891, -0.4401,  0.3434,  0.1661, -0.3225,
        -0.3964,  0.3442])
R[0]
tensor([-0.0020], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.021196729302406313 0.00568250457260001 3.588835006667068e-07 0.00463771703466773 0.17286010609567165 0.00023641324788331984 0.008344857223069994
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.4486,  0.7860,  0.4860, -0.5992,  0.4938,  0.8777, -0.

== Mean score per episode is 1.9998000199980002 over 1 episodes ==


  plt.show()
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.1414,  0.4565, -0.0702, -0.3328,  0.3002,  0.7991,  0.0095, -0.7942,
        -0.0237, -0.6341]) tensor([ 0.1830,  0.4753, -0.0142, -0.3500,  0.3089,  0.7895, -0.0047, -0.7295,
        -0.0114, -0.6286]) tensor([ 0.3204,  0.6140,  0.0955, -0.4208,  0.4342,  0.8662, -0.0557, -0.7374,
         0.0627, -0.7272])
R[0]
tensor([-0.0256], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.02118867453187704 0.005589826617684594 3.313706570793329e-07 0.00458044774265727 0.17115679574012757 0.00029903542250394824 0.008799591436749325
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.2858,  0.5701,  1.0005, -0.7265,  0.0828,  0.4055, -0.

== Mean score per episode is 1.9998000199980002 over 1 episodes ==


  plt.show()
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.5120,  0.7857,  0.8340, -0.7396,  0.4462,  0.8338, -0.2706, -0.1183,
         0.0069,  0.0627]) tensor([ 0.5104,  0.7372,  0.8370, -0.7256,  0.3887,  0.7451, -0.2205, -0.0184,
         0.0534,  0.0017]) tensor([ 5.6152e-01,  8.8028e-01,  7.6746e-01, -7.4696e-01,  5.4623e-01,
         9.2795e-01, -3.1703e-01, -2.3505e-01, -5.6358e-04, -8.5201e-02])
R[0]
tensor([0.0238], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.021099880781024694 0.006377747235823335 3.136139691690687e-07 0.004771508374309633 0.17327477495372295 0.00025135351717472075 0.008480222156154923
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.0045,  0.4241

== Mean score per episode is 1.9998000199980002 over 1 episodes ==


  plt.show()
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.0623,  0.1220,  0.7057, -0.3528, -0.4424,  0.1829, -0.4056,  0.5097,
        -0.2430,  0.8979]) tensor([ 0.1125,  0.1996,  0.6818, -0.3792, -0.3458,  0.2915, -0.4623,  0.3865,
        -0.2399,  0.8020]) tensor([ 0.0353,  0.0803,  0.7153, -0.3549, -0.4747,  0.1554, -0.3641,  0.5637,
        -0.2192,  0.9272])
R[0]
tensor([-0.0045], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.020830852134153248 0.0056059643075059284 2.951887636868378e-07 0.004546352488163393 0.1724890453070402 0.0001952434703707695 0.009379203558404697
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.3566,  0.6929,  0.4311, -0.5565,  0.4544,  0.8474, -0

== Mean score per episode is 1.9998000199980002 over 1 episodes ==


  plt.show()
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.5898, -0.5880,  0.1179, -0.0335, -0.9978, -0.6651,  0.6424,  0.6285,
         0.2348,  0.2923]) tensor([-0.5506, -0.4721,  0.1517, -0.1000, -0.9027, -0.4949,  0.4956,  0.4914,
         0.1558,  0.3918]) tensor([-0.5535, -0.5148,  0.2843, -0.1202, -0.9947, -0.7037,  0.5677,  0.7569,
         0.2373,  0.4929])
R[0]
tensor([-0.0028], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.020952562240883706 0.005918885927210795 2.8507533338029134e-07 0.0046204208321869375 0.1727769018858671 0.00020787525177001954 0.00910302420204971
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.3389, -0.4731, -0.3020,  0.0693, -0.1392,  0.3878,  

== Mean score per episode is 1.9998000199980002 over 1 episodes ==


  plt.show()
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.5501, -0.7491, -0.7833,  0.5652, -0.3067, -0.0732,  0.1804,  0.2392,
        -0.3323, -0.7315]) tensor([ 0.4883, -0.6257, -0.6386,  0.4462, -0.2918, -0.0503,  0.1724,  0.2152,
        -0.2602, -0.6541]) tensor([ 0.6353, -0.6305, -0.5993,  0.5013, -0.3372, -0.0488, -0.0822,  0.4898,
        -0.4896, -0.4243])
R[0]
tensor([0.1007], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.021047254502773286 0.005848443056005635 2.7145032102282586e-07 0.004383853876497597 0.17353049056231976 0.00022227461636066436 0.008505041132564657
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.3932,  0.0130, -0.1850, -0.1878, -0.2055,  0.4083,  

== Mean score per episode is 1.9998000199980002 over 1 episodes ==


  plt.show()
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.2786,  0.0707,  0.5387, -0.4778, -0.5575, -0.0707,  0.0291,  0.3811,
         0.1428,  0.5793]) tensor([-0.2432,  0.1407,  0.5065, -0.4930, -0.4718,  0.0649, -0.0583,  0.2336,
         0.0966,  0.5649]) tensor([-0.1253,  0.3477,  0.6470, -0.6070, -0.2801,  0.1840, -0.1275,  0.1891,
         0.0700,  0.5337])
R[0]
tensor([0.0020], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.020756884571164848 0.005657481690243003 2.7099878860781245e-07 0.004349647115333937 0.17787633009254933 0.0002435363009572029 0.008818346004467457
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-3.3460e-01,  3.1912e-02,  2.7170e-01, -3.9767e-01, -3.5

== Mean score per episode is 1.9998000199980002 over 1 episodes ==


  plt.show()
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.0670, -0.2003,  0.2805, -0.1208, -0.4817, -0.2851,  0.5826,  0.5834,
         0.4904, -0.7176]) tensor([-0.0716, -0.3500,  0.2341, -0.0684, -0.6830, -0.4405,  0.6105,  0.7253,
         0.4284, -0.5334]) tensor([ 0.0041, -0.2583,  0.3870, -0.1577, -0.6702, -0.4803,  0.4770,  0.8549,
         0.4498, -0.3705])
R[0]
tensor([-0.0080], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.020635037694126366 0.005520908684062306 2.560743815962496e-07 0.004233198935224209 0.1765242883116007 0.0002065262272953987 0.009630410924437456
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.0934,  0.2611,  0.5216, -0.5082, -0.0037,  0.4311,  0.

== Mean score per episode is 1.9998000199980002 over 1 episodes ==


  plt.show()
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.5501, -0.1895,  0.1848, -0.2538, -0.6557, -0.1223,  0.1851,  0.2020,
        -0.2107,  0.6501]) tensor([-0.4724, -0.0913,  0.2004, -0.2948, -0.5478,  0.0029,  0.1052,  0.0823,
        -0.2105,  0.5907]) tensor([-0.4407, -0.0964, -0.0509, -0.2000, -0.3746,  0.2734,  0.0940, -0.3654,
        -0.3110,  0.4701])
R[0]
tensor([-0.0008], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.021111387774348258 0.004928235363793647 2.400961014075165e-07 0.004686201916600112 0.17364126642048358 0.00010649415850639344 0.00972277429269161
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.5067, -0.6407, -0.4774,  0.2376, -0.7232, -0.4103,  0

== Mean score per episode is 1.9998000199980002 over 1 episodes ==


  plt.show()
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.6721, -0.7218, -0.6766,  0.2160, -0.8334, -0.2119,  0.5175, -0.0553,
        -0.2348, -0.0432]) tensor([-0.6207, -0.6284, -0.6066,  0.1628, -0.7515, -0.1643,  0.4826, -0.0809,
        -0.2279, -0.0439]) tensor([-0.4274, -0.5082, -0.7902,  0.2602, -0.5369,  0.1078,  0.2987, -0.6224,
        -0.2676, -0.2111])
R[0]
tensor([0.0058], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.020530499581247568 0.004436013216116408 2.3379685588054145e-07 0.00428648424660787 0.1742482605278492 0.00020683072507381438 0.009043993327999487
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.1278,  0.2973,  0.8005, -0.5749, -0.2070,  0.1436,  0.

== Mean score per episode is 1.9998000199980002 over 1 episodes ==


  plt.show()
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.4158,  0.5557,  0.7534, -0.6181,  0.2541,  0.5381,  0.0041,  0.0368,
         0.2396, -0.1883]) tensor([ 0.4271,  0.5818,  0.7439, -0.6303,  0.2850,  0.5687, -0.0145, -0.0020,
         0.2303, -0.2164]) tensor([ 0.4455,  0.6818,  0.8398, -0.7148,  0.3325,  0.6917, -0.1622, -0.0067,
         0.0969,  0.0285])
R[0]
tensor([-0.0044], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.020459398563951254 0.004768913882071501 2.2613633889534412e-07 0.004027210462198127 0.17543232695758343 0.00018386701494455337 0.0090407362980186
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.2629,  0.2650,  0.2566, -0.2411,  0.0560,  0.7163, -0

  plt.show()
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.7308, -0.4944, -0.3133, -0.0301, -0.8678, -0.0943,  0.3020,  0.0960,
        -0.2900,  0.3674]) tensor([-0.6602, -0.4025, -0.2644, -0.0701, -0.7730, -0.0502,  0.2916,  0.0530,
        -0.2580,  0.2825]) tensor([-0.4986, -0.3549, -0.3410, -0.0133, -0.6640,  0.1675,  0.1074, -0.3359,
        -0.2470,  0.2207])
R[0]
tensor([0.0004], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.02025264902226627 0.0041055327658978055 2.1545117793664305e-07 0.003788262933085207 0.17857097209990025 0.00026002651453018187 0.009153789637726732
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.5598,  0.9131,  0.8647, -0.7694,  0.5145,  0.8109, -

  plt.show()
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.2669,  0.1933,  0.7437, -0.4780, -0.1423,  0.0940,  0.2369,  0.4561,
         0.4252, -0.1402]) tensor([ 0.2954,  0.2046,  0.7140, -0.4760, -0.1067,  0.1174,  0.2307,  0.4163,
         0.4224, -0.1975]) tensor([ 0.1759,  0.1116,  0.7799, -0.4724, -0.2643, -0.0354,  0.2817,  0.5767,
         0.4352, -0.0061])
R[0]
tensor([0.0017], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.02067994718439877 0.004896212354869931 2.168770443233825e-07 0.003968982430000324 0.1740129989683628 0.0002150023877620697 0.009055104834609666
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.7597,  0.9304,  0.8472, -0.6953,  0.5658,  0.7871, -0.16

== Mean score per episode is 1.9998000199980002 over 1 episodes ==


  plt.show()
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.4748,  0.3978,  0.7636, -0.4679, -0.0043,  0.1785,  0.2022,  0.4169,
         0.5724, -0.7597]) tensor([ 0.2326,  0.0657,  0.6180, -0.3154, -0.3523, -0.1591,  0.3753,  0.6611,
         0.5490, -0.5979]) tensor([ 0.2368, -0.0090,  0.4636, -0.2274, -0.3509, -0.1685,  0.4499,  0.5966,
         0.5480, -0.7556])
R[0]
tensor([-0.0062], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.02071552968584001 0.004476046972326003 2.0637982237303732e-07 0.0038899420020752588 0.17691990847885608 0.00019195333868265153 0.00853001249092631
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.1032,  0.2051,  0.4482, -0.4678, -0.3851,  0.2456, -

== Mean score per episode is 1.9998000199980002 over 1 episodes ==


  plt.show()
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.4982, -0.1943, -0.0586, -0.1926, -0.4762,  0.0946,  0.2466, -0.4464,
        -0.1034,  0.3144]) tensor([-0.4390, -0.1341, -0.0258, -0.2187, -0.4111,  0.1627,  0.2070, -0.5101,
        -0.0804,  0.2783]) tensor([-0.3938, -0.0402,  0.0973, -0.3168, -0.3571,  0.1909,  0.1668, -0.4345,
        -0.0526,  0.3426])
R[0]
tensor([-0.0055], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.020870189497247338 0.00501505910607375 2.0876113640611039e-07 0.0040331790227792225 0.17219555714726448 0.00024300993978977202 0.008865135833038948
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.2037,  0.4701,  0.1576, -0.4319,  0.3207,  0.8111, 

== Mean score per episode is 1.9998000199980002 over 1 episodes ==


  plt.show()
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.4812,  0.3325,  0.7784, -0.4422, -0.0675,  0.1258,  0.2190,  0.5067,
         0.5872, -0.7505]) tensor([ 0.4643,  0.4076,  0.7626, -0.4750,  0.0101,  0.2342,  0.1683,  0.3492,
         0.5492, -0.7222]) tensor([ 0.2276, -0.0182,  0.4640, -0.2326, -0.3578, -0.1585,  0.4554,  0.5999,
         0.5412, -0.7527])
R[0]
tensor([0.0055], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.020583905631676315 0.005489421307767771 2.062648163700942e-07 0.0048266367674223145 0.17465168924629687 0.00024051455408334731 0.009403791073185857
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.3095,  0.4520,  0.0168, -0.3623,  0.3897,  0.7718,  

== Mean score per episode is 1.9998000199980002 over 1 episodes ==


  plt.show()
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.4817, -0.3493, -0.1348,  0.0505, -0.1791, -0.3214,  0.3916,  0.7137,
         0.0522, -0.6428]) tensor([ 0.4925, -0.2489, -0.0190, -0.0266, -0.1643, -0.2569,  0.3579,  0.7077,
         0.1192, -0.6402]) tensor([ 0.5127, -0.1802, -0.1333, -0.0470, -0.0210,  0.0748,  0.3060,  0.2516,
         0.1655, -0.6489])
R[0]
tensor([-0.0089], grad_fn=<SelectBackward0>)
self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q,                 self.loss_disentangle_t, self.loss_disambiguate1,                self.loss_disambiguate2
0.02069698636047542 0.005052687564006192 2.0748368636702708e-07 0.0043193511219578795 0.17694543935358525 0.0002499006688594818 0.009169382726249752
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.1780, -0.0593, -0.5720, -0.0470, -0.0541,  0.3840,  

  plt.show()
  plt.plot(np.log(learning_algo.tracked_disamb1))


Best neural net obtained after 3 epochs, with validation score 102.0
{'vs': [0.0, 1.0, 102.0, 91.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0], 'ts': []}


## Visualize performance

In [6]:
agent.setNetwork(fname, nEpoch=16)

FileNotFoundError: [Errno 2] No such file or directory: 'nnets/test_62977be8e45d8a56a5537c11dfd5d2fd8dda69e0.epoch=16'

In [None]:
agent._in_episode = True
agent._mode = 0 # Testing mode with plan_depth=0
initState = env.reset(agent._mode)
inputDims = env.inputDimensions()

for i in range(len(inputDims)):
    if inputDims[i][0] > 1:
        agent._state[i][1:] = initState[i][1:]
agent._Vs_on_last_episode = []
is_terminal = False
reward = 0

### Frame by frame

In [None]:
%matplotlib inline

for i in range(100):
    obs = env.observe()
    _obs = obs[0].reshape((env.WIDTH, env.HEIGHT))
    plt.figure()
    plt.imshow(np.flip(_obs.squeeze()))
    plt.show()
    for i in range(len(obs)):
        agent._state[i][0:-1] = agent._state[i][1:]
        agent._state[i][-1] = obs[i]
    V, action, reward, _ = agent._step()
    print(action)
    agent._Vs_on_last_episode.append(V)
    is_terminal = env.inTerminalState()
    if is_terminal: break

### As animation

In [None]:
%matplotlib notebook

import numpy as np 
import matplotlib.pyplot as plt
from IPython.display import HTML
import matplotlib.animation as animation

fig, ax = plt.subplots(1,1)
obs = env.observe()
_obs = obs[0].reshape((env.WIDTH, env.HEIGHT))
_obs = np.flip(_obs.squeeze())
ax.set_xticks([])
ax.set_yticks([])
im = ax.imshow(np.zeros(_obs.shape))

def init():
    plt.cla()
    im = ax.imshow(_obs)
    return [im]

def animate(i, *args, **kwargs):
    plt.cla()
    obs = env.observe()
    _obs = obs[0].reshape((env.WIDTH, env.HEIGHT))
    _obs = np.flip(_obs.squeeze())
    im = ax.imshow(_obs)
    for i in range(len(obs)):
        agent._state[i][0:-1] = agent._state[i][1:]
        agent._state[i][-1] = obs[i]
        V, action, reward, _ = agent._step()
        agent._Vs_on_last_episode.append(V)
    return [im]

ani = animation.FuncAnimation(fig, animate, init_func=init, 
     frames=100, blit=False, repeat=True)
ani.save('behavior.gif', writer="ffmpeg", fps = 15)
fig.show()