In [1]:
import sys
import logging
import pickle

import numpy as np
import matplotlib.pyplot as plt
from joblib import hash, dump, load
import os

from deer.default_parser import process_args
from deer.agent import NeuralAgent
from deer.learning_algos.CRAR_torch import CRAR
from figure8_env import MyEnv as figure8_env
from figure8_alt1 import MyEnv as figure8_alt1
import deer.experiment.base_controllers as bc

from deer.policies import EpsilonGreedyPolicy, FixedFigure8Policy

In [2]:
# Run-level switches. They are copied into `Defaults` and also read directly
# by the setup cells further down.
figure8_give_rewards = True        # forwarded to the environment as `give_rewards`; also selects the policy type
nn_yaml = "network_noconv.yaml"    # forwarded to CRAR as `nn_yaml`
higher_dim_obs = False             # forwarded to the environment as `higher_dim_obs`
internal_dim = 10                  # forwarded to the environment (`intern_dim`) and to CRAR (`internal_dim`)
fname = "figure8_alt1_mf_start"    # used for the params file names and as the controllers' `unique_fname`

# Warm-start specification consumed by `agent.setNetwork`:
# [path prefix, nEpoch, encoder_only]. Set to None to skip loading a network.
set_network = ["mf_only_0", 50, True]

In [3]:
class Defaults:
    """Container of experiment settings, all stored as class-level attributes.

    The first group mirrors the module-level switches defined in the
    configuration cell; the remaining groups are fixed values for this run.
    """

    # --- Setup parameters (mirrors of the module-level switches) ---
    figure8_give_rewards = figure8_give_rewards
    nn_yaml = nn_yaml
    higher_dim_obs = higher_dim_obs
    internal_dim = internal_dim
    fname = fname

    # --- Experiment parameters ---
    steps_per_epoch = 5_000
    epochs = 50
    steps_per_test = 1_000
    period_btw_summary_perfs = 1

    # --- Temporal processing parameters ---
    nstep = 15
    nstep_decay = 0.8
    encoder_type = "regular"

    # --- Environment parameters ---
    frame_skip = 2
    show_rewards = False

    # --- DQN agent parameters ---
    learning_rate = 1e-4
    learning_rate_decay = 1.0
    discount = 0.9
    epsilon_start = 1.0
    epsilon_min = 1.0
    epsilon_decay = 1_000
    update_frequency = 1
    replay_memory_size = 100_000  # previously 50000
    batch_size = 64
    freeze_interval = 1_000
    deterministic = False

    # --- Learning algorithm parameters ---
    # Order of the weights:
    #   T, entropy_neighbor, entropy_random, volume, gamma, R, Q, variational
    # Alternative settings that were tried:
    #   [5E-3, 1E-3, 5E-3, 5E-3, 5E-3, 5E-3, 1.]
    #   [0., 0., 0., 0., 0., 0., 1., 2E-4]
    #   [5E-3, 5E-3, 5E-3, 0, 5E-3, 5E-3, 1., 0.]
    # Active setting: only the Q term carries a non-zero weight.
    loss_weights = [0, 0, 0, 0, 0, 0, 1.0, 0.0]

In [4]:
# Persist the run configuration.
# The target directory has to exist before the file is opened; on a fresh
# checkout it does not, so create it here rather than relying on a later cell.
os.makedirs('params', exist_ok=True)

parameters = Defaults()
# `Defaults` defines its settings as class attributes only, so a bare instance
# has an empty __dict__ and pickling it would record no values at all.
# Copy the public class-level settings onto the instance so that both the
# pickle below and any later `vars(parameters)` call capture the actual values.
for _setting_name, _setting_value in vars(Defaults).items():
    if not _setting_name.startswith('_'):
        setattr(parameters, _setting_name, _setting_value)

with open(f'params/{fname}.p', 'wb') as f:
    pickle.dump(parameters, f)

In [5]:
# Fixed seed for reproducible runs; seed=None lets NumPy seed the generator itself.
rng = np.random.RandomState(123456 if parameters.deterministic else None)

# --- Instantiate environment ---
# The structural switches come from the module-level configuration; the
# display and n-step settings come from `parameters`.
env_kwargs = dict(
    give_rewards=figure8_give_rewards,
    intern_dim=internal_dim,
    higher_dim_obs=higher_dim_obs,
    show_rewards=parameters.show_rewards,
    nstep=parameters.nstep,
    nstep_decay=parameters.nstep_decay,
)
env = figure8_alt1(**env_kwargs)

# --- Instantiate learning_algo ---
# Positional arguments: environment, freeze interval, batch size, random state.
crar_kwargs = dict(
    high_int_dim=False,
    internal_dim=internal_dim,
    lr=parameters.learning_rate,
    nn_yaml=nn_yaml,
    double_Q=True,
    loss_weights=parameters.loss_weights,
    nstep=parameters.nstep,
    nstep_decay=parameters.nstep_decay,
    encoder_type=parameters.encoder_type,
)
learning_algo = CRAR(
    env,
    parameters.freeze_interval,
    parameters.batch_size,
    rng,
    **crar_kwargs
)

# Choose the behaviour policies. Without rewards the fixed figure-8 policy is
# used; with rewards, epsilon-greedy (0.2 while training, 0. while testing).
if not figure8_give_rewards:
    train_policy = FixedFigure8Policy.FixedFigure8Policy(
        learning_algo,
        env.nActions(),
        rng,
        epsilon=0.2,
        height=env.HEIGHT,
        width=env.WIDTH,
    )
    test_policy = FixedFigure8Policy.FixedFigure8Policy(
        learning_algo,
        env.nActions(),
        rng,
        height=env.HEIGHT,
        width=env.WIDTH,
    )
else:
    train_policy = EpsilonGreedyPolicy(
        learning_algo,
        env.nActions(),
        rng,
        0.2,
        consider_valid_transitions=False,
    )
    test_policy = EpsilonGreedyPolicy(learning_algo, env.nActions(), rng, 0.)

# --- Instantiate agent ---
agent = NeuralAgent(
    env,
    learning_algo,
    parameters.replay_memory_size,
    1,
    parameters.batch_size,
    rng,
    train_policy=train_policy,
    test_policy=test_policy,
)

# Optional warm start from a stored network.
# NOTE(review): the trailing '/fname' is a literal path segment, not the
# `fname` variable -- confirm this matches how the source run was saved.
if set_network is not None:
    checkpoint_prefix = f'{set_network[0]}/fname'
    agent.setNetwork(
        checkpoint_prefix,
        nEpoch=set_network[1],
        encoder_only=set_network[2],
    )

# Initial run performed before any controller is attached to the agent.
agent.run(10, 500)
print("end gathering data")

# --- Bind controllers to the agent ---
# Each entry is (controller class, keyword arguments). The controllers are
# built and attached one at a time, in the order listed.
controller_specs = [
    # Before every training epoch (periodicity=1), print a summary of the
    # agent's epsilon, discount and learning rate plus the epoch number.
    (bc.VerboseController, dict(
        evaluate_on='epoch',
        periodicity=1,
    )),

    # The learning rate may follow a scheduler.
    (bc.LearningRateController, dict(
        initial_learning_rate=parameters.learning_rate,
        learning_rate_decay=parameters.learning_rate_decay,
        periodicity=1,
    )),

    # During training epochs, train the agent after every
    # [parameters.update_frequency] actions. After each training episode
    # (not after every training step), also display the average Bellman
    # residual and the average V value of the last episode; that is what
    # the last two arguments request.
    (bc.TrainerController, dict(
        evaluate_on='action',
        periodicity=parameters.update_frequency,
        show_episode_avg_V_value=True,
        show_avg_Bellman_residual=True,
    )),

    # Among all versions of the network (i.e. after every training epoch),
    # find the one with the highest validation score. FindBestController is
    # used together with an InterleavedTestEpochController; the validationID
    # must be the same as the id of the InterleavedTestEpochController.
    # FindBestController dumps the validation scores of every network to
    # disk, as well as the network with the best validation score. These
    # dumps can then be used to plot the evolution of the scores or to
    # recover the resulting network.
    # NOTE(review): VALIDATION_MODE is read from figure8_env although the
    # environment instance is built from figure8_alt1 -- confirm both agree.
    (bc.FindBestController, dict(
        validationID=figure8_env.VALIDATION_MODE,
        testID=None,
        unique_fname=fname,
        savefrequency=5,
    )),

    # Interleave a "validation epoch" between training epochs and display
    # the sum of rewards obtained (show_score=True). The summary period is
    # hard-coded here (summarize_every=1) rather than read from
    # parameters.period_btw_summary_perfs.
    (bc.InterleavedTestEpochController, dict(
        id=figure8_env.VALIDATION_MODE,
        epoch_length=parameters.steps_per_test,
        periodicity=1,
        show_score=True,
        summarize_every=1,
        unique_fname=fname,
    )),
]

for controller_cls, controller_kwargs in controller_specs:
    agent.attach(controller_cls(**controller_kwargs))

end gathering data


In [6]:
# Make sure the output directory exists. Unlike a blanket try/except around
# os.mkdir, this only tolerates "already exists" and still surfaces real
# failures such as permission errors.
os.makedirs("params", exist_ok=True)

# Snapshot the run settings as a plain dict.
# `vars(parameters)` alone only returns instance attributes, which is an empty
# dict when every setting lives on the class. Start from the public,
# non-callable class-level settings and overlay any instance-level values.
param_snapshot = {
    name: value
    for name, value in vars(type(parameters)).items()
    if not name.startswith("_") and not callable(value)
}
param_snapshot.update(vars(parameters))
dump(param_snapshot, "params/" + fname + ".jldump")

#agent.gathering_data=False
# Load the stored network (again) right before training.
# NOTE(review): the trailing '/fname' is a literal path segment, not the
# `fname` variable -- confirm this matches how the source run was saved.
if set_network is not None:
    agent.setNetwork(
        f'{set_network[0]}/fname', nEpoch=set_network[1],
        encoder_only=set_network[2]
        )

# Main training run: `parameters.epochs` epochs of `parameters.steps_per_epoch` steps.
agent.run(parameters.epochs, parameters.steps_per_epoch)

# --- Show results ---
basename = "scores/" + fname
scores = load(basename + "_scores.jldump")
print(scores)

Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.5480,  0.9623, -0.1419,  0.3934, -0.5034, -0.2301,  0.8100, -0.4594,
        -0.5248,  1.1416]) tensor([-0.3699,  1.2042,  0.0321,  0.1808, -0.6477, -0.0147,  0.8216, -0.5381,
        -0.5714,  0.9956]) tensor([-0.5469,  0.9560, -0.1442,  0.3814, -0.4987, -0.2318,  0.7973, -0.4524,
        -0.5182,  1.1343])
R[0]
tensor([0.1233], grad_fn=<SelectBackward0>)
LOSSES
T = 0.035313689216971395; R = 0.006990905100712552;                 Gamma = 0.4544175966978073; Q = 0.004692082088207826;
Entropy Neighbor = 0.6686173263788223;                 Entropy Random = 0.06888983841985465;                 Volume = 0.11939426591619849; VAE = 0.0
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.1785,  0.4931,  0.0392,  0.0036, -0.1122, -0.4981,  0.0172, -0.1991,
        -0.2385,



Testing score per episode (id: 0) is 0.0 (average over 1 episode(s))
== Mean score per episode is 0.0 over 1 episodes ==


  plt.show()
  dist_matrix = dist_matrix/np.nanpercentile(dist_matrix.flatten(), 99)
  ylim_max = np.nanmax(self._separability_tracking)*1.1


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.6800,  0.3819,  0.1102, -0.6408, -0.0958, -0.3058, -0.3015,  0.3314,
         0.0851,  0.5171]) tensor([-0.5334,  0.6625,  0.3198, -0.8472, -0.2319, -0.0633, -0.2532,  0.2118,
        -0.0009,  0.3592]) tensor([-0.6809,  0.3879,  0.1106, -0.6387, -0.0959, -0.3067, -0.2982,  0.3306,
         0.0844,  0.5206])
R[0]
tensor([0.0409], grad_fn=<SelectBackward0>)
LOSSES
T = 0.03757921337336302; R = 0.004966317308833823;                 Gamma = 0.411633510529995; Q = 0.003419392359646736;
Entropy Neighbor = 0.6282619569301605;                 Entropy Random = 0.08228026916831732;                 Volume = 0.0056800199672579765; VAE = 0.0
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.7691,  0.5925,  0.2212, -0.6352, -0.0363, -0.3361, -0.1761,  0.3469,
         0.1213,

  plt.show()
  dist_matrix = dist_matrix/np.nanpercentile(dist_matrix.flatten(), 99)
  ylim_max = np.nanmax(self._separability_tracking)*1.1


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.6720, -0.0985,  0.1777, -0.6516, -0.0085, -0.1806, -0.2272,  0.3784,
         0.2581,  0.2598]) tensor([-0.5327,  0.1870,  0.3906, -0.8537, -0.1445,  0.0667, -0.1816,  0.2509,
         0.1657,  0.1055]) tensor([-0.7076,  0.0282,  0.1948, -0.6586,  0.0144, -0.2188, -0.2142,  0.3431,
         0.2713,  0.3966])
R[0]
tensor([0.0334], grad_fn=<SelectBackward0>)
LOSSES
T = 0.03729532536864281; R = 0.005449885371606797;                 Gamma = 0.4067010858058929; Q = 0.00356418513355311;
Entropy Neighbor = 0.6017276881337166;                 Entropy Random = 0.057329307692125436;                 Volume = 0.0010995797030627728; VAE = 0.0
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.0100, -0.1941,  0.3989,  0.2660, -0.3583, -0.0917, -0.0034,  0.3283,
        -0.1799

  abs_states[i:i+1], torch.as_tensor([action_encoding])
  ax.scatter(
  plt.show()
  r, k = function_base._ureduce(a,
  ylim_max = np.nanmax(self._separability_tracking)*1.1


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.4106, -0.0523,  0.1257, -0.4395, -0.2411, -0.2325, -0.3919,  0.4587,
        -0.0943, -0.0909]) tensor([-0.3094,  0.2632,  0.3562, -0.6489, -0.3694,  0.0363, -0.3229,  0.3048,
        -0.2116, -0.2559]) tensor([-0.4201, -0.0153,  0.1523, -0.4032, -0.2515, -0.2313, -0.3633,  0.4511,
        -0.1019, -0.0620])
R[0]
tensor([0.0219], grad_fn=<SelectBackward0>)
LOSSES
T = 0.0369844803661108; R = 0.005892159186303616;                 Gamma = 0.4066513162255287; Q = 0.003817963464593049;
Entropy Neighbor = 0.6252735531330109;                 Entropy Random = 0.09130358604341746;                 Volume = 0.02404880567267537; VAE = 0.0
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.2029,  0.0767,  0.2024, -0.1509, -0.2446, -0.3249, -0.1863,  0.2651,
        -0.2755,  

  ax.scatter(
  plt.show()
  ylim_max = np.nanmax(self._separability_tracking)*1.1


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-4.5338e-01,  7.8186e-05,  1.9051e-02, -6.5422e-01, -5.1739e-02,
        -2.5292e-01, -4.2006e-01,  3.4947e-01,  1.0249e-01,  2.0004e-01]) tensor([-0.2826,  0.2619,  0.2153, -0.8481, -0.1835, -0.0310, -0.3995,  0.2501,
         0.0430,  0.0569]) tensor([-0.4773, -0.0168,  0.0305, -0.6442, -0.0580, -0.2363, -0.3930,  0.3639,
         0.1181,  0.1806])
R[0]
tensor([0.0430], grad_fn=<SelectBackward0>)
LOSSES
T = 0.04027770849317312; R = 0.010617707474622876;                 Gamma = 0.410041510283947; Q = 0.008057588683092035;
Entropy Neighbor = 0.5869137902259827;                 Entropy Random = 0.11797800303995609;                 Volume = 0.024988117840141058; VAE = 0.0
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.3249, -0.0632,  0.6305, -0.6411,  0.0722, -0.2

  ax.scatter(
  plt.show()
  ylim_max = np.nanmax(self._separability_tracking)*1.1


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.2671, -0.4020,  0.0483, -0.6118, -0.1036, -0.0974, -0.5509,  0.3801,
        -0.0055, -0.2232]) tensor([-0.1649, -0.0952,  0.2696, -0.8264, -0.2424,  0.1674, -0.4951,  0.2280,
        -0.1222, -0.3893]) tensor([-0.2666, -0.4054,  0.0828, -0.5970, -0.0880, -0.0978, -0.5365,  0.3389,
        -0.0049, -0.1849])
R[0]
tensor([0.0565], grad_fn=<SelectBackward0>)
LOSSES
T = 0.046331945046782494; R = 0.015477141992654651;                 Gamma = 0.4152944161891937; Q = 0.012054342814953998;
Entropy Neighbor = 0.48953793931007383;                 Entropy Random = 0.06182186610624194;                 Volume = 0.005934673093259335; VAE = 0.0
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.4720, -0.2098,  0.1821, -0.6307, -0.1366, -0.1425, -0.5120,  0.3716,
         0.006

  ax.scatter(
  plt.show()
  ylim_max = np.nanmax(self._separability_tracking)*1.1


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.3086,  0.4293,  1.1142, -0.1779,  0.0556, -0.4150,  0.4389,  0.5593,
        -0.2490, -0.2282]) tensor([-0.1802,  0.7087,  1.3149, -0.3868, -0.0831, -0.1687,  0.4611,  0.4338,
        -0.3290, -0.3741]) tensor([-0.2257,  0.3621,  1.0148, -0.0101, -0.0687, -0.2943,  0.4622,  0.5750,
        -0.2289, -0.3093])
R[0]
tensor([0.0321], grad_fn=<SelectBackward0>)
LOSSES
T = 0.045150101833045485; R = 0.015035003060474991;                 Gamma = 0.4071952460408211; Q = 0.011534374461276456;
Entropy Neighbor = 0.460580710709095;                 Entropy Random = 0.052580585565418;                 Volume = 0.007066512767225504; VAE = 0.0
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.1926,  0.4201, -0.3852, -0.5019, -0.1726, -0.3196, -0.2471,  0.2105,
        -0.1343,  

  ax.scatter(
  plt.show()
  ylim_max = np.nanmax(self._separability_tracking)*1.1


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.3033,  0.0460,  0.1342, -0.3924, -0.2122, -0.2232, -0.3203,  0.3358,
        -0.1541,  0.1277]) tensor([-0.1489,  0.3143,  0.3299, -0.5942, -0.3479,  0.0069, -0.2989,  0.2278,
        -0.2225, -0.0183]) tensor([-0.2824,  0.0727,  0.2166, -0.3514, -0.2082, -0.2149, -0.3286,  0.2698,
        -0.2274,  0.1460])
R[0]
tensor([0.0596], grad_fn=<SelectBackward0>)
LOSSES
T = 0.048381976433098314; R = 0.01614777412312105;                 Gamma = 0.4156964069008827; Q = 0.013538163342745975;
Entropy Neighbor = 0.4300527136325836;                 Entropy Random = 0.05501135975494981;                 Volume = 0.002426970113068819; VAE = 0.0
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.1594,  0.1496, -0.0249, -0.3986, -0.1654, -0.3539, -0.3030,  0.3007,
        -0.1535,

  ax.scatter(
  plt.show()
  ylim_max = np.nanmax(self._separability_tracking)*1.1


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.3448,  0.0537, -0.0098, -0.5002, -0.2853, -0.2324, -0.4577,  0.4532,
        -0.1323,  0.0153]) tensor([-0.2368,  0.3661,  0.2187, -0.7107, -0.4136,  0.0320, -0.3874,  0.3040,
        -0.2471, -0.1514]) tensor([-0.3646,  0.1528,  0.0023, -0.4884, -0.2762, -0.2563, -0.4279,  0.4009,
        -0.1357,  0.1494])
R[0]
tensor([0.0308], grad_fn=<SelectBackward0>)
LOSSES
T = 0.04949201831966638; R = 0.01856504122680053;                 Gamma = 0.4188125228881836; Q = 0.014284953501657583;
Entropy Neighbor = 0.40645742905139925;                 Entropy Random = 0.05024161439016461;                 Volume = 0.000506687380373478; VAE = 0.0
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.3903, -0.0301,  0.1401, -0.4461, -0.2565, -0.1971, -0.3955,  0.4047,
        -0.1066,

  ax.scatter(
  plt.show()
  ylim_max = np.nanmax(self._separability_tracking)*1.1


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.3348, -0.0369,  0.0322, -0.4327, -0.3176, -0.1871, -0.4295,  0.5010,
        -0.1435, -0.1397]) tensor([-0.2348,  0.2798,  0.2622, -0.6443, -0.4468,  0.0816, -0.3592,  0.3455,
        -0.2639, -0.3067]) tensor([-0.3176, -0.0354,  0.0143, -0.4251, -0.3243, -0.1649, -0.4362,  0.4920,
        -0.1505, -0.1401])
R[0]
tensor([0.0312], grad_fn=<SelectBackward0>)
LOSSES
T = 0.04948230511695147; R = 0.02318704635510221;                 Gamma = 0.4205773378610611; Q = 0.016739436360774563;
Entropy Neighbor = 0.39927949517965317;                 Entropy Random = 0.058243155676871536;                 Volume = 0.0031092513725161555; VAE = 0.0
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.1159,  0.1846,  0.1136, -0.4253, -0.1758, -0.2262, -0.4783,  0.2695,
        -0.307

  ax.scatter(
  plt.show()
  ylim_max = np.nanmax(self._separability_tracking)*1.1


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 1.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.3362,  0.1223, -0.3462, -0.2321, -0.6493, -0.1064, -0.1792,  0.3428,
        -0.4331, -0.1645]) tensor([-0.2396,  0.4422, -0.1211, -0.4525, -0.7831,  0.1613, -0.1041,  0.1835,
        -0.5647, -0.3373]) tensor([-0.4122,  0.2778, -0.1699, -0.4479, -0.4677, -0.1818, -0.3736,  0.3183,
        -0.3752,  0.0047])
R[0]
tensor([0.0728], grad_fn=<SelectBackward0>)
LOSSES
T = 0.050676584757864476; R = 0.02820183549914509;                 Gamma = 0.4219714090824127; Q = 0.019565855386899783;
Entropy Neighbor = 0.3850180544257164;                 Entropy Random = 0.058517956007272005;                 Volume = 0.004142605505883694; VAE = 0.0
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.3880, -0.0679,  0.2466, -0.3685, -0.2434, -0.1920, -0.3459,  0.3290,
        -0.0833

  ax.scatter(
  plt.show()
  ylim_max = np.nanmax(self._separability_tracking)*1.1


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.3748,  0.4610,  0.3563,  0.1470, -0.5688,  0.0175,  0.4822,  0.5569,
        -0.4172, -0.6100]) tensor([-0.2520,  0.7476,  0.5526, -0.0701, -0.7126,  0.2619,  0.5102,  0.4204,
        -0.5191, -0.7630]) tensor([-0.3125,  0.4086,  0.5138,  0.2472, -0.5110,  0.0559,  0.6633,  0.5087,
        -0.3640, -0.6112])
R[0]
tensor([0.0873], grad_fn=<SelectBackward0>)
LOSSES
T = 0.0529217594191432; R = 0.030789277526549995;                 Gamma = 0.4240458319187164; Q = 0.021676862997934223;
Entropy Neighbor = 0.3700431781411171;                 Entropy Random = 0.05461218423768878;                 Volume = 0.004568661570549012; VAE = 0.0
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.3166,  0.0852, -0.1950, -0.2404, -0.4909, -0.1762, -0.1638,  0.3968,
        -0.2771, 

  ax.scatter(
  plt.show()
  ylim_max = np.nanmax(self._separability_tracking)*1.1


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.3164,  0.3299,  1.0680, -0.4234,  0.2037, -0.5468,  0.3012,  0.4558,
        -0.3325, -0.3123]) tensor([-0.2463,  0.6577,  1.3050, -0.6425,  0.0738, -0.2593,  0.3761,  0.2832,
        -0.4610, -0.4824]) tensor([-0.3159,  0.3236,  1.0785, -0.4207,  0.2087, -0.5457,  0.3054,  0.4518,
        -0.3279, -0.3059])
R[0]
tensor([-0.0031], grad_fn=<SelectBackward0>)
LOSSES
T = 0.05374783887714148; R = 0.03369687126763165;                 Gamma = 0.4252391586303711; Q = 0.023189955125330015;
Entropy Neighbor = 0.3503930449783802;                 Entropy Random = 0.05043853000551462;                 Volume = 0.004805319700390101; VAE = 0.0
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.3203,  0.1600,  0.1655, -0.0803, -0.3774, -0.2422, -0.0560,  0.2646,
        -0.2176,

  ax.scatter(
  plt.show()
  ylim_max = np.nanmax(self._separability_tracking)*1.1


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.3131,  0.3277,  0.4217, -0.4496, -0.0129, -0.4736,  0.0476,  0.4688,
        -0.2694, -0.1994]) tensor([-0.2179,  0.6375,  0.6423, -0.6720, -0.1528, -0.2053,  0.1088,  0.3121,
        -0.3896, -0.3700]) tensor([-0.3026,  0.3252,  0.4804, -0.4684,  0.0371, -0.4907,  0.0905,  0.4739,
        -0.2737, -0.2331])
R[0]
tensor([0.0545], grad_fn=<SelectBackward0>)
LOSSES
T = 0.05681290607154369; R = 0.03595506475120783;                 Gamma = 0.4257734426856041; Q = 0.02414332677330822;
Entropy Neighbor = 0.33052478766441346;                 Entropy Random = 0.04214961399510503;                 Volume = 0.00481129152700305; VAE = 0.0
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.3395,  0.1504,  0.0896, -0.4726, -0.2622, -0.2262, -0.4973,  0.3382,
        -0.2473,  

  ax.scatter(
  plt.show()
  ylim_max = np.nanmax(self._separability_tracking)*1.1


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.2033,  0.3173,  0.6668, -0.0521, -0.1845, -0.2206,  0.3521,  0.3781,
        -0.4454, -0.2762]) tensor([-0.0807,  0.6006,  0.8653, -0.2660, -0.3256,  0.0262,  0.3773,  0.2480,
        -0.5342, -0.4270]) tensor([-0.3506,  0.1345,  0.4342, -0.1335, -0.2569, -0.2711, -0.0099,  0.3567,
        -0.2577, -0.0257])
R[0]
tensor([0.0712], grad_fn=<SelectBackward0>)
LOSSES
T = 0.05742205820977688; R = 0.03855598240252584;                 Gamma = 0.42957794523239134; Q = 0.025782973265275358;
Entropy Neighbor = 0.3207059180736542;                 Entropy Random = 0.0427573975212872;                 Volume = 0.006409511290490627; VAE = 0.0
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.1650,  0.1313,  0.6288,  0.1684, -0.2261, -0.2755,  0.2449,  0.0504,
        -0.8016, 

  ax.scatter(
  plt.show()
  ylim_max = np.nanmax(self._separability_tracking)*1.1


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.2603, -0.0476,  0.0464, -0.4360, -0.2411, -0.1822, -0.4459,  0.3755,
        -0.1837, -0.0870]) tensor([-0.1584,  0.2588,  0.2645, -0.6553, -0.3814,  0.0817, -0.3886,  0.2242,
        -0.3007, -0.2555]) tensor([-0.2367, -0.0628,  0.0733, -0.4525, -0.2104, -0.1907, -0.4656,  0.3218,
        -0.2020, -0.0436])
R[0]
tensor([0.0665], grad_fn=<SelectBackward0>)
LOSSES
T = 0.05808712060004473; R = 0.04040473735053092;                 Gamma = 0.42936199271678926; Q = 0.02697197707137093;
Entropy Neighbor = 0.31407878708839415;                 Entropy Random = 0.04448785222321749;                 Volume = 0.006899793069809675; VAE = 0.0
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.3283,  0.4027, -0.0731, -0.1261, -0.6188, -0.0570, -0.0718,  0.1578,
        -0.6884,

  ax.scatter(
  plt.show()
  ylim_max = np.nanmax(self._separability_tracking)*1.1


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.1580,  0.1320,  0.1777,  0.2969, -0.5594, -0.0661,  0.0437,  0.2030,
        -0.2379,  0.2685]) tensor([-0.0094,  0.3965,  0.3639,  0.0893, -0.7018,  0.1656,  0.0565,  0.0956,
        -0.3067,  0.1251]) tensor([-0.3106,  0.2011,  0.0485, -0.1729, -0.3872, -0.2245, -0.1741,  0.2971,
        -0.2239,  0.1940])
R[0]
tensor([0.0886], grad_fn=<SelectBackward0>)
LOSSES
T = 0.059360144793987275; R = 0.041846992440521716;                 Gamma = 0.43117039793729783; Q = 0.02733487758319825;
Entropy Neighbor = 0.3080156331956387;                 Entropy Random = 0.04473591877147556;                 Volume = 0.005414037123322487; VAE = 0.0
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.2807,  0.0143, -0.4865, -0.1386, -0.7131, -0.0494, -0.0872,  0.3594,
        -0.4001

  ax.scatter(
  plt.show()
  ylim_max = np.nanmax(self._separability_tracking)*1.1


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.1552,  0.4825,  0.1708,  0.2860, -0.8185,  0.2876,  0.1693,  0.1195,
        -0.9483, -0.5920]) tensor([-0.0993,  0.8107,  0.3879,  0.0438, -0.9681,  0.5718,  0.2382, -0.0636,
        -1.1028, -0.7744]) tensor([-0.0647,  0.1099,  0.2320, -0.2163, -0.2255, -0.2688, -0.3450,  0.1662,
        -0.3676,  0.2104])
R[0]
tensor([0.1348], grad_fn=<SelectBackward0>)
LOSSES
T = 0.059825474604964256; R = 0.043897336596623066;                 Gamma = 0.43079269528388975; Q = 0.029485707168467344;
Entropy Neighbor = 0.3000645781159401;                 Entropy Random = 0.04508312679827213;                 Volume = 0.00535297654941678; VAE = 0.0
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.1228,  0.4586,  0.5871,  0.0302, -0.4617,  0.0875,  0.2524,  0.2541,
        -0.8943

  ax.scatter(
  plt.show()
  ylim_max = np.nanmax(self._separability_tracking)*1.1


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 1.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.0379,  0.6690,  0.2886,  0.4504, -0.8628,  0.2780,  0.3120,  0.0494,
        -1.1792, -0.6099]) tensor([ 0.0097,  1.0003,  0.5060,  0.2049, -1.0123,  0.5661,  0.3831, -0.1372,
        -1.3360, -0.7947]) tensor([ 0.0020,  0.1246,  0.2455, -0.1578, -0.2227, -0.2770, -0.3109,  0.1318,
        -0.4172,  0.2316])
R[0]
tensor([0.1502], grad_fn=<SelectBackward0>)
LOSSES
T = 0.05987241191416979; R = 0.044499610052444044;                 Gamma = 0.4326297387480736; Q = 0.028436175275593997;
Entropy Neighbor = 0.2982250301241875;                 Entropy Random = 0.04317301355861127;                 Volume = 0.006298695728182793; VAE = 0.0
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 1.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.3142,  0.1002,  0.4157,  0.3029, -0.2805, -0.2825,  0.3233, -0.2402,
        -0.9003,

  ax.scatter(
  plt.show()
  ylim_max = np.nanmax(self._separability_tracking)*1.1


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.0433,  0.4816,  0.5413,  0.0860, -0.3803, -0.0865,  0.1504,  0.2999,
        -0.8124, -0.6319]) tensor([ 0.0945,  0.8111,  0.7639, -0.1539, -0.5260,  0.2016,  0.2194,  0.1164,
        -0.9598, -0.8144]) tensor([ 0.0597,  0.4781,  0.5203,  0.2472, -0.5275,  0.0954,  0.2841,  0.2830,
        -0.8834, -0.7977])
R[0]
tensor([0.1091], grad_fn=<SelectBackward0>)
LOSSES
T = 0.06177285172790289; R = 0.047013285600580275;                 Gamma = 0.4312462041974068; Q = 0.03023075246810913;
Entropy Neighbor = 0.2894984424412251;                 Entropy Random = 0.04222081385739147;                 Volume = 0.006185311503708363; VAE = 0.0
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.2843, -0.0114,  0.0344, -0.3888, -0.3079, -0.0828, -0.4568,  0.3950,
        -0.1232, 

  ax.scatter(
  plt.show()
  ylim_max = np.nanmax(self._separability_tracking)*1.1


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.1709,  0.3121,  0.2786,  0.0540, -0.5693,  0.0500,  0.1649,  0.1490,
        -0.8042, -0.3601]) tensor([-0.1036,  0.6344,  0.4959, -0.1823, -0.7168,  0.3299,  0.2305, -0.0258,
        -0.9480, -0.5381]) tensor([-0.2332,  0.2092, -0.1591,  0.1732, -0.9218,  0.2959,  0.0617, -0.0375,
        -0.9939, -0.2293])
R[0]
tensor([0.1144], grad_fn=<SelectBackward0>)
LOSSES
T = 0.06193655905127525; R = 0.04824333554692566;                 Gamma = 0.43476522475481033; Q = 0.03148969704378396;
Entropy Neighbor = 0.26516240710020067;                 Entropy Random = 0.043461428258568045;                 Volume = 0.005424583368003368; VAE = 0.0
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.2884,  0.1713, -0.5451, -0.1497, -0.6142, -0.1665,  0.0734,  0.1721,
        -0.3486

  ax.scatter(
  plt.show()
  ylim_max = np.nanmax(self._separability_tracking)*1.1


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.3579,  0.1915,  0.2718, -0.0219, -0.4141, -0.2401, -0.0686,  0.2892,
        -0.2240,  0.2587]) tensor([-0.2054,  0.4571,  0.4627, -0.2263, -0.5532, -0.0083, -0.0515,  0.1826,
        -0.2913,  0.1156]) tensor([-0.1923, -0.0175, -0.2520, -0.1587, -0.4872, -0.1699, -0.0979,  0.3663,
        -0.2629, -0.0774])
R[0]
tensor([0.0634], grad_fn=<SelectBackward0>)
LOSSES
T = 0.06320727935433387; R = 0.051358754568733275;                 Gamma = 0.4373980283141136; Q = 0.03259363831602968;
Entropy Neighbor = 0.24797004625201224;                 Entropy Random = 0.04342892863601446;                 Volume = 0.005409622006118298; VAE = 0.0
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.2289,  0.3012,  1.0055, -0.3407,  0.1340, -0.4951,  0.2750,  0.4396,
        -0.3581,

  ax.scatter(
  plt.show()
  ylim_max = np.nanmax(self._separability_tracking)*1.1


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.3126,  0.1648,  0.5089,  0.2335, -0.2379, -0.2893,  0.3106, -0.1148,
        -0.9057, -0.2131]) tensor([ 0.4027,  0.4656,  0.7123, -0.0015, -0.3836, -0.0285,  0.3590, -0.2666,
        -1.0204, -0.3885]) tensor([-0.0233,  0.5578,  0.6208, -0.0861, -0.2775, -0.1595,  0.0371,  0.3563,
        -0.7448, -0.5895])
R[0]
tensor([0.1161], grad_fn=<SelectBackward0>)
LOSSES
T = 0.06555750075727701; R = 0.05410279109980911;                 Gamma = 0.4386836109757423; Q = 0.03427283151913434;
Entropy Neighbor = 0.23106800180673598;                 Entropy Random = 0.04267477928847074;                 Volume = 0.006070332374423742; VAE = 0.0
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.0369,  0.5342,  0.5681,  0.0685, -0.3762, -0.0882,  0.1439,  0.2815,
        -0.8665, 

  ax.scatter(
  plt.show()
  ylim_max = np.nanmax(self._separability_tracking)*1.1


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.0399,  0.2537,  0.7262,  0.0679, -0.2681,  0.0011,  0.2114,  0.3366,
        -0.6109, -0.7064]) tensor([ 0.0922,  0.5815,  0.9498, -0.1683, -0.4142,  0.2899,  0.2759,  0.1529,
        -0.7567, -0.8847]) tensor([-0.0925,  0.2030,  0.5165,  0.2436, -0.5519,  0.2556,  0.2361,  0.3469,
        -0.5634, -0.7791])
R[0]
tensor([0.0944], grad_fn=<SelectBackward0>)
LOSSES
T = 0.06644682170450687; R = 0.05651622217148542;                 Gamma = 0.4397477408647537; Q = 0.035488744542934;
Entropy Neighbor = 0.21223320585489272;                 Entropy Random = 0.042227826483547684;                 Volume = 0.004332564324140548; VAE = 0.0
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.2479, -0.0666,  0.0405, -0.4209, -0.2504, -0.1544, -0.4424,  0.3544,
        -0.1703, -

  ax.scatter(
  plt.show()
  ylim_max = np.nanmax(self._separability_tracking)*1.1


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.1384,  0.1723,  0.3838, -0.2111, -0.2087, -0.2409, -0.0629,  0.5208,
        -0.3341, -0.3606]) tensor([-0.0121,  0.4616,  0.5878, -0.4304, -0.3491,  0.0014, -0.0216,  0.3845,
        -0.4350, -0.5258]) tensor([-0.1870,  0.2778,  0.5093, -0.1872, -0.1552, -0.2988,  0.0542,  0.4800,
        -0.3228, -0.2486])
R[0]
tensor([0.0657], grad_fn=<SelectBackward0>)
LOSSES
T = 0.0685934093222022; R = 0.06041307135671377;                 Gamma = 0.43958647787570954; Q = 0.03780803407169878;
Entropy Neighbor = 0.19985984706878662;                 Entropy Random = 0.04299892558157444;                 Volume = 0.006304986823350191; VAE = 0.0
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.0171,  0.5082,  0.4192,  0.1117, -0.5073,  0.0406,  0.3043,  0.2053,
        -0.9749, 

  ax.scatter(
  plt.show()
  ylim_max = np.nanmax(self._separability_tracking)*1.1


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.1317,  0.4171,  0.3602,  0.0338, -0.5346,  0.0420,  0.2119,  0.2318,
        -0.8639, -0.5685]) tensor([-0.0338,  0.7209,  0.5647, -0.2017, -0.6809,  0.2973,  0.2647,  0.0754,
        -0.9906, -0.7449]) tensor([-0.0638,  0.3448,  0.6048, -0.0116, -0.3707,  0.0263,  0.1754,  0.3845,
        -0.6890, -0.7123])
R[0]
tensor([0.1069], grad_fn=<SelectBackward0>)
LOSSES
T = 0.06927619408071041; R = 0.06183280664309859;                 Gamma = 0.44178726881742475; Q = 0.03844988373760134;
Entropy Neighbor = 0.1929399054646492;                 Entropy Random = 0.04256181044876575;                 Volume = 0.004523412369191647; VAE = 0.0
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.3050,  0.1845,  0.3686,  0.0778, -0.3943, -0.1931,  0.0637,  0.2503,
        -0.3128, 

  ax.scatter(
  plt.show()
  ylim_max = np.nanmax(self._separability_tracking)*1.1


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.0996,  0.5916,  0.2566,  0.3965, -0.8426,  0.2837,  0.2635,  0.1315,
        -1.0251, -0.6209]) tensor([-0.0470,  0.9207,  0.4735,  0.1531, -0.9922,  0.5694,  0.3325, -0.0530,
        -1.1802, -0.8038]) tensor([-0.0463,  0.0747,  0.1864, -0.1991, -0.2231, -0.2645, -0.2843,  0.2280,
        -0.3102,  0.1385])
R[0]
tensor([0.1407], grad_fn=<SelectBackward0>)
LOSSES
T = 0.06999594926834106; R = 0.06346808502450585;                 Gamma = 0.44514719235897066; Q = 0.03954012828506529;
Entropy Neighbor = 0.188967964053154;                 Entropy Random = 0.04423686358518898;                 Volume = 0.005621081370860338; VAE = 0.0
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.1858,  0.0237, -0.1979, -0.0783, -0.5150, -0.1571, -0.0466,  0.3156,
        -0.3101, -

  ax.scatter(
  plt.show()
  ylim_max = np.nanmax(self._separability_tracking)*1.1


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.1443,  0.1096, -0.2196, -0.2647, -0.4598, -0.0952, -0.2903,  0.4389,
        -0.3593, -0.1824]) tensor([-0.0501,  0.4292,  0.0067, -0.4845, -0.5928,  0.1733, -0.2177,  0.2801,
        -0.4869, -0.3553]) tensor([-0.1382,  0.3490,  0.3213,  0.1909, -0.5800,  0.0457,  0.1404,  0.5171,
        -0.4310, -0.4416])
R[0]
tensor([0.0697], grad_fn=<SelectBackward0>)
LOSSES
T = 0.07183817882090807; R = 0.06674956739787012;                 Gamma = 0.4460348203778267; Q = 0.04099075127579272;
Entropy Neighbor = 0.1866288069486618;                 Entropy Random = 0.043456528674811125;                 Volume = 0.006758335947990418; VAE = 0.0
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.3656,  0.1445,  0.2222, -0.0389, -0.4381, -0.1985, -0.0914,  0.3517,
        -0.2098, 

  ax.scatter(
  plt.show()
  ylim_max = np.nanmax(self._separability_tracking)*1.1


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.2641,  0.0782, -0.1900, -0.4369, -0.3652, -0.1712, -0.4044,  0.4831,
        -0.2287, -0.1318]) tensor([-0.1615,  0.3948,  0.0381, -0.6514, -0.4951,  0.0942, -0.3321,  0.3290,
        -0.3505, -0.3020]) tensor([-0.2561,  0.1876, -0.0389, -0.3114, -0.3778, -0.1881, -0.2587,  0.5245,
        -0.2974, -0.2467])
R[0]
tensor([0.0506], grad_fn=<SelectBackward0>)
LOSSES
T = 0.07397801161557437; R = 0.06830950533412397;                 Gamma = 0.4478412626385689; Q = 0.04061779715865851;
Entropy Neighbor = 0.1786799399405718;                 Entropy Random = 0.04201091814786196;                 Volume = 0.009951655644923448; VAE = 0.0
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 1.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.3026,  0.1375, -0.6836, -0.0237, -0.8592,  0.0985,  0.1452,  0.1619,
        -0.6039, -

  ax.scatter(
  plt.show()
  ylim_max = np.nanmax(self._separability_tracking)*1.1


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.3221,  0.2142, -0.2467, -0.2306, -0.3899, -0.1350, -0.0346,  0.2890,
        -0.1424,  0.1890]) tensor([-0.2007,  0.5174, -0.0282, -0.4425, -0.5221,  0.1197,  0.0296,  0.1493,
        -0.2518,  0.0239]) tensor([-0.2999,  0.1774, -0.2775, -0.2222, -0.4139, -0.1481, -0.0471,  0.3421,
        -0.1689,  0.0436])
R[0]
tensor([0.0625], grad_fn=<SelectBackward0>)
LOSSES
T = 0.07218689885735512; R = 0.06530813727527857;                 Gamma = 0.4480564603805542; Q = 0.040063324470072985;
Entropy Neighbor = 0.17724113389849663;                 Entropy Random = 0.04185289538651705;                 Volume = 0.007065305732190609; VAE = 0.0
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.0067,  0.5628,  0.5768, -0.0182, -0.3290, -0.1472,  0.0028,  0.3685,
        -0.6891,

  ax.scatter(
  plt.show()
  ylim_max = np.nanmax(self._separability_tracking)*1.1


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.1801, -0.0633, -0.1913, -0.1533, -0.4532, -0.1764, -0.0997,  0.3594,
        -0.2330, -0.0548]) tensor([-0.0778,  0.2499,  0.0322, -0.3685, -0.5867,  0.0894, -0.0326,  0.2068,
        -0.3540, -0.2220]) tensor([-0.2871,  0.1390, -0.3957, -0.1595, -0.5928, -0.1040, -0.0200,  0.3069,
        -0.3588, -0.0467])
R[0]
tensor([0.0617], grad_fn=<SelectBackward0>)
LOSSES
T = 0.07225106666982174; R = 0.06630360026843846;                 Gamma = 0.4474588929414749; Q = 0.04028095558099449;
Entropy Neighbor = 0.17925540003180504;                 Entropy Random = 0.04329890380054712;                 Volume = 0.009319488573819399; VAE = 0.0
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.0462,  0.7104,  0.2557,  0.4662, -0.8974,  0.2807,  0.2113,  0.1378,
        -1.0642, 

  ax.scatter(
  plt.show()
  ylim_max = np.nanmax(self._separability_tracking)*1.1


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.0139,  0.0691,  0.1988, -0.2087, -0.1925, -0.2942, -0.2952,  0.1575,
        -0.3320,  0.2069]) tensor([ 0.1276,  0.3450,  0.3960, -0.4248, -0.3322, -0.0600, -0.2596,  0.0394,
        -0.4138,  0.0442]) tensor([-0.1239,  0.0850,  0.3034, -0.0666, -0.2893, -0.1755, -0.1847,  0.0893,
        -0.3970,  0.2431])
R[0]
tensor([0.0744], grad_fn=<SelectBackward0>)
LOSSES
T = 0.07220071750879288; R = 0.06607940638065338;                 Gamma = 0.44799977499246596; Q = 0.040618466809391976;
Entropy Neighbor = 0.179525602042675;                 Entropy Random = 0.04261246819421649;                 Volume = 0.00842524453252554; VAE = 0.0
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.3401,  0.2610, -0.0021, -0.2179, -0.3758, -0.2337, -0.1706,  0.3384,
        -0.1860,  

  ax.scatter(
  plt.show()
  ylim_max = np.nanmax(self._separability_tracking)*1.1


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.1199,  0.3026,  0.0950, -0.1484, -0.4446, -0.1045, -0.2758,  0.2461,
        -0.5736, -0.0960]) tensor([-0.0359,  0.6165,  0.3108, -0.3793, -0.5886,  0.1664, -0.2133,  0.0849,
        -0.7020, -0.2722]) tensor([ 0.0914,  0.2015,  0.4981,  0.0976, -0.3824, -0.0074,  0.1564,  0.2387,
        -0.7257, -0.5550])
R[0]
tensor([0.0990], grad_fn=<SelectBackward0>)
LOSSES
T = 0.07259743520617486; R = 0.06809796184394509;                 Gamma = 0.44766893130540847; Q = 0.040646852347999814;
Entropy Neighbor = 0.17533399428427218;                 Entropy Random = 0.042054730985313656;                 Volume = 0.008004976999014616; VAE = 0.0
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 1.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.3140,  0.1830,  0.3908,  0.4507, -0.4009, -0.2133,  0.2872, -0.1911,
        -0.833

  ax.scatter(
  plt.show()
  ylim_max = np.nanmax(self._separability_tracking)*1.1


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.2083,  0.0543, -0.1780, -0.3985, -0.3508, -0.1654, -0.3806,  0.4534,
        -0.2387, -0.1069]) tensor([-0.1067,  0.3705,  0.0495, -0.6136, -0.4813,  0.1004, -0.3091,  0.2995,
        -0.3599, -0.2771]) tensor([-0.2241,  0.1525, -0.0949, -0.3191, -0.4009, -0.1623, -0.3307,  0.5396,
        -0.2984, -0.2517])
R[0]
tensor([0.0537], grad_fn=<SelectBackward0>)
LOSSES
T = 0.07297463156282902; R = 0.06810227429307997;                 Gamma = 0.44670758664608; Q = 0.04094150163605809;
Entropy Neighbor = 0.17709545236825944;                 Entropy Random = 0.041731198485940695;                 Volume = 0.006915840975940227; VAE = 0.0
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 1.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.3299,  0.1717, -0.5319, -0.0468, -0.8093,  0.0113,  0.0440,  0.2858,
        -0.5199, -

  ax.scatter(
  plt.show()
  ylim_max = np.nanmax(self._separability_tracking)*1.1


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.3225,  0.2947,  0.3804, -0.0023, -0.3972, -0.1617, -0.0070,  0.4128,
        -0.3090, -0.0686]) tensor([-0.2269,  0.6010,  0.5940, -0.2254, -0.5409,  0.1053,  0.0464,  0.2582,
        -0.4284, -0.2350]) tensor([-0.2402,  0.0624,  0.3878,  0.1116, -0.4037, -0.0870,  0.0128,  0.3666,
        -0.2465, -0.0910])
R[0]
tensor([0.0720], grad_fn=<SelectBackward0>)
LOSSES
T = 0.07311016437411308; R = 0.0678121032025665;                 Gamma = 0.4489157565832138; Q = 0.04073390229418874;
Entropy Neighbor = 0.1751883937716484;                 Entropy Random = 0.04139918131381273;                 Volume = 0.006710699606686831; VAE = 0.0
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.3027,  0.0555,  0.2063,  0.0134, -0.4302, -0.1550, -0.0337,  0.3511,
        -0.1852,  0

  ax.scatter(
  plt.show()
  ylim_max = np.nanmax(self._separability_tracking)*1.1


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.2322,  0.1832,  0.5438, -0.0319, -0.2819, -0.2386,  0.0461,  0.4530,
        -0.3274, -0.2266]) tensor([-0.1051,  0.4642,  0.7423, -0.2421, -0.4216,  0.0062,  0.0691,  0.3268,
        -0.4116, -0.3748]) tensor([-0.1979,  0.1697,  0.5022,  0.0994, -0.3680, -0.1798,  0.0568,  0.3988,
        -0.3364, -0.1185])
R[0]
tensor([0.0613], grad_fn=<SelectBackward0>)
LOSSES
T = 0.07338262295722961; R = 0.06881203765235841;                 Gamma = 0.44813840961456297; Q = 0.04098173703253269;
Entropy Neighbor = 0.17579233220219612;                 Entropy Random = 0.042586859093979;                 Volume = 0.006725347023457289; VAE = 0.0
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.3416,  0.2467,  0.1412, -0.0047, -0.4818, -0.1992, -0.0499,  0.3316,
        -0.2674,  

  ax.scatter(
  plt.show()
  ylim_max = np.nanmax(self._separability_tracking)*1.1


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.3472,  0.3192,  0.0116, -0.1987, -0.3648, -0.2155, -0.1189,  0.3157,
        -0.1857,  0.3276]) tensor([-0.1811,  0.5793,  0.2006, -0.4019, -0.5028,  0.0076, -0.1008,  0.2169,
        -0.2479,  0.1827]) tensor([-0.3552,  0.2413,  0.2146, -0.0377, -0.4246, -0.2213, -0.0748,  0.3011,
        -0.2392,  0.2482])
R[0]
tensor([0.0741], grad_fn=<SelectBackward0>)
LOSSES
T = 0.07116041236370801; R = 0.06535495527461171;                 Gamma = 0.44682632672786715; Q = 0.039734936045482755;
Entropy Neighbor = 0.1785153750628233;                 Entropy Random = 0.04348194923251867;                 Volume = 0.006187056109309196; VAE = 0.0
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.2520,  0.2525,  0.0785, -0.3407, -0.3089, -0.2707, -0.3308,  0.4280,
        -0.3357,

  ax.scatter(
  plt.show()
  ylim_max = np.nanmax(self._separability_tracking)*1.1


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.1623,  0.4089,  0.5010,  0.0101, -0.3590, -0.1404,  0.0739,  0.5406,
        -0.3717, -0.3319]) tensor([-0.0367,  0.6914,  0.6982, -0.2043, -0.4999,  0.1032,  0.0988,  0.4122,
        -0.4600, -0.4837]) tensor([ 0.0467,  0.2277,  0.6327,  0.2527, -0.4052, -0.0255,  0.2834,  0.4716,
        -0.4168, -0.5668])
R[0]
tensor([0.0736], grad_fn=<SelectBackward0>)
LOSSES
T = 0.07242447850853205; R = 0.06709220697078853;                 Gamma = 0.44644881862401964; Q = 0.041197528773918746;
Entropy Neighbor = 0.17549164263904093;                 Entropy Random = 0.04209144090488553;                 Volume = 0.007801359876990318; VAE = 0.0
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.2094,  0.1174,  0.2567,  0.3845, -0.4204, -0.2116,  0.1193, -0.1229,
        -0.6386

  ax.scatter(
  plt.show()
  ylim_max = np.nanmax(self._separability_tracking)*1.1


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.0533,  0.5336,  0.3765,  0.1414, -0.5922,  0.0786,  0.1898,  0.2871,
        -0.8777, -0.6266]) tensor([ 0.0414,  0.8335,  0.5778, -0.0882, -0.7372,  0.3367,  0.2290,  0.1368,
        -0.9927, -0.7916]) tensor([-0.1042,  0.5874,  0.2609,  0.0508, -0.5862,  0.0651,  0.0703,  0.3067,
        -0.8292, -0.5459])
R[0]
tensor([0.1193], grad_fn=<SelectBackward0>)
LOSSES
T = 0.07202757103741168; R = 0.06788184289447963;                 Gamma = 0.4468274375200272; Q = 0.03984219542890787;
Entropy Neighbor = 0.17691388185322285;                 Entropy Random = 0.0414359873123467;                 Volume = 0.00699079617485404; VAE = 0.0
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.2993,  0.1566, -0.4319, -0.1859, -0.5847, -0.0634, -0.0014,  0.2788,
        -0.3728, -0

  ax.scatter(
  plt.show()
  ylim_max = np.nanmax(self._separability_tracking)*1.1


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 1.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.0153,  0.7141,  0.3632,  0.4713, -0.8447,  0.2974,  0.2799,  0.1710,
        -1.0687, -0.6467]) tensor([ 0.0346,  1.0439,  0.5803,  0.2271, -0.9938,  0.5842,  0.3488, -0.0142,
        -1.2231, -0.8301]) tensor([-0.0054,  0.1084,  0.1799, -0.1947, -0.2097, -0.2686, -0.2645,  0.2444,
        -0.3353,  0.0989])
R[0]
tensor([0.1440], grad_fn=<SelectBackward0>)
LOSSES
T = 0.0724373149946332; R = 0.06830874746851623;                 Gamma = 0.44748747229576114; Q = 0.041048392793163656;
Entropy Neighbor = 0.17462596663832664;                 Entropy Random = 0.04231519894860685;                 Volume = 0.00627083732932806; VAE = 0.0
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.0716,  0.1058,  0.4124,  0.2068, -0.3255, -0.2774,  0.1593,  0.1500,
        -0.5223, 

  ax.scatter(
  plt.show()
  ylim_max = np.nanmax(self._separability_tracking)*1.1


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.1031,  0.4984,  0.1475,  0.0084, -0.5082, -0.1172, -0.1628,  0.2713,
        -0.6050, -0.2268]) tensor([-0.0262,  0.8157,  0.3631, -0.2259, -0.6529,  0.1563, -0.0986,  0.1054,
        -0.7379, -0.4053]) tensor([-0.0015,  0.4584,  0.2018,  0.1938, -0.5778, -0.0279, -0.0416,  0.2176,
        -0.6992, -0.2814])
R[0]
tensor([0.1031], grad_fn=<SelectBackward0>)
LOSSES
T = 0.07242862934619189; R = 0.06713349619600922;                 Gamma = 0.44663830476999283; Q = 0.04029434067197144;
Entropy Neighbor = 0.17627467915415765;                 Entropy Random = 0.04113662754185498;                 Volume = 0.007962612845003604; VAE = 0.0
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.0806,  0.1381,  0.5303,  0.1591, -0.2453, -0.3478,  0.2496,  0.2081,
        -0.5171,

  ax.scatter(
  plt.show()
  ylim_max = np.nanmax(self._separability_tracking)*1.1


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.0474,  0.5977,  0.2706,  0.5040, -0.8940,  0.3456,  0.2585,  0.2215,
        -0.9306, -0.6743]) tensor([ 0.0441,  0.9023,  0.4709,  0.2634, -1.0438,  0.6021,  0.3095,  0.0614,
        -1.0633, -0.8522]) tensor([-0.1941,  0.6047,  0.0042,  0.6180, -1.0527,  0.4768,  0.2199, -0.0184,
        -0.9083, -0.1789])
R[0]
tensor([0.1319], grad_fn=<SelectBackward0>)
LOSSES
T = 0.07305621518194676; R = 0.06767626385483891;                 Gamma = 0.4469559881091118; Q = 0.040978466687723994;
Entropy Neighbor = 0.17718768599629403;                 Entropy Random = 0.041627087721601126;                 Volume = 0.0096091735586524; VAE = 0.0
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.2304,  0.0684, -0.1801, -0.3795, -0.3657, -0.1573, -0.3475,  0.4721,
        -0.2257, 

  ax.scatter(
  plt.show()
  ylim_max = np.nanmax(self._separability_tracking)*1.1


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.1504,  0.1613, -0.1807, -0.2490, -0.4543, -0.1273, -0.2975,  0.5260,
        -0.3351, -0.2538]) tensor([-0.0579,  0.4823,  0.0465, -0.4689, -0.5869,  0.1421, -0.2246,  0.3655,
        -0.4636, -0.4267]) tensor([-0.1119,  0.3287,  0.4941,  0.1264, -0.4341, -0.0789,  0.0763,  0.6233,
        -0.2786, -0.4040])
R[0]
tensor([0.0654], grad_fn=<SelectBackward0>)
LOSSES
T = 0.07182046634703874; R = 0.06614721758756786;                 Gamma = 0.4457656869292259; Q = 0.03984947442635894;
Entropy Neighbor = 0.1779020839035511;                 Entropy Random = 0.041813402581959966;                 Volume = 0.008020546641200782; VAE = 0.0
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.0878,  0.1321,  0.5461,  0.1730, -0.2478, -0.3401,  0.2618,  0.2171,
        -0.5211, 

  ax.scatter(
  plt.show()
  ylim_max = np.nanmax(self._separability_tracking)*1.1


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.0950,  0.0249,  0.1221, -0.1490, -0.3121, -0.1365, -0.2347,  0.1533,
        -0.4007,  0.0794]) tensor([ 0.0421,  0.3048,  0.3194, -0.3668, -0.4533,  0.0994, -0.1979,  0.0294,
        -0.4903, -0.0840]) tensor([-0.1689,  0.1680,  0.2659,  0.1304, -0.4881, -0.0407, -0.1035,  0.2017,
        -0.3850,  0.1932])
R[0]
tensor([0.0843], grad_fn=<SelectBackward0>)
LOSSES
T = 0.07222190472483635; R = 0.06692816922441125;                 Gamma = 0.44541938918828966; Q = 0.04007067630626261;
Entropy Neighbor = 0.17748620453476907;                 Entropy Random = 0.04247701465338469;                 Volume = 0.008190542921423911; VAE = 0.0
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.0116,  0.5669,  0.5254, -0.0293, -0.3468, -0.1432, -0.0312,  0.4203,
        -0.6357,

  ax.scatter(
  plt.show()
  ylim_max = np.nanmax(self._separability_tracking)*1.1


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.0023,  0.3033,  0.6374, -0.0093, -0.3177,  0.0304,  0.0744,  0.4570,
        -0.5567, -0.7729]) tensor([ 0.0527,  0.6312,  0.8613, -0.2455, -0.4634,  0.3174,  0.1399,  0.2736,
        -0.7033, -0.9519]) tensor([-0.1972,  0.4580,  0.5180, -0.0755, -0.4185,  0.0328,  0.0380,  0.2615,
        -0.7091, -0.5484])
R[0]
tensor([0.0902], grad_fn=<SelectBackward0>)
LOSSES
T = 0.07283822473138571; R = 0.06944145798590035;                 Gamma = 0.44572029238939287; Q = 0.042672827031463384;
Entropy Neighbor = 0.17474611178040506;                 Entropy Random = 0.042629688419401644;                 Volume = 0.008144350279122591; VAE = 0.0
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 1.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.2614,  0.2560,  0.4178,  0.4193, -0.4238, -0.1885,  0.2286, -0.0922,
        -0.799

  ax.scatter(
  plt.show()
  ylim_max = np.nanmax(self._separability_tracking)*1.1


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.1840,  0.1826,  0.0155, -0.3013, -0.3057, -0.2350, -0.1977,  0.5566,
        -0.2773, -0.3134]) tensor([-0.0944,  0.5050,  0.2452, -0.5199, -0.4373,  0.0370, -0.1245,  0.3941,
        -0.4055, -0.4855]) tensor([-0.1049,  0.1590, -0.0476, -0.2679, -0.3211, -0.2202, -0.1971,  0.5301,
        -0.2784, -0.2815])
R[0]
tensor([0.0497], grad_fn=<SelectBackward0>)
LOSSES
T = 0.07227421531081199; R = 0.06629048132896423;                 Gamma = 0.4457977250814438; Q = 0.03876696278434247;
Entropy Neighbor = 0.17627121633291246;                 Entropy Random = 0.04311239530518651;                 Volume = 0.008197449047118426; VAE = 0.0
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.2918,  0.1641, -0.4639, -0.1792, -0.6035, -0.0408,  0.0094,  0.2689,
        -0.4024, 

  ax.scatter(
  plt.show()
  ylim_max = np.nanmax(self._separability_tracking)*1.1


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.0118,  0.5574,  0.5090, -0.0146, -0.3610, -0.1321, -0.0334,  0.4209,
        -0.6229, -0.5349]) tensor([ 0.0501,  0.8821,  0.7302, -0.2515, -0.5053,  0.1500,  0.0335,  0.2442,
        -0.7632, -0.7155]) tensor([ 0.0569,  0.5164,  0.4339,  0.1233, -0.4543, -0.0552,  0.0430,  0.3251,
        -0.7537, -0.5457])
R[0]
tensor([0.0930], grad_fn=<SelectBackward0>)
LOSSES
T = 0.07285898599773645; R = 0.06624566954560578;                 Gamma = 0.4466731590628624; Q = 0.04029926670715213;
Entropy Neighbor = 0.17464527341723443;                 Entropy Random = 0.04156630584225059;                 Volume = 0.009378733102232218; VAE = 0.0
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.0471,  0.5168,  0.4143,  0.1233, -0.4624, -0.0578,  0.0424,  0.3136,
        -0.7611, 

  ax.scatter(
  plt.show()
  ylim_max = np.nanmax(self._separability_tracking)*1.1


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.4072,  0.1520,  0.2671, -0.1163, -0.3580, -0.1821, -0.0932,  0.3701,
        -0.2154, -0.0441]) tensor([-0.2638,  0.4255,  0.4623, -0.3214, -0.4961,  0.0537, -0.0725,  0.2540,
        -0.2917, -0.1895]) tensor([-0.2916,  0.0743,  0.3241,  0.0354, -0.3929, -0.1180, -0.0354,  0.3639,
        -0.2380, -0.0777])
R[0]
tensor([0.0606], grad_fn=<SelectBackward0>)
LOSSES
T = 0.0724344063848257; R = 0.0676562961731106;                 Gamma = 0.44681812047958375; Q = 0.03978688204754144;
Entropy Neighbor = 0.17483023259043692;                 Entropy Random = 0.0428663519397378;                 Volume = 0.00871304576098919; VAE = 0.0
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.0801,  0.1705,  0.6263,  0.3413, -0.4384, -0.0160,  0.3009,  0.5053,
        -0.3275, -0.


KeyboardInterrupt



## Visualize performance

In [None]:
# Point the agent at a previously saved network (presumably the checkpoint written during
# training -- setNetwork's exact semantics are not visible in this notebook).
# NOTE(review): only the directory part of the path is interpolated; the trailing 'fname'
# is a literal string. Confirm this matches the name used when the network was dumped.
# NOTE(review): nEpoch=50 is hard-coded; the training run above ended with a
# KeyboardInterrupt, so verify that an epoch-50 checkpoint actually exists.
agent.setNetwork(f'{fname}/fname', nEpoch=50)

In [None]:
# Manually put the agent into an episode so it can be stepped by hand in the cells below.
agent._in_episode = True
agent._mode = 0 # Testing mode with plan_depth=0
initState = env.reset(agent._mode)
inputDims = env.inputDimensions()

# For every input whose first dimension is larger than 1 (presumably a history length --
# TODO confirm), seed all but the first slot of the agent's state from the reset state.
for idx, dims in enumerate(inputDims):
    if dims[0] <= 1:
        continue
    agent._state[idx][1:] = initState[idx][1:]

# Fresh per-episode bookkeeping.
agent._Vs_on_last_episode = []
is_terminal = False
reward = 0

### Frame by frame

In [None]:
%matplotlib inline

for i in range(100):
    obs = env.observe()
    _obs = obs[0].reshape((env.WIDTH, env.HEIGHT))
    plt.figure()
    plt.imshow(np.flip(_obs.squeeze()))
    plt.show()
    for i in range(len(obs)):
        agent._state[i][0:-1] = agent._state[i][1:]
        agent._state[i][-1] = obs[i]
    V, action, reward, _ = agent._step()
    print(action)
    agent._Vs_on_last_episode.append(V)
    is_terminal = env.inTerminalState()
    if is_terminal: break

### As animation

In [None]:
%matplotlib notebook

import numpy as np 
import matplotlib.pyplot as plt
from IPython.display import HTML
import matplotlib.animation as animation

fig, ax = plt.subplots(1,1)
obs = env.observe()
_obs = obs[0].reshape((env.WIDTH, env.HEIGHT))
_obs = np.flip(_obs.squeeze())
ax.set_xticks([])
ax.set_yticks([])
im = ax.imshow(np.zeros(_obs.shape))

def init():
    """Draw the first animation frame.

    Clears the current axes and shows the module-level `_obs` image on `ax`.

    Returns:
        A one-element list holding the image artist created by `ax.imshow`.
    """
    plt.cla()
    return [ax.imshow(_obs)]

def animate(i, *args, **kwargs):
    """Draw the current observation and advance the agent by exactly one step.

    Args:
        i: Frame index supplied by `FuncAnimation`; not used by the body.
        *args, **kwargs: Accepted and ignored.

    Returns:
        A one-element list holding the image artist drawn for this frame.
    """
    # Clear the animation's own axes rather than whatever pyplot considers "current".
    ax.cla()
    obs = env.observe()
    frame = np.flip(obs[0].reshape((env.WIDTH, env.HEIGHT)).squeeze())
    im = ax.imshow(frame)
    # Shift each state buffer by one slot and write the newest observation into the last
    # slot. The loop index is deliberately not `i`, which is the frame index parameter.
    for obs_idx, component in enumerate(obs):
        agent._state[obs_idx][0:-1] = agent._state[obs_idx][1:]
        agent._state[obs_idx][-1] = component
    # Step once per frame, after all observation components have been written. Stepping
    # inside the loop above would advance the agent once per component.
    V, action, reward, _ = agent._step()
    agent._Vs_on_last_episode.append(V)
    return [im]

# Build a 100-frame animation driven by `animate`, with `init` drawing the first frame.
ani = animation.FuncAnimation(fig, animate, init_func=init, 
     frames=100, blit=False, repeat=True)
# Make sure the output directory exists before saving; otherwise the save fails when
# figs/<fname>/ has not been created yet. `os` is imported in the notebook's first cell.
os.makedirs(f'figs/{fname}', exist_ok=True)
ani.save(f'figs/{fname}/behavior.gif', writer="ffmpeg", fps = 15)
fig.show()