In [1]:
import sys
import logging
import numpy as np
from joblib import hash, dump, load
import os

from deer.default_parser import process_args
from deer.agent import NeuralAgent
from deer.learning_algos.CRAR_torch import CRAR
from simple_maze_env import MyEnv as simple_maze_env
import deer.experiment.base_controllers as bc

from deer.policies import EpsilonGreedyPolicy

In [2]:
class Defaults:
    # ----------------------
    # Experiment Parameters
    # ----------------------
    steps_per_epoch = 5000
    epochs = 50
    steps_per_test = 1000
    period_btw_summary_perfs = 1
    
    # ----------------------
    # Environment Parameters
    # ----------------------
    frame_skip = 2

    # ----------------------
    # DQN Agent parameters:
    # ----------------------
    update_rule = 'rmsprop'
    learning_rate = 5 * 1E-4 # 1E-4
    learning_rate_decay = 0.9
    discount = 0.9
    discount_inc = 1
    discount_max = 0.99
    rms_decay = 0.9
    rms_epsilon = 0.0001
    momentum = 0
    clip_norm = 1.0
    epsilon_start = 1.0
    epsilon_min = 1.0
    epsilon_decay = 10000
    update_frequency = 1
    replay_memory_size = 1000000 #replacing with 200000 will works just fine (in case you dont have 18gb of memory)
    batch_size = 32
    freeze_interval = 1000
    deterministic = False

higher_dim_obs = True

In [3]:

parameters = Defaults()
if parameters.deterministic:
    rng = np.random.RandomState(123456)
else:
    rng = np.random.RandomState()

# --- Instantiate environment ---
env = simple_maze_env(rng, higher_dim_obs=higher_dim_obs)

# --- Instantiate learning_algo ---
learning_algo = CRAR(
    env,
    parameters.rms_decay,
    parameters.rms_epsilon,
    parameters.momentum,
    parameters.clip_norm,
    parameters.freeze_interval,
    parameters.batch_size,
    parameters.update_rule,
    rng,
    high_int_dim=False,
    internal_dim=2, lr=parameters.learning_rate)

test_policy = EpsilonGreedyPolicy(learning_algo, env.nActions(), rng, 1.)

# --- Instantiate agent ---
agent = NeuralAgent(
    env,
    learning_algo,
    parameters.replay_memory_size,
    max(env.inputDimensions()[i][0] for i in range(len(env.inputDimensions()))),
    parameters.batch_size,
    rng,
    test_policy=test_policy)

# --- Create unique filename for FindBestController ---
h = hash(vars(parameters), hash_name="sha1")
fname = "test_" + h
print("The parameters hash is: {}".format(h))
print("The parameters are: {}".format(parameters))

# As for the discount factor and the learning rate, one can update periodically the parameter of the epsilon-greedy
# policy implemented by the agent. This controllers has a bit more capabilities, as it allows one to choose more
# precisely when to update epsilon: after every X action, episode or epoch. This parameter can also be reset every
# episode or epoch (or never, hence the resetEvery='none').
agent.attach(bc.EpsilonController(
    initial_e=parameters.epsilon_start,
    e_decays=parameters.epsilon_decay,
    e_min=parameters.epsilon_min,
    evaluate_on='action',
    periodicity=1,
    reset_every='none'))

agent.run(10, 500)
print("end gathering data")

# --- Bind controllers to the agent ---
# Before every training epoch (periodicity=1), we want to print a summary of the agent's epsilon, discount and 
# learning rate as well as the training epoch number.
agent.attach(bc.VerboseController(
    evaluate_on='epoch', 
    periodicity=1))

# Every epoch end, one has the possibility to modify the learning rate using a LearningRateController. Here we 
# wish to update the learning rate after every training epoch (periodicity=1), according to the parameters given.
agent.attach(bc.LearningRateController(
    initial_learning_rate=parameters.learning_rate, 
    learning_rate_decay=parameters.learning_rate_decay,
    periodicity=1))

# Same for the discount factor.
agent.attach(bc.DiscountFactorController(
    initial_discount_factor=parameters.discount, 
    discount_factor_growth=parameters.discount_inc, 
    discount_factor_max=parameters.discount_max,
    periodicity=1))

# During training epochs, we want to train the agent after every [parameters.update_frequency] action it takes.
# Plus, we also want to display after each training episode (!= than after every training) the average bellman
# residual and the average of the V values obtained during the last episode, hence the two last arguments.
agent.attach(bc.TrainerController(
    evaluate_on='action', 
    periodicity=parameters.update_frequency, 
    show_episode_avg_V_value=True, 
    show_avg_Bellman_residual=True))

# We wish to discover, among all versions of our neural network (i.e., after every training epoch), which one 
# has the highest validation score.
# To achieve this goal, one can use the FindBestController along with an InterleavedTestEpochControllers. It is 
# important that the validationID is the same than the id argument of the InterleavedTestEpochController.
# The FindBestController will dump on disk the validation scores for each and every network, as well as the 
# structure of the neural network having the best validation score. These dumps can then used to plot the evolution 
# of the validation and test scores (see below) or simply recover the resulting neural network for your 
# application.
agent.attach(bc.FindBestController(
    validationID=simple_maze_env.VALIDATION_MODE,
    testID=None,
    unique_fname=fname))

# All previous controllers control the agent during the epochs it goes through. However, we want to interleave a 
# "validation epoch" between each training epoch. For each validation epoch, we want also to display the sum of all 
# rewards obtained, hence the showScore=True. Finally, we want to call the summarizePerformance method of ALE_env 
# every [parameters.period_btw_summary_perfs] *validation* epochs.
agent.attach(bc.InterleavedTestEpochController(
    id=simple_maze_env.VALIDATION_MODE, 
    epoch_length=parameters.steps_per_test,
    periodicity=1,
    show_score=True,
    summarize_every=1))

The parameters hash is: 62977be8e45d8a56a5537c11dfd5d2fd8dda69e0
The parameters are: <__main__.Defaults object at 0x2ae487dc08b0>
end gathering data


In [4]:
try:
    os.mkdir("params")
except Exception:
    pass
dump(vars(parameters), "params/" + fname + ".jldump")
agent.gathering_data=False
agent.run(parameters.epochs, parameters.steps_per_epoch)

# --- Show results ---
basename = "scores/" + fname
scores = load(basename + "_scores.jldump")
print (scores)

Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 False
Es[0], TEs[0], Esp_[0]
tensor([-0.3075, -0.0738], device='cuda:0') tensor([-0.3935, -0.3524], device='cuda:0') tensor([-0.3075, -0.0738], device='cuda:0')
R[0]
tensor([0.2149], device='cuda:0', grad_fn=<SelectBackward0>)
self.loss_T/500., self.lossR/500., self.loss_gamma/500., self.loss_Q/500., self.loss_disentangle_t/500., self.loss_disambiguate1/500., self.loss_disambiguate2/500.
0.03939761593192816 0.06175614380836487 1.1208807349205017 0.0 0.3985723863840103 0.0007902259081602097 0.0944376584906131
self.loss_interpret/500.
0.0
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 False
Es[0], TEs[0], Esp_[0]
tensor([ 0.5836, -0.9203], device='cuda:0') tensor([ 0.6992, -0.5662], device='cuda:0') tensor([ 0.8374, -0.4729], device='cuda:0')
R[0]
tensor([0.2582], device='cuda:0', grad_fn=<SelectBackward0>)
self.loss_T/500., self.lossR

  line3 = ax.scatter(all_possib_abs_states[length_block[i][0]:length_block[i][1],0], all_possib_abs_states[length_block[i][0]:length_block[i][1],1], c=colors[i], marker='x', edgecolors='k', alpha=0.5, s=100)
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 False
Es[0], TEs[0], Esp_[0]
tensor([-0.6749, -0.1739], device='cuda:0') tensor([-0.6453,  0.2037], device='cuda:0') tensor([-0.6165,  0.2999], device='cuda:0')
R[0]
tensor([0.2096], device='cuda:0', grad_fn=<SelectBackward0>)
self.loss_T/500., self.lossR/500., self.loss_gamma/500., self.loss_Q/500., self.loss_disentangle_t/500., self.loss_disambiguate1/500., self.loss_disambiguate2/500.
0.00721187219908461 0.6784051927104592 1.126071973323822 0.0 0.36924794697761537 0.0002430214136838913 0.05153244132269174
self.loss_interpret/500.
0.0
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 False
Es[0], TEs[0], Esp_[0]
tensor([0.5488, 0.3806], device='cuda:0') tensor([0.5429, 0.6510], device='cuda:0') tensor([0.5047, 0.7238], device='cuda:0')
R[0]
tensor([0.2277], device='cuda:0', grad_fn=<SelectBackward0>)
self.loss_T/500., self.lossR/500.,

  line3 = ax.scatter(all_possib_abs_states[length_block[i][0]:length_block[i][1],0], all_possib_abs_states[length_block[i][0]:length_block[i][1],1], c=colors[i], marker='x', edgecolors='k', alpha=0.5, s=100)
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 False
Es[0], TEs[0], Esp_[0]
tensor([-0.0446,  0.6119], device='cuda:0') tensor([-0.0370,  0.6560], device='cuda:0') tensor([-0.0446,  0.6119], device='cuda:0')
R[0]
tensor([0.2577], device='cuda:0', grad_fn=<SelectBackward0>)
self.loss_T/500., self.lossR/500., self.loss_gamma/500., self.loss_Q/500., self.loss_disentangle_t/500., self.loss_disambiguate1/500., self.loss_disambiguate2/500.
0.0030015448874328287 1.294117451198399 1.1255337076187133 0.0 0.35560623943805697 0.00019803528487682344 0.054517265439033506
self.loss_interpret/500.
0.0
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 False
Es[0], TEs[0], Esp_[0]
tensor([-0.9752,  0.9720], device='cuda:0') tensor([-0.9678,  0.9525], device='cuda:0') tensor([-0.9752,  0.9720], device='cuda:0')
R[0]
tensor([0.2405], device='cuda:0', grad_fn=<SelectBackward0>)
self.loss_T/500., self.l

  line3 = ax.scatter(all_possib_abs_states[length_block[i][0]:length_block[i][1],0], all_possib_abs_states[length_block[i][0]:length_block[i][1],1], c=colors[i], marker='x', edgecolors='k', alpha=0.5, s=100)
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 False
Es[0], TEs[0], Esp_[0]
tensor([-0.9472,  1.0025], device='cuda:0') tensor([-0.9059,  0.9741], device='cuda:0') tensor([-0.9472,  1.0025], device='cuda:0')
R[0]
tensor([0.2014], device='cuda:0', grad_fn=<SelectBackward0>)
self.loss_T/500., self.lossR/500., self.loss_gamma/500., self.loss_Q/500., self.loss_disentangle_t/500., self.loss_disambiguate1/500., self.loss_disambiguate2/500.
0.0023115290683927013 1.9114814739599824 1.1266026513576508 0.0 0.35422515311837194 0.00013620183616876603 0.05257123544532806
self.loss_interpret/500.
0.0
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 False
Es[0], TEs[0], Esp_[0]
tensor([ 0.0867, -0.1065], device='cuda:0') tensor([ 0.0279, -0.4619], device='cuda:0') tensor([-0.0795, -0.5223], device='cuda:0')
R[0]
tensor([0.2637], device='cuda:0', grad_fn=<SelectBackward0>)
self.loss_T/500., self.l

  line3 = ax.scatter(all_possib_abs_states[length_block[i][0]:length_block[i][1],0], all_possib_abs_states[length_block[i][0]:length_block[i][1],1], c=colors[i], marker='x', edgecolors='k', alpha=0.5, s=100)
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 False
Es[0], TEs[0], Esp_[0]
tensor([-0.9867,  1.0015], device='cuda:0') tensor([-1.0051,  1.0090], device='cuda:0') tensor([-0.9867,  1.0015], device='cuda:0')
R[0]
tensor([0.2402], device='cuda:0', grad_fn=<SelectBackward0>)
self.loss_T/500., self.lossR/500., self.loss_gamma/500., self.loss_Q/500., self.loss_disentangle_t/500., self.loss_disambiguate1/500., self.loss_disambiguate2/500.
0.0018245916831074282 2.528501107878983 1.1271438550949098 0.0 0.34952265337109567 0.0001233583241701126 0.05523459839913994
self.loss_interpret/500.
0.0
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 False
Es[0], TEs[0], Esp_[0]
tensor([-0.0456, -0.5467], device='cuda:0') tensor([-0.3037, -0.9731], device='cuda:0') tensor([-0.2943, -0.9914], device='cuda:0')
R[0]
tensor([0.2609], device='cuda:0', grad_fn=<SelectBackward0>)
self.loss_T/500., self.los

  line3 = ax.scatter(all_possib_abs_states[length_block[i][0]:length_block[i][1],0], all_possib_abs_states[length_block[i][0]:length_block[i][1],1], c=colors[i], marker='x', edgecolors='k', alpha=0.5, s=100)
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 False
Es[0], TEs[0], Esp_[0]
tensor([-0.9819,  0.9955], device='cuda:0') tensor([-0.9620,  1.0040], device='cuda:0') tensor([-0.9819,  0.9955], device='cuda:0')
R[0]
tensor([0.2403], device='cuda:0', grad_fn=<SelectBackward0>)
self.loss_T/500., self.lossR/500., self.loss_gamma/500., self.loss_Q/500., self.loss_disentangle_t/500., self.loss_disambiguate1/500., self.loss_disambiguate2/500.
0.0013077952087623998 3.1461189639419316 1.1284418795108795 0.0 0.3552364190816879 9.439931809902191e-05 0.053137013701722025
self.loss_interpret/500.
0.0
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 False
Es[0], TEs[0], Esp_[0]
tensor([-0.1325,  0.9966], device='cuda:0') tensor([0.4503, 0.9535], device='cuda:0') tensor([0.4078, 0.9977], device='cuda:0')
R[0]
tensor([0.2420], device='cuda:0', grad_fn=<SelectBackward0>)
self.loss_T/500., self.lossR/

  line3 = ax.scatter(all_possib_abs_states[length_block[i][0]:length_block[i][1],0], all_possib_abs_states[length_block[i][0]:length_block[i][1],1], c=colors[i], marker='x', edgecolors='k', alpha=0.5, s=100)
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 False
Es[0], TEs[0], Esp_[0]
tensor([-0.3812,  0.9959], device='cuda:0') tensor([-0.3292,  1.0277], device='cuda:0') tensor([-0.3812,  0.9959], device='cuda:0')
R[0]
tensor([0.2374], device='cuda:0', grad_fn=<SelectBackward0>)
self.loss_T/500., self.lossR/500., self.loss_gamma/500., self.loss_Q/500., self.loss_disentangle_t/500., self.loss_disambiguate1/500., self.loss_disambiguate2/500.
0.0009801153781008906 3.7635037332624197 1.1251854200363158 0.0 0.3432532220482826 7.266171276569366e-05 0.051510337892919776
self.loss_interpret/500.
0.0
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 False
Es[0], TEs[0], Esp_[0]
tensor([-0.1942,  0.1609], device='cuda:0') tensor([-0.6237, -0.1903], device='cuda:0') tensor([-0.5794, -0.1331], device='cuda:0')
R[0]
tensor([0.2579], device='cuda:0', grad_fn=<SelectBackward0>)
self.loss_T/500., self.lo

  line3 = ax.scatter(all_possib_abs_states[length_block[i][0]:length_block[i][1],0], all_possib_abs_states[length_block[i][0]:length_block[i][1],1], c=colors[i], marker='x', edgecolors='k', alpha=0.5, s=100)
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 False
Es[0], TEs[0], Esp_[0]
tensor([-0.4748, -0.4679], device='cuda:0') tensor([-0.9810, -0.6087], device='cuda:0') tensor([-0.9959, -0.6572], device='cuda:0')
R[0]
tensor([0.2551], device='cuda:0', grad_fn=<SelectBackward0>)
self.loss_T/500., self.lossR/500., self.loss_gamma/500., self.loss_Q/500., self.loss_disentangle_t/500., self.loss_disambiguate1/500., self.loss_disambiguate2/500.
0.0007821764702675864 4.3811700055524705 1.1261335394382477 0.0 0.34062747672200205 6.723780930042266e-05 0.053362537678331136
self.loss_interpret/500.
0.0
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 False
Es[0], TEs[0], Esp_[0]
tensor([0.5479, 0.5707], device='cuda:0') tensor([0.4448, 0.9914], device='cuda:0') tensor([0.4461, 0.9946], device='cuda:0')
R[0]
tensor([0.2270], device='cuda:0', grad_fn=<SelectBackward0>)
self.loss_T/500., self.lossR/5

  line3 = ax.scatter(all_possib_abs_states[length_block[i][0]:length_block[i][1],0], all_possib_abs_states[length_block[i][0]:length_block[i][1],1], c=colors[i], marker='x', edgecolors='k', alpha=0.5, s=100)
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 False
Es[0], TEs[0], Esp_[0]
tensor([-0.5050, -0.4850], device='cuda:0') tensor([-0.9612, -0.6146], device='cuda:0') tensor([-0.9890, -0.6269], device='cuda:0')
R[0]
tensor([0.2545], device='cuda:0', grad_fn=<SelectBackward0>)
self.loss_T/500., self.lossR/500., self.loss_gamma/500., self.loss_Q/500., self.loss_disentangle_t/500., self.loss_disambiguate1/500., self.loss_disambiguate2/500.
0.0006886455815983936 4.999282288715244 1.1273780322074891 0.0 0.33351966363191604 6.394374370574951e-05 0.05307617162540555
self.loss_interpret/500.
0.0
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 False
Es[0], TEs[0], Esp_[0]
tensor([-0.5106,  0.9933], device='cuda:0') tensor([-0.4847,  1.0105], device='cuda:0') tensor([-0.5106,  0.9933], device='cuda:0')
R[0]
tensor([0.2350], device='cuda:0', grad_fn=<SelectBackward0>)
self.loss_T/500., self.los

  line3 = ax.scatter(all_possib_abs_states[length_block[i][0]:length_block[i][1],0], all_possib_abs_states[length_block[i][0]:length_block[i][1],1], c=colors[i], marker='x', edgecolors='k', alpha=0.5, s=100)
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 False
Es[0], TEs[0], Esp_[0]
tensor([-0.9885, -0.6410], device='cuda:0') tensor([-0.9675, -0.2568], device='cuda:0') tensor([-0.9916, -0.2870], device='cuda:0')
R[0]
tensor([0.2070], device='cuda:0', grad_fn=<SelectBackward0>)
self.loss_T/500., self.lossR/500., self.loss_gamma/500., self.loss_Q/500., self.loss_disentangle_t/500., self.loss_disambiguate1/500., self.loss_disambiguate2/500.
0.0007204109826125204 5.617001368716359 1.1249602150917053 0.0 0.33833376376330854 4.4145114719867705e-05 0.05276603698870167
self.loss_interpret/500.
0.0
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 False
Es[0], TEs[0], Esp_[0]
tensor([-0.5025, -0.5166], device='cuda:0') tensor([-0.4591, -0.4682], device='cuda:0') tensor([-0.5025, -0.5166], device='cuda:0')
R[0]
tensor([0.2348], device='cuda:0', grad_fn=<SelectBackward0>)
self.loss_T/500., self.lo

  line3 = ax.scatter(all_possib_abs_states[length_block[i][0]:length_block[i][1],0], all_possib_abs_states[length_block[i][0]:length_block[i][1],1], c=colors[i], marker='x', edgecolors='k', alpha=0.5, s=100)
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 False
Es[0], TEs[0], Esp_[0]
tensor([ 0.2437, -0.0249], device='cuda:0') tensor([-0.0894,  0.3928], device='cuda:0') tensor([-0.1000,  0.4128], device='cuda:0')
R[0]
tensor([0.2659], device='cuda:0', grad_fn=<SelectBackward0>)
self.loss_T/500., self.lossR/500., self.loss_gamma/500., self.loss_Q/500., self.loss_disentangle_t/500., self.loss_disambiguate1/500., self.loss_disambiguate2/500.
0.0005973416092747356 6.235621145941317 1.1270201704502105 0.0 0.33125979226827623 4.0727198123931884e-05 0.054192759522236886
self.loss_interpret/500.
0.0
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 False
Es[0], TEs[0], Esp_[0]
tensor([-0.4740, -0.5460], device='cuda:0') tensor([-0.4223, -0.0895], device='cuda:0') tensor([-0.4873, -0.1198], device='cuda:0')
R[0]
tensor([0.2140], device='cuda:0', grad_fn=<SelectBackward0>)
self.loss_T/500., self.l

  line3 = ax.scatter(all_possib_abs_states[length_block[i][0]:length_block[i][1],0], all_possib_abs_states[length_block[i][0]:length_block[i][1],1], c=colors[i], marker='x', edgecolors='k', alpha=0.5, s=100)
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 False
Es[0], TEs[0], Esp_[0]
tensor([-0.9951,  0.1388], device='cuda:0') tensor([-0.9854, -0.2594], device='cuda:0') tensor([-0.9927, -0.2597], device='cuda:0')
R[0]
tensor([0.2379], device='cuda:0', grad_fn=<SelectBackward0>)
self.loss_T/500., self.lossR/500., self.loss_gamma/500., self.loss_Q/500., self.loss_disentangle_t/500., self.loss_disambiguate1/500., self.loss_disambiguate2/500.
0.0005003206431283615 6.854288552567363 1.1248030023574829 0.0 0.3312476730644703 4.2681559920310976e-05 0.05642190799023956
self.loss_interpret/500.
0.0
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 False
Es[0], TEs[0], Esp_[0]
tensor([ 0.9972, -0.9980], device='cuda:0') tensor([ 0.1049, -0.9793], device='cuda:0') tensor([ 0.1334, -0.9981], device='cuda:0')
R[0]
tensor([0.2826], device='cuda:0', grad_fn=<SelectBackward0>)
self.loss_T/500., self.los

  line3 = ax.scatter(all_possib_abs_states[length_block[i][0]:length_block[i][1],0], all_possib_abs_states[length_block[i][0]:length_block[i][1],1], c=colors[i], marker='x', edgecolors='k', alpha=0.5, s=100)
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 False
Es[0], TEs[0], Esp_[0]
tensor([ 0.9963, -0.9958], device='cuda:0') tensor([ 0.1184, -1.0050], device='cuda:0') tensor([ 0.1249, -0.9996], device='cuda:0')
R[0]
tensor([0.2826], device='cuda:0', grad_fn=<SelectBackward0>)
self.loss_T/500., self.lossR/500., self.loss_gamma/500., self.loss_Q/500., self.loss_disentangle_t/500., self.loss_disambiguate1/500., self.loss_disambiguate2/500.
0.00036800669771037064 7.472877452149987 1.1263760344982148 0.0 0.3349460007250309 3.675509989261627e-05 0.05223900036141276
self.loss_interpret/500.
0.0
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 False
Es[0], TEs[0], Esp_[0]
tensor([-0.6658, -0.9981], device='cuda:0') tensor([-0.5094, -0.5309], device='cuda:0') tensor([-0.4880, -0.5587], device='cuda:0')
R[0]
tensor([0.2129], device='cuda:0', grad_fn=<SelectBackward0>)
self.loss_T/500., self.los

  line3 = ax.scatter(all_possib_abs_states[length_block[i][0]:length_block[i][1],0], all_possib_abs_states[length_block[i][0]:length_block[i][1],1], c=colors[i], marker='x', edgecolors='k', alpha=0.5, s=100)
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 False
Es[0], TEs[0], Esp_[0]
tensor([ 1.0008, -0.9965], device='cuda:0') tensor([ 1.0067, -0.9967], device='cuda:0') tensor([ 1.0008, -0.9965], device='cuda:0')
R[0]
tensor([0.2857], device='cuda:0', grad_fn=<SelectBackward0>)
self.loss_T/500., self.lossR/500., self.loss_gamma/500., self.loss_Q/500., self.loss_disentangle_t/500., self.loss_disambiguate1/500., self.loss_disambiguate2/500.
0.00033131908666109665 8.091422293543816 1.127376171350479 0.0 0.33711891770362856 3.0044019222259522e-05 0.05377027974277735
self.loss_interpret/500.
0.0
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 False
Es[0], TEs[0], Esp_[0]
tensor([0.9984, 0.7186], device='cuda:0') tensor([1.0081, 0.7141], device='cuda:0') tensor([0.9984, 0.7186], device='cuda:0')
R[0]
tensor([0.2626], device='cuda:0', grad_fn=<SelectBackward0>)
self.loss_T/500., self.lossR/50

  line3 = ax.scatter(all_possib_abs_states[length_block[i][0]:length_block[i][1],0], all_possib_abs_states[length_block[i][0]:length_block[i][1],1], c=colors[i], marker='x', edgecolors='k', alpha=0.5, s=100)
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 False
Es[0], TEs[0], Esp_[0]
tensor([ 1.0002, -0.4493], device='cuda:0') tensor([ 0.6159, -0.9894], device='cuda:0') tensor([ 0.6606, -0.9971], device='cuda:0')
R[0]
tensor([0.2780], device='cuda:0', grad_fn=<SelectBackward0>)
self.loss_T/500., self.lossR/500., self.loss_gamma/500., self.loss_Q/500., self.loss_disentangle_t/500., self.loss_disambiguate1/500., self.loss_disambiguate2/500.
0.00029463074504747054 8.709807983152569 1.127690553188324 0.0 0.33226648753881455 2.9813066124916077e-05 0.051759881312958896
self.loss_interpret/500.
0.0
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 False
Es[0], TEs[0], Esp_[0]
tensor([ 0.1245, -0.9981], device='cuda:0') tensor([ 0.6472, -0.9977], device='cuda:0') tensor([ 0.6460, -1.0003], device='cuda:0')
R[0]
tensor([0.2254], device='cuda:0', grad_fn=<SelectBackward0>)
self.loss_T/500., self.l

  line3 = ax.scatter(all_possib_abs_states[length_block[i][0]:length_block[i][1],0], all_possib_abs_states[length_block[i][0]:length_block[i][1],1], c=colors[i], marker='x', edgecolors='k', alpha=0.5, s=100)
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 False
Es[0], TEs[0], Esp_[0]
tensor([-0.9998, -0.2664], device='cuda:0') tensor([-0.9923, -0.2265], device='cuda:0') tensor([-0.9998, -0.2664], device='cuda:0')
R[0]
tensor([0.2428], device='cuda:0', grad_fn=<SelectBackward0>)
self.loss_T/500., self.lossR/500., self.loss_gamma/500., self.loss_Q/500., self.loss_disentangle_t/500., self.loss_disambiguate1/500., self.loss_disambiguate2/500.
0.0002673024709802121 9.328303560301661 1.125468868970871 0.0 0.32512663048505785 2.7903996407985686e-05 0.05298878743778914
self.loss_interpret/500.
0.0
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 False
Es[0], TEs[0], Esp_[0]
tensor([ 0.9955, -0.9984], device='cuda:0') tensor([ 0.9967, -0.9935], device='cuda:0') tensor([ 0.9955, -0.9984], device='cuda:0')
R[0]
tensor([0.2856], device='cuda:0', grad_fn=<SelectBackward0>)
self.loss_T/500., self.los

  line3 = ax.scatter(all_possib_abs_states[length_block[i][0]:length_block[i][1],0], all_possib_abs_states[length_block[i][0]:length_block[i][1],1], c=colors[i], marker='x', edgecolors='k', alpha=0.5, s=100)
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 False
Es[0], TEs[0], Esp_[0]
tensor([-0.2377,  0.9992], device='cuda:0') tensor([0.5807, 0.9952], device='cuda:0') tensor([0.5928, 0.9988], device='cuda:0')
R[0]
tensor([0.2400], device='cuda:0', grad_fn=<SelectBackward0>)
self.loss_T/500., self.lossR/500., self.loss_gamma/500., self.loss_Q/500., self.loss_disentangle_t/500., self.loss_disambiguate1/500., self.loss_disambiguate2/500.
0.00024849635538703297 9.946913719333708 1.1252140936851502 0.0 0.3343599790930748 2.5017671287059784e-05 0.05016198453772813
self.loss_interpret/500.
0.0
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 False
Es[0], TEs[0], Esp_[0]
tensor([-0.3838, -0.1510], device='cuda:0') tensor([-0.0670,  0.6095], device='cuda:0') tensor([-0.0325,  0.5868], device='cuda:0')
R[0]
tensor([0.2377], device='cuda:0', grad_fn=<SelectBackward0>)
self.loss_T/500., self.lossR/

  line3 = ax.scatter(all_possib_abs_states[length_block[i][0]:length_block[i][1],0], all_possib_abs_states[length_block[i][0]:length_block[i][1],1], c=colors[i], marker='x', edgecolors='k', alpha=0.5, s=100)
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 False
Es[0], TEs[0], Esp_[0]
tensor([9.9885e-01, 9.0645e-05], device='cuda:0') tensor([0.9919, 0.3822], device='cuda:0') tensor([0.9986, 0.3860], device='cuda:0')
R[0]
tensor([0.2370], device='cuda:0', grad_fn=<SelectBackward0>)
self.loss_T/500., self.lossR/500., self.loss_gamma/500., self.loss_Q/500., self.loss_disentangle_t/500., self.loss_disambiguate1/500., self.loss_disambiguate2/500.
0.00022771277610445396 10.566384330578149 1.1269272525310516 0.0 0.32958226472139357 1.7666429281234742e-05 0.054265123667195436
self.loss_interpret/500.
0.0
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 False
Es[0], TEs[0], Esp_[0]
tensor([0.3715, 0.5873], device='cuda:0') tensor([0.2396, 0.0293], device='cuda:0') tensor([0.2236, 0.0430], device='cuda:0')
R[0]
tensor([0.2685], device='cuda:0', grad_fn=<SelectBackward0>)
self.loss_T/500., self.los

  line3 = ax.scatter(all_possib_abs_states[length_block[i][0]:length_block[i][1],0], all_possib_abs_states[length_block[i][0]:length_block[i][1],1], c=colors[i], marker='x', edgecolors='k', alpha=0.5, s=100)
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 False
Es[0], TEs[0], Esp_[0]
tensor([ 0.9981, -0.9993], device='cuda:0') tensor([ 0.9965, -0.4735], device='cuda:0') tensor([ 0.9977, -0.4615], device='cuda:0')
R[0]
tensor([0.2408], device='cuda:0', grad_fn=<SelectBackward0>)
self.loss_T/500., self.lossR/500., self.loss_gamma/500., self.loss_Q/500., self.loss_disentangle_t/500., self.loss_disambiguate1/500., self.loss_disambiguate2/500.
0.00020472394020180218 11.184885167099536 1.1258387310504914 0.0 0.32596287813782693 1.9441775977611542e-05 0.052152340402826666
self.loss_interpret/500.
0.0
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 False
Es[0], TEs[0], Esp_[0]
tensor([-0.3804,  0.2128], device='cuda:0') tensor([-0.3848,  0.2136], device='cuda:0') tensor([-0.3804,  0.2128], device='cuda:0')
R[0]
tensor([0.2379], device='cuda:0', grad_fn=<SelectBackward0>)
self.loss_T/500., self


KeyboardInterrupt

