In [1]:
import sys
import logging
import numpy as np
from joblib import hash, dump, load
import os

from deer.default_parser import process_args
from deer.agent import NeuralAgent
from deer.learning_algos.CRAR_torch import CRAR
from simple_maze_env_3d import MyEnv as simple_maze_env
import deer.experiment.base_controllers as bc

from deer.policies import EpsilonGreedyPolicy

In [2]:
class Defaults:
    # ----------------------
    # Experiment Parameters
    # ----------------------
    steps_per_epoch = 5000
    epochs = 50
    steps_per_test = 1000
    period_btw_summary_perfs = 1
    
    # ----------------------
    # Environment Parameters
    # ----------------------
    frame_skip = 2

    # ----------------------
    # DQN Agent parameters:
    # ----------------------
    update_rule = 'rmsprop'
    learning_rate = 5 * 1E-4 # 1E-4
    learning_rate_decay = 0.9
    discount = 0.9
    discount_inc = 1
    discount_max = 0.99
    rms_decay = 0.9
    rms_epsilon = 0.0001
    momentum = 0
    clip_norm = 1.0
    epsilon_start = 1.0
    epsilon_min = 1.0
    epsilon_decay = 10000
    update_frequency = 1
    replay_memory_size = 1000000 #replacing with 200000 will works just fine (in case you dont have 18gb of memory)
    batch_size = 32
    freeze_interval = 1000
    deterministic = False

higher_dim_obs = True

In [3]:

parameters = Defaults()
if parameters.deterministic:
    rng = np.random.RandomState(123456)
else:
    rng = np.random.RandomState()

# --- Instantiate environment ---
env = simple_maze_env(rng, higher_dim_obs=higher_dim_obs)

# --- Instantiate learning_algo ---
learning_algo = CRAR(
    env,
    parameters.rms_decay,
    parameters.rms_epsilon,
    parameters.momentum,
    parameters.clip_norm,
    parameters.freeze_interval,
    parameters.batch_size,
    parameters.update_rule,
    rng,
    high_int_dim=False,
    internal_dim=3, lr=parameters.learning_rate)

test_policy = EpsilonGreedyPolicy(learning_algo, env.nActions(), rng, 1.)

# --- Instantiate agent ---
agent = NeuralAgent(
    env,
    learning_algo,
    parameters.replay_memory_size,
    max(env.inputDimensions()[i][0] for i in range(len(env.inputDimensions()))),
    parameters.batch_size,
    rng,
    test_policy=test_policy)

# --- Create unique filename for FindBestController ---
h = hash(vars(parameters), hash_name="sha1")
fname = "test_" + h
print("The parameters hash is: {}".format(h))
print("The parameters are: {}".format(parameters))

# As for the discount factor and the learning rate, one can update periodically the parameter of the epsilon-greedy
# policy implemented by the agent. This controllers has a bit more capabilities, as it allows one to choose more
# precisely when to update epsilon: after every X action, episode or epoch. This parameter can also be reset every
# episode or epoch (or never, hence the resetEvery='none').
agent.attach(bc.EpsilonController(
    initial_e=parameters.epsilon_start,
    e_decays=parameters.epsilon_decay,
    e_min=parameters.epsilon_min,
    evaluate_on='action',
    periodicity=1,
    reset_every='none'))

agent.run(10, 500)
print("end gathering data")

# --- Bind controllers to the agent ---
# Before every training epoch (periodicity=1), we want to print a summary of the agent's epsilon, discount and 
# learning rate as well as the training epoch number.
agent.attach(bc.VerboseController(
    evaluate_on='epoch', 
    periodicity=1))

# Every epoch end, one has the possibility to modify the learning rate using a LearningRateController. Here we 
# wish to update the learning rate after every training epoch (periodicity=1), according to the parameters given.
agent.attach(bc.LearningRateController(
    initial_learning_rate=parameters.learning_rate, 
    learning_rate_decay=parameters.learning_rate_decay,
    periodicity=1))

# Same for the discount factor.
agent.attach(bc.DiscountFactorController(
    initial_discount_factor=parameters.discount, 
    discount_factor_growth=parameters.discount_inc, 
    discount_factor_max=parameters.discount_max,
    periodicity=1))

# During training epochs, we want to train the agent after every [parameters.update_frequency] action it takes.
# Plus, we also want to display after each training episode (!= than after every training) the average bellman
# residual and the average of the V values obtained during the last episode, hence the two last arguments.
agent.attach(bc.TrainerController(
    evaluate_on='action', 
    periodicity=parameters.update_frequency, 
    show_episode_avg_V_value=True, 
    show_avg_Bellman_residual=True))

# We wish to discover, among all versions of our neural network (i.e., after every training epoch), which one 
# has the highest validation score.
# To achieve this goal, one can use the FindBestController along with an InterleavedTestEpochControllers. It is 
# important that the validationID is the same than the id argument of the InterleavedTestEpochController.
# The FindBestController will dump on disk the validation scores for each and every network, as well as the 
# structure of the neural network having the best validation score. These dumps can then used to plot the evolution 
# of the validation and test scores (see below) or simply recover the resulting neural network for your 
# application.
agent.attach(bc.FindBestController(
    validationID=simple_maze_env.VALIDATION_MODE,
    testID=None,
    unique_fname=fname))

# All previous controllers control the agent during the epochs it goes through. However, we want to interleave a 
# "validation epoch" between each training epoch. For each validation epoch, we want also to display the sum of all 
# rewards obtained, hence the showScore=True. Finally, we want to call the summarizePerformance method of ALE_env 
# every [parameters.period_btw_summary_perfs] *validation* epochs.
agent.attach(bc.InterleavedTestEpochController(
    id=simple_maze_env.VALIDATION_MODE, 
    epoch_length=parameters.steps_per_test,
    periodicity=1,
    show_score=True,
    summarize_every=1))

The parameters hash is: 62977be8e45d8a56a5537c11dfd5d2fd8dda69e0
The parameters are: <__main__.Defaults object at 0x2b5f7b8d4760>
end gathering data


In [4]:
try:
    os.mkdir("params")
except Exception:
    pass
dump(vars(parameters), "params/" + fname + ".jldump")
agent.gathering_data=False
agent.run(parameters.epochs, parameters.steps_per_epoch)

# --- Show results ---
basename = "scores/" + fname
scores = load(basename + "_scores.jldump")
print (scores)

Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 False
Es[0], TEs[0], Esp_[0]
tensor([ 0.3041, -0.2686,  0.0520], device='cuda:0') tensor([ 0.0486, -0.1000,  0.1473], device='cuda:0') tensor([ 0.3041, -0.2686,  0.0520], device='cuda:0')
R[0]
tensor([-0.0246], device='cuda:0', grad_fn=<SelectBackward0>)
self.loss_T/500., self.lossR/500., self.loss_gamma/500., self.loss_Q/500., self.loss_disentangle_t/500., self.loss_disambiguate1/500., self.loss_disambiguate2/500.
0.045375505048781635 0.00172343521239236 0.5153328900933266 0.0 0.39091637200117113 0.0005784320831298828 0.08760981340147554
self.loss_interpret/500.
0.0
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 False
Es[0], TEs[0], Esp_[0]
tensor([0.7271, 0.1446, 0.9784], device='cuda:0') tensor([0.7067, 0.1152, 0.9301], device='cuda:0') tensor([ 0.9602, -0.1826,  0.8331], device='cuda:0')
R[0]
tensor([0.0765], device='cuda:0', gra

  line3 = ax.scatter(all_possib_abs_states[length_block[i][0]:length_block[i][1],0], all_possib_abs_states[length_block[i][0]:length_block[i][1],1] ,all_possib_abs_states[length_block[i][0]:length_block[i][1],2], marker='x', depthshade=True, edgecolors='k', alpha=0.5, s=50)
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 False
Es[0], TEs[0], Esp_[0]
tensor([ 0.0731, -0.5977, -0.3837], device='cuda:0') tensor([ 0.0708, -0.4784, -0.8269], device='cuda:0') tensor([ 0.1258, -0.3885, -0.9260], device='cuda:0')
R[0]
tensor([0.0099], device='cuda:0', grad_fn=<SelectBackward0>)
self.loss_T/500., self.lossR/500., self.loss_gamma/500., self.loss_Q/500., self.loss_disentangle_t/500., self.loss_disambiguate1/500., self.loss_disambiguate2/500.
0.01073353178333491 0.02133051643613726 0.5076456690430641 0.0 0.3187966531515121 8.650402724742889e-05 0.04269625455187634
self.loss_interpret/500.
0.0
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 False
Es[0], TEs[0], Esp_[0]
tensor([-0.0676,  0.6945,  0.4900], device='cuda:0') tensor([-0.4580,  0.8535,  0.5168], device='cuda:0') tensor([-0.4935,  0.9435,  0.4327], device='cuda:0')
R[0]
tensor([0.0827], device='cuda:0', 

  line3 = ax.scatter(all_possib_abs_states[length_block[i][0]:length_block[i][1],0], all_possib_abs_states[length_block[i][0]:length_block[i][1],1] ,all_possib_abs_states[length_block[i][0]:length_block[i][1],2], marker='x', depthshade=True, edgecolors='k', alpha=0.5, s=50)
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 False
Es[0], TEs[0], Esp_[0]
tensor([-0.1760,  0.7437,  0.3088], device='cuda:0') tensor([-0.6002,  0.9112,  0.3239], device='cuda:0') tensor([-0.5563,  0.9711,  0.2321], device='cuda:0')
R[0]
tensor([0.0747], device='cuda:0', grad_fn=<SelectBackward0>)
self.loss_T/500., self.lossR/500., self.loss_gamma/500., self.loss_Q/500., self.loss_disentangle_t/500., self.loss_disambiguate1/500., self.loss_disambiguate2/500.
0.005625246350187808 0.0432806845495943 0.5042021257281304 0.0 0.3080396408587694 6.742816418409348e-05 0.04070470316521824
self.loss_interpret/500.
0.0
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 False
Es[0], TEs[0], Esp_[0]
tensor([-0.7719,  0.4087, -0.9900], device='cuda:0') tensor([-0.7978,  0.3004, -0.3477], device='cuda:0') tensor([-0.7763,  0.2424, -0.3181], device='cuda:0')
R[0]
tensor([-0.0486], device='cuda:0',

  line3 = ax.scatter(all_possib_abs_states[length_block[i][0]:length_block[i][1],0], all_possib_abs_states[length_block[i][0]:length_block[i][1],1] ,all_possib_abs_states[length_block[i][0]:length_block[i][1],2], marker='x', depthshade=True, edgecolors='k', alpha=0.5, s=50)
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 False
Es[0], TEs[0], Esp_[0]
tensor([-0.1300, -0.1477,  0.8656], device='cuda:0') tensor([ 0.0206, -0.1651,  0.8694], device='cuda:0') tensor([-0.1300, -0.1477,  0.8656], device='cuda:0')
R[0]
tensor([0.0791], device='cuda:0', grad_fn=<SelectBackward0>)
self.loss_T/500., self.lossR/500., self.loss_gamma/500., self.loss_Q/500., self.loss_disentangle_t/500., self.loss_disambiguate1/500., self.loss_disambiguate2/500.
0.004123851084616035 0.06752809581113979 0.501888462126255 0.0 0.29914637421071527 3.97854819893837e-05 0.03636895095300861
self.loss_interpret/500.
0.0
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 False
Es[0], TEs[0], Esp_[0]
tensor([ 0.8211, -0.0781,  0.9771], device='cuda:0') tensor([ 0.8163, -0.0716,  0.9751], device='cuda:0') tensor([ 0.8211, -0.0781,  0.9771], device='cuda:0')
R[0]
tensor([0.0266], device='cuda:0', 

  line3 = ax.scatter(all_possib_abs_states[length_block[i][0]:length_block[i][1],0], all_possib_abs_states[length_block[i][0]:length_block[i][1],1] ,all_possib_abs_states[length_block[i][0]:length_block[i][1],2], marker='x', depthshade=True, edgecolors='k', alpha=0.5, s=50)
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 False
Es[0], TEs[0], Esp_[0]
tensor([-0.9930,  0.9666, -0.9833], device='cuda:0') tensor([-0.9408,  0.7842, -0.2498], device='cuda:0') tensor([-0.9939,  0.7508, -0.2781], device='cuda:0')
R[0]
tensor([-0.0310], device='cuda:0', grad_fn=<SelectBackward0>)
self.loss_T/500., self.lossR/500., self.loss_gamma/500., self.loss_Q/500., self.loss_disentangle_t/500., self.loss_disambiguate1/500., self.loss_disambiguate2/500.
0.0022864353298209608 0.09335739556630142 0.5025636056661605 0.0 0.2865214288532734 1.778838038444519e-05 0.03714350513846148
self.loss_interpret/500.
0.0
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 False
Es[0], TEs[0], Esp_[0]
tensor([-0.8412, -0.7166,  0.9778], device='cuda:0') tensor([-1.0010, -0.0561,  0.7924], device='cuda:0') tensor([-0.9795, -0.0378,  0.8185], device='cuda:0')
R[0]
tensor([0.0498], device='cuda:0

  line3 = ax.scatter(all_possib_abs_states[length_block[i][0]:length_block[i][1],0], all_possib_abs_states[length_block[i][0]:length_block[i][1],1] ,all_possib_abs_states[length_block[i][0]:length_block[i][1],2], marker='x', depthshade=True, edgecolors='k', alpha=0.5, s=50)
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 False
Es[0], TEs[0], Esp_[0]
tensor([0.9708, 0.0376, 0.9558], device='cuda:0') tensor([0.7752, 0.6859, 0.9123], device='cuda:0') tensor([0.8417, 0.7630, 0.8717], device='cuda:0')
R[0]
tensor([0.0978], device='cuda:0', grad_fn=<SelectBackward0>)
self.loss_T/500., self.lossR/500., self.loss_gamma/500., self.loss_Q/500., self.loss_disentangle_t/500., self.loss_disambiguate1/500., self.loss_disambiguate2/500.
0.001168591639958322 0.11881998091400602 0.505470507144928 0.0 0.2863365162163973 1.5209555625915528e-05 0.03561378766794223
self.loss_interpret/500.
0.0
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 False
Es[0], TEs[0], Esp_[0]
tensor([-0.9877,  0.9858, -0.9796], device='cuda:0') tensor([-0.9991,  0.9917, -0.9962], device='cuda:0') tensor([-0.9877,  0.9858, -0.9796], device='cuda:0')
R[0]
tensor([0.0021], device='cuda:0', grad_fn=

  line3 = ax.scatter(all_possib_abs_states[length_block[i][0]:length_block[i][1],0], all_possib_abs_states[length_block[i][0]:length_block[i][1],1] ,all_possib_abs_states[length_block[i][0]:length_block[i][1],2], marker='x', depthshade=True, edgecolors='k', alpha=0.5, s=50)
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 False
Es[0], TEs[0], Esp_[0]
tensor([-0.0022, -0.0886,  0.4361], device='cuda:0') tensor([-0.9589, -0.9768,  0.9991], device='cuda:0') tensor([-0.9573, -0.9803,  0.9924], device='cuda:0')
R[0]
tensor([0.0521], device='cuda:0', grad_fn=<SelectBackward0>)
self.loss_T/500., self.lossR/500., self.loss_gamma/500., self.loss_Q/500., self.loss_disentangle_t/500., self.loss_disambiguate1/500., self.loss_disambiguate2/500.
0.0006576709795626811 0.1436564489842858 0.50698130351305 0.0 0.2914521096497774 7.293276488780975e-06 0.037123283612716475
self.loss_interpret/500.
0.0
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 False
Es[0], TEs[0], Esp_[0]
tensor([ 0.9885, -0.4582, -0.0110], device='cuda:0') tensor([ 1.0022,  0.1345, -0.2947], device='cuda:0') tensor([ 0.9905,  0.1520, -0.2951], device='cuda:0')
R[0]
tensor([0.0464], device='cuda:0', 

  line3 = ax.scatter(all_possib_abs_states[length_block[i][0]:length_block[i][1],0], all_possib_abs_states[length_block[i][0]:length_block[i][1],1] ,all_possib_abs_states[length_block[i][0]:length_block[i][1],2], marker='x', depthshade=True, edgecolors='k', alpha=0.5, s=50)
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 False
Es[0], TEs[0], Esp_[0]
tensor([0.9853, 0.2371, 0.9749], device='cuda:0') tensor([ 1.0175,  0.2745, -0.5455], device='cuda:0') tensor([ 0.9999,  0.2452, -0.5682], device='cuda:0')
R[0]
tensor([0.0941], device='cuda:0', grad_fn=<SelectBackward0>)
self.loss_T/500., self.lossR/500., self.loss_gamma/500., self.loss_Q/500., self.loss_disentangle_t/500., self.loss_disambiguate1/500., self.loss_disambiguate2/500.
0.00039953275601146743 0.16720929403335322 0.5098844120502471 0.0 0.2857254088446498 8.525736629962922e-06 0.03639048654783983
self.loss_interpret/500.
0.0
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 False
Es[0], TEs[0], Esp_[0]
tensor([ 0.3732, -0.9506,  0.8980], device='cuda:0') tensor([ 0.3790, -0.9488, -0.0212], device='cuda:0') tensor([ 0.3814, -0.9481, -0.0237], device='cuda:0')
R[0]
tensor([0.0651], device='cuda:0', 

  line3 = ax.scatter(all_possib_abs_states[length_block[i][0]:length_block[i][1],0], all_possib_abs_states[length_block[i][0]:length_block[i][1],1] ,all_possib_abs_states[length_block[i][0]:length_block[i][1],2], marker='x', depthshade=True, edgecolors='k', alpha=0.5, s=50)
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 False
Es[0], TEs[0], Esp_[0]
tensor([-0.9991, -0.2287,  0.9537], device='cuda:0') tensor([-0.9806, -0.2207,  0.9608], device='cuda:0') tensor([-0.9991, -0.2287,  0.9537], device='cuda:0')
R[0]
tensor([0.0017], device='cuda:0', grad_fn=<SelectBackward0>)
self.loss_T/500., self.lossR/500., self.loss_gamma/500., self.loss_Q/500., self.loss_disentangle_t/500., self.loss_disambiguate1/500., self.loss_disambiguate2/500.
0.00027951776777626944 0.19000790377275553 0.5100173633098602 0.0 0.28536923784017565 7.827423512935638e-06 0.03472062071081018
self.loss_interpret/500.
0.0
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 False
Es[0], TEs[0], Esp_[0]
tensor([-0.3261, -0.9828,  0.0325], device='cuda:0') tensor([-0.3279, -0.9727,  0.9795], device='cuda:0') tensor([-0.3503, -0.9856,  0.9963], device='cuda:0')
R[0]
tensor([-0.0389], device='cuda

  line3 = ax.scatter(all_possib_abs_states[length_block[i][0]:length_block[i][1],0], all_possib_abs_states[length_block[i][0]:length_block[i][1],1] ,all_possib_abs_states[length_block[i][0]:length_block[i][1],2], marker='x', depthshade=True, edgecolors='k', alpha=0.5, s=50)
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 False
Es[0], TEs[0], Esp_[0]
tensor([-0.0051, -0.0355,  0.3979], device='cuda:0') tensor([ 0.9852,  0.9699, -0.9929], device='cuda:0') tensor([ 0.9928,  0.9744, -0.9976], device='cuda:0')
R[0]
tensor([-0.0102], device='cuda:0', grad_fn=<SelectBackward0>)
self.loss_T/500., self.lossR/500., self.loss_gamma/500., self.loss_Q/500., self.loss_disentangle_t/500., self.loss_disambiguate1/500., self.loss_disambiguate2/500.
0.00020143249152170027 0.21319542100338731 0.5090062251091003 0.0 0.2863452959358692 6.669744849205017e-06 0.036606666790059535
self.loss_interpret/500.
0.0
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 False
Es[0], TEs[0], Esp_[0]
tensor([-0.9476, -0.0472, -0.9886], device='cuda:0') tensor([-0.9228, -0.0452, -1.0000], device='cuda:0') tensor([-0.9476, -0.0472, -0.9886], device='cuda:0')
R[0]
tensor([-0.0171], device='cud

  line3 = ax.scatter(all_possib_abs_states[length_block[i][0]:length_block[i][1],0], all_possib_abs_states[length_block[i][0]:length_block[i][1],1] ,all_possib_abs_states[length_block[i][0]:length_block[i][1],2], marker='x', depthshade=True, edgecolors='k', alpha=0.5, s=50)
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 False
Es[0], TEs[0], Esp_[0]
tensor([-0.9885,  0.9985,  0.2663], device='cuda:0') tensor([-1.0196,  0.9693,  0.9841], device='cuda:0') tensor([-0.9972,  0.9673,  0.9973], device='cuda:0')
R[0]
tensor([0.0099], device='cuda:0', grad_fn=<SelectBackward0>)
self.loss_T/500., self.lossR/500., self.loss_gamma/500., self.loss_Q/500., self.loss_disentangle_t/500., self.loss_disambiguate1/500., self.loss_disambiguate2/500.
0.0001686312256133533 0.2366870545297861 0.5084854107499123 0.0 0.2820809602588415 7.451705634593963e-06 0.0354491137630539
self.loss_interpret/500.
0.0
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 False
Es[0], TEs[0], Esp_[0]
tensor([ 0.3873, -0.9504,  0.9823], device='cuda:0') tensor([ 1.0139, -1.0133,  0.6102], device='cuda:0') tensor([ 0.9991, -0.9937,  0.5941], device='cuda:0')
R[0]
tensor([0.0727], device='cuda:0', 

  line3 = ax.scatter(all_possib_abs_states[length_block[i][0]:length_block[i][1],0], all_possib_abs_states[length_block[i][0]:length_block[i][1],1] ,all_possib_abs_states[length_block[i][0]:length_block[i][1],2], marker='x', depthshade=True, edgecolors='k', alpha=0.5, s=50)
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 False
Es[0], TEs[0], Esp_[0]
tensor([0.3333, 0.9941, 0.5435], device='cuda:0') tensor([-0.3719,  0.9876,  0.9383], device='cuda:0') tensor([-0.3422,  0.9900,  0.9652], device='cuda:0')
R[0]
tensor([0.0966], device='cuda:0', grad_fn=<SelectBackward0>)
self.loss_T/500., self.lossR/500., self.loss_gamma/500., self.loss_Q/500., self.loss_disentangle_t/500., self.loss_disambiguate1/500., self.loss_disambiguate2/500.
0.00012456496423692443 0.2603705366756767 0.5089312440752983 0.0 0.27713257414102554 5.499020218849182e-06 0.033489181817276406
self.loss_interpret/500.
0.0
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 False
Es[0], TEs[0], Esp_[0]
tensor([ 0.9989,  0.9853, -0.9959], device='cuda:0') tensor([ 0.0075, -0.0349,  0.3738], device='cuda:0') tensor([ 0.0122, -0.0347,  0.3711], device='cuda:0')
R[0]
tensor([0.0237], device='cuda:0',

  line3 = ax.scatter(all_possib_abs_states[length_block[i][0]:length_block[i][1],0], all_possib_abs_states[length_block[i][0]:length_block[i][1],1] ,all_possib_abs_states[length_block[i][0]:length_block[i][1],2], marker='x', depthshade=True, edgecolors='k', alpha=0.5, s=50)
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 False
Es[0], TEs[0], Esp_[0]
tensor([-0.9965, -0.9981,  0.9860], device='cuda:0') tensor([-1.0011, -0.1935,  1.0006], device='cuda:0') tensor([-0.9919, -0.1789,  0.9922], device='cuda:0')
R[0]
tensor([0.0470], device='cuda:0', grad_fn=<SelectBackward0>)
self.loss_T/500., self.lossR/500., self.loss_gamma/500., self.loss_Q/500., self.loss_disentangle_t/500., self.loss_disambiguate1/500., self.loss_disambiguate2/500.
9.873873816104606e-05 0.28406118108984085 0.5087641510367393 0.0 0.2827427857965231 4.275307059288025e-06 0.03399636653123889
self.loss_interpret/500.
0.0
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 False
Es[0], TEs[0], Esp_[0]
tensor([0.3494, 0.9932, 0.5613], device='cuda:0') tensor([ 0.3142,  1.0042, -0.6946], device='cuda:0') tensor([ 0.3116,  0.9934, -0.6983], device='cuda:0')
R[0]
tensor([0.0840], device='cuda:0', g

  line3 = ax.scatter(all_possib_abs_states[length_block[i][0]:length_block[i][1],0], all_possib_abs_states[length_block[i][0]:length_block[i][1],1] ,all_possib_abs_states[length_block[i][0]:length_block[i][1],2], marker='x', depthshade=True, edgecolors='k', alpha=0.5, s=50)
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 False
Es[0], TEs[0], Esp_[0]
tensor([-0.3115, -0.9917,  0.0171], device='cuda:0') tensor([-0.3194, -0.9971, -0.9921], device='cuda:0') tensor([-0.3225, -0.9958, -0.9875], device='cuda:0')
R[0]
tensor([0.0186], device='cuda:0', grad_fn=<SelectBackward0>)
self.loss_T/500., self.lossR/500., self.loss_gamma/500., self.loss_Q/500., self.loss_disentangle_t/500., self.loss_disambiguate1/500., self.loss_disambiguate2/500.
8.750334957585437e-05 0.3077255234248005 0.5096547036767006 0.0 0.2837944112867117 4.0508061647415165e-06 0.036281157095509116
self.loss_interpret/500.
0.0
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 False
Es[0], TEs[0], Esp_[0]
tensor([ 0.9965,  0.9959, -0.9991], device='cuda:0') tensor([0.9953, 0.9907, 0.0434], device='cuda:0') tensor([0.9965, 0.9939, 0.0295], device='cuda:0')
R[0]
tensor([-0.0277], device='cuda:0', gr

  line3 = ax.scatter(all_possib_abs_states[length_block[i][0]:length_block[i][1],0], all_possib_abs_states[length_block[i][0]:length_block[i][1],1] ,all_possib_abs_states[length_block[i][0]:length_block[i][1],2], marker='x', depthshade=True, edgecolors='k', alpha=0.5, s=50)
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 False
Es[0], TEs[0], Esp_[0]
tensor([-0.9967,  1.0002,  0.2276], device='cuda:0') tensor([-0.9931,  0.9690,  0.9874], device='cuda:0') tensor([-0.9980,  0.9589,  0.9969], device='cuda:0')
R[0]
tensor([0.0086], device='cuda:0', grad_fn=<SelectBackward0>)
self.loss_T/500., self.lossR/500., self.loss_gamma/500., self.loss_Q/500., self.loss_disentangle_t/500., self.loss_disambiguate1/500., self.loss_disambiguate2/500.
6.941329077017144e-05 0.3313350370655535 0.509302232503891 0.0 0.28703993365168573 3.5502463579177854e-06 0.03414482753019547
self.loss_interpret/500.
0.0
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 False
Es[0], TEs[0], Esp_[0]
tensor([ 0.9972, -0.3991,  0.9893], device='cuda:0') tensor([ 0.9924, -0.3974,  0.9939], device='cuda:0') tensor([ 0.9972, -0.3991,  0.9893], device='cuda:0')
R[0]
tensor([0.0686], device='cuda:0'

  line3 = ax.scatter(all_possib_abs_states[length_block[i][0]:length_block[i][1],0], all_possib_abs_states[length_block[i][0]:length_block[i][1],1] ,all_possib_abs_states[length_block[i][0]:length_block[i][1],2], marker='x', depthshade=True, edgecolors='k', alpha=0.5, s=50)
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 False
Es[0], TEs[0], Esp_[0]
tensor([ 0.4131, -0.9679,  0.9994], device='cuda:0') tensor([ 0.4196, -0.9685,  0.9968], device='cuda:0') tensor([ 0.4131, -0.9679,  0.9994], device='cuda:0')
R[0]
tensor([0.0083], device='cuda:0', grad_fn=<SelectBackward0>)
self.loss_T/500., self.lossR/500., self.loss_gamma/500., self.loss_Q/500., self.loss_disentangle_t/500., self.loss_disambiguate1/500., self.loss_disambiguate2/500.
6.520151831500698e-05 0.3548278743402334 0.5094552233815193 0.0 0.28157658007740977 2.8213635087013246e-06 0.036446621153038
self.loss_interpret/500.
0.0
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 False
Es[0], TEs[0], Esp_[0]
tensor([ 0.9981, -0.9975, -0.2581], device='cuda:0') tensor([ 0.3857, -0.9807,  0.0553], device='cuda:0') tensor([ 0.3784, -0.9841,  0.0601], device='cuda:0')
R[0]
tensor([0.0126], device='cuda:0',

  line3 = ax.scatter(all_possib_abs_states[length_block[i][0]:length_block[i][1],0], all_possib_abs_states[length_block[i][0]:length_block[i][1],1] ,all_possib_abs_states[length_block[i][0]:length_block[i][1],2], marker='x', depthshade=True, edgecolors='k', alpha=0.5, s=50)
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 False
Es[0], TEs[0], Esp_[0]
tensor([ 0.3431, -0.9484, -0.9605], device='cuda:0') tensor([ 0.3877, -0.9967,  0.0481], device='cuda:0') tensor([ 0.3909, -0.9849,  0.0612], device='cuda:0')
R[0]
tensor([-0.0642], device='cuda:0', grad_fn=<SelectBackward0>)
self.loss_T/500., self.lossR/500., self.loss_gamma/500., self.loss_Q/500., self.loss_disentangle_t/500., self.loss_disambiguate1/500., self.loss_disambiguate2/500.
5.639434271870414e-05 0.378356670926325 0.50990541690588 0.0 0.28137558509409427 2.6228129863739015e-06 0.0346616215746617
self.loss_interpret/500.
0.0
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 False
Es[0], TEs[0], Esp_[0]
tensor([0.9966, 0.5756, 0.9984], device='cuda:0') tensor([ 0.9970,  0.9973, -0.0414], device='cuda:0') tensor([ 0.9992,  0.9995, -0.0517], device='cuda:0')
R[0]
tensor([0.1115], device='cuda:0', gra

  line3 = ax.scatter(all_possib_abs_states[length_block[i][0]:length_block[i][1],0], all_possib_abs_states[length_block[i][0]:length_block[i][1],1] ,all_possib_abs_states[length_block[i][0]:length_block[i][1],2], marker='x', depthshade=True, edgecolors='k', alpha=0.5, s=50)
  plt.plot(np.log(learning_algo.tracked_disamb1))


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 False
Es[0], TEs[0], Esp_[0]
tensor([-0.3277,  0.9960, -0.3241], device='cuda:0') tensor([ 0.2604,  0.9958, -0.8227], device='cuda:0') tensor([ 0.2545,  0.9973, -0.8232], device='cuda:0')
R[0]
tensor([0.0280], device='cuda:0', grad_fn=<SelectBackward0>)
self.loss_T/500., self.lossR/500., self.loss_gamma/500., self.loss_Q/500., self.loss_disentangle_t/500., self.loss_disambiguate1/500., self.loss_disambiguate2/500.
5.2400474171008684e-05 0.40171826199407223 0.5098661727905274 0.0 0.28385318349301814 2.4770200252532957e-06 0.03391457209968939
self.loss_interpret/500.
0.0
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 False
Es[0], TEs[0], Esp_[0]
tensor([-0.3044, -0.9983,  0.0266], device='cuda:0') tensor([-0.2888, -0.9881,  1.0102], device='cuda:0') tensor([-0.2852, -0.9957,  0.9957], device='cuda:0')
R[0]
tensor([-0.0390], device='cud


KeyboardInterrupt

