In [1]:
import sys
import logging
import numpy as np
from joblib import hash, dump
import os

from deer.default_parser import process_args
from deer.agent import NeuralAgent
from deer.learning_algos.CRAR_torch import CRAR
from catcher_env import MyEnv as catcher_env
import deer.experiment.base_controllers as bc

from deer.policies import EpsilonGreedyPolicy

In [2]:
class Defaults:
    # ----------------------
    # Experiment Parameters
    # ----------------------
    steps_per_epoch = 2000
    epochs = 50
    steps_per_test = 500
    period_btw_summary_perfs = 1
    
    # ----------------------
    # Environment Parameters
    # ----------------------
    frame_skip = 2

    # ----------------------
    # DQN Agent parameters:
    # ----------------------
    update_rule = 'rmsprop'
    learning_rate = 0.0005
    learning_rate_decay = 0.9
    discount = 0.9
    discount_inc = 1
    discount_max = 0.99
    rms_decay = 0.9
    rms_epsilon = 0.0001
    momentum = 0
    clip_norm = 1.0
    epsilon_start = 1.0
    epsilon_min = 1.0
    epsilon_decay = 10000
    update_frequency = 1
    replay_memory_size = 1000000
    batch_size = 32
    freeze_interval = 1000
    deterministic = False

HIGHER_DIM_OBS = True
HIGH_INT_DIM = False

In [3]:
parameters = Defaults()
if parameters.deterministic:
    rng = np.random.RandomState(123456)
else:
    rng = np.random.RandomState()

In [4]:
# --- Instantiate environment ---
env = catcher_env(rng, higher_dim_obs=HIGHER_DIM_OBS, reverse=False)

# --- Instantiate learning algorithm ---
learning_algo = CRAR(
    env,
    parameters.rms_decay,
    parameters.rms_epsilon,
    parameters.momentum,
    parameters.clip_norm,
    parameters.freeze_interval,
    parameters.batch_size,
    parameters.update_rule,
    rng,
    double_Q=True,
    high_int_dim=HIGH_INT_DIM,
    internal_dim=3)

test_policy = EpsilonGreedyPolicy(learning_algo, env.nActions(), rng, 0.1)#1.)

# --- Instantiate agent ---
agent = NeuralAgent(
    env,
    learning_algo,
    parameters.replay_memory_size,
    max(env.inputDimensions()[i][0] for i in range(len(env.inputDimensions()))),
    parameters.batch_size,
    rng,
    test_policy=test_policy)

# --- Create unique filename for FindBestController ---
h = hash(vars(parameters), hash_name="sha1")
fname = "test_" + h
print("The parameters hash is: {}".format(h))
print("The parameters are: {}".format(parameters))

# --- Bind controllers to the agent ---
# Before every training epoch (periodicity=1), we want to print a summary of the agent's epsilon, discount and 
# learning rate as well as the training epoch number.
agent.attach(bc.VerboseController(
    evaluate_on='epoch', 
    periodicity=1))

# As for the discount factor and the learning rate, one can update periodically the parameter of the epsilon-greedy
# policy implemented by the agent. This controllers has a bit more capabilities, as it allows one to choose more
# precisely when to update epsilon: after every X action, episode or epoch. This parameter can also be reset every
# episode or epoch (or never, hence the resetEvery='none').
agent.attach(bc.EpsilonController(
    initial_e=parameters.epsilon_start, 
    e_decays=parameters.epsilon_decay, 
    e_min=parameters.epsilon_min,
    evaluate_on='action',
    periodicity=1,
    reset_every='none'))

# During training epochs, we want to train the agent after every [parameters.update_frequency] action it takes.
# Plus, we also want to display after each training episode (!= than after every training) the average bellman
# residual and the average of the V values obtained during the last episode, hence the two last arguments.
agent.attach(bc.TrainerController(
    evaluate_on='action', 
    periodicity=parameters.update_frequency, 
    show_episode_avg_V_value=True, 
    show_avg_Bellman_residual=True))

# Every epoch end, one has the possibility to modify the learning rate using a LearningRateController. Here we 
# wish to update the learning rate after every training epoch (periodicity=1), according to the parameters given.
agent.attach(bc.LearningRateController(
    initial_learning_rate=parameters.learning_rate, 
    learning_rate_decay=parameters.learning_rate_decay,
    periodicity=1))

# Same for the discount factor.
agent.attach(bc.DiscountFactorController(
    initial_discount_factor=parameters.discount, 
    discount_factor_growth=parameters.discount_inc, 
    discount_factor_max=parameters.discount_max,
    periodicity=1))

# All previous controllers control the agent during the epochs it goes through. However, we want to interleave a 
# "validation epoch" between each training epoch ("one of two epochs", hence the periodicity=2). We do not want 
# these validation epoch to interfere with the training of the agent, which is well established by the 
# TrainerController, EpsilonController and alike. Therefore, we will disable these controllers for the whole 
# duration of the validation epochs interleaved this way, using the controllersToDisable argument of the 
# InterleavedTestEpochController. For each validation epoch, we want also to display the sum of all rewards 
# obtained, hence the showScore=True. Finally, we want to call the summarizePerformance method of ALE_env every 
# [parameters.period_btw_summary_perfs] *validation* epochs.
agent.attach(bc.InterleavedTestEpochController(
    id=catcher_env.VALIDATION_MODE, 
    epoch_length=parameters.steps_per_test,
    periodicity=1,
    show_score=True,
    summarize_every=1))

The parameters hash is: 62977be8e45d8a56a5537c11dfd5d2fd8dda69e0
The parameters are: <__main__.Defaults object at 0x2ab37d83c070>


In [None]:
# --- Run the experiment ---
try:
    os.mkdir("params")
except Exception:
    pass
dump(vars(parameters), "params/" + fname + ".jldump")
agent.run(parameters.epochs, parameters.steps_per_epoch)

Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 False
Es[0], TEs[0], Esp_[0]
tensor([ 0.0865, -0.0777,  0.0089], device='cuda:0') tensor([ 0.0004, -0.3450,  0.1292], device='cuda:0') tensor([ 0.0884, -0.0781,  0.0093], device='cuda:0')
R[0]
tensor([0.1490], device='cuda:0', grad_fn=<SelectBackward0>)
Average (on the epoch) training loss: 0.000763518393796403
Episode average V value: 0
Average (on the epoch) training loss: 0.02487533270066385
Episode average V value: 0
Average (on the epoch) training loss: 0.05749703552618694
Episode average V value: 0
Average (on the epoch) training loss: 0.06941996287579448
Episode average V value: 0
Average (on the epoch) training loss: 0.07434924849836864
Episode average V value: 0
Average (on the epoch) training loss: 0.07865321528323584
Episode average V value: 0
Average (on the epoch) training loss: 0.07765003541017874
Episode average V value: 0
Average (on the epoch) training loss: 0.079010212

Average (on the epoch) training loss: 0.08794319180843091
Episode average V value: 0
Average (on the epoch) training loss: 0.08782558719282507
Episode average V value: 0
Average (on the epoch) training loss: 0.08751680248458797
Episode average V value: 0
Average (on the epoch) training loss: 0.0872155250870467
Episode average V value: 0
Average (on the epoch) training loss: 0.08721183990180639
Episode average V value: 0
Average (on the epoch) training loss: 0.08719424352635353
Episode average V value: 0
Average (on the epoch) training loss: 0.08704977412970943
Episode average V value: 0
Average (on the epoch) training loss: 0.08693071422289393
Episode average V value: 0
Average (on the epoch) training loss: 0.08676201324246809
Episode average V value: 0
Average (on the epoch) training loss: 0.08652666857049475
Episode average V value: 0
Average (on the epoch) training loss: 0.08668105523332613
Episode average V value: 0
Average (on the epoch) training loss: 0.08672598168549826
Episode 

Average (on the epoch) training loss: 0.08160727437877925
Episode average V value: 0
Average (on the epoch) training loss: 0.08181859940367922
Episode average V value: 0
Average (on the epoch) training loss: 0.08169984021537696
Episode average V value: 0
Average (on the epoch) training loss: 0.08150728582725332
Episode average V value: 0
Average (on the epoch) training loss: 0.08156318493661086
Episode average V value: 0
Average (on the epoch) training loss: 0.08122328250883099
Episode average V value: 0
Average (on the epoch) training loss: 0.0811222694964578
Episode average V value: 0
Average (on the epoch) training loss: 0.0810990898489443
Episode average V value: 0
Average (on the epoch) training loss: 0.08103480638036797
Episode average V value: 0
Average (on the epoch) training loss: 0.0809798313555599
Episode average V value: 0
Average (on the epoch) training loss: 0.08094927277684019
Episode average V value: 0
Average (on the epoch) training loss: 0.08078672311487241
Episode av

  state = torch.as_tensor(state, device=self.device).float()


Testing score per episode (id: 0) is -0.48214285714285715 (average over 56 episode(s))
== Mean score per episode is -0.4821419961750068 over 56 episodes ==


  line3 = ax.scatter(


self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q, self.loss_disentangle_t, self.loss_disambiguate1, self.loss_disambiguate2
0.014664448839612305 0.07540814954973757 0.021767086277715862 0.06855819155042991 0.12265219482779503 0.0009212585985660553 0.057067657474428414
self.loss_interpret/500.
0.0
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 False
Es[0], TEs[0], Esp_[0]
tensor([0.6731, 0.4534, 0.1291], device='cuda:0') tensor([0.6826, 0.1737, 0.3494], device='cuda:0') tensor([0.6673, 0.1227, 0.4043], device='cuda:0')
R[0]
tensor([0.0393], device='cuda:0', grad_fn=<SelectBackward0>)
Average (on the epoch) training loss: 0.03000009970532523
Episode average V value: 0
Average (on the epoch) training loss: 0.03860027306816644
Episode average V value: 0
Average (on the epoch) training loss: 0.04739400551274971
Episode average V value: 0
Average (on the epoch) training loss: 0.05790982748537014
Episode average V value: 0
Average (o

Average (on the epoch) training loss: 0.05916081585577244
Episode average V value: 0
Average (on the epoch) training loss: 0.059272508169881624
Episode average V value: 0
Average (on the epoch) training loss: 0.05930050235040778
Episode average V value: 0
Average (on the epoch) training loss: 0.059020378181977966
Episode average V value: 0
Average (on the epoch) training loss: 0.05894172164144073
Episode average V value: 0
Average (on the epoch) training loss: 0.05876055592671037
Episode average V value: 0
Average (on the epoch) training loss: 0.058676683622413545
Episode average V value: 0
Average (on the epoch) training loss: 0.058690795025868735
Episode average V value: 0
Average (on the epoch) training loss: 0.05873756121982785
Episode average V value: 0
Average (on the epoch) training loss: 0.05903493010175779
Episode average V value: 0
Average (on the epoch) training loss: 0.05911910709376439
Episode average V value: 0
Average (on the epoch) training loss: 0.058954631497458186
Ep

Average (on the epoch) training loss: 0.05445836867386273
Episode average V value: 0
Average (on the epoch) training loss: 0.054359588240727015
Episode average V value: 0
Average (on the epoch) training loss: 0.05430806718471466
Episode average V value: 0
Average (on the epoch) training loss: 0.05430267255918484
Episode average V value: 0
Average (on the epoch) training loss: 0.0543239898087748
Episode average V value: 0
Average (on the epoch) training loss: 0.054243688201998326
Episode average V value: 0
Average (on the epoch) training loss: 0.05414347088447324
Episode average V value: 0
Average (on the epoch) training loss: 0.05407930955527321
Episode average V value: 0
Average (on the epoch) training loss: 0.05409035781066808
Episode average V value: 0
Average (on the epoch) training loss: 0.05405482575028424
Episode average V value: 0
Average (on the epoch) training loss: 0.053921351555912767
Episode average V value: 0
Average (on the epoch) training loss: 0.05383546561107368
Episo

  line3 = ax.scatter(


self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q, self.loss_disentangle_t, self.loss_disambiguate1, self.loss_disambiguate2
0.013639671192504466 0.03965600429079495 0.016675253675086425 0.0426033119619824 0.060673882871866223 0.0009098165184259414 0.03940678233187646
self.loss_interpret/500.
0.0
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 False
Es[0], TEs[0], Esp_[0]
tensor([ 0.6259, -0.7866,  0.0043], device='cuda:0') tensor([ 0.1984, -0.7015, -0.4747], device='cuda:0') tensor([ 0.4118, -0.6038, -0.5052], device='cuda:0')
R[0]
tensor([0.0470], device='cuda:0', grad_fn=<SelectBackward0>)
Average (on the epoch) training loss: 0.050280777996199
Episode average V value: 0
Average (on the epoch) training loss: 0.05481541469796664
Episode average V value: 0
Average (on the epoch) training loss: 0.05668636875365068
Episode average V value: 0
Average (on the epoch) training loss: 0.04705277546438285
Episode average V value: 0
Aver

Average (on the epoch) training loss: 0.03913400185913461
Episode average V value: 0
Average (on the epoch) training loss: 0.03906555884538056
Episode average V value: 0
Average (on the epoch) training loss: 0.038946879938628304
Episode average V value: 0
Average (on the epoch) training loss: 0.03873526626997519
Episode average V value: 0
Average (on the epoch) training loss: 0.03869521574252244
Episode average V value: 0
Average (on the epoch) training loss: 0.03888096583132853
Episode average V value: 0
Average (on the epoch) training loss: 0.038945249940638436
Episode average V value: 0
Average (on the epoch) training loss: 0.038881118541704146
Episode average V value: 0
Average (on the epoch) training loss: 0.038955495713400325
Episode average V value: 0
Average (on the epoch) training loss: 0.03934035686101733
Episode average V value: 0
Average (on the epoch) training loss: 0.039284050275055354
Episode average V value: 0
Average (on the epoch) training loss: 0.03908588190496422
Ep

Average (on the epoch) training loss: 0.03772390475744571
Episode average V value: 0
Average (on the epoch) training loss: 0.03768186468729305
Episode average V value: 0
Average (on the epoch) training loss: 0.037728061937967475
Episode average V value: 0
Average (on the epoch) training loss: 0.037702899820551326
Episode average V value: 0
Average (on the epoch) training loss: 0.03766888203088842
Episode average V value: 0
Average (on the epoch) training loss: 0.0376095630378207
Episode average V value: 0
Average (on the epoch) training loss: 0.037659742401470936
Episode average V value: 0
Average (on the epoch) training loss: 0.03756764929098565
Episode average V value: 0
Average (on the epoch) training loss: 0.03748985281429604
Episode average V value: 0
Average (on the epoch) training loss: 0.037452026389814604
Episode average V value: 0
Average (on the epoch) training loss: 0.037405664153201165
Episode average V value: 0
Average (on the epoch) training loss: 0.03731772288861312
Epi

  line3 = ax.scatter(


self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q, self.loss_disentangle_t, self.loss_disambiguate1, self.loss_disambiguate2
0.009812256320379674 0.03315694906655699 0.004481585326138884 0.03215761830518022 0.060692011266946795 0.0005506115406751633 0.03408988573681563
self.loss_interpret/500.
0.0
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 False
Es[0], TEs[0], Esp_[0]
tensor([0.9759, 0.7075, 0.5506], device='cuda:0') tensor([0.9859, 0.2307, 0.9223], device='cuda:0') tensor([0.9631, 0.1398, 0.9133], device='cuda:0')
R[0]
tensor([-0.0279], device='cuda:0', grad_fn=<SelectBackward0>)
Average (on the epoch) training loss: 0.04504696155587832
Episode average V value: 0
Average (on the epoch) training loss: 0.04537040791991684
Episode average V value: 0
Average (on the epoch) training loss: 0.04330337389061848
Episode average V value: 0
Average (on the epoch) training loss: 0.04154923498734004
Episode average V value: 0
Average (

Average (on the epoch) training loss: 0.03403394371321004
Episode average V value: 0
Average (on the epoch) training loss: 0.03393001152293648
Episode average V value: 0
Average (on the epoch) training loss: 0.03408509846228887
Episode average V value: 0
Average (on the epoch) training loss: 0.03388758831433663
Episode average V value: 0
Average (on the epoch) training loss: 0.033794536725124336
Episode average V value: 0
Average (on the epoch) training loss: 0.03381709308796233
Episode average V value: 0
Average (on the epoch) training loss: 0.033733837075703564
Episode average V value: 0
Average (on the epoch) training loss: 0.03363764771766225
Episode average V value: 0
Average (on the epoch) training loss: 0.033624625748867154
Episode average V value: 0
Average (on the epoch) training loss: 0.03373941126648462
Episode average V value: 0
Average (on the epoch) training loss: 0.03363937594101118
Episode average V value: 0
Average (on the epoch) training loss: 0.03362083419096209
Epis

Average (on the epoch) training loss: 0.03258455831251999
Episode average V value: 0
Average (on the epoch) training loss: 0.03253661531478627
Episode average V value: 0
Average (on the epoch) training loss: 0.032556120057596394
Episode average V value: 0
Average (on the epoch) training loss: 0.032522186702583285
Episode average V value: 0
Average (on the epoch) training loss: 0.032479375265406334
Episode average V value: 0
Average (on the epoch) training loss: 0.03254378924694202
Episode average V value: 0
Average (on the epoch) training loss: 0.032508661968414676
Episode average V value: 0
Average (on the epoch) training loss: 0.03248772134176559
Episode average V value: 0
Average (on the epoch) training loss: 0.03244880105120675
Episode average V value: 0
Average (on the epoch) training loss: 0.03247161580504568
Episode average V value: 0
Average (on the epoch) training loss: 0.03246753715756541
Episode average V value: 0
Average (on the epoch) training loss: 0.03243902527763607
Epi

  line3 = ax.scatter(


self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q, self.loss_disentangle_t, self.loss_disambiguate1, self.loss_disambiguate2
0.008442921370267868 0.029135056812083348 0.0018927928073389922 0.026419407397508622 0.06373485599458217 0.0005078506544232369 0.03225449828384444
self.loss_interpret/500.
0.0
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 False
Es[0], TEs[0], Esp_[0]
tensor([0.3523, 0.9659, 0.2144], device='cuda:0') tensor([0.4757, 0.8748, 0.5468], device='cuda:0') tensor([0.5719, 0.8590, 0.5938], device='cuda:0')
R[0]
tensor([0.0096], device='cuda:0', grad_fn=<SelectBackward0>)
Average (on the epoch) training loss: 0.035454168915748596
Episode average V value: 0
Average (on the epoch) training loss: 0.03369526761687464
Episode average V value: 0
Average (on the epoch) training loss: 0.03541779276673441
Episode average V value: 0
Average (on the epoch) training loss: 0.03424295697671672
Episode average V value: 0
Average

Average (on the epoch) training loss: 0.027862379173442742
Episode average V value: 0
Average (on the epoch) training loss: 0.02785809272493694
Episode average V value: 0
Average (on the epoch) training loss: 0.02784845947707985
Episode average V value: 0
Average (on the epoch) training loss: 0.027750939500183636
Episode average V value: 0
Average (on the epoch) training loss: 0.027689668944338632
Episode average V value: 0
Average (on the epoch) training loss: 0.027657139936704284
Episode average V value: 0
Average (on the epoch) training loss: 0.02756498360305271
Episode average V value: 0
Average (on the epoch) training loss: 0.027458130133936158
Episode average V value: 0
Average (on the epoch) training loss: 0.027398349593026207
Episode average V value: 0
Average (on the epoch) training loss: 0.027474893950733496
Episode average V value: 0
Average (on the epoch) training loss: 0.027434303009384084
Episode average V value: 0
Average (on the epoch) training loss: 0.02740574655333318

Average (on the epoch) training loss: 0.026825325227393053
Episode average V value: 0
Average (on the epoch) training loss: 0.02679967766803402
Episode average V value: 0
Average (on the epoch) training loss: 0.026816708555907693
Episode average V value: 0
Average (on the epoch) training loss: 0.026789858352690644
Episode average V value: 0
Average (on the epoch) training loss: 0.02680117310474817
Episode average V value: 0
Average (on the epoch) training loss: 0.02678613616039949
Episode average V value: 0
Average (on the epoch) training loss: 0.026807294712172365
Episode average V value: 0
Average (on the epoch) training loss: 0.026754200668148105
Episode average V value: 0
Average (on the epoch) training loss: 0.02670288662046354
Episode average V value: 0
Average (on the epoch) training loss: 0.026664247716164914
Episode average V value: 0
Average (on the epoch) training loss: 0.02665580413827484
Episode average V value: 0
Average (on the epoch) training loss: 0.026624486230725386


  line3 = ax.scatter(


self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q, self.loss_disentangle_t, self.loss_disambiguate1, self.loss_disambiguate2
0.007180104481521994 0.026150694488547743 0.0013704451551457169 0.023170081438031048 0.06239825854450464 0.0003830820247530937 0.029886109153274446
self.loss_interpret/500.
0.0
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 False
Es[0], TEs[0], Esp_[0]
tensor([ 0.6870, -0.6768,  1.0017], device='cuda:0') tensor([ 0.4462, -0.9143,  0.6811], device='cuda:0') tensor([ 0.3946, -0.9893,  0.7725], device='cuda:0')
R[0]
tensor([-0.0047], device='cuda:0', grad_fn=<SelectBackward0>)
Average (on the epoch) training loss: 0.030019328515562747
Episode average V value: 0
Average (on the epoch) training loss: 0.03578173808960451
Episode average V value: 0
Average (on the epoch) training loss: 0.03517365728125528
Episode average V value: 0
Average (on the epoch) training loss: 0.032894519338798195
Episode average V valu

Average (on the epoch) training loss: 0.025028154756400082
Episode average V value: 0
Average (on the epoch) training loss: 0.024937680013300406
Episode average V value: 0
Average (on the epoch) training loss: 0.024914953802876613
Episode average V value: 0
Average (on the epoch) training loss: 0.024888883903141323
Episode average V value: 0
Average (on the epoch) training loss: 0.02479802919040276
Episode average V value: 0
Average (on the epoch) training loss: 0.02480675583009166
Episode average V value: 0
Average (on the epoch) training loss: 0.02479164982005284
Episode average V value: 0
Average (on the epoch) training loss: 0.024756947346606557
Episode average V value: 0
Average (on the epoch) training loss: 0.024748652185835354
Episode average V value: 0
Average (on the epoch) training loss: 0.024657806456381465
Episode average V value: 0
Average (on the epoch) training loss: 0.02461627089188144
Episode average V value: 0
Average (on the epoch) training loss: 0.024592845075946505

Average (on the epoch) training loss: 0.024088159492565812
Episode average V value: 0
Average (on the epoch) training loss: 0.02408323307487352
Episode average V value: 0
Average (on the epoch) training loss: 0.02406875436556422
Episode average V value: 0
Average (on the epoch) training loss: 0.024027226147374417
Episode average V value: 0
Average (on the epoch) training loss: 0.02400011959493531
Episode average V value: 0
Average (on the epoch) training loss: 0.023970398913282967
Episode average V value: 0
Average (on the epoch) training loss: 0.02398872965281308
Episode average V value: 0
Average (on the epoch) training loss: 0.02395862395402103
Episode average V value: 0
Average (on the epoch) training loss: 0.023934350158658226
Episode average V value: 0
Average (on the epoch) training loss: 0.023929134863811707
Episode average V value: 0
Average (on the epoch) training loss: 0.02390199888060997
Episode average V value: 0
Average (on the epoch) training loss: 0.023877497752180322
E

  line3 = ax.scatter(


self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q, self.loss_disentangle_t, self.loss_disambiguate1, self.loss_disambiguate2
0.0070946449497714635 0.023408590855891816 0.0008849748928805638 0.019494010729249566 0.07447697644680738 0.00035528598725795746 0.029584723685868085
self.loss_interpret/500.
0.0
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 False
Es[0], TEs[0], Esp_[0]
tensor([-0.0233, -0.9932, -0.5540], device='cuda:0') tensor([-0.4967, -0.8642, -0.9466], device='cuda:0') tensor([-0.9635, -0.8972, -0.7977], device='cuda:0')
R[0]
tensor([0.0401], device='cuda:0', grad_fn=<SelectBackward0>)
Average (on the epoch) training loss: 0.026299763470888138
Episode average V value: 0
Average (on the epoch) training loss: 0.026114014494750235
Episode average V value: 0
Average (on the epoch) training loss: 0.024156901364525158
Episode average V value: 0
Average (on the epoch) training loss: 0.023806660255003307
Episode average V v

Average (on the epoch) training loss: 0.02106421216050664
Episode average V value: 0
Average (on the epoch) training loss: 0.02102716417894477
Episode average V value: 0
Average (on the epoch) training loss: 0.020951556407061277
Episode average V value: 0
Average (on the epoch) training loss: 0.020910470564163817
Episode average V value: 0
Average (on the epoch) training loss: 0.02091066017782608
Episode average V value: 0
Average (on the epoch) training loss: 0.020815574114253235
Episode average V value: 0
Average (on the epoch) training loss: 0.02074108625612176
Episode average V value: 0
Average (on the epoch) training loss: 0.02068535660703977
Episode average V value: 0
Average (on the epoch) training loss: 0.020607004525487894
Episode average V value: 0
Average (on the epoch) training loss: 0.020588494069055444
Episode average V value: 0
Average (on the epoch) training loss: 0.020506916550992852
Episode average V value: 0
Average (on the epoch) training loss: 0.020427868822611533


Average (on the epoch) training loss: 0.01953577505555201
Episode average V value: 0
Average (on the epoch) training loss: 0.019533649459848625
Episode average V value: 0
Average (on the epoch) training loss: 0.019540865673665322
Episode average V value: 0
Average (on the epoch) training loss: 0.019526639907264415
Episode average V value: 0
Average (on the epoch) training loss: 0.01949819374783807
Episode average V value: 0
Average (on the epoch) training loss: 0.019510378900798146
Episode average V value: 0
Average (on the epoch) training loss: 0.0194771904676753
Episode average V value: 0
Average (on the epoch) training loss: 0.01945077803518091
Episode average V value: 0
Average (on the epoch) training loss: 0.019447325052096124
Episode average V value: 0
Average (on the epoch) training loss: 0.01942811970676752
Episode average V value: 0
Average (on the epoch) training loss: 0.01939644237640738
Episode average V value: 0
Average (on the epoch) training loss: 0.01936590169631014
Epi

  line3 = ax.scatter(


self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q, self.loss_disentangle_t, self.loss_disambiguate1, self.loss_disambiguate2
0.0077437650859355925 0.0187002360294573 0.0006345823005613056 0.01569422990549356 0.08197131928056478 0.00031392786651849745 0.02984170668758452
self.loss_interpret/500.
0.0
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 False
Es[0], TEs[0], Esp_[0]
tensor([0.9640, 0.7504, 0.2859], device='cuda:0') tensor([1.0115, 0.1631, 0.7814], device='cuda:0') tensor([1.0033, 0.1486, 0.9379], device='cuda:0')
R[0]
tensor([-0.0022], device='cuda:0', grad_fn=<SelectBackward0>)
Average (on the epoch) training loss: 0.022748815827071667
Episode average V value: 0
Average (on the epoch) training loss: 0.021852461569425132
Episode average V value: 0
Average (on the epoch) training loss: 0.0210437072371995
Episode average V value: 0
Average (on the epoch) training loss: 0.021452886207650106
Episode average V value: 0
Averag

Average (on the epoch) training loss: 0.01702671711141279
Episode average V value: 0
Average (on the epoch) training loss: 0.01698377058022173
Episode average V value: 0
Average (on the epoch) training loss: 0.0169235855832489
Episode average V value: 0
Average (on the epoch) training loss: 0.016951880324631928
Episode average V value: 0
Average (on the epoch) training loss: 0.016975509187505036
Episode average V value: 0
Average (on the epoch) training loss: 0.016953179125684774
Episode average V value: 0
Average (on the epoch) training loss: 0.01694261814253593
Episode average V value: 0
Average (on the epoch) training loss: 0.016927601042297784
Episode average V value: 0
Average (on the epoch) training loss: 0.016899018585796892
Episode average V value: 0
Average (on the epoch) training loss: 0.016894254445124494
Episode average V value: 0
Average (on the epoch) training loss: 0.01689143796791075
Episode average V value: 0
Average (on the epoch) training loss: 0.01693193258588584
Ep

Average (on the epoch) training loss: 0.016371039655207406
Episode average V value: 0
Average (on the epoch) training loss: 0.016360782592616765
Episode average V value: 0
Average (on the epoch) training loss: 0.016366586434271406
Episode average V value: 0
Average (on the epoch) training loss: 0.01635544325568174
Episode average V value: 0
Average (on the epoch) training loss: 0.016356043873152645
Episode average V value: 0
Average (on the epoch) training loss: 0.016339009780527672
Episode average V value: 0
Average (on the epoch) training loss: 0.016350854369889027
Episode average V value: 0
Average (on the epoch) training loss: 0.01636765683365483
Episode average V value: 0
Average (on the epoch) training loss: 0.01635720850893936
Episode average V value: 0
Average (on the epoch) training loss: 0.0163502841635402
Episode average V value: 0
Average (on the epoch) training loss: 0.016330266758839085
Episode average V value: 0
Average (on the epoch) training loss: 0.016316779696369285


  line3 = ax.scatter(


self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q, self.loss_disentangle_t, self.loss_disambiguate1, self.loss_disambiguate2
0.008364638259634375 0.01757892073673429 0.0006407192432307056 0.01458175478503108 0.0783116465806961 0.0002483967915177345 0.030128819921053946
self.loss_interpret/500.
0.0
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 False
Es[0], TEs[0], Esp_[0]
tensor([-0.5854, -0.3909, -0.3691], device='cuda:0') tensor([-1.0415, -0.2474, -0.7429], device='cuda:0') tensor([-0.9948, -0.2942, -0.7434], device='cuda:0')
R[0]
tensor([0.0640], device='cuda:0', grad_fn=<SelectBackward0>)
Average (on the epoch) training loss: 0.018248663387364812
Episode average V value: 0
Average (on the epoch) training loss: 0.018095042401303846
Episode average V value: 0
Average (on the epoch) training loss: 0.017186021915188542
Episode average V value: 0
Average (on the epoch) training loss: 0.016455966798174713
Episode average V value:

Average (on the epoch) training loss: 0.014261877112816197
Episode average V value: 0
Average (on the epoch) training loss: 0.014301674665018384
Episode average V value: 0
Average (on the epoch) training loss: 0.01428194747342929
Episode average V value: 0
Average (on the epoch) training loss: 0.01424434178392102
Episode average V value: 0
Average (on the epoch) training loss: 0.014204475604946872
Episode average V value: 0
Average (on the epoch) training loss: 0.014200606807005696
Episode average V value: 0
Average (on the epoch) training loss: 0.01415571361255789
Episode average V value: 0
Average (on the epoch) training loss: 0.01413854745440875
Episode average V value: 0
Average (on the epoch) training loss: 0.014152219216822578
Episode average V value: 0
Average (on the epoch) training loss: 0.014158182095000655
Episode average V value: 0
Average (on the epoch) training loss: 0.014117773673736369
Episode average V value: 0
Average (on the epoch) training loss: 0.014111478267244149

Average (on the epoch) training loss: 0.013909151928379847
Episode average V value: 0
Average (on the epoch) training loss: 0.013896347452928653
Episode average V value: 0
Average (on the epoch) training loss: 0.013878964333113258
Episode average V value: 0
Average (on the epoch) training loss: 0.013876814678549777
Episode average V value: 0
Average (on the epoch) training loss: 0.013867029822004348
Episode average V value: 0
Average (on the epoch) training loss: 0.013869917861352853
Episode average V value: 0
Average (on the epoch) training loss: 0.013876825744865432
Episode average V value: 0
Average (on the epoch) training loss: 0.01387557264698285
Episode average V value: 0
Average (on the epoch) training loss: 0.01387435534126431
Episode average V value: 0
Average (on the epoch) training loss: 0.01387626971574167
Episode average V value: 0
Average (on the epoch) training loss: 0.013854975361736275
Episode average V value: 0
Average (on the epoch) training loss: 0.01385075415364362

  line3 = ax.scatter(


self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q, self.loss_disentangle_t, self.loss_disambiguate1, self.loss_disambiguate2
0.008248401707038283 0.015780600495170802 0.0002202388402147335 0.012017815568484367 0.08120305834710598 0.000252813458442688 0.029331102174706757
self.loss_interpret/500.
0.0
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 False
Es[0], TEs[0], Esp_[0]
tensor([ 0.6679, -0.3737,  0.9957], device='cuda:0') tensor([ 0.5569, -0.7037,  0.7676], device='cuda:0') tensor([ 0.6349, -0.7885,  0.8719], device='cuda:0')
R[0]
tensor([0.0139], device='cuda:0', grad_fn=<SelectBackward0>)
Average (on the epoch) training loss: 0.013866027092768086
Episode average V value: 0
Average (on the epoch) training loss: 0.015104457425574461
Episode average V value: 0
Average (on the epoch) training loss: 0.013816784049763723
Episode average V value: 0
Average (on the epoch) training loss: 0.013519811353439258
Episode average V valu

Average (on the epoch) training loss: 0.012808048016412889
Episode average V value: 0
Average (on the epoch) training loss: 0.012802449655879249
Episode average V value: 0
Average (on the epoch) training loss: 0.012776617225856076
Episode average V value: 0
Average (on the epoch) training loss: 0.012778962478318398
Episode average V value: 0
Average (on the epoch) training loss: 0.01275277008733083
Episode average V value: 0
Average (on the epoch) training loss: 0.012752656545313639
Episode average V value: 0
Average (on the epoch) training loss: 0.012728950309017197
Episode average V value: 0
Average (on the epoch) training loss: 0.012706813380340643
Episode average V value: 0
Average (on the epoch) training loss: 0.012671572444232702
Episode average V value: 0
Average (on the epoch) training loss: 0.012684614056494122
Episode average V value: 0
Average (on the epoch) training loss: 0.012675364969939153
Episode average V value: 0
Average (on the epoch) training loss: 0.012667240793750

Average (on the epoch) training loss: 0.012076788011380278
Episode average V value: 0
Average (on the epoch) training loss: 0.012068249944065999
Episode average V value: 0
Average (on the epoch) training loss: 0.012057650607274054
Episode average V value: 0
Average (on the epoch) training loss: 0.012042472229167926
Episode average V value: 0
Average (on the epoch) training loss: 0.012029446745399646
Episode average V value: 0
Average (on the epoch) training loss: 0.012023173422446772
Episode average V value: 0
Average (on the epoch) training loss: 0.012003407555502646
Episode average V value: 0
Average (on the epoch) training loss: 0.011992175217185702
Episode average V value: 0
Average (on the epoch) training loss: 0.011967862054301106
Episode average V value: 0
Average (on the epoch) training loss: 0.011968540250483318
Episode average V value: 0
Average (on the epoch) training loss: 0.011963066826057103
Episode average V value: 0
Average (on the epoch) training loss: 0.01195223116871

  line3 = ax.scatter(


self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q, self.loss_disentangle_t, self.loss_disambiguate1, self.loss_disambiguate2
0.008572539580054581 0.014739485817262903 0.00035109163911965877 0.009429327715188264 0.08211362595856189 0.0002307976558804512 0.02864641736401245
self.loss_interpret/500.
0.0
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 False
Es[0], TEs[0], Esp_[0]
tensor([ 0.9922, -0.3994, -0.0217], device='cuda:0') tensor([ 0.8836, -0.6903, -0.0410], device='cuda:0') tensor([ 0.9975, -0.7317, -0.0675], device='cuda:0')
R[0]
tensor([0.0060], device='cuda:0', grad_fn=<SelectBackward0>)
Average (on the epoch) training loss: 0.00966358402123054
Episode average V value: 0
Average (on the epoch) training loss: 0.010847950975100199
Episode average V value: 0
Average (on the epoch) training loss: 0.010625135774413744
Episode average V value: 0
Average (on the epoch) training loss: 0.010368165767027272
Episode average V valu

Average (on the epoch) training loss: 0.009798580863845935
Episode average V value: 0
Average (on the epoch) training loss: 0.009799921584855174
Episode average V value: 0
Average (on the epoch) training loss: 0.009800906948367853
Episode average V value: 0
Average (on the epoch) training loss: 0.009798107895270972
Episode average V value: 0
Average (on the epoch) training loss: 0.009786146188911294
Episode average V value: 0
Average (on the epoch) training loss: 0.009754850615563831
Episode average V value: 0
Average (on the epoch) training loss: 0.00973210149803914
Episode average V value: 0
Average (on the epoch) training loss: 0.009712139774619797
Episode average V value: 0
Average (on the epoch) training loss: 0.009705724229328834
Episode average V value: 0
Average (on the epoch) training loss: 0.009712084215629288
Episode average V value: 0
Average (on the epoch) training loss: 0.009716248325558475
Episode average V value: 0
Average (on the epoch) training loss: 0.009717311927537

Average (on the epoch) training loss: 0.00952716980966562
Episode average V value: 0
Average (on the epoch) training loss: 0.009535600469992016
Episode average V value: 0
Average (on the epoch) training loss: 0.009539901961730731
Episode average V value: 0
Average (on the epoch) training loss: 0.00952317516238969
Episode average V value: 0
Average (on the epoch) training loss: 0.00952769746553894
Episode average V value: 0
Average (on the epoch) training loss: 0.009524546211612918
Episode average V value: 0
Average (on the epoch) training loss: 0.009525435788159665
Episode average V value: 0
Average (on the epoch) training loss: 0.009531961486689628
Episode average V value: 0
Average (on the epoch) training loss: 0.009532316342981607
Episode average V value: 0
Average (on the epoch) training loss: 0.00952764088755657
Episode average V value: 0
Average (on the epoch) training loss: 0.009520076142307767
Episode average V value: 0
Average (on the epoch) training loss: 0.009508319233482315

  line3 = ax.scatter(


self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q, self.loss_disentangle_t, self.loss_disambiguate1, self.loss_disambiguate2
0.008314498092513532 0.014325611768988892 0.00028118138975696637 0.008710652144160122 0.07783148299157619 0.00024311235547065736 0.027105042150709777
self.loss_interpret/500.
0.0
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 False
Es[0], TEs[0], Esp_[0]
tensor([ 0.3349, -0.8407,  0.7245], device='cuda:0') tensor([-0.0633, -0.7672,  0.3787], device='cuda:0') tensor([-0.0028, -0.7405,  0.3777], device='cuda:0')
R[0]
tensor([0.0224], device='cuda:0', grad_fn=<SelectBackward0>)
Average (on the epoch) training loss: 0.00703804269950423
Episode average V value: 0
Average (on the epoch) training loss: 0.008088624143662551
Episode average V value: 0
Average (on the epoch) training loss: 0.008300075609305943
Episode average V value: 0
Average (on the epoch) training loss: 0.008388775070973983
Episode average V va

Average (on the epoch) training loss: 0.008068472795656005
Episode average V value: 0
Average (on the epoch) training loss: 0.008056880765200811
Episode average V value: 0
Average (on the epoch) training loss: 0.008061753550090565
Episode average V value: 0
Average (on the epoch) training loss: 0.008042673714010528
Episode average V value: 0
Average (on the epoch) training loss: 0.008032383040518933
Episode average V value: 0
Average (on the epoch) training loss: 0.007998972493452465
Episode average V value: 0
Average (on the epoch) training loss: 0.007989565485025047
Episode average V value: 0
Average (on the epoch) training loss: 0.007993483559917565
Episode average V value: 0
Average (on the epoch) training loss: 0.008001190134976409
Episode average V value: 0
Average (on the epoch) training loss: 0.008013539155082846
Episode average V value: 0
Average (on the epoch) training loss: 0.008013827897101113
Episode average V value: 0
Average (on the epoch) training loss: 0.00800833866874

Average (on the epoch) training loss: 0.007802259343431119
Episode average V value: 0
Average (on the epoch) training loss: 0.007797847438253399
Episode average V value: 0
Average (on the epoch) training loss: 0.0077973646610926565
Episode average V value: 0
Average (on the epoch) training loss: 0.007785509124124705
Episode average V value: 0
Average (on the epoch) training loss: 0.007783095784622511
Episode average V value: 0
Average (on the epoch) training loss: 0.0077849827624867
Episode average V value: 0
Average (on the epoch) training loss: 0.0077733656777768355
Episode average V value: 0
Average (on the epoch) training loss: 0.007774990378834662
Episode average V value: 0
Average (on the epoch) training loss: 0.007778499858257492
Episode average V value: 0
Average (on the epoch) training loss: 0.007772168800835799
Episode average V value: 0
Average (on the epoch) training loss: 0.007770554508465097
Episode average V value: 0
Average (on the epoch) training loss: 0.00777177319559

  line3 = ax.scatter(


self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q, self.loss_disentangle_t, self.loss_disambiguate1, self.loss_disambiguate2
0.007482053097803146 0.011593297145329416 0.0004262041605561535 0.007296398856211453 0.07371836266666651 0.00019142400473356246 0.029836754652205855
self.loss_interpret/500.
0.0
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 False
Es[0], TEs[0], Esp_[0]
tensor([0.1740, 0.9930, 0.3074], device='cuda:0') tensor([0.4062, 0.5462, 0.7860], device='cuda:0') tensor([0.4647, 0.5779, 0.8670], device='cuda:0')
R[0]
tensor([-0.0628], device='cuda:0', grad_fn=<SelectBackward0>)
Average (on the epoch) training loss: 0.008147626287407346
Episode average V value: 0
Average (on the epoch) training loss: 0.008532085119643144
Episode average V value: 0
Average (on the epoch) training loss: 0.00857136603789749
Episode average V value: 0
Average (on the epoch) training loss: 0.008266772971385054
Episode average V value: 0
Av

Average (on the epoch) training loss: 0.007477615142639711
Episode average V value: 0
Average (on the epoch) training loss: 0.007483776778343631
Episode average V value: 0
Average (on the epoch) training loss: 0.007469616839387909
Episode average V value: 0
Average (on the epoch) training loss: 0.007481431663984916
Episode average V value: 0
Average (on the epoch) training loss: 0.0074793888204213645
Episode average V value: 0
Average (on the epoch) training loss: 0.007493566516822912
Episode average V value: 0
Average (on the epoch) training loss: 0.007498434846768796
Episode average V value: 0
Average (on the epoch) training loss: 0.007509401375886448
Episode average V value: 0
Average (on the epoch) training loss: 0.007507657772627719
Episode average V value: 0
Average (on the epoch) training loss: 0.007511015226892023
Episode average V value: 0
Average (on the epoch) training loss: 0.007495457218659721
Episode average V value: 0
Average (on the epoch) training loss: 0.0074690412729

Average (on the epoch) training loss: 0.007339057064012787
Episode average V value: 0
Average (on the epoch) training loss: 0.007338711155797701
Episode average V value: 0
Average (on the epoch) training loss: 0.007330146098959874
Episode average V value: 0
Average (on the epoch) training loss: 0.007332021654647055
Episode average V value: 0
Average (on the epoch) training loss: 0.007327996573690084
Episode average V value: 0
Average (on the epoch) training loss: 0.007328774944526803
Episode average V value: 0
Average (on the epoch) training loss: 0.007323895799921466
Episode average V value: 0
Average (on the epoch) training loss: 0.007322860834085279
Episode average V value: 0
Average (on the epoch) training loss: 0.007322393646146255
Episode average V value: 0
Average (on the epoch) training loss: 0.007323909456194457
Episode average V value: 0
Average (on the epoch) training loss: 0.007325779184752542
Episode average V value: 0
Average (on the epoch) training loss: 0.00732696397099

  line3 = ax.scatter(


self.loss_T, self.loss_R, self.loss_gamma, self.loss_Q, self.loss_disentangle_t, self.loss_disambiguate1, self.loss_disambiguate2
0.00735939879482612 0.010951095759868622 0.0002716373152370579 0.00676704186713323 0.07121684244275094 0.00016200490295886994 0.026232944869436323
self.loss_interpret/500.
0.0
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 False
Es[0], TEs[0], Esp_[0]
tensor([0.9942, 0.5449, 0.4699], device='cuda:0') tensor([ 0.9872, -0.1004,  0.8990], device='cuda:0') tensor([ 0.9395, -0.1557,  0.9520], device='cuda:0')
R[0]
tensor([0.0110], device='cuda:0', grad_fn=<SelectBackward0>)
Average (on the epoch) training loss: 0.006719429563317034
Episode average V value: 0
Average (on the epoch) training loss: 0.007603107304829691
Episode average V value: 0
Average (on the epoch) training loss: 0.007824183182997836
Episode average V value: 0
Average (on the epoch) training loss: 0.007502015848230157
Episode average V value: 

Average (on the epoch) training loss: 0.006888516711336861
Episode average V value: 0
Average (on the epoch) training loss: 0.006887583093917083
Episode average V value: 0
Average (on the epoch) training loss: 0.00689798329739895
Episode average V value: 0
Average (on the epoch) training loss: 0.006885326816084445
Episode average V value: 0
Average (on the epoch) training loss: 0.006871936892408853
Episode average V value: 0
Average (on the epoch) training loss: 0.006860777366610403
Episode average V value: 0
Average (on the epoch) training loss: 0.006862795694955978
Episode average V value: 0
Average (on the epoch) training loss: 0.006851610809040134
Episode average V value: 0
Average (on the epoch) training loss: 0.006841284802216065
Episode average V value: 0
Average (on the epoch) training loss: 0.006847563796435242
Episode average V value: 0
Average (on the epoch) training loss: 0.006851225323242764
Episode average V value: 0
Average (on the epoch) training loss: 0.006862469538843

Average (on the epoch) training loss: 0.00694702539476566
Episode average V value: 0
Average (on the epoch) training loss: 0.006944371112741721
Episode average V value: 0
Average (on the epoch) training loss: 0.00694883721621914
Episode average V value: 0
Average (on the epoch) training loss: 0.006947967777382086
Episode average V value: 0
Average (on the epoch) training loss: 0.006938088055367964
Episode average V value: 0
Average (on the epoch) training loss: 0.006940543449106378
Episode average V value: 0
Average (on the epoch) training loss: 0.006944574377027318
Episode average V value: 0
Average (on the epoch) training loss: 0.00694051811102009
Episode average V value: 0
Average (on the epoch) training loss: 0.0069380195883709484
Episode average V value: 0
Average (on the epoch) training loss: 0.006933277502962331
Episode average V value: 0
