In [1]:
import sys
import logging

import numpy as np
import matplotlib.pyplot as plt
from joblib import hash, dump, load
import os

from deer.default_parser import process_args
from deer.agent import NeuralAgent
from deer.learning_algos.CRAR_torch import CRAR
from figure8_env import MyEnv as figure8_env
import deer.experiment.base_controllers as bc

from deer.policies import EpsilonGreedyPolicy, FixedFigure8Policy

In [2]:
# Notebook-level switches shared by the environment, the learning algorithm
# and the controllers created in the cells below.
fname = 'test'                     # tag used in the params dump path and the controllers' output names
nn_yaml = 'network_noconv.yaml'    # network description handed to CRAR
internal_dim = 10                  # given to the environment (intern_dim) and to CRAR (internal_dim)
higher_dim_obs = False             # forwarded to the environment constructor
figure8_give_rewards = True        # forwarded to the environment; also selects the policy type

In [3]:
class Defaults:
    """Default hyper-parameters for the figure-8 CRAR experiment.

    Every setting is a plain class attribute, so it can be read either from
    the class itself or from any instance of it.
    """

    # --- Experiment schedule ---
    steps_per_epoch = 5000
    epochs = 50
    steps_per_test = 1000
    period_btw_summary_perfs = 1

    # --- Temporal processing ---
    nstep = 20
    nstep_decay = 0.8
    encoder_type = 'variational'

    # --- Environment ---
    frame_skip = 2
    show_rewards = False

    # --- DQN agent ---
    learning_rate = 1e-4
    learning_rate_decay = 1.0
    discount = 0.9
    epsilon_start = 1.0
    epsilon_min = 1.0
    epsilon_decay = 1000
    update_frequency = 1
    replay_memory_size = 100_000  # alternative value noted by the author: 50000
    batch_size = 64
    freeze_interval = 1000
    deterministic = False

    # --- Learning algorithm ---
    # Earlier seven-weight settings, kept for reference:
    #   [5E-3, 1E-3, 5E-3, 5E-3, 5E-3, 5E-3, 1.]
    #   [0, 0, 0, 0, 0, 0, 1.]
    loss_weights = [5e-3, 5e-3, 5e-3, 0, 5e-3, 1e-3, 1.0, 1e-6]

In [4]:
# Settings object and the random generator shared by every component below.
parameters = Defaults()
# A fixed seed is used only when a deterministic run is requested; a seed of
# None lets NumPy seed the generator itself, exactly like a bare RandomState().
rng_seed = 123456 if parameters.deterministic else None
rng = np.random.RandomState(rng_seed)

# --- Environment ---
# The n-step settings are handed unchanged to both the environment and the
# learning algorithm, so they are gathered once.
nstep_settings = dict(
    nstep=parameters.nstep,
    nstep_decay=parameters.nstep_decay,
)
env = figure8_env(
    give_rewards=figure8_give_rewards,
    intern_dim=internal_dim,
    higher_dim_obs=higher_dim_obs,
    show_rewards=parameters.show_rewards,
    **nstep_settings,
)

# --- Learning algorithm ---
learning_algo = CRAR(
    env,
    parameters.freeze_interval,
    parameters.batch_size,
    rng,
    high_int_dim=False,
    internal_dim=internal_dim,
    lr=parameters.learning_rate,
    nn_yaml=nn_yaml,
    double_Q=True,
    loss_weights=parameters.loss_weights,
    encoder_type=parameters.encoder_type,
    **nstep_settings,
)

# --- Policies ---
# Without rewards the fixed figure-8 policy is used (epsilon=0.2 for training,
# library default for testing); with rewards an epsilon-greedy policy is used,
# whose fourth positional argument (0.2 / 0.) is presumably epsilon — TODO confirm.
n_actions = env.nActions()
if not figure8_give_rewards:
    grid_size = dict(height=env.HEIGHT, width=env.WIDTH)
    train_policy = FixedFigure8Policy.FixedFigure8Policy(
        learning_algo, n_actions, rng, epsilon=0.2, **grid_size
    )
    test_policy = FixedFigure8Policy.FixedFigure8Policy(
        learning_algo, n_actions, rng, **grid_size
    )
else:
    train_policy = EpsilonGreedyPolicy(
        learning_algo, n_actions, rng, 0.2,
        consider_valid_transitions=False,
    )
    test_policy = EpsilonGreedyPolicy(learning_algo, n_actions, rng, 0.)

# --- Agent ---
agent = NeuralAgent(
    env,
    learning_algo,
    parameters.replay_memory_size,
    1,  # fourth positional argument; presumably the replay start size — TODO confirm
    parameters.batch_size,
    rng,
    train_policy=train_policy,
    test_policy=test_policy,
)

# Warm-up run performed before any controller is attached: 10 epochs of 500
# steps (same argument order as the later agent.run(epochs, steps_per_epoch)).
warmup_epochs, warmup_steps = 10, 500
agent.run(warmup_epochs, warmup_steps)
print("end gathering data")

# --- Bind controllers to the agent ---
# Before every training epoch (periodicity=1), print a summary of the agent's
# epsilon, discount and learning rate together with the training epoch number.
agent.attach(bc.VerboseController(
    evaluate_on='epoch',
    periodicity=1))

# The learning rate may follow a schedule driven by the two settings below.
agent.attach(bc.LearningRateController(
    initial_learning_rate=parameters.learning_rate,
    learning_rate_decay=parameters.learning_rate_decay,
    periodicity=1))

# During training epochs, train the agent after every
# [parameters.update_frequency] actions it takes. The last two arguments also
# request, after each training episode (not after every training step), the
# average Bellman residual and the average V value of that episode.
agent.attach(bc.TrainerController(
    evaluate_on='action',
    periodicity=parameters.update_frequency,
    show_episode_avg_V_value=True,
    show_avg_Bellman_residual=True))

# Among all versions of the network (one per training epoch) we want the one
# with the highest validation score. FindBestController does this together
# with an InterleavedTestEpochController; its validationID must be the same as
# the id argument of that InterleavedTestEpochController. It dumps the
# validation score of every network to disk, as well as the network with the
# best validation score. These dumps can then be used to plot the evolution of
# the scores or to recover the resulting network.
agent.attach(bc.FindBestController(
    validationID=figure8_env.VALIDATION_MODE,
    testID=None,
    unique_fname=fname))

# The controllers above act during training epochs. This one interleaves a
# validation epoch between training epochs, displays the sum of the rewards
# obtained (show_score=True) and asks for a performance summary of the
# environment every [parameters.period_btw_summary_perfs] validation epochs.
# The period used to be hard-coded to 1, which ignored that setting.
agent.attach(bc.InterleavedTestEpochController(
    id=figure8_env.VALIDATION_MODE,
    epoch_length=parameters.steps_per_test,
    periodicity=1,
    show_score=True,
    summarize_every=parameters.period_btw_summary_perfs,
    unique_fname=fname))

end gathering data


In [5]:
# --- Persist the hyper-parameters used for this run ---
# exist_ok=True tolerates an already existing directory while still surfacing
# real failures (e.g. permission errors) that a blanket `except` would hide.
os.makedirs("params", exist_ok=True)

# Defaults keeps every setting as a class attribute, so vars(parameters) on the
# instance is an empty dict. Collect the public, non-callable class-level
# settings first and let attributes set on the instance override them.
hyperparams = {
    name: value
    for name, value in vars(type(parameters)).items()
    if not name.startswith("_") and not callable(value)
}
hyperparams.update(vars(parameters))
dump(hyperparams, "params/" + fname + ".jldump")

#agent.gathering_data=False
# --- Train ---
agent.run(parameters.epochs, parameters.steps_per_epoch)

# --- Show results ---
# Scores are read back from scores/<fname>_scores.jldump (presumably written
# during training by the controller that was given unique_fname=fname).
basename = "scores/" + fname
scores = load(basename + "_scores.jldump")
print(scores)

Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.9788, -1.2043, -1.1370, -1.3863,  0.8605,  1.9189, -1.1895, -1.0534,
        -0.1351,  0.6652]) tensor([-0.7933, -1.1102, -1.4306, -1.6623,  0.9538,  2.0401, -0.9203, -1.2591,
         0.0874,  0.8875]) tensor([-0.9202, -0.5225,  0.6498,  0.9199, -1.9167,  0.5375,  0.9473,  0.4975,
         0.7661, -0.2791])
R[0]
tensor([0.2181], grad_fn=<SelectBackward0>)
LOSSES
T = 1.2190736744403838; R = 0.007600422054587398;                 Gamma = 0.3480693909712136; Q = 0.025976787250488995;
Entropy Neighbor = 1.1845786672424196e-05;                 Entropy Random = 1.0816577540484928e-05;                 Volume = 2.816436332702637; VAE = 47.36184683227539
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.8675,  1.4538, -0.6874, -0.3086, -1.1682,  0.6896,  1.1189, -0.0783,



Testing score per episode (id: 0) is 0.0 (average over 1 episode(s))
== Mean score per episode is 0.0 over 1 episodes ==


  abs_states[i:i+1], torch.as_tensor([action_encoding])
  ax.scatter(
  plt.show()


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 1.4236, -0.2033, -0.4562, -0.7022,  0.0916,  0.3257,  0.7770, -2.1177,
        -0.0353,  0.9046]) tensor([ 1.4265, -0.4433, -0.3746, -0.6656,  0.3548,  0.3689,  0.7101, -2.0564,
        -0.1040,  0.8071]) tensor([ 1.7488, -0.2470, -0.6419, -0.7277,  0.4187,  0.4013,  1.1190, -2.2989,
        -0.1961,  1.0042])
R[0]
tensor([-4.6074e-05], grad_fn=<SelectBackward0>)
LOSSES
T = 0.06847252438962459; R = 0.0009049177863053046;                 Gamma = 0.016333105246245396; Q = 0.0007320378847653046;
Entropy Neighbor = 0.014011786185204983;                 Entropy Random = 0.008479813725687563;                 Volume = 2.2956529932022094; VAE = 47.107276153564456
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 1.3439, -0.5752, -1.0429, -0.0489,  0.2629,  0.3068,  1.2532, 

  ax.scatter(
  plt.show()


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 2.3349, -0.8924, -0.9946, -0.5227,  0.5999,  0.2913,  0.7322, -2.0869,
        -0.4774,  0.6876]) tensor([ 2.4056, -0.7047, -0.9906, -0.5637,  0.5919,  0.1764,  0.7793, -2.0376,
        -0.2415,  0.5821]) tensor([ 2.1987, -0.7631, -1.0270, -0.7809,  0.6533,  0.0892,  0.8639, -1.8870,
        -0.1144,  0.6436])
R[0]
tensor([0.0003], grad_fn=<SelectBackward0>)
LOSSES
T = 0.05364042358100414; R = 0.002068844107898258;                 Gamma = 0.0123453104988439; Q = 0.0015170180659624748;
Entropy Neighbor = 0.021732441695407032;                 Entropy Random = 0.011199204393662513;                 Volume = 2.7656409177780152; VAE = 41.57756015014648
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 2.2990, -0.7966, -0.9446, -0.2989,  0.4256,  0.2478,  0.9587, -1.9114,


  plt.show()


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-1.0050,  0.1290,  0.8846, -0.1857,  0.0636,  0.1142, -0.3227,  0.6407,
         0.5916, -0.6889]) tensor([-1.0725,  0.0012,  0.4073, -0.2177, -0.1438,  0.1506, -0.2959,  0.4257,
         0.4665, -0.0062]) tensor([-1.2996, -0.3786,  1.0037, -0.2428, -0.3470, -0.2446, -0.5171,  0.6881,
         0.5668, -0.0026])
R[0]
tensor([0.0544], grad_fn=<SelectBackward0>)
LOSSES
T = 0.05209324613958597; R = 0.0018414062999654562;                 Gamma = 0.010011152709121235; Q = 0.0014038665456391755;
Entropy Neighbor = 0.02246588676609099;                 Entropy Random = 0.010075798435136675;                 Volume = 2.2430645489692687; VAE = 34.34252880859375
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 1.6427, -0.6311, -1.1426, -0.6079,  0.3352,  0.3743,  0.8523, -1.3637

  ax.scatter(
  plt.show()


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 1.6218, -0.4334, -0.8057, -0.4389,  0.6655,  0.8157,  0.4987, -1.1373,
        -0.6643, -0.1418]) tensor([ 1.6375, -0.4137, -0.9278, -0.1197,  0.3727,  0.5465,  0.5390, -1.1454,
        -0.4995,  0.2456]) tensor([ 1.5553, -0.5352, -0.5488, -0.2900,  0.5329,  0.6024,  0.2656, -1.0613,
        -0.3348,  0.3639])
R[0]
tensor([0.0012], grad_fn=<SelectBackward0>)
LOSSES
T = 0.06057524698227644; R = 0.006168485763802891;                 Gamma = 0.00861816825234564; Q = 0.004483690288790967;
Entropy Neighbor = 0.024428422790020703;                 Entropy Random = 0.00637171443272382;                 Volume = 1.330993589282036; VAE = 25.792032245635987
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 1.4838, -0.4710,  0.0188, -1.0798,  0.0649,  0.0323,  0.5055, -1.1768,
 

  ax.scatter(
  plt.show()


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
3 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.9327, -0.5543, -0.5210, -0.8013,  0.2044, -0.1255,  0.4879, -0.8737,
         0.2501, -0.2630]) tensor([ 0.8294, -0.4900, -0.3220, -0.5482,  0.1582,  0.1249,  0.4607, -0.8232,
         0.2744, -0.0280]) tensor([ 0.3297, -0.0586, -0.2524, -0.8946, -0.1654,  0.4131,  0.4970, -0.5956,
         0.2355, -0.2224])
R[0]
tensor([0.0055], grad_fn=<SelectBackward0>)
LOSSES
T = 0.05576662161201239; R = 0.009202222963387612;                 Gamma = 0.0071913140265533; Q = 0.005266884192358702;
Entropy Neighbor = 0.028166944582015276;                 Entropy Random = 0.004596059827366844;                 Volume = 0.5961286343932152; VAE = 23.297647861480712
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 1.6204e+00, -6.4332e-01, -8.0853e-01, -1.3663e-03,  3.3086e-01,
       

  ax.scatter(
  plt.show()


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 1.0
Es[0], TEs[0], Esp_[0]
tensor([ 1.1335, -0.1494, -0.6911, -0.4957,  0.2605,  0.2681,  1.1427, -1.5386,
         0.1149,  0.0680]) tensor([ 1.1380, -0.5528, -0.7574, -0.7880,  0.2662,  0.3450,  1.1295, -1.5075,
         0.1610, -0.0225]) tensor([ 1.1511, -0.7489, -0.7893, -0.4589,  0.6625,  0.1569,  1.0143, -1.4248,
         0.0048, -0.4520])
R[0]
tensor([-0.0040], grad_fn=<SelectBackward0>)
LOSSES
T = 0.06303413390368223; R = 0.013452999947534409;                 Gamma = 0.006416240448481403; Q = 0.007785355653148145;
Entropy Neighbor = 0.028900488559156657;                 Entropy Random = 0.003968085886212066;                 Volume = 0.38356553491950035; VAE = 26.023782001495363
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.8979, -0.1210, -1.0957, -0.0029,  0.4744,  0.8828,  0.3992, -0.70

  ax.scatter(
  plt.show()


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.6431, -0.5732, -0.9943, -0.6929,  0.1426, -0.0116,  0.9960, -1.0493,
        -0.0505, -0.1440]) tensor([ 0.5656, -0.4009, -0.7366, -0.5923,  0.3180,  0.4997,  0.9684, -1.0056,
         0.0408, -0.0319]) tensor([ 0.6463, -0.7345, -0.8394, -0.4414,  0.0818,  0.8340,  0.9772, -1.1124,
         0.2979, -0.3921])
R[0]
tensor([-0.0145], grad_fn=<SelectBackward0>)
LOSSES
T = 0.07087242698669434; R = 0.01668417024600785;                 Gamma = 0.006826686209096806; Q = 0.008937346798367798;
Entropy Neighbor = 0.029215177085250617;                 Entropy Random = 0.0037984315818175673;                 Volume = 0.3210598703920841; VAE = 28.23548796081543
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.3169, -0.3313,  0.0527, -1.1772,  0.3397, -0.0932,  0.5088, -0.9385

  ax.scatter(
  plt.show()


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
2 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.1988, -0.1979, -0.8650, -0.3323,  0.2832,  0.8897, -0.0675, -0.2724,
        -0.5900,  0.3168]) tensor([ 9.0588e-02, -1.3278e-01, -5.6395e-01,  4.6578e-04,  1.9521e-01,
         7.8648e-01, -1.6027e-01, -1.9417e-01, -5.1537e-01,  1.6347e-01]) tensor([-0.1656, -0.1033,  0.6473, -0.5688, -0.2317, -0.1887, -0.6097,  0.2379,
         0.4154, -0.1783])
R[0]
tensor([-0.0024], grad_fn=<SelectBackward0>)
LOSSES
T = 0.07201548499614001; R = 0.013117261775565566;                 Gamma = 0.006528518078048364; Q = 0.009033582995878532;
Entropy Neighbor = 0.029129158757627012;                 Entropy Random = 0.004083574745571241;                 Volume = 0.2867070360481739; VAE = 28.595588989257813
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.3461, -0.4493,  0.2226, -0

Testing score per episode (id: 0) is 142.0 (average over 1 episode(s))
== Mean score per episode is 141.985801419858 over 1 episodes ==


  ax.scatter(
  plt.show()


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.6412, -0.2374, -0.6108, -0.5812,  1.0468, -0.0178,  0.9238, -1.0989,
         0.1077, -0.1351]) tensor([ 0.5065, -0.3522, -0.6920, -0.6156,  0.5066,  0.3536,  0.9003, -1.0453,
         0.1180, -0.0942]) tensor([ 0.6246, -0.6303, -0.1820, -1.0062,  0.5504,  0.3437,  0.9195, -1.0773,
         0.1464,  0.3611])
R[0]
tensor([0.0245], grad_fn=<SelectBackward0>)
LOSSES
T = 0.07043951278179884; R = 0.012693849506904371;                 Gamma = 0.0057332254391876634; Q = 0.00920137558504939;
Entropy Neighbor = 0.030767518252134324;                 Entropy Random = 0.00447754874243401;                 Volume = 0.204677266061306; VAE = 28.087453285217286
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.9276,  0.0393, -0.7770, -0.1600,  0.2643,  1.3971,  0.5205, -0.3234,


  ax.scatter(
  plt.show()


Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 1.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-1.3791,  0.2224,  1.0298,  0.2643, -0.1397, -0.1387, -0.8451,  0.8435,
         0.6960, -0.0820]) tensor([-0.4569,  0.0057,  0.0988, -0.0783,  0.1059,  0.2838, -0.1656, -0.0409,
         0.4625,  0.0502]) tensor([ 0.2162, -0.2376, -0.9401, -0.7658,  0.3711,  0.4971,  1.0527, -1.0136,
        -0.0428,  0.2527])
R[0]
tensor([0.8819], grad_fn=<SelectBackward0>)
LOSSES
T = 0.06779227401316167; R = 0.015141792837472167;                 Gamma = 0.0058837916517077244; Q = 0.010271765822079032;
Entropy Neighbor = 0.032048573043197394;                 Entropy Random = 0.004483509021927602;                 Volume = 0.2181043034940958; VAE = 28.929636981964112
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
1 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([-0.4459, -0.4440, -0.6618, -0.7296, -0.1697,  0.7260,  0.9813, -0.804


KeyboardInterrupt



In [None]:
# Keep the first entry returned by get_losses() in `x` for inspection in the
# next cell (presumably the history of the first loss term — TODO confirm).
loss_histories = learning_algo.get_losses()
x = loss_histories[0]

In [None]:
# Display the last element of `x`, i.e. of the first entry of learning_algo.get_losses().
x[-1]

## Visualize performance

In [None]:
# Select the network stored under `fname` for a given epoch.
# NOTE(review): the epoch is hard-coded; confirm a dump for epoch 33 exists for this run.
checkpoint_epoch = 33
agent.setNetwork(fname, nEpoch=checkpoint_epoch)

In [None]:
# Put the agent at the start of an episode by hand so that the cells below can
# step it one action at a time.
is_terminal = False
reward = 0

agent._in_episode = True
agent._mode = 0 # Testing mode with plan_depth=0
initState = env.reset(agent._mode)
inputDims = env.inputDimensions()

# For every input whose first dimension is larger than 1, overwrite all slots
# but the first with the corresponding part of the reset state.
for idx, dims in enumerate(inputDims):
    if dims[0] > 1:
        agent._state[idx][1:] = initState[idx][1:]
agent._Vs_on_last_episode = []

### Frame by frame

In [None]:
%matplotlib inline

for i in range(100):
    obs = env.observe()
    _obs = obs[0].reshape((env.WIDTH, env.HEIGHT))
    plt.figure()
    plt.imshow(np.flip(_obs.squeeze()))
    plt.show()
    for i in range(len(obs)):
        agent._state[i][0:-1] = agent._state[i][1:]
        agent._state[i][-1] = obs[i]
    V, action, reward, _ = agent._step()
    print(action)
    agent._Vs_on_last_episode.append(V)
    is_terminal = env.inTerminalState()
    if is_terminal: break

### As animation

In [None]:
%matplotlib notebook

import numpy as np 
import matplotlib.pyplot as plt
from IPython.display import HTML
import matplotlib.animation as animation

fig, ax = plt.subplots(1,1)
obs = env.observe()
_obs = obs[0].reshape((env.WIDTH, env.HEIGHT))
_obs = np.flip(_obs.squeeze())
ax.set_xticks([])
ax.set_yticks([])
im = ax.imshow(np.zeros(_obs.shape))

def init():
    plt.cla()
    im = ax.imshow(_obs)
    return [im]

def animate(i, *args, **kwargs):
    plt.cla()
    obs = env.observe()
    _obs = obs[0].reshape((env.WIDTH, env.HEIGHT))
    _obs = np.flip(_obs.squeeze())
    im = ax.imshow(_obs)
    for i in range(len(obs)):
        agent._state[i][0:-1] = agent._state[i][1:]
        agent._state[i][-1] = obs[i]
        V, action, reward, _ = agent._step()
        agent._Vs_on_last_episode.append(V)
    return [im]

ani = animation.FuncAnimation(fig, animate, init_func=init, 
     frames=100, blit=False, repeat=True)
ani.save('behavior.gif', writer="ffmpeg", fps = 15)
fig.show()