In [1]:
import sys
import logging

import numpy as np
import matplotlib.pyplot as plt
from joblib import hash, dump, load
import os

from deer.default_parser import process_args
from deer.agent import NeuralAgent
from deer.learning_algos.CRAR_torch import CRAR
from figure8_env import MyEnv as figure8_env
import deer.experiment.base_controllers as bc

from deer.policies import EpsilonGreedyPolicy, FixedFigure8Policy

In [2]:
figure8_give_rewards = True
nn_yaml = 'network_rnn.yaml'
#nn_yaml = 'network_noconv.yaml'
higher_dim_obs = False
internal_dim = 3
nstep = 15

In [3]:
class Defaults:
    # ----------------------
    # Experiment Parameters
    # ----------------------
    steps_per_epoch = 5000
    epochs = 50
    steps_per_test = 1000
    period_btw_summary_perfs = 1
    
    # ----------------------
    # Environment Parameters
    # ----------------------
    frame_skip = 2
    show_rewards = False

    # ----------------------
    # DQN Agent parameters:
    # ----------------------
    update_rule = 'rmsprop'
    learning_rate = 5 * 1E-4 # 1E-4
    learning_rate_decay = 0.9
    discount = 0.9
    discount_inc = 1
    discount_max = 0.99
    rms_decay = 0.9
    rms_epsilon = 0.0001
    momentum = 0
    clip_norm = 1.0
    epsilon_start = 1.0
    epsilon_min = 1.0
    epsilon_decay = 10000
    update_frequency = 1
    replay_memory_size = 1000000 #replacing with 200000 will works just fine (in case you dont have 18gb of memory)
    batch_size = 32
    freeze_interval = 1000
    deterministic = False
    
    # ----------------------
    # Learning algo parameters
    # ----------------------
    loss_weights = [1E-2, 1E-3, 1E-3, 1E-3, 1E-3, 1E-3, 1.]

In [4]:
parameters = Defaults()
if parameters.deterministic:
    rng = np.random.RandomState(123456)
else:
    rng = np.random.RandomState()

# --- Instantiate environment ---
env = figure8_env(
    give_rewards=figure8_give_rewards,
    intern_dim=internal_dim,
    higher_dim_obs=higher_dim_obs,
    show_rewards=parameters.show_rewards
    )

# --- Instantiate learning_algo ---
learning_algo = CRAR(
    env,
    parameters.rms_decay,
    parameters.rms_epsilon,
    parameters.momentum,
    parameters.clip_norm,
    parameters.freeze_interval,
    parameters.batch_size,
    parameters.update_rule,
    rng,
    high_int_dim=False,
    internal_dim=internal_dim, lr=parameters.learning_rate,
    nn_yaml=nn_yaml, double_Q=True,
    loss_weights=parameters.loss_weights,
    nstep=nstep
    )

if figure8_give_rewards:
    train_policy = EpsilonGreedyPolicy(
        learning_algo, env.nActions(), rng, 0.2,
        consider_valid_transitions=False
        )
    test_policy = EpsilonGreedyPolicy(
        learning_algo, env.nActions(), rng, 0.
        )
else:
    train_policy = FixedFigure8Policy.FixedFigure8Policy(
        learning_algo, env.nActions(), rng, epsilon=0.2,
        height=env.HEIGHT, width=env.WIDTH
        )
    test_policy = FixedFigure8Policy.FixedFigure8Policy(
        learning_algo, env.nActions(), rng,
        height=env.HEIGHT, width=env.WIDTH
        )

# --- Instantiate agent ---
agent = NeuralAgent(
    env,
    learning_algo,
    parameters.replay_memory_size,
    1,
    parameters.batch_size,
    rng,
    train_policy=train_policy,
    test_policy=test_policy)

# --- Create unique filename for FindBestController ---
h = hash(vars(parameters), hash_name="sha1")
fname = "test_" + h
print("The parameters hash is: {}".format(h))
print("The parameters are: {}".format(parameters))

# As for the discount factor and the learning rate, one can update periodically the parameter of the epsilon-greedy
# policy implemented by the agent. This controllers has a bit more capabilities, as it allows one to choose more
# precisely when to update epsilon: after every X action, episode or epoch. This parameter can also be reset every
# episode or epoch (or never, hence the resetEvery='none').
agent.attach(bc.EpsilonController(
    initial_e=parameters.epsilon_start,
    e_decays=parameters.epsilon_decay,
    e_min=parameters.epsilon_min,
    evaluate_on='episode',
    periodicity=1,
    reset_every='none'))

agent.run(10, 500)
print("end gathering data")

# --- Bind controllers to the agent ---
# Before every training epoch (periodicity=1), we want to print a summary of the agent's epsilon, discount and 
# learning rate as well as the training epoch number.
agent.attach(bc.VerboseController(
    evaluate_on='epoch', 
    periodicity=1))

# Every epoch end, one has the possibility to modify the learning rate using a LearningRateController. Here we 
# wish to update the learning rate after every training epoch (periodicity=1), according to the parameters given.
agent.attach(bc.LearningRateController(
    initial_learning_rate=parameters.learning_rate, 
    learning_rate_decay=parameters.learning_rate_decay,
    periodicity=1))

# Same for the discount factor.
agent.attach(bc.DiscountFactorController(
    initial_discount_factor=parameters.discount, 
    discount_factor_growth=parameters.discount_inc, 
    discount_factor_max=parameters.discount_max,
    periodicity=1))

# During training epochs, we want to train the agent after every [parameters.update_frequency] action it takes.
# Plus, we also want to display after each training episode (!= than after every training) the average bellman
# residual and the average of the V values obtained during the last episode, hence the two last arguments.
agent.attach(bc.TrainerController(
    evaluate_on='action', 
    periodicity=parameters.update_frequency, 
    show_episode_avg_V_value=True, 
    show_avg_Bellman_residual=True))

# We wish to discover, among all versions of our neural network (i.e., after every training epoch), which one 
# has the highest validation score.
# To achieve this goal, one can use the FindBestController along with an InterleavedTestEpochControllers. It is 
# important that the validationID is the same than the id argument of the InterleavedTestEpochController.
# The FindBestController will dump on disk the validation scores for each and every network, as well as the 
# structure of the neural network having the best validation score. These dumps can then used to plot the evolution 
# of the validation and test scores (see below) or simply recover the resulting neural network for your 
# application.
agent.attach(bc.FindBestController(
    validationID=figure8_env.VALIDATION_MODE,
    testID=None,
    unique_fname=fname))

# All previous controllers control the agent during the epochs it goes through. However, we want to interleave a 
# "validation epoch" between each training epoch. For each validation epoch, we want also to display the sum of all 
# rewards obtained, hence the showScore=True. Finally, we want to call the summarizePerformance method of ALE_env 
# every [parameters.period_btw_summary_perfs] *validation* epochs.
agent.attach(bc.InterleavedTestEpochController(
    id=figure8_env.VALIDATION_MODE, 
    epoch_length=parameters.steps_per_test,
    periodicity=1,
    show_score=True,
    summarize_every=1))

The parameters hash is: 62977be8e45d8a56a5537c11dfd5d2fd8dda69e0
The parameters are: <__main__.Defaults object at 0x147e1fa00>
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
Non

In [5]:
agent._dataset.n_elems

5000

In [6]:
agent._mode

-1

In [7]:
agent.gathering_data

True

In [8]:
try:
    os.mkdir("params")
except Exception:
    pass
dump(vars(parameters), "params/" + fname + ".jldump")
#agent.gathering_data=False
agent.run(parameters.epochs, parameters.steps_per_epoch)

# --- Show results ---
basename = "scores/" + fname
scores = load(basename + "_scores.jldump")
print(scores)

None
Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 0.0
Es[0], TEs[0], Esp_[0]
tensor([ 0.4825, -0.0980, -0.0048]) tensor([ 0.1886, -0.0058, -0.1392]) tensor([ 0.4828, -0.0944, -0.0029])
R[0]
tensor([0.1812], grad_fn=<SelectBackward0>)
> [0;32m/Users/chingfang/Code/deer/deer/agent.py[0m(848)[0;36maddSample[0;34m()[0m
[0;32m    846 [0;31m                [0;32mif[0m [0mhidden[0m [0;32mis[0m [0;32mnot[0m [0;32mNone[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    847 [0;31m                    [0;32mimport[0m [0mpdb[0m[0;34m;[0m [0mpdb[0m[0;34m.[0m[0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 848 [0;31m                [0mprint[0m[0;34m([0m[0mhidden[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    849 [0;31m                [0mself[0m[0;34m.[0m[0m_hiddens[0m[0;34m.[0m[0mappend[0m[0;34m([0m[0mhidden[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m


(tensor([[[ 0.5671, -0.1807, -0.0664],
         [ 0.5778, -0.1861, -0.0509],
         [ 0.5407, -0.1805, -0.0657],
         [ 0.5762, -0.1948, -0.0804],
         [ 0.5795, -0.1925, -0.0740],
         [ 0.5085, -0.1682, -0.0666],
         [ 0.5069, -0.1648, -0.0615],
         [ 0.5910, -0.2022, -0.0852],
         [ 0.5453, -0.1921, -0.0616],
         [ 0.5096, -0.1441, -0.0384],
         [ 0.5218, -0.1464, -0.0319],
         [ 0.5278, -0.1764, -0.0536],
         [ 0.5088, -0.1458, -0.0375],
         [ 0.5273, -0.1850, -0.0646],
         [ 0.5745, -0.1980, -0.0847],
         [ 0.4838, -0.1768, -0.0438],
         [ 0.5375, -0.1823, -0.0506],
         [ 0.5671, -0.1885, -0.0768],
         [ 0.4548, -0.1308, -0.0010],
         [ 0.5838, -0.1937, -0.0786],
         [ 0.5687, -0.1936, -0.0838],
         [ 0.5204, -0.1604, -0.0342],
         [ 0.5723, -0.1931, -0.0755],
         [ 0.5410, -0.1816, -0.0614],
         [ 0.4533, -0.1101,  0.0156],
         [ 0.4531, -0.1319, -0.0098],
         [ 

(tensor([[[ 0.5864, -0.2475, -0.0907],
         [ 0.6239, -0.2452, -0.1149],
         [ 0.5478, -0.2194, -0.1134],
         [ 0.6178, -0.2394, -0.1075],
         [ 0.5440, -0.2168, -0.0784],
         [ 0.5836, -0.2415, -0.1111],
         [ 0.5441, -0.2168, -0.1016],
         [ 0.6165, -0.2283, -0.0928],
         [ 0.6268, -0.2366, -0.0990],
         [ 0.6101, -0.2280, -0.1125],
         [ 0.6247, -0.2466, -0.1006],
         [ 0.5538, -0.2428, -0.1175],
         [ 0.5071, -0.1957, -0.0695],
         [ 0.5869, -0.2499, -0.1193],
         [ 0.5420, -0.2142, -0.0742],
         [ 0.5420, -0.2197, -0.1046],
         [ 0.6200, -0.2399, -0.1105],
         [ 0.6190, -0.2465, -0.0960],
         [ 0.5170, -0.2237, -0.0499],
         [ 0.5207, -0.2372, -0.0380],
         [ 0.5573, -0.2353, -0.0940],
         [ 0.5839, -0.2479, -0.0925],
         [ 0.6351, -0.2478, -0.1171],
         [ 0.5213, -0.2382, -0.0456],
         [ 0.6300, -0.2468, -0.1112],
         [ 0.6134, -0.2445, -0.0991],
         [ 

(tensor([[[ 0.6195, -0.2782, -0.1188],
         [ 0.6073, -0.2927, -0.1410],
         [ 0.5523, -0.2712, -0.0813],
         [ 0.6313, -0.2871, -0.1327],
         [ 0.5945, -0.3018, -0.1265],
         [ 0.5961, -0.2967, -0.1342],
         [ 0.6212, -0.2870, -0.1335],
         [ 0.5763, -0.2875, -0.0976],
         [ 0.5900, -0.2816, -0.1207],
         [ 0.6259, -0.2898, -0.1227],
         [ 0.5509, -0.2932, -0.1388],
         [ 0.5893, -0.2528, -0.0809],
         [ 0.5581, -0.2921, -0.0882],
         [ 0.6245, -0.2855, -0.1331],
         [ 0.6205, -0.2739, -0.1077],
         [ 0.5088, -0.2565, -0.0965],
         [ 0.5231, -0.2795, -0.1345],
         [ 0.6124, -0.2683, -0.1265],
         [ 0.4748, -0.1803, -0.0458],
         [ 0.6377, -0.2897, -0.1336],
         [ 0.5953, -0.2680, -0.0931],
         [ 0.4716, -0.1996, -0.0628],
         [ 0.5558, -0.2686, -0.0804],
         [ 0.5982, -0.2706, -0.1305],
         [ 0.5800, -0.2893, -0.1200],
         [ 0.6222, -0.2740, -0.1069],
         [ 

AttributeError: 'tuple' object has no attribute 'dim'

In [None]:
agent._mode

In [None]:
agent.gathering_data

### Why is the hidden state not in batch sizes?

## Visualize performance

In [None]:
agent.setNetwork(fname, nEpoch=2)

In [None]:
agent._in_episode = True
agent._mode = 0 # Testing mode with plan_depth=0
initState = env.reset(agent._mode)
inputDims = env.inputDimensions()

for i in range(len(inputDims)):
    if inputDims[i][0] > 1:
        agent._state[i][1:] = initState[i][1:]
agent._Vs_on_last_episode = []
is_terminal = False
reward = 0

### Frame by frame

In [None]:
%matplotlib inline

for i in range(100):
    obs = env.observe()
    _obs = obs[0].reshape((env.WIDTH, env.HEIGHT))
    plt.figure()
    plt.imshow(np.flip(_obs.squeeze()))
    plt.show()
    for i in range(len(obs)):
        agent._state[i][0:-1] = agent._state[i][1:]
        agent._state[i][-1] = obs[i]
    V, action, reward, _ = agent._step()
    print(action)
    agent._Vs_on_last_episode.append(V)
    is_terminal = env.inTerminalState()
    if is_terminal: break

### As animation

In [None]:
%matplotlib notebook

import numpy as np 
import matplotlib.pyplot as plt
from IPython.display import HTML
import matplotlib.animation as animation

fig, ax = plt.subplots(1,1)
obs = env.observe()
_obs = obs[0].reshape((env.WIDTH, env.HEIGHT))
_obs = np.flip(_obs.squeeze())
ax.set_xticks([])
ax.set_yticks([])
im = ax.imshow(np.zeros(_obs.shape))

def init():
    plt.cla()
    im = ax.imshow(_obs)
    return [im]

def animate(i, *args, **kwargs):
    plt.cla()
    obs = env.observe()
    _obs = obs[0].reshape((env.WIDTH, env.HEIGHT))
    _obs = np.flip(_obs.squeeze())
    im = ax.imshow(_obs)
    for i in range(len(obs)):
        agent._state[i][0:-1] = agent._state[i][1:]
        agent._state[i][-1] = obs[i]
        V, action, reward, _ = agent._step()
        agent._Vs_on_last_episode.append(V)
    return [im]

ani = animation.FuncAnimation(fig, animate, init_func=init, 
     frames=100, blit=False, repeat=True)
ani.save('behavior.gif', writer="ffmpeg", fps = 15)
fig.show()