In [1]:
import sys
import logging
import numpy as np
from joblib import hash, dump
import os

from deer.default_parser import process_args
from deer.agent import NeuralAgent
from deer.learning_algos.CRAR_torch import CRAR
from catcher_env import MyEnv as catcher_env
import deer.experiment.base_controllers as bc

from deer.policies import EpsilonGreedyPolicy

In [2]:
class Defaults:
    # ----------------------
    # Experiment Parameters
    # ----------------------
    steps_per_epoch = 2000
    epochs = 50
    steps_per_test = 500
    period_btw_summary_perfs = 1
    
    # ----------------------
    # Environment Parameters
    # ----------------------
    frame_skip = 2

    # ----------------------
    # DQN Agent parameters:
    # ----------------------
    update_rule = 'rmsprop'
    learning_rate = 0.0005
    learning_rate_decay = 0.9
    discount = 0.9
    discount_inc = 1
    discount_max = 0.99
    rms_decay = 0.9
    rms_epsilon = 0.0001
    momentum = 0
    clip_norm = 1.0
    epsilon_start = 1.0
    epsilon_min = 1.0
    epsilon_decay = 10000
    update_frequency = 1
    replay_memory_size = 1000000
    batch_size = 32
    freeze_interval = 1000
    deterministic = False

HIGHER_DIM_OBS = True
HIGH_INT_DIM = False

In [3]:
parameters = Defaults()
if parameters.deterministic:
    rng = np.random.RandomState(123456)
else:
    rng = np.random.RandomState()

In [4]:
# --- Instantiate environment ---
env = catcher_env(rng, higher_dim_obs=HIGHER_DIM_OBS, reverse=False)

# --- Instantiate learning algorithm ---
learning_algo = CRAR(
    env,
    parameters.rms_decay,
    parameters.rms_epsilon,
    parameters.momentum,
    parameters.clip_norm,
    parameters.freeze_interval,
    parameters.batch_size,
    parameters.update_rule,
    rng,
    double_Q=True,
    high_int_dim=HIGH_INT_DIM,
    internal_dim=3)

test_policy = EpsilonGreedyPolicy(learning_algo, env.nActions(), rng, 0.1)#1.)

# --- Instantiate agent ---
agent = NeuralAgent(
    env,
    learning_algo,
    parameters.replay_memory_size,
    max(env.inputDimensions()[i][0] for i in range(len(env.inputDimensions()))),
    parameters.batch_size,
    rng,
    test_policy=test_policy)

# --- Create unique filename for FindBestController ---
h = hash(vars(parameters), hash_name="sha1")
fname = "test_" + h
print("The parameters hash is: {}".format(h))
print("The parameters are: {}".format(parameters))

# --- Bind controllers to the agent ---
# Before every training epoch (periodicity=1), we want to print a summary of the agent's epsilon, discount and 
# learning rate as well as the training epoch number.
agent.attach(bc.VerboseController(
    evaluate_on='epoch', 
    periodicity=1))

# As for the discount factor and the learning rate, one can update periodically the parameter of the epsilon-greedy
# policy implemented by the agent. This controllers has a bit more capabilities, as it allows one to choose more
# precisely when to update epsilon: after every X action, episode or epoch. This parameter can also be reset every
# episode or epoch (or never, hence the resetEvery='none').
agent.attach(bc.EpsilonController(
    initial_e=parameters.epsilon_start, 
    e_decays=parameters.epsilon_decay, 
    e_min=parameters.epsilon_min,
    evaluate_on='action',
    periodicity=1,
    reset_every='none'))

# During training epochs, we want to train the agent after every [parameters.update_frequency] action it takes.
# Plus, we also want to display after each training episode (!= than after every training) the average bellman
# residual and the average of the V values obtained during the last episode, hence the two last arguments.
agent.attach(bc.TrainerController(
    evaluate_on='action', 
    periodicity=parameters.update_frequency, 
    show_episode_avg_V_value=True, 
    show_avg_Bellman_residual=True))

# Every epoch end, one has the possibility to modify the learning rate using a LearningRateController. Here we 
# wish to update the learning rate after every training epoch (periodicity=1), according to the parameters given.
agent.attach(bc.LearningRateController(
    initial_learning_rate=parameters.learning_rate, 
    learning_rate_decay=parameters.learning_rate_decay,
    periodicity=1))

# Same for the discount factor.
agent.attach(bc.DiscountFactorController(
    initial_discount_factor=parameters.discount, 
    discount_factor_growth=parameters.discount_inc, 
    discount_factor_max=parameters.discount_max,
    periodicity=1))

# All previous controllers control the agent during the epochs it goes through. However, we want to interleave a 
# "validation epoch" between each training epoch ("one of two epochs", hence the periodicity=2). We do not want 
# these validation epoch to interfere with the training of the agent, which is well established by the 
# TrainerController, EpsilonController and alike. Therefore, we will disable these controllers for the whole 
# duration of the validation epochs interleaved this way, using the controllersToDisable argument of the 
# InterleavedTestEpochController. For each validation epoch, we want also to display the sum of all rewards 
# obtained, hence the showScore=True. Finally, we want to call the summarizePerformance method of ALE_env every 
# [parameters.period_btw_summary_perfs] *validation* epochs.
agent.attach(bc.InterleavedTestEpochController(
    id=catcher_env.VALIDATION_MODE, 
    epoch_length=parameters.steps_per_test,
    periodicity=1,
    show_score=True,
    summarize_every=1))

The parameters hash is: 62977be8e45d8a56a5537c11dfd5d2fd8dda69e0
The parameters are: <__main__.Defaults object at 0x2b613d43e700>


In [5]:
# --- Run the experiment ---
try:
    os.mkdir("params")
except Exception:
    pass
dump(vars(parameters), "params/" + fname + ".jldump")
agent.run(parameters.epochs, parameters.steps_per_epoch)

Printing a few elements useful for debugging:
actions_val[0], rewards_val[0], terminals_val[0]
0 0.0 False
Es[0], TEs[0], Esp_[0]
tensor([0.2541, 0.0169, 0.1931], device='cuda:0') tensor([ 0.4409, -0.2150,  0.0022], device='cuda:0') tensor([0.2537, 0.0178, 0.1930], device='cuda:0')
R[0]
tensor([-0.1063], device='cuda:0', grad_fn=<SelectBackward0>)
Average (on the epoch) training loss: 0.0003780538681894541
Episode average V value: 0
Average (on the epoch) training loss: 0.029045530211399582
Episode average V value: 0
Average (on the epoch) training loss: 0.04262078488962008
Episode average V value: 0
Average (on the epoch) training loss: 0.05514818562992981
Episode average V value: 0
Average (on the epoch) training loss: 0.0639198447421553
Episode average V value: 0
Average (on the epoch) training loss: 0.06882521309911818
Episode average V value: 0
Average (on the epoch) training loss: 0.07229921795641853
Episode average V value: 0
Average (on the epoch) training loss: 0.0750280910107

Average (on the epoch) training loss: 0.09259345294839316
Episode average V value: 0
Average (on the epoch) training loss: 0.09248162058690894
Episode average V value: 0
Average (on the epoch) training loss: 0.09248784050328054
Episode average V value: 0
Average (on the epoch) training loss: 0.09223797498365456
Episode average V value: 0
Average (on the epoch) training loss: 0.09253621596950354
Episode average V value: 0
Average (on the epoch) training loss: 0.09282596162123284
Episode average V value: 0
Average (on the epoch) training loss: 0.09267384640090089
Episode average V value: 0
Average (on the epoch) training loss: 0.09273470167170053
Episode average V value: 0
Average (on the epoch) training loss: 0.09300407398603691
Episode average V value: 0
Average (on the epoch) training loss: 0.09322550365750665
Episode average V value: 0
Average (on the epoch) training loss: 0.09307410138901183
Episode average V value: 0
Average (on the epoch) training loss: 0.09312032103313425
Episode

Average (on the epoch) training loss: 0.09107767203922121
Episode average V value: 0
Average (on the epoch) training loss: 0.09109434915946239
Episode average V value: 0
Average (on the epoch) training loss: 0.09095292990836738
Episode average V value: 0
Average (on the epoch) training loss: 0.09096505439384436
Episode average V value: 0
Average (on the epoch) training loss: 0.09097631383042647
Episode average V value: 0
Average (on the epoch) training loss: 0.09101106454116253
Episode average V value: 0
Average (on the epoch) training loss: 0.09099002258882623
Episode average V value: 0
Average (on the epoch) training loss: 0.09097762214508189
Episode average V value: 0
Average (on the epoch) training loss: 0.09085767679505517
Episode average V value: 0
Average (on the epoch) training loss: 0.09080154903926424
Episode average V value: 0
Average (on the epoch) training loss: 0.09080093412670678
Episode average V value: 0
Average (on the epoch) training loss: 0.09092289550473584
Episode



NameError: name 'copy' is not defined