In [1]:
import base64
import imageio
import matplotlib
import matplotlib.pyplot as plt

import tensorflow as tf

from tf_agents.agents.dqn.dqn_agent import DqnAgent
from tf_agents.networks.q_network import QNetwork

from tf_agents.environments import suite_gym
from tf_agents.environments import tf_py_environment

from tf_agents.policies.random_tf_policy import RandomTFPolicy
from tf_agents.replay_buffers.tf_uniform_replay_buffer import TFUniformReplayBuffer
from tf_agents.trajectories import trajectory
from tf_agents.utils import common

# Globals
NUMBER_ITERATION = 20000  # iterations of the training loop
COLLECTION_STEPS = 1  # environment steps collected per training iteration
BATCH_SIZE = 64  # sample_batch_size used when reading from the replay buffer
EVAL_EPISODES = 10  # episodes averaged in each policy evaluation
EVAL_INTERVAL = 1000  # evaluate and log every this many training steps

We imported TensorFlow and a lot of modules from __TF-Agents__. 

One of the classes we imported is __DqnAgent__, specific agent that can perform Deep Q-Learning. This is really cool and saves us a lot of time. 

Also we imported __QNetwork__ class. This class is an abstraction of neural network that we use for learning. As you can see, as with transfer learning, this saves us a bunch of time. 

We also import __suite_gym__ and __tf_py_environment__. The first module grants us access to training environments. __Since all of these environments are implemented in Python__, we need to __wrap them up into TensorFlow__. That is what __tf_py_environment__ is used for. 

For _experience replay_, we use the class __TFUniformReplayBuffer__, and in this buffer we store trajectories. A trajectory is a tuple that contains the state of the environment at some time step, the action the agent takes in that state, and the state the environment will be in after that action is performed.

# Entorno

After importing all necessary modules, we need to construct the environment. In fact, we need two environments, one for training and the other one for evaluation. Here is how we do that:

In [24]:
# Load two independent CartPole instances: one used to collect training data
# and one reserved for evaluation.
train_py_env = suite_gym.load('CartPole-v0')
evaluation_py_env = suite_gym.load('CartPole-v0')

# Inspect the specs of the raw Python environment.
print('Observation Spec:')
print(train_py_env.time_step_spec().observation)

print('Reward Spec:')
print(train_py_env.time_step_spec().reward)

print('Action Spec:')
print(train_py_env.action_spec())

# Wrap the Python environments for use from TensorFlow. The raw environments
# keep their own names so that `train_env` / `evaluation_env` always refer to
# the wrapped objects and are never rebound to a different kind of object.
train_env = tf_py_environment.TFPyEnvironment(train_py_env)
evaluation_env = tf_py_environment.TFPyEnvironment(evaluation_py_env)

Observation Spec:
TimeStep(step_type=ArraySpec(shape=(), dtype=dtype('int32'), name='step_type'), reward=ArraySpec(shape=(), dtype=dtype('float32'), name='reward'), discount=BoundedArraySpec(shape=(), dtype=dtype('float32'), name='discount', minimum=0.0, maximum=1.0), observation=BoundedArraySpec(shape=(4,), dtype=dtype('float32'), name='observation', minimum=[-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38], maximum=[4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38]))
Observation Spec:
BoundedArraySpec(shape=(4,), dtype=dtype('float32'), name='observation', minimum=[-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38], maximum=[4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38])
Reward Spec:
ArraySpec(shape=(), dtype=dtype('float32'), name='reward')
Action Spec:
BoundedArraySpec(shape=(), dtype=dtype('int64'), name='action', minimum=0, maximum=1)


# Agent
Now, we can _build DQN agent_. Before we proceed with that, __we need to create an instance of QNetwork class__. 

## QNetwork

Here we have two obligatory parameters and a number of optional ones:

- input_tensor_spec, which is the set of possible states of the environment 
- action_spec, which is the set of possible actions that the agent can undertake in that environment

Among the other parameters, fc_layer_params is of great importance to us. Using this parameter, we can define the number of neurons in each hidden layer. We use this constructor like this:

In [21]:
# A single hidden fully connected layer with 100 units.
hidden_layers = (100,)

# Q-network built from the training environment's observation and action specs.
q_network = QNetwork(
    input_tensor_spec=train_env.observation_spec(),
    action_spec=train_env.action_spec(),
    # Sizes of the fully connected hidden layers.
    fc_layer_params=hidden_layers,
)

## DqnAgent

We defined only one hidden layer with 100 neurons and passed the training environment's observation and action specs to the QNetwork constructor. Now we can instantiate an object of the DqnAgent class.

In [28]:
# Variable handed to the agent as its training-step counter.
counter = tf.Variable(0)

agent = DqnAgent(
    # Specs of the training environment (time steps and actions).
    time_step_spec=train_env.time_step_spec(),
    action_spec=train_env.action_spec(),
    # Neural network used by the agent.
    q_network=q_network,
    # Optimizer and TD-error loss function.
    optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=1e-3),
    td_errors_loss_fn=common.element_wise_squared_loss,
    # Counter of training steps.
    train_step_counter=counter,
)

# Initialize the agent.
agent.initialize()

# Show the spec of the data the agent collects, first whole and then by field.
print('Agent Spec (Trajectory):')
print(agent.collect_data_spec)

print('Agent Spec Trajectory.observation:')
print(agent.collect_data_spec.observation)

print('Agent Spec Trajectory.action:')
print(agent.collect_data_spec.action)

print('Agent Spec Trajectory.reward:')
print(agent.collect_data_spec.reward)



Agent Spec (Trajectory):
Trajectory(step_type=TensorSpec(shape=(), dtype=tf.int32, name='step_type'), observation=BoundedTensorSpec(shape=(4,), dtype=tf.float32, name='observation', minimum=array([-4.8000002e+00, -3.4028235e+38, -4.1887903e-01, -3.4028235e+38],
      dtype=float32), maximum=array([4.8000002e+00, 3.4028235e+38, 4.1887903e-01, 3.4028235e+38],
      dtype=float32)), action=BoundedTensorSpec(shape=(), dtype=tf.int64, name='action', minimum=array(0, dtype=int64), maximum=array(1, dtype=int64)), policy_info=(), next_step_type=TensorSpec(shape=(), dtype=tf.int32, name='step_type'), reward=TensorSpec(shape=(), dtype=tf.float32, name='reward'), discount=BoundedTensorSpec(shape=(), dtype=tf.float32, name='discount', minimum=array(0., dtype=float32), maximum=array(1., dtype=float32)))
Agent Spec Trajectory.observation:
BoundedTensorSpec(shape=(4,), dtype=tf.float32, name='observation', minimum=array([-4.8000002e+00, -3.4028235e+38, -4.1887903e-01, -3.4028235e+38],
      dtype=flo

## Simulaciones

The function below computes how much reward the agent gains on average per episode.

In [16]:
def get_average_return(environment, policy, episodes=10):
    """Run ``policy`` for a number of episodes and return the mean episode return.

    Each episode starts with ``environment.reset()`` and continues until the
    current time step reports ``is_last()``. The rewards of the steps taken are
    summed per episode, and the per-episode sums are averaged.

    Args:
        environment: Object providing ``reset()`` and ``step(action)``, both
            returning a time step with ``is_last()`` and ``reward``.
        policy: Object whose ``action(time_step)`` returns a value with an
            ``action`` attribute.
        episodes: Number of episodes to run; must be at least 1.

    Returns:
        The first element of the averaged return, i.e. ``avg.numpy()[0]``.
        NOTE(review): this assumes rewards are tensors with a leading batch
        dimension of size 1 -- confirm if the environment is ever batched.

    Raises:
        ValueError: If ``episodes`` is smaller than 1.
    """
    # Without this guard, 0 episodes fails with ZeroDivisionError and a
    # negative count fails with AttributeError on a plain float.
    if episodes < 1:
        raise ValueError(
            'episodes must be a positive integer, got {!r}'.format(episodes))

    total_return = 0.0

    for _ in range(episodes):
        # Start a fresh episode.
        time_step = environment.reset()
        episode_return = 0.0

        while not time_step.is_last():
            # Ask the policy for an action and apply it to the environment.
            action_step = policy.action(time_step)
            time_step = environment.step(action_step.action)
            # Accumulate the reward obtained by that step.
            episode_return += time_step.reward

        total_return += episode_return

    avg_return = total_return / episodes

    return avg_return.numpy()[0]

# Experience Replay

Ok, let’s build the last piece of the Deep Q-Learning ecosystem – Experience Replay. For this purpose, we implement the `ExperienceReply` class:

In [12]:
class ExperienceReply(object):
    """Experience replay: a uniform replay buffer plus a dataset iterator over it.

    On construction the buffer is pre-filled by stepping the given environment
    with a random policy, and a dataset pipeline is created whose batches are
    read through ``self.iterator``.

    Args:
        agent: Agent whose ``collect_data_spec`` describes the stored items.
        enviroment: Environment (spelling kept for keyword compatibility) that
            provides ``batch_size``, ``time_step_spec()`` and ``action_spec()``
            and is stepped to pre-fill the buffer.
        max_length: ``max_length`` passed to the replay buffer.
        initial_steps: Number of random-policy steps collected before the
            dataset is created.

    Attributes:
        dataset: Dataset built from the replay buffer with
            ``sample_batch_size=BATCH_SIZE`` and ``num_steps=2``.
        iterator: Iterator over ``dataset``.
    """

    def __init__(self, agent, enviroment, max_length=50000, initial_steps=100):
        # Buffer of collected trajectories that training samples from.
        self._replay_buffer = TFUniformReplayBuffer(
            data_spec=agent.collect_data_spec,
            batch_size=enviroment.batch_size,
            max_length=max_length)

        # Use the specs of the environment that was passed in. The previous
        # code read the global `train_env` here, silently ignoring the argument.
        self._random_policy = RandomTFPolicy(enviroment.time_step_spec(),
                                             enviroment.action_spec())

        # Seed the buffer with random-policy transitions from that same environment.
        self._fill_buffer(enviroment, self._random_policy, steps=initial_steps)

        # NOTE(review): num_steps=2 presumably yields pairs of adjacent steps
        # for the DQN update -- confirm against the TF-Agents documentation.
        self.dataset = self._replay_buffer.as_dataset(
            num_parallel_calls=3,
            sample_batch_size=BATCH_SIZE,
            num_steps=2).prefetch(3)

        self.iterator = iter(self.dataset)

    def _fill_buffer(self, enviroment, policy, steps):
        """Collect ``steps`` transitions from ``enviroment`` using ``policy``."""
        for _ in range(steps):
            self.timestamp_data(enviroment, policy)

    def timestamp_data(self, environment, policy):
        """Take one step in ``environment`` with ``policy`` and store the transition.

        The current time step, the policy's action step and the resulting next
        time step are combined with ``trajectory.from_transition`` and added to
        the replay buffer.
        """
        time_step = environment.current_time_step()
        action_step = policy.action(time_step)
        next_time_step = environment.step(action_step.action)
        timestamp_trajectory = trajectory.from_transition(
            time_step, action_step, next_time_step)

        self._replay_buffer.add_batch(timestamp_trajectory)

# Build the experience replay for the training environment; constructing it
# also pre-fills the buffer and creates the sampling iterator.
experience_replay = ExperienceReply(agent, train_env)

In [19]:
# Inspect the two values the replay buffer is configured with:
# the agent's collect data spec and the environment's batch size.
print(agent.collect_data_spec)
print(train_env.batch_size)

Trajectory(step_type=TensorSpec(shape=(), dtype=tf.int32, name='step_type'), observation=BoundedTensorSpec(shape=(4,), dtype=tf.float32, name='observation', minimum=array([-4.8000002e+00, -3.4028235e+38, -4.1887903e-01, -3.4028235e+38],
      dtype=float32), maximum=array([4.8000002e+00, 3.4028235e+38, 4.1887903e-01, 3.4028235e+38],
      dtype=float32)), action=BoundedTensorSpec(shape=(), dtype=tf.int64, name='action', minimum=array(0, dtype=int64), maximum=array(1, dtype=int64)), policy_info=(), next_step_type=TensorSpec(shape=(), dtype=tf.int32, name='step_type'), reward=TensorSpec(shape=(), dtype=tf.float32, name='reward'), discount=BoundedTensorSpec(shape=(), dtype=tf.float32, name='discount', minimum=array(0., dtype=float32), maximum=array(1., dtype=float32)))
1


In the constructor of this class, we initialize replay buffer, which is an object of the class __TFUniformReplayBuffer__.

If your agent is not getting good results, you can play with batch size and length of the buffer.

Also, we created an instance of RandomTFPolicy. This one is used to fill the buffer with initial values, which is done by calling the internal method `_fill_buffer`. This method in turn calls the `timestamp_data` method once for each step taken in the environment.

The `timestamp_data` method then forms a trajectory from the current state, the action chosen by the policy, and the resulting next state. This trajectory is stored in the buffer.

Final step of the constructor is to create an iterable tf.data.Dataset pipeline which feeds data to the agent.

# Training and Evaluation

Once we have all this prepared, implementing the training process is straightforward:

First, we reset the agent's training step counter to 0 and compute the initial average return. Then the training process runs for the defined number of iterations. In each iteration we first collect data from the environment and then use a batch sampled from the replay buffer to train both of the agent’s neural networks. We also periodically print the average return measured on the evaluation environment together with the latest training loss. Here is what that looks like:

In [29]:
# Restart the step count so the evaluation schedule below begins at zero.
agent.train_step_counter.assign(0)

# Baseline: average return of the agent's policy before any training.
avg_return = get_average_return(evaluation_env, agent.policy, EVAL_EPISODES)
returns = [avg_return]

for _ in range(NUMBER_ITERATION):

    # Collect fresh experience with the agent's collect policy.
    for _ in range(COLLECTION_STEPS):
        experience_replay.timestamp_data(train_env, agent.collect_policy)

    # Pull the next sampled batch and run one training step on it.
    experience, info = next(experience_replay.iterator)
    train_loss = agent.train(experience).loss

    # Only evaluate and log on multiples of EVAL_INTERVAL.
    step = agent.train_step_counter.numpy()
    if step % EVAL_INTERVAL != 0:
        continue

    avg_return = get_average_return(evaluation_env, agent.policy, EVAL_EPISODES)
    print('Iteration {0} - Average Return = {1}, Loss = {2}.'.format(step, avg_return, train_loss))
    returns.append(avg_return)

Iteration 1000 - Average Return = 23.0, Loss = 21.123376846313477.
Iteration 2000 - Average Return = 31.600000381469727, Loss = 13.086341857910156.
Iteration 3000 - Average Return = 32.20000076293945, Loss = 28.730281829833984.
Iteration 4000 - Average Return = 126.5999984741211, Loss = 6.088086128234863.
Iteration 5000 - Average Return = 161.0, Loss = 29.583511352539062.
Iteration 6000 - Average Return = 188.3000030517578, Loss = 98.26377868652344.
Iteration 7000 - Average Return = 200.0, Loss = 14.179779052734375.
Iteration 8000 - Average Return = 199.89999389648438, Loss = 394.890380859375.
Iteration 9000 - Average Return = 200.0, Loss = 198.0342254638672.
Iteration 10000 - Average Return = 200.0, Loss = 263.45770263671875.
Iteration 11000 - Average Return = 200.0, Loss = 656.7501831054688.
Iteration 12000 - Average Return = 200.0, Loss = 70.80119323730469.
Iteration 13000 - Average Return = 200.0, Loss = 55.57476806640625.
Iteration 14000 - Average Return = 200.0, Loss = 62.5096359