In [2]:
import gym
import tensorflow as tf
from tensorflow import keras
import random
import numpy as np
import datetime as dt
import math

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [3]:
STORE_PATH = './'
MAX_EPSILON = 1
MIN_EPSILON = 0.01
LAMBDA = 0.0005
GAMMA = 0.95
BATCH_SIZE = 32
TAU = 0.08
RANDOM_REWARD_STD = 1.0

In [4]:
env = gym.make("CartPole-v0")
state_size = 4
num_actions = env.action_space.n

In [5]:
primary_network = keras.Sequential([
    keras.layers.Dense(30, activation='relu', kernel_initializer=keras.initializers.he_normal()),
    keras.layers.Dense(30, activation='relu', kernel_initializer=keras.initializers.he_normal()),
    keras.layers.Dense(num_actions)
])

target_network = keras.Sequential([
    keras.layers.Dense(30, activation='relu', kernel_initializer=keras.initializers.he_normal()),
    keras.layers.Dense(30, activation='relu', kernel_initializer=keras.initializers.he_normal()),
    keras.layers.Dense(num_actions)
])

In [13]:
primary_network.compile(optimizer=keras.optimizers.Adam(), loss='mse')

In [7]:
class Memory:
    def __init__(self, max_memory):
        self._max_memory = max_memory
        self._samples = []

    def add_sample(self, sample):
        self._samples.append(sample)
        if len(self._samples) > self._max_memory:
            self._samples.pop(0)

    def sample(self, no_samples):
        if no_samples > len(self._samples):
            return random.sample(self._samples, len(self._samples))
        else:
            return random.sample(self._samples, no_samples)

    @property
    def num_samples(self):
        return len(self._samples)

In [8]:
memory = Memory(50000)

In [9]:
def choose_action(state, primary_network, eps):
    if random.random() < eps:
        return random.randint(0, num_actions - 1)
    else:
        return np.argmax(primary_network(state.reshape(1, -1)))



In [10]:
def train(primary_network, memory, target_network=None):
    #– this is to ensure no training of the primary network takes place 
    #until there is a reasonable amount of samples within the memory
    if memory.num_samples < BATCH_SIZE * 3:
        return 0
    #a batch is extracted from the memory 
    batch = memory.sample(BATCH_SIZE)
    #individual state, actions and reward values are then extracted and converted
    #to numpy arrays using Python list comprehensions
    states = np.array([val[0] for val in batch])
    actions = np.array([val[1] for val in batch])
    rewards = np.array([val[2] for val in batch])
    #the next_state values are set to zeros if the raw next_state values are None
    next_states = np.array([(np.zeros(state_size) if val[3] is None else val[3]) for val in batch])
    
    # predict Q(s,a) given the batch of states
    prim_qt = primary_network(states)
    # predict Q(s',a') from the evaluation network
    prim_qtp1 = primary_network(next_states)
    
    # copy the prim_qt tensor into the target_q tensor - we then will update one index corresponding to the max action
    target_q = prim_qt.numpy()
    
    #Note that the target_q values are the same as the prim_qt () values except for the index 
    #corresponding to the action chosen
    updates = rewards
    
    #Comprueba para cada una de las muestras que el estado no sea cero. Si lo es significa que no podemos usar la muestra,
    # valid_idxs sera un vector con un true en aquellas posiciones donde el next_state estaba informado
    valid_idxs = np.array(next_states).sum(axis=1) != 0
    
    batch_idxs = np.arange(BATCH_SIZE)
    
    #Esta funciona es valida para DQN y para no DQN
    if target_network is None:
        #Modo tradicional QN
        updates[valid_idxs] += GAMMA * np.amax(prim_qtp1.numpy()[valid_idxs, :], axis=1)
    else:
        #Modo DQN
        prim_action_tp1 = np.argmax(prim_qtp1.numpy(), axis=1)
        q_from_target = target_network(next_states)
        updates[valid_idxs] += GAMMA * q_from_target.numpy()[batch_idxs[valid_idxs], prim_action_tp1[valid_idxs]]
    #calcula el target
    target_q[batch_idxs, actions] = updates
    
    #Entrena el modelo
    loss = primary_network.train_on_batch(states, target_q)
    
    #Copia al modelo target
    if target_network is not None:
        # Actualiza el modelo target, copiando progresivamente los pesos del modelo principal
        for t, e in zip(target_network.trainable_variables, primary_network.trainable_variables):
            t.assign(t * (1 - TAU) + e * TAU)
    return loss


In [12]:
num_episodes = 1000
eps = MAX_EPSILON
#No visualizamos
render = False
train_writer = tf.summary.create_file_writer(STORE_PATH + f"/DoubleQ_{dt.datetime.now().strftime('%d%m%Y%H%M')}")
#If double_q is set to False, the training function defaults to standard deep Q learning
double_q = False
                                             
steps = 0
for i in range(num_episodes):
    state = env.reset()
    cnt = 0
    avg_loss = 0
    while True:
        if render:
            env.render()
        #Elije la accion usando la primary network                                     
        action = choose_action(state, primary_network, eps)
        #Ejecuta la accion
        next_state, reward, done, info = env.step(action)
        #Hacemos el entorno estocastico
        reward = np.random.normal(1.0, RANDOM_REWARD_STD)
        
        if done:
            next_state = None
        
        # guardamos en la memoria, el estado, la accion, la recompensa y el siguiente estado
        memory.add_sample((state, action, reward, next_state))

        #Entrenamos
        loss = train(primary_network, memory, target_network if double_q else None)
        avg_loss += loss

        state = next_state

        # exponentially decay the eps value
        steps += 1
        eps = MIN_EPSILON + (MAX_EPSILON - MIN_EPSILON) * math.exp(-LAMBDA * steps)

        #Terminamos
        if done:
            avg_loss /= cnt
            print(f"Episode: {i}, Reward: {cnt}, avg loss: {avg_loss:.3f}, eps: {eps:.3f}")
            with train_writer.as_default():
                tf.summary.scalar('reward', cnt, step=i)
                tf.summary.scalar('avg loss', avg_loss, step=i)
            break

        cnt += 1


Episode: 0, Reward: 22, avg loss: 0.000, eps: 0.989
Episode: 1, Reward: 24, avg loss: 0.000, eps: 0.977


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

Episode: 2, Reward: 14, avg loss: 0.000, eps: 0.969
Episode: 3, Reward: 13, avg loss: 0.000, eps: 0.963
Episode: 4, Reward: 28, avg loss: 0.542, eps: 0.949
Episode: 5, Reward: 9, avg loss: 1.602, eps: 0.944
Episode: 6, Reward: 19, avg loss: 1.442, eps: 0.935
Episode: 7, Reward: 15, avg loss: 1.731, eps: 0.928
Episode: 8, Reward: 17, avg loss: 2.231, eps: 0.919
Episode: 9, Reward: 9, avg loss: 1.971, eps: 0.915
Episode: 10, Reward: 21, avg loss: 3.381, eps: 0.905
Episode: 11, Reward: 10, avg loss: 3.608, eps: 0.900
Episode: 12, Reward: 11, avg loss: 4.916, eps: 0.895
Episode: 13, Reward: 1

Episode: 139, Reward: 199, avg loss: 1.419, eps: 0.010
Episode: 140, Reward: 199, avg loss: 1.276, eps: 0.010
Episode: 141, Reward: 199, avg loss: 1.161, eps: 0.010
Episode: 142, Reward: 199, avg loss: 1.339, eps: 0.010
Episode: 143, Reward: 199, avg loss: 1.116, eps: 0.010
Episode: 144, Reward: 199, avg loss: 1.158, eps: 0.010
Episode: 145, Reward: 199, avg loss: 1.250, eps: 0.010
Episode: 146, Reward: 199, avg loss: 1.075, eps: 0.010
Episode: 147, Reward: 199, avg loss: 1.311, eps: 0.010
Episode: 148, Reward: 199, avg loss: 1.241, eps: 0.010
Episode: 149, Reward: 199, avg loss: 1.129, eps: 0.010
Episode: 150, Reward: 199, avg loss: 1.273, eps: 0.010
Episode: 151, Reward: 199, avg loss: 1.186, eps: 0.010
Episode: 152, Reward: 199, avg loss: 1.279, eps: 0.010
Episode: 153, Reward: 199, avg loss: 1.287, eps: 0.010
Episode: 154, Reward: 199, avg loss: 1.158, eps: 0.010
Episode: 155, Reward: 199, avg loss: 1.351, eps: 0.010
Episode: 156, Reward: 199, avg loss: 1.158, eps: 0.010
Episode: 1

Episode: 288, Reward: 199, avg loss: 1.051, eps: 0.010
Episode: 289, Reward: 199, avg loss: 1.060, eps: 0.010
Episode: 290, Reward: 199, avg loss: 0.877, eps: 0.010
Episode: 291, Reward: 199, avg loss: 1.026, eps: 0.010
Episode: 292, Reward: 199, avg loss: 1.105, eps: 0.010
Episode: 293, Reward: 199, avg loss: 1.152, eps: 0.010
Episode: 294, Reward: 199, avg loss: 1.154, eps: 0.010
Episode: 295, Reward: 199, avg loss: 1.080, eps: 0.010
Episode: 296, Reward: 199, avg loss: 0.936, eps: 0.010
Episode: 297, Reward: 199, avg loss: 0.992, eps: 0.010
Episode: 298, Reward: 199, avg loss: 0.926, eps: 0.010
Episode: 299, Reward: 199, avg loss: 1.205, eps: 0.010
Episode: 300, Reward: 199, avg loss: 1.195, eps: 0.010
Episode: 301, Reward: 199, avg loss: 1.191, eps: 0.010
Episode: 302, Reward: 199, avg loss: 0.874, eps: 0.010
Episode: 303, Reward: 199, avg loss: 1.116, eps: 0.010
Episode: 304, Reward: 199, avg loss: 0.837, eps: 0.010
Episode: 305, Reward: 199, avg loss: 1.172, eps: 0.010
Episode: 3

Episode: 438, Reward: 199, avg loss: 0.885, eps: 0.010
Episode: 439, Reward: 199, avg loss: 0.932, eps: 0.010
Episode: 440, Reward: 199, avg loss: 1.108, eps: 0.010
Episode: 441, Reward: 191, avg loss: 1.102, eps: 0.010
Episode: 442, Reward: 199, avg loss: 0.984, eps: 0.010
Episode: 443, Reward: 199, avg loss: 1.039, eps: 0.010
Episode: 444, Reward: 199, avg loss: 0.981, eps: 0.010
Episode: 445, Reward: 166, avg loss: 0.945, eps: 0.010
Episode: 446, Reward: 184, avg loss: 1.110, eps: 0.010
Episode: 447, Reward: 155, avg loss: 1.129, eps: 0.010
Episode: 448, Reward: 143, avg loss: 1.073, eps: 0.010
Episode: 449, Reward: 194, avg loss: 1.141, eps: 0.010
Episode: 450, Reward: 199, avg loss: 0.939, eps: 0.010
Episode: 451, Reward: 199, avg loss: 1.000, eps: 0.010
Episode: 452, Reward: 168, avg loss: 0.987, eps: 0.010
Episode: 453, Reward: 199, avg loss: 1.087, eps: 0.010
Episode: 454, Reward: 174, avg loss: 0.881, eps: 0.010
Episode: 455, Reward: 199, avg loss: 1.023, eps: 0.010
Episode: 4

Episode: 587, Reward: 184, avg loss: 0.865, eps: 0.010
Episode: 588, Reward: 179, avg loss: 0.822, eps: 0.010
Episode: 589, Reward: 143, avg loss: 0.911, eps: 0.010
Episode: 590, Reward: 118, avg loss: 0.858, eps: 0.010
Episode: 591, Reward: 199, avg loss: 0.941, eps: 0.010
Episode: 592, Reward: 199, avg loss: 0.988, eps: 0.010
Episode: 593, Reward: 140, avg loss: 0.934, eps: 0.010
Episode: 594, Reward: 144, avg loss: 0.939, eps: 0.010
Episode: 595, Reward: 157, avg loss: 0.982, eps: 0.010
Episode: 596, Reward: 124, avg loss: 0.772, eps: 0.010
Episode: 597, Reward: 160, avg loss: 0.820, eps: 0.010
Episode: 598, Reward: 128, avg loss: 0.964, eps: 0.010
Episode: 599, Reward: 133, avg loss: 0.881, eps: 0.010
Episode: 600, Reward: 127, avg loss: 0.938, eps: 0.010
Episode: 601, Reward: 140, avg loss: 0.964, eps: 0.010
Episode: 602, Reward: 164, avg loss: 0.867, eps: 0.010
Episode: 603, Reward: 177, avg loss: 0.944, eps: 0.010
Episode: 604, Reward: 145, avg loss: 0.760, eps: 0.010
Episode: 6

Episode: 737, Reward: 133, avg loss: 0.733, eps: 0.010
Episode: 738, Reward: 129, avg loss: 0.913, eps: 0.010
Episode: 739, Reward: 165, avg loss: 0.675, eps: 0.010
Episode: 740, Reward: 114, avg loss: 0.741, eps: 0.010
Episode: 741, Reward: 199, avg loss: 0.845, eps: 0.010
Episode: 742, Reward: 164, avg loss: 0.703, eps: 0.010
Episode: 743, Reward: 154, avg loss: 0.824, eps: 0.010
Episode: 744, Reward: 149, avg loss: 0.686, eps: 0.010
Episode: 745, Reward: 176, avg loss: 0.851, eps: 0.010
Episode: 746, Reward: 184, avg loss: 0.841, eps: 0.010
Episode: 747, Reward: 144, avg loss: 0.842, eps: 0.010
Episode: 748, Reward: 183, avg loss: 0.750, eps: 0.010
Episode: 749, Reward: 158, avg loss: 0.851, eps: 0.010
Episode: 750, Reward: 124, avg loss: 0.816, eps: 0.010
Episode: 751, Reward: 150, avg loss: 0.851, eps: 0.010
Episode: 752, Reward: 151, avg loss: 0.739, eps: 0.010
Episode: 753, Reward: 138, avg loss: 0.854, eps: 0.010
Episode: 754, Reward: 124, avg loss: 0.870, eps: 0.010
Episode: 7

Episode: 887, Reward: 199, avg loss: 0.786, eps: 0.010
Episode: 888, Reward: 163, avg loss: 0.837, eps: 0.010
Episode: 889, Reward: 199, avg loss: 0.888, eps: 0.010
Episode: 890, Reward: 129, avg loss: 0.744, eps: 0.010
Episode: 891, Reward: 138, avg loss: 0.735, eps: 0.010
Episode: 892, Reward: 166, avg loss: 0.766, eps: 0.010
Episode: 893, Reward: 195, avg loss: 0.837, eps: 0.010
Episode: 894, Reward: 146, avg loss: 0.685, eps: 0.010
Episode: 895, Reward: 199, avg loss: 0.778, eps: 0.010
Episode: 896, Reward: 178, avg loss: 0.743, eps: 0.010
Episode: 897, Reward: 136, avg loss: 0.814, eps: 0.010
Episode: 898, Reward: 150, avg loss: 0.821, eps: 0.010
Episode: 899, Reward: 199, avg loss: 0.799, eps: 0.010
Episode: 900, Reward: 199, avg loss: 0.793, eps: 0.010
Episode: 901, Reward: 127, avg loss: 0.743, eps: 0.010
Episode: 902, Reward: 199, avg loss: 0.698, eps: 0.010
Episode: 903, Reward: 126, avg loss: 0.725, eps: 0.010
Episode: 904, Reward: 121, avg loss: 0.662, eps: 0.010
Episode: 9