In [None]:
import gym
import tensorflow as tf
from tensorflow import keras
import random
import numpy as np
import datetime as dt
import math

# --- Hyperparameters ---------------------------------------------------------
STORE_PATH = '.'          # base directory for the TensorBoard summary writer
MAX_EPSILON = 1           # exploration rate at step 0
MIN_EPSILON = 0.01        # floor the exploration rate decays towards
LAMBDA = 0.0005           # exponential decay rate of epsilon, per environment step
GAMMA = 0.95              # discount factor applied to the bootstrapped next-state value
BATCH_SIZE = 32           # number of transitions sampled per training step
TAU = 0.08                # soft-update rate of the target network towards the primary one
RANDOM_REWARD_STD = 1.0   # std-dev of the Gaussian reward that replaces the env reward

# --- Environment -------------------------------------------------------------
env = gym.make("CartPole-v1")
# Derive the observation size from the environment instead of hard-coding it,
# so the notebook keeps working if the environment is swapped.
state_size = env.observation_space.shape[0]
num_actions = env.action_space.n


def build_q_network(n_actions, hidden_units=30):
    """Build one Q-network: two ReLU hidden layers and a linear output layer.

    Args:
        n_actions: number of output units (one Q-value per action).
        hidden_units: width of each of the two hidden layers.

    Returns:
        An un-compiled, un-built ``keras.Sequential`` model.
    """
    return keras.Sequential([
        keras.layers.Dense(hidden_units, activation='relu',
                           kernel_initializer=keras.initializers.he_normal()),
        keras.layers.Dense(hidden_units, activation='relu',
                           kernel_initializer=keras.initializers.he_normal()),
        keras.layers.Dense(n_actions),
    ])


# The primary network is the one being optimised; the target network is only
# ever updated by the soft-update step in train(), so it needs no optimizer.
primary_network = build_q_network(num_actions)
target_network = build_q_network(num_actions)

primary_network.compile(optimizer=keras.optimizers.Adam(), loss='mse')


class Memory:
    """Bounded FIFO replay buffer backed by a plain list.

    Once more than ``max_memory`` items are stored, the oldest item is
    discarded on every insertion.
    """

    def __init__(self, max_memory):
        self._max_memory = max_memory
        self._samples = []

    def add_sample(self, sample):
        """Append ``sample`` and evict the oldest entry if over capacity."""
        buffer = self._samples
        buffer.append(sample)
        if len(buffer) > self._max_memory:
            del buffer[0]

    def sample(self, no_samples):
        """Return up to ``no_samples`` distinct stored items, chosen at random.

        If fewer items are stored than requested, all of them are returned
        (in random order).
        """
        count = min(no_samples, len(self._samples))
        return random.sample(self._samples, count)

    @property
    def num_samples(self):
        """Number of items currently held in the buffer."""
        return len(self._samples)


# Global replay buffer shared by the training loop and train(); holds at most
# 500000 stored samples before the oldest ones start being evicted.
memory = Memory(500000)


def choose_action(state, primary_network, eps):
    """Pick an action epsilon-greedily.

    With probability ``eps`` a uniformly random action index in
    ``[0, num_actions - 1]`` is returned; otherwise the index of the largest
    value produced by ``primary_network`` for ``state`` (reshaped to a single
    row) is returned.
    """
    explore = random.random() < eps
    if not explore:
        # Greedy branch: the network expects a batch, so feed a 1-row batch.
        q_values = primary_network(state.reshape(1, -1))
        return np.argmax(q_values)
    return random.randint(0, num_actions - 1)


def train(primary_network, memory, target_network=None,
          batch_size=None, gamma=None, tau=None):
    """Run one (Double) Q-learning update on a batch drawn from ``memory``.

    Each stored sample is a ``(state, action, reward, next_state)`` tuple where
    ``next_state`` is ``None`` for a terminal transition.

    Args:
        primary_network: model being trained. Must be callable on a batch of
            states and provide ``train_on_batch`` and ``trainable_variables``.
        memory: replay buffer exposing ``num_samples`` and ``sample(n)``.
        target_network: optional second network. When given, the next action
            is selected by ``primary_network`` and evaluated by
            ``target_network`` (Double Q-learning), and the target network is
            then moved towards the primary one by a soft update. When ``None``
            the max of the primary network's next-state values is used.
        batch_size: batch size override; defaults to ``BATCH_SIZE``.
        gamma: discount factor override; defaults to ``GAMMA``.
        tau: soft-update rate override; defaults to ``TAU``.

    Returns:
        ``0`` if the buffer holds fewer than ``3 * batch_size`` samples (no
        training is done), otherwise the value returned by
        ``primary_network.train_on_batch``.
    """
    batch_size = BATCH_SIZE if batch_size is None else batch_size
    gamma = GAMMA if gamma is None else gamma

    # Wait until the buffer is reasonably filled before learning anything.
    if memory.num_samples < batch_size * 3:
        return 0

    batch = memory.sample(batch_size)
    states = np.array([val[0] for val in batch])
    actions = np.array([val[1] for val in batch])
    rewards = np.array([val[2] for val in batch])

    # Terminal transitions are marked by next_state being None. Build the mask
    # from that marker directly rather than from "next state sums to zero",
    # which would misclassify a genuine state whose components cancel out.
    non_terminal = np.array([val[3] is not None for val in batch], dtype=bool)
    # Terminal slots get a zero placeholder so the batch stays rectangular;
    # their Q-values are computed but never used.
    next_states = np.array([np.zeros_like(val[0]) if val[3] is None else val[3]
                            for val in batch])

    # Q(s, .) and Q(s', .) from the primary network.
    prim_qt = primary_network(states)
    prim_qtp1 = primary_network(next_states).numpy()

    # Start from the current predictions so that only the entry of the action
    # actually taken contributes to the loss.
    target_q = np.array(prim_qt.numpy())

    # Float copy: keeps the sampled rewards untouched and allows adding the
    # discounted bootstrap value even if the rewards were stored as integers.
    updates = rewards.astype(np.float64)
    batch_idxs = np.arange(len(batch))

    if target_network is None:
        next_values = np.amax(prim_qtp1, axis=1)
    else:
        # Double Q-learning: select with the primary net, evaluate with the target net.
        prim_action_tp1 = np.argmax(prim_qtp1, axis=1)
        q_from_target = target_network(next_states).numpy()
        next_values = q_from_target[batch_idxs, prim_action_tp1]

    updates[non_terminal] += gamma * next_values[non_terminal]
    target_q[batch_idxs, actions] = updates

    loss = primary_network.train_on_batch(states, target_q)

    if target_network is not None:
        tau = TAU if tau is None else tau
        # Update target network parameters slowly from the primary network.
        for t, e in zip(target_network.trainable_variables,
                        primary_network.trainable_variables):
            t.assign(t * (1 - tau) + e * tau)

    return loss

num_episodes = 1000
eps = MAX_EPSILON
render = False
train_writer = tf.summary.create_file_writer(STORE_PATH + f"/DoubleQ_{dt.datetime.now().strftime('%d%m%Y%H%M')}")
double_q = True   # True: pass the target network to train(); False: train on the primary net only
steps = 0         # total environment steps across all episodes; drives the epsilon decay


for i in range(num_episodes):
    state = env.reset()
    cnt = 0        # number of environment steps taken in this episode
    avg_loss = 0   # running sum of per-step losses, turned into a mean at episode end
    while True:
        if render:
            env.render()

        action = choose_action(state, primary_network, eps)
        next_state, reward, done, _ = env.step(action)
        # Count the step as soon as it is taken so that, at episode end, cnt
        # equals the number of steps (and of accumulated losses). Counting
        # after the `done` check under-reported the length by one and divided
        # by zero for an episode that ended on its very first step.
        cnt += 1

        # The environment reward is deliberately replaced by a noisy one.
        reward = np.random.normal(1.0, RANDOM_REWARD_STD)

        # A terminal transition is stored with next_state = None.
        if done:
            next_state = None
        memory.add_sample((state, action, reward, next_state))

        loss = train(primary_network, memory, target_network if double_q else None)
        avg_loss += loss

        state = next_state

        # exponentially decay the eps value
        steps += 1
        eps = MIN_EPSILON + (MAX_EPSILON - MIN_EPSILON) * math.exp(-LAMBDA * steps)

        if done:
            avg_loss /= cnt
            print(f"Episode: {i}, Reward: {cnt}, avg loss: {avg_loss:.3f}, eps: {eps:.3f}")
            with train_writer.as_default():
                tf.summary.scalar('reward', cnt, step=i)
                tf.summary.scalar('avg loss', avg_loss, step=i)
            break


Episode: 0, Reward: 11, avg loss: 0.000, eps: 0.994
Episode: 1, Reward: 10, avg loss: 0.000, eps: 0.989
Episode: 2, Reward: 20, avg loss: 0.000, eps: 0.978
Episode: 3, Reward: 30, avg loss: 0.000, eps: 0.964
Episode: 4, Reward: 24, avg loss: 0.257, eps: 0.952
Episode: 5, Reward: 19, avg loss: 0.862, eps: 0.942
Episode: 6, Reward: 16, avg loss: 0.869, eps: 0.934
Episode: 7, Reward: 33, avg loss: 0.954, eps: 0.919
Episode: 8, Reward: 19, avg loss: 1.145, eps: 0.910
Episode: 9, Reward: 13, avg loss: 1.535, eps: 0.904
Episode: 10, Reward: 10, avg loss: 1.502, eps: 0.899
Episode: 11, Reward: 12, avg loss: 2.176, eps: 0.893
Episode: 12, Reward: 33, avg loss: 3.043, eps: 0.878
Episode: 13, Reward: 9, avg loss: 4.733, eps: 0.874
Episode: 14, Reward: 11, avg loss: 4.771, eps: 0.869
Episode: 15, Reward: 37, avg loss: 6.326, eps: 0.852
Episode: 16, Reward: 13, avg loss: 7.279, eps: 0.846
Episode: 17, Reward: 10, avg loss: 6.540, eps: 0.842
Episode: 18, Reward: 11, avg loss: 15.645, eps: 0.837
Epi

Episode: 153, Reward: 173, avg loss: 0.685, eps: 0.012
Episode: 154, Reward: 215, avg loss: 0.650, eps: 0.011
Episode: 155, Reward: 185, avg loss: 0.654, eps: 0.011
Episode: 156, Reward: 180, avg loss: 0.644, eps: 0.011
Episode: 157, Reward: 239, avg loss: 0.672, eps: 0.011
Episode: 158, Reward: 198, avg loss: 0.682, eps: 0.011
Episode: 159, Reward: 100, avg loss: 0.656, eps: 0.011
Episode: 160, Reward: 240, avg loss: 0.654, eps: 0.011
Episode: 161, Reward: 259, avg loss: 0.653, eps: 0.011
Episode: 162, Reward: 187, avg loss: 0.647, eps: 0.011
Episode: 163, Reward: 173, avg loss: 0.659, eps: 0.011
Episode: 164, Reward: 153, avg loss: 0.621, eps: 0.011
Episode: 165, Reward: 106, avg loss: 0.673, eps: 0.011
Episode: 166, Reward: 193, avg loss: 0.639, eps: 0.010
Episode: 167, Reward: 103, avg loss: 0.619, eps: 0.010
Episode: 168, Reward: 240, avg loss: 0.636, eps: 0.010
Episode: 169, Reward: 195, avg loss: 0.657, eps: 0.010
Episode: 170, Reward: 181, avg loss: 0.653, eps: 0.010
Episode: 1

In [None]:
render = True
eps = 0.  # evaluation: never take the random branch in choose_action

for i in range(num_episodes):
    state = env.reset()
    cnt = 0        # number of environment steps taken in this episode
    avg_loss = 0   # no training happens in this cell; kept only so the log line keeps its format
    while True:
        if render:
            env.render()
        action = choose_action(state, primary_network, eps)
        next_state, reward, done, _ = env.step(action)
        # Count the step before the `done` check so cnt is the real episode
        # length (the previous ordering was one short and divided by zero for
        # an episode that ended on its first step).
        cnt += 1
        state = next_state

        if done:
            print(f"Episode: {i}, Reward: {cnt}, avg loss: {avg_loss:.3f}, eps: {eps:.3f}")
            with train_writer.as_default():
                # Use a separate tag: writing 'reward' / 'avg loss' here with
                # step=i would land on the same tags and steps as the
                # training run's scalars in the same writer.
                tf.summary.scalar('eval reward', cnt, step=i)
            break
env.close()

In [None]:
# Stand-alone cleanup cell: closes the environment. Presumably kept so the
# environment can still be closed by hand if the evaluation cell is
# interrupted before reaching its own env.close() — confirm before removing.
env.close()