## Double Deep Q Learning to Play CartPole

In [1]:
%matplotlib tk
import gym, random
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from collections import deque
from tqdm.notebook import tqdm
import pickle

In [2]:
# Run-time switches for this notebook.
COLAB = False   # True when running on Google Colab
# NOTE(review): RESUME is not read anywhere in this cell; presumably it is meant
# to gate restoring previously saved weights -- confirm against the Agent class.
RESUME = False

# Prefix prepended to the weight-file names when saving/loading.
path_base = "drive/My Drive/" if COLAB else "models/"

# Environment shared by every cell below.
env = gym.make('CartPole-v1')
from datetime import datetime

In [3]:
if COLAB:
    # Only importable on Colab, hence the conditional import.
    from google.colab import drive
    # Mount Google Drive at /content/drive.
    drive.mount('/content/drive')

# Show the shape of a single observation.
print(env.observation_space.shape)

(4,)


In [4]:
class Agent:
    """Double DQN agent with an epsilon-greedy policy and a replay buffer.

    Two identically shaped Keras networks are kept: ``q_network`` is trained on
    sampled transitions, ``target_network`` is a periodically refreshed copy
    used to evaluate the bootstrap targets.

    ``params`` keys read by the constructor:
        decay            multiplicative epsilon decay, applied once per episode start
        min_epsilon      lower bound for epsilon
        discount         discount factor applied to the bootstrapped value
        merge_frequency  copy q_network -> target_network every N finished episodes
        save_frequency   write both weight files every N finished episodes
        actions          number of discrete actions (size of the output layer)
        batch_size       maximum number of transitions sampled per training step
        optimizer        Keras optimizer used to compile the networks
        memory           maximum number of transitions kept in the replay buffer
        input_shape      shape of one observation
        resume           optional; restore saved weights when true
                         (defaults to the module-level ``RESUME`` flag)

    Transitions are stored as ``(state, action, reward, next_state, terminated)``.
    """

    def __init__(self, params):
        self.epsilon = 1
        self.epsilon_decay = params["decay"]
        self.epsilon_min = params["min_epsilon"]
        self.discount = params["discount"]
        self.merge_frequency = params["merge_frequency"]
        self.save_frequency = params["save_frequency"]
        self.num_actions = params["actions"]
        self.batch_size = params["batch_size"]
        self.optimizer = params["optimizer"]
        self.experience_memory = params["memory"]
        self.experience = deque()
        self.count = 0   # total number of agent_step calls
        self.game = 0    # number of finished episodes
        self.metric_states = []   # fixed probe states for the average-max-Q metric
        self.metric_outputs = []  # metric value after every training step
        self.prev_state = None
        self.prev_action = None
        self.input_shape = params["input_shape"]
        self.q_network = self.build_network()
        self.target_network = self.build_network()
        # Loading unconditionally crashes on a fresh run (no weight files yet),
        # so only restore when asked to.
        if params.get("resume", RESUME):
            self.load_weights()
        else:
            # Start both networks from the same weights.
            self.merge_networks()

    def merge_networks(self):
        """Copy the online network's weights into the target network."""
        self.target_network.set_weights(self.q_network.get_weights())

    def build_network(self):
        """Build and compile a 24-24-num_actions dense network with MSE loss."""
        initializer = tf.keras.initializers.GlorotUniform()
        model = tf.keras.Sequential()
        model.add(tf.keras.layers.Dense(24, activation='relu',
                                        input_shape=self.input_shape,
                                        kernel_initializer=initializer))
        model.add(tf.keras.layers.Dense(24, activation='relu',
                                        kernel_initializer=initializer))
        model.add(tf.keras.layers.Dense(self.num_actions, activation='linear',
                                        kernel_initializer=initializer))
        model.compile(loss='mse', optimizer=self.optimizer)
        return model

    def _select_action(self, q_values):
        """Epsilon-greedy choice over one row of Q-values."""
        if np.random.random() < self.epsilon:
            return np.random.choice(self.num_actions)
        return np.argmax(q_values)

    def _remember(self, transition):
        """Append a transition and evict the oldest ones beyond the memory limit."""
        self.experience.append(transition)
        while len(self.experience) > self.experience_memory:
            self.experience.popleft()

    def agent_start(self, observation):
        """Begin an episode: decay epsilon once and return the first action."""
        q_values = self.q_network.predict(np.array([observation]), verbose=0)
        self.epsilon = max(self.epsilon * self.epsilon_decay, self.epsilon_min)

        action = self._select_action(q_values[0])
        self.prev_state = observation
        self.prev_action = action
        return action

    def agent_step(self, reward, observation):
        """Record the previous transition, pick the next action and train once.

        Returns:
            (action, q_value) where q_value is the online network's estimate
            for the chosen action in ``observation``.
        """
        self.count += 1

        # During the first episode, collect up to five probe states
        # (one every 200 steps) for the average-max-Q metric.
        if self.game == 0 and self.count % 200 == 1 and len(self.metric_states) < 5:
            self.metric_states.append(observation)

        q_values = self.q_network.predict(np.array([observation]), verbose=0)
        self._remember((self.prev_state, self.prev_action, reward, observation, 0))

        action = self._select_action(q_values[0])
        self.prev_state = observation
        self.prev_action = action
        self.train(self.batch_size)

        return action, q_values[0][action]

    def agent_end(self, reward):
        """Record the terminal transition and run per-episode housekeeping."""
        # The next-state slot is a placeholder: terminal transitions never bootstrap.
        self._remember((self.prev_state, self.prev_action, reward, self.prev_state, 1))
        self.game += 1

        if self.game % self.merge_frequency == 0:
            self.merge_networks()

        if self.game % self.save_frequency == 0:
            self.save_weights()

    def save_weights(self):
        """Write both networks' weights under ``path_base``."""
        import os  # local import: the notebook's import cell does not bring in os

        directory = os.path.dirname(path_base + "q-cart.h5")
        if directory:
            # Saving fails if the target directory does not exist yet.
            os.makedirs(directory, exist_ok=True)
        self.q_network.save_weights(path_base + "q-cart.h5")
        self.target_network.save_weights(path_base + "target-cart.h5")

    def load_weights(self):
        """Restore both networks' weights from ``path_base``."""
        self.q_network.load_weights(path_base + "q-cart.h5")
        self.target_network.load_weights(path_base + "target-cart.h5")

    def train(self, count):
        """Fit the online network on up to ``count`` sampled transitions.

        Targets follow Double DQN: the online network picks the best action in
        the *next* state and the target network supplies that action's value.
        Afterwards the average max-Q over the probe states is appended to
        ``metric_outputs`` (skipped while no probe states exist).
        """
        batch = random.sample(self.experience, min(count, len(self.experience)))
        if not batch:
            return

        states = np.array([transition[0] for transition in batch])
        next_states = np.array([transition[3] for transition in batch])

        # Start from the current predictions so that only the taken action's
        # output contributes to the loss.
        targets = np.array(self.q_network.predict(states, verbose=0))
        next_q_online = self.q_network.predict(next_states, verbose=0)
        next_q_target = self.target_network.predict(next_states, verbose=0)

        for row, (_, action, reward, _, terminated) in enumerate(batch):
            updated = reward
            if not terminated:
                best_next_action = np.argmax(next_q_online[row])
                updated += self.discount * next_q_target[row][best_next_action]
            targets[row, action] = updated

        self.q_network.fit(states, targets, epochs=1, verbose=0)

        if self.metric_states:
            probe_q = self.q_network.predict(np.array(self.metric_states), verbose=0)
            metric = np.average([np.amax(out) for out in probe_q])
            self.metric_outputs.append(metric)
        
        
    

In [5]:
# Hyper-parameters for the agent. Both network dimensions are read from the
# environment so the two cannot drift apart.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
params = {
    "decay": 0.995,            # epsilon multiplier applied at each episode start
    "min_epsilon": 0.1,        # floor for epsilon
    "discount": 0.95,
    "batch_size": 32,
    "merge_frequency": 10,     # episodes between target-network refreshes
    "save_frequency": 10,      # episodes between weight checkpoints
    "input_shape": env.observation_space.shape,
    "actions": int(env.action_space.n),   # was hard-coded to 2
    "optimizer": optimizer,
    "memory": 70000,           # replay-buffer capacity in transitions
}
agent = Agent(params)

In [6]:
# Train the agent for ITERATIONS episodes and plot the shaped return per episode.
ITERATIONS = 400
np.set_printoptions(precision=3)
render = True
gbar = tqdm(desc="Game: ", total=ITERATIONS)
pbar = tqdm(desc="Game Step: ")
y = []  # shaped return of each episode
x = []  # episode index
for episode in range(ITERATIONS):
    action = agent.agent_start(env.reset())
    observation, reward, done, info = env.step(action)
    # Apply the same end-of-episode penalty as inside the loop, so an episode
    # that ends on its very first action is not recorded as a success.
    if done:
        reward = -reward
    # Count the first step too; it was previously left out of the return.
    game_reward = reward

    while not done:
        action, value = agent.agent_step(reward, observation)
        observation, reward, done, info = env.step(action)
        # Reward shaping: the step that ends the episode is penalised.
        # NOTE(review): `done` presumably also fires when the environment's time
        # limit is reached, which would penalise a full-length episode -- confirm.
        reward = reward if not done else -reward
        game_reward += reward
        if render:
            env.render()
        pbar.set_description("Action Value {0} Game Step: ".format(value))
        pbar.update(1)

    y.append(game_reward)
    x.append(episode)

    gbar.update(1)
    pbar.refresh()
    pbar.reset()
    agent.agent_end(reward)

gbar.close()
pbar.close()

plt.clf()
plt.plot(x, y)
plt.xlabel("Episode")
plt.ylabel("Shaped episode reward")
plt.title("Double DQN on CartPole-v1")
plt.show()
   


HBox(children=(IntProgress(value=0, description='Game: ', max=400, style=ProgressStyle(description_width='init…

HBox(children=(IntProgress(value=1, bar_style='info', description='Game Step: ', max=1, style=ProgressStyle(de…

InternalError:  Blas GEMM launch failed : a.shape=(1, 4), b.shape=(4, 24), m=1, n=24, k=4
	 [[node sequential/dense/MatMul (defined at <ipython-input-4-fe10c5ca9b2f>:37) ]] [Op:__inference_predict_function_277]

Function call stack:
predict_function


In [47]:
# Print the layer-by-layer summary of the online Q-network.
# NOTE(review): the saved output below (Dense 80 -> Dense 6, execution count 47)
# does not match the 24-24-num_actions network built by Agent.build_network;
# it looks stale -- re-run this cell after Restart Kernel -> Run All.
agent.q_network.summary()

Model: "sequential_15"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_31 (Dense)             (None, 80)                30800     
_________________________________________________________________
dense_32 (Dense)             (None, 6)                 486       
Total params: 31,286
Trainable params: 31,286
Non-trainable params: 0
_________________________________________________________________


In [18]:
# Close the environment created in the configuration cell.
# NOTE(review): this cell's execution count (18) is out of order with the cells
# above -- re-run the notebook top to bottom before sharing.
env.close()