## Deep Q Learning to play Atari from RAM

In [111]:
import gym, random
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from collections import deque

In [112]:
env = gym.make('Pong-ram-v0')
tf.test.is_gpu_available()


True

In [113]:
class Agent:
    def __init__(self, params):
        self.epsilon = params["epsilon"]
        self.discount = params["discount"]
        self.frame_size = params["frame_size"]
        self.num_actions = params["actions"]
        self.optimizer = params["optimizer"]
        self.memory = deque()
        self.experience = list()
        self.current_index = 0
        self.q_network = self.build_network()
        self.target_network = self.build_network()
        default = np.zeros((128, ))
        for _ in range(5):
            self.memory.append(default)
        
    def merge_networks(self, tau):
        self.target_network.set_weights(tau*np.array(self.q_network.get_weights())
                                        + (1-tau)*np.array(self.target_network.get_weights()))
    def build_network(self):
        model = tf.keras.Sequential()
        model.add(tf.keras.layers.Dense(80, activation='relu'))
        model.add(tf.keras.layers.Dense(self.num_actions, activation='linear'))
        model.compile(loss='mse', optimizer= self.optimizer)
        return model
        
    def get_input(self, state):
        self.memory.append(np.array(state))
        if len(self.memory) > self.frame_size:
            self.memory.popleft()
        input_layer = np.array([])
        for frame in self.memory:
            input_layer = np.concatenate([input_layer, frame])
        
        return np.array([input_layer])
        
    def agent_start(self, start_state):
        input_layer = self.get_input(start_state)
        print (input_layer.shape)
        q_values = self.q_network.predict(input_layer)
        
        if np.random.random() < self.epsilon:
            action = np.random.choice(self.num_actions)
        else:
            action = np.argmax(q_values)
        self.prev_state = input_layer
        self.prev_action = action 
        return action
    
    def agent_step(self, reward, state):
        input_layer = self.get_input(state)
        q_values = self.q_network.predict(input_layer)
        relay = (self.prev_state, self.prev_action,  reward, input_layer, 0)
        self.experience.append(relay)
        self.train(5)
        
        if np.random.random() < self.epsilon:
            action = np.random.choice(self.num_actions)
        else:
            action = np.argmax(q_values)
        self.prev_state = input_layer
        self.prev_action = action 
        return action
        
    
    def agent_end(self, reward):
        relay = (self.prev_state, self.prev_action,  reward, 0, 1)
        self.experience.append(relay)
        self.train(5)
        self.experience.clear()
        
        
    def train(self, count):
        batch = random.sample(self.experience, min(count, len(self.experience)))
        for state, action, reward, future, terminated in batch:
            target = self.q_network.predict(state)
            updated = reward
            if not terminated:
                target_vals = self.target_network.predict(future)[0]
                updated += self.discount*np.amax(target_vals)
                
            target[0][action] = updated
            
            self.q_network.fit(state, target, epochs=1, verbose=0)
            
        self.merge_networks(0.01)
        
    

In [114]:
optimizer = tf.keras.optimizers.Adam(learning_rate = 0.01)
params = {"epsilon":0.1, "discount": 1, "frame_size": 3, "actions": 6, "optimizer": optimizer}
agent = Agent(params)

In [None]:
ITERATIONS = 100
for _ in range(ITERATIONS):
    print ("Game {0}".format(_))
    action = agent.agent_start(env.reset())
    observation, reward, done, info = env.step([action])
    count = 0
    while not done:
        action = agent.agent_step(reward, observation)
        observation, reward, done, info = env.step([action])
        print ("Done {0} at Count {1}".format(done, count))
        count += 1
    agent.agent_end(reward)
   

Game 0
(1, 640)
Done False at Count 0
Done False at Count 1
Done False at Count 2
Done False at Count 3
Done False at Count 4
Done False at Count 5
Done False at Count 6
Done False at Count 7
Done False at Count 8
Done False at Count 9
Done False at Count 10
Done False at Count 11
Done False at Count 12
Done False at Count 13
Done False at Count 14
Done False at Count 15
Done False at Count 16
Done False at Count 17
Done False at Count 18
Done False at Count 19
Done False at Count 20
Done False at Count 21
Done False at Count 22
Done False at Count 23
Done False at Count 24
Done False at Count 25
Done False at Count 26
Done False at Count 27
Done False at Count 28
Done False at Count 29
Done False at Count 30
Done False at Count 31
Done False at Count 32
Done False at Count 33
Done False at Count 34
Done False at Count 35
Done False at Count 36
Done False at Count 37
Done False at Count 38
Done False at Count 39
Done False at Count 40
Done False at Count 41
Done False at Count 42
Done 

In [110]:
agent.q_network.summary()

Model: "sequential_34"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_102 (Dense)            multiple                  51280     
_________________________________________________________________
dense_103 (Dense)            multiple                  486       
Total params: 51,766
Trainable params: 51,766
Non-trainable params: 0
_________________________________________________________________
