## Deep Q Learning to play Atari from RAM

In [1]:
import gym, random
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from collections import deque

In [3]:
env = gym.make('Pong-ram-v0')
from datetime import datetime

In [6]:
class Agent:
    def __init__(self, params):
        self.epsilon = params["epsilon"]
        self.discount = params["discount"]
        self.frame_size = params["frame_size"]
        self.num_actions = params["actions"]
        self.optimizer = params["optimizer"]
        self.experience_memory = params["memory"]
        self.memory = deque()
        self.experience = list()
        self.q_network = self.build_network()
        self.target_network = self.build_network()
        default = np.zeros((128, ))
        for _ in range(self.frame_size):
            self.memory.append(default)
        
    def merge_networks(self):
        self.target_network.set_weights(self.q_network.get_weights())
    def build_network(self):
        model = tf.keras.Sequential()
        model.add(tf.keras.layers.Dense(80, activation='relu', input_shape=(128*self.frame_size, )))
        model.add(tf.keras.layers.Dense(self.num_actions, activation='linear'))
        model.compile(loss='mse', optimizer= self.optimizer)
        return model
        
    def get_input(self, state):
        self.memory.append(np.array(state))
        if len(self.memory) > self.frame_size:
            self.memory.popleft()
        input_layer = np.array([])
        for frame in self.memory:
            input_layer = np.concatenate([input_layer, frame])
        
        return np.array([input_layer])
        
    def agent_start(self, start_state):
        input_layer = self.get_input(start_state)
        print (input_layer.shape)
        q_values = self.q_network.predict(input_layer)
        
        if np.random.random() < self.epsilon:
            action = np.random.choice(self.num_actions)
        else:
            action = np.argmax(q_values)
        self.prev_state = input_layer
        self.prev_action = action 
        return action
    
    def agent_step(self, reward, state):
        input_layer = self.get_input(state)
        q_values = self.q_network.predict(input_layer)
        relay = (self.prev_state, self.prev_action,  reward, input_layer, 0)
        self.experience.append(relay)
        
        if np.random.random() < self.epsilon:
            action = np.random.choice(self.num_actions)
        else:
            action = np.argmax(q_values)
        self.prev_state = input_layer
        self.prev_action = action 
        self.train(1000)
        return action
        
    
    def agent_end(self, reward):
        relay = (self.prev_state, self.prev_action,  reward, 0, 1)
        self.experience.append(relay)
        if len(self.experience) > self.experience_memory:
            self.experience.clear()
        
    def save_weights(self):
        self.q_network.save_weights("q.h5")
        self.target_network.save_weights("target.h5")
        
    def train(self, count):
        batch = random.sample(self.experience, min(count, len(self.experience)))
        step = 0
        print ("About to train")
        for state, action, reward, future, terminated in batch:
            step += 1
            if step%100 == 1:
                print ("Step {0} for training step".format(step))
            target = self.q_network.predict(state)
            updated = reward
            if not terminated:
                target_vals = self.target_network.predict(future)[0]
                updated += self.discount*np.amax(target_vals)
                
            target[0][action] = updated
            
        self.q_network.fit(state, target, epochs=1, verbose=0)
            
        self.merge_networks()
        
        self.save_weights()
        
    

In [7]:
optimizer = tf.keras.optimizers.Adam(learning_rate = 0.01)
params = {"epsilon":0.1, "discount": 1, "frame_size": 3, "actions": 6, "optimizer": optimizer, "memory": 10000}
agent = Agent(params)

In [8]:
ITERATIONS = 100
for _ in range(ITERATIONS):
    print ("Game {0}".format(_))
    action = agent.agent_start(env.reset())
    observation, reward, done, info = env.step([action])
    count = 0
    while not done:
        action = agent.agent_step(reward, observation)
        observation, reward, done, info = env.step([action])
        count += 1
    print ("About to train for game {0}".format(_))
    before = datetime.now()
    agent.agent_end(reward)
    after = datetime.now() - before
    print ("Training took {0} microseconds".format(after.microseconds))
   

Game 0
(1, 384)


KeyboardInterrupt: 

In [9]:
agent.q_network.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 80)                30800     
_________________________________________________________________
dense_1 (Dense)              (None, 6)                 486       
Total params: 31,286
Trainable params: 31,286
Non-trainable params: 0
_________________________________________________________________
