# Importing Libraries

In [None]:
import tensorflow as tf
import numpy as np
import retro

from skimage import transform
from skimage.color import rgb2gray

import matplotlib.pyplot as plt

from collections import deque

import random

import warnings

In [None]:
# Notebook shell escape: run retro's ROM importer on the current directory (".").
!python -m retro.import .  # Import game ROM

# Preprocessing

In [None]:
# Build the Space Invaders (Atari 2600) environment from the imported ROM
env = retro.make(game='SpaceInvaders-Atari2600')

# Report the observation space of a frame and the number of available actions
print("The size of the frame is: ", env.observation_space)
print("The action size is: ", env.action_space.n)

# Identity matrix: row i is the one-hot encoded vector for action i
possible_actions = np.eye(env.action_space.n, dtype=int)

In [None]:
# Preprocess the game frame

def preprocess_frame(frame):
    """Turn a raw game frame into the 110x84 grayscale image fed to the model.

    Steps: convert to grayscale, crop the border, resize to 110x84.

    Args:
        frame: A frame returned by the environment.
            NOTE(review): assumed to be a uint8 RGB image -- confirm against
            ``env.observation_space``.

    Returns:
        A 2-D float array of shape (110, 84).

    Note:
        ``rgb2gray`` already rescales integer images to floats in [0, 1], so
        no further division by 255 is applied here. Dividing again would
        squash every pixel into roughly [0, 0.004]. Checkpoints trained with
        that doubly-normalized input expect the old scale and should be
        retrained.
    """
    # Color carries no extra information here, so drop it to save computation
    gray_frame = rgb2gray(frame)

    # Crop away the last 12 rows, the first 4 columns and the last 12 columns
    cropped_frame = gray_frame[0:-12, 4:-12]

    preprocessed_frame = transform.resize(cropped_frame, [110, 84])

    return preprocessed_frame

In [None]:
stack_size = 4  # Stacking 4 frames to give a picture of motion to the model

# Frame buffer pre-filled with blank (all-zero) 110x84 frames. `maxlen` makes
# each append drop the oldest frame automatically.
# The builtin `int` replaces `np.int`, which was removed in NumPy 1.24.
stacked_frame = deque([np.zeros((110, 84), dtype=int) for _ in range(stack_size)],
                      maxlen=stack_size)

def stack_frames(stacked_frames, state, is_new_episode):
    """Preprocess a raw frame and merge it into the rolling frame stack.

    Args:
        stacked_frames: Deque holding the most recent preprocessed frames.
            It is replaced by a fresh deque when ``is_new_episode`` is true,
            and appended to in place otherwise.
        state: Raw frame from the environment; passed to ``preprocess_frame``.
        is_new_episode: True for the first frame of an episode.

    Returns:
        A ``(stacked_state, stacked_frames)`` tuple. ``stacked_state`` is the
        frames stacked along a new last axis (``axis=2``); ``stacked_frames``
        is the deque to pass to the next call.
    """
    frame = preprocess_frame(state)

    if is_new_episode:
        # No history exists yet, so start a fresh buffer with the first frame
        # repeated in every slot. (The original assigned the new deque to a
        # misspelled local name, so the reset never took effect.)
        stacked_frames = deque([frame] * stack_size, maxlen=stack_size)
    else:
        # The bounded deque drops the oldest frame when the newest is appended
        stacked_frames.append(frame)

    stacked_state = np.stack(stacked_frames, axis=2)

    return stacked_state, stacked_frames

# Hyperparameters

In [None]:
# --- Model input / output dimensions ---
state_size = [110, 84, 4]         # model input: 4 stacked 110x84 frames
action_size = env.action_space.n  # number of actions available in the game

# --- Training schedule ---
total_episodes = 25               # number of training games
max_steps = 10000                 # maximum steps taken within one game
batch_size = 64                   # experiences fed to the model per update

# --- Optimizer ---
learning_rate = 0.00025

# --- Epsilon-greedy exploration ---
epsilon_start = 1.0               # maximum exploration rate
epsilon_end = 0.01                # minimum exploration rate
decay_rate = 0.00001              # exponential decay rate, applied per step

# --- Q-learning ---
gamma = 0.9                       # discount factor

# --- Replay memory ---
pretrain_length = batch_size      # experiences collected before training
memory_size = 500000              # maximum number of stored experiences

# --- Run-mode switches ---
training = True                   # whether to run the training cell
episode_render = True             # whether to render the game while training

# Model Architecture

In [None]:
# stacked_frame [110, 84, 4] ---> conv1 ----> [26, 20, 32] (elu) ----> conv2 ----> [12, 9, 64] (elu)
# -----> conv3 -----> [5, 4, 64] (elu) ----> Flatten -----> [1280,] -----> Dense ----> [32,] (elu)
# ------> Dense ----> [action_size,]

class DQNetwork:
    """Convolutional Q-network built with the TensorFlow 1.x graph API.

    Three conv layers (ELU) feed a 32-unit dense layer (ELU) and a linear
    output layer with one Q-value per action.

    Placeholders:
        inputs_: float32 batch of states, shape ``[None, *state_size]``.
        actions_: float32 batch of one-hot actions, shape
            ``[None, action_size]``.
        target_Q: float32 batch of target Q-values, shape ``[None]``.

    Outputs:
        output: Q-values for every action, shape ``[None, action_size]``.
        Q: Q-value of the action selected by ``actions_``, shape ``[None]``.
        loss: Mean squared error between ``target_Q`` and ``Q``.
        optimizer: Adam training op minimizing ``loss``.
    """

    def __init__(self, state_size, action_size, learning_rate, name = 'DQNetwork'):
        """Build the graph inside the variable scope ``name``.

        Args:
            state_size: Shape of one input state (without the batch axis).
            action_size: Number of possible actions (network outputs).
            learning_rate: Learning rate passed to the Adam optimizer.
            name: Variable scope under which all ops are created.
        """
        self.state_size = state_size
        self.action_size = action_size
        self.learning_rate = learning_rate

        with tf.variable_scope(name):
            # Placeholders for the input states and the one-hot actions taken
            self.inputs_ = tf.placeholder(tf.float32, [None, *state_size], name = 'inputs')
            self.actions_ = tf.placeholder(tf.float32, [None, self.action_size], name = 'actions')

            # Placeholder for the target Q-value of each sample
            self.target_Q = tf.placeholder(tf.float32, [None], name = 'target')

            self.conv1 = tf.layers.conv2d(inputs = self.inputs_,
                                         filters = 32,
                                         kernel_size = [8, 8],
                                         strides = [4, 4],
                                         padding = 'valid',
                                         kernel_initializer = tf.contrib.layers.xavier_initializer_conv2d(),
                                         name = 'conv1')

            self.conv1_out = tf.nn.elu(self.conv1, name = 'conv1_out')

            self.conv2 = tf.layers.conv2d(inputs = self.conv1_out,
                                         filters = 64,
                                         kernel_size = [4, 4],
                                         strides = [2, 2],
                                         padding = 'valid',
                                         kernel_initializer = tf.contrib.layers.xavier_initializer_conv2d(),
                                         name = 'conv2')

            self.conv2_out = tf.nn.elu(self.conv2, name = 'conv2_out')

            self.conv3 = tf.layers.conv2d(inputs = self.conv2_out,
                                         filters = 64,
                                         kernel_size = [3, 3],
                                         strides = [2, 2],
                                         padding = 'valid',
                                         kernel_initializer = tf.contrib.layers.xavier_initializer_conv2d(),
                                         name = 'conv3')

            self.conv3_out = tf.nn.elu(self.conv3, name = 'conv3_out')

            self.flatten = tf.contrib.layers.flatten(self.conv3_out)

            self.fc = tf.layers.dense(inputs = self.flatten,
                                     units = 32,
                                     activation = tf.nn.elu,
                                     kernel_initializer = tf.contrib.layers.xavier_initializer(),
                                     name = 'fc1')

            # Linear output layer: one Q-value per action. It is left unnamed,
            # as before, so variable names in existing checkpoints still match.
            self.output = tf.layers.dense(inputs = self.fc,
                                         kernel_initializer = tf.contrib.layers.xavier_initializer(),
                                         units = self.action_size,
                                         activation = None)

            # Q-value of the taken action for each sample. The one-hot mask
            # zeroes the other actions; summing over axis 1 keeps one value
            # per sample. Without `axis`, the whole batch collapses to a
            # single scalar and the loss compares every target against it.
            self.Q = tf.reduce_sum(tf.multiply(self.output, self.actions_), axis = 1)

            # Mean squared loss function
            self.loss = tf.reduce_mean(tf.square(self.target_Q - self.Q))

            # Adam optimizer
            self.optimizer = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss)
            

In [None]:
# Clear the default TensorFlow graph before building the network
tf.reset_default_graph()

# Build the Q-network. Note that this rebinds the name `DQNetwork` from the
# class to the created instance.
DQNetwork = DQNetwork(state_size=state_size,
                      action_size=action_size,
                      learning_rate=learning_rate)

# Experience Replay

The `Memory` class stores experiences as tuples of (stacked frames, action, reward, next state, flag indicating whether the game has finished).

In [None]:
class Memory():
    """Bounded experience store with uniform random sampling."""

    def __init__(self, max_size):
        """Create an empty buffer holding at most ``max_size`` experiences."""
        # A deque with maxlen discards the oldest entry once it is full
        self.buffer = deque(maxlen=max_size)

    def add(self, experience):
        """Append one experience to the buffer."""
        self.buffer.append(experience)

    def sample(self, batch_size):
        """Return ``batch_size`` distinct experiences picked at random.

        Sampling is without replacement, so ``np.random.choice`` raises
        ``ValueError`` when the buffer holds fewer than ``batch_size`` items.
        """
        positions = np.random.choice(len(self.buffer),
                                     size=batch_size,
                                     replace=False)

        return [self.buffer[pos] for pos in positions]

Pre-fill the memory with experiences gathered by taking random actions, starting from the first frame of a game.

In [None]:
# Replay memory that will hold the collected experiences
memory = Memory(max_size=memory_size)

# Take `pretrain_length` random actions and record each transition
for fill_step in range(pretrain_length):
    if fill_step == 0:
        # Very first step: start a game and build the initial stacked state
        state, stacked_frames = stack_frames(stacked_frame, env.reset(), True)

    # Pick a uniformly random one-hot action and apply it
    action = possible_actions[random.randint(1, len(possible_actions)) - 1]
    next_state, reward, done, _ = env.step(action)

    # env.render()  # uncomment to watch the random play

    next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)

    if not done:
        memory.add((state, action, reward, next_state, done))
        state = next_state
        continue

    # Game over: an all-zero next state marks the final transition
    next_state = np.zeros(state.shape)
    memory.add((state, action, reward, next_state, done))

    # Start a new game and rebuild the stacked state from its first frame
    state, stacked_frames = stack_frames(stacked_frame, env.reset(), True)

## Tensorboard Config

In [None]:
# Writer for the TensorBoard event files under /tensorboard/dqn/1
writer = tf.summary.FileWriter(logdir="/tensorboard/dqn/1")

# Record the network's loss as a scalar summary named "Loss"
tf.summary.scalar(name="Loss", tensor=DQNetwork.loss)

# Single op that evaluates every summary registered so far
write_op = tf.summary.merge_all()

In [None]:
def predict_action(explore_start, explore_stop, decay_rate, decay_step, state, actions):
    """Choose the next action with an epsilon-greedy policy.

    The exploration probability decays exponentially from ``explore_start``
    towards ``explore_stop``:

        explore_stop + (explore_start - explore_stop) * exp(-decay_rate * decay_step)

    Args:
        explore_start: Exploration probability at ``decay_step == 0``.
        explore_stop: Lower bound the exploration probability decays towards.
        decay_rate: Exponential decay rate.
        decay_step: Number of decay steps taken so far.
        state: Current stacked state; only used on the greedy branch, where
            it is reshaped to a batch of one and fed to the network.
        actions: Indexable collection of actions (one-hot vectors elsewhere
            in this notebook).

    Returns:
        A ``(action, explore_probability)`` tuple.

    Note:
        The greedy branch relies on the module-level ``sess`` and
        ``DQNetwork`` objects, so an open session must exist when it runs.
    """
    # Random draw in [0, 1) that decides between exploring and exploiting
    exp_exp_tradeoff = np.random.rand()

    # Use the function's own arguments rather than the global epsilon values
    explore_probability = explore_stop + (explore_start - explore_stop) * np.exp(-decay_rate * decay_step)

    if explore_probability > exp_exp_tradeoff:
        # Explore: uniformly random action
        choice = random.randint(0, len(actions) - 1)
    else:
        # Exploit: action with the highest predicted Q-value
        Qs = sess.run(DQNetwork.output, feed_dict = {DQNetwork.inputs_ : state.reshape((1, *state.shape))})
        choice = np.argmax(Qs)

    return actions[choice], explore_probability

In [None]:
# Saver used to restore and checkpoint the network weights
saver = tf.train.Saver()

# Total reward of every finished training episode
rewards_list = []

if training:
    with tf.Session() as sess:

        # Resume from the saved checkpoint. To train from scratch instead,
        # comment out the restore and uncomment the initializer below.
        saver.restore(sess, "./models/model.ckpt")
        # sess.run(tf.global_variables_initializer())

        # Steps taken across all episodes; drives the exploration decay
        decay_step = 0

        # Defined up front so the end-of-episode report cannot hit an unbound
        # name if a game ends before the first training update.
        loss = float("nan")

        for episode in range(1, total_episodes + 1):
            step = 0
            episode_rewards = []

            # Start a new game and build its initial stacked state
            state = env.reset()
            state, stacked_frames = stack_frames(stacked_frame, state, True)

            while step < max_steps:
                step += 1
                decay_step += 1  # decaying exploration for each step in the game

                # Epsilon-greedy choice between a random and a predicted action
                action, explore_probability = predict_action(epsilon_start,
                                                             epsilon_end,
                                                             decay_rate,
                                                             decay_step,
                                                             state,
                                                             possible_actions)

                next_state, reward, done, _ = env.step(action)

                if episode_render:
                    env.render()

                episode_rewards.append(reward)

                if done:
                    # An all-zero state marks the end of the game. It is built
                    # directly in the stacked-state shape instead of pushing a
                    # blank 2-D frame through the RGB preprocessing.
                    next_state = np.zeros(state.shape)

                    # Let this iteration finish its update, then leave the loop
                    step = max_steps

                    total_reward = np.sum(episode_rewards)

                    print('Episode: {}'.format(episode),
                          'Total reward: {}'.format(total_reward),
                          'Explore P: {:.4f}'.format(explore_probability),
                          'Training Loss {:.4f}'.format(loss))

                    rewards_list.append(total_reward)
                else:
                    next_state, stacked_frames = stack_frames(stacked_frames,
                                                              next_state, False)

                # Add the experience to memory
                memory.add((state, action, reward, next_state, done))

                # Advance to the new state. Without this, every action would
                # be chosen from, and stored against, the episode's first state.
                state = next_state

                # Sample a batch from memory
                batch = memory.sample(batch_size)
                states_mb = np.array([each[0] for each in batch], ndmin = 3)
                actions_mb = np.array([each[1] for each in batch])
                rewards_mb = np.array([each[2] for each in batch])
                next_states_mb = np.array([each[3] for each in batch], ndmin = 3)
                dones_mb = np.array([each[4] for each in batch])

                # Q-values of the next states
                Qs_next_state = sess.run(DQNetwork.output,
                                         feed_dict = {DQNetwork.inputs_: next_states_mb})

                # Targets: the reward alone for terminal transitions,
                # otherwise reward + gamma * max_a' Q(s', a')
                targets_mb = np.where(dones_mb,
                                      rewards_mb,
                                      rewards_mb + gamma * np.max(Qs_next_state, axis = 1))

                # One run computes the loss, applies the update and produces
                # the summary, avoiding a second forward pass for the summary.
                loss, _, summary = sess.run([DQNetwork.loss, DQNetwork.optimizer, write_op],
                                            feed_dict = {DQNetwork.inputs_: states_mb,
                                                         DQNetwork.target_Q: targets_mb,
                                                         DQNetwork.actions_: actions_mb})

                writer.add_summary(summary, episode)
                writer.flush()

            print("Training game ",episode, " ends")

            # Checkpoint the model every second episode
            if episode % 2 == 0:
                save_path = saver.save(sess, './models/model.ckpt')
                print("Model Saved")
                    

# Testing - Agent plays the game

In [None]:
# Number of games the trained agent plays
test_episodes = 1

with tf.Session() as sess:
    saver.restore(sess, "./models/model.ckpt")  # Load the trained weights

    # Final score of every test game
    total_test_rewards = []

    for episode in range(test_episodes):
        total_rewards = 0

        # Start a new game and build its initial stacked state, passing the
        # same frame buffer the other new-episode calls use
        state = env.reset()
        state, stacked_frames = stack_frames(stacked_frame, state, True)

        while True:
            # Add the batch axis expected by the network input
            state = state.reshape((1, *state_size))

            Qs = sess.run(DQNetwork.output, feed_dict = {DQNetwork.inputs_: state})

            # Always play the action with the highest predicted Q-value
            choice = np.argmax(Qs)
            action = possible_actions[choice]
            print(action)

            next_state, reward, done, _ = env.step(action)
            env.render()

            total_rewards += reward

            if done:
                print("Score: ", total_rewards)
                total_test_rewards.append(total_rewards)
                break

            next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)
            state = next_state

    # Close the environment once, after all test games. Closing it inside the
    # episode loop would break every game after the first.
    env.close()

# No explicit sess.close() is needed: leaving the `with` block closes the session.

In [None]:
# Close the TensorFlow session bound to `sess`.
# NOTE(review): the sessions above are opened with `with`, which already closes
# them on exit -- presumably this is only a safety net; confirm it is needed.
sess.close()