# Final Project: Deep Reinforcement Learning With Atari Pong Game

## 1. Preparation

### 1.1 Set Up Google Colab (Mount Google Drive)

In [0]:
# Install google-drive-ocamlfuse and its prerequisites, authenticate the Colab
# user, and authorise google-drive-ocamlfuse so Drive can be mounted in the
# next cell.
# NOTE(review): `2>&1 > /dev/null` discards stdout only; stderr is still shown.
# If the intent was to silence both streams the order should be
# `> /dev/null 2>&1` -- confirm before changing.
!apt-get install -y -qq software-properties-common python-software-properties module-init-tools
!add-apt-repository -y ppa:alessandro-strada/ppa 2>&1 > /dev/null
!apt-get update -qq 2>&1 > /dev/null
!apt-get -y install -qq google-drive-ocamlfuse fuse
# Authenticate the current Colab user and fetch the application-default
# credentials whose client id/secret are passed to google-drive-ocamlfuse below.
from google.colab import auth
auth.authenticate_user()
from oauth2client.client import GoogleCredentials
creds = GoogleCredentials.get_application_default()
import getpass
# First pass: run headless and print only the line containing the URL
# (presumably the authorisation URL the user has to visit).
!google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret} < /dev/null 2>&1 | grep URL
# Prompt for the verification code without echoing it in the notebook output.
vcode = getpass.getpass()
# Second pass: feed the verification code to google-drive-ocamlfuse on stdin.
!echo {vcode} | google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret}

In [0]:
!mkdir -p drive!google-drive-ocamlfuse drive  -o nonempty

### 1.2 Load packages & files

In [0]:
# Install OpenAI Gym together with its Atari extras (gym is imported below).
# NOTE(review): the version is not pinned. The Agent class in this notebook
# unpacks four values from env.step() and treats env.reset() as returning only
# the observation -- confirm the installed gym release still has that API.
!pip install gym[atari]

In [0]:
#load in the required packages
import tensorflow as tf
import os
import gym

#the wrappers file consists of steps that preprocess the game data ready for 
#training 
#the FireResetEnv, MaxAndSkipEnv classes are suggested by 
#https://github.com/openai/baselines/blob/master/baselines/common/atari_wrappers.py
#the other classes are suggested by 
#https://github.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On/tree/master/Chapter06/lib
import wrappers

import time
import numpy as np
import collections

import matplotlib.pyplot as plt

In [0]:
# Folder that receives the TensorBoard summaries and the model checkpoints.
LOG_DIR = './pong_log'
# exist_ok=True keeps the cell idempotent on re-run and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(LOG_DIR, exist_ok=True)

In [0]:
#set up the hyperparameters
#hyperparameters are suggested by 
#https://github.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On/blob/master/Chapter06/02_dqn_pong.py
# Environment id handed to wrappers.make_env in the training cell.
DEFAULT_ENV_NAME = "PongNoFrameskip-v4"
# Training stops once the mean reward over the last 100 finished games
# exceeds this value.
MEAN_REWARD_BOUND = 16.0

# Discount factor applied to the next-state value in the Bellman target.
GAMMA = 0.99
# Number of transitions sampled from the replay buffer per training step.
BATCH_SIZE = 32
# Maximum number of transitions kept in the replay buffer (deque maxlen).
REPLAY_SIZE = 10000
# Learning rate passed to the Adam optimizer.
LEARNING_RATE = 1e-4
# The target network is overwritten with the training network's weights
# whenever frame_idx is a multiple of this value.
SYNC_TARGET_FRAMES = 1000
# No gradient step is taken until the buffer holds at least this many
# transitions. It equals REPLAY_SIZE, so training starts only once the
# buffer is completely full.
REPLAY_START_SIZE = 10000

# Epsilon-greedy schedule used by the training loop:
#   epsilon = max(EPSILON_FINAL, EPSILON_START - frame_idx / EPSILON_DECAY_LAST_FRAME)
EPSILON_DECAY_LAST_FRAME = 10**5
EPSILON_START = 1.0
EPSILON_FINAL = 0.02

## 2. Training

### 2.1 Training A Reinforcement Learning Agent Playing Pong

#### 2.1.1 The Training Network and The Target Network

In [0]:
#convolutional network
#note that the architecture is suggested by https://storage.googleapis.com/deepmind-media/dqn/DQNNaturePaper.pdf
#
# Builds one TF1 graph holding two structurally identical Q-networks:
#   * scope "train"  - optimised with Adam on the MSE between the predicted
#                      Q-value of the executed action and the Bellman target
#   * scope "target" - only evaluated here; it is never passed to the optimizer's loss
# Both networks map a (batch, 84, 84, 4) float input to 6 Q-values per sample.
# NOTE(review): 6 is hard-coded as the number of actions -- presumably
# env.action_space.n for Pong; confirm if the environment is changed.
tf.reset_default_graph()

graph = tf.Graph()

with graph.as_default():
    
  #set up placeholders to feed in the training data
  x = tf.placeholder(tf.float32,shape=(None,84,84,4),name="train_input")
  # Bellman targets, one scalar per sample. `(None,)` declares a rank-1 tensor;
  # the previous `(None)` is just `None`, i.e. a completely unknown shape.
  y = tf.placeholder(tf.float32,shape=(None,),name="labels")
  # (row, action) index pairs consumed by tf.gather_nd below. Given its own
  # name: reusing "labels" made TensorFlow silently rename it to "labels_1".
  a = tf.placeholder(tf.int32,shape=(None,2),name="actions")
  x_target = tf.placeholder(tf.float32,shape=(None,84,84,4),name="target_input")
    
  #set up a training network for training an approximation to the Q function
  #the network consists of three convolutional layers and two dense layers 
  with tf.variable_scope("train") as scope:
    cov1 = tf.layers.conv2d(x, filters=32, kernel_size=8, strides=4,
                         padding='same', activation=tf.nn.relu,name="cov1")
    cov2 = tf.layers.conv2d(cov1, filters=64, kernel_size=4, strides=2,
                         padding='same', activation=tf.nn.relu,name="cov2")
    cov3 = tf.layers.conv2d(cov2, filters=64, kernel_size=3, strides=1,
                         padding='same', activation=tf.nn.relu,name="cov3")
    flat = tf.layers.flatten(cov3)
    dense = tf.layers.dense(flat, units=512,activation=tf.nn.relu,name="dense1")
    q_values = tf.layers.dense(dense, units=6, name="dense2")
    #output the q value prediction for the executed action
    q_values_flat = tf.gather_nd(q_values,a)

  #set up a target network with the same type of architecture but is used for 
  #updating the Q value training target using the Bellman equation 
  with tf.variable_scope("target") as scope:
    cov1_target = tf.layers.conv2d(x_target, filters=32, kernel_size=8, strides=4,
                         padding='same', activation=tf.nn.relu,name="cov1")
    cov2_target = tf.layers.conv2d(cov1_target, filters=64, kernel_size=4, strides=2,
                         padding='same', activation=tf.nn.relu,name="cov2")
    cov3_target = tf.layers.conv2d(cov2_target, filters=64, kernel_size=3, strides=1,
                         padding='same', activation=tf.nn.relu,name="cov3")
    flat_target = tf.layers.flatten(cov3_target)
    dense_target = tf.layers.dense(flat_target, units=512,activation=tf.nn.relu,name="dense1")
    q_values_target = tf.layers.dense(dense_target, units=6, name="dense2")  
    
  #compute the MSE loss based on the q prediction value based on executed action and target value 
  loss = tf.losses.mean_squared_error(labels=y,predictions=q_values_flat)

  #use the Adam Optimizer to minimize the MSE loss
  optimizer = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE)
  train_step = optimizer.minimize(loss)

  #tensors used for evaluating the prediction and target q values 
  output_target = graph.get_tensor_by_name('target/dense2/BiasAdd:0')
  output_train = graph.get_tensor_by_name('train/dense2/BiasAdd:0')


  
#summary_writer = tf.summary.FileWriter(os.path.join(LOG_DIR, 'graph'), graph)
#summary_writer.close()

  

#### 2.1.2 Set Up Operation to Copy Weights From The Training Network to The Target Network

In [0]:
# Build one grouped op that assigns every trainable variable of the "train"
# scope to the same-named variable of the "target" scope.
# The approach is suggested by
# https://github.com/ageron/handson-ml/blob/master/16_reinforcement_learning.ipynb
with graph.as_default():
  train_vars = graph.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,scope='train')
  target_vars = graph.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,scope='target')

  # Key each variable by its name with the scope prefix removed so that the
  # two networks' variables can be paired up.
  train_var_dict = dict((v.name[len('train'):], v) for v in train_vars)
  target_var_dict = dict((v.name[len('target'):], v) for v in target_vars)

  # One assign op per target variable, in the target dict's iteration order.
  copy_ops = list(
      target_var_dict[key].assign(train_var_dict[key])
      for key in target_var_dict
  )
  copy_train_to_target = tf.group(*copy_ops)

#### 2.1.3 Set Up The Experience Buffer For Storing Agent's Experience

In [0]:
# A single unit of replay memory: one environment transition.
Experience = collections.namedtuple(
    'Experience',
    field_names=['state', 'action', 'reward', 'done', 'new_state'],
)


class ExperienceBuffer:
    """Fixed-capacity replay memory with uniform random sampling.

    Transitions live in a ``collections.deque`` created with ``maxlen``, so
    once the buffer is full each append drops the oldest entry. The deque idea
    is suggested by
    https://github.com/ageron/handson-ml/blob/master/16_reinforcement_learning.ipynb
    """

    def __init__(self, buffer_size):
        """Create an empty buffer holding at most ``buffer_size`` transitions."""
        self.buffer = collections.deque(maxlen=buffer_size)

    def __len__(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)

    def append(self, experience):
        """Store one transition at the newest end of the buffer."""
        self.buffer.append(experience)

    def sample(self, batch_size):
        """Draw up to ``batch_size`` distinct transitions uniformly at random.

        Returns a 5-tuple of arrays ``(states, actions, rewards, dones,
        next_states)``; rewards are float32 and dones are uint8.
        """
        # Shuffle every position in the buffer and keep the leading slice,
        # which yields a sample without replacement.
        shuffled_positions = np.random.permutation(len(self.buffer))
        picked = [self.buffer[position] for position in shuffled_positions[:batch_size]]

        # Transpose the list of transitions into one sequence per field.
        states, actions, rewards, dones, next_states = zip(*picked)

        state_arr = np.array(states)
        action_arr = np.array(actions)
        reward_arr = np.array(rewards, dtype=np.float32)
        done_arr = np.array(dones, dtype=np.uint8)
        next_state_arr = np.array(next_states)
        return state_arr, action_arr, reward_arr, done_arr, next_state_arr

#### 2.1.4 Set Up The Agent to Execute A Step According to Action With Highest Q Value Or Explore the Environment Randomly

In [0]:
class Agent:
    """Epsilon-greedy agent that plays one environment step at a time.

    Every transition is appended to ``exp_buffer`` as an ``Experience``.
    The agent keeps the current observation in ``self.state`` and the running
    reward of the current game in ``self.total_reward``.
    """

    def __init__(self, env, exp_buffer):
        """Bind the agent to ``env`` and ``exp_buffer`` and start a new game."""
        self.env = env
        self.exp_buffer = exp_buffer
        self._reset()

    #reset game initial state when a game is done
    def _reset(self):
        # Use the environment this agent was constructed with; the previous
        # code read a module-level `env` and only worked by accident.
        self.state = self.env.reset()
        self.total_reward = 0.0

    #get the agent to play a step 
    def play_step(self, session, epsilon=0.0):
        """Play one step and record it in the experience buffer.

        Args:
            session: TensorFlow session used to evaluate ``output_train``;
                only touched when the greedy branch is taken.
            epsilon: probability of taking a uniformly random action.

        Returns:
            The total reward of the game if this step finished it, else None.
        """
        done_reward = None

        #get a random number. If it is smaller than epsilon explore the
        #environment with a random step 
        if np.random.random() < epsilon:
            action = self.env.action_space.sample()
        else:
            # Wrapping the state in a list always needs a fresh array, so
            # `copy=False` was never honoured (and recent NumPy rejects it).
            # NOTE(review): this is a reshape, not a transpose; it matches the
            # reshape used for the training batches -- confirm the layout
            # produced by wrappers.make_env before changing either.
            state_a = np.asarray([self.state]).reshape((1,84,84,4))

            #evaluate the q value using the q train network
            q_vals_v = session.run(output_train,feed_dict={'train_input:0':state_a})
            action = np.argmax(q_vals_v)

        #do a step in the environment with the chosen action
        new_state, reward, is_done, _ = self.env.step(action)
        self.total_reward += reward

        #store this into the experience buffer 
        exp = Experience(self.state, action, reward, is_done, new_state)
        self.exp_buffer.append(exp)
        self.state = new_state

        #handle end-game situation 
        if is_done:
            done_reward = self.total_reward
            self._reset()
        return done_reward

#### 2.1.5 Get The Action Executed and The Target Q Value

In [0]:
def get_action_label(session,batch,method='simple'):
    """Build the gather indices and Bellman targets for one training batch.

    Args:
        session: TensorFlow session used to evaluate the Q-networks.
        batch: tuple ``(states, actions, rewards, dones, next_states)`` as
            returned by ``ExperienceBuffer.sample``.
        method: how the next-state value is estimated:
            'train'  - max Q from the training network,
            'simple' - max Q from the target network
                       (https://storage.googleapis.com/deepmind-media/dqn/DQNNaturePaper.pdf),
            'double' - action chosen by the training network, valued by the
                       target network (https://arxiv.org/pdf/1509.06461.pdf).

    Returns:
        ``(actions_index, expected_state_action_values)`` where
        ``actions_index`` is an int64 array of ``(row, action)`` pairs and the
        second item is ``rewards + GAMMA * next_state_value`` with the
        next-state value forced to 0 for terminal transitions.

    Raises:
        ValueError: if ``method`` is not one of the three supported names.
    """
    if method not in ('train', 'simple', 'double'):
        raise ValueError(
            "method must be 'train', 'simple' or 'double', got %r" % (method,))

    #get the states, actions, rewards earned, and next states from the experience
    #buffer 
    states, actions, rewards, dones, next_states = batch
    batch_size = len(actions)

    #adjust the shape of the states tensor so that it conforms to the tensorflow
    #standard since the preprocessing in wrappers file works for pytorch not tensorflow 
    # reshape() leaves the caller's array untouched instead of rewriting its
    # .shape in place, and the batch size is taken from the data.
    next_states = np.reshape(next_states, (batch_size,84,84,4))

    # (row, action) pairs for tf.gather_nd, built in one vectorised call
    # rather than growing an array with vstack inside a loop.
    actions_index = np.column_stack((np.arange(batch_size), actions)).astype('int64')

    if method=='train':
      #evaluate next state q value using the train network and keep the best
      next_state_values = session.run(output_train,feed_dict={'train_input:0':next_states})
      next_state_values = np.amax(next_state_values,axis=1)

    elif method=='simple':
      #evaluate next state q value using the target network and keep the best
      next_state_values = session.run(output_target,feed_dict={'target_input:0':next_states})
      next_state_values = np.amax(next_state_values,axis=1)

    else:
      #double q learning: pick the action with the train network ...
      next_state_values_from_train_net = session.run(output_train,
                                                   feed_dict={'train_input:0':next_states})
      best_action_from_train_net = np.argmax(next_state_values_from_train_net,axis=1)

      #... and evaluate that action with the target network
      next_state_values_from_target_net = session.run(output_target,
                                                      feed_dict={'target_input:0':next_states})
      next_state_values = next_state_values_from_target_net[np.arange(batch_size),
                                                            best_action_from_train_net]

    #handle end-game situation where q value of next state is 0 
    # `dones` arrives as uint8; indexing with it directly is *integer* fancy
    # indexing (it would zero positions 0 and 1), so convert to a boolean mask.
    done_mask = np.asarray(dones, dtype=bool)
    next_state_values[done_mask] = 0.0

    #calculate the expected q value, aka the labels for training, using the 
    #Bellman equation 
    expected_state_action_values = next_state_values * GAMMA + rewards
    return actions_index,expected_state_action_values

### 2.2 Train The Agent 

In [0]:
if __name__ == "__main__":
  # Train the DQN agent on Pong until the mean reward of the last 100 games
  # exceeds MEAN_REWARD_BOUND. The best model so far is checkpointed under
  # LOG_DIR/saved_model.

  #set up the game environment using the wrapper
  env = wrappers.make_env(DEFAULT_ENV_NAME)

  #initialize an experience buffer to store experience 
  buffer = ExperienceBuffer(REPLAY_SIZE)

  #create an agent 
  agent = Agent(env, buffer)
  epsilon = EPSILON_START
  
  #method used to compute the label ('train', 'simple' or 'double')
  method = 'train'

  #initialize values that will be used for evaluating the model 
  total_rewards = []
  frame_idx = 0
  ts_frame = 0
  ts = time.time()
  best_mean_reward = None

  epsilon_list = []
  speed_list = []
  mean_reward_list = []
  reward_list = []

  # Checkpoint locations, both derived from LOG_DIR.
  # NOTE(review): the model is restored from 'test/' but saved to
  # 'saved_model/', so a run never resumes from its own checkpoint unless the
  # files are moved by hand -- confirm whether that is intended.
  restore_path = os.path.join(LOG_DIR, 'test/model.ckpt')
  save_path = os.path.join(LOG_DIR, 'saved_model/model.ckpt')
  # Make sure the checkpoint directory exists before the first save.
  os.makedirs(os.path.dirname(save_path), exist_ok=True)

  with graph.as_default():
    
    #set up a saver to save the training parameters 
    saver = tf.train.Saver()
  
    with tf.Session() as session:
      #if the saved file exists, restore the model. Initialize training
      #parameters otherwise
      if os.path.exists(restore_path + '.index'):
        saver.restore(session, restore_path)
        session.run(copy_train_to_target)
        # No-op as written: epsilon was initialised from EPSILON_START above.
        EPSILON_START = epsilon
      else:
        session.run(tf.global_variables_initializer())
        # Write the graph for TensorBoard and close the writer so the event
        # file is flushed and its handle released.
        summary_writer = tf.summary.FileWriter(os.path.join(LOG_DIR, 'train'), graph)
        summary_writer.close()
        # A fresh run always starts fully exploratory, overriding the
        # configured EPSILON_START.
        EPSILON_START = 1.0

      while True:
        frame_idx += 1

        #set up a schedule for epsilon according to the number of processed
        #frames
        epsilon = max(EPSILON_FINAL, EPSILON_START - frame_idx / EPSILON_DECAY_LAST_FRAME)

        #play one step; the return value is the total game reward when the
        #step finished a game and None otherwise
        reward = agent.play_step(session, epsilon)

        #record information about speed, epsilon, mean reward, etc 
        #when each game is finished 
        if reward is not None:
            total_rewards.append(reward)
            speed = (frame_idx - ts_frame) / (time.time() - ts)
            ts_frame = frame_idx
            ts = time.time()
            mean_reward = np.mean(total_rewards[-100:])
            print("%d: done %d games, mean reward %.3f, reward %i, eps %.2f, speed %.2f f/s" % (
                frame_idx, len(total_rewards), mean_reward, reward, 
                epsilon, speed
            ))

            #record the parameters that show the performance of the model 
            epsilon_list.append(epsilon)
            speed_list.append(speed)
            mean_reward_list.append(mean_reward)
            reward_list.append(reward)

            #record and print out the best mean reward reached and save the model 
            if best_mean_reward is None or best_mean_reward < mean_reward:
                saver.save(session, save_path)
                if best_mean_reward is not None:
                    print("Best mean reward updated %.3f -> %.3f, model saved" % (best_mean_reward, mean_reward))
                best_mean_reward = mean_reward
            if mean_reward > MEAN_REWARD_BOUND:
                print("Solved in %d frames!" % frame_idx)
                break

        #continue to the next loop if the experience buffer does not hold
        #enough data yet
        if len(buffer) < REPLAY_START_SIZE:
            continue

        #copy the weights from the train network to the target network
        #periodically 
        if frame_idx % SYNC_TARGET_FRAMES == 0:
            session.run(copy_train_to_target)

        #get a batch from the experience buffer and reshape the states 
        batch = buffer.sample(BATCH_SIZE)
        batch[0].shape = (BATCH_SIZE,84,84,4)

        #get the actions and labels for each batch depending on the method used 
        act_index,expected = get_action_label(session, batch,method=method)

        #minimize the loss for one step 
        # Device placement is fixed when the graph is built, so wrapping
        # session.run in tf.device had no effect and was dropped.
        _ = session.run(train_step,feed_dict={x:batch[0],y:expected,a:act_index})

## 3. References

1.   https://gym.openai.com/envs/Pong-v0/
2.   https://github.com/openai/baselines/blob/master/baselines/common/atari_wrappers.py
3. https://github.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On/tree/master/Chapter06/lib
4. https://github.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On/blob/master/Chapter06/02_dqn_pong.py
5. https://storage.googleapis.com/deepmind-media/dqn/DQNNaturePaper.pdf
6. https://github.com/ageron/handson-ml/blob/master/16_reinforcement_learning.ipynb
7. https://arxiv.org/pdf/1509.06461.pdf
