## Set Up Google Cloud GPU

In [1]:
!apt-get install -y -qq software-properties-common python-software-properties module-init-tools
!add-apt-repository -y ppa:alessandro-strada/ppa 2>&1 > /dev/null
!apt-get update -qq 2>&1 > /dev/null
!apt-get -y install -qq google-drive-ocamlfuse fuse
from google.colab import auth
auth.authenticate_user()
from oauth2client.client import GoogleCredentials
creds = GoogleCredentials.get_application_default()
import getpass
!google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret} < /dev/null 2>&1 | grep URL
vcode = getpass.getpass()
!echo {vcode} | google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret}

E: Package 'python-software-properties' has no installation candidate
Selecting previously unselected package google-drive-ocamlfuse.
(Reading database ... 131322 files and directories currently installed.)
Preparing to unpack .../google-drive-ocamlfuse_0.7.1-0ubuntu3~ubuntu18.04.1_amd64.deb ...
Unpacking google-drive-ocamlfuse (0.7.1-0ubuntu3~ubuntu18.04.1) ...
Setting up google-drive-ocamlfuse (0.7.1-0ubuntu3~ubuntu18.04.1) ...
Processing triggers for man-db (2.8.3-2ubuntu0.1) ...
Please, open the following URL in a web browser: https://accounts.google.com/o/oauth2/auth?client_id=32555940559.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive&response_type=code&access_type=offline&approval_prompt=force
··········
Please, open the following URL in a web browser: https://accounts.google.com/o/oauth2/auth?client_id=32555940559.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=ht

In [2]:
!mkdir -p drive!google-drive-ocamlfuse drive  -o nonempty

mkdir: invalid option -- 'o'
Try 'mkdir --help' for more information.


## Install and Import the Necessary Packages

In [0]:
#install the necessary packages: gym together with its "atari" extra
#NOTE(review): the version is not pinned, so a re-run may pull a different
#gym release than the one this notebook was written against
!pip install gym[atari]

In [0]:
#load in the required packages
import tensorflow as tf
import os
import gym

#the wrappers file consists of steps that preprocess the game data ready for 
#training 
#the FireResetEnv, MaxAndSkipEnv classes are suggested by 
#https://github.com/openai/baselines/blob/master/baselines/common/atari_wrappers.py
#the other classes are suggested by 
#https://github.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On/tree/master/Chapter06/lib
import wrappers

import time
import numpy as np
import collections

import matplotlib.pyplot as plt

## Training

### Define Hyperparameters

In [0]:
#set up the hyperparameters
#hyperparameters are suggested by 
#https://github.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On/blob/master/Chapter06/02_dqn_pong.py

#environment id passed to wrappers.make_env
DEFAULT_ENV_NAME = "PongNoFrameskip-v4"
#training stops once the mean reward over the last 100 games exceeds this value
MEAN_REWARD_BOUND = 19.0

#discount factor applied to the next-state Q value when building the target
GAMMA = 0.99
#number of transitions sampled from the replay buffer per training step
BATCH_SIZE = 32
#maximum number of transitions kept in the replay buffer
REPLAY_SIZE = 10000
#learning rate of the Adam optimizer
LEARNING_RATE = 1e-4
#the train network's weights are copied to the target network whenever the
#frame counter is a multiple of this value
SYNC_TARGET_FRAMES = 1000
#no training step is run until the replay buffer holds this many transitions
REPLAY_START_SIZE = 10000

#epsilon is lowered by 1/EPSILON_DECAY_LAST_FRAME per frame, starting from
#EPSILON_START and never going below EPSILON_FINAL
EPSILON_DECAY_LAST_FRAME = 10**5
EPSILON_START = 1.0
EPSILON_FINAL = 0.02

### Define the Neural Network Architecture, Loss, and Optimization Method

In [0]:
#build the game environment through the local wrappers module
#NOTE(review): the graph below feeds observations as (84, 84, 4) tensors;
#confirm that wrappers.make_env produces frames in that layout
env = wrappers.make_env(DEFAULT_ENV_NAME)

#start from an empty default graph so re-running this cell does not
#pile up duplicate ops and variables
tf.reset_default_graph()

def q_network(x, name, n_actions=6):
  """Build a convolutional Q-network under the variable scope `name`.

  Three ReLU convolution layers are followed by a 512-unit ReLU dense layer
  and a linear output layer with one unit per action.

  Args:
    x: input tensor fed to the first convolution layer.
    name: variable scope under which all layers are created.
    n_actions: width of the output layer, i.e. the number of Q values
      produced per input. Defaults to 6, the value previously hard-coded.

  Returns:
    A tuple (output, trainable_vars_by_name): the Q-value tensor and a dict
    mapping each trainable variable's name, with the scope prefix stripped,
    to the variable. Two networks built in different scopes therefore share
    the same keys, which allows pairing their variables by name.
  """
  with tf.variable_scope(name) as scope:
    cov1 = tf.layers.conv2d(x, filters=32, kernel_size=8, strides=4,
                         padding='same', activation=tf.nn.relu,name="cov1")
    cov2 = tf.layers.conv2d(cov1, filters=64, kernel_size=4, strides=2,
                         padding='same', activation=tf.nn.relu,name="cov2")
    cov3 = tf.layers.conv2d(cov2, filters=64, kernel_size=3, strides=1,
                         padding='same', activation=tf.nn.relu,name="cov3")
    flat = tf.layers.flatten(cov3)
    dense = tf.layers.dense(flat, units=512,activation=tf.nn.relu,name="dense1")
    #no activation on the last layer: Q values are unbounded real numbers
    output = tf.layers.dense(dense, units=n_actions, name="dense2")

    #collect only the variables that were created inside this scope
    trainable_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                       scope=scope.name)
    #drop the scope prefix so the keys do not depend on the network's name
    trainable_vars_by_name = {var.name[len(scope.name):]: var
                              for var in trainable_vars}
    return output, trainable_vars_by_name

In [5]:
#placeholder for a batch of observations, each of shape (84, 84, 4)
x = tf.placeholder(tf.float32, shape=(None,84,84,4), name="train_input")
#two networks with identical architecture on the same input placeholder:
#"train" is the one optimized, "target" supplies next-state Q values
train_q_values, train_vars = q_network(x, name="q_networks/train")
target_q_values, target_vars = q_network(x, name="q_networks/target")

#one assign op per target variable, paired with the train variable that has
#the same scope-stripped name
copy_ops = [target_var.assign(train_vars[var_name])
            for var_name, target_var in target_vars.items()]
#single op that runs all of the assignments above
copy_train_to_target = tf.group(*copy_ops)

Instructions for updating:
Use keras.layers.conv2d instead.
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use keras.layers.flatten instead.
Instructions for updating:
Use keras.layers.dense instead.


In [6]:
with tf.variable_scope("train"):
    #a: indices of the executed actions, one per sample
    a = tf.placeholder(tf.int32, shape=[None])
    #y: target values, one per sample, as a column of shape (batch, 1)
    y = tf.placeholder(tf.float32, shape=[None, 1])
    
    #the one-hot mask zeroes every Q value except the executed action's, so
    #the row sum is the Q value of that action; keepdims keeps it (batch, 1)
    #to match the shape of y
    q_value = tf.reduce_sum(train_q_values * tf.one_hot(a, env.action_space.n),
                            axis=1, keepdims=True)
    
    #compute the MSE loss between the predicted Q value of the executed action
    #and the target value
    loss = tf.losses.mean_squared_error(labels=y,predictions=q_value)

    #use the Adam Optimizer to minimize the MSE loss
    optimizer = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE)
    train_step = optimizer.minimize(loss)
    

Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Use tf.cast instead.


### Set Up the Experience Buffer and Agent Class

In [0]:
#a single replay-memory record: one transition observed in the environment
Experience = collections.namedtuple(
    'Experience',
    ['state', 'action', 'reward', 'done', 'new_state'],
)

class ExperienceBuffer:
    """Fixed-capacity replay memory backed by a deque.

    The deque's maxlen makes the oldest experiences drop out automatically
    once the capacity is reached. The use of a deque is suggested by
    https://github.com/ageron/handson-ml/blob/master/16_reinforcement_learning.ipynb
    """

    def __init__(self, buffer_size):
        """Create an empty buffer holding at most `buffer_size` experiences."""
        self.buffer = collections.deque(maxlen=buffer_size)

    def __len__(self):
        """Return the number of experiences currently stored."""
        return len(self.buffer)

    def append(self, experience):
        """Store one experience, evicting the oldest one when full."""
        self.buffer.append(experience)

    def sample(self, batch_size):
        """Draw up to `batch_size` distinct experiences uniformly at random.

        Returns five arrays: states, actions, rewards (float32), done flags
        (uint8) and next states, aligned by position.
        """
        #shuffle every index of the buffer and keep the leading batch_size
        shuffled = np.random.permutation(len(self.buffer))
        chosen = shuffled[:batch_size]
        picked = [self.buffer[i] for i in chosen]

        #transpose the list of 5-field records into five per-field tuples
        states, actions, rewards, dones, next_states = zip(*picked)

        state_arr = np.array(states)
        action_arr = np.array(actions)
        reward_arr = np.array(rewards, dtype=np.float32)
        done_arr = np.array(dones, dtype=np.uint8)
        next_state_arr = np.array(next_states)
        return state_arr, action_arr, reward_arr, done_arr, next_state_arr

In [0]:
class Agent:
    """Plays the environment one step at a time and records every transition.

    Args:
        env: environment exposing reset(), step(action) and
            action_space.sample().
        exp_buffer: object with an append(experience) method that receives
            each observed transition.
    """

    def __init__(self, env, exp_buffer):
        self.env = env
        self.exp_buffer = exp_buffer
        self._reset()

    def _reset(self):
        """Start a new game and clear the running reward total."""
        #use the agent's own environment rather than the notebook-level `env`
        self.state = self.env.reset()
        self.total_reward = 0.0

    def play_step(self, epsilon=0.0):
        """Play one environment step with an epsilon-greedy policy.

        With probability `epsilon` a random action is sampled; otherwise the
        action with the highest Q value from the train network is taken.
        The greedy branch evaluates the module-level `train_q_values` tensor
        through the `x` placeholder, so it needs a default TensorFlow session.

        Returns:
            The total reward of the game if this step finished it,
            otherwise None.
        """
        done_reward = None

        if np.random.random() < epsilon:
            #explore: random action from the agent's own environment
            action = self.env.action_space.sample()
        else:
            #exploit: add a batch dimension and query the train network.
            #np.asarray + reshape replaces np.array(copy=False) followed by an
            #in-place shape assignment; the resulting array is the same.
            #NOTE(review): this is a reshape, not a transpose. If the wrappers
            #emit channel-first (4, 84, 84) frames the pixel layout would be
            #scrambled -- confirm the observation layout.
            state_a = np.asarray([self.state]).reshape((1, 84, 84, 4))
            q_vals_v = train_q_values.eval(feed_dict={x: state_a})
            action = np.argmax(q_vals_v)

        #execute the chosen action in the environment
        new_state, reward, is_done, _ = self.env.step(action)
        self.total_reward += reward

        #store the transition in the experience buffer
        exp = Experience(self.state, action, reward, is_done, new_state)
        self.exp_buffer.append(exp)
        self.state = new_state

        #handle the end of a game: report its total reward and start a new one
        if is_done:
            done_reward = self.total_reward
            self._reset()
        return done_reward

### Now We Can Train Our Model

In [0]:
#the game environment `env` was already created in an earlier cell through
#the wrappers module; it is reused here


#initialize an experience buffer that keeps at most REPLAY_SIZE transitions
buffer = ExperienceBuffer(REPLAY_SIZE)

#create an agent that plays in env and records transitions into the buffer
agent = Agent(env, buffer)
epsilon = EPSILON_START

#method used to compute the label
#NOTE(review): this name is not read anywhere in the visible cells
method = 'train'

#initialize values that will be used for evaluating the model 
total_rewards = []   #total reward of every finished game
frame_idx = 0        #number of environment steps played so far
ts_frame = 0         #frame counter at the end of the previous game
ts = time.time()     #wall-clock time at the end of the previous game
best_mean_reward = None

#per-game history of the metrics reported during training
epsilon_list = []
speed_list = []
mean_reward_list = []
reward_list = []


#saver for checkpointing the graph variables (the save/restore calls in the
#training loop are currently commented out)
saver = tf.train.Saver()

#Main DQN training loop: play one frame per iteration with an epsilon-greedy
#policy, and once the replay buffer is full enough run one optimization step
#per frame on a random batch of stored transitions. Stops when the mean reward
#over the last 100 games exceeds MEAN_REWARD_BOUND.
with tf.Session() as session:
  #if the saved file exists, restore the model. Initialize training
  #parameters otherwise (restore is currently disabled)
  #if os.path.exists('pong_log/test/model.ckpt.index'):
  #  saver.restore(session,os.path.join(LOG_DIR, 'test/model.ckpt'))
  #  copy_train_to_target.run()
  #  EPSILON_START = epsilon
  #else:
  session.run(tf.global_variables_initializer())
  #summary_writer = tf.summary.FileWriter(os.path.join(LOG_DIR, 'train'), graph)
  EPSILON_START = 1.0

  while True:
    frame_idx += 1

    #set up a schedule for epsilon according to the number of processed
    #frames: linear decay, clamped at EPSILON_FINAL
    epsilon = max(EPSILON_FINAL, EPSILON_START - frame_idx / EPSILON_DECAY_LAST_FRAME)

    #play one step; the result is None unless the step finished a game, in
    #which case it is that game's total reward
    reward = agent.play_step(epsilon)

    #record information about speed, epsilon, mean reward, etc 
    #when each game is finished 
    if reward is not None:
        total_rewards.append(reward)
        speed = (frame_idx - ts_frame) / (time.time() - ts)
        ts_frame = frame_idx
        ts = time.time()
        mean_reward = np.mean(total_rewards[-100:])
        print("%d: done %d games, mean reward %.3f, reward %i, eps %.2f, speed %.2f f/s" % (
            frame_idx, len(total_rewards), mean_reward, reward, 
            epsilon, speed
        ))

        #record the parameters that show the performance of the model 
        epsilon_list.append(epsilon)
        speed_list.append(speed)
        mean_reward_list.append(mean_reward)
        reward_list.append(reward)

        #track the best mean reward reached so far (the checkpoint save
        #itself is currently commented out)
        if best_mean_reward is None or best_mean_reward < mean_reward:
            #saver.save(session, os.path.join(LOG_DIR, 'saved_model/model.ckpt'))
            if best_mean_reward is not None:
                print("Best mean reward updated %.3f -> %.3f, model saved" % (best_mean_reward, mean_reward))
            best_mean_reward = mean_reward
        if mean_reward > MEAN_REWARD_BOUND:
            print("Solved in %d frames!" % frame_idx)
            break

    #skip training until the experience buffer holds enough data
    if len(buffer) < REPLAY_START_SIZE:
        continue

    #copy the weights from the train network to the target network
    #periodically 
    if frame_idx % SYNC_TARGET_FRAMES == 0:
        copy_train_to_target.run()

    #get a batch from the experience buffer and reshape the states 
    batch = buffer.sample(BATCH_SIZE)
    states, actions, rewards, dones, next_states = batch

    #NOTE(review): these are reshapes, not transposes -- confirm that the
    #stored observations already use the (84, 84, 4) layout
    states.shape = (BATCH_SIZE,84,84,4)
    next_states.shape = (BATCH_SIZE,84,84,4)

    #Bellman target: r + GAMMA * max_a' Q_target(s', a')
    next_q_values = target_q_values.eval(feed_dict={x: next_states})
    next_q_value = np.amax(next_q_values, axis=1)
    #a transition that ended the game has no future return, so its target is
    #the reward alone; without this mask the target would bootstrap from the
    #stored post-episode state
    next_q_value[dones.astype(bool)] = 0.0
    expected_state_action_values = next_q_value * GAMMA + rewards
    #add a trailing axis so the targets match the (batch, 1) placeholder y
    expected_state_action_values = np.expand_dims(expected_state_action_values,
                                                 axis=-1)

    #minimize the loss for one step 
    #NOTE(review): tf.device scopes op creation; wrapping session.run in it
    #probably does not change where the existing ops execute -- confirm
    with tf.device("/device:GPU:0"):
      _ = session.run(train_step,feed_dict={x:states,y:expected_state_action_values,a:actions})

1006: done 1 games, mean reward -20.000, reward -20, eps 0.99, speed 150.94 f/s
1888: done 2 games, mean reward -20.500, reward -21, eps 0.98, speed 497.59 f/s
2669: done 3 games, mean reward -20.667, reward -21, eps 0.97, speed 486.61 f/s
3519: done 4 games, mean reward -20.750, reward -21, eps 0.96, speed 486.24 f/s
4367: done 5 games, mean reward -20.800, reward -21, eps 0.96, speed 475.37 f/s
5157: done 6 games, mean reward -20.833, reward -21, eps 0.95, speed 474.91 f/s
5949: done 7 games, mean reward -20.857, reward -21, eps 0.94, speed 466.80 f/s
6811: done 8 games, mean reward -20.875, reward -21, eps 0.93, speed 465.17 f/s
7663: done 9 games, mean reward -20.889, reward -21, eps 0.92, speed 459.98 f/s
8743: done 10 games, mean reward -20.700, reward -19, eps 0.91, speed 465.98 f/s
9632: done 11 games, mean reward -20.636, reward -20, eps 0.90, speed 463.94 f/s
10484: done 12 games, mean reward -20.667, reward -21, eps 0.90, speed 77.41 f/s
11246: done 13 games, mean reward -20