# Breakout-v0 - Double Deep Q-Network

In [None]:
import gym
import numpy as np
import random
import tensorflow as tf
import os

#### Load the Breakout-v0 Environment

In [None]:
# Create the Atari Breakout environment. Later cells read screens through
# env.ale and the number of actions through env.action_space.n.
env = gym.make('Breakout-v0')

#### Implement Q-Network

In [None]:
class Qnetwork():
    """Dueling Q-network over single grayscale frames.

    The graph resizes each frame to 84x84, scales it to [0, 1], runs it
    through four convolutional layers and splits the last feature map into an
    advantage stream and a value stream that are recombined into Q-values.
    It also defines the squared TD-error loss and an Adam training op.

    Args:
        h_size: Number of channels produced by the last convolutional layer.
            It is split evenly between the two streams, so it must be even.
        frameShape: Shape of a raw frame; only ``frameShape[0]`` (height) and
            ``frameShape[1]`` (width) are read.
        batch_size: Unused by the graph; kept for call compatibility.

    The number of actions is read from the module-level ``env``.
    """
    def __init__(self, h_size, frameShape, batch_size):
        # raw pixel data (grayscale, so only 1 channel):
        # a 4-D tensor [batch, height, width, 1]
        self.rgb_array = tf.placeholder(shape=[None, frameShape[0], frameShape[1], 1], dtype=tf.float32)

        # Resize to 84x84 and scale to [0, 1]. 8-bit pixel intensities top out
        # at 255 (the previous divisor of 225 let inputs exceed 1.0).
        self.imageIn = tf.div(tf.image.resize_images(self.rgb_array, [84, 84]), 255)

        # With VALID padding the spatial size shrinks 84 -> 20 -> 9 -> 7 -> 1.
        self.conv1 = tf.contrib.layers.convolution2d(
            inputs = self.imageIn,
            num_outputs = 32,
            kernel_size = [8, 8],
            stride = [4, 4],
            padding = 'VALID',
            activation_fn = tf.nn.relu)

        self.conv2 = tf.contrib.layers.convolution2d(
            inputs = self.conv1,
            num_outputs = 64,
            kernel_size = [4, 4],
            stride = [2, 2],
            padding = 'VALID',
            activation_fn = tf.nn.relu)

        self.conv3 = tf.contrib.layers.convolution2d(
            inputs = self.conv2,
            num_outputs = 64,
            kernel_size = [3, 3],
            stride = [1, 1],
            padding = 'VALID',
            activation_fn = tf.nn.relu)

        # The channel count must equal h_size, because the stream weights
        # below are sized from h_size; a hard-coded width only worked for
        # h_size == 512.
        self.conv4 = tf.contrib.layers.convolution2d(
            inputs = self.conv3,
            num_outputs = h_size,
            kernel_size = [7, 7],
            stride = [1, 1],
            padding = 'VALID',
            activation_fn = tf.nn.relu)

        # Split the final feature map along the channel axis (dimension 3)
        # into two equal halves: advantage stream and value stream.
        self.streamAC, self.streamVC = tf.split(3, 2, self.conv4)

        # flatten to [batch_size, h_size / 2]
        self.streamA = tf.contrib.layers.flatten(self.streamAC)
        self.streamV = tf.contrib.layers.flatten(self.streamVC)

        # weights for advantage and value stream layer
        stream_size = h_size // 2
        self.AW = tf.Variable(tf.random_normal([stream_size, env.action_space.n]))
        self.VW = tf.Variable(tf.random_normal([stream_size, 1]))

        # output of advantage and value layer
        self.Advantage = tf.matmul(self.streamA, self.AW)
        self.Value = tf.matmul(self.streamV, self.VW)

        # Combine the streams into the final Q-values:
        #   Q(s, a) = V(s) + (A(s, a) - mean over actions of A(s, a'))
        # Subtracting the per-state mean advantage centres the advantages
        # around zero before they are added to the broadcast state value.
        self.Qout = self.Value + tf.sub(
            self.Advantage,
            tf.reduce_mean(
                self.Advantage,
                reduction_indices=1,
                keep_dims=True))

        # index of the highest Q-value for each batch entry (greedy action)
        self.predict = tf.argmax(self.Qout, 1)

        # The loss is the mean squared difference between the target and
        # predicted Q-values of the actions that were taken.

        # target Q value, one per batch entry
        self.targetQ = tf.placeholder(shape=[None], dtype=tf.float32)

        # index of the action taken, one per batch entry
        self.actions = tf.placeholder(shape=[None], dtype=tf.int32)
        self.actions_onehot = tf.one_hot(self.actions, env.action_space.n, dtype=tf.float32)

        # Q-value of the taken action (the one-hot mask zeroes the others)
        self.Q = tf.reduce_sum(tf.mul(self.Qout, self.actions_onehot), reduction_indices=1)

        # error = sum( (target - actual)^2 ) / batch_size
        self.td_error = tf.square(self.targetQ - self.Q)
        self.loss = tf.reduce_mean(self.td_error)

        # define trainer
        self.trainer = tf.train.AdamOptimizer(learning_rate=0.01)

        # op that applies one optimisation step on the loss
        self.updateModel = self.trainer.minimize(self.loss)

#### Experience Replay

In [None]:
class experience_buffer():
    """Bounded replay memory storing transitions as five parallel lists.

    Index ``k`` of ``states``, ``actions``, ``rewards``, ``states_`` and
    ``dones`` together describes one transition
    (state, action, reward, next state, done flag).

    Args:
        buffer_size: Maximum number of transitions kept. When the capacity is
            exceeded the oldest transitions are evicted first.
    """
    def __init__(self, buffer_size=1000):
        self.buffer_size = buffer_size
        # state, action, reward, state1, done
        self.states = []
        self.actions = []
        self.rewards = []
        self.states_ = []
        self.dones = []

    def add(self, states, actions, rewards, states_, dones):
        """Append a batch of transitions, evicting the oldest on overflow.

        Each argument is a sequence holding one column of the batch; all five
        are expected to have the same length. If the batch alone is larger
        than ``buffer_size`` only its most recent entries are retained.
        """
        columns = (
            (self.states, states),
            (self.actions, actions),
            (self.rewards, rewards),
            (self.states_, states_),
            (self.dones, dones),
        )
        for stored, incoming in columns:
            stored.extend(incoming)
            # Drop from the front so the newest experience is always kept.
            overflow = len(stored) - self.buffer_size
            if overflow > 0:
                del stored[:overflow]

    def sample(self, size):
        """Return ``size`` distinct transitions chosen uniformly at random.

        Returns:
            A tuple ``(states, actions, rewards, states_, dones)`` of lists
            whose entries are aligned by position.

        Raises:
            ValueError: If ``size`` exceeds the number of stored transitions.
        """
        picks = random.sample(range(len(self.actions)), size)
        states = [self.states[k] for k in picks]
        actions = [self.actions[k] for k in picks]
        rewards = [self.rewards[k] for k in picks]
        states_ = [self.states_[k] for k in picks]
        dones = [self.dones[k] for k in picks]
        return states, actions, rewards, states_, dones

In [None]:
def updateTargetGraph(tfVars,tau):
    """Build the ops that blend the first half of tfVars into the second half.

    The variable at position ``k`` in the first half is paired with the
    variable at position ``k + half`` in the second half. Each returned op
    assigns ``tau * first + (1 - tau) * second`` to the second-half variable.

    Args:
        tfVars: List of variables; the first half is the source, the second
            half receives the updates.
        tau: Blend rate applied to the source variable's value.

    Returns:
        A list with one assign op per pair.
    """
    half = int(len(tfVars)/2)
    blend_ops = []
    for source_var, target_var in zip(tfVars[:half], tfVars[half:]):
        blended = (source_var.value()*tau) + ((1-tau)*target_var.value())
        blend_ops.append(target_var.assign(blended))
    return blend_ops

def updateTarget(op_holder,sess):
    """Run every op in op_holder on sess, one at a time and in order."""
    for update_op in op_holder:
        sess.run(update_op)

In [None]:
# discount factor applied per step when accumulating episode returns
gamma = 0.99

def discount_rewards(r):
    """Return the discounted return for every step of one episode.

    Entry ``t`` of the result is
    ``r[t] + gamma * r[t+1] + gamma**2 * r[t+2] + ...``, i.e. the Monte-Carlo
    return over the rest of the episode.

    Args:
        r: 1-D sequence of per-step rewards (list or array, any numeric
            dtype).

    Returns:
        A float array of the same length as ``r``.
    """
    # Work in floating point: with an integer reward array zeros_like would
    # inherit the integer dtype and silently truncate every discounted value.
    rewards = np.asarray(r, dtype=float)
    discounted_r = np.zeros_like(rewards)
    running_add = 0.0
    # Walk backwards so each step reuses the return of the step after it.
    for t in reversed(range(rewards.size)):
        running_add = running_add * gamma + rewards[t]
        discounted_r[t] = running_add
    return discounted_r

#### Setup network

In [None]:
# Hyperparameters shared by the training and viewing cells below.

# number of experiences sampled from the replay buffer for each training step
batch_size = 32

# a training step is executed every `update_freq` environment steps
update_freq = 4

# discount factor on target Q-values
# (same value as the module-level `gamma` used by discount_rewards)
y = 0.99

# starting chance of random action (epsilon)
startE = 1

# final chance of random action
endE = 0.1

# number of environment steps, counted after the pre-training phase,
# over which epsilon is reduced linearly from startE to endE
anneling_steps = 100000

# number of episodes of env to train network with
num_episodes = 100000

# number of purely random steps taken before training begins
pre_train_steps = 1000

# Rate to update target network toward primary network
tau = 0.001 

# load saved model? When True the training and viewing cells look for a
# checkpoint under `path`.
load_model = True

# directory checkpoints are saved to and loaded from
path = "./dqn"

# size of final convolutional layer before
# splitting it into Advantage and Value streams
h_size = 512

#### Get frame shape

In [None]:
# Shape of one grayscale screen read from the emulator. Qnetwork uses the
# first two entries as the height and width of its input placeholder.
frameShape = env.ale.getScreenGrayscale().shape
# Bare expression so the notebook displays the shape.
frameShape

#### Begin Training

In [None]:
# Build a fresh graph holding the primary (main) and the target network.
tf.reset_default_graph()
mainQN = Qnetwork(h_size, frameShape, batch_size)
targetQN = Qnetwork(h_size, frameShape, batch_size)

init = tf.initialize_all_variables()

saver = tf.train.Saver()

# mainQN is built first, so its variables form the first half of this list
# and targetQN's the second half, which is the layout updateTargetGraph needs.
trainables = tf.trainable_variables()

targetOps = updateTargetGraph(trainables,tau)

myBuffer = experience_buffer()

# set rate of random action decrease
e = startE
stepDrop = (startE - endE) / anneling_steps

# reward of the most recent episode and total environment steps so far
lastReward = 0
total_steps = 0

# make path for model to be saved in
if not os.path.exists(path):
    os.makedirs(path)


with tf.Session() as sess:
    # Initialise first and restore afterwards: running the initializer after
    # the restore would overwrite the loaded weights with fresh random values.
    sess.run(init)
    if load_model == True:
        print("Loading Model...")
        ckpt = tf.train.get_checkpoint_state(path)
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            # get_checkpoint_state returns None when nothing has been saved.
            print("No checkpoint found in " + path + ", training from scratch.")

    # move the target network toward the primary network (by a factor of tau)
    updateTarget(targetOps, sess)

    try:
        for i in range(num_episodes):
            episodeBuffer = experience_buffer()

            # reset environment and get first new observation
            env.reset()
            state = env.ale.getScreenGrayscale()
            done = False
            rewardAll = 0
            numMoves = 0

            # an episode is capped at 500 moves
            while numMoves < 500:
                numMoves += 1

                # choose action with probability e of being a random action;
                # during pre-training every action is random
                if np.random.rand(1) < e or total_steps < pre_train_steps:
                    action = np.random.randint(0, env.action_space.n)
                else:
                    # predict yields one index per batch entry; unwrap the
                    # single entry so the env and the replay buffer always
                    # receive a plain integer.
                    action = int(sess.run(
                        mainQN.predict,
                        feed_dict={mainQN.rgb_array: [state]})[0])

                observation, reward, done, _ = env.step(action)
                env.render()
                state1 = env.ale.getScreenGrayscale()
                total_steps += 1

                # save experience to episode buffer
                episodeBuffer.add([state], [action], [reward], [state1], [done])

                if total_steps > pre_train_steps:
                    if e > endE:
                        e -= stepDrop

                    # train every update_freq steps, once a full batch exists
                    if (total_steps % update_freq == 0
                            and len(myBuffer.actions) >= batch_size):
                        # random sample of experiences
                        states_t, actions_t, rewards_t, state1_t, dones_t = myBuffer.sample(batch_size)

                        batch_shape = [-1, frameShape[0], frameShape[1], 1]
                        states_t = np.reshape(states_t, batch_shape)
                        state1_t = np.reshape(state1_t, batch_shape)

                        # Double-DQN: the primary network picks the next
                        # action, the target network supplies its value.
                        Q1 = sess.run(mainQN.predict, feed_dict={
                                mainQN.rgb_array: state1_t})

                        Q2 = sess.run(targetQN.Qout, feed_dict={
                                targetQN.rgb_array: state1_t})

                        # If resulting state is DONE, Q-Target = r
                        # done -> 0, not done -> 1
                        end_multiplier = 1 - np.array(dones_t, dtype=np.float32)

                        # target-network Q-value of each chosen action
                        doubleQ = Q2[np.arange(len(Q1)), Q1]

                        targetQ = np.array(rewards_t) + (y * doubleQ * end_multiplier)

                        # update network with target values
                        _  = sess.run(mainQN.updateModel,
                                     feed_dict={mainQN.rgb_array: states_t,
                                               mainQN.targetQ: targetQ,
                                               mainQN.actions: actions_t})

                        updateTarget(targetOps, sess)

                rewardAll += reward
                state = state1

                if done == True:
                    break

            # Replace the per-step rewards of this episode with their
            # discounted returns before storing the episode.
            episodeRewards = np.array(episodeBuffer.rewards)
            discountRewards = discount_rewards(episodeRewards)
            episodeBuffer.rewards = discountRewards

            # add discounted experiences to our experience buffer
            # state, action, reward, state1, done
            myBuffer.add(episodeBuffer.states,
                         episodeBuffer.actions,
                         episodeBuffer.rewards,
                         episodeBuffer.states_,
                         episodeBuffer.dones)

            lastReward = rewardAll

            # periodically save model
            if i % 20 == 0:
                saver.save(sess, path+'/model-Breakout-'+str(i)+'.cptk')
                print("Saved Model")
            if i % 10 == 0:
                print(i, total_steps, rewardAll, e)

        saver.save(sess, path+'/model-Breakout-'+str(i)+'.cptk')
    except (Exception, KeyboardInterrupt):
        # Also covers a manual interrupt of the cell, so progress is kept.
        # The exception is deliberately not bound to a name: binding it to
        # `e` would clobber the exploration rate.

        # if frames are still rendering, stop it
        if env.viewer is not None:
            env.viewer.close()
            env.viewer = None

        saver.save(sess, path+'/model-Breakout-'+str(i)+'.cptk')
        print("Saved Model")
        raise

print("Reward of last episode: " + str(lastReward))

#### View trained network

In [None]:
# Rebuild the same two-network graph used for training so the saver sees the
# same set of variables, then replay episodes with the greedy policy.
tf.reset_default_graph()
mainQN = Qnetwork(h_size, frameShape, batch_size)
targetQN = Qnetwork(h_size, frameShape, batch_size)

init = tf.initialize_all_variables()

saver = tf.train.Saver()

# total environment steps across all viewed episodes
total_steps = 0

# make sure the checkpoint directory exists
if not os.path.exists(path):
    os.makedirs(path)


with tf.Session() as sess:
    # Initialise first and restore afterwards: running the initializer after
    # the restore would replace the trained weights with random ones.
    sess.run(init)
    if load_model == True:
        print("Loading Model...")
        ckpt = tf.train.get_checkpoint_state(path)
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            # get_checkpoint_state returns None when nothing has been saved.
            print("No checkpoint found in " + path + ", using untrained weights.")

    try:
        for i in range(num_episodes):

            # reset environment for a new episode
            env.reset()
            done = False
            numMoves = 0
            rewardSum = 0

            # an episode is capped at 500 moves
            while numMoves < 500:
                numMoves += 1
                state = env.ale.getScreenGrayscale()
                # predict yields one index per batch entry; unwrap the single
                # entry so the env receives a plain integer action.
                action = int(sess.run(
                    mainQN.predict,
                    feed_dict={mainQN.rgb_array: [state]})[0])

                observation, reward, done, _ = env.step(action)

                env.render()
                rewardSum += reward
                total_steps += 1

                if done == True:
                    break

            print(i, total_steps, rewardSum)

    finally:
        # Close the render window on any exit (error, interrupt or normal
        # completion); an exception keeps propagating afterwards.
        if env.viewer is not None:
            env.viewer.close()
            env.viewer = None

#### Human Player

In [None]:
# Create a fresh Breakout environment for the human-controlled session below.
env = gym.make('Breakout-v0')

In [None]:
# Replay buffer (default capacity) that collects the human player's transitions.
buffer = experience_buffer()

In [None]:
# Let a human play: one action index is read from the keyboard per step and
# every transition is recorded in `buffer`.
observation = env.reset()
for i in range(100000):
    env.render()
    state = env.ale.getScreenGrayscale()

    # action index typed by the player
    action = int(input())

    observation, reward, done, info = env.step(action)
    state1 = env.ale.getScreenGrayscale()

    buffer.add([state], [action], [reward], [state1], [done])

    # Start a new game when the episode is over instead of continuing to
    # step an environment that has already finished.
    if done:
        observation = env.reset()

In [None]:
# Close the render window if one is open and clear the reference to it.
if env.viewer is not None:
    env.viewer.close()
    env.viewer = None

In [None]:
# Show the shape of all recorded states stacked into a single array.
np.array(buffer.states).shape

In [None]:
# Dump every recorded state to test.txt as a bracketed, comma-separated list:
# one inner "[...]" group per state, holding the text form of each element
# obtained by iterating over that state.
with open("test.txt", 'w') as out:
    out.write("[")
    for frame in buffer.states:
        body = "".join("{}, ".format(row) for row in frame)
        out.write("[")
        out.write(body)
        out.write("], ")
    out.write("]")

In [None]:
# Write all recorded states as one flat, comma-separated sequence of values.
a = np.asarray(buffer.states)
# tofile does not create missing directories, so make sure the target exists.
if not os.path.exists("./human_data"):
    os.makedirs("./human_data")
a.tofile("./human_data/states.csv", sep=",")

In [None]:
# Read the comma-separated dump of the recorded states back into an array.
# NOTE(review): the file was written as a flat sequence, so the original
# array shape is presumably not recoverable from it without a reshape.
data = np.genfromtxt('./human_data/states.csv', delimiter=",")

In [None]:
import pandas as pd

In [None]:
# The dump contains only values and no header row; header=None stops pandas
# from consuming the data line as column names.
df = pd.read_csv('./human_data/states.csv', sep=',', header=None)
df.head()

In [None]:
# Rebuild a graph containing only the primary network, used by the following
# cells to inspect its image preprocessing.
tf.reset_default_graph()
mainQN = Qnetwork(h_size, frameShape, batch_size)

# NOTE(review): the initializer is created here but not run in this cell.
init = tf.initialize_all_variables()
# The session is left open for the cells that follow.
sess = tf.Session()


In [None]:
# Advance the environment by one step using action index 1.
env.step(1)

In [None]:
# NOTE(review): the result of this first call is discarded; the screen is
# read again on the next line.
env.ale.getScreenGrayscale()
# Evaluate only the preprocessing tensor (resize and scaling) of the network
# on the current screen, fed as a batch of one frame.
resized_image = sess.run(mainQN.imageIn, feed_dict={mainQN.rgb_array: [env.ale.getScreenGrayscale()]})

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Display the single preprocessed frame as an 84x84 matrix of values.
resized_image[0].reshape(84,84)
# resized_image[0][20]

In [None]:
# Plot the preprocessed 84x84 frame in grayscale with the axes hidden.
plt.figure()
plt.imshow(resized_image[0].reshape(84, 84), cmap='Greys_r')
plt.axis('off')
plt.show()