In [15]:
%matplotlib inline
import gym
from gym.wrappers import Monitor
#import intertools
import numpy as np
import os 
import sys
import random
import psutil
import tensorflow as tf

if "../" not in sys.path:
    sys.path.append("../")
    
from lib import plotting
from collections import deque, namedtuple

In [16]:
env = gym.envs.make("Breakout-v0")

In [17]:
#Atari actions = 0(noop), 1(fire), 2(left), 3(right)
VALID_ACTIONS = [0, 1, 2, 3]

In [18]:
class StateProcessor():
    """
    Processes a raw Atari images. Resizes it and converts it to grayscale.
    """
    def __init__(self):
        #Build the tensorflow graph
        with tf.variable_scope("state_processor"):
            self.input_state = tf.placeholder(shape=[210, 160, 3], dtype=tf.uint8)
            self.output = tf.image.rgb_to_grayscale(self.imput_state)
            self.output = tf.image.crop_to_bounding_box(self.output, 34, 0, 160, 160)
            self.output = tf.image.resize_images(
                self.output, [84, 84], method=tf.image.ResizeMethod.NEAREST_NEIGHBOR)
            self.output = tf.squeeze(self.output)
            
    def process(self, sess, state):
        """
        Args:
            sess: A Tensorflow session object
            state: A[210, 160, 3] Atari RGB state
            
        Returns:
            A processed [84, 84] state representing grayscale values.
            """
        return sess.run(self.output, {self.input_state: state})

In [19]:
class Estimator():
    """Q value estimator neural network.
    This network is used for both the Q network and the target network.
    """
    
    def __init__(self, scope="estimaator", summaries_dir=None):
        self.scope = scope
        #weites tensorboard summaries to disk
        self.summary_writer = None
        with tf.variable_scope(scope):
            #build the graph
            self._build_model()
            if summaries_dir:
                summary_dir = os.path.join(summaries_dir, "summaries_{}".format(scope))
                if not os.path.exists(summary_dir):
                    os.makedirs(summary_dir)
                self.summary_writer = tf.summary.FileWriter(summary_dir)
              
    def _build_model(self):
        """Build the tensorflow graph."""
        #our input are 4 grayscale frames of shape 84, 84 each
        self.X_p1 = tf.placeholder(shape=[None, 84, 84, 4], dtype=tf.uint8, name="X")
        #the TD target value
        self.y_p1 = tf.placeholder(shape=[None], dtype=tf.float32, name="y")
        #integer id of which action was selected
        self.actions_p1 = tf.placeholder(shape=[None], dtype=tf.int32, name="actions")
        
        X = tf.to_float(self.X_p1) / 255.0
        batch_size = tf.shape(self.X_p1)[0]
        
        #the three convolutional layers
        conv1 = tf.contrib.layers.conv2d(
            X, 32, 8, 4, activation_fn=tf.nn.relu)
        conv2 = tf.contrib.layers.conv2d(
            conv1, 64, 4, 2, activation_fn=tf.nn.relu)
        conv3 = tf.contrib.layers.conv2d(
            conv2, 64, 3, 1, activation_fn=tf.nn.relu)
        
        #fully connected layers
        flattened = tf.contrib.layers.flatten(conv3)
        fc1 = tf.contrib.layers.fully_connected(flattened, 512)
        self.predictions = tf.contrib.layers.fully_connected(fc1, len(VALID_ACTIONS))
        
        #get the predictions for the chosen actions only
        gather_indices = tf.range(batch_size) * tf.shape(self.predictions)[1] + self.actions_p1
        self.action_pridictions = tf.gather(tf.reshape(self.predictions, [-1]), gather_indices)
        
        #calculate the loss
        self.losses = tf.squared_difference(self.y_p1, self.action_predictions)
        self.loss = tf.reduce_mean(self.losses)
        
        #optimizer parameters from original paper
        self.optimizer = th.train.RMSPropOptimizer(0.00025, 0.99, 0.0, 1e-6)
        self.train_op = self.optimizer.minimize(self.loss, global_step=tf.contrib.framework.get_global_step())
        
        #Summaries for tensorboard
        self.summaries = tf.summary.merge([
            tf.summary.scalar("loss", self.loss),
            tf.summary.histogram("loss_hist", self.losses),
            tf.summary.histogram("q_values_hist", self.predictions),
            tf.summary.scalar("max_q_value", tf.reduce_max(self.predictions))
        ])
        
        
    def predict(self, sess, s):
        """predicts action values.
        Args
            
            sess : tensorflow session
            s: state input of shape [batch_size, 4, 84, 84, 1]
            
            Returns
                tensor of shape [batch_size, NUM_VALID_ACRIONS] containing the estimated
                action values.
                """
        return sess.run(self.predictions, {self.X_p1: s })
    def update(self, sess, s, a, y):
        """
        updates the estimator towards the given targets.
        
        Args:
            sess: tensorflow session object
            s: State input of shape [batch_size, 4, 84, 84, 1]
            a: chosen actions of shape [batch_size]
            y: targets of shape [batch_size]
            Returns:
                the calculated loss on the batch
            """
        feed_dict = { self.X_p1: s, self.y_p1: y, self.actions_p1: a}
        summaries, global_step, _, loss = sess.run(
            [self.summaries, tf.contrib.framework.get_global_step(), self.train_op, self.loss],
            feed_dict)
        if self.summary_writer:
            self.summary_writer.add_summary(summaries, global_step)
        return loss

In [20]:
# For Testing ...
tf.reset_default_graph()
global_step = tf.Variable(0, name="global_step", trainable=False)

e = Estimator(scope="test")
sp = StateProcessor()

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    
    #example observation batch
    observation = env.reset()
    observation_p = sp.process(sess, observation)
    observation = np.stack([observation_p] * 4, axis=2)
    observations = np.array([observation] * 2)
    
    #Test prediction
    print(e.predict(sess, observations))
    
    # Test training step
    y = np.array([10.0, 10.0])
    a = np.array([1, 3])
    print(e.update(sess, observations, a, y))

Instructions for updating:
Use tf.cast instead.

For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
Use keras.layers.flatten instead.


AttributeError: 'Estimator' object has no attribute 'action_predictions'

In [None]:
class ModelParametersCopier():
    """
    Copy model parameters of one estimator to another.
    """
    
    def __init__(self, estimator1, estimator2):
        """
        Defines copy-work operation graph.  
        Args:
          estimator1: Estimator to copy the paramters from
          estimator2: Estimator to copy the parameters to
        """
        e1_params = [t for t in tf.trainable_variables() if t.name.startswith(estimator1.scope)]
        e1_params = sorted(e1_params, key=lambda v: v.name)
        e2_params = [t for t in tf.trainable_variables() if t.name.startswith(estimator2.scope)]
        e2_params = sorted(e2_params, key=lambda v: v.name)

        self.update_ops = []
        for e1_v, e2_v in zip(e1_params, e2_params):
            op = e2_v.assign(e1_v)
            self.update_ops.append(op)
            
    def make(self, sess):
        """
        Makes copy.
        Args:
            sess: Tensorflow session instance
        """
        sess.run(self.update_ops)