In [1]:
import tensorflow as tf
import tensorflow.contrib.layers as layers

from utils.general import get_logger
from utils.test_env import EnvTest
from core.deep_q_learning import DQN
from q1_schedule import LinearExploration, LinearSchedule

from configs.q2_linear import config


In [2]:
class Linear(DQN):
    """
    Implement Fully Connected with Tensorflow
    """
    def add_placeholders_op(self):
        """
        Adds placeholders to the graph

        These placeholders are used as inputs to the rest of the model and will be fed
        data during training. Consecutive frames are stacked along the channel axis,
        so the state placeholders carry nchannels x config.state_history channels.

        Sets the following attributes (names must not change):
            self.s: batch of states, type = uint8
                shape = (batch_size, img height, img width, nchannels x config.state_history)
            self.a: batch of actions, type = int32
                shape = (batch_size)
            self.r: batch of rewards, type = float32
                shape = (batch_size)
            self.sp: batch of next states, type = uint8
                shape = (batch_size, img height, img width, nchannels x config.state_history)
            self.done_mask: batch of done flags, type = bool
                shape = (batch_size)
            self.lr: learning rate, type = float32, scalar
        """
        # Shape of a single observation, read from the environment instead of
        # being hard-coded so the graph matches whatever env is passed in.
        state_shape = list(self.env.observation_space.shape)

        # Leading None leaves the batch dimension free; the last (channel)
        # dimension is multiplied by the number of stacked frames.
        stacked_state_shape = (
            [None]
            + state_shape[:-1]
            + [state_shape[-1] * self.config.state_history]
        )

        self.s = tf.placeholder(tf.uint8, stacked_state_shape)
        self.a = tf.placeholder(tf.int32, [None])
        self.r = tf.placeholder(tf.float32, [None])
        self.sp = tf.placeholder(tf.uint8, stacked_state_shape)
        self.done_mask = tf.placeholder(tf.bool, [None])
        # The learning rate is a single scalar, not a per-example vector.
        self.lr = tf.placeholder(tf.float32, shape=())


In [37]:
    def get_q_values_op(self, state, scope, reuse=False):
        """
        Returns Q values for all actions

        Implements a fully connected network with no hidden layer (a linear
        approximation with bias): the state is flattened and fed to a single
        dense layer with one output unit per action.

        Args:
            state: (tf tensor) 
                shape = (batch_size, img height, img width, nchannels x config.state_history)
            scope: (string) scope name, that specifies if target network or not
            reuse: (bool) reuse of variables in the scope

        Returns:
            out: (tf tensor) of shape = (batch_size, num_actions)
        """
        num_actions = self.env.action_space.n

        # `reuse` has to be passed by keyword: the second positional parameter
        # of tf.variable_scope is `default_name`, not `reuse`.
        with tf.variable_scope(scope, reuse=reuse):
            # NOTE(review): assumes `state` has already been cast to a float
            # dtype by the caller -- confirm against the DQN base class.
            flat_state = tf.layers.flatten(state)
            out = tf.layers.dense(
                inputs=flat_state,
                units=num_actions,
                bias_initializer=tf.zeros_initializer())

        return out


In [38]:
    def add_update_target_op(self, q_scope, target_q_scope):
        """
        update_target_op will be called periodically 
        to copy Q network weights to target Q network

        Remember that in DQN, we maintain two identical Q networks with
        2 different sets of weights. In tensorflow, we distinguish them
        with two different scopes. If you're not familiar with the scope mechanism
        in tensorflow, read the docs
        https://www.tensorflow.org/programmers_guide/variable_scope

        Sets self.update_target_op: a single op that, for each variable in
        tf.GraphKeys.GLOBAL_VARIABLES under q_scope, assigns its value to the
        variable at the same position under target_q_scope.

        Args:
            q_scope: (string) name of the scope of variables for q
            target_q_scope: (string) name of the scope of variables
                        for the target network
        """
        q_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=q_scope)
        target_q_vars = tf.get_collection(
            tf.GraphKeys.GLOBAL_VARIABLES, scope=target_q_scope)

        # Variables are paired by position in the two collections. Only the
        # variables collected from these two scopes are touched: the assign
        # ops write to the target-scope variables and read the q-scope ones.
        copy_ops = [
            tf.assign(target_q_var, q_var)
            for q_var, target_q_var in zip(q_vars, target_q_vars)
        ]

        # tf.group takes the ops as separate arguments, so unpack the list.
        self.update_target_op = tf.group(*copy_ops)


In [39]:
 def add_loss_op(self, q, target_q):
        """
        Sets the loss of a batch, self.loss is a scalar

        The loss for an example is defined as:
            Q_samp(s) = r                                      if done
                      = r + gamma * max_a' Q_target(s', a')    otherwise
            loss = (Q_samp(s) - Q(s, a))^2
        and self.loss is the mean of that quantity over the batch.

        Reads the placeholders self.a (actions), self.r (rewards) and
        self.done_mask, and the discount factor self.config.gamma.

        Args:
            q: (tf tensor) shape = (batch_size, num_actions)
            target_q: (tf tensor) shape = (batch_size, num_actions)
        """
        num_actions = self.env.action_space.n

        # done_mask is a bool placeholder: cast it to float and invert it so
        # the bootstrapped term is kept for non-terminal transitions and
        # zeroed for terminal ones.
        not_done = 1.0 - tf.cast(self.done_mask, tf.float32)

        # Max over the action axis only, giving one value per example
        # (without axis=1 the whole batch would collapse to one scalar).
        max_next_q = tf.reduce_max(target_q, axis=1)
        q_samp = self.r + not_done * self.config.gamma * max_next_q

        # Select Q(s, a) for the action actually taken via a one-hot mask.
        action_mask = tf.one_hot(self.a, num_actions)
        q_sa = tf.reduce_sum(q * action_mask, axis=1)

        self.loss = tf.reduce_mean(tf.squared_difference(q_samp, q_sa))



In [40]:
    def add_optimizer_op(self, scope):
        """
        Set self.train_op and self.grad_norm

        Steps:
            1. create an Adam optimizer driven by the self.lr placeholder
            2. compute grads of self.loss with respect to the trainable
               variables in scope
            3. if self.config.grad_clip is set, clip each gradient by norm
               using self.config.clip_val
            4. apply the gradients and store the train op in self.train_op
            5. store the global norm of the gradients (which are not None)
               in self.grad_norm

        Args:
            scope: (string) scope name, that specifies if target network or not
        """
        # Use the learning-rate placeholder rather than a fixed constant so
        # the rate fed at run time actually takes effect.
        self.optimizer = tf.train.AdamOptimizer(learning_rate=self.lr)

        scope_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope)

        # compute_gradients returns a list of (gradient, variable) pairs;
        # a gradient is None when the loss does not depend on the variable.
        grads_and_vars = self.optimizer.compute_gradients(
            self.loss, var_list=scope_vars)

        if self.config.grad_clip:
            # tf.clip_by_norm works on one tensor at a time, so clip each
            # gradient separately and leave None gradients untouched.
            grads_and_vars = [
                (tf.clip_by_norm(grad, self.config.clip_val), var)
                if grad is not None else (grad, var)
                for grad, var in grads_and_vars
            ]

        self.train_op = self.optimizer.apply_gradients(grads_and_vars)

        # tf.global_norm expects a list of tensors, not (grad, var) pairs.
        self.grad_norm = tf.global_norm(
            [grad for grad, _ in grads_and_vars if grad is not None])
    

In [43]:

if __name__ == '__main__':
    # Test environment built from the (5, 5, 1) shape tuple.
    env = EnvTest((5, 5, 1))

    # Exploration schedule, parameterised by the eps_* settings in config.
    exp_schedule = LinearExploration(
        env,
        config.eps_begin,
        config.eps_end,
        config.eps_nsteps,
    )

    # Learning-rate schedule, parameterised by the lr_* settings in config.
    lr_schedule = LinearSchedule(
        config.lr_begin,
        config.lr_end,
        config.lr_nsteps,
    )

    # Build the linear model and hand both schedules to its run() method.
    model = Linear(env, config)
    model.run(exp_schedule, lr_schedule)

TypeError: get_q_values_op() got multiple values for argument 'scope'