In [77]:
from collections import defaultdict
from helper import get_softmax
import tensorflow as tf
import numpy as np
import random

# NOTE: a `global` statement at module level has no effect in Python. It only
# signals that `train_lib` is a module-level name; that name must be assigned
# before get_combined_probs() is called, because the function reads it.
global train_lib

def get_importance(state=None):
    """Return the fixed weights used to mix the per-entry distributions.

    `state` is accepted but not used; every call returns a new array with
    the same three values.
    """
    weights = (0.25, 0.25, 0.5)
    return np.array(weights)

def get_combined_probs(state, tau=1.0):
    """Blend the softmax output of every entry in `train_lib`.

    For each index i, `get_softmax(train_lib[i], tau=tau)` is multiplied by
    `get_importance(state)[i]` and the products are accumulated.

    Args:
        state: Passed through to `get_importance`.
        tau: Forwarded to `get_softmax` (presumably the softmax temperature;
            confirm against `helper.get_softmax`). Defaults to 1.0, the value
            that used to be hard-coded here.

    Returns:
        The accumulated array of three values.
    """
    importance = get_importance(state)
    summed_probs = np.zeros(3)
    for i in range(len(train_lib)):
        probs = get_softmax(train_lib[i], tau=tau)
        # In-place add: `probs` must broadcast into the length-3 accumulator.
        summed_probs += importance[i] * probs
    return summed_probs

def get_action(state=None, tau=1.0):
    """Sample one of three actions from the blended distribution.

    Args:
        state: Passed through to `get_combined_probs`.
        tau: Forwarded to `get_combined_probs`, so it now actually affects
            the distribution instead of being silently ignored.

    Returns:
        The sampled action index (0, 1 or 2).
    """
    return np.random.choice(3, p=get_combined_probs(state, tau=tau))

In [82]:
# One (1, 3) integer array per library entry.
train_lib = tuple(np.array([values])
                  for values in ([1, 1, 1], [0, 1, 2], [0, 0, 1]))

# Draw a single action from the blended distribution and show it.
action = get_action()
print("Action:", action)
# probabilities for each action
#probs = defaultdict()
#weighted_probs = defaultdict()

#print("summed_probs = " + str(summed_probs) + " --> sum = " + str(sum(summed_probs)))


('Action:', 2)


In [1]:
import os

import numpy as np

import tensorflow as tf

class Model(object):
    """Base class that only stores the settings shared by all models."""

    def __init__(self, args, rng, log_path):
        # Keep the constructor arguments on the instance, unchanged.
        self.args, self.rng, self.log_path = args, rng, log_path


class TensorflowModel(Model):
    """Model that also keeps a session, a scope name and input/output shapes."""

    def __init__(self, args, rng, session,
                 input_shape, output_shape, log_path, scope):
        # Let the base class store args, rng and log_path.
        super(TensorflowModel, self).__init__(args, rng, log_path)
        self.session, self.scope = session, scope
        self.input_shape, self.output_shape = input_shape, output_shape

class SimpleDQNModel(TensorflowModel):
    """Convolutional Q-network with a Huber loss and an Adam training op.

    The constructor builds the whole graph: an input placeholder, the network
    (three conv layers, one hidden fully connected layer and a linear output
    layer), a placeholder for target Q values, the loss, the training op, and
    two heads on top of the network output (arg-max and softmax).
    """

    def __init__(self, args, rng, session,
                 input_shape, output_shape, log_path, scope):
        """Store the arguments and build the graph.

        Args:
            args: Stored on the instance; not read while building the graph.
            rng: Stored on the instance; not read while building the graph.
            session: Session used by train/get_qs/get_action_probs/get_action.
            input_shape: Shape of one input state, without the batch axis.
            output_shape: Number of network outputs (one Q value each).
            log_path: Stored on the instance.
            scope: Name prefix under which all layers are created.
        """
        # Call super class
        super(SimpleDQNModel, self).__init__(args, rng, session,
                                             input_shape, output_shape,
                                             log_path, scope)

        # Define a placeholder for network input
        self.s_placeholder = tf.placeholder(
                shape=[None] + list(self.input_shape),
                dtype=tf.float32)
        # Build the network
        self.q_policy = self.build_network(scope)
        # Define a placeholder for loss calculation
        self.q_placeholder = tf.placeholder(shape=[None, self.output_shape],
                                            dtype=tf.float32)
        # Huber loss between the fed target Q values and the network output.
        self.loss = tf.losses.huber_loss(self.q_placeholder, self.q_policy)
        # The learning rate is fixed here; it is not taken from `args`.
        self.optimizer = tf.train.AdamOptimizer(0.00025)

        # without gradient clipping
        self.train_step = self.optimizer.minimize(self.loss)

        # Define layer for selecting only the max value
        self.action = tf.argmax(self.q_policy, 1)

        # Define layer for a softmax output
        # TODO check if this works or if I should use agent to do it
        self.action_probs = tf.contrib.layers.softmax(self.q_policy)


    def build_network(self, scope=None):
        """Create the layers on top of `self.s_placeholder`.

        Args:
            scope: Name prefix for every layer (e.g. "<scope>/conv1").
                Defaults to `self.scope`. Without this parameter the call
                `self.build_network(scope)` in `__init__` could not succeed,
                and the body had no `scope` name to refer to.

        Returns:
            The output tensor of the final linear layer, with
            `self.output_shape` units.
        """
        if scope is None:
            scope = self.scope
        # Create the hidden layers of the network.
        conv1 = tf.contrib.layers.conv2d(self.s_placeholder,
                                         num_outputs=16,
                                         kernel_size=[8, 8],
                                         stride=[4, 4],
                                         scope=scope + "/conv1")
        conv2 = tf.contrib.layers.conv2d(conv1,
                                         num_outputs=32,
                                         kernel_size=[4, 4],
                                         stride=[2, 2],
                                         scope=scope + "/conv2")
        conv3 = tf.contrib.layers.conv2d(conv2,
                                         num_outputs=32,
                                         kernel_size=[3, 3],
                                         stride=[1, 1],
                                         scope=scope + "/conv3")
        conv3_flat = tf.contrib.layers.flatten(conv3,
                                               scope=scope + "/conv3_flat")
        fc1 = tf.contrib.layers.fully_connected(conv3_flat,
                                                num_outputs=128,
                                                scope=scope + "/fc1")
        # Create the output layer of the network (no activation: raw Q values)
        q = tf.contrib.layers.fully_connected(fc1,
                                              num_outputs=self.output_shape,
                                              activation_fn=None,
                                              scope=scope + "/q")
        return q

    def train(self, state, q):
        """Run one optimizer step and return the loss for the fed batch.

        Args:
            state: Array fed to the input placeholder after a float32 cast.
            q: Target values fed to `self.q_placeholder`.
        """
        state = state.astype(np.float32)
        loss_batch, _ = self.session.run([self.loss, self.train_step],
                                         feed_dict={self.s_placeholder: state,
                                                    self.q_placeholder: q})
        return loss_batch

    def get_qs(self, state):
        """ Returns the Q values for all available outputs. """
        state = state.astype(np.float32)
        # A single rank-3 state gets a leading batch axis of size one.
        if len(state.shape) == 3:
            state = state.reshape([1] + list(self.input_shape))
        return self.session.run(self.q_policy,
                                feed_dict={self.s_placeholder: state})

    def get_action_probs(self, state):
        """ Returns a probability distribution over the possible actions. """
        state = state.astype(np.float32)
        # Same convenience as get_qs: accept a single state without batch axis.
        if len(state.shape) == 3:
            state = state.reshape([1] + list(self.input_shape))
        return self.session.run(self.action_probs,
                                feed_dict={self.s_placeholder: state})

    def get_action(self, state):
        """ Returns the index from the maximal Q value """
        state = state.astype(np.float32)
        # Always reshaped to a batch of exactly one state.
        state = state.reshape([1] + list(self.input_shape))
        return self.session.run(self.action,
                                feed_dict={self.s_placeholder: state})[0]
    
def copy_model_parameters(sess, estimator1, estimator2):
    """
    Copies the model parameters of one estimator to another.

    The trainable variables of each estimator are those whose name starts
    with "<scope>/". Both lists are sorted by name and paired position-wise,
    so the two estimators are expected to have the same layer layout.

    Args:
      sess: Tensorflow session instance
      estimator1: Estimator to copy the parameters from
      estimator2: Estimator to copy the parameters to
    Raises:
      ValueError: If the two scopes do not own the same number of trainable
        variables (a position-wise copy would be incomplete or mispaired).
    """
    all_vars = tf.trainable_variables()

    def scope_params(estimator):
        # Match on "<scope>/" rather than the bare scope name, so that a scope
        # such as "net" does not also pick up variables of "net.target" or
        # "net2".
        prefix = estimator.scope.rstrip("/") + "/"
        owned = [v for v in all_vars if v.name.startswith(prefix)]
        return sorted(owned, key=lambda v: v.name)

    e1_params = scope_params(estimator1)
    e2_params = scope_params(estimator2)

    if len(e1_params) != len(e2_params):
        raise ValueError(
            "Cannot copy parameters: scope %r has %d trainable variables "
            "but scope %r has %d" % (estimator1.scope, len(e1_params),
                                     estimator2.scope, len(e2_params)))

    # NOTE(review): every call builds fresh assign ops; if this is called
    # repeatedly in graph mode, consider creating the ops once and reusing them.
    update_ops = [e2_v.assign(e1_v)
                  for e1_v, e2_v in zip(e1_params, e2_params)]

    sess.run(update_ops)

In [2]:
# Seeded NumPy generator handed to both models.
rng = np.random.RandomState(123)

# Session options: soft placement on, placement logging off, and GPU memory
# allocated on demand.
config = tf.ConfigProto()
config.allow_soft_placement = True
config.log_device_placement = False
config.gpu_options.allow_growth = True

# Initiate tensorflow session
session = tf.Session(config=config)
model_input_shape = (80, 80, 3)

# Policy network
model = SimpleDQNModel(args=None,
                       rng=rng,
                       session=session,
                       input_shape=model_input_shape,
                       output_shape=3,
                       log_path=None,
                       scope="policy")

# Target network (same layout, separate "target" scope)
target_model = SimpleDQNModel(args=None,
                              rng=rng,
                              session=session,
                              input_shape=model_input_shape,
                              output_shape=3,
                              log_path=None,
                              scope="target")

# Initialise the variables of both networks.
init = tf.global_variables_initializer()
session.run(init)

In [3]:
# A single all-ones input with a leading batch axis of size one.
s = np.ones((1,) + model_input_shape, dtype=np.uint8)

# Q values of both networks before the copy ...
for _net in (model, target_model):
    print(_net.get_qs(s))
print("--------------- Copying -----------------")
copy_model_parameters(session, model, target_model)
# ... and after it.
for _net in (model, target_model):
    print(_net.get_qs(s))

[[-0.01797529 -0.17344828 -0.13216183]]
[[-0.00854884 -0.00646538 -0.12156008]]
--------------- Copying -----------------
[[-0.01797529 -0.17344828 -0.13216183]]
[[-0.01797529 -0.17344828 -0.13216183]]


In [4]:
# List every trainable variable in the graph: index, shape and name.
tvar = tf.trainable_variables()

for position, variable in enumerate(tvar):
    shape_text = str(variable.get_shape())
    print(" var {:3}: {:15} {}".format(position, shape_text, variable.name))

# Release the session now that the demo is finished.
session.close()

 var   0: (8, 8, 3, 16)   policy/conv1/weights:0
 var   1: (16,)           policy/conv1/biases:0
 var   2: (4, 4, 16, 32)  policy/conv2/weights:0
 var   3: (32,)           policy/conv2/biases:0
 var   4: (3, 3, 32, 32)  policy/conv3/weights:0
 var   5: (32,)           policy/conv3/biases:0
 var   6: (3200, 128)     policy/fc1/weights:0
 var   7: (128,)          policy/fc1/biases:0
 var   8: (128, 3)        policy/q/weights:0
 var   9: (3,)            policy/q/biases:0
 var  10: (8, 8, 3, 16)   target/conv1/weights:0
 var  11: (16,)           target/conv1/biases:0
 var  12: (4, 4, 16, 32)  target/conv2/weights:0
 var  13: (32,)           target/conv2/biases:0
 var  14: (3, 3, 32, 32)  target/conv3/weights:0
 var  15: (32,)           target/conv3/biases:0
 var  16: (3200, 128)     target/fc1/weights:0
 var  17: (128,)          target/fc1/biases:0
 var  18: (128, 3)        target/q/weights:0
 var  19: (3,)            target/q/biases:0
