- [Blog](https://medium.com/@awjuliani/simple-reinforcement-learning-with-tensorflow-part-3-model-based-rl-9a6fe0cce99)
  - [Github](https://github.com/awjuliani/DeepRL-Agents/blob/master/Model-Network.ipynb)

In [1]:
import numpy as np
import tensorflow as tf

%matplotlib inline
import matplotlib.pyplot as plt
import gym

In [2]:
# helpers
def resetBuffer(buffer):
    """
    Zero every array in ``buffer`` in place.

    buffer: iterable of arrays supporting slice assignment (e.g. numpy arrays)
    """
    for accumulated in buffer:
        # Slice assignment overwrites the contents and keeps the same object.
        accumulated[:] = 0

def discountRewards(rewards, discount_rate=0.99):
    """
    Compute the discounted return for every step of an episode.

    rewards: 1D numpy array (or sequence) of per-step rewards
    discount_rate: factor applied once per step to all later rewards

    Returns an array of the same shape where element i equals
    rewards[i] + discount_rate * rewards[i+1] + discount_rate**2 * rewards[i+2] + ...
    Floating-point inputs keep their dtype; any other dtype yields float64.
    """
    rewards = np.asarray(rewards)
    # An integer (or bool) output array would silently truncate the
    # fractional discounted sums, so only inexact dtypes are carried over.
    if np.issubdtype(rewards.dtype, np.inexact):
        out_dtype = rewards.dtype
    else:
        out_dtype = np.float64
    discounted_rewards = np.zeros(rewards.shape, dtype=out_dtype)
    running_sum = 0.0
    # Walk backwards so each step reuses the already-discounted future sum.
    for i in reversed(range(len(rewards))):
        running_sum = discount_rate * running_sum + rewards[i]
        discounted_rewards[i] = running_sum
    return discounted_rewards

In [3]:
# Create the CartPole-v0 Gym environment; `env` is used by the cells below.
env = gym.make("CartPole-v0")

[2017-06-21 13:25:27,581] Making new env: CartPole-v0


In [4]:
# Show the docstring of the Xavier initializer that the weight matrices below use.
print(tf.contrib.layers.xavier_initializer.__doc__)

Returns an initializer performing "Xavier" initialization for weights.

  This function implements the weight initialization from:

  Xavier Glorot and Yoshua Bengio (2010):
           Understanding the difficulty of training deep feedforward neural
           networks. International conference on artificial intelligence and
           statistics.

  This initializer is designed to keep the scale of the gradients roughly the
  same in all layers. In uniform distribution this ends up being the range:
  `x = sqrt(6. / (in + out)); [-x, x]` and for normal distribution a standard
  deviation of `sqrt(3. / (in + out))` is used.

  Args:
    uniform: Whether to use uniform or normal distributed random initialization.
    seed: A Python integer. Used to create random seeds. See
      [`set_random_seed`](../../api_docs/python/constant_op.md#set_random_seed)
      for behavior.
    dtype: The data type. Only floating point types are supported.

  Returns:
    An initializer for a weight matrix.

In [5]:
# Show the docstring of tf.get_variable, which creates the weight variables below.
print(tf.get_variable.__doc__)

Gets an existing variable with these parameters or create a new one.

This function prefixes the name with the current variable scope
and performs reuse checks. See the
@{$variable_scope$Variable Scope How To}
for an extensive description of how reusing works. Here is a basic example:

```python
with tf.variable_scope("foo"):
    v = tf.get_variable("v", [1])  # v.name == "foo/v:0"
    w = tf.get_variable("w", [1])  # w.name == "foo/w:0"
with tf.variable_scope("foo", reuse=True):
    v1 = tf.get_variable("v")  # The same as v above.
```

If initializer is `None` (the default), the default initializer passed in
the variable scope will be used. If that one is `None` too, a
`glorot_uniform_initializer` will be used. The initializer can also be
a Tensor, in which case the variable is initialized to this value and shape.

Similarly, if the regularizer is `None` (the default), the default regularizer
passed in the variable scope will be used (if that is `None` too,
then by default no regular

In [6]:
# Width of the policy network's hidden layer.
num_hidden_neurons = 8
# Learning rate passed to the optimizers of both networks.
learn_rate = 0.01
# NOTE(review): discount_rate and rmsp_decay_rate are not referenced by the
# cells visible here — confirm where (or whether) they are consumed.
discount_rate = 0.99
rmsp_decay_rate = 0.99

# Batch sizes; the cells that consume them are outside this section.
model_batch_size = 3
env_batch_size = 3

# Unpack the single dimension of the observation space shape
# (raises if the shape does not have exactly one entry).
(env_input_size,) = env.observation_space.shape

In [7]:
# Policy NN
# Start from an empty default graph so that tf.trainable_variables() below
# returns only the policy weights.
tf.reset_default_graph()
# Input: a batch of observations, one row per step.
observations = tf.placeholder(tf.float32, shape=[None, env_input_size], name="policy/input_layer")
# Two-layer network without biases: relu hidden layer, then a sigmoid output.
W1 = tf.get_variable("policy/W1",
                     shape=[env_input_size, num_hidden_neurons],
                     initializer=tf.contrib.layers.xavier_initializer())
layer1 = tf.nn.relu(tf.matmul(observations, W1))
W2 = tf.get_variable("policy/W2",
                     shape=[num_hidden_neurons, 1],
                     initializer=tf.contrib.layers.xavier_initializer())
# predict_policy has shape [None, 1]; each entry is a sigmoid output in (0, 1).
# NOTE(review): the original notes read the output as "0 -> move left,
# 1 -> move right"; how target_policy encodes the taken action is decided by
# the training loop, which is not part of this cell — confirm.
predict_policy = tf.nn.sigmoid(tf.matmul(layer1, W2), name="policy/predict_policy")

# Only W1 and W2 exist at this point, in that order, matching batch_grads below.
train_variables = tf.trainable_variables()
target_policy = tf.placeholder(tf.float32, shape=[None, 1], name="policy/target_policy")
advantages = tf.placeholder(tf.float32, name="policy/advantages")
# Placeholders through which externally accumulated gradients are applied.
W1Grad = tf.placeholder(tf.float32, name="policy/batch_grad1")
W2Grad = tf.placeholder(tf.float32, name="policy/batch_grad2")
batch_grads = [W1Grad, W2Grad]
# For a binary target_policy the expression inside the log reduces to:
#   target_policy == 1: loglik = log(1 - predict_policy)  (largest as predict_policy -> 0)
#   target_policy == 0: loglik = log(predict_policy)      (largest as predict_policy -> 1)
loglik = tf.log(target_policy*(target_policy - predict_policy) + (1-target_policy)*(target_policy + predict_policy))

# Minimising this loss maximises the advantage-weighted log-likelihood.
loss = -tf.reduce_sum(loglik*advantages)
# Gradients of the loss w.r.t. the policy weights, to be fetched per episode.
newGrads = tf.gradients(loss, train_variables)
# Applies the gradients fed through batch_grads rather than newGrads directly.
train_policy_op = tf.train.AdamOptimizer(learning_rate=learn_rate).apply_gradients(zip(batch_grads, train_variables))

In [8]:
# Model NN
# Learns to predict the next observation, reward and done flag from the
# current observation concatenated with the action.
num_hidden_neurons_model = 256

with tf.variable_scope("rnnlm"):
    # env_input_size+1: environment state + action
    previous_state = tf.placeholder(tf.float32, [None, env_input_size+1])
    # First-layer width follows the placeholder instead of a hard-coded 5.
    W1_m = tf.get_variable("W1_m",
                           shape=[env_input_size+1, num_hidden_neurons_model],
                           initializer=tf.contrib.layers.xavier_initializer())
    B1_m = tf.Variable(tf.zeros([num_hidden_neurons_model]), name="B1_m")
    layer1_m = tf.nn.relu(tf.matmul(previous_state, W1_m) + B1_m)

    W2_m = tf.get_variable("W2_m",
                           shape=[num_hidden_neurons_model, num_hidden_neurons_model],
                           initializer=tf.contrib.layers.xavier_initializer())
    B2_m = tf.Variable(tf.zeros([num_hidden_neurons_model]), name="B2_m")
    layer2_m = tf.nn.relu(tf.matmul(layer1_m, W2_m) + B2_m)

    # output weight for observation
    wO = tf.get_variable("wO",
                         shape=[num_hidden_neurons_model, env_input_size],
                         initializer=tf.contrib.layers.xavier_initializer())
    bO = tf.Variable(tf.zeros([env_input_size]), name="bO")

    # output weight for reward
    wR = tf.get_variable("wR",
                         shape=[num_hidden_neurons_model, 1],
                         initializer=tf.contrib.layers.xavier_initializer())
    bR = tf.Variable(tf.zeros([1]), name="bR")

    # output weight for done
    wD = tf.get_variable("wD",
                         shape=[num_hidden_neurons_model, 1],
                         initializer=tf.contrib.layers.xavier_initializer())
    bD = tf.Variable(tf.zeros([1]), name="bD")

    predict_observation = tf.matmul(layer2_m, wO) + bO
    predict_reward = tf.matmul(layer2_m, wR) + bR
    # The done head is used as a probability inside tf.log below, so it must
    # be squashed into (0, 1); a raw linear output can make the log argument
    # non-positive and the loss NaN.
    predict_done = tf.nn.sigmoid(tf.matmul(layer2_m, wD) + bD)

    target_observation = tf.placeholder(tf.float32, [None, env_input_size], name="target_observation")
    target_reward = tf.placeholder(tf.float32, [None, 1], name="target_reward")
    target_done = tf.placeholder(tf.float32, [None, 1], name="target_done")

    # Squared error for the regression heads, log-loss for the done head.
    loss_observation = tf.square(target_observation - predict_observation)
    loss_reward = tf.square(target_reward - predict_reward)
    loss_done = -tf.log(target_done*predict_done + (1-target_done)*(1-predict_done))
    # The [None, 1] reward/done losses broadcast across the observation columns.
    # NOTE: this rebinds the module-level name `loss` used by the policy cell;
    # the policy gradients were already built from the earlier tensor.
    loss = tf.reduce_sum(loss_observation + loss_reward + loss_done)

    train_op_model = tf.train.AdamOptimizer(learning_rate=learn_rate).minimize(loss)
    # Columns: predicted observation, then reward, then done probability.
    predict_state = tf.concat([predict_observation, predict_reward, predict_done], 1)

In [10]:
# Open an interactive session and initialise every variable defined so far
# (policy and model networks as well as the optimizer state).
sess = tf.InteractiveSession()
init_op = tf.global_variables_initializer()
sess.run(init_op)

In [None]:
# training helper
def stepModel(sess, xs, action):
    """
    Predict the next step with the model network instead of the real environment.

    sess: TensorFlow session in which the model variables are initialised
    xs: history of inputs; the latest observation is read from xs[-1][0]
    action: action to append to that observation

    Returns the evaluated predict_state for the single input row: the
    predicted observation columns followed by the reward and done columns.
    """
    # Build one input row: latest observation followed by the action.
    toFeed = np.reshape(np.hstack([xs[-1][0], np.array(action)]), (1, env_input_size + 1))
    # sess.run() requires the tensor to fetch; feed the row through the
    # model's input placeholder.
    predict_ = sess.run(predict_state, feed_dict={previous_state: toFeed})
    return predict_

In [53]:
# Reset the environment and display the returned observation, whose entries
# are, in order: position, cart-velocity, pole-angle, pole-vel-at-tip.
env.reset()

array([ 0.04686382, -0.02429138,  0.04838109,  0.04020312])