This is based on the code from [Juliani's tutorial](https://medium.com/@awjuliani/super-simple-reinforcement-learning-tutorial-part-2-ded33892c724)
This is an implementation of vanilla REINFORCE: it uses only the sampled (discounted) rewards, and no value function is used. See [Sutton's 2nd edition book Chapter 13.3](http://incompleteideas.net/book/bookdraft2017nov5.pdf) for a mathematical reference.

In [1]:
import tensorflow as tf
import tensorflow.contrib.slim as slim
import numpy as np
import gym

  from ._conv import register_converters as _register_converters
  from . import h5a, h5d, h5ds, h5f, h5fd, h5g, h5r, h5s, h5t, h5p, h5z
  from .. import h5g, h5i, h5o, h5r, h5t, h5l, h5p


In [14]:
def discount_rewards(r, gamma):
    """Compute the discounted return for every time step of one episode.

    For each index ``t`` the result holds
    ``r[t] + gamma * r[t + 1] + gamma ** 2 * r[t + 2] + ...`` up to the end
    of the episode.

    Args:
        r: 1D sequence of per-step rewards (list or array, any numeric or
            object dtype holding numbers). It is not modified.
        gamma: discount factor applied once per step into the future.

    Returns:
        A new 1D float64 numpy array with the same length as ``r``.
    """
    # Work on a float copy: np.zeros_like(r) would inherit an integer or
    # object dtype from the input and silently truncate/box the returns.
    rewards = np.asarray(r, dtype=np.float64)
    discounted_r = np.zeros(rewards.shape, dtype=np.float64)
    running_add = 0.0
    # Walk backwards so each return reuses the one computed for t + 1.
    # `range` (not `xrange`) keeps this working on both Python 2 and 3.
    for t in reversed(range(rewards.size)):
        running_add = running_add * gamma + rewards[t]
        discounted_r[t] = running_add
    return discounted_r

### Notes regarding the discount_rewards code

...

discounted_r[-3] = $R_{T-2} + \gamma \, R_{T-1} + \gamma^2 \, R_T$

discounted_r[-2] = $R_{T-1} + \gamma \, R_T$

discounted_r[-1] = $R_T$


## Agent 

In [13]:
class Agent():
    """Softmax policy network together with the ops needed to train it.

    The network maps a batch of states to action probabilities through one
    ReLU hidden layer; neither layer has a bias term. Gradients of the loss
    are exposed via ``self.gradients`` and applied separately through
    ``self.update_batch``, which reads them from ``self.gradient_holders``.
    """

    def __init__(self, lr, s_size, a_size, h_size):
        """Build the policy graph.

        Args:
            lr: learning rate handed to the Adam optimizer.
            s_size: width of a single state vector.
            a_size: number of actions (width of the softmax output).
            h_size: number of units in the hidden layer.
        """
        # Forward pass: state -> hidden (ReLU) -> action probabilities (softmax).
        self.state_in = tf.placeholder(shape=[None, s_size], dtype=tf.float32)
        hidden_layer = slim.fully_connected(
            self.state_in, h_size, biases_initializer=None, activation_fn=tf.nn.relu)
        self.probability = slim.fully_connected(
            hidden_layer, a_size, activation_fn=tf.nn.softmax, biases_initializer=None)
        self.chosen_action = tf.argmax(self.probability, 1)

        # Training inputs: one reward value and one action index per row of the batch.
        self.reward_holder = tf.placeholder(shape=[None], dtype=tf.float32)
        self.action_holder = tf.placeholder(shape=[None], dtype=tf.int32)

        # Position of each (row, taken action) pair inside the flattened
        # probability matrix: row * num_actions + action.
        prob_shape = tf.shape(self.probability)
        row_offsets = tf.range(0, prob_shape[0]) * prob_shape[1]
        self.indexes = row_offsets + self.action_holder
        # Probability the network assigned to each action that was taken.
        flat_probability = tf.reshape(self.probability, [-1])
        self.responsible_outputs = tf.gather(flat_probability, self.indexes)

        # Negative mean of log-probability weighted by the fed reward value.
        weighted_log_prob = tf.log(self.responsible_outputs) * self.reward_holder
        self.loss = -tf.reduce_mean(weighted_log_prob)

        # One placeholder per trainable variable, so gradients computed
        # outside the graph can be fed back in for the update step.
        tvars = tf.trainable_variables()
        self.gradient_holders = [
            tf.placeholder(tf.float32, name=str(idx) + '_holder')
            for idx in range(len(tvars))
        ]

        self.gradients = tf.gradients(self.loss, tvars)

        # The update applies whatever is fed into the gradient placeholders.
        adam = tf.train.AdamOptimizer(learning_rate=lr)
        self.update_batch = adam.apply_gradients(zip(self.gradient_holders, tvars))


### Notes regarding the agent code

The dimension of the variable:

```python
    self.probability 
```

will be $(batch \times 2)$, as there are two actions; the batch size is the length of the episode. The loss function needs the probabilities associated with the actions taken. The lines below achieve this:

    
```python
    self.indexes = tf.range(0, tf.shape(self.probability)[0]) * tf.shape(self.probability)[1] + self.action_holder
    # gets the probability which were associated with each action
    self.responsible_outputs = tf.gather(tf.reshape(self.probability, [-1]), self.indexes)
```

where self.responsible_outputs is a vector of probabilities associated with each action. 

This line computes the partial derivatives of the cost function with respect to the agent's model parameters.

```python
    self.gradients = tf.gradients(self.loss, tvars)
```

In our case we have two fully connected layers: $$\left[\frac{\partial\, J}{\partial W_1},\;\frac{\partial\, J}{\partial W_2}\right]$$


## Initialise variables

In [6]:
# Training hyperparameters.
gamma = 0.99  # Discount factor passed to discount_rewards.
total_episodes = 5000  # Number of episodes to train the agent on.
max_ep = 999  # Upper bound on the number of steps taken in one episode.
update_frequency = 5  # Episodes of accumulated gradients per policy update.

# Environment the agent interacts with.
env = gym.make('CartPole-v0')

# Start from an empty TensorFlow graph, then build the agent inside it.
tf.reset_default_graph()
agent = Agent(lr=1e-2, s_size=4, a_size=2, h_size=8)

# Must be created after the agent so that it covers the agent's variables.
init = tf.global_variables_initializer()

[33mWARN: gym.spaces.Box autodetected dtype as <type 'numpy.float32'>. Please provide explicit dtype.[0m


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


## Train

In [9]:
# Training loop: run episodes with the current policy, accumulate the policy
# gradient of every finished episode, and apply the accumulated gradients
# whenever the episode counter is a non-zero multiple of `update_frequency`.
with tf.Session() as sess:
    sess.run(init)
    i = 0
    total_reward = []
    total_lenght = []

    # Gradient accumulator: one zero array per trainable variable, with the
    # same shape and dtype as the variable's current value.
    gradBuffer = [np.zeros_like(value) for value in sess.run(tf.trainable_variables())]

    while i < total_episodes:
        s = env.reset()
        running_reward = 0
        ep_history = []
        # NOTE: an episode that is still running after max_ep steps is dropped
        # without contributing gradients or statistics.
        for j in range(max_ep):
            # Probabilistically pick an action given our network outputs.
            a_dist = sess.run(agent.probability, feed_dict={agent.state_in: [s]})
            # Sample the action *index* directly. Sampling a probability value
            # and looking it up again always resolves ties to the first action.
            a = np.random.choice(a_dist.shape[1], p=a_dist[0])

            s1, r, d, _ = env.step(a)  # Advance the environment by one step.
            ep_history.append([s, a, r, s1])
            s = s1
            running_reward += r
            if d:
                # Build one typed array per column instead of a single ragged
                # object array, which newer NumPy versions refuse to create.
                states = np.vstack([step[0] for step in ep_history])
                actions = np.array([step[1] for step in ep_history], dtype=np.int32)
                rewards = np.array([step[2] for step in ep_history], dtype=np.float64)

                # Gradient of the loss for this episode, added to the buffer.
                feed_dict = {agent.reward_holder: discount_rewards(rewards, gamma),
                             agent.action_holder: actions,
                             agent.state_in: states}
                grads = sess.run(agent.gradients, feed_dict=feed_dict)
                for idx, grad in enumerate(grads):
                    gradBuffer[idx] += grad

                if i % update_frequency == 0 and i != 0:
                    # Apply the accumulated gradients, then reset the buffer.
                    feed_dict = dict(zip(agent.gradient_holders, gradBuffer))
                    sess.run(agent.update_batch, feed_dict=feed_dict)
                    for ix, grad in enumerate(gradBuffer):
                        gradBuffer[ix] = np.zeros_like(grad)

                # Update our running tally of scores.
                total_reward.append(running_reward)
                # j is the zero-based index of the final step of the episode.
                total_lenght.append(j)
                break

        # Every 100 episodes, report the mean reward of the last (up to) 100.
        if i % 100 == 0:
            print(np.mean(total_reward[-100:]))
        i += 1

38.0
32.02
38.26
49.28
54.26
71.62
89.92
114.57
152.14
169.07
177.28
175.89
187.6
190.82
191.55
180.41
180.13
191.55
193.19
195.05
194.53
195.92
186.88
194.7
196.36
199.82
196.15
190.0
188.02
189.79
179.66
198.74
197.65
199.17
199.77
200.0
199.28
200.0
194.08
197.92
198.67
194.64
197.26
185.44
195.81
200.0
199.66
200.0
200.0
200.0
