- [Blog](https://medium.com/emergent-future/simple-reinforcement-learning-with-tensorflow-part-1-5-contextual-bandits-bff01d1aad9c)
- [Policy Gradient](http://www.scholarpedia.org/article/Policy_gradient_methods)

In [1]:
%matplotlib inline
import numpy as np
import tensorflow as tf
import tensorflow.contrib.slim as slim

In [22]:
class ContextualBandit(object):
    """Environment made of several multi-armed bandits.

    The bandit currently in play is exposed as ``state``. Pulling an arm
    compares a random draw against that arm's entry in ``bandits`` and
    pays +1 or -1.
    """

    def __init__(self):
        self.state = 0
        # One row per bandit, one column per arm. A pull wins when the random
        # draw exceeds the entry, so the lower the entry, the better the arm.
        # Best arm (1-based) per bandit:
        #     bandit 1: arm 4
        #     bandit 2: arm 2
        #     bandit 3: arm 1
        thresholds = [
            [0.2, 0, -0.0, -5],
            [0.1, -5, 1, 0.25],
            [-5, 5, 5, 5],
        ]
        self.bandits = np.array(thresholds)
        self.num_bandits, self.num_actions = self.bandits.shape

    def randomBandit(self):
        """Pick a bandit uniformly at random, store it as the state, return it."""
        picked = np.random.randint(0, self.num_bandits)
        self.state = picked
        return picked

    def pullArm(self, action, noise=0.0):
        """Pull arm ``action`` of the current bandit.

        A standard-normal sample, shifted by ``noise`` times a uniform [0, 1)
        sample, is compared with the arm's entry in ``bandits``.

        Returns 1 if the shifted sample is greater than the entry, else -1.
        """
        threshold = self.bandits[self.state, action]
        # Both samples are always drawn, in this order, even when noise is 0.
        draw = np.random.randn(1)
        jitter = noise * np.random.rand(1)[0]
        return 1 if (draw + jitter) > threshold else -1

In [23]:
class Agent(object):
    """One-layer network that scores every action for a given bandit state.

    Graph endpoints exposed as attributes:
      state_in            int32 placeholder, shape [1]: the current state id
      output              flat vector holding one score per action
      chosen_action       index of the highest score in ``output``
      reward_holder       float32 placeholder, shape [1]: the reward received
      action_holder       int32 placeholder, shape [1]: the action taken
      responsible_weight  score of the action given in ``action_holder``
      loss                -(log(responsible_weight) * reward)
      train_op            one gradient-descent step on ``loss``
    """

    def __init__(self, num_states, num_actions, learn_rate=0.001):
        self.num_states = num_states
        self.num_actions = num_actions
        self.learn_rate = learn_rate

        # Building the graph: state id -> one-hot vector -> action scores.
        self.state_in = tf.placeholder(shape=[1], dtype=tf.int32)
        encoded_state = tf.one_hot(self.state_in, self.num_states)
        # Conceptually, fully_connected implements
        #   output_layer = activation(input_layer * kernel + bias)
        # Here the bias is disabled and the kernel starts as all ones, so
        # every action begins with the same score, sigmoid(1).
        # Source code:
        #   https://github.com/tensorflow/tensorflow/blob/b07791f6e9b306937eb58f7bb6c3300cd26583af/tensorflow/contrib/layers/python/layers/layers.py
        action_scores = slim.fully_connected(
            encoded_state,
            num_actions,
            biases_initializer=None,
            activation_fn=tf.nn.sigmoid,
            weights_initializer=tf.ones_initializer(),
        )
        # Flatten the layer output into a plain vector of scores.
        self.output = tf.reshape(action_scores, [-1])
        self.chosen_action = tf.argmax(self.output, 0)

        # Training inputs: the reward obtained and the action that earned it.
        self.reward_holder = tf.placeholder(shape=[1], dtype=tf.float32)
        self.action_holder = tf.placeholder(shape=[1], dtype=tf.int32)
        self.responsible_weight = self.output[self.action_holder[0]]
        # Minimising this loss raises the score of actions followed by a
        # positive reward and lowers it after a negative one.
        log_score = tf.log(self.responsible_weight)
        self.loss = -(log_score * self.reward_holder)

        self.train_op = tf.train.GradientDescentOptimizer(
            learning_rate=self.learn_rate
        ).minimize(self.loss)

In [24]:
# Clear the default graph before Agent adds its ops and variables to it.
tf.reset_default_graph()

# Environment, plus an agent sized to match it (one state per bandit).
cBandit = ContextualBandit()
agent = Agent(num_states=cBandit.num_bandits, num_actions=cBandit.num_actions)
# First trainable variable in the graph. Agent builds a single bias-free
# fully_connected layer, so this is presumably that layer's weight matrix.
weight = tf.trainable_variables()[0]

# Number of training episodes (one arm pull each).
num_episodes = 10000
# Running sum of rewards, indexed by [bandit, action].
total_rewards = np.zeros(cBandit.bandits.shape)
# Probability of taking a random action instead of the agent's choice.
err_rate = 0.01

In [25]:
# Train the agent for num_episodes single-pull episodes, then report which
# arm it prefers for each bandit and whether that is the best arm.
with tf.Session() as sess:
    tf.global_variables_initializer().run()
    
    for i in range(num_episodes):
        # Each episode starts by drawing the bandit (state) to play.
        current_state = cBandit.randomBandit()
        
        # With probability err_rate take a random arm, otherwise the arm
        # the agent currently scores highest for this state.
        if np.random.rand(1) < err_rate:
            action = np.random.randint(cBandit.num_actions)
        else:
            action = sess.run(agent.chosen_action,
                              feed_dict={agent.state_in:[current_state]})
        
        # The noise scale passed to the environment shrinks as i grows.
        reward = cBandit.pullArm(action, noise=1.0/(i+10))
        
        feed_dict = {agent.reward_holder:[reward], 
                     agent.action_holder:[action],
                     agent.state_in:[current_state]}
        # Only the update runs here. The weights are read once after the
        # loop, so they are not copied out on every step and are never
        # fetched in the same run() call as the op that modifies them.
        sess.run(agent.train_op, feed_dict=feed_dict)
        total_rewards[current_state, action] += reward
        if i % 500 == 0:
            # total_rewards is cumulative, so the printed value is the
            # running reward total per bandit averaged over its arms.
            print("Mean reward for each bandits {}".format(np.mean(total_rewards, axis=-1)))

    # Final weights after all updates; defined even if the loop never ran.
    w = sess.run(weight)
print()
for bandit in range(cBandit.num_bandits):
    # Row `bandit` of w holds the weights of each arm for that bandit.
    best_action = np.argmax(w[bandit])
    print("The agent thinks action the best action for bandit {} is {}".format(bandit, best_action+1))
    # The best arm is the one with the lowest entry in cBandit.bandits.
    if best_action == np.argmin(cBandit.bandits[bandit]):
        print("Correct!")
    else:
        print("Wrong...")

Mean reward for each bandits [ 0.    0.    0.25]
Mean reward for each bandits [ 29.75  41.5   40.5 ]
Mean reward for each bandits [ 67.    86.    82.25]
Mean reward for each bandits [ 109.5   125.75  123.  ]
Mean reward for each bandits [ 152.5   164.    165.75]
Mean reward for each bandits [ 193.    203.    208.25]
Mean reward for each bandits [ 228.75  247.75  251.25]
Mean reward for each bandits [ 268.75  290.75  291.25]
Mean reward for each bandits [ 307.75  332.5   333.5 ]
Mean reward for each bandits [ 348.5   373.5   376.25]
Mean reward for each bandits [ 391.    413.75  417.  ]
Mean reward for each bandits [ 432.5   455.75  456.  ]
Mean reward for each bandits [ 480.    489.    498.75]
Mean reward for each bandits [ 520.5   530.75  541.  ]
Mean reward for each bandits [ 564.    574.    578.75]
Mean reward for each bandits [ 605.75  614.75  619.75]
Mean reward for each bandits [ 646.5   655.75  662.  ]
Mean reward for each bandits [ 686.5   696.75  705.  ]
Mean reward for each b