# RL Intro: Multi-armed bandit # 

In [2]:
import tensorflow as tf
import numpy as np

## The Bandits ## 

pullBandit fxn generates a random number from a normal distribution with a mean of 0. The lower the bandit number, the more likely a positive reward will be returned. We want our agent to always choose the bandit that will give that positive reward. 

In [3]:
#List out our bandits. Currently bandit 4 (index#3) is set to most often provide a positive reward.
bandits = [0.2, 0, -0.2, -5]
num_bandits = len(bandits)
def pullBandit(bandit):
    #Get a random number.
    result = np.random.randn(1)
    if result > bandit:
        #return a positive reward.
        return 1
    else:
        #return a negative reward.
        return -1

## The Agent ## 

Neural agent -- consists of set of values for each bandit. Each value is an estimate of the value of the return from choosing the bandit. We use a policy gradient method to update the agent by moving the value for the selected action towards the received reward.

In [4]:
tf.reset_default_graph()

#These two lines established the feed-forward part of the network. This does the actual choosing.
weights = tf.Variable(tf.ones([num_bandits]))
chosen_action = tf.argmax(weights,0)

#The next six lines establish the training proceedure. We feed the reward and chosen action into the network
#to compute the loss, and use it to update the network.
reward_holder = tf.placeholder(shape=[1],dtype=tf.float32)
action_holder = tf.placeholder(shape=[1],dtype=tf.int32)
responsible_weight = tf.slice(weights,action_holder,[1])
loss = -(tf.log(responsible_weight)*reward_holder)
optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001)
update = optimizer.minimize(loss)

## Training ## 

In [6]:
total_episodes = 1000 #Set total number of episodes to train agent on.
total_reward = np.zeros(num_bandits) #Set scoreboard for bandits to 0.
e = 0.1 #Set the chance of taking a random action.

init = tf.initialize_all_variables()

# Launch the tensorflow graph
with tf.Session() as sess:
    sess.run(init)
    i = 0
    while i < total_episodes:
        
        #Choose either a random action or one from our network.
        if np.random.rand(1) < e:
            action = np.random.randint(num_bandits)
        else:
            action = sess.run(chosen_action)
        
        reward = pullBandit(bandits[action]) #Get our reward from picking one of the bandits.
        
        #Update the network.
        _,resp,ww = sess.run([update,responsible_weight,weights], feed_dict={reward_holder:[reward],action_holder:[action]})
        
        #Update our running tally of scores.
        total_reward[action] += reward
        if i % 50 == 0:
            print("Running reward for the " + str(num_bandits) + " bandits: " + str(total_reward))
        i+=1
print("The agent thinks bandit " + str(np.argmax(ww)+1) + " is the most promising....")
if np.argmax(ww) == np.argmax(-np.array(bandits)):
    print("...and it was right!")
else:
    print("...and it was wrong!")

Instructions for updating:
Use `tf.global_variables_initializer` instead.
Running reward for the 4 bandits: [-1.  0.  0.  0.]
Running reward for the 4 bandits: [-1.  1. -1. 44.]
Running reward for the 4 bandits: [-1.  1.  0. 91.]
Running reward for the 4 bandits: [ -2.   2.   1. 136.]
Running reward for the 4 bandits: [ -1.   0.  -2. 180.]
Running reward for the 4 bandits: [ -1.   2.   1. 225.]
Running reward for the 4 bandits: [  0.   3.   1. 273.]
Running reward for the 4 bandits: [  0.   4.   0. 321.]
Running reward for the 4 bandits: [ -2.   5.   1. 367.]
Running reward for the 4 bandits: [ -3.   6.   2. 410.]
Running reward for the 4 bandits: [ -3.   6.   3. 459.]
Running reward for the 4 bandits: [ -6.   6.   2. 505.]
Running reward for the 4 bandits: [ -6.   6.   3. 550.]
Running reward for the 4 bandits: [ -5.   5.   4. 595.]
Running reward for the 4 bandits: [ -6.   6.   4. 639.]
Running reward for the 4 bandits: [ -4.   6.   5. 686.]
Running reward for the 4 bandits: [ -3.   