In [2]:
import tensorflow as tf
import tensorflow.contrib.slim as slim
import numpy as np

In [3]:
bandits = [0.2, 0, -0.2, -5]
num_bandits = len(bandits)

def pullBandit(bandit):
    result = np.random.randn(1)
    if (result > bandit):
        return 1
    else:
        return -1

tf.reset_default_graph()

# Forward network
weights = tf.Variable(tf.ones([num_bandits]))
selected_action = tf.argmax(weights, 0)

# Backward
model_reward = tf.placeholder(shape=[1], dtype=tf.float32)
model_action = tf.placeholder(shape=[1], dtype=tf.int32)
    
# Which weight is responsible for selection?
responsible_weight = tf.slice(weights, model_action, [1])
loss = -(tf.log(responsible_weight) * model_reward)
optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001)
solver = optimizer.minimize(loss)


In [4]:
total_episodes = 1000 #Set total number of episodes to train agent on.
total_reward = np.zeros(num_bandits) #Set scoreboard for bandits to 0.
epsilon = 0.1 #Set the chance of taking a random action.

init = tf.initialize_all_variables()

# Launch the tensorflow graph
with tf.Session() as sess:
    sess.run(init)
    i = 0
    while i < total_episodes:
        #Choose either a random action or one from our network.
        if np.random.rand(1) < epsilon:
            action = np.random.randint(num_bandits)
        else:
            action = sess.run(selected_action)
        
        reward = pullBandit(bandits[action]) #Get our reward from picking one of the bandits.
        
        #Update the network.
        _, resp, ww = sess.run([solver, responsible_weight, weights],
                               feed_dict={model_reward:[reward],
                                          model_action:[action]})
        
        #Update our running tally of scores.
        total_reward[action] += reward
        if i % 50 == 0:
            print("Action: {}".format(action))
            print("Running reward for the " + str(num_bandits) + " bandits: " + str(total_reward))
            print(ww)
        i+=1

print("The agent thinks bandit " + str(np.argmax(ww)+1) + " is the most promising....")
if np.argmax(ww) == np.argmax(-np.array(bandits)):
    print("...and it was right!")
else:
    print("...and it was wrong!")

Instructions for updating:
Use `tf.global_variables_initializer` instead.
Action: 3
Running reward for the 4 bandits: [0. 0. 0. 1.]
[1. 1. 1. 1.]
Action: 3
Running reward for the 4 bandits: [-1. -1.  0. 49.]
[0.999     0.999     1.        1.0469222]
Action: 3
Running reward for the 4 bandits: [-1. -2. -1. 95.]
[0.999     0.997999  0.9990011 1.0899937]
Action: 3
Running reward for the 4 bandits: [ -2.  -1.   0. 142.]
[0.997999  0.999001  1.000002  1.1323096]
Action: 3
Running reward for the 4 bandits: [ -2.  -1.  -1. 185.]
[0.99800104 0.999001   0.9990031  1.1696824 ]
Action: 3
Running reward for the 4 bandits: [ -1.   0.   0. 230.]
[0.99900305 1.000002   1.0000051  1.2075546 ]
Action: 3
Running reward for the 4 bandits: [ -1.  -1.   2. 275.]
[0.99900407 0.99900204 1.0020041  1.2442743 ]
Action: 3
Running reward for the 4 bandits: [  0.   0.   2. 321.]
[1.000005  1.000004  1.0020041 1.2807212]
Action: 3
Running reward for the 4 bandits: [  0.   0.   5. 368.]
[1.000005  1.000004  1.00499

In [49]:
class ContextualBandit:
    def __init__(self):
        self.current_bandit_ = 0
        self.bandits_ = np.array([[[0.2,0,-0.0,-5],
                                   [0.1,-5,1,0.25],
                                   [-5,5,5,5]]])
        self.num_bandits_ = self.bandits_.shape[0]
        self.num_actions_ = self.bandits_.shape[1]
        
    def randomize(self):
        self.current_bandit_ = np.random.randint(0, self.num_bandits_)
        print(self.num_bandits_)
        
    def pull_arm(self, action):
        if np.random.randn(1) > self.bandits_[self.current_bandit_, action]:
            return 1
        else:
            return -1
        
    @property
    def bandit(self):
        return self.current_bandit_
    
    @property
    def num_bandits(self):
        return self.num_bandits_
    
    @property
    def num_actions(self):
        return self.num_actions_

In [50]:
class Agent:
    def __init__(self, lr, num_bandits, num_actions):
        # forward path
        self.input_current_bandit_ = tf.placeholder(shape=[1], dtype=tf.int32)
        state_vec = slim.one_hot_encoding(self.input_current_bandit_, num_bandits)
        output = slim.fully_connected(state_vec, num_actions,
                                      biases_initializer=None,
                                      activation_fn=tf.nn.sigmoid,
                                      weights_initializer=tf.ones_initializer())
        self.output_ = tf.reshape(output, [-1])
        self.selected_action = tf.argmax(self.output_, 0)
        
        # compute loss
        self.reward_ = tf.placeholder(shape=[1], dtype=tf.float32)
        self.action_ = tf.placeholder(shape=[1], dtype=tf.int32)
        
        resonsible_weight = tf.slice(self.output_, self.action_, [1])
        self.loss_ = -(tf.log(resonsible_weight) * self.reward_)
        optimizer = tf.train.GradientDescentOptimizer(learning_rate=lr)
        self.solver_ = optimizer.minimize(self.loss_)
        
    @property
    def net_reward(self):
        return self.reward_
    
    @property
    def net_action(self):
        return self.action_


In [51]:
tf.reset_default_graph()

bandit = ContextualBandit()
agent = Agent(0.0001, bandit.num_bandits, bandit.num_actions)
total_episode = 10

tf.trainable_variables()

[<tf.Variable 'fully_connected/weights:0' shape=(1, 3) dtype=float32_ref>]

In [52]:
init = tf.initialize_all_variables()

In [53]:
# Launch the tensorflow graph
with tf.Session() as sess:
    sess.run(init)
    for i in range(total_episode):
        bandit.randomize()
        current_bandit = bandit.bandit
        print(bandit.current_bandit_)

0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0


In [35]:
np.random.randint(0, 3)

0

In [47]:
bandit = ContextualBandit()
bandit.randomize()
print(bandit.bandit)

0
