In [104]:
import tensorflow as tf
import tensorflow.contrib.slim as slim
import numpy as np

In [105]:
bandits = [0.2, 0, -0.2, -5]
num_bandits = len(bandits)

def pullBandit(bandit):
    result = np.random.randn(1)
    if (result > bandit):
        return 1
    else:
        return -1

tf.reset_default_graph()

# Forward network
weights = tf.Variable(tf.ones([num_bandits]))
selected_action = tf.argmax(weights, 0)

# Backward
model_reward = tf.placeholder(shape=[1], dtype=tf.float32)
model_action = tf.placeholder(shape=[1], dtype=tf.int32)
    
# Which weight is responsible for selection?
responsible_weight = tf.slice(weights, model_action, [1])
loss = -(tf.log(responsible_weight) * model_reward)
optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001)
solver = optimizer.minimize(loss)


In [106]:
total_episodes = 1000 #Set total number of episodes to train agent on.
total_reward = np.zeros(num_bandits) #Set scoreboard for bandits to 0.
epsilon = 0.1 #Set the chance of taking a random action.

init = tf.initialize_all_variables()

# Launch the tensorflow graph
with tf.Session() as sess:
    sess.run(init)
    i = 0
    while i < total_episodes:
        #Choose either a random action or one from our network.
        if np.random.rand(1) < epsilon:
            action = np.random.randint(num_bandits)
        else:
            action = sess.run(selected_action)
        
        reward = pullBandit(bandits[action]) #Get our reward from picking one of the bandits.
        
        #Update the network.
        _, resp, ww = sess.run([solver, responsible_weight, weights],
                               feed_dict={model_reward:[reward],
                                          model_action:[action]})
        
        #Update our running tally of scores.
        total_reward[action] += reward
        if i % 50 == 0:
            print("Action: {}".format(action))
            print("Running reward for the " + str(num_bandits) + " bandits: " + str(total_reward))
            print(ww)
        i+=1

print("The agent thinks bandit " + str(np.argmax(ww)+1) + " is the most promising....")
if np.argmax(ww) == np.argmax(-np.array(bandits)):
    print("...and it was right!")
else:
    print("...and it was wrong!")

Action: 0
Running reward for the 4 bandits: [1. 0. 0. 0.]
[1. 1. 1. 1.]
Action: 2
Running reward for the 4 bandits: [-3. -1.  6.  1.]
[0.996999   0.99900216 1.005006   1.001     ]
Action: 2
Running reward for the 4 bandits: [-3. -2. 22.  2.]
[0.996999   0.99800116 1.0208248  1.001999  ]
Action: 2
Running reward for the 4 bandits: [-2. -3. 37.  3.]
[0.99800205 0.99699914 1.0354359  1.002997  ]
Action: 2
Running reward for the 4 bandits: [-4. -4. 41.  6.]
[0.995997  0.9959961 1.041234  1.0059851]
Action: 2
Running reward for the 4 bandits: [-4. -3. 56.  8.]
[0.995997   0.99700016 1.0555619  1.0079722 ]
Action: 2
Running reward for the 4 bandits: [-5. -2. 62.  8.]
[0.994993  0.9980032 1.0612512 1.0079722]
Action: 2
Running reward for the 4 bandits: [-6.  0. 78.  9.]
[0.9939879 1.0000062 1.0743809 1.0089643]
Action: 2
Running reward for the 4 bandits: [-8.  0. 88.  9.]
[0.9919748 1.0000062 1.0836676 1.0089643]
Action: 2
Running reward for the 4 bandits: [-8. -1. 93.  9.]
[0.9919748 0.99900

In [107]:
class ContextualBandit:
    def __init__(self):
        self.current_bandit_ = 0
        self.bandits_ = np.array([[0.0,0.2,-0.0,-5],
                                  [0.1,-5,1,0.25],
                                  [-5,5,5,5]])
        self.num_bandits_ = self.bandits_.shape[0]
        self.num_actions_ = self.bandits_.shape[1]
        
    def randomize(self):
        self.current_bandit_ = np.random.randint(0, self.num_bandits_)
        
    def pull_arm(self, action):
        if np.random.randn(1) > self.bandits_[self.current_bandit_, action]:
            return 1
        else:
            return -1
        
    @property
    def bandit(self):
        return self.current_bandit_
    
    @property
    def num_bandits(self):
        return self.num_bandits_
    
    @property
    def num_actions(self):
        return self.num_actions_

In [113]:
class Agent:
    def __init__(self, lr, num_bandits, num_actions):
        # forward path
        self.input_current_bandit_ = tf.placeholder(shape=[1], dtype=tf.int32)
        state_vec = slim.one_hot_encoding(self.input_current_bandit_, num_bandits)
        output = slim.fully_connected(state_vec, num_actions,
                                      biases_initializer=None,
                                      activation_fn=tf.nn.sigmoid,
                                      weights_initializer=tf.ones_initializer())
        self.output_ = tf.reshape(output, [-1])
        self.selected_action = tf.argmax(self.output_, 0)
        
        self.weights_ = tf.trainable_variables()[0]
        
        # compute loss
        self.reward_ = tf.placeholder(shape=[1], dtype=tf.float32)
        self.action_ = tf.placeholder(shape=[1], dtype=tf.int32)
        
        resonsible_weight = tf.slice(self.output_, self.action_, [1])
        self.loss_ = -(tf.log(resonsible_weight) * self.reward_)
        optimizer = tf.train.GradientDescentOptimizer(learning_rate=lr)
        self.solver_ = optimizer.minimize(self.loss_)
        
    @property
    def solver(self):
        return self.solver_
    
    @property
    def net_reward(self):
        return self.reward_
    
    @property
    def net_action(self):
        return self.action_


In [121]:
tf.reset_default_graph()

bandit = ContextualBandit()
agent = Agent(0.0001, bandit.num_bandits, bandit.num_actions)
total_episode = 10000
total_reward = np.zeros([bandit.num_bandits, bandit.num_actions])

tf.trainable_variables()

[<tf.Variable 'fully_connected/weights:0' shape=(3, 4) dtype=float32_ref>]

In [122]:
init = tf.initialize_all_variables()

In [123]:
# Launch the tensorflow graph
with tf.Session() as sess:
    sess.run(init)
    for i in range(total_episode):
        bandit.randomize()
        current_bandit = bandit.bandit

        if np.random.rand(1) < 0.1:
            action = np.random.randint(bandit.num_actions)
        else:
            action = sess.run(agent.selected_action, feed_dict={agent.input_current_bandit_: [current_bandit]})
        
        reward = bandit.pull_arm(action)
            
        # print((current_bandit, action, reward))
        _, ww = sess.run([agent.solver, agent.weights_], feed_dict={agent.net_reward: [reward],
                                                 agent.net_action: [action],
                                                 agent.input_current_bandit_: [current_bandit]
                                                })
        total_reward[current_bandit, action] += reward
        if i % 500 == 0:
            print("Mean reward for each of the " + str(bandit.num_bandits) + " bandits: " + str(np.mean(total_reward,axis=1)))

for a in range(bandit.num_bandits):
    print("The agent thinks action " + str(np.argmax(ww[a])+1) + " for bandit " + str(a+1) + " is the most promising....")
    if np.argmax(ww[a]) == np.argmin(bandit.bandits_[a]):
        print("...and it was right!")
    else:
        print("...and it was wrong!")

Mean reward for each of the 3 bandits: [-0.25  0.    0.  ]
Mean reward for each of the 3 bandits: [42.   37.5  33.25]
Mean reward for each of the 3 bandits: [81.5  77.   69.25]
Mean reward for each of the 3 bandits: [119.25 117.5  103.5 ]
Mean reward for each of the 3 bandits: [159.   155.25 136.5 ]
Mean reward for each of the 3 bandits: [195.75 190.25 172.75]
Mean reward for each of the 3 bandits: [238.25 224.   206.5 ]
Mean reward for each of the 3 bandits: [276.75 258.25 242.75]
Mean reward for each of the 3 bandits: [316.75 296.25 276.25]
Mean reward for each of the 3 bandits: [356.5  336.   307.25]
Mean reward for each of the 3 bandits: [396.5  369.   343.25]
Mean reward for each of the 3 bandits: [428.   410.5  382.25]
Mean reward for each of the 3 bandits: [468.   445.25 418.5 ]
Mean reward for each of the 3 bandits: [508.   483.   453.25]
Mean reward for each of the 3 bandits: [546.5  519.5  487.25]
Mean reward for each of the 3 bandits: [587.   551.5  526.25]
Mean reward for e

In [35]:
np.random.randint(0, 3)

0

In [47]:
bandit = ContextualBandit()
bandit.randomize()
print(bandit.bandit)

0
