In [1]:
# 1. magic for inline plot
# 2. magic to print version
# 3. magic so that the notebook will reload external python modules
# 4. magic to enable retina (high resolution) plots
# https://gist.github.com/minrk/3301035
%matplotlib inline
%load_ext watermark
%load_ext autoreload
%autoreload 2
%config InlineBackend.figure_format = 'retina'

import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# change the default figure size (width, height in inches) and font size
# used by every matplotlib plot in this notebook
plt.rcParams['figure.figsize'] = 8, 6
plt.rcParams['font.size'] = 12

%watermark -a 'Ethen' -d -t -v -p numpy,pandas,sklearn,matplotlib,tensorflow

Ethen 2018-10-19 11:08:28 

CPython 3.6.4
IPython 6.4.0

numpy 1.14.1
pandas 0.23.0
sklearn 0.19.1
matplotlib 2.2.2
tensorflow 1.7.0


# Multi-Armed Bandit

- https://github.com/awjuliani/DeepRL-Agents
- https://medium.com/@awjuliani/super-simple-reinforcement-learning-tutorial-part-1-fd544fab149

Typical aspects of a task that make it an RL problem are the following:

- Different actions lead to different rewards. e.g. When looking for treasure in a maze, going left may lead to the treasure, whereas going right may lead to a pit of snakes.
- Rewards are delayed over time. Even if going left in the example above is the correct thing to do, we may not know this until later in the maze.
- Reward for an action is conditional on the state of the environment. Continuing with our maze example, going left may be ideal at a certain fork in the path, but not at others.

The n-armed bandit is a nice starting place since we don't have to worry about aspects 2 and 3. All we need to focus on is learning which rewards we get for each of the possible actions, and ensuring we choose the optimal ones. In the context of RL lingo, this is called learning a policy.

In [2]:
# the lower the value, the more likely a positive reward will be returned;
# in this case, bandit 4 (value -5) is set to be the bandit that provides
# the most positive reward
bandits = np.array([0.2, 0, -0.01, -5])
num_bandits = len(bandits)


def pull_bandit(bandit):
    """
    Pull a single bandit and report whether the pull paid out.

    One sample is drawn from a standard normal distribution (mean 0,
    standard deviation 1) and compared against the bandit's value, so the
    lower the value, the more likely the pull yields a positive reward.

    Parameters
    ----------
    bandit : float
        Threshold value of the bandit being pulled.

    Returns
    -------
    int
        1 when the sample is strictly greater than ``bandit``, -1 otherwise.
    """
    sample = np.random.randn(1)
    return 1 if sample > bandit else -1

In [3]:
# clear the default graph so that re-running this cell starts from scratch
tf.reset_default_graph()

# one learnable weight per bandit, all initialized to 1; the greedy action
# is the index of the bandit that currently has the largest weight
weights = tf.Variable(tf.ones([num_bandits]))
chosen_action = tf.argmax(weights, axis=0)

# values fed in at every training step: the reward that was observed and
# the index of the action that was taken
reward_holder = tf.placeholder(dtype=tf.float32)
action_holder = tf.placeholder(dtype=tf.int32)
# weight belonging to the action that was actually taken
responsible_weight = tf.gather(weights, action_holder)

# loss = -log(weight) * reward, minimizing it with gradient descent raises
# the weight of an action after a positive reward and lowers it after a
# negative one (as long as the weight stays positive)
loss = -tf.log(responsible_weight) * reward_holder
optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001)
update = optimizer.minimize(loss)

In [4]:
# epsilon: probability of taking a random action instead of the greedy one
epsilon = 0.1
total_episodes = 1000
# cumulative reward collected from each bandit, indexed by bandit
total_reward = np.zeros(num_bandits)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for i in range(total_episodes):

        # either explore (pick a bandit uniformly at random) or exploit
        # (pick the bandit with the largest weight so far)
        if np.random.rand(1) < epsilon:
            action = np.random.randint(num_bandits)
        else:
            action = sess.run(chosen_action)

        # tally the reward (+1 or -1) for the action
        reward = pull_bandit(bandits[action])
        total_reward[action] += reward

        # one gradient descent step on the weight of the chosen bandit
        sess.run(update, feed_dict={action_holder: action, reward_holder: reward})
        # report the running tally every 100 episodes
        if i % 100 == 0:
            print('Running reward: ' + str(total_reward))

    # read the learned weights before the session closes
    final_weights = sess.run(weights)

Running reward: [1. 0. 0. 0.]
Running reward: [-3.  0.  0. 68.]
Running reward: [ -2.  -1.   2. 160.]
Running reward: [  1.  -1.   2. 255.]
Running reward: [ -2.  -3.   1. 347.]
Running reward: [ -4.  -6.   0. 441.]
Running reward: [ -2.  -6.   0. 539.]
Running reward: [ -2.  -7.   0. 632.]
Running reward: [  0.  -6.   0. 725.]
Running reward: [  1.  -7.   0. 817.]


In [5]:
print('final weights: ', final_weights)
# the agent's pick is the bandit with the largest learned weight
max_bandit = np.argmax(final_weights)
# +1 so that the bandit is reported using 1-based numbering
print('The agent thinks bandit ' + str(max_bandit + 1) + ' is the most promising....')
# the truly best bandit is the one with the lowest value, hence the argmax
# over the negated values
if max_bandit == np.argmax(-bandits):
    print('...and it was right!')
else:
    print('...and it was wrong!')

final weights:  [1.000018  0.9939969 1.0040039 1.6794388]
The agent thinks bandit 4 is the most promising....
...and it was right!


# Contextual Bandit

In [15]:
import numpy as np
import tensorflow as tf
import tensorflow.contrib.slim as slim

Instructions for updating:
Use the retry module or similar alternatives.


- https://medium.com/emergent-future/simple-reinforcement-learning-with-tensorflow-part-1-5-contextual-bandits-bff01d1aad9c

In n-armed bandits, there are no environmental states and the agent learns to choose which action is best to take. Without the environmental states, the best action at any moment is also the best action always. The problem we will be looking at here contains states, but they aren't determined by the previous states or actions. Additionally, we won't be considering delayed rewards.

Contextual Bandit introduces the concept of the state. The state consists of a description of the environment that the agent can use to take more informed actions. In this problem, instead of a single bandit, there are now multiple bandits and the state of the environment tells us which bandit we are dealing with. The goal of the agent is to learn the best action not just for a single bandit, but for any number of them. Since each bandit will have different reward probabilities for each arm, our agent will need to learn to condition its action on the state of the environment.

In [12]:
class ContextualBandit:
    """
    Environment made of several bandits, each with several arms.

    The state is the index of the bandit that is currently active. Every
    row of ``bandits`` holds the arm values of one bandit, the lower the
    value of an arm, the more likely pulling it returns a positive reward.

    Attributes
    ----------
    state : int
        Index of the currently active bandit.

    bandits : 2d ndarray, shape [num_bandits, num_actions]
        Arm values of every bandit.

    num_bandits : int
        Number of bandits, i.e. number of possible states.

    num_actions : int
        Number of arms per bandit, i.e. number of possible actions.
    """

    def __init__(self):
        self.state = 0
        self.bandits = np.array([[0.2, 0, -0.0, -5],
                                 [0.1, -5, 1, 0.25],
                                 [-5, 5, 5, 5]])
        self.num_bandits, self.num_actions = self.bandits.shape

    def get_state(self):
        """
        Pick one of the bandits uniformly at random, make it the active
        one and return its index.
        """
        # use the instance's own bandit count rather than a notebook-level
        # global, which may be undefined or refer to a different problem
        self.state = np.random.randint(self.num_bandits)
        return self.state

    def pull_arm(self, action):
        """
        Pull arm ``action`` of the currently active bandit.

        A sample is drawn from a standard normal distribution (same rule
        as ``pull_bandit``), the reward is 1 if the sample is strictly
        greater than the arm's value and -1 otherwise.
        """
        bandit = self.bandits[self.state, action]
        # standard normal rather than uniform [0, 1): with a uniform draw
        # every arm whose value is <= 0 would pay out every single time
        result = np.random.randn()
        if result > bandit:
            return 1
        else:
            return -1

In [14]:
# instantiate the environment and check its dimensions: the number of
# bandits (states) followed by the number of arms (actions) per bandit
contextual_bandit = ContextualBandit()
print(contextual_bandit.num_bandits)
print(contextual_bandit.num_actions)

3
4
