# Non-stationary Multi-Armed Bandit

Exercise 2.5 (programming): Design and conduct an experiment to demonstrate the difficulties that
sample-average methods have for nonstationary problems. Use a modified version of the 10-armed
testbed in which all the $q_*(a)$ start out equal and then take independent random walks (say, by adding
a normally distributed increment with mean zero and standard deviation 0.01 to all the $q_*(a)$ on each
step). Prepare plots like Figure 2.2 for an action-value method using sample averages, incrementally
computed, and another action-value method using a constant step-size parameter, α = 0.1. Use ε = 0.1
and longer runs, say of 10,000 steps.

In [1]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [18]:
class MultiArmedBandit():
    """Non-stationary bandit whose true arm means follow independent random walks.

    All arm means start at zero. Every call to ``pull_arm`` returns a noisy
    reward for the chosen arm and then perturbs every arm mean with Gaussian
    noise, so the best arm can change over time.

    Args:
        n_arms: Number of arms. Defaults to 10.
        walk_std: Standard deviation of the per-step Gaussian increment added
            to every arm mean. Defaults to 0.1, the value this notebook has
            always used; the exercise statement at the top of the notebook
            suggests 0.01.
    """

    def __init__(self, n_arms=10, walk_std=0.1):
        self._n_arms = n_arms
        self._walk_std = walk_std
        self._means = np.zeros(n_arms)

    def pull_arm(self, arm):
        """Pull ``arm`` and advance the random walk by one step.

        Args:
            arm: Index of the arm to pull.

        Returns:
            A tuple ``(reward, optimal_action)`` where ``reward`` is the arm's
            current mean plus unit-variance Gaussian noise and
            ``optimal_action`` is the index of the arm with the highest mean
            *before* the random-walk step is applied.
        """
        # Record the best arm first so it refers to the same means that
        # generated the reward.
        optimal_action = np.argmax(self._means)
        reward = np.random.randn() + self._means[arm]
        self._means += self._walk_std * np.random.randn(self._n_arms)
        return (reward, optimal_action)

In [22]:
def compute_action(eps, action_values):
    """Choose an arm epsilon-greedily.

    Args:
        eps: Probability of picking an arm uniformly at random instead of
            the greedy one.
        action_values: Current value estimate for each arm; its length
            determines how many arms can be chosen.

    Returns:
        Index of the selected arm.
    """
    if np.random.rand() < eps:
        # Explore: draw from however many arms there are rather than
        # assuming a fixed count of 10.
        return np.random.randint(len(action_values))
    # Exploit: np.argmax breaks ties in favour of the lowest index.
    return np.argmax(action_values)
    
def update_action_values_sample_average(action_values, action_selected, next_action, next_reward):
    """Apply one incremental sample-average update, in place.

    Args:
        action_values: Per-arm value estimates; the entry for
            ``next_action`` is moved towards ``next_reward``.
        action_selected: Per-arm counters. The current counter of
            ``next_action`` is used as the divisor of the step size and is
            then incremented by one.
        next_action: Index of the arm that was pulled.
        next_reward: Reward observed for that pull.
    """
    # Step size is the reciprocal of the counter *before* it is bumped.
    step_size = 1 / action_selected[next_action]
    error = next_reward - action_values[next_action]
    action_values[next_action] += step_size * error
    action_selected[next_action] += 1
    
def update_action_values_constant_stepsize(action_values, next_action, next_reward, alpha):
    """Apply one constant step-size update, in place.

    The estimate for ``next_action`` is moved a fraction ``alpha`` of the way
    towards ``next_reward``. No selection counter is needed for this rule, so
    the function touches nothing but ``action_values``.

    Args:
        action_values: Per-arm value estimates, modified in place.
        next_action: Index of the arm that was pulled.
        next_reward: Reward observed for that pull.
        alpha: Step size applied to the prediction error.
    """
    action_values[next_action] += alpha * (next_reward - action_values[next_action])
    
def update_statistics(rewards, optimal_action_chosen, next_reward, next_action, optimal_action, eps, bandit, step):
    """Record the outcome of one time step in the result arrays, in place.

    Args:
        rewards: Array indexed as ``[step, bandit, eps]`` that receives
            ``next_reward``.
        optimal_action_chosen: Array with the same indexing; the entry is set
            to 1 when the chosen arm was the optimal one and is left
            untouched otherwise.
        next_reward: Reward observed at this step.
        next_action: Arm that was pulled.
        optimal_action: Arm that was optimal at this step.
        eps: Index along the last axis of both arrays (used purely as an
            index here, not as an exploration probability).
        bandit: Index of the current run along the second axis.
        step: Index of the current time step along the first axis.
    """
    cell = (step, bandit, eps)
    picked_best = next_action == optimal_action
    if picked_best:
        optimal_action_chosen[cell] = 1
    rewards[cell] = next_reward

In [24]:
# Experiment configuration.
n_timesteps = 1000  # the exercise statement suggests 10,000; this run uses 1,000
n_runs = 2000       # number of independent bandit problems per method
eps = 0.1           # exploration probability shared by both methods
alpha = 0.1         # step size of the constant step-size method

# Index of each method along the last axis of the result arrays.
SAMPLE_AVERAGE = 0
CONSTANT_STEPSIZE = 1

# Results are indexed as [time step, run, method].
rewards = np.zeros((n_timesteps, n_runs, 2))
optimal_action_chosen = np.zeros((n_timesteps, n_runs, 2))

# sample-average
for bandit in range(n_runs):
    mba = MultiArmedBandit()
    action_values = np.zeros(10)
    # Counters start at one so the first update of an arm uses step size 1/1.
    action_selected = np.ones(10)
    for step in range(n_timesteps):
        next_action = compute_action(eps, action_values)
        (next_reward, optimal_action) = mba.pull_arm(next_action)
        update_action_values_sample_average(action_values, action_selected, next_action, next_reward)
        update_statistics(rewards, optimal_action_chosen, next_reward, next_action, optimal_action,
                          SAMPLE_AVERAGE, bandit, step)

# constant stepsize
for bandit in range(n_runs):
    mba = MultiArmedBandit()
    action_values = np.zeros(10)
    action_selected = np.ones(10)
    for step in range(n_timesteps):
        next_action = compute_action(eps, action_values)
        (next_reward, optimal_action) = mba.pull_arm(next_action)
        update_action_values_constant_stepsize(action_values, next_action, next_reward, alpha)
        update_statistics(rewards, optimal_action_chosen, next_reward, next_action, optimal_action,
                          CONSTANT_STEPSIZE, bandit, step)

In [None]:
# Column 0 of `rewards` holds the sample-average runs and column 1 the
# constant step-size runs. Average over runs (axis 1), then take the running
# mean over time, so row t is the mean reward per step over the first t+1 steps.
mean_reward_per_step = np.mean(rewards, axis=1)
steps = np.arange(1, n_timesteps + 1)
average_rewards = np.cumsum(mean_reward_per_step, axis=0) / steps[:, np.newaxis]

fig = plt.figure(figsize=(12,12))
for i in range(2):
    plt.plot(steps, average_rewards[:, i])
plt.title('Average reward: sample-average vs constant step-size action values (eps = 0.1)')
plt.xlabel('Time steps')
plt.ylabel('Average Reward')
plt.legend(['Sample average', 'Constant step size (alpha = 0.1)'])

In [None]:
# Column 0 of `optimal_action_chosen` holds the sample-average runs and
# column 1 the constant step-size runs. Count optimal picks cumulatively over
# time, average over runs (axis 1), then divide by the number of elapsed steps
# to get the running fraction of optimal actions.
cumulative_hits = np.cumsum(optimal_action_chosen, axis=0)
mean_cumulative_hits = np.mean(cumulative_hits, axis=1)
steps = np.arange(1, n_timesteps + 1)
sum_optimal_actions_chosen = mean_cumulative_hits / steps[:, np.newaxis]

fig = plt.figure(figsize=(12,12))
for i in range(2):
    plt.plot(steps, 100 * sum_optimal_actions_chosen[:, i])
plt.title('Percentage of optimal action taken: sample-average vs constant step-size action values (eps = 0.1)')
plt.xlabel('Time steps')
plt.ylabel('% of Optimal Action')
plt.legend(['Sample average', 'Constant step size (alpha = 0.1)'])