<h1>Multi-armed Bandits</h1>

# 0. Elements of Reinforcement Learning

+ Policy

+ Reward Signal

+ Value Function

+ Model

# 1. $k$-armed Bandits

The value of an action $a$ is the expected reward received when that action is selected:

> $q_*(a) = \mathbb{E}[R_t|A_t = a]$

$A_t$ is the action selected on time step $t$, and $R_t$ is the corresponding reward.

In [4]:
# e-greedy
import numpy as np
from matplotlib import pyplot as plt

np.random.seed(1)

def normal_fn(mu, std):
    """Endless generator of Gaussian samples.

    Each ``next()`` draws one fresh value from ``np.random.normal`` with
    mean ``mu`` and standard deviation ``std``; the generator never ends.
    """
    while True:
        sample = np.random.normal(loc=mu, scale=std)
        yield sample

        
def exclude_sample(exclude, low=0, high=9):
    """Sample an integer uniformly from ``[low, high]`` skipping ``exclude``.

    Args:
        exclude: The value that must not be returned. Expected to satisfy
            ``low <= exclude <= high``.
        low: Smallest value that may be returned (inclusive).
        high: Largest value that may be returned (inclusive).

    Returns:
        An integer in ``[low, high]`` that differs from ``exclude``.

    Raises:
        ValueError: Propagated from ``np.random.randint`` when
            ``low >= high`` (no candidate besides ``exclude`` exists).
    """
    # randint's upper bound is exclusive, so this yields (high - low)
    # candidates -- exactly the number of values left once `exclude` is removed.
    rand_n = np.random.randint(low, high)

    # Shift every draw at or above `exclude` up by one so the gap lands on
    # `exclude` itself and all remaining values stay equally likely.
    # (Returning `exclude + 1` here would collapse the whole upper range
    # onto a single value.)
    if rand_n < exclude:
        return rand_n
    return rand_n + 1
        
        
def bandit_experience(reward_fns, n_step=2000, n_arm=10, epsilon=0.01):
    """Run one epsilon-greedy episode and return the running average reward.

    Q-value
        Expected value of each action,
        to be learned or counted as average of samples.

    Args:
        reward_fns: Indexable collection with one iterator per arm;
            ``next(reward_fns[a])`` produces the reward for pulling arm ``a``.
        n_step: Number of pulls in the episode.
        n_arm: Number of arms; only indices ``0 .. n_arm - 1`` are used.
        epsilon: Probability of replacing the greedy action with a
            different arm chosen by ``exclude_sample``.

    Returns:
        ``np.ndarray`` of length ``n_step`` whose entry ``t`` is the mean of
        all rewards received during steps ``0 .. t``.
    """
    Q_reward = np.zeros(n_arm)  # sample-average reward estimate per arm
    Q_num = np.zeros(n_arm)     # number of pulls per arm

    total_reward = 0.
    avg_rewards = []

    for step in range(n_step):
        # Greedy choice based on the current estimates (ties -> lowest index).
        action = np.argmax(Q_reward)

        ## e-greedy
        # The uniform draw is made on every step, even when exploration is
        # impossible, so the random stream does not depend on n_arm.
        explore = np.random.random() < epsilon

        # With a single arm there is no alternative to explore. The upper
        # bound follows n_arm so the sampled index is always a valid arm.
        if explore and n_arm > 1:
            action = exclude_sample(action, low=0, high=n_arm - 1)

        # Get the reward of the action and update the respective Q-value
        # as the running mean of all rewards seen for that arm.
        reward = next(reward_fns[action])

        num = Q_num[action]
        Q_reward[action] = (Q_reward[action] * num + reward) / (num + 1)
        Q_num[action] += 1

        # Running average reward over all steps so far.
        total_reward += reward
        avg_rewards.append(total_reward / (step + 1))
    return np.array(avg_rewards)

In [None]:
# Reward Function
# One endless reward stream per arm: unit standard deviation and a mean
# drawn from np.random.random() for each arm.
n_arm = 10
std = 1
reward_fns = []
for i in range(n_arm):
    mu = np.random.random()
    reward_fns += [normal_fn(mu, std)]

# Run
n_run = 1000   # independent episodes averaged per epsilon setting
n_step = 2000  # pulls per episode

# Per-step accumulators, one per epsilon setting.
rewards_1 = np.zeros(n_step)
rewards_2 = np.zeros(n_step)
rewards_3 = np.zeros(n_step)

for _ in range(n_run):
    rewards_1 += bandit_experience(reward_fns, n_step, n_arm, epsilon=0)
    rewards_2 += bandit_experience(reward_fns, n_step, n_arm, epsilon=0.01)
    rewards_3 += bandit_experience(reward_fns, n_step, n_arm, epsilon=0.1)

# Each accumulator holds the sum of n_run curves, so the mean curve is the
# sum divided by the number of runs (not by the number of steps).
rewards_1 /= n_run
rewards_2 /= n_run
rewards_3 /= n_run

# Plot
# One x value per step, derived from n_step so the axis always matches the
# length of the reward curves.
x = np.arange(1, n_step + 1)

# Raw strings keep the LaTeX backslash literal; '\e' is not a valid Python
# escape sequence and triggers a warning in non-raw strings.
plt.plot(x, rewards_1, 'b-', label=r'$\epsilon$=0 greedy')
plt.plot(x, rewards_2, 'r-', label=r'$\epsilon$=0.01')
plt.plot(x, rewards_3, 'g-', label=r'$\epsilon$=0.1')
plt.legend(loc='best')
plt.xlabel('step')
plt.ylabel('average reward')
plt.title(r'$\epsilon$-greedy')
plt.show()