In [3]:
from IPython.display import Image
from collections import defaultdict
from matplotlib import animation
import matplotlib.pyplot as plt
import random

In [2]:
import sys
import os
# Add the absolute path of the parent directory to the module search path
# (presumably so project-local modules one level up can be imported; confirm).
sys.path.append(os.path.abspath('../'))

## Bandit

- 什么叫多臂老虎机（Multi-Armed Bandit，MAB）
    - 一个拥有 $N$ 根拉杆 （arms）的老虎机，拉动每一根拉杆都对应一个关于奖励的概率分布 $\mathcal R$;
    - 每次拉动其中一根拉杆，就可以从该拉杆对应的概率分布 $\mathcal R$ 中获得一个奖励 $r$（observation，观测值），
        - $r\sim \mathcal R$
    - 序列决策问题：我们在各根拉杆奖励的概率分布 $\mathcal R$ 未知的情况下，从头开始尝试，目标是在操作 $T$ 次拉杆后获得尽可能高的累积奖励（cumulative rewards）
- 仍属于 value-based methods
    - 状态/action：价值的度量；
- 多臂老虎机是后续学习 MCTS 的基础；
    - 蕴含了很多搜索的思想；
    - double EE：exploration & exploitation
        - epsilon greedy；
        - exploration：探索拉杆的获奖概率；
        - exploitation：根据经验选择获奖最多的拉杆；
    - 序列决策问题？？
- representation 表示上；
    - state：Dummy state 1；
    - action：index of arms，有多少个臂，就意味着有多大的动作空间；

## 问题定义

- $N$ armed 

In [1]:
class MultiArmedBandit:
    """Base class for multi-armed bandit action-selection strategies.

    Subclasses must override ``select``. ``reset`` restores the strategy to
    its freshly constructed state and may be overridden as well.
    """

    def select(self, state, actions, qfunction):
        """Select an action for this state from a list, given a Q-function.

        Args:
            state: The current state.
            actions: The list of candidate actions to choose from.
            qfunction: The Q-function (value estimates) to consult.

        Raises:
            NotImplementedError: Always; this method is abstract and has to
                be provided by a subclass.
        """
        raise NotImplementedError("subclasses must implement select()")

    def reset(self):
        """Reset a multi-armed bandit to its initial configuration.

        This re-runs ``__init__`` without arguments, so a subclass whose
        constructor has required parameters needs to override this method.
        """
        self.__init__()

In [6]:
# Display the image at ./imgs/mab_algo.png with width 600
# (the bandit algorithm figure, judging by the file name).
Image(url='./imgs/mab_algo.png', width=600)

In [2]:
def run_bandit(bandit, episodes=200, episode_length=500):
    """Simulate a 5-armed bandit and collect the reward of every pull.

    Every episode begins with ``bandit.reset()`` and starts from fresh pull
    counts and a fresh Q-table. On each step the arm is chosen through
    ``bandit.select(state, actions, q_table)``; the chosen arm pays 5 with
    its success probability and 0 otherwise, and the Q-value of that arm is
    updated to the running mean of the rewards it has produced so far.

    Args:
        bandit: Object providing ``reset()`` and
            ``select(state, actions, qtable)``; ``select`` must return one
            of the arm indices in ``[0, 1, 2, 3, 4]``.
        episodes: Number of independent episodes to simulate.
        episode_length: Number of arm pulls per episode.

    Returns:
        A flat list holding one reward (0 or 5) per pull, with the episodes
        concatenated in order, so its length is ``episodes * episode_length``.
        NOTE(review): rewards are not grouped per episode; confirm that
        callers expect the flat layout.
    """
    # 5 arms, i.e. 5 actions.
    actions = [0, 1, 2, 3, 4]
    # Per-arm probability of paying out `payout`; with probability 1 - p the
    # reward is 0.
    probabilities = [0.1, 0.3, 0.7, 0.2, 0.1]
    payout = 5
    # The bandit problem has a single placeholder state.
    dummy_state = 0
    rewards = []

    for _ in range(episodes):
        bandit.reset()
        # Number of times each arm was pulled in this episode.
        pull_counts = defaultdict(int)
        # Value estimates keyed by (state, action); unseen pairs start at 0.0.
        q_table = defaultdict(float)
        episode_rewards = []
        for _step in range(episode_length):
            action = bandit.select(dummy_state, actions, q_table)
            reward = payout if random.random() < probabilities[action] else 0
            episode_rewards.append(reward)
            pull_counts[action] += 1
            # Incremental mean: Q <- Q + (r - Q) / N.
            key = (dummy_state, action)
            q_table[key] += (reward - q_table[key]) / pull_counts[action]
        rewards.extend(episode_rewards)
    return rewards

In [4]:
def get_max_q(qtable, state, actions):
    """Find the action with the largest Q-value in ``state``.

    Args:
        qtable: Mapping indexed by ``(state, action)`` tuples.
        state: The state whose actions are compared.
        actions: Iterable of candidate actions, scanned in order.

    Returns:
        A ``(best_action, best_value)`` tuple. On ties the earliest action
        wins. If no action has a value strictly greater than negative
        infinity (including when ``actions`` is empty), the result is
        ``(None, float('-inf'))``.
    """
    # best_action is the argmax, best_value the corresponding maximum.
    best_action, best_value = None, float('-inf')
    for candidate in actions:
        value = qtable[(state, candidate)]
        if value > best_value:
            best_action, best_value = candidate, value
    return best_action, best_value

In [7]:
class EpsilonGreedy(MultiArmedBandit):
    """Epsilon-greedy strategy: explore with probability ``epsilon``,
    otherwise exploit the action with the highest Q-value."""

    def __init__(self, epsilon=0.1):
        """Store the exploration probability.

        Args:
            epsilon: Probability of picking a uniformly random action
                instead of the greedy one.
        """
        self.epsilon = epsilon

    def reset(self):
        """Do nothing: the only attribute, ``epsilon``, is kept between
        episodes."""
        pass

    def select(self, state, actions, qtable):
        """Choose an action for ``state``.

        Args:
            state: The current state.
            actions: The list of candidate actions.
            qtable: Mapping indexed by ``(state, action)`` tuples.

        Returns:
            A random element of ``actions`` with probability ``epsilon``,
            otherwise the action returned by ``get_max_q`` as the argmax.
        """
        # Explore.
        if random.random() < self.epsilon:
            return random.choice(actions)
        # Exploit: take the action with the best current estimate.
        arg_max_q, _ = get_max_q(qtable, state, actions)
        return arg_max_q