## definition

In [1]:
import numpy as np

In [2]:
class MAB:
    """Multi-armed bandit environment.

    Performs the draw on whichever bandit (arm) the selection algorithm chose.
    """

    def __init__(self, bandit_probs):
        # Success probability of each bandit, indexed by bandit number.
        self.bandit_probs = bandit_probs

    def draw(self, k):
        """Pull bandit ``k`` once and return ``(reward, regret)``.

        ``reward`` is one Bernoulli sample drawn with bandit ``k``'s
        probability. ``regret`` is the gap between the largest probability
        among all bandits and the probability of bandit ``k``.
        """
        arm_prob = self.bandit_probs[k]
        outcome = np.random.binomial(1, arm_prob)
        shortfall = np.max(self.bandit_probs) - arm_prob
        return outcome, shortfall

In [157]:
# For the variant that models each bandit with a Beta distribution
def calculator(alpha, beta):
    """Return the mean and variance of a Beta(alpha, beta) distribution.

    Works element-wise when ``alpha`` and ``beta`` are NumPy arrays.
    """
    total = alpha + beta
    mean = alpha / total
    var = (alpha * beta) / ((total ** 2) * (total + 1))
    return mean, var

def epsilonGreedy(success, fail, epsilon):
    """Choose the next bandit with the epsilon-greedy rule.

    A uniform random number is drawn; if it is smaller than ``epsilon`` (or
    if no bandit has produced a success yet) a bandit is picked at random
    (exploration). Otherwise the bandit with the highest success rate
    observed so far is picked (exploitation).

    Args:
        success: 1-D array-like with the number of successes per bandit.
        fail: 1-D array-like with the number of failures per bandit.
        epsilon: Exploration probability, compared against a uniform
            sample from [0, 1).

    Returns:
        Index of the selected bandit.
    """
    success = np.asarray(success, dtype=float)
    fail = np.asarray(fail, dtype=float)

    # exploration: random choice
    if np.random.rand() < epsilon or success.sum() == 0:
        return np.random.randint(0, len(success), 1)[0]

    # exploitation: estimate each bandit's success rate from the counts.
    # A bandit that has never been pulled has 0 pulls; a plain division would
    # give 0/0 = NaN, and np.argmax returns the first NaN, so an untried
    # bandit would win over the best observed one. Guard the division and
    # give untried bandits an estimate of 0 instead.
    # NOTE: a Beta-distribution-based estimate, calculator(1 + success,
    # 1 + fail), can be used here instead of the raw success rate.
    pulls = success + fail
    mean = np.divide(success, pulls, out=np.zeros_like(success), where=pulls > 0)
    return np.argmax(mean)

## example

In [181]:
# setting
bandits_prob = [0.2, 0.3, 0.5, 0.7]  # true parameters; unknown to the algorithm
n_bandits = len(bandits_prob)
n_draws = 500

epsilon = 0.2

# Both arrays have one row per bandit and one column per draw.
count_array = np.zeros(shape=(n_bandits, n_draws))  # record of pulls
reward_array = np.zeros_like(count_array)  # record of successes (rewards)

In [182]:
# initialize: build the bandit environment from the configured probabilities
test = MAB(bandit_probs=bandits_prob)

In [183]:
# operation
for i in range(n_draws):
    # Tally per-bandit outcomes over every draw recorded so far.
    success = np.sum(reward_array, axis=1)  # number of successes
    fail = np.sum(count_array, axis=1) - success  # number of failures

    # Select a bandit, then pull it.
    k = epsilonGreedy(success, fail, epsilon)
    reward, regret = test.draw(k)

    # Update the records for this draw.
    count_array[k, i], reward_array[k, i] = 1, reward



array([  8.,   6.,  17., 276.])