In [1]:
import numpy as np

def thompson_sampling(acts, reward_func, n = 1000, rng = None):
    """Run Thompson sampling over a set of actions with Beta posteriors.

    Every action starts with the state ``(0, 0)``. On each of the ``n`` rounds
    one value is drawn per action from ``Beta(state[0] + 1, state[1] + 1)``,
    the action with the largest draw is played, and its state is updated to
    ``(state[0] + r, state[1] + 1 - r)`` where ``r = reward_func(action)``.

    Args:
        acts: Iterable of hashable action identifiers (duplicates collapse
            into a single action because they are used as dict keys).
        reward_func: Callable taking an action and returning its reward.
            The update rule adds ``r`` to the first counter and ``1 - r`` to
            the second, so rewards are expected to lie in ``[0, 1]``;
            values outside that range can drive a Beta parameter
            non-positive, which the sampler will presumably reject.
        n: Number of rounds to play.
        rng: Optional source of randomness exposing ``beta(a, b)``, e.g. a
            ``numpy.random.Generator``. When ``None`` the global ``np.random``
            state is used, so ``np.random.seed`` keeps working as before.

    Returns:
        Dict mapping each action to its final ``(reward_sum, miss_sum)`` tuple.

    Raises:
        ValueError: If a round has to be played but ``acts`` was empty.
    """
    # Fall back to NumPy's module-level functions to stay backward-compatible.
    sampler = np.random if rng is None else rng
    states = {a: (0, 0) for a in acts}

    def action():
        if not states:
            # Without this guard max() fails with an opaque "empty" error.
            raise ValueError('acts must contain at least one action')

        # One posterior draw per action; the +1 terms give a Beta(1, 1) prior.
        bs = {a: sampler.beta(s[0] + 1, s[1] + 1) for a, s in states.items()}
        return max(bs, key = bs.get)

    for _ in range(n):
        a = action()
        r = reward_func(a)

        states[a] = (states[a][0] + r, states[a][1] + 1 - r)

    return states

In [2]:
def probability_reward_func(probs):
    """Build a 0/1 reward function from per-action success probabilities.

    Args:
        probs: Mapping from action to a probability; it is looked up with
            ``probs[a]`` on every call of the returned function.

    Returns:
        A one-argument callable that returns 1 when a fresh
        ``np.random.rand()`` draw is below ``probs[a]`` and 0 otherwise.
    """
    def reward(a):
        # Draw first, then look up the probability (same order as a single
        # `rand() < probs[a]` comparison).
        draw = np.random.rand()
        if draw < probs[a]:
            return 1
        return 0

    return reward

In [3]:
def summary(states):
    """Print one line per action with its two counters and their ratio.

    Args:
        states: Mapping from action to a sequence whose first element is
            printed as ``win`` and second as ``lose``.

    The printed ``p`` is ``s[0] / sum(s)``. For an action whose counters sum
    to zero (never selected) ``nan`` is printed instead of raising
    ``ZeroDivisionError``.
    """
    for a, s in states.items():
        total = sum(s)
        # An untried action has no observations, so its rate is undefined.
        p = s[0] / total if total else float('nan')
        print(f'{a}: win={s[0]}, lose={s[1]}, p={p}')

In [4]:
# Three-armed experiment: true success probability per action.
probs1 = dict(a=0.2, b=0.5, c=0.7)

summary(
    thompson_sampling(
        probs1.keys(),
        probability_reward_func(probs1),
    )
)

a: win=1, lose=5, p=0.16666666666666666
b: win=13, lose=19, p=0.40625
c: win=679, lose=283, p=0.7058212058212058


In [5]:
# Five-armed experiment: true success probability per action.
probs2 = dict(a=0.2, b=0.5, c=0.7, d=0.1, e=0.8)

summary(
    thompson_sampling(
        probs2.keys(),
        probability_reward_func(probs2),
    )
)

a: win=0, lose=4, p=0.0
b: win=3, lose=5, p=0.375
c: win=111, lose=40, p=0.7350993377483444
d: win=3, lose=6, p=0.3333333333333333
e: win=668, lose=160, p=0.8067632850241546
