In [164]:
from scipy import stats
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px 
from plotly.subplots import make_subplots
import plotly.graph_objs as go
import time
import plotly.io as pio
pio.renderers.default = 'iframe'

In [147]:
class BernouilliThompsonSampling:
    """Thompson sampling for a multi-armed bandit with Bernoulli rewards.

    Every arm carries a Beta(a, b) belief about its success probability,
    stored as an ``(a, b)`` tuple in ``prior_distributions``. Beliefs start at
    Beta(1, 1) and are updated from the 0/1 rewards that are observed.
    """

    def __init__(self, num_arms, parameters_to_estimate):
        """Create the bandit and initialise one Beta(1, 1) belief per arm.

        Args:
            num_arms: Number of arms; one belief is kept per arm.
            parameters_to_estimate: Indexable collection holding the true
                success probability of each arm (indexed by arm).
        """
        self.num_arms = num_arms
        self.parameters_to_estimate = parameters_to_estimate
        # Per-step regret values, in the order compute_regret() was called.
        self.regrets = []
        # Number of belief updates performed so far.
        self.n_iters = 0
        # Build the beliefs right away so the object is usable without a
        # separate init_priors() call (which used to be mandatory and
        # otherwise led to an AttributeError on first use).
        self.init_priors()

    def init_priors(self):
        """(Re)set every arm's belief to Beta(1, 1).

        Only the beliefs are reset; ``regrets`` and ``n_iters`` are left
        untouched.
        """
        self.prior_distributions = [(1, 1) for _ in range(self.num_arms)]

    def sample_priors(self):
        """Draw one value from each arm's current Beta belief.

        Returns:
            A list with one sample per arm, in arm order.
        """
        return [np.random.beta(a, b) for (a, b) in self.prior_distributions]

    def get_reward(self, sampled_value, chosen_arm_index):
        """Turn a random draw into a 0/1 reward for the chosen arm.

        Args:
            sampled_value: Number compared against the arm's true success
                probability. For the result to be a Bernoulli outcome with
                that probability, this must be drawn uniformly on [0, 1)
                independently of the beliefs.
            chosen_arm_index: Index of the arm that was played.

        Returns:
            1 if ``sampled_value`` is strictly below the arm's true success
            probability, otherwise 0.
        """
        return int(sampled_value < self.parameters_to_estimate[chosen_arm_index])

    def update_priors(self, arm_index, reward):
        """Update one arm's belief with an observed reward.

        Args:
            arm_index: Index of the arm that was played.
            reward: Observed reward; it is added to ``a`` and ``1 - reward``
                is added to ``b``.
        """
        current_distrib = self.prior_distributions[arm_index]
        self.prior_distributions[arm_index] = (current_distrib[0] + reward, current_distrib[1] + 1 - reward)
        self.n_iters += 1

    def compute_regret(self, chosen_arm_index):
        """Record the regret of having played ``chosen_arm_index``.

        The regret is the gap between the largest true success probability
        and the one of the chosen arm; it is appended to ``regrets``.
        """
        optimal_reward = max(self.parameters_to_estimate)
        chosen_reward = self.parameters_to_estimate[chosen_arm_index]
        regret = optimal_reward - chosen_reward
        self.regrets.append(regret)

    def show(self):
        """Display the current beliefs and the cumulative regret side by side."""
        fig = make_subplots(rows=1, cols=2, subplot_titles=("Beta distributions", "Cumulative Regret"))

        for i, (a, b) in enumerate(self.prior_distributions):
            # Plot each density between its 0.1% and 99.9% quantiles so the
            # x-range follows where the belief actually has mass.
            x = np.linspace(stats.beta.ppf(0.001, a, b), stats.beta.ppf(0.999, a, b), 200)
            beta_pdf = stats.beta.pdf(x, a, b)
            fig.add_trace(go.Scatter(x=x,
                                     y=beta_pdf,
                                     mode='lines',
                                     name=f'Theoretical_reward : {self.parameters_to_estimate[i]} - Success/Fails : {self.prior_distributions[i]}'),
                          row=1, col=1)
        fig.add_trace(go.Scatter(x=list(range(len(self.regrets))), y=np.cumsum(self.regrets), mode='lines', name='Cumulative regrets'), row=1, col=2)

        fig.update_layout(title_text=f"Itération : {self.n_iters}")
        fig.show()
         

In [3]:
# Simulation settings.
num_arms = 10      # number of bandit arms
num_iters = 500    # total number of pulls
eval_iters = 50    # redraw the figures every `eval_iters` pulls
random_seed = 0    # change to explore a different random problem / run

# Seed NumPy's global generator so Restart & Run All reproduces the same
# arm probabilities, posterior samples and rewards.
np.random.seed(random_seed)

# True success probability of each arm, unknown to the algorithm.
parameters_to_estimate = np.random.uniform(low=0, high=1, size=num_arms)
bt_sampling = BernouilliThompsonSampling(num_arms, parameters_to_estimate)
bt_sampling.init_priors()

for i in range(num_iters):
    # Thompson step: sample every belief and play the arm with the best sample.
    sampled_values = bt_sampling.sample_priors()
    best_arm_index = np.argmax(sampled_values)

    # The observed reward must be a Bernoulli draw with the arm's TRUE
    # probability, so it needs a fresh uniform number. Reusing the posterior
    # sample here would tie the outcome to the belief itself and the
    # posteriors would no longer converge to the true probabilities.
    associated_reward = bt_sampling.get_reward(sampled_value=np.random.uniform(),
                                               chosen_arm_index=best_arm_index)
    bt_sampling.update_priors(best_arm_index, associated_reward)
    bt_sampling.compute_regret(best_arm_index)

    if i % eval_iters == eval_iters - 1:
        # Short pause between successive figures.
        time.sleep(1)
        bt_sampling.show()
        