In [21]:
import numpy as np
from numpy import random
from random import choices

In [44]:
class Stimuli:
    """One bandit arm whose rewards are drawn from a normal distribution.

    Attributes:
        mu:  value passed to ``random.normal`` as the mean of the rewards.
        dev: value passed to ``random.normal`` as the standard deviation.
    """

    def __init__(self, mu, dev):
        self.mu, self.dev = mu, dev

    def sample(self):
        """Draw a single reward from the arm's distribution and return it."""
        # `random` is the module-level name bound by `from numpy import random`.
        reward = random.normal(loc=self.mu, scale=self.dev)
        return reward

def ucb1(candidates, total_time):
    """Play a multi-armed bandit with the UCB1 rule and return the total reward.

    Every arm is pulled once to initialise its mean, then at each remaining
    step ``t`` the arm maximising ``mean + sqrt(2 * ln(t) / pulls)`` is pulled.

    Args:
        candidates: sequence of arms; each must expose ``sample()`` returning
            a numeric reward.
        total_time: total number of pulls allowed, warm-up round included.

    Returns:
        Sum of the rewards of all pulls (``0.0`` when ``total_time <= 0``).

    Raises:
        ValueError: if ``candidates`` is empty while ``total_time`` is positive.
    """
    if total_time <= 0:
        return 0.0
    n_arms = len(candidates)
    if n_arms == 0:
        raise ValueError("ucb1 needs at least one candidate")

    # Running statistics per arm; avoids re-averaging a growing history each step.
    pulls = np.zeros(n_arms)
    reward_sums = np.zeros(n_arms)
    total_reward = 0.0

    # Warm-up: one pull per arm. These pulls consume time steps (the main loop
    # starts at t = n_arms), so their rewards belong in the total as well.
    # The round is cut short if the budget is smaller than the number of arms.
    for arm in range(min(n_arms, total_time)):
        reward = candidates[arm].sample()
        pulls[arm] += 1
        reward_sums[arm] += reward
        total_reward += reward

    for t in range(n_arms, total_time):
        # Empirical mean plus the upper-confidence exploration bonus.
        upper_bounds = reward_sums / pulls + np.sqrt(2 * np.log(t) / pulls)
        # argmax already yields the arm index; ties resolve to the first arm.
        arm = int(np.argmax(upper_bounds))
        reward = candidates[arm].sample()
        pulls[arm] += 1
        reward_sums[arm] += reward
        total_reward += reward

    return total_reward

def softmax(candidates, total_time, temperature=1.0):
    """Play a multi-armed bandit with softmax (Boltzmann) exploration.

    Every arm is pulled once to initialise its mean, then at each remaining
    step an arm is drawn with probability proportional to
    ``exp(mean / temperature)``.

    Args:
        candidates: sequence of arms; each must expose ``sample()`` returning
            a numeric reward.
        total_time: total number of pulls allowed, warm-up round included.
        temperature: positive scale of the exploration; smaller values
            concentrate the choice on the arm with the best mean.
            Defaults to 1.0.

    Returns:
        Sum of the rewards of all pulls (``0.0`` when ``total_time <= 0``).

    Raises:
        ValueError: if ``temperature`` is not positive, or if ``candidates``
            is empty while ``total_time`` is positive.
    """
    if temperature <= 0:
        raise ValueError("temperature must be positive")
    if total_time <= 0:
        return 0.0
    n_arms = len(candidates)
    if n_arms == 0:
        raise ValueError("softmax needs at least one candidate")

    # Running statistics per arm; avoids re-averaging a growing history each step.
    pulls = np.zeros(n_arms)
    reward_sums = np.zeros(n_arms)
    total_reward = 0.0

    # Warm-up: one pull per arm. These pulls consume time steps (the main loop
    # starts at n_arms), so their rewards belong in the total as well.
    # The round is cut short if the budget is smaller than the number of arms.
    for arm in range(min(n_arms, total_time)):
        reward = candidates[arm].sample()
        pulls[arm] += 1
        reward_sums[arm] += reward
        total_reward += reward

    arm_ids = range(n_arms)
    for _ in range(n_arms, total_time):
        means = reward_sums / pulls
        # Normalise by the sum of the exponentials (not the sum of the means),
        # so the weights stay valid probabilities even for negative rewards.
        # Subtracting the max first keeps exp() from overflowing.
        exp_scores = np.exp((means - means.max()) / temperature)
        probabilities = exp_scores / exp_scores.sum()
        # Draw the arm index directly instead of searching the list afterwards.
        arm = choices(arm_ids, weights=probabilities.tolist())[0]
        reward = candidates[arm].sample()
        pulls[arm] += 1
        reward_sums[arm] += reward
        total_reward += reward

    return total_reward

def logistic_noise(candidates,total_time):
    """Placeholder for a further bandit strategy; not implemented yet.

    The signature mirrors ``ucb1`` and ``softmax``. The definition previously
    had no body, which is a syntax error that prevents the whole cell from
    running, so it now fails explicitly when called.

    Raises:
        NotImplementedError: always.
    """
    # TODO: implement the strategy; the intended algorithm is not specified here.
    raise NotImplementedError("logistic_noise is not implemented yet")

In [38]:
# Experiment setup: nine arms with the means below and a shared variance of 0.0005.
mu_vals = [
    0.3922, 0.6555, 0.1712,
    0.7060, 0.1300, 0.2769,
    0.0462, 0.0971, 0.8235,
]
sig_vals = [np.sqrt(0.0005)] * len(mu_vals)   # same standard deviation for every arm
mu_sig = zip(mu_vals, sig_vals)               # one-shot iterator, consumed on the next line
candidates = [Stimuli(*pair) for pair in mu_sig]
total_time = 500
# Despite its name this is a total: the best arm's mean times the number of steps.
optimal_average = total_time * max(mu_vals)


In [47]:
# Total reward of UCB1 over 10 repeated runs on the same candidates.
ucb1_vals = [ucb1(candidates, total_time) for _ in range(10)]
ucb1_mean_total = np.mean(ucb1_vals)
# Mean regret = value lost compared to the optimal total, normalised by that total -> lower is better
print('UCB1 Mean Regret: %s' % ((optimal_average - ucb1_mean_total) / optimal_average))
# Average reward per time step.
print(ucb1_mean_total / total_time)

UCB1 Mean Regret: 0.7055503778490656
0.24247926384129448


In [48]:
# Total reward of softmax over 10 repeated runs on the same candidates.
softmax_vals = [softmax(candidates, total_time) for _ in range(10)]
softmax_mean_total = np.mean(softmax_vals)
# Mean regret = value lost compared to the optimal total, normalised by that total -> lower is better
print('Softmax Mean Regret: %s' % ((optimal_average - softmax_mean_total) / optimal_average))
# Average reward per time step.
print(softmax_mean_total / total_time)

Softmax Mean Regret: 0.7171119672735508
0.2329582949502309
