In [None]:
import matplotlib.pyplot as plt
import numpy as np

In [None]:
#Multi-Armed Bandit
#We have K = 9 coins with (unknown) success probabilities 0.1, 0.2, ..., 0.9.
#np.linspace fixes the number of points explicitly; np.arange with a float step
#can return an unexpected number of elements because of floating-point rounding.
pvals = np.linspace(0.1, 0.9, 9)
K = len(pvals)
print(K)
T = 1000 #Number of rounds that we will play for
#Seed the global NumPy generator so that "Restart & Run All" reproduces the same
#numbers; change or remove the seed to see a different realisation.
np.random.seed(0)

In [None]:
#In each round, we pick a coin and toss it: the reward is 1 with probability
#pvals[i_pick] and 0 otherwise.
i_pick = 3
#Index pvals with the picked coin itself (not i_pick+1), as every later cell does
i_reward = np.random.binomial(1, pvals[i_pick])
print(i_reward)

In [None]:
#Baseline strategy: in each of the T rounds, toss a coin chosen uniformly at random.
n_coins = len(pvals)
cumu_reward = 0
for tme in range(T):
    i_pick = np.random.choice(n_coins)            #index drawn uniformly from 0..n_coins-1
    p_success = pvals[i_pick]
    i_reward = np.random.binomial(1, p_success)   #0/1 reward of the chosen coin
    cumu_reward += i_reward
print(cumu_reward)

In [None]:
#Quick numeric check: the value of 50*log(1/delta) at delta = 0.1 (displayed as the cell output).
delta = 0.1
inv_delta = 1/delta
50*np.log(inv_delta)

In [None]:
#Explore then Commit Algorithm:
#This cell runs the exploration phase only: every coin is tossed m times in
#round-robin order, which takes m*K rounds in total.
m = 50
allrounds_ETC = np.zeros((K, T))     #entry [i, t] is set to 1 when coin i is tossed in round t+1
allrewards_ETC = np.zeros((K, T))    #entry [i, t] stores the reward of coin i in round t+1
cumu_reward_ETC = 0
n_explore = m*K
#Exploration phase:
for tme in range(1, n_explore + 1):
    col = tme - 1                    #0-based column for the 1-based round number tme
    i_pick = col % K                 #cycle through coins 0, 1, ..., K-1
    i_reward = np.random.binomial(1, pvals[i_pick])
    allrounds_ETC[i_pick, col] = 1
    allrewards_ETC[i_pick, col] = i_reward
    cumu_reward_ETC += i_reward
print(cumu_reward_ETC)

#Number of tosses per coin (each coin has been tossed m times at this point)
coin_sums = np.sum(allrounds_ETC, axis=1)
print(coin_sums)

#The same total reward, read off the reward matrix instead of the running counter
cumu_reward_alternative = np.sum(allrewards_ETC)
print(cumu_reward_alternative)

#Empirical success frequency of each coin: rewards collected / number of tosses
num_tosses = np.sum(allrounds_ETC, axis=1)
reward_coins = np.sum(allrewards_ETC, axis=1)
phat = np.divide(reward_coins, num_tosses)
print(phat)

In [None]:
#UCB Algorithm
#First explore as in  ETC: this cell continues on allrounds_ETC / allrewards_ETC,
#whose first m*K columns were filled by the exploration phase.
n_explore = m*K
#Clear the post-exploration columns so that re-running this cell does not count
#picks and rewards left over from a previous execution (no-op on a fresh run).
allrounds_ETC[:, n_explore:] = 0
allrewards_ETC[:, n_explore:] = 0
#Per-coin running totals, updated after every toss instead of re-summing the
#whole (K, T) matrices in each round.
num_tosses = allrounds_ETC.sum(axis=1)
reward_coins = allrewards_ETC.sum(axis=1)
#With delta = 1 the bonus sqrt(log(1/delta)/(2*num_tosses)) is zero, so ucb equals
#phat and the coin with the best empirical frequency is played.
#Use delta = 1/(tme ** 3) inside the loop for a non-zero exploration bonus.
delta = 1
for tme in range(n_explore + 1, T + 1):
    phat = reward_coins/num_tosses
    ucb = phat + np.sqrt((np.log(1/delta))/(2*num_tosses))
    i_pick = np.argmax(ucb)
    allrounds_ETC[i_pick, tme-1] = 1
    i_reward = np.random.binomial(1, pvals[i_pick])
    allrewards_ETC[i_pick, tme-1] = i_reward
    num_tosses[i_pick] += 1
    reward_coins[i_pick] += i_reward
total_reward = allrewards_ETC.sum()
print(total_reward)

In [None]:
#Full UCB Algorithm:
#Toss every coin m times first (m*K rounds), then in each remaining round play
#the coin with the largest upper confidence bound.
m = 1
allrounds_UCB = np.zeros((K, T))
allrewards_UCB = np.zeros((K, T))
cumu_reward_UCB = 0
n_explore = m*K
#Exploration phase:
for tme in range(1, n_explore + 1):
    col = tme - 1                    #0-based column for the 1-based round number tme
    i_pick = col % K
    i_reward = np.random.binomial(1, pvals[i_pick])
    allrounds_UCB[i_pick, col] = 1
    allrewards_UCB[i_pick, col] = i_reward
    cumu_reward_UCB += i_reward
#UCB phase:
for tme in range(n_explore + 1, T + 1):
    col = tme - 1
    delta = 1/(tme ** 3)             #delta shrinks with the round number (delta = 1 would give zero bonus)
    num_tosses = np.sum(allrounds_UCB, axis=1)
    reward_coins = np.sum(allrewards_UCB, axis=1)
    phat = reward_coins/num_tosses
    bonus = np.sqrt((np.log(1/delta))/(2*num_tosses))
    ucb = phat + bonus
    i_pick = np.argmax(ucb)
    i_reward = np.random.binomial(1, pvals[i_pick])
    allrounds_UCB[i_pick, col] = 1
    allrewards_UCB[i_pick, col] = i_reward
total_reward = np.sum(allrewards_UCB)
print(total_reward)

In [None]:
#Number of rounds in which each coin was played by UCB
picks_per_coin_UCB = np.sum(allrounds_UCB, axis=1)
print(picks_per_coin_UCB)

In [None]:
#Cumulative Reward
#Summing over coins (axis 0) gives the reward obtained in each round;
#cumsum turns that into a running total over rounds.
cumu_reward_UCB = np.cumsum(allrewards_UCB.sum(axis = 0))

plt.plot(cumu_reward_UCB)
plt.xlabel('Round')
plt.ylabel('Cumulative Reward')
plt.title('UCB Algorithm')  #the plotted curve comes from the UCB run, not Explore-then-Commit
plt.grid(True)
plt.show()

In [None]:
#Thompson Sampling
#In every round draw one value per coin from Beta(rewards + 1, tosses - rewards + 1)
#and toss the coin whose draw is the largest.
allrounds_TS = np.zeros((K, T))
allrewards_TS = np.zeros((K, T))
for tme in range(1, T+1):
    col = tme - 1                                   #0-based column for the 1-based round number tme
    num_tosses = np.sum(allrounds_TS, axis=1)
    reward_coins = np.sum(allrewards_TS, axis=1)
    alpha_params = reward_coins + 1                 #rewards so far, plus one
    beta_params = num_tosses - reward_coins + 1     #tosses without reward so far, plus one
    samples = [np.random.beta(a, b) for a, b in zip(alpha_params, beta_params)]
    i_pick = np.argmax(samples)
    i_reward = np.random.binomial(1, pvals[i_pick])
    allrounds_TS[i_pick, col] = 1
    allrewards_TS[i_pick, col] = i_reward
total_reward_TS = np.sum(allrewards_TS)
print(total_reward_TS)

In [None]:
#Number of rounds in which each coin was played by Thompson Sampling
picks_per_coin_TS = np.sum(allrounds_TS, axis=1)
print(picks_per_coin_TS)

In [None]:
#Plotting cumulative Reward
#Per-round reward of TS (sum over coins), accumulated over rounds
reward_per_round_TS = np.sum(allrewards_TS, axis=0)
cumu_reward_TS = np.cumsum(reward_per_round_TS)

#Draw both cumulative-reward curves on one set of axes
fig, ax = plt.subplots()
ax.plot(cumu_reward_UCB, color='blue', label='UCB')
ax.plot(cumu_reward_TS, color='red', label='TS')
ax.set_xlabel('Round')
ax.set_ylabel('Cumulative Reward')
ax.set_title('Cumulative Rewards of UCB and TS')
ax.legend()
ax.grid(True)
plt.show()

In [None]:
#Plotting Regrets (instead of Rewards):
#Regret after t rounds = t * (largest success probability) - reward collected in the first t rounds
best_pval = np.max(pvals)
round_numbers = np.arange(1, T+1)
cumu_regret_UCB = round_numbers*best_pval - cumu_reward_UCB
cumu_regret_TS = round_numbers*best_pval - cumu_reward_TS

#Draw both cumulative-regret curves on one set of axes
fig, ax = plt.subplots()
ax.plot(cumu_regret_UCB, color='blue', label='UCB')
ax.plot(cumu_regret_TS, color='red', label='TS')
ax.set_xlabel('Round')
ax.set_ylabel('Cumulative Regret')
ax.set_title('Cumulative Regrets of UCB and TS')
ax.legend()
ax.grid(True)
plt.show()

In [None]:
#Averaging over multiple simulation runs:
#To get smooth regrets, we can average individual regrets over multiple simulation runs:

n_sims = 80
#Cumulative reward benchmark: round number times the largest success probability
best_cumu_reward = np.arange(1, T+1)*np.max(pvals)

#UCB:
m = 1
allsims_UCB = np.zeros((n_sims, T))
for sim in range(n_sims):
    allrounds_UCB = np.zeros((K, T))
    allrewards_UCB = np.zeros((K, T))
    #Per-coin running totals, updated after every toss instead of re-summing
    #the whole (K, T) matrices in each round.
    num_tosses = np.zeros(K)
    reward_coins = np.zeros(K)
    for tme in range(T):
        if tme < m*K:
            #Exploration phase: round-robin over the coins
            i_pick = tme % K
        else:
            phat = reward_coins/num_tosses
            #tme is 0-based in this loop, so the round number is tme + 1
            #(this matches delta = 1/(round ** 3) in the single-run UCB cell).
            delta = 1/((tme + 1) ** 3)
            ucb = phat + np.sqrt((np.log(1/delta))/(2*num_tosses))
            i_pick = np.argmax(ucb)
        i_reward = np.random.binomial(1, pvals[i_pick])
        allrounds_UCB[i_pick, tme] = 1
        allrewards_UCB[i_pick, tme] = i_reward
        num_tosses[i_pick] += 1
        reward_coins[i_pick] += i_reward
    cumu_reward_UCB = np.cumsum(allrewards_UCB.sum(axis = 0))
    cumu_regret_UCB = best_cumu_reward - cumu_reward_UCB
    allsims_UCB[sim,:] = cumu_regret_UCB
average_cumu_regret_UCB = np.mean(allsims_UCB, axis = 0)

#For Thompson Sampling:
allsims_TS = np.zeros((n_sims, T))
for sim in range(n_sims):
    allrounds_TS = np.zeros((K, T))
    allrewards_TS = np.zeros((K, T))
    num_tosses = np.zeros(K)
    reward_coins = np.zeros(K)
    for tme in range(T):
        #One draw per coin from Beta(rewards + 1, tosses - rewards + 1), vectorized over coins
        samples = np.random.beta(reward_coins + 1, num_tosses - reward_coins + 1)
        i_pick = np.argmax(samples)
        i_reward = np.random.binomial(1, pvals[i_pick])
        allrounds_TS[i_pick, tme] = 1
        allrewards_TS[i_pick, tme] = i_reward
        num_tosses[i_pick] += 1
        reward_coins[i_pick] += i_reward
    cumu_reward_TS = np.cumsum(allrewards_TS.sum(axis = 0))
    cumu_regret_TS = best_cumu_reward - cumu_reward_TS
    allsims_TS[sim,:] = cumu_regret_TS
average_cumu_regret_TS = np.mean(allsims_TS, axis = 0)

plt.plot(average_cumu_regret_UCB, color = 'blue', label = 'UCB')
plt.plot(average_cumu_regret_TS, color = 'red', label = 'TS')
plt.xlabel('Round')
plt.ylabel('Average Regret')
plt.title('Averaged Cumulative Regrets of UCB and TS')
plt.legend()
plt.grid(True)
plt.show()