In [None]:
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import clear_output
from time import sleep

%matplotlib inline

# Naive bandits
## $\epsilon$-greedy and UCB1

In [None]:
def test_eps_greedy(n_steps, bandits, eps=.01):
    rew_rng = np.random.default_rng(seed)
    exp_rng = np.random.default_rng(seed)
    
    n_bandits = bandits.shape[0]
    R = np.zeros(n_bandits)
    N = np.zeros(n_bandits) + 1e-5
    
    optimal = bandits[np.argmax(bandits)]
    every_k = max(n_steps / 1000, 1)
    cum_regret, regret = 0, []

    for i in range(n_steps):
        avg_ret = R/N

        if exp_rng.random() <= eps:
            # rand exploration
            a = exp_rng.choice(n_bandits)
        else:
            a = np.argmax(avg_ret)
        
        x = rew_rng.random()
        r = 1. if x <= bandits[a] else 0.
        r_opt = 1. if x <= optimal else 0.
        cum_regret += r_opt - r
        
        R[a] += r
        N[a] += 1
        
        if i % every_k == 0:
            regret.append(cum_regret)        
        
    return regret, R, N

# Compare three exploration rates on one randomly drawn 4-armed bandit.
seed = 1337
np.random.seed(seed)
n_bandits = 4
n_steps = 10000

bandits = np.random.random_sample(n_bandits)
print(bandits)

for eps, label in ((.01, '.01'), (.05, '.05'), (.2, '.2')):
    regret, R, N = test_eps_greedy(n_steps, bandits, eps)
    print(R/N, R, N)
    _ = plt.plot(regret, label=label)
plt.legend()

In [None]:
def test_ucb1(n_steps, bandits, eps=.01):
    rew_rng = np.random.default_rng(seed)
    
    n_bandits = bandits.shape[0]
    R = np.zeros(n_bandits)
    N = np.zeros(n_bandits) + 1e-5
    
    optimal = bandits[np.argmax(bandits)]
    every_k = max(n_steps / 1000, 1)
    cum_regret, regret = 0, []

    for i in range(1, n_steps+1):
        avg_ret = R/N
        U = (2 * np.log(i) / N)**.5
        
        a = np.argmax(avg_ret + U)
        
        x = rew_rng.random()
        r = 1. if x <= bandits[a] else 0.
        r_opt = 1. if x <= optimal else 0.
        cum_regret += r_opt - r
        
        R[a] += r
        N[a] += 1
        
        if i % every_k == 0:
            regret.append(cum_regret)        
        
    return regret, R, N

# Run UCB1 alone on the same randomly drawn 4-armed bandit.
seed = 1337
n_bandits = 4
n_steps = 10000
np.random.seed(seed)
bandits = np.random.random_sample(n_bandits)
print(bandits)

# UCB1 has no exploration rate, so no eps is passed and the curve is
# labelled by algorithm rather than by an eps value.
regret, R, N = test_ucb1(n_steps, bandits)
print(R/N, R, N)

_ = plt.plot(regret, label='ucb1')
plt.legend()

In [None]:
# Short-horizon comparison: eps-greedy at three rates against UCB1.
seed = 1337
np.random.seed(seed)
n_bandits = 4
n_steps = 1000

bandits = np.random.random_sample(n_bandits)
print(bandits)

for eps, label in ((.01, '.01'), (.05, '.05'), (.2, '.2')):
    regret, R, N = test_eps_greedy(n_steps, bandits, eps)
    print(R/N, R, N)
    _ = plt.plot(regret, label=label)

regret, R, N = test_ucb1(n_steps, bandits)
print(R/N, R, N)

# Label fixed: the curve is UCB1 (was misspelled 'usb').
_ = plt.plot(regret, label='ucb1')

plt.legend()

In [None]:
# eps-greedy (eps=.05) against UCB1 on a fixed 5-armed bandit.
seed = 1337
np.random.seed(seed)
n_bandits = 5
n_steps = 3000

bandits = np.array([.1, .3, .4, .7, .8])
print(bandits)

regret, R, N = test_eps_greedy(n_steps, bandits, .05)
print(R/N, R, N)
_ = plt.plot(regret, label='.05')

regret, R, N = test_ucb1(n_steps, bandits)
print(R/N, R, N)

# Label fixed: the curve is UCB1 (was misspelled 'usb').
_ = plt.plot(regret, label='ucb1')

plt.legend()

### Cell-based approximations

In [None]:
def test_cells_ucb1_vb(n_steps, bandits, n_cells, k_cells):
    rew_rng = np.random.default_rng(seed)
    upd_rng = np.random.default_rng(seed)
    
    n_bandits = bandits.shape[0]
    R = np.zeros((n_bandits, n_cells))
    N = np.zeros((n_bandits, n_cells)) + 1e-5
    
    optimal = bandits[np.argmax(bandits)]
    every_k = max(n_steps / 1000, 1)
    cum_regret, regret = 0, []

    for i in range(1, n_steps+1):
        avg_ret = R.sum(axis=1) / N.sum(axis=1)
        U = ((k_cells / n_cells) * 2 * np.log(i) / N.mean(axis=1))**.5
        
        a = np.argmax(avg_ret + U)
        
        x = rew_rng.random()
        r = 1. if x <= bandits[a] else 0.
        r_opt = 1. if x <= optimal else 0.
        cum_regret += r_opt - r
        
        upd_indices = upd_rng.choice(n_cells, size=k_cells, replace=False)
        
        R[a][upd_indices] += r
        N[a][upd_indices] += 1
        
        if i % every_k == 0:
            regret.append(cum_regret)        
        
    return regret, R.sum(axis=1) / N.sum(axis=1)


# Value-based cell UCB1 with 8 cells per arm and 2/4/6 cells updated per
# pull, compared against plain UCB1.
seed = 1337
n_steps = 3000
np.random.seed(seed)
bandits = np.array([.1, .3, .4, .7, .8])
# Derived from the array; the former hard-coded 4 contradicted the 5 arms.
n_bandits = bandits.shape[0]
print(bandits)

for k_cells in (2, 4, 6):
    regret, probs = test_cells_ucb1_vb(n_steps, bandits, 8, k_cells)
    print(probs)
    _ = plt.plot(regret, label=f'{k_cells}')

regret, R, N = test_ucb1(n_steps, bandits)
print(R/N, R, N)

# Label fixed: the curve is UCB1 (was misspelled 'usb').
_ = plt.plot(regret, label='ucb1')

plt.legend()

In [None]:
def get_avg_ret2(R, N):
    """Return, per row, the mean over axis 1 of the element-wise ratio ``R / N``.

    ``ndarray`` has no ``avg`` method, so the former ``.avg(axis=1)`` raised
    ``AttributeError``; ``.mean(axis=1)`` is the intended reduction.
    """
    return (R / N).mean(axis=1)

def test_cells_ucb1_vb_crude(n_steps, bandits, n_cells, k_cells):
    rew_rng = np.random.default_rng(seed)
    upd_rng = np.random.default_rng(seed)
    
    n_bandits = bandits.shape[0]
    R = np.zeros((n_bandits, n_cells))
    N = np.zeros((n_bandits, n_cells)) + 1e-5
    
    optimal = bandits[np.argmax(bandits)]
    every_k = max(n_steps / 1000, 1)
    cum_regret, regret = 0, []

    for i in range(1, n_steps+1):
        avg_ret = R.sum(axis=1) / N.sum(axis=1)
        U = (2 * np.log(i * k_cells) / N.sum(axis=1))**.5
        
        a = np.argmax(avg_ret + U)
        
        x = rew_rng.random()
        r = 1. if x <= bandits[a] else 0.
        r_opt = 1. if x <= optimal else 0.
        cum_regret += r_opt - r
        
        upd_indices = upd_rng.choice(n_cells, size=k_cells, replace=False)
        
        R[a][upd_indices] += r
        N[a][upd_indices] += 1
        
        if i % every_k == 0:
            regret.append(cum_regret)        
        
    return regret, R.sum(axis=1) / N.sum(axis=1)


# "Crude" value-based cell UCB1 with 8 cells per arm and 2/4/6 cells updated
# per pull, compared against plain UCB1.
seed = 1337
n_steps = 3000
np.random.seed(seed)
bandits = np.array([.1, .3, .4, .7, .8])
print(bandits)

for k_cells in (2, 4, 6):
    regret, probs = test_cells_ucb1_vb_crude(n_steps, bandits, 8, k_cells)
    print(probs)
    _ = plt.plot(regret, label=f'{k_cells}')

regret, R, N = test_ucb1(n_steps, bandits)
print(R/N, R, N)

# Label fixed: the curve is UCB1 (was misspelled 'usb').
_ = plt.plot(regret, label='ucb1')

plt.legend()

In [None]:
def test_cells_ucb1_cb(n_steps, bandits, n_cells, k_cells):
    rew_rng = np.random.default_rng(seed)
    upd_rng = np.random.default_rng(seed)
    
    n_bandits = bandits.shape[0]
    R = np.zeros((n_bandits, n_cells))
    N = np.zeros((n_bandits, n_cells)) + 1e-5
    
    optimal = bandits[np.argmax(bandits)]
    every_k = max(n_steps / 1000, 1)
    cum_regret, regret = 0, []

    for i in range(1, n_steps+1):
        avg_ret = R / N
        U = ((k_cells / n_cells) * 2 * np.log(i) / N)**.5
        a = np.argmax((avg_ret + U).mean(axis=1))
        
        x = rew_rng.random()
        r = 1. if x <= bandits[a] else 0.
        r_opt = 1. if x <= optimal else 0.
        cum_regret += r_opt - r
        
        upd_indices = upd_rng.choice(n_cells, size=k_cells, replace=False)
        
        R[a][upd_indices] += r
        N[a][upd_indices] += 1
        
        if i % every_k == 0:
            regret.append(cum_regret)        
        
    return regret, (R / N).mean(axis=1)


# Cell-scored UCB1 with 8 cells per arm and 2/4/6 cells updated per pull,
# compared against plain UCB1.
seed = 1337
n_steps = 3000
np.random.seed(seed)
bandits = np.array([.1, .3, .4, .7, .8])
print(bandits)

for k_cells in (2, 4, 6):
    regret, probs = test_cells_ucb1_cb(n_steps, bandits, 8, k_cells)
    print(probs)
    _ = plt.plot(regret, label=f'{k_cells}')

regret, R, N = test_ucb1(n_steps, bandits)
print(R/N, R, N)
# Label fixed: the curve is UCB1 (was misspelled 'usb').
_ = plt.plot(regret, label='ucb1')

plt.legend()

In [None]:
# Long-horizon comparison of eps-greedy, the cell-based variants and UCB1.
seed = 1337
np.random.seed(seed)
n_bandits = 5
n_steps = 30000

bandits = np.array([.1, .3, .4, .7, .8])
print(bandits)

regret, R, N = test_eps_greedy(n_steps, bandits, .05)
print(R/N, R, N)
_ = plt.plot(regret, label='.05')


for k_cells in [2, 5]:
    # The plain value-based variant is left disabled for this plot:
#     regret, probs = test_cells_ucb1_vb(n_steps, bandits, 8, k_cells)
#     print(probs)
#     _ = plt.plot(regret, label=f'vb-{k_cells}')
    
    regret, probs = test_cells_ucb1_vb_crude(n_steps, bandits, 8, k_cells)
    print(probs)
    _ = plt.plot(regret, label=f'vbc-{k_cells}')
    
    regret, probs = test_cells_ucb1_cb(n_steps, bandits, 8, k_cells)
    print(probs)
    _ = plt.plot(regret, label=f'cb-{k_cells}')

regret, R, N = test_ucb1(n_steps, bandits)
print(R/N, R, N)

# Label fixed: the curve is UCB1 (was misspelled 'usb').
_ = plt.plot(regret, label='ucb1')

plt.legend()

## Non-orthogonal action encoding

In [None]:
# Prototype of the shared-cell layout: 5 actions with 8 own cells each, plus
# 4 cells borrowed from the other actions' ranges.
a = np.arange(5 * 8)
action_cell_indices = []
for i in range(5):
    base_indices = [i * 8 + offset for offset in range(8)]
    # Draw from the 32 foreign cells; draws at or past this action's own
    # range are shifted by 8 so they skip over it.
    shared_indices = [
        x + 8 if x >= i * 8 else x
        for x in np.random.choice(4 * 8, 4, replace=False)
    ]
    action_cell_indices.append(sorted(shared_indices + base_indices))

x = np.array(action_cell_indices)

a[x]

In [None]:
def init_cells(n_actions, n_cells, n_shared_cells):
    """Create flat cell statistics and the per-action cell index table.

    Every action owns ``n_cells`` consecutive cells of the flat arrays and
    additionally borrows ``n_shared_cells`` distinct cells from the ranges of
    the other actions. The layout generator is seeded from the module-level
    ``seed``.

    Args:
        n_actions: number of actions (arms).
        n_cells: number of cells owned by each action.
        n_shared_cells: number of foreign cells each action also uses;
            ``Generator.choice`` raises ``ValueError`` if it exceeds
            ``(n_actions - 1) * n_cells``.

    Returns:
        A tuple ``(R, N, action_cell_indices)``: zero-initialised rewards and
        counts (counts offset by 1e-5), both of length ``n_actions * n_cells``,
        and an integer array of shape ``(n_actions, n_cells + n_shared_cells)``
        holding each action's sorted cell indices.
    """
    rng = np.random.default_rng(seed)

    R = np.zeros(n_actions * n_cells)
    N = np.zeros(n_actions * n_cells) + 1e-5

    # Number of cells that belong to the other actions. This used to be
    # hard-coded as (n_actions - 1) * 8, which is wrong for n_cells != 8.
    n_foreign = (n_actions - 1) * n_cells

    action_cell_indices = []
    for i in range(n_actions):
        own_start = i * n_cells
        base_indices = list(range(own_start, own_start + n_cells))
        # Draws at or past this action's own range are shifted by n_cells so
        # they skip over it and land in the following actions' cells.
        shared_indices = [
            x if x < own_start else x + n_cells
            for x in rng.choice(n_foreign, n_shared_cells, replace=False)
        ]
        action_cell_indices.append(sorted(base_indices + shared_indices))

    return R, N, np.array(action_cell_indices)
                                   
def test_cells_ucb1_vb(n_steps, bandits, n_cells, k_cells, n_shared_cells):
    """Simulate value-based cell UCB1 with cells shared between actions.

    The cell layout comes from ``init_cells``: every action uses its own
    ``n_cells`` cells plus ``n_shared_cells`` cells borrowed from other
    actions. After a pull, ``k_cells`` randomly chosen cells of the pulled
    action are updated. Generators are seeded from the module-level ``seed``.

    Args:
        n_steps: number of pulls to simulate.
        bandits: 1-D array with the success probability of every arm.
        n_cells: number of cells owned by each action.
        k_cells: number of distinct cells updated per pull; ``Generator.choice``
            raises ``ValueError`` if it exceeds ``n_cells + n_shared_cells``.
        n_shared_cells: number of foreign cells each action also uses.

    Returns:
        A tuple ``(regret, probs)``: the cumulative regret recorded on every
        step that is a multiple of ``max(n_steps // 1000, 1)``, and the pooled
        per-action reward estimate over each action's cells.
    """
    # The reward noise stays on the stream seeded directly by `seed`; cell
    # selection gets an independent child stream instead of replaying the
    # same random numbers.
    rew_rng = np.random.default_rng(seed)
    upd_rng = np.random.default_rng(np.random.SeedSequence(seed).spawn(1)[0])

    n_bandits = bandits.shape[0]
    R, N, action_cell_indices = init_cells(n_bandits, n_cells, n_shared_cells)
    optimal = bandits.max()
    # Integer stride: a float stride (n_steps / 1000) samples irregularly
    # whenever n_steps is not a multiple of 1000.
    every_k = max(n_steps // 1000, 1)
    cum_regret, regret = 0, []

    for i in range(1, n_steps+1):
        action_N = N[action_cell_indices]
        Q = R[action_cell_indices].sum(axis=1) / action_N.sum(axis=1)
        U = ((k_cells / n_cells) * 2 * np.log(i) / action_N.mean(axis=1))**.5
        a = np.argmax(Q + U)

        # One uniform draw decides both the obtained and the optimal reward.
        x = rew_rng.random()
        r = 1. if x <= bandits[a] else 0.
        upd_indices = upd_rng.choice(action_cell_indices[a], size=k_cells, replace=False)
        R[upd_indices] += r
        N[upd_indices] += 1

        r_opt = 1. if x <= optimal else 0.
        cum_regret += r_opt - r
        if i % every_k == 0:
            regret.append(cum_regret)

    return regret, R[action_cell_indices].sum(axis=1) / N[action_cell_indices].sum(axis=1)


# Value-based cell UCB1 with shared cells (12 own cells, 4 updated per pull)
# for two amounts of sharing, compared against plain UCB1.
seed = 1337
n_steps = 100000
np.random.seed(seed)
bandits = np.array([.1, .3, .4, .7, .8])
# Derived from the array; the former hard-coded 4 contradicted the 5 arms.
n_bandits = bandits.shape[0]
print(bandits)


k = 4
for n_shared_cells in [2, 4]:
    regret, probs = test_cells_ucb1_vb(n_steps, bandits, 12, k, n_shared_cells)
    print(probs)
    _ = plt.plot(regret, label=f'{k}-{n_shared_cells}')


# regret, probs = test_cells_ucb1_vb(n_steps, bandits, 8, 4, 2)
# print(probs)
# _ = plt.plot(regret, label='4')


# regret, probs = test_cells_ucb1_vb(n_steps, bandits, 8, 6, 2)
# print(probs)
# _ = plt.plot(regret, label='6')

regret, R, N = test_ucb1(n_steps, bandits)
print(R/N, R, N)

# Label fixed: the curve is UCB1 (was misspelled 'usb').
_ = plt.plot(regret, label='ucb1')

plt.legend()

In [None]:
def init_cells(n_actions, n_cells, n_shared_cells):
    """Create flat cell statistics and the per-action cell index table.

    Every action owns ``n_cells`` consecutive cells of the flat arrays and
    additionally borrows ``n_shared_cells`` distinct cells from the ranges of
    the other actions. The layout generator is seeded from the module-level
    ``seed``.

    Args:
        n_actions: number of actions (arms).
        n_cells: number of cells owned by each action.
        n_shared_cells: number of foreign cells each action also uses;
            ``Generator.choice`` raises ``ValueError`` if it exceeds
            ``(n_actions - 1) * n_cells``.

    Returns:
        A tuple ``(R, N, action_cell_indices)``: zero-initialised rewards and
        counts (counts offset by 1e-5), both of length ``n_actions * n_cells``,
        and an integer array of shape ``(n_actions, n_cells + n_shared_cells)``
        holding each action's sorted cell indices.
    """
    rng = np.random.default_rng(seed)

    R = np.zeros(n_actions * n_cells)
    N = np.zeros(n_actions * n_cells) + 1e-5

    # Number of cells that belong to the other actions. This used to be
    # hard-coded as (n_actions - 1) * 8, which is wrong for n_cells != 8.
    n_foreign = (n_actions - 1) * n_cells

    action_cell_indices = []
    for i in range(n_actions):
        own_start = i * n_cells
        base_indices = list(range(own_start, own_start + n_cells))
        # Draws at or past this action's own range are shifted by n_cells so
        # they skip over it and land in the following actions' cells.
        shared_indices = [
            x if x < own_start else x + n_cells
            for x in rng.choice(n_foreign, n_shared_cells, replace=False)
        ]
        action_cell_indices.append(sorted(base_indices + shared_indices))

    return R, N, np.array(action_cell_indices)
                                   
def test_cells_ucb1_cb(n_steps, bandits, n_cells, k_cells, n_shared_cells):
    """Simulate cell-scored UCB1 with cells shared between actions.

    The cell layout comes from ``init_cells``: every action uses its own
    ``n_cells`` cells plus ``n_shared_cells`` cells borrowed from other
    actions. Every cell computes its own mean reward plus bonus, and an
    action's score is the mean over its cells. After a pull, ``k_cells``
    randomly chosen cells of the pulled action are updated. Generators are
    seeded from the module-level ``seed``.

    Args:
        n_steps: number of pulls to simulate.
        bandits: 1-D array with the success probability of every arm.
        n_cells: number of cells owned by each action.
        k_cells: number of distinct cells updated per pull; ``Generator.choice``
            raises ``ValueError`` if it exceeds ``n_cells + n_shared_cells``.
        n_shared_cells: number of foreign cells each action also uses.

    Returns:
        A tuple ``(regret, probs)``: the cumulative regret recorded on every
        step that is a multiple of ``max(n_steps // 1000, 1)``, and the
        per-action mean of the cell-wise estimates ``R / N``.
    """
    # The reward noise stays on the stream seeded directly by `seed`; cell
    # selection gets an independent child stream instead of replaying the
    # same random numbers.
    rew_rng = np.random.default_rng(seed)
    upd_rng = np.random.default_rng(np.random.SeedSequence(seed).spawn(1)[0])

    n_bandits = bandits.shape[0]
    R, N, action_cell_indices = init_cells(n_bandits, n_cells, n_shared_cells)
    optimal = bandits.max()
    # Integer stride: a float stride (n_steps / 1000) samples irregularly
    # whenever n_steps is not a multiple of 1000.
    every_k = max(n_steps // 1000, 1)
    cum_regret, regret = 0, []

    for i in range(1, n_steps+1):
        Q = R / N
        U = ((k_cells / n_cells) * 2 * np.log(i) / N)**.5
        a = np.argmax(
            (Q + U)[action_cell_indices].mean(axis=1)
        )

        # One uniform draw decides both the obtained and the optimal reward.
        x = rew_rng.random()
        r = 1. if x <= bandits[a] else 0.
        upd_indices = upd_rng.choice(action_cell_indices[a], size=k_cells, replace=False)
        R[upd_indices] += r
        N[upd_indices] += 1

        r_opt = 1. if x <= optimal else 0.
        cum_regret += r_opt - r
        if i % every_k == 0:
            regret.append(cum_regret)

    # Recompute from the final statistics: the loop's Q predates the last
    # update and does not exist at all when n_steps is 0.
    return regret, (R / N)[action_cell_indices].mean(axis=1)


# Cell-scored UCB1 with shared cells (12 own cells per action): first vary
# the amount of sharing, then the number of cells updated per pull, and
# compare against plain UCB1.
seed = 1337
n_steps = 100000
np.random.seed(seed)
bandits = np.array([.1, .3, .4, .7, .8])
# Derived from the array; the former hard-coded 4 contradicted the 5 arms.
n_bandits = bandits.shape[0]
print(bandits)


k = 4
for n_shared_cells in [2, 8]:
    regret, probs = test_cells_ucb1_cb(n_steps, bandits, 12, k, n_shared_cells)
    print(probs)
    _ = plt.plot(regret, label=f'{k}-{n_shared_cells}')


n_shared_cells = 4
for k in [2, 4, 8, 16]:
    regret, probs = test_cells_ucb1_cb(n_steps, bandits, 12, k, n_shared_cells)
    print(probs)
    _ = plt.plot(regret, label=f'{k}-{n_shared_cells}')

regret, R, N = test_ucb1(n_steps, bandits)
print(R/N, R, N)

# Label fixed: the curve is UCB1 (was misspelled 'usb').
_ = plt.plot(regret, label='ucb1')

plt.legend()