Copyright 2022 Google LLC

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.



In [None]:
import numpy as np
import matplotlib.pyplot as plt
import scipy as sp
from scipy import special

# Helper Functions

In [None]:
def KL(mu, nu, dist='Gaussian'):
    """
    Elementwise KL divergence KL(mu || nu) between two members of a
    one-parameter family, both described by their means.

    Parameters
    ----------
    mu, nu:
        array of size (K, n_instances), where K is the number of arms,
        and n_instances is the number of problem instances.
        values denote the expected reward of arms.
    dist:
        distribution of the arms (string)
        options: [Gaussian, Bernoulli, Exponential]

    Output
    -------
    array with the broadcast shape of mu and nu holding KL(mu || nu)

    Raises
    ------
    ValueError if dist is not one of the supported options.
    """
    if dist == 'Gaussian':
        # unit-variance Gaussians
        return (mu-nu)**2/2
    if dist == 'Bernoulli':
        return mu*np.log(mu/nu) + (1-mu)*np.log((1-mu)/(1-nu))
    if dist == 'Exponential':
        # mean parameterisation: KL(Exp(mean mu) || Exp(mean nu)).
        # The previous expression, log(mu/nu) + nu/mu - 1, is the divergence
        # for *rates*, i.e. KL(nu || mu) when the arguments are means.
        return mu/nu - 1 - np.log(mu/nu)
    raise ValueError(
        "unsupported dist {!r}; expected 'Gaussian', 'Bernoulli' or "
        "'Exponential'".format(dist))

def KLRatio(mu, w, dist='Gaussian'):
    """
    Ratio of the two KL terms that make up each pairwise objective.

    Parameters
    ----------
    mu:
        array of size (K, n_instances), where K is the number of arms,
        and n_instances is the number of problem instances.
        values denote the expected reward of arms.
    w:
        array of size (K, n_instances).
        values denote the weight of each arm
    dist:
        distribution of the arms, forwarded to KL

    Output
    -------
    array of size (K, n_instances); row a holds
    KL(mu1, x_a)/KL(mua, x_a) with x_a = (w1*mu1+wa*mua)/(w1+wa),
    where arm 1 is the arm with the largest mean in that instance.
    """
    cols = np.arange(mu.shape[1])
    leader = np.argmax(mu, axis=0)

    # best-arm weight and mean, shaped (1, n_instances) so they broadcast
    # against every row
    leader_w = w[leader, cols][None, :]
    leader_mu = mu[leader, cols][None, :]

    # weighted mean of the best arm and each arm a
    pooled = (w*mu + leader_w*leader_mu)/(w + leader_w)

    numerator = KL(leader_mu, pooled, dist)
    denominator = KL(mu, pooled, dist)
    return numerator/denominator

def KLObjective(mu, w, dist='Gaussian'):
    """
    Pairwise objective between the best arm and every arm.

    Parameters
    ----------
    mu:
        array of size (K, n_instances), where K is the number of arms,
        and n_instances is the number of problem instances.
        values denote the expected reward of arms.
    w:
        array of size (K, n_instances).
        values denote the weight of each arm
    dist:
        distribution of the arms, forwarded to KL

    Output
    -------
    array of size (K, n_instances); row a holds
    w1*KL(mu1, x_a) + wa*KL(mua, x_a) with x_a = (w1*mu1+wa*mua)/(w1+wa),
    where arm 1 is the arm with the largest mean in that instance.
    """
    cols = np.arange(mu.shape[1])
    leader = np.argmax(mu, axis=0)

    # best-arm weight and mean, shaped (1, n_instances) so they broadcast
    # against every row
    leader_w = w[leader, cols][None, :]
    leader_mu = mu[leader, cols][None, :]

    # weighted mean of the best arm and each arm a
    pooled = (w*mu + leader_w*leader_mu)/(w + leader_w)

    leader_term = leader_w*KL(leader_mu, pooled, dist)
    arm_term = w*KL(mu, pooled, dist)
    return leader_term + arm_term

def KLObjectiveGrad(mu, w, dist='Gaussian'):
    """
    KL terms of the tightest pairwise objective, scattered into a
    (K, n_instances) array.

    Parameters
    ----------
    mu:
        array of size (K, n_instances), where K is the number of arms,
        and n_instances is the number of problem instances.
        values denote the expected reward of arms.
    w:
        array of size (K, n_instances).
        values denote the weight of each arm
    dist:
        distribution of the arms, forwarded to KL

    Output
    -------
    array of size (K, n_instances), zero everywhere except
        row of the best arm: KL(mu1, (w1*mu1+wa*mua)/(w1+wa))
        row of arm a:        KL(mua, (w1*mu1+wa*mua)/(w1+wa))
    where a is the sub-optimal arm with the smallest objective value
    w1*KL(mu1, .) + wa*KL(mua, .) in that instance.
    """
    cols = np.arange(mu.shape[1])
    best_arm_idx = np.argmax(mu, axis=0)
    w_best = w[best_arm_idx, cols]
    mu_best = mu[best_arm_idx, cols]
    mu_avg = (w*mu+(w_best*mu_best)[None,:])/(w+w_best[None,:])

    t1 = KL(mu_best[None,:], mu_avg, dist)
    t2 = KL(mu, mu_avg, dist)
    kl_obj = w_best[None,:]*t1 + w*t2

    # the best arm must never be selected as its own competitor
    # (np.inf: the np.Inf alias no longer exists in NumPy >= 2.0)
    kl_obj[best_arm_idx, cols] = np.inf
    competitors = np.argmin(kl_obj, axis=0)

    result = np.zeros_like(w)
    result[best_arm_idx, cols] = t1[competitors, cols]
    result[competitors, cols] = t2[competitors, cols]
    return result


# max-min solvers

In [None]:
class OfflineOnlineMaxMinSolver(object):
    """
    Nested-bisection solver for the online sampling proportions when every
    arm already comes with a number of offline samples.
    """

    def __init__(self, mu, delta, n_offline, dist='Gaussian'):
        """
        Parameters
        ----------
        mu:
            array of size (K, n_instances), where K is the number of arms,
            and n_instances is the number of problem instances.
            values denote the expected reward of arms.
        delta:
            confidence parameter (scalar)
        n_offline:
            array of size (K, n_instances).
            number of offline samples available for each arm
        dist:
            distribution of the arms (string)
            options: [Gaussian, Bernoulli, Exponential]
        """
        self.mu = mu

        self.K = mu.shape[0]
        self.n_instances = mu.shape[1]
        self.best_arm_idx = np.argmax(self.mu, axis=0)

        self.delta = delta
        # level every pairwise objective has to reach
        self.beta = -np.log(delta) + np.log(-np.log(delta))

        self.n_offline = n_offline
        self.dist = dist

        # upper end of every bisection bracket (in samples)
        self.n_upper_bound = 1e9

    def _na_star(self, n1_online, tol=1e-6):
        """
        Bisection search for Na*(N1): for a given number of online samples
        of the best arm, find the total sample count of every other arm at
        which KLObjective meets self.beta.

        Parameters
        ----------
        n1_online:
            array of size (n_instances,), online samples of the best arm
        tol:
            bracket width at which the bisection stops

        Output
        -------
        (n_online, constraint_error)
            n_online: array of size (K, n_instances), total count minus the
                offline count; may be negative for an arm whose offline
                data already exceeds what is needed.
            constraint_error: |KLObjective - beta| at the last bisection
                point, with the best-arm rows set to 0.
        """
        cols = np.arange(self.n_instances)
        n_total_lb = np.zeros_like(self.mu)
        n_total_ub = self.n_upper_bound*np.ones_like(self.mu)
        constraint_error = np.zeros_like(self.mu) # store the error in the constraint

        # the best arm is not searched over: collapse its bracket to N1
        n1_total = self.n_offline[self.best_arm_idx, cols] + n1_online
        n_total_lb[self.best_arm_idx, cols] = n1_total
        n_total_ub[self.best_arm_idx, cols] = n1_total

        n_iter = 0
        while np.any(n_total_ub - n_total_lb > tol):
            # determine the next point for bisection search
            n_total_next = 0.5*(n_total_lb+n_total_ub)

            # do bisection search based on the constraint value
            constraint = KLObjective(self.mu, n_total_next, self.dist)
            constraint_error = np.abs(constraint - self.beta)
            constraint_error[self.best_arm_idx, cols] = 0

            below = constraint <= self.beta
            above = constraint > self.beta
            n_total_lb[below] = n_total_next[below]
            n_total_ub[above] = n_total_next[above]

            n_iter += 1
            if n_iter%1000 == 0:
                print('warning. bisection search is not converging')
                print(n_total_ub)
                print(n_total_lb)

        # subtract this solver's own offline counts (the previous code read
        # a module-level variable named n_offline here)
        return 0.5*(n_total_lb+n_total_ub) - self.n_offline, constraint_error

    def _n1_sub_grad(self, n1_online, tol=1e-6):
        """
        Sub-gradient of N1 + sum_{a>1} Na*(N1) with respect to N1.

        Output
        -------
        array of size (n_instances,). Set to -1 for every instance in which
        some constraint is still violated, so the caller increases N1.
        """
        n_online, constraint_error = self._na_star(n1_online, tol)
        kl_ratio = KLRatio(self.mu, self.n_offline + n_online, self.dist)

        # arms that need no online samples do not react to N1
        kl_ratio[n_online <= 0] = 0
        # the best arm contributes d(N1)/d(N1) = 1 once the sign is flipped
        kl_ratio[self.best_arm_idx, np.arange(self.n_instances)] = -1

        sub_grad = -np.sum(kl_ratio, axis=0)

        # if there exists any constraint that isn't satisfied, increase n1
        unmet = np.any(constraint_error > 1e-3, axis=0)
        sub_grad[unmet] = -1
        return sub_grad

    def compute_optimal_proportions(self, tol=1e-2, algo='bisection'):
        """
        Compute the online sampling proportions.

        Parameters
        ----------
        tol:
            bracket width at which the outer bisection over N1 stops
        algo:
            solver to use; only 'bisection' is implemented

        Output
        -------
        array of size (K, n_instances) whose columns sum to 1

        Raises
        ------
        ValueError if algo is not 'bisection'.
        """
        if algo != 'bisection':
            raise ValueError(
                "unsupported algo {!r}; expected 'bisection'".format(algo))

        # lower and upper bounds for n1 (number of online samples of the best arm)
        n1_lb = np.zeros(self.n_instances)
        n1_ub = self.n_upper_bound*np.ones(self.n_instances)
        while np.any(n1_ub - n1_lb > tol):
            # determine the next point for bisection search
            n1_next = 0.5*(n1_lb+n1_ub)

            # do bisection search based on the sign of the gradients
            n1_grad = self._n1_sub_grad(n1_next)
            non_positive = n1_grad <= 0
            positive = n1_grad > 0
            n1_lb[non_positive] = n1_next[non_positive]
            n1_ub[positive] = n1_next[positive]

        optimal_proportions, _ = self._na_star(0.5*(n1_lb+n1_ub))
        # a negative online count means "no online samples needed"
        optimal_proportions[optimal_proportions < 0.] = 0.
        return optimal_proportions/np.sum(optimal_proportions, axis=0)[None,:]


In [None]:
class OnlineMaxMinSolver(object):
    """
    Iterative solvers for the sampling proportions in the purely online
    setting (no offline samples).
    """

    def __init__(self, mu, delta, dist='Gaussian'):
        """
        Parameters
        ----------
        mu:
            array of size (K, n_instances), where K is the number of arms,
            and n_instances is the number of problem instances.
            values denote the expected reward of arms.
        delta:
            confidence parameter (scalar)
        dist:
            distribution of the arms (string)
            options: [Gaussian, Bernoulli, Exponential]
        """
        self.mu = mu

        self.K = mu.shape[0]
        self.n_instances = mu.shape[1]
        self.best_arm_idx = np.argmax(self.mu, axis=0)

        self.delta = delta
        self.beta = -np.log(delta) + np.log(-np.log(delta))

        self.dist = dist

    def compute_optimal_proportions(self, iters=10000, algo='Top2'):
        """
        Run one of the iterative schemes for iters-1 steps, starting from
        uniform weights.

        Parameters
        ----------
        iters:
            the update loop runs for i = 1, ..., iters-1
        algo:
            'FW', 'TCB' (transportation cost balancing), 'Top2',
            'EWA' (exponential weights) or 'OEWA' (optimistic EWA)

        Output
        -------
        array of size (K, n_instances) with the final weights

        Raises
        ------
        ValueError if algo is not one of the options above.
        """
        if algo not in ('FW', 'TCB', 'Top2', 'EWA', 'OEWA'):
            raise ValueError(
                "unsupported algo {!r}; expected 'FW', 'TCB', 'Top2', "
                "'EWA' or 'OEWA'".format(algo))

        best = self.best_arm_idx
        cols = np.arange(self.n_instances)
        w = np.ones_like(self.mu)/self.K

        if algo == 'FW': # FW doesn't work theoretically
            for i in range(1, iters):
                kl_ratio = KLRatio(self.mu, w, self.dist)
                kl_ratio[best, cols] = 0
                kl_objective = KLObjective(self.mu, w, self.dist)
                # np.inf: the np.Inf alias no longer exists in NumPy >= 2.0
                kl_objective[best, cols] = np.inf

                challenger = np.argmin(kl_objective, axis=0)
                challenger_ratio = kl_ratio[challenger, cols]
                pull_best = challenger_ratio >= 1
                pull_challenger = challenger_ratio < 1

                w_next = i*w
                w_next[best[pull_best], cols[pull_best]] += 1
                w_next[challenger[pull_challenger], cols[pull_challenger]] += 1
                w = w_next/(i+1)
            return w

        if algo == 'TCB': # Transportation Cost Balancing
            for i in range(1, iters):
                kl_objective = KLObjective(self.mu, w, self.dist)
                kl_objective[best, cols] = np.inf
                challenger = np.argmin(kl_objective, axis=0)

                # objective value if the challenger were pulled next
                w_cf_ch = i*w
                w_cf_ch[challenger, cols] += 1
                w_cf_ch /= i+1
                value_ch = KLObjective(self.mu, w_cf_ch, self.dist)
                value_ch[best, cols] = np.inf
                value_ch = value_ch.min(axis=0)

                # objective value if the best arm were pulled next
                w_cf_eba = i*w
                w_cf_eba[best, cols] += 1
                w_cf_eba /= i+1
                value_eba = KLObjective(self.mu, w_cf_eba, self.dist)
                value_eba[best, cols] = np.inf
                value_eba = value_eba.min(axis=0)

                # pull arms based on the counterfactual objective values;
                # on an exact tie neither arm is pulled
                pull_best = value_ch < value_eba
                pull_challenger = value_ch > value_eba

                w_next = i*w
                w_next[best[pull_best], cols[pull_best]] += 1
                w_next[challenger[pull_challenger], cols[pull_challenger]] += 1
                w = w_next/(i+1)
            return w

        if algo == 'Top2':
            for i in range(1, iters):
                kl_ratio = KLRatio(self.mu, w, self.dist)
                kl_ratio[best, cols] = 0
                sum_kl_ratio = np.sum(kl_ratio, axis=0)
                kl_objective = KLObjective(self.mu, w, self.dist)
                kl_objective[best, cols] = np.inf
                challenger = np.argmin(kl_objective, axis=0)

                # pull arm 1 if the sum_kl_ratio is greater than 1,
                # otherwise pull a competitor
                pull_best = sum_kl_ratio > 1
                pull_challenger = sum_kl_ratio <= 1

                w_next = i*w
                w_next[best[pull_best], cols[pull_best]] += 1
                w_next[challenger[pull_challenger], cols[pull_challenger]] += 1
                w = w_next/(i+1)
            return w

        # exponential-weights variants
        g = np.zeros_like(self.mu)
        for i in range(1, iters):
            # NOTE(review): eta grows with i while g is a running sum --
            # confirm this step-size schedule is intended.
            eta = i**0.5
            g_curr = KLObjectiveGrad(self.mu, w, self.dist)
            g = g + g_curr
            if algo == 'EWA':
                w = sp.special.softmax(eta*g, axis=0)
            else: # 'OEWA': the latest gradient is counted twice
                w = sp.special.softmax(eta*(g+g_curr), axis=0)
        return w


# Test max-min solvers

## Generate Problem Instances


In [None]:
# Build a random instance: K arm means drawn uniformly from
# [delta, 1 - delta], sorted in decreasing order, with the first arm then
# raised to 1 - delta/2 so that it is the unique best arm.
K = 10
delta = 0.03
mu = np.sort(np.random.uniform(0.0+delta, 1.0 - delta, K))[::-1].copy()
mu[0] = 1-delta/2

In [None]:
# Solve the same instance twice (two identical columns) with the bisection
# solver, without any offline samples, and print the proportions.
mu_rep = np.hstack([mu[:, None]]*2)
n_offline = np.zeros_like(mu_rep)
bisection_solver = OfflineOnlineMaxMinSolver(
    mu=mu_rep,
    delta=1e-6,
    n_offline=n_offline,
    dist='Gaussian',
)
opt_props = bisection_solver.compute_optimal_proportions()
column_totals = np.sum(opt_props, axis=0)
print(opt_props/column_totals[None,:])


In [None]:
# Same two-column instance, solved with the online Top2 iteration.
Top2_solver = OnlineMaxMinSolver(mu=mu_rep, delta=1e-6, dist='Gaussian')
Top2_opt_props = Top2_solver.compute_optimal_proportions(algo='Top2')
top2_totals = np.sum(Top2_opt_props, axis=0)
print(Top2_opt_props/top2_totals[None,:])


# BAI-MAB

The code in this section handles a single bandit instance at a time; it cannot solve multiple bandit instances simultaneously.

In [None]:
def beta(N, delta):
    """
    Stopping threshold log(K-1) - log(delta) + log(1 + log(t)).

    N : number of samples to each arm (K = len(N), t = sum(N))
    delta : bound on error probability
    """
    n_arms = len(N)
    n_samples = sum(N)
    # A larger alternative that was tried before:
    # log(K-1) - log(delta) + 6*log(1 + log(t/2)) + 8*log(1 + log((K-1)/delta))
    return np.log(n_arms-1) - np.log(delta) + np.log(1 + np.log(n_samples))

# Define a Stopping condition function.
def Stop(mu, N, delta, dist):
    """
    Stopping rule: the smallest scaled pairwise objective among the
    non-best arms has reached the threshold beta(N, delta).

    mu : bandit instance
    N : number of samples to each arm
    delta : bound on error probability
    dist : distribution of the arms, forwarded to KLObjective
    """
    n_samples = sum(N)
    glrt = n_samples*KLObjective(mu, N/n_samples, dist)

    # The best arm's own entry is the smallest one (~ 0), so it is masked
    # out before taking the minimum.
    glrt[np.argmax(mu)] = float('inf')

    return min(glrt) >= beta(N, delta)

def track(w, N):
    """
    Pick the arm whose sample count lags furthest behind its target weight.

    w : weight to track
    N : number of samples to each arm
    returns the arm to pull, i.e. the (flat) index of the largest w/N
    """
    # The arm count and the total sample count are not needed here, so they
    # are no longer computed on every pull.
    return np.argmax(w/N)

def sample(mu, dist, idx):
    """
    Draw one reward from arm idx.

    mu : mean vector
    dist : class of SPEF ('Gaussian', 'Bernoulli' or 'Exponential')
    idx : index to sample from

    The result has the shape of mu[idx] for every dist (a scalar when mu is
    one-dimensional). Raises ValueError for an unsupported dist.
    """
    # Generate a sample from distribution dist with mean mu[idx]
    if dist == 'Gaussian':
        return np.random.normal(mu[idx], 1)
    if dist == 'Bernoulli':
        return np.random.binomial(1, mu[idx])
    if dist == 'Exponential':
        # no size argument: the draw then follows the shape of mu[idx],
        # like the other two branches, instead of always being a length-1
        # array
        return np.random.exponential(scale=mu[idx])
    raise ValueError(
        "unsupported dist {!r}; expected 'Gaussian', 'Bernoulli' or "
        "'Exponential'".format(dist))


# Online+Offline Bisection Search

In [None]:
def batched_tas(mu, K, mu_hat, n_offline, dist, delta):
    """
    Batched track-and-stop using the offline+online bisection solver.

    mu : mean vector
    mu_hat : emp. mean from offline samples (updated in place)
    K : number of arms
    n_offline : number of offline samples from each arm
    dist : SPEF
    delta : error bound

    returns (index of the empirically best arm, number of online samples)
    """
    N = np.copy(n_offline)  # samples per arm, offline + online
    uniform = 1/K * np.ones_like(mu)

    def pull(arm):
        # draw one reward and fold it into the running mean of that arm
        reward = sample(mu, dist, arm)
        mu_hat[arm] = (mu_hat[arm] * N[arm] + reward)/(N[arm] + 1)
        N[arm] += 1

    # initial round: 10 pulls of every arm
    for _ in range(10):
        for arm in range(K):
            pull(arm)

    w = uniform   # running average of the targets (this is what is tracked)
    wt = w        # current target
    t = 10*K      # online samples drawn so far

    while not Stop(mu_hat, N, delta, dist):
        # A batch starts whenever floor(t/K) is a perfect square.
        if float(np.sqrt(np.floor(t/K))).is_integer():
            # forced exploration: K pulls tracking the uniform target
            for step in range(K):
                denom = t + step
                w = (1-1/denom)*w + 1/denom*uniform
                pull(track(w, N))
            t += K

            # recompute the target from the current estimates
            solver = OfflineOnlineMaxMinSolver(mu_hat, 1e-6, n_offline, dist)
            wt = solver.compute_optimal_proportions()
        else:
            w = (1-1/t)*w + (1/t)*wt
            pull(track(w, N))
            t += 1

    return np.argmax(mu_hat, axis=0), t


# LUCB1

In [None]:
def LUCB1(mu, K, mu_hat, n_offline, dist, delta, epsilon):
    """
    LUCB: repeatedly pull the empirical leader and its strongest challenger
    until the challenger's upper bound exceeds the leader's lower bound by
    at most epsilon.

    mu : mean vector
    mu_hat : emp. mean from offline samples (updated in place)
    K : number of arms
    n_offline : number of offline samples from each arm
    dist : SPEF
    delta : error bound
    epsilon : ucb - lcb gap at which to stop

    returns (index of the empirically best arm, number of online samples)
    """
    N = np.copy(n_offline)  # array to store number of samples for each arm

    def pull(arm):
        # draw one reward and fold it into the running mean of that arm
        reward = sample(mu, dist, arm)
        mu_hat[arm] = (mu_hat[arm] * N[arm] + reward)/(N[arm] + 1)
        N[arm] += 1

    def confidence_bounds(t):
        # Radius sqrt(2*beta/N) with the exploration rate from Kaufmann,
        # Kalyanakrishnan 2013. The same radius is now used before and
        # inside the loop; the loop used to apply sqrt(beta/(2*N)), half as
        # wide, which is too narrow for the unit-variance Gaussian rewards.
        # Alternative conjectured in Garivier, Kaufmann 2016:
        #   sqrt(log((1 + log(t))/delta)/N)
        rate = np.log(K*t*t/delta)
        radius = np.sqrt(2*(rate + np.log(rate))/N)
        return mu_hat - radius, mu_hat + radius

    def leader_and_challenger(ucb):
        # leader = empirical best arm; challenger = arm with the largest
        # ucb among the others (first one on ties)
        leader = np.argmax(mu_hat)
        others = np.copy(ucb)
        others[leader] = float('-inf')
        return leader, np.argmax(others)

    # sample each arm 10 times
    for _ in range(10):
        for arm in range(K):
            pull(arm)

    t = sum(N)

    lcb, ucb = confidence_bounds(t)
    l_t, u_t = leader_and_challenger(ucb)
    # stopping statistic
    b_t = ucb[u_t] - lcb[l_t]

    while b_t > epsilon:
        # sample challenger, then leader
        pull(u_t)
        pull(l_t)
        t += 2

        lcb, ucb = confidence_bounds(t)
        l_t, u_t = leader_and_challenger(ucb)
        b_t = ucb[u_t] - lcb[l_t]

    return np.argmax(mu_hat, axis=0), t-sum(n_offline)


# $\beta$-Top2

In [None]:
def beta_top2(mu, K, mu_hat, n_offline, dist, delta, bta):
    """
    Top-two sampling with a fixed leader probability.

    mu : mean vector
    mu_hat : emp. mean from offline samples (updated in place)
    K : number of arms
    n_offline : number of offline samples from each arm
    dist : SPEF
    delta : error bound
    bta : probability to sample best arm

    returns (index of the empirically best arm, number of online samples)
    """
    N = np.copy(n_offline)  # samples per arm, offline + online

    def pull(arm):
        # draw one reward and fold it into the running mean of that arm
        reward = sample(mu, dist, arm)
        mu_hat[arm] = (mu_hat[arm] * N[arm] + reward)/(N[arm] + 1)
        N[arm] += 1

    # sample each arm 10 times
    for _ in range(10):
        for arm in range(K):
            pull(arm)

    t = sum(N)

    while not Stop(mu_hat, N, delta, dist):
        # leader - arm with the maximum empirical mean
        leader = np.argmax(mu_hat)

        # challenger - arm with the smallest pairwise objective against
        # the leader
        cost = KLObjective(mu_hat, N, dist)
        cost[leader] = float('inf')
        challenger = np.argmin(cost)

        # the leader is pulled with probability bta
        coin = np.random.binomial(1, bta, 1)
        pull(leader if coin == 1 else challenger)

        t += 1

    return np.argmax(mu_hat, axis=0), t-sum(n_offline)

# Optimal Top2

In [None]:
def opt_top2(mu, K, mu_hat, n_offline, dist, delta):
    """
    Top-two sampling in which a sum-of-KL-ratios test decides between the
    leader and the challenger.

    mu : mean vector
    mu_hat : emp. mean from offline samples (updated in place)
    K : number of arms
    n_offline : number of offline samples from each arm
    dist : SPEF
    delta : error bound

    returns (index of the empirically best arm, number of online samples)
    """
    N = np.copy(n_offline)  # samples per arm, offline + online
    # B[a] == 1 once arm a has been the challenger; entries are never reset
    B = np.zeros_like(mu)

    def pull(arm):
        # draw one reward and fold it into the running mean of that arm
        reward = sample(mu, dist, arm)
        mu_hat[arm] = (mu_hat[arm] * N[arm] + reward)/(N[arm] + 1)
        N[arm] += 1

    # sample each arm 10 times
    for _ in range(10):
        for arm in range(K):
            pull(arm)

    t = sum(N)

    while not Stop(mu_hat, N, delta, dist):
        # leader - arm with the maximum empirical mean
        leader = np.argmax(mu_hat)

        # challenger - arm with the smallest pairwise objective against
        # the leader
        cost = KLObjective(mu_hat, N, dist)
        cost[leader] = float('inf')
        challenger = np.argmin(cost)
        B[challenger] = 1

        # sum-ratio test: s = -1 + sum over arms in B (leader excluded) of
        # KL(leader, x)/KL(arm, x), x being the pooled mean of the pair
        s = -1
        for a in range(K):
            if a == leader or B[a] != 1:
                continue
            x1a = (N[leader]*mu_hat[leader] + N[a]*mu_hat[a])/(N[leader]+N[a])
            s += KL(mu_hat[leader], x1a, dist)/KL(mu_hat[a], x1a, dist)

        # leader when the sum of ratios has reached 1, challenger otherwise
        pull(leader if s >= 0 else challenger)

        t += 1

    print(B)
    return np.argmax(mu_hat, axis=0), t-sum(n_offline)

# Offline+Online Experiments

### generate the problem instance

In [None]:
# Problem instance for the experiments.
K = 10        # number of arms
delta = 0.03  # used both as the gap when drawing means and as the error bound

# The random draw is kept so the random stream is consumed exactly as
# before; the experiments themselves use the fixed instance below.
np.random.uniform(0.0+delta, 1.0 - delta, K)
mu = np.asarray([
    0.985, 0.83741233, 0.73414625, 0.60694074, 0.50982398,
    0.48770951, 0.29959668, 0.27820935, 0.23755138, 0.0356999,
])
print(mu)

# Other instances that were tried (set K = len(mu), delta = 0.05 or 1e-3):
#   mu = [0.322, 0.381, 0.87, 0.88, 0.9]
#   mu = [0.35, 0.381, 0.8, 0.9]
#   mu = [0.322, 0.322, 0.322, 0.322, 0.322, 0.381, 0.8, 0.9]

# one column = one bandit instance
mu = mu.reshape(-1, 1)

dist = 'Gaussian'
n_offline = np.zeros(mu.shape)
s_hat = np.zeros(mu.shape)
mu_hat = np.zeros(mu.shape)

### Online+offline Experiments

In [None]:
# Offline budgets to sweep over and number of repetitions per budget.
ns = [1, 2, 10, 500, 1000, 5000, 10000, 50000]

reps = 50
# online stopping times, one row per entry of ns
result_oo = np.zeros((len(ns), reps))
result_oo_lucb = np.zeros((len(ns), reps))
result_oo_beta_top2 = np.zeros((len(ns), reps))
result_oo_optimal_top2 = np.zeros((len(ns), reps))

# number of runs in which the returned arm was not the true best arm
mistake_tas = 0
mistake_lucb = 0
mistake_beta_top2 = 0
mistake_optimal_top2 = 0

print(mu)
for i, ni in enumerate(ns):
    for j in range(reps):
        # generate offline data
        #
        # Alternative offline-data settings (only Setting 4 is active):
        #
        # well-explored offline data
        #   n_offline = np.ones(mu.shape)*ni
        #
        # poorly-explored offline data - Setting 1
        #   n_offline = np.ones(mu.shape)*10
        #   n_offline[np.argmax(mu)] = 2000+ni*2
        #   n_offline[np.argmin(mu)] = 2000+ni*2
        #
        # poorly-explored offline data - Setting 2
        #   n_offline = np.ones(mu.shape)*10
        #   n_offline[np.argmax(mu)] = 5000+ni*2
        #   n_offline[np.argmin(mu)] = 5000+ni*2
        #
        # poorly-explored offline data - Setting 3 (on top of well-explored)
        #   n_offline[np.argmax(mu)] = ni*100
        #   n_offline[np.argmin(mu)] = ni*100

        # poorly-explored offline data - Setting 4
        n_offline = np.ones(mu.shape)*10
        n_offline[np.argmax(mu)] = ni*K

        mu_hat = np.zeros_like(mu)
        for k in range(mu.shape[0]):
            # Index the single element explicitly: int() on a 1-element
            # array is deprecated since NumPy 1.25.
            n_k = int(n_offline[k, 0])
            mu_hat[k, 0] = np.mean(np.random.normal(mu[k,0], 1, size=n_k)) # This is for Gaussians.
            #mu_hat[k, 0] = np.mean(np.random.binomial(1, mu[k,0], size=n_k)) # This for Bernoulli.
            #Bernoulli doesn't work directly as dist variable needs to be set appropriately.

        # every algorithm gets its own copies because they update mu_hat in place
        tas = batched_tas(mu, K, np.copy(mu_hat), np.copy(n_offline), dist, delta)
        print(tas)
        if(tas[0][0] != np.argmax(mu)):
            mistake_tas += 1
        result_oo[i,j] = tas[1]
        print(result_oo[i,j], mistake_tas)

        print(i,j, '~~~~~~~~~~~~~~~~~~~~~~~ TAS ~~~~~~~~~~~~~~~~~~~~~~~')

        # LUCB is disabled; re-enable with:
        #   lucb = LUCB1(mu, K, np.copy(mu_hat), np.copy(n_offline), dist, delta, 0)
        #   print(lucb)
        #   if(lucb[0][0] != np.argmax(mu)):
        #       mistake_lucb += 1
        #   result_oo_lucb[i,j] = lucb[1]
        #   print(result_oo_lucb[i,j], mistake_lucb)
        #   print(i,j,'~~~~~~~~~~~~~~~~~~~~~~~ LUCB ~~~~~~~~~~~~~~~~~~~~~~~')

        bta = 0.5
        top2 = beta_top2(mu, K, np.copy(mu_hat), np.copy(n_offline), dist, delta, bta)
        print(top2)
        if(top2[0][0] != np.argmax(mu)):
            mistake_beta_top2 += 1
        result_oo_beta_top2[i,j] = top2[1]
        print(result_oo_beta_top2[i,j], mistake_beta_top2)

        print(i,j,'~~~~~~~~~~~~~~~~~~~~~~~ bta top2, bta = 0.5 ~~~~~~~~~~~~~~~~~~~~~~~')

        optimal_top2 = opt_top2(mu, K, np.copy(mu_hat), np.copy(n_offline), dist, delta)
        print(optimal_top2)
        if(optimal_top2[0][0] != np.argmax(mu)):
            mistake_optimal_top2 += 1
        result_oo_optimal_top2[i,j] = optimal_top2[1]
        print(result_oo_optimal_top2[i,j], mistake_optimal_top2)

        print(i,j,'~~~~~~~~~~~~~~~~~~~~~~~ optimal top2 ~~~~~~~~~~~~~~~~~~~~~~~')

In [None]:
# create pandas dataframe
import pandas as pd
import seaborn
seaborn.set(style="ticks")

# One row per (offline budget, repetition); the x value is K times the entry of ns.
ns_col = K*np.repeat(np.asarray(ns), reps).astype(int)
dataset = pd.DataFrame({'offline samples': ns_col})
dataset['Batched TaS'] = result_oo.reshape(-1)
dataset['Proposed Top2'] = result_oo_optimal_top2.reshape(-1)

# Reshape to long form: one row per (run, algorithm) with its stopping time.
indexed = dataset.set_index('offline samples', append=True)  # budget becomes part of the index
long_form = indexed.stack().to_frame().reset_index()         # algorithm columns become rows
long_form = long_form.rename(columns={'level_2': 'quantity', 0: 'stopping time'})
long_form = long_form.drop('level_0', axis='columns')        # drop the original row number

ax = seaborn.boxplot(data=long_form, x='offline samples', y='stopping time', hue='quantity')
seaborn.set(rc={'figure.figsize':(6.4, 4.8)})
seaborn.despine(trim=True)

In [None]:
# create pandas dataframe
import pandas as pd
import seaborn
seaborn.set(style="ticks")

# One row per (offline budget, repetition); the x value is K times the entry of ns.
ns_col = K*np.repeat(np.asarray(ns), reps).astype(int)
dataset = pd.DataFrame({'offline samples': ns_col})
dataset['Batched TaS'] = result_oo.reshape(-1)
dataset['Proposed Top2'] = result_oo_optimal_top2.reshape(-1)
dataset['Beta Top2'] = result_oo_beta_top2.reshape(-1)

# Reshape to long form: one row per (run, algorithm) with its stopping time.
indexed = dataset.set_index('offline samples', append=True)  # budget becomes part of the index
long_form = indexed.stack().to_frame().reset_index()         # algorithm columns become rows
long_form = long_form.rename(columns={'level_2': 'quantity', 0: 'stopping time'})
long_form = long_form.drop('level_0', axis='columns')        # drop the original row number

ax = seaborn.boxplot(data=long_form, x='offline samples', y='stopping time', hue='quantity')
seaborn.set(rc={'figure.figsize':(6.4, 4.8)})
seaborn.despine(trim=True)

In [None]:
# create pandas dataframe
import pandas as pd
import seaborn
seaborn.set(style="ticks")

# One row per (offline budget, repetition); the x value is K times the entry of ns.
ns_col = K*np.repeat(np.asarray(ns), reps).astype(int)
dataset = pd.DataFrame({'offline samples': ns_col})
dataset['Batched TaS'] = result_oo.reshape(-1)
dataset['Beta Top2'] = result_oo_beta_top2.reshape(-1)

# Reshape to long form: one row per (run, algorithm) with its stopping time.
indexed = dataset.set_index('offline samples', append=True)  # budget becomes part of the index
long_form = indexed.stack().to_frame().reset_index()         # algorithm columns become rows
long_form = long_form.rename(columns={'level_2': 'quantity', 0: 'stopping time'})
long_form = long_form.drop('level_0', axis='columns')        # drop the original row number

ax = seaborn.boxplot(data=long_form, x='offline samples', y='stopping time', hue='quantity')
seaborn.set(rc={'figure.figsize':(6.4, 4.8)})
seaborn.despine(trim=True)

### Results for Batched TaS

In [None]:
import os
from pathlib import Path
from google.colab import drive
drive.mount('/content/drive')

#path = r"/content/drive/My Drive/OfflineOnline"
#os.chdir(path)

# one box per offline budget: the stopping times of all repetitions
online_samples_list = [result_oo[i, :] for i in range(len(ns))]

# set before the figure is created so that it applies to this figure too
plt.rcParams["figure.figsize"] = (6.4, 4.8)
fig = plt.figure()

# Creating axes instance
ax = fig.add_axes([0, 0, 1, 1])

# Creating plot
bp = ax.boxplot(online_samples_list)
# Box i sits at x = i (1-based). Positions and labels both follow ns, so
# the plot stays correct when the list of budgets changes.
plt.xticks(list(range(1, len(ns) + 1)), [K*x for x in ns])
plt.xlabel('Offline samples', fontsize=15)
plt.ylabel('Stopping Time (Batched TaS)', fontsize=15)
plt.xticks(fontsize = 15)
plt.yticks(fontsize = 15)
#plt.yscale("log")
plt.title('Bandits with {} arms'.format(K), fontsize=20)

plt.savefig("/content/drive/My Drive/OfflineOnline/poorly_explored_BatchedTaS_Gauss10arms.pdf", format="pdf", bbox_inches="tight")

# show plot
plt.show()

### Results for beta_top2

In [None]:
# leader probability used in the experiment, shown in the y-label
bta = 0.5

# one box per offline budget: the stopping times of all repetitions
online_samples_list = [result_oo_beta_top2[i, :] for i in range(len(ns))]

# set before the figure is created so that it applies to this figure too
plt.rcParams["figure.figsize"] = (6.4, 4.8)
fig = plt.figure()

# Creating axes instance
ax = fig.add_axes([0, 0, 1, 1])

# Creating y-label
y_str = 'Stopping Time (top2 with beta = '+str(bta)+')'

# Creating plot
bp = ax.boxplot(online_samples_list)
# Box i sits at x = i (1-based). Positions and labels both follow ns, so
# the plot stays correct when the list of budgets changes.
plt.xticks(list(range(1, len(ns) + 1)), [K*x for x in ns])
plt.xlabel('Offline samples', fontsize=15)
plt.ylabel(y_str, fontsize=15)
plt.xticks(fontsize = 15)
plt.yticks(fontsize = 15)
#plt.yscale("log")
plt.title(r'Bandits with {} arms'.format(K), fontsize=20)


plt.savefig("/content/drive/My Drive/OfflineOnline/poorly_explored_2Top2_Gauss10arms.pdf", format="pdf", bbox_inches="tight")
#plt.savefig("OptTop2/2Top2_Gauss10arms.pdf", format="pdf", bbox_inches="tight")

# show plot
plt.show()

### Results for opt_top2

In [None]:

bta = 0.5

# one box per offline budget: the stopping times of all repetitions
online_samples_list = [result_oo_optimal_top2[i, :] for i in range(len(ns))]

# set before the figure is created so that it applies to this figure too
plt.rcParams["figure.figsize"] = (6.4, 4.8)
fig = plt.figure()

# Creating axes instance
ax = fig.add_axes([0, 0, 1, 1])

# Creating y-label
y_str = 'Stopping Time (optimal top2)'

# Creating plot
bp = ax.boxplot(online_samples_list)
# Box i sits at x = i (1-based). Positions and labels both follow ns, so
# the plot stays correct when the list of budgets changes.
plt.xticks(list(range(1, len(ns) + 1)), [K*x for x in ns])
plt.xlabel('Offline samples', fontsize=15)
plt.ylabel(y_str, fontsize=15)
plt.xticks(fontsize = 15)
plt.yticks(fontsize = 15)
#plt.yscale("log")
plt.title(r'Bandits with {} arms'.format(K), fontsize=20)


plt.savefig("/content/drive/My Drive/OfflineOnline/poorly_explored_OptTop2_Gauss10arns.pdf", format="pdf", bbox_inches="tight")
#plt.savefig("OptTop2/OptTop2_Gauss10arns.pdf", format="pdf", bbox_inches="tight")

# show plot
plt.show()

### Results for LUCB

In [None]:
# The first offline budget is left out of this plot, as before.
first = 1
plotted_ns = ns[first:]

# one box per plotted offline budget
online_samples_list = [result_oo_lucb[i, :] for i in range(first, len(ns))]

# set before the figure is created so that it applies to this figure too
plt.rcParams["figure.figsize"] = (6.4, 4.8)
fig = plt.figure()

# Creating axes instance
ax = fig.add_axes([0, 0, 1, 1])

# Creating plot
bp = ax.boxplot(online_samples_list)
# Tick positions and labels are derived from the budgets that are actually
# plotted. Previously 8 ticks labelled from ns[0] were set for 7 boxes, so
# every box carried the label of the preceding budget.
plt.xticks(list(range(1, len(plotted_ns) + 1)), [K*x for x in plotted_ns])
plt.xlabel('Offline samples (well explored offline data)', fontsize=15)
plt.ylabel('Stopping Time (LUCB)', fontsize=15)
plt.xticks(fontsize = 15)
plt.yticks(fontsize = 15)
plt.yscale("log")
plt.title(r'Bandits with {} arms'.format(K), fontsize=20)


#plt.savefig("LUCB_Bernoulli10.pdf", format="pdf", bbox_inches="tight")

# show plot
plt.show()