In [1]:
import numpy as np
from UCB_discrete import UCB_os_gau, UCB_os_exp, UCB1_os, UCB_os_comb
from collections import defaultdict
import Environment
from Environment import AbsGau, Exp, Comb
from SimulatedGames import simulate
from ipywidgets import IntProgress
from IPython.display import display

%matplotlib inline

# UCB for order statistics

This notebook is designed to show that the idea of using order statistics to design UCB policies works empirically. The experiments are run on simulated data, for the two special reward distributions treated in the [notes](https://github.com/chengsoonong/eheye/tree/master/writing/QuantUCB_orderstat), plus a combination of the two: 

(1) Absolute value of centered Gaussian rewards  
(2) Exponential rewards  
(3) Arbitrary distributions (combining 1 and 2)

For each case, we show the results for both the estimated parameter and the true parameter.

Settings: 3 independent arms, 1000 rounds (`num_rounds`) with 50 independent experiments (`num_exper`). Evaluated by 
a. expected sub-optimal draws.
b. the percent of best arm selected. 

Policy (Empirical policy): In round t+1, select the arm with index, 
$$\arg\max_{i \in \mathcal{K}} \hat{m}_{i, T_i(t)} + \beta(\sqrt{2v_t \varepsilon} + 2 \varepsilon \sqrt{v_t/T_i(t)})$$

where $\hat{m}_{i, T_i(t)}$ is the empirical median of arm $i$ at round $t$, $\varepsilon = \alpha \log t$, and $v_t$ depends on the reward distribution. $T_i(t)$ is the number of times arm $i$ has been played up to round $t$. 

We fix the parameters $\alpha, \beta$ in this notebook; see [here](https://github.com/chengsoonong/eheye/blob/master/UCB/OS_UCB/ucb_os_hyperparameter_tuning_outliers_testing.ipynb) for experiments on tuning these parameters, [here](https://github.com/chengsoonong/eheye/blob/master/UCB/OS_UCB/Variance_Sensitiveness_test.ipynb) for experiments testing the policy's sensitivity to different variances of the reward distribution, and [here](https://github.com/chengsoonong/eheye/blob/master/UCB/OS_UCB/Sanity_test.ipynb) for some sanity tests of our policy.  

In [2]:
# setting

# Seed NumPy's global random state so that a fresh "Restart & Run All"
# reproduces the same simulated results.
# NOTE(review): assumes Environment / SimulatedGames draw their samples through
# np.random's global state -- confirm; otherwise pass the seed to them directly.
SEED = 0
np.random.seed(SEED)

num_rounds = 1000   # rounds played in each experiment
num_exper = 50      # independent repetitions of each experiment
num_arms = 3        # arms per environment

# environment
# Maps a reward-distribution class to a list of three numbers (presumably one
# distribution parameter per arm -- confirm against Environment.setup_env).
# The AbsGau and Exp entries are currently disabled; only Comb is run.
environments = {#AbsGau: [0.7, 0.8, 0.9], 
                #Exp: [0.7, 1, 1.2],
                Comb: [0.7, 0.7, 1]
               }

# setup_env returns two objects that the cells below index by environment key.
rewards_env, medians = Environment.setup_env(num_arms, environments)

# policy
# Order-statistics UCB policy to use for each environment key.
policy = {'AbsGau': UCB_os_gau,
          'Exp': UCB_os_exp,
          'Comb': UCB_os_comb
         }

# hyper-parameters for policy
# Each entry is passed unchanged to simulate(); presumably [alpha, beta] from
# the index formula in the introduction -- TODO confirm the order.
hyperpara_list = [[1, 0.2]]

# Evaluation codes forwarded to simulate(); their meaning is defined there
# (presumably 'sd' = sub-optimal draws, 'r' = regret, 'bd' = best-arm draws
# -- verify against SimulatedGames).
evaluation = ['sd', 'r', 'bd']

In [3]:
# Run the matching order-statistics UCB policy on every environment, for each
# hyper-parameter setting, once with est_var=True and once with est_var=False
# (presumably estimated vs. true distribution parameter -- see the intro).
# Outcomes are collected in `results`, keyed first by the experiment name
# "<env>_<num_exper>_<num_rounds>" and then by "<est_var><hyperpara>".
results = defaultdict(dict)

for key in rewards_env:
    # The experiment name depends only on the environment, so build it once.
    name = f"{key}_{num_exper}_{num_rounds}"
    for hyperpara in hyperpara_list:
        for est_var in (True, False):
            subname = f"{est_var}{hyperpara}"
            print(name + subname)

            # One progress bar per run; it is handed to simulate() as the
            # last argument.
            p = IntProgress(max=num_exper, description='Running')
            display(p)

            outcome, bound = simulate(
                rewards_env[key], medians[key], policy[key],
                num_exper, num_rounds, est_var, hyperpara, evaluation, p,
            )
            results[name][subname] = outcome
            # The 'bound' entry is overwritten by every run for this environment.
            results[name]['bound'] = bound

Comb_50_1000True[1, 0.2]


IntProgress(value=0, description='Running', max=50)

Comb_50_1000False[1, 0.2]


IntProgress(value=0, description='Running', max=50)

In [4]:
#with open('os_saving.pickle', 'rb') as handle:
#    b = pickle.load(handle)

In [5]:

# Baseline comparison: run the UCB1_os policy on each environment with the
# same number of experiments and rounds, and store the outcome alongside the
# order-statistics results under the "UCB1_<hyperpara>" sub-key.
hyperpara = [1, 1]
est_var = False

for key in rewards_env:
    name = f"{key}_{num_exper}_{num_rounds}"
    subname = f"UCB1_{hyperpara}"
    print(name + subname)

    p = IntProgress(max=num_exper, description='Running')
    display(p)

    # simulate() returns a pair; only the first element is stored in `results`.
    outcome, bounds = simulate(
        rewards_env[key], medians[key], UCB1_os,
        num_exper, num_rounds, est_var, hyperpara, evaluation, p,
    )
    results[name][subname] = outcome
    


Comb_50_1000UCB1_[1, 1]


IntProgress(value=0, description='Running', max=50)

In [6]:
# Persist everything collected in `results` to 'os_saving.pickle' in the
# current working directory, using the highest pickle protocol available.
import pickle

saving = results

with open('os_saving.pickle', 'wb') as out_file:
    pickle.dump(saving, out_file, protocol=pickle.HIGHEST_PROTOCOL)