In [1]:
import numpy as np
from UCB_discrete import UCB_discrete, UCB_os_gau, UCB_os_exp, UCB1_os, UCB_os_comb
from collections import defaultdict
import Environment
from Environment import AbsGau, Exp, Comb, AbsGau_Outlier, Exp_Outlier
from SimulatedGames import simulate
from ipywidgets import IntProgress
from IPython.display import display
import matplotlib.pyplot as plt

%matplotlib inline

# UCB for order statistics

This notebook shows empirically that the idea of using order statistics to design UCB policies works. The experiments are run on simulated data, for two special reward distributions analysed in the [notes](https://github.com/chengsoonong/eheye/tree/master/writing/QuantUCB_orderstat), plus their combination: 

(1) Absolute value of centered Gaussian rewards  
(2) Exponential rewards  
(3) Arbitrary distributions (combining 1 and 2)

For each case, we show the results for both the estimated parameter and the true parameter.

Settings: 3 independent arms, 1000 rounds with 100 independent experiments (matching `num_rounds` and `num_exper` in the settings cell below). Evaluated by 
a. expected sub-optimal draws.
b. the percent of best arm selected. 

Policy (Empirical policy): In round $t+1$, select the arm with index
$$\operatorname*{argmax}_{i \in \mathcal{K}} \hat{m}_{i, T_i(t)} + \beta(\sqrt{2v_t \varepsilon} + 2 \varepsilon \sqrt{v_t/T_i(t)})$$

where $\hat{m}_{i, T_i(t)}$ is the empirical median of arm $i$ at round $t$, $\varepsilon = \alpha \log t$, and $v_t$ depends on the reward distribution. $T_i(t)$ is the number of times arm $i$ has been played up to round $t$. 

We fix the parameters $\alpha, \beta$ in this notebook; see [here](https://github.com/chengsoonong/eheye/blob/master/UCB/OS_UCB/ucb_os_hyperparameter_tuning_outliers_testing.ipynb) for experiments on tuning these parameters, [here](https://github.com/chengsoonong/eheye/blob/master/UCB/OS_UCB/Variance_Sensitiveness_test.ipynb) for experiments testing the policy's sensitivity to different variances of the reward distribution, and [here](https://github.com/chengsoonong/eheye/blob/master/UCB/OS_UCB/Sanity_test.ipynb) for some sanity tests of our policy.  

In [2]:
# Experiment settings
#
# Everything a reader might want to tune lives in this cell: the random seed,
# the problem size, the reward environments, the policy under test and its
# hyper-parameters.

# Seed NumPy's global RNG so that Restart & Run All reproduces the same numbers.
# NOTE(review): assumes Environment / SimulatedGames / the policies sample
# through numpy's global RNG -- TODO confirm; if they own separate generators,
# those need seeding as well.
SEED = 0
np.random.seed(SEED)

num_rounds = 1000   # rounds played in each experiment
num_exper = 100     # independent experiments per environment
num_arms = 3        # arms per environment

# Reward environments.
#
# One dict per bandit instance, mapping a reward class to a list of parameters;
# every instance below lists three parameters in total, matching num_arms.
# The literals are kept exactly as written (e.g. `1` vs `1.0`) because their
# string form shows up in the keys of `rewards_env` / `medians`.
# NOTE(review): judging from the medians displayed further down, the parameter
# is presumably the standard deviation for AbsGau and the rate for Exp --
# confirm against the Environment module.
environments = [
    {AbsGau: [0.5, 1.0, 1.5]},
    {Exp:    [2.0, 1.0, 1.5]},
    {AbsGau_Outlier: [0.5, 1, 1.5]},
    {Exp_Outlier: [2.0, 1, 1.5]},
    {AbsGau: [0.5], Exp: [1.0, 1.5]},
    {AbsGau_Outlier: [0.5], Exp_Outlier: [1.0, 1.5]},
]

rewards_env, medians = Environment.setup_env(num_arms, environments)

# Policy under test. The same policy class is used for every environment;
# the distribution-specific variants imported at the top of the notebook
# (UCB_os_gau, UCB_os_exp, UCB_os_comb) are the alternatives.
policy = UCB_discrete

# Hyper-parameters for the policy: one inner list per configuration to run.
# NOTE(review): presumably [alpha, beta] from the index formula in the
# introduction -- confirm against UCB_discrete.
hyperpara_list = [[0.5, 0.2]]

# Evaluation codes forwarded to simulate().
# NOTE(review): their exact meaning is defined in SimulatedGames -- TODO confirm.
evaluation = ['sd', 'r', 'bd']

In [3]:
# Show the arm objects built by Environment.setup_env, grouped under one key per environment.
rewards_env

defaultdict(list,
            {'AbsGau_[0.5, 1.0, 1.5]': [<Environment.AbsGau at 0x7f7cf15555c0>,
              <Environment.AbsGau at 0x7f7cf15555f8>,
              <Environment.AbsGau at 0x7f7cf15557f0>],
             'Exp_[2.0, 1.0, 1.5]': [<Environment.Exp at 0x7f7cf1555588>,
              <Environment.Exp at 0x7f7cf1555940>,
              <Environment.Exp at 0x7f7cf15559e8>],
             'AbsGau_Outlier_[0.5, 1, 1.5]': [<Environment.AbsGau_Outlier at 0x7f7cf1555470>,
              <Environment.AbsGau_Outlier at 0x7f7cf1555b00>,
              <Environment.AbsGau_Outlier at 0x7f7cf1555ba8>],
             'Exp_Outlier_[2.0, 1, 1.5]': [<Environment.Exp_Outlier at 0x7f7cf1555c50>,
              <Environment.Exp_Outlier at 0x7f7cf1555cf8>,
              <Environment.Exp_Outlier at 0x7f7cf1555da0>],
             'AbsGau_[0.5]Exp_[1.0, 1.5]': [<Environment.AbsGau at 0x7f7cf1555eb8>,
              <Environment.Exp at 0x7f7cf1555f28>,
              <Environment.Exp at 0x7f7cf1555fd0>],
   

In [4]:
# Show the medians returned by Environment.setup_env: one list per environment key, one value per arm.
medians

defaultdict(list,
            {'AbsGau_[0.5, 1.0, 1.5]': [0.33768250246487946,
              0.6779741829902828,
              1.0482185901809726],
             'Exp_[2.0, 1.0, 1.5]': [0.3442511163834113,
              0.6945948508548029,
              0.463267103155804],
             'AbsGau_Outlier_[0.5, 1, 1.5]': [0.36232618932374916,
              0.7103015016924226,
              1.071861730649036],
             'Exp_Outlier_[2.0, 1, 1.5]': [0.37474706195628593,
              0.7474124253143246,
              0.501862807963664],
             'AbsGau_[0.5]Exp_[1.0, 1.5]': [0.3345746092000958,
              0.7032169736281475,
              0.4767995009321356],
             'AbsGau_Outlier_[0.5]Exp_Outlier_[1.0, 1.5]': [0.3521068672347205,
              0.7379046813313457,
              0.4874996239659134]})

In [5]:
# Run the policy under test on every environment.
#
# Layout of `results`:
#   results["<env key>_<num_exper>_<num_rounds>"]["<est_flag><hyperpara>"]
#       -> first value returned by simulate()
#   results["<env key>_<num_exper>_<num_rounds>"]["bound"]
#       -> second value returned by simulate() (overwritten on every run
#          of the same environment)
results = defaultdict(dict)

for key, arms in rewards_env.items():
    name = f"{key}_{num_exper}_{num_rounds}"
    for hyperpara in hyperpara_list:
        # Only the est_flag=False setting is run here.
        for est_flag in (False,):
            subname = f"{est_flag}{hyperpara}"
            print(name + subname)

            # One progress bar per run; simulate() receives it as its last argument.
            p = IntProgress(max=num_exper, description='Running')
            display(p)

            outcome, bound = simulate(
                arms, medians[key], policy, num_exper, num_rounds,
                est_flag, hyperpara, evaluation, p,
            )
            results[name][subname] = outcome
            results[name]['bound'] = bound

AbsGau_[0.5, 1.0, 1.5]_100_1000False[0.5, 0.2]


IntProgress(value=0, description='Running')

Exp_[2.0, 1.0, 1.5]_100_1000False[0.5, 0.2]


IntProgress(value=0, description='Running')

AbsGau_Outlier_[0.5, 1, 1.5]_100_1000False[0.5, 0.2]


IntProgress(value=0, description='Running')

Exp_Outlier_[2.0, 1, 1.5]_100_1000False[0.5, 0.2]


IntProgress(value=0, description='Running')

AbsGau_[0.5]Exp_[1.0, 1.5]_100_1000False[0.5, 0.2]


IntProgress(value=0, description='Running')

AbsGau_Outlier_[0.5]Exp_Outlier_[1.0, 1.5]_100_1000False[0.5, 0.2]


IntProgress(value=0, description='Running')

In [6]:
#with open('os_saving.pickle', 'rb') as handle:
#    b = pickle.load(handle)

In [7]:

# Baseline: run UCB1_os on the same environments and store its outcome next
# to the earlier runs in `results`, under the sub-key "UCB1_<hyperpara>".
#
# NOTE: this rebinds `hyperpara_list` from the settings cell, so re-running
# the previous simulation cell after this one would pick up [[1]].
hyperpara_list = [[1]]
est_var = False

for key, arms in rewards_env.items():
    name = f"{key}_{num_exper}_{num_rounds}"
    for hyperpara in hyperpara_list:
        subname = f"UCB1_{hyperpara}"
        print(name + subname)

        # One progress bar per run; simulate() receives it as its last argument.
        p = IntProgress(max=num_exper, description='Running')
        display(p)

        # The second return value is kept in `bounds` but not stored in `results`.
        results[name][subname], bounds = simulate(
            arms, medians[key], UCB1_os, num_exper, num_rounds,
            est_var, hyperpara, evaluation, p,
        )
    


AbsGau_[0.5, 1.0, 1.5]_100_1000UCB1_[1]


IntProgress(value=0, description='Running')

Exp_[2.0, 1.0, 1.5]_100_1000UCB1_[1]


IntProgress(value=0, description='Running')

AbsGau_Outlier_[0.5, 1, 1.5]_100_1000UCB1_[1]


IntProgress(value=0, description='Running')

Exp_Outlier_[2.0, 1, 1.5]_100_1000UCB1_[1]


IntProgress(value=0, description='Running')

AbsGau_[0.5]Exp_[1.0, 1.5]_100_1000UCB1_[1]


IntProgress(value=0, description='Running')

AbsGau_Outlier_[0.5]Exp_Outlier_[1.0, 1.5]_100_1000UCB1_[1]


IntProgress(value=0, description='Running')

In [8]:
# Persist every result collected so far to disk.
# Only unpickle this file from a trusted location: pickle.load can execute
# arbitrary code.
import pickle

saving = results

with open('os_saving_6.pickle', 'wb') as handle:
    # Third positional argument is the pickle protocol.
    pickle.dump(saving, handle, pickle.HIGHEST_PROTOCOL)

In [9]:
# Disabled plotting snippet kept as a string literal: nothing in it is executed,
# and because the string is the cell's last expression its text is echoed as the
# cell output. `estimated_para` is not defined in any of the cells above.
'''
for key, value in estimated_para.items():
    x = np.arange(0, len(value))
    plt.plot(x,value,label = key)
    plt.plot(x, np.ones_like(x) * 1.0/environments[Exp][key], label = str(key) + ' true' )
plt.legend()
'''

"\nfor key, value in estimated_para.items():\n    x = np.arange(0, len(value))\n    plt.plot(x,value,label = key)\n    plt.plot(x, np.ones_like(x) * 1.0/environments[Exp][key], label = str(key) + ' true' )\nplt.legend()\n"