In [1]:
import logging
from pathlib import Path
import pickle
import sys

import matplotlib.pyplot as plt
from obp.dataset.real import OpenBanditDataset
from obp.ope import ReplayMethod, InverseProbabilityWeighting, SelfNormalizedInverseProbabilityWeighting, DirectMethod
from obp.policy import EpsilonGreedy, BernoulliTS, Random, LinEpsilonGreedy
from sklearn.linear_model import LogisticRegression

from sd_bandits.experiment import OBDExperiment
from sd_bandits.obp_extensions.policy import ExploreThenCommit, KLUpperConfidenceBound

In [2]:
# Route INFO-and-above log records to stdout so that experiment progress shows
# up in the cell output.
# The timestamp uses the portable 24-hour "%H:%M:%S" directives: flags such as
# "%-I" are platform-specific strftime extensions (unavailable on Windows), and
# a 12-hour clock without AM/PM makes long-running logs ambiguous.
logging.basicConfig(
    format="%(asctime)s %(levelname)s: %(message)s",
    datefmt="%H:%M:%S",
    level=logging.INFO,
    handlers=[logging.StreamHandler(sys.stdout)],
)

# Ad hoc ZOZO experiments: off-policy evaluation on the Open Bandit Dataset


In [3]:
# Load the Open Bandit Dataset from the repository's data directory, selecting
# the logs of the "random" behavior policy across "all" campaigns.
dataset_dir = Path("../data/open_bandit_dataset/")
obp_dataset = OpenBanditDataset(
    data_path=dataset_dir,
    campaign="all",
    behavior_policy="random",
)

  mask |= (ar1 == a)


In [4]:
# Keyword arguments common to every policy below: `n_actions` and `len_list`
# are read from the loaded dataset, while the batch size and seed are fixed.
shared_policy_kwargs = dict(
    n_actions=obp_dataset.n_actions,
    len_list=obp_dataset.len_list,
    batch_size=1,
    random_state=1,
)

# Policies to simulate, in the order they are run by the experiment.
policies_basic = [
    # Epsilon-greedy with a small and a larger epsilon.
    EpsilonGreedy(
        epsilon=0.01, policy_name="egreedy_exploit", **shared_policy_kwargs
    ),
    EpsilonGreedy(
        epsilon=0.1, policy_name="egreedy_explore", **shared_policy_kwargs
    ),
    # Bernoulli Thompson sampling with alpha=1 and two different beta values.
    BernoulliTS(alpha=1, beta=1, policy_name="ts_naive", **shared_policy_kwargs),
    BernoulliTS(
        alpha=1, beta=100, policy_name="ts_pessimistic", **shared_policy_kwargs
    ),
    # Linear epsilon-greedy. No policy_name is passed here; the recorded run
    # log lists these as "linear_epsilon_greedy_0.01" / "linear_epsilon_greedy_0.1".
    # NOTE(review): dim=26 is hard-coded — presumably the context dimension of
    # this dataset; confirm before reusing with other data.
    LinEpsilonGreedy(dim=26, epsilon=0.01, **shared_policy_kwargs),
    LinEpsilonGreedy(dim=26, epsilon=0.1, **shared_policy_kwargs),
    # Explore-then-commit with two different min_n settings.
    ExploreThenCommit(min_n=20, policy_name="etc_exploit", **shared_policy_kwargs),
    ExploreThenCommit(min_n=100, policy_name="etc_explore", **shared_policy_kwargs),
    # NOTE(review): "kl_ucp" looks like a typo for "kl_ucb"; it is kept as-is
    # because the name is used as a key in the experiment results.
    KLUpperConfidenceBound(policy_name="kl_ucp", **shared_policy_kwargs),
]

In [5]:
# Estimators from obp.ope that the experiment applies to each policy.
ope_estimators = [
    ReplayMethod(),
    InverseProbabilityWeighting(),
    SelfNormalizedInverseProbabilityWeighting(),
    DirectMethod(),
]

# Model passed to the experiment as its `regression_base_model`.
reward_regressor = LogisticRegression(C=1000, max_iter=10000, random_state=12345)

# Every policy is paired with an empty dict.
# NOTE(review): the dict presumably carries per-policy options for
# OBDExperiment — confirm against its signature.
policy_specs = [(candidate, {}) for candidate in policies_basic]

replay_experiment_basic = OBDExperiment(
    dataset=obp_dataset,
    policies=policy_specs,
    estimators=ope_estimators,
    regression_base_model=reward_regressor,
)


In [6]:
# Run the whole experiment. This is slow: in the recorded log, fitting the
# regression model took ~343 s and the nine policy simulations ~2771 s, with
# the two LinEpsilonGreedy runs taking roughly 20 minutes each.
replay_experiment_basic.run_experiment()

5:18:04 INFO: Running experiment
5:18:04 INFO: Obtaining logged feedback
5:18:04 INFO: Done in 0.0 seconds
5:18:04 INFO: Fitting regression model
5:23:47 INFO: Done in 342.61 seconds
5:23:47 INFO: Running simulations
5:23:47 INFO: [1 of 9] Running simulation for egreedy_exploit


100%|██████████| 1374327/1374327 [00:19<00:00, 72071.18it/s]


5:24:10 INFO: [2 of 9] Running simulation for egreedy_explore


100%|██████████| 1374327/1374327 [00:23<00:00, 59587.99it/s]


5:24:38 INFO: [3 of 9] Running simulation for ts_naive


100%|██████████| 1374327/1374327 [00:59<00:00, 23067.26it/s]


5:25:42 INFO: [4 of 9] Running simulation for ts_pessimistic


100%|██████████| 1374327/1374327 [01:01<00:00, 22515.04it/s]


5:26:48 INFO: [5 of 9] Running simulation for linear_epsilon_greedy_0.01


100%|██████████| 1374327/1374327 [21:48<00:00, 1050.39it/s]


5:48:41 INFO: [6 of 9] Running simulation for linear_epsilon_greedy_0.1


100%|██████████| 1374327/1374327 [19:32<00:00, 1172.35it/s]


6:08:17 INFO: [7 of 9] Running simulation for etc_exploit


100%|██████████| 1374327/1374327 [00:33<00:00, 41522.43it/s]


6:08:55 INFO: [8 of 9] Running simulation for etc_explore


100%|██████████| 1374327/1374327 [00:33<00:00, 40916.62it/s]


6:09:32 INFO: [9 of 9] Running simulation for kl_ucp


100%|██████████| 1374327/1374327 [00:20<00:00, 65774.36it/s]


6:09:58 INFO: Done in 2770.86 seconds
6:09:58 INFO: Estimating rewards
6:09:58 INFO: [1 of 4] Estimator ReplayMethod
6:09:58 INFO:   [1 of 10] Estimating reward confidence interval for logged
6:10:01 INFO:   [2 of 10] Estimating rewards and reward confidence interval for egreedy_exploit
6:10:07 INFO:   [3 of 10] Estimating rewards and reward confidence interval for egreedy_explore
6:10:13 INFO:   [4 of 10] Estimating rewards and reward confidence interval for ts_naive
6:10:18 INFO:   [5 of 10] Estimating rewards and reward confidence interval for ts_pessimistic
6:10:23 INFO:   [6 of 10] Estimating rewards and reward confidence interval for linear_epsilon_greedy_0.01
6:10:29 INFO:   [7 of 10] Estimating rewards and reward confidence interval for linear_epsilon_greedy_0.1
6:10:34 INFO:   [8 of 10] Estimating rewards and reward confidence interval for etc_exploit
6:10:39 INFO:   [9 of 10] Estimating rewards and reward confidence interval for etc_explore
6:10:45 INFO:   [10 of 10] Estimati

In [7]:
# Persist the experiment output next to the notebook directory so the long
# run above does not have to be repeated to inspect the results.
results_path = Path("../results_obp_basic.pickle")
with results_path.open("wb") as results_file:
    pickle.dump(replay_experiment_basic.output, results_file)

In [8]:
# Display the raw results. In the recorded output this is a dict whose
# 'policy_feedback' entry maps each policy name to per-estimator reward arrays.
replay_experiment_basic.output

{'policy_feedback': {'logged': {'n_rounds': 1374327,
   'n_actions': 80,
   'reward': array([0, 0, 0, ..., 0, 0, 0])},
  'egreedy_exploit': {'reward': {'ReplayMethod': array([0., 0., 0., ..., 0., 0., 0.]),
    'InverseProbabilityWeighting': array([0., 0., 0., ..., 0., 0., 0.]),
    'SelfNormalizedInverseProbabilityWeighting': array([0., 0., 0., ..., 0., 0., 0.]),
    'DirectMethod': array([0.00603657, 0.00520461, 0.00470689, ..., 0.00232626, 0.00187273,
           0.00306799])}},
  'egreedy_explore': {'reward': {'ReplayMethod': array([0., 0., 0., ..., 0., 0., 0.]),
    'InverseProbabilityWeighting': array([0., 0., 0., ..., 0., 0., 0.]),
    'SelfNormalizedInverseProbabilityWeighting': array([0., 0., 0., ..., 0., 0., 0.]),
    'DirectMethod': array([0.00603657, 0.00520461, 0.00470689, ..., 0.00216984, 0.00230421,
           0.00281619])}},
  'ts_naive': {'reward': {'ReplayMethod': array([0., 0., 0., ..., 0., 0., 0.]),
    'InverseProbabilityWeighting': array([0., 0., 0., ..., 0., 0., 0.