In [1]:
import logging
from pathlib import Path
import sys

from obp.dataset.real import OpenBanditDataset
from obp.ope import DirectMethod, ReplayMethod
from obp.policy import EpsilonGreedy
from sklearn.linear_model import LinearRegression

from sd_bandits.obp_extensions.dataset import DeezerDataset
from sd_bandits.experiment import DeezerExperiment, OBDExperiment

In [2]:
# Route INFO-level progress messages from the experiments to stdout so that
# they are rendered as part of each cell's output.
logging.basicConfig(
    format="%(asctime)s %(levelname)s: %(message)s",
    level=logging.INFO,
    stream=sys.stdout,
    # 24-hour clock: unambiguous without an AM/PM marker, and it only uses
    # standard strftime directives ("%-I" is a platform-specific extension
    # that is not available everywhere, e.g. on Windows).
    datefmt="%H:%M:%S",
)

# Experiment interface

In this notebook, we show off the `Experiment` interface, which is implemented as `OBDExperiment` and `DeezerExperiment`. It provides uniform instantiation and a `run_experiment` method to make it easier for our script to run lots of experiments.

## 1. An experiment on the OBD dataset with two epsilon-greedy policies and the direct method

OBD experiments use the Open Bandit Dataset (OBD), i.e. ZOZO's logged bandit feedback. Policies are learned offline using the `run_bandit_simulation` function from the Open Bandit Pipeline (OBP), and are then evaluated using off-policy estimators. In this experiment, we use an off-policy estimator which requires a regression model.

In [3]:
# Load the Open Bandit Dataset from the local data directory (path is relative
# to this notebook). The same dataset object is reused by experiments 1 and 2.
# NOTE(review): presumably behavior_policy/campaign select which slice of the
# logged data is loaded — confirm against the OBP documentation.
obp_dataset = OpenBanditDataset(
    behavior_policy="random",
    campaign="all",
    data_path=Path("../data/open_bandit_dataset/"),
)


  mask |= (ar1 == a)


In [4]:
# Two epsilon-greedy policies that are identical except for their exploration
# rate. The epsilon value is embedded in each policy's name so the two can be
# told apart in the log messages and in the results dictionary.
policies = [
    EpsilonGreedy(
        n_actions=obp_dataset.n_actions,
        len_list=obp_dataset.len_list,
        batch_size=1,
        random_state=0,
        epsilon=eps,
        policy_name=f"egreedy_{eps}",
    )
    for eps in (0.05, 0.7)
]

In [5]:
# Experiment 1: evaluate both policies on the OBD data with the DirectMethod
# estimator, supplying LinearRegression as the regression base model.
obd_experiment = OBDExperiment(
    dataset=obp_dataset,
    policies=policies,
    estimator=DirectMethod(),
    regression_base_model=LinearRegression(),
)

In [6]:
# Run the whole experiment. Per the log output below, the stages are: obtain
# logged feedback, fit the regression model, simulate each policy, and
# estimate rewards. The recorded run took ~816 s, most of it in reward estimation.
obd_experiment.run_experiment()

12:11:01 INFO: Obtaining logged feedback
12:11:01 INFO: Done in 0.0 seconds
12:11:01 INFO: Fitting regression model
12:12:18 INFO: Done in 76.91 seconds
12:12:18 INFO: Running simulations
12:12:18 INFO: [1 of 2] Running simulation for egreedy_0.05


100%|██████████| 1374327/1374327 [00:20<00:00, 66949.48it/s]


12:12:42 INFO: [2 of 2] Running simulation for egreedy_0.7


100%|██████████| 1374327/1374327 [00:32<00:00, 42649.81it/s]


12:13:19 INFO: Done in 61.22 seconds
12:13:19 INFO: Estimating rewards
12:13:19 INFO: Estimating reward confidence interval for logged feedback
12:13:23 INFO: [1 of 2] Estimating rewards for egreedy_0.05
12:19:06 INFO: [2 of 2] Estimating rewards for egreedy_0.7
12:24:37 INFO: Done in 677.96 seconds
12:24:37 INFO: Experiment finished in 816.1 seconds


In [7]:
# Display the results: mean reward and 95% CI bounds for the logged feedback
# and for each evaluated policy.
obd_experiment.rewards

{'logged': {'mean': 0.0034739403358880383,
  '95.0% CI (lower)': 0.0033819462180398113,
  '95.0% CI (upper)': 0.0035743312908790995},
 'egreedy_0.05': {'mean': 0.0037086705687790038,
  '95.0% CI (lower)': 0.003706822027564767,
  '95.0% CI (upper)': 0.003710537497724463},
 'egreedy_0.7': {'mean': 0.00346541602435126,
  '95.0% CI (lower)': 0.003463509143184019,
  '95.0% CI (upper)': 0.0034672870551908943}}

## 2. An experiment on the OBD dataset with two epsilon-greedy policies and the replay method (no regression)

Similar to experiment 1, but much faster as we use a (less accurate) estimator which does not require a regression model.

In [10]:
# Fresh policy instances for the replay experiment, configured exactly like
# the ones used in experiment 1.
# NOTE(review): presumably re-created because running a simulation updates the
# policy objects' internal state — confirm.
policies = [
    EpsilonGreedy(
        n_actions=obp_dataset.n_actions,
        len_list=obp_dataset.len_list,
        batch_size=1,
        random_state=0,
        epsilon=eps,
        policy_name=f"egreedy_{eps}",
    )
    for eps in (0.05, 0.7)
]

In [11]:
# Experiment 2: same dataset and policy settings, but evaluated with the
# ReplayMethod estimator; no regression base model is passed.
obd_experiment_replay = OBDExperiment(
    dataset=obp_dataset,
    policies=policies,
    estimator=ReplayMethod(),
)

In [12]:
# Run the replay experiment. Per the log output below there is no
# regression-fitting stage here; the recorded run took ~73 s in total.
obd_experiment_replay.run_experiment()

12:25:42 INFO: Obtaining logged feedback
12:25:42 INFO: Done in 0.0 seconds
12:25:42 INFO: Running simulations
12:25:42 INFO: [1 of 2] Running simulation for egreedy_0.05


100%|██████████| 1374327/1374327 [00:19<00:00, 71630.70it/s]


12:26:06 INFO: [2 of 2] Running simulation for egreedy_0.7


100%|██████████| 1374327/1374327 [00:34<00:00, 39646.64it/s]


12:26:44 INFO: Done in 62.21 seconds
12:26:44 INFO: Estimating rewards
12:26:44 INFO: Estimating reward confidence interval for logged feedback
12:26:48 INFO: [1 of 2] Estimating rewards for egreedy_0.05
12:26:52 INFO: [2 of 2] Estimating rewards for egreedy_0.7
12:26:55 INFO: Done in 11.06 seconds
12:26:55 INFO: Experiment finished in 73.28 seconds


In [13]:
# Display the replay results: mean reward and 95% CI bounds for the logged
# feedback and for each evaluated policy.
obd_experiment_replay.rewards

{'logged': {'mean': 0.0034739403358880383,
  '95.0% CI (lower)': 0.0033819462180398113,
  '95.0% CI (upper)': 0.0035743312908790995},
 'egreedy_0.05': {'mean': 0.0026614182480465287,
  '95.0% CI (lower)': 0.0018462487515422113,
  '95.0% CI (upper)': 0.003466306327477821},
 'egreedy_0.7': {'mean': 0.0037580516532728075,
  '95.0% CI (lower)': 0.0028510520826943147,
  '95.0% CI (upper)': 0.004694497270106132}}

## 3. An experiment on Deezer dataset

Deezer experiments are run by generating random feedback as a baseline, then simulating online policy learning for the supplied policies. 

No estimators are needed since the learning is always done online. Instead we just obtain bootstrap estimates of the mean.

In [14]:
# Build the Deezer dataset from the user-feature and playlist-feature CSV
# files (paths are relative to this notebook).
deezer_dataset = DeezerDataset(
    "../data/deezer_carousel_bandits/user_features.csv",
    "../data/deezer_carousel_bandits/playlist_features.csv",
)

In [15]:
# Two epsilon-greedy policies sized for the Deezer dataset; they differ only
# in their exploration rate, which is also embedded in the policy name used
# in the log messages and the results dictionary.
policies = [
    EpsilonGreedy(
        n_actions=deezer_dataset.n_actions,
        len_list=deezer_dataset.len_list,
        batch_size=1,
        random_state=0,
        epsilon=eps,
        policy_name=f"egreedy_{eps}",
    )
    for eps in (0.05, 0.7)
]

In [16]:
# Deezer experiment: only the dataset and the policies are supplied; unlike
# the OBD experiments, no estimator argument is passed.
deezer_experiment = DeezerExperiment(
    dataset=deezer_dataset,
    policies=policies,
)

In [17]:
# Run the Deezer experiment. Per the log output below, the stages are: obtain
# random baseline feedback, simulate online learning for each policy, and
# estimate reward confidence intervals. The recorded run took ~66 s.
deezer_experiment.run_experiment()

12:40:13 INFO: Obtaining random baseline feedback


Calculating click probabilities: 100%|██████████| 100000/100000 [00:16<00:00, 5913.63it/s]
Generating feedback: 100%|██████████| 100000/100000 [00:04<00:00, 23213.81it/s]


12:40:36 INFO: Done in 22.74 seconds
12:40:36 INFO: Learning and obtaining policy feedback
12:40:36 INFO: [1 of 2] Learning and obtaining egreedy_0.05 feedback


Simulating online learning: 100%|██████████| 100000/100000 [00:18<00:00, 5440.78it/s]


12:40:57 INFO: [2 of 2] Learning and obtaining egreedy_0.7 feedback


Simulating online learning: 100%|██████████| 100000/100000 [00:19<00:00, 5223.87it/s]


12:41:17 INFO: Done in 41.19 seconds
12:41:17 INFO: Estimating reward confidence interval for random baseline feedback
12:41:18 INFO: [1 of 2] Estimating reward confindence interval for egreedy_0.05 feedback
12:41:19 INFO: [2 of 2] Estimating reward confindence interval for egreedy_0.7 feedback
12:41:20 INFO: Done in 2.19 seconds
12:41:20 INFO: Experiment finished in 66.13 seconds


In [18]:
# Display the results: mean reward and 95% CI bounds for the random baseline
# and for each policy.
deezer_experiment.rewards

{'random': {'mean': 0.02699252613151486,
  '95.0% CI (lower)': 0.026517594669501272,
  '95.0% CI (upper)': 0.027457833749215527},
 'egreedy_0.05': {'mean': 0.08347886720843115,
  '95.0% CI (lower)': 0.08258542383748002,
  '95.0% CI (upper)': 0.08438901745330148},
 'egreedy_0.7': {'mean': 0.05765443921227229,
  '95.0% CI (lower)': 0.056970242570535394,
  '95.0% CI (upper)': 0.058470447020959465}}