In [1]:
import logging
from pathlib import Path
import sys

from obp.dataset.real import OpenBanditDataset
from obp.ope import DirectMethod, ReplayMethod
from obp.policy import EpsilonGreedy
from sklearn.linear_model import LinearRegression

from sd_bandits.obp_extensions.dataset import DeezerDataset
from sd_bandits.experiment import DeezerExperiment, OBDExperiment

In [2]:
logging.basicConfig(
    format="%(asctime)s %(levelname)s: %(message)s",
    level=logging.INFO,
    handlers=[
        logging.StreamHandler(sys.stdout),
    ],
    datefmt="%-I:%M:%S",
)

# Experiment interface

In this notebook, we show off the `Experiment` interface, which is implemented as `OBDExperiment` and `DeezerExperiment`. It provides uniform instantiation and a `run_experiment` method to make it easier for our script to run lots of experiments.

## 1. An experiment on the OBD dataset with two epsilon-greedy policies and the direct method

OBD experiments are done using Zozo's logged data. Policies are learned offline using the `run_bandit_simulation` method from OBP, then are evaluated using off-policy estimators. In this experiment, we use an off-policy estimator which requires a regression model.

In [3]:
# Load the Open Bandit Dataset from the local data directory, selecting the
# logs of the "random" behavior policy for the "all" campaign.
obp_dataset = OpenBanditDataset(
    behavior_policy="random",
    campaign="all",
    data_path=Path("../data/open_bandit_dataset/"),
)


  mask |= (ar1 == a)


In [4]:
# Two epsilon-greedy policies that differ only in their exploration rate;
# each one is named after its epsilon ("egreedy_0.05", "egreedy_0.7").
policies = [
    EpsilonGreedy(
        n_actions=obp_dataset.n_actions,
        len_list=obp_dataset.len_list,
        batch_size=1,
        random_state=0,
        epsilon=epsilon,
        policy_name=f"egreedy_{epsilon}",
    )
    for epsilon in (0.05, 0.7)
]

In [5]:
# Each policy is passed as a (policy, dict) pair; the dict is empty here.
# NOTE(review): the dict presumably carries per-policy run options - confirm
# against OBDExperiment.
obd_experiment = OBDExperiment(
    dataset=obp_dataset,
    policies=[(candidate, {}) for candidate in policies],
    estimator=DirectMethod(),
    regression_base_model=LinearRegression(),
)

In [6]:
# Run the whole experiment. Per the log output below, this obtains the logged
# feedback, fits the regression model, simulates each policy and then
# estimates rewards with confidence intervals.
obd_experiment.run_experiment()

6:30:30 INFO: Running experiment
6:30:30 INFO: Obtaining logged feedback
6:30:30 INFO: Done in 0.0 seconds
6:30:30 INFO: Fitting regression model
6:31:38 INFO: Done in 67.46 seconds
6:31:38 INFO: Running simulations
6:31:38 INFO: [1 of 2] Running simulation for egreedy_0.05


100%|██████████| 1374327/1374327 [00:20<00:00, 68294.55it/s]


6:32:01 INFO: [2 of 2] Running simulation for egreedy_0.7


100%|██████████| 1374327/1374327 [00:30<00:00, 45296.33it/s]


6:32:36 INFO: Done in 57.96 seconds
6:32:36 INFO: Estimating rewards
6:32:36 INFO: [1 of 3] Estimating reward confidence interval for logged
6:32:39 INFO: [2 of 3] Estimating rewards and reward confidence interval for egreedy_0.05
6:32:46 INFO: [3 of 3] Estimating rewards and reward confidence interval for egreedy_0.7
6:32:52 INFO: Done in 16.29 seconds
6:32:52 INFO: Experiment finished in 141.72 seconds


In [7]:
# Show the mean reward and 95% confidence interval for the logged data and
# for each evaluated policy.
obd_experiment.reward_summary

{'logged': {'mean': 0.0034761887090917964,
  '95.0% CI (lower)': 0.0033740150633728362,
  '95.0% CI (upper)': 0.003591994481662661},
 'egreedy_0.05': {'mean': 0.0037086448762773437,
  '95.0% CI (lower)': 0.0037064967699232875,
  '95.0% CI (upper)': 0.0037106488262537212},
 'egreedy_0.7': {'mean': 0.0034654071772142175,
  '95.0% CI (lower)': 0.003463383846547121,
  '95.0% CI (upper)': 0.0034670289864836834}}

In [8]:
# List which feedback fields were recorded for the logged data and for each
# simulated policy: one header line, then one indented line per key.
for policy_name, feedback in obd_experiment.policy_feedback.items():
    print(
        f"Feedback for {policy_name}:",
        "\n".join(f"  {key}" for key in feedback.keys()),
        sep="\n",
    )

Feedback for logged:
  n_rounds
  n_actions
  action
  position
  reward
  reward_test
  pscore
  context
  action_context
Feedback for egreedy_0.05:
  action
  reward
Feedback for egreedy_0.7:
  action
  reward


## 2. An experiment on the OBD dataset with two epsilon-greedy policies and the replay method (no regression)

Similar to experiment 1, but much faster as we use a (less accurate) estimator which does not require a regression model.

In [9]:
# Build a new pair of epsilon-greedy policies with the same settings as in
# experiment 1.
# NOTE(review): presumably rebuilt so that this experiment does not reuse
# policy objects already updated by the previous simulation - confirm.
policies = [
    EpsilonGreedy(
        n_actions=obp_dataset.n_actions,
        len_list=obp_dataset.len_list,
        batch_size=1,
        random_state=0,
        epsilon=epsilon,
        policy_name=f"egreedy_{epsilon}",
    )
    for epsilon in (0.05, 0.7)
]

In [10]:
# Same setup as the first experiment, but with the replay estimator and no
# regression base model passed in.
obd_experiment_replay = OBDExperiment(
    dataset=obp_dataset,
    policies=[(candidate, {}) for candidate in policies],
    estimator=ReplayMethod(),
)

In [11]:
# Run the replay-based experiment. The log output below shows no
# regression-fitting step for this run.
obd_experiment_replay.run_experiment()

6:32:52 INFO: Running experiment
6:32:52 INFO: Obtaining logged feedback
6:32:52 INFO: Done in 0.0 seconds
6:32:52 INFO: Running simulations
6:32:52 INFO: [1 of 2] Running simulation for egreedy_0.05


100%|██████████| 1374327/1374327 [00:19<00:00, 70059.91it/s]


6:33:15 INFO: [2 of 2] Running simulation for egreedy_0.7


100%|██████████| 1374327/1374327 [00:30<00:00, 44713.49it/s]


6:33:50 INFO: Done in 58.27 seconds
6:33:50 INFO: Estimating rewards
6:33:50 INFO: [1 of 3] Estimating reward confidence interval for logged
6:33:54 INFO: [2 of 3] Estimating rewards and reward confidence interval for egreedy_0.05
6:33:57 INFO: [3 of 3] Estimating rewards and reward confidence interval for egreedy_0.7
6:34:00 INFO: Done in 9.94 seconds
6:34:00 INFO: Experiment finished in 68.22 seconds


In [12]:
# Show the mean reward and 95% confidence interval for the logged data and
# for each policy, as estimated by the replay method.
obd_experiment_replay.reward_summary

{'logged': {'mean': 0.0034761887090917964,
  '95.0% CI (lower)': 0.0033740150633728362,
  '95.0% CI (upper)': 0.003591994481662661},
 'egreedy_0.05': {'mean': 0.0027236942600317245,
  '95.0% CI (lower)': 0.0020254391633864038,
  '95.0% CI (upper)': 0.003497150578696902},
 'egreedy_0.7': {'mean': 0.0037752285135881263,
  '95.0% CI (lower)': 0.0028832586957855365,
  '95.0% CI (upper)': 0.004600944727317347}}

## 3. An experiment on Deezer dataset

Deezer experiments are run by generating random feedback as a baseline, then simulating online policy learning for the supplied policies. 

No estimators are needed since the learning is always done online. Instead we just obtain bootstrap estimates of the mean.

In [13]:
# Build the Deezer dataset from the user-feature and playlist-feature CSV
# files in the local data directory.
deezer_dataset = DeezerDataset(
    "../data/deezer_carousel_bandits/user_features.csv",
    "../data/deezer_carousel_bandits/playlist_features.csv",
)

In [14]:
# Two epsilon-greedy policies sized for the Deezer dataset; they differ only
# in their exploration rate and are named after it.
policies = [
    EpsilonGreedy(
        n_actions=deezer_dataset.n_actions,
        len_list=deezer_dataset.len_list,
        batch_size=1,
        random_state=0,
        epsilon=epsilon,
        policy_name=f"egreedy_{epsilon}",
    )
    for epsilon in (0.05, 0.7)
]

In [15]:
# Every policy is paired with its own options dict, here setting
# "users_per_batch" to 1000. No estimator is passed for Deezer experiments.
deezer_experiment = DeezerExperiment(
    dataset=deezer_dataset,
    policies=[
        (candidate, {"users_per_batch": 1000})
        for candidate in policies
    ],
)

In [16]:
# Run the Deezer experiment. Per the log output below, this simulates online
# learning for each policy and then estimates reward confidence intervals.
deezer_experiment.run_experiment()

6:34:20 INFO: Running experiment
6:34:20 INFO: Learning and obtaining policy feedback
6:34:20 INFO: [1 of 2] Learning and obtaining egreedy_0.05 feedback


Simulating online learning: 100%|██████████| 100000/100000 [00:18<00:00, 5436.01it/s]


6:34:40 INFO: [2 of 2] Learning and obtaining egreedy_0.7 feedback


Simulating online learning: 100%|██████████| 100000/100000 [00:19<00:00, 5160.39it/s]


6:35:01 INFO: Done in 41.24 seconds
6:35:01 INFO: Estimating reward confidence interval for random baseline feedback
6:35:01 INFO: [1 of 2] Estimating reward confindence interval for egreedy_0.05 feedback
6:35:02 INFO: [2 of 2] Estimating reward confindence interval for egreedy_0.7 feedback
6:35:02 INFO: Done in 1.35 seconds
6:35:02 INFO: Experiment finished in 42.6 seconds


In [17]:
# Show the mean reward and 95% confidence interval for each policy.
deezer_experiment.reward_summary

{'egreedy_0.05': {'mean': 0.08347886720843115,
  '95.0% CI (lower)': 0.08258542383748002,
  '95.0% CI (upper)': 0.08438901745330148},
 'egreedy_0.7': {'mean': 0.05765443921227229,
  '95.0% CI (lower)': 0.056970242570535394,
  '95.0% CI (upper)': 0.058470447020959465}}