In [6]:
from copy import deepcopy

import numpy as np
import pandas as pd

from obp.ope import ReplayMethod
from obp.policy import EpsilonGreedy, BernoulliTS
from obp.simulator import run_bandit_simulation
from obp.utils import convert_to_action_dist

from sd_bandits.obp_extensions.dataset import DeezerDataset
from sd_bandits.obp_extensions.policy import ExploreThenCommit, SegmentPolicy, KLUpperConfidenceBound

# Load Deezer Data

In [3]:
# Deezer carousel-bandit feature tables, addressed relative to this notebook.
playlist_features_path = "../data/deezer_carousel_bandits/playlist_features.csv"
user_features_path = "../data/deezer_carousel_bandits/user_features.csv"

# Build the dataset/simulator object that every experiment below draws from.
# NOTE(review): presumably `len_list` is the number of carousel slots and
# `len_init` the number of slots a user always sees under cascade — confirm
# against the DeezerDataset documentation.
deezer_data = DeezerDataset(user_features_path, playlist_features_path, len_list=12, len_init=3)

Get random baseline

In [4]:
# Generate logged feedback without supplying a policy; the notebook uses this
# run as its uniform-random baseline.
random_deezer_feedback = deezer_data.obtain_batch_bandit_feedback(
    n_batches=100, users_per_batch=1000, cascade=True, seed=1
)

print("\ncascade is enabled, so we observe at least 3 items per user per user session")
print("min number of actions is thus 100 batches * 1000 users * 3 items = 300,000")
print("feedback dict:")
for key, value in random_deezer_feedback.items():
    # Entries whose key starts with "n_" are printed as plain values; all other
    # entries are summarised by their type and shape.
    if key.startswith("n_"):
        print(f"  {key}: {value}")
    else:
        print(f"  {key}: {type(value)}, {value.shape}")

Calculating click probabilities: 100%|██████████| 100000/100000 [00:12<00:00, 8139.69it/s]
Generating feedback: 100%|██████████| 100000/100000 [00:02<00:00, 41458.55it/s]



cascade is enabled, so we observe at least 3 items per user per user session
min number of actions is thus 100 batches * 1000 users * 3 items = 300,000
feedback dict:
  action: <class 'numpy.ndarray'>, (333027,)
  reward: <class 'numpy.ndarray'>, (333027,)
  position: <class 'numpy.ndarray'>, (333027,)
  context: <class 'numpy.ndarray'>, (333027, 97)
  action_context: <class 'numpy.ndarray'>, (333027, 97)
  pscore: <class 'numpy.ndarray'>, (333027,)
  n_rounds: 333027
  n_actions: 862
  users: <class 'numpy.ndarray'>, (100000,)
  segments: <class 'numpy.ndarray'>, (333027,)
  batches: <class 'numpy.ndarray'>, (333027,)


In [5]:
# Average logged reward of the random baseline, rounded to 4 decimals for display.
baseline_rewards = random_deezer_feedback["reward"]
exp_rand_reward = round(baseline_rewards.mean(), 4)
print(f"Expected reward for uniform random actions: {exp_rand_reward}")

Expected reward for uniform random actions: 0.027


## 2. Do online bandit learning on context-free policies and segment-based policies

We try three policy families (epsilon-greedy, explore-then-commit and Thompson sampling), each with two parameter settings (parameters are from the Deezer paper), plus a single KL-UCB configuration. Each of these seven policies is also tried as a segment-based policy.

In [12]:
# Number of user segments handed to every SegmentPolicy wrapper.
N_SEGMENTS = 100

# Keyword arguments shared by the epsilon-greedy, explore-then-commit and
# Thompson-sampling policies. The `batch_size` setting is ignored here:
# supplying a policy to `deezer_data.obtain_batch_bandit_feedback` updates it
# manually once per batch of *users*.
base_policy_kwargs = dict(
    n_actions=deezer_data.n_actions,
    len_list=12,
    batch_size=1,
    random_state=1,
)

# --- Epsilon-greedy: more exploratory (0.1) vs. more greedy (0.01) ---
e_greedy_explore = EpsilonGreedy(
    epsilon=0.1, policy_name='e_greedy_explore', **base_policy_kwargs
)
e_greedy_exploit = EpsilonGreedy(
    epsilon=0.01, policy_name='e_greedy_exploit', **base_policy_kwargs
)
e_greedy_explore_seg = SegmentPolicy(e_greedy_explore, n_segments=N_SEGMENTS)
e_greedy_exploit_seg = SegmentPolicy(e_greedy_exploit, n_segments=N_SEGMENTS)

# --- Explore-then-commit: the two variants differ only in `min_n` (100 vs. 20) ---
etc_explore = ExploreThenCommit(
    min_n=100, policy_name='etc_explore', **base_policy_kwargs
)
etc_exploit = ExploreThenCommit(
    min_n=20, policy_name='etc_exploit', **base_policy_kwargs
)
etc_explore_seg = SegmentPolicy(etc_explore, n_segments=N_SEGMENTS)
etc_exploit_seg = SegmentPolicy(etc_exploit, n_segments=N_SEGMENTS)

# --- Bernoulli Thompson sampling: Beta(1, 1) prior vs. Beta(1, 99) prior ---
ts_naive = BernoulliTS(
    alpha=np.ones(deezer_data.n_actions),
    beta=np.ones(deezer_data.n_actions),
    policy_name='ts_naive',
    **base_policy_kwargs,
)
ts_pessimistic = BernoulliTS(
    alpha=np.ones(deezer_data.n_actions),
    beta=np.ones(deezer_data.n_actions)*99,
    policy_name='ts_pessimistic',
    **base_policy_kwargs,
)
ts_naive_seg = SegmentPolicy(ts_naive, n_segments=N_SEGMENTS)
ts_pessimistic_seg = SegmentPolicy(ts_pessimistic, n_segments=N_SEGMENTS)

# --- KL-UCB: single configuration, relying on the policy's default name ---
# NOTE(review): unlike the policies above, this one uses batch_size=1000 and
# random_state=0 — confirm that the difference is intentional.
kl_ucb = KLUpperConfidenceBound(
    n_actions=deezer_data.n_actions,
    len_list=deezer_data.len_list,
    batch_size=1000,
    random_state=0,
)
kl_ucb_seg = SegmentPolicy(kl_ucb, n_segments=N_SEGMENTS)

# Every policy to simulate, in the order the experiments run them.
policies = [
    e_greedy_explore, e_greedy_exploit,
    e_greedy_explore_seg, e_greedy_exploit_seg,
    etc_explore, etc_exploit,
    etc_explore_seg, etc_exploit_seg,
    ts_naive, ts_pessimistic,
    ts_naive_seg, ts_pessimistic_seg,
    kl_ucb, kl_ucb_seg,
]

# Lookup table from policy name to policy object.
policy_dict = {policy.policy_name: policy for policy in policies}

In [13]:
# Experiment 1: simulate online learning for every policy, updating the policy
# once per batch of 1,000 users (100 batches).
#
# Supplying a policy to `obtain_batch_bandit_feedback` updates that policy, so
# each simulation runs on a deep copy. The objects in `policies` stay untrained,
# which keeps this cell idempotent on re-run and stops later experiments from
# silently starting with policies that have already learned from this one.
feedback_dict = {}      # policy name -> feedback returned by the simulation
trained_policies = {}   # policy name -> the copy that was trained in this run
for policy in policies:
    print(policy.policy_name)
    policy_run = deepcopy(policy)
    feedback = deezer_data.obtain_batch_bandit_feedback(
        policy=policy_run,
        n_batches=100,
        users_per_batch=1000,
        cascade=True,
        seed=1
    )
    feedback_dict[policy.policy_name] = feedback
    trained_policies[policy.policy_name] = policy_run

Simulating online learning:   0%|          | 0/100000 [00:00<?, ?it/s]

e_greedy_explore


Simulating online learning: 100%|██████████| 100000/100000 [00:13<00:00, 7471.21it/s]
Simulating online learning:   1%|          | 801/100000 [00:00<00:12, 8005.54it/s]

e_greedy_exploit


Simulating online learning: 100%|██████████| 100000/100000 [00:12<00:00, 8278.22it/s]
Simulating online learning:   1%|          | 736/100000 [00:00<00:13, 7353.22it/s]

e_greedy_explore_seg


Simulating online learning: 100%|██████████| 100000/100000 [00:14<00:00, 6734.06it/s]
Simulating online learning:   1%|          | 730/100000 [00:00<00:13, 7292.81it/s]

e_greedy_exploit_seg


Simulating online learning: 100%|██████████| 100000/100000 [00:15<00:00, 6649.59it/s]
Simulating online learning:   1%|          | 639/100000 [00:00<00:15, 6386.35it/s]

etc_explore


Simulating online learning: 100%|██████████| 100000/100000 [00:20<00:00, 4790.89it/s]
Simulating online learning:   1%|          | 675/100000 [00:00<00:14, 6744.89it/s]

etc_exploit


Simulating online learning: 100%|██████████| 100000/100000 [00:18<00:00, 5359.70it/s]
Simulating online learning:   1%|          | 614/100000 [00:00<00:16, 6134.08it/s]

etc_explore_seg


Simulating online learning: 100%|██████████| 100000/100000 [00:19<00:00, 5020.98it/s]
Simulating online learning:   1%|          | 631/100000 [00:00<00:15, 6302.72it/s]

etc_exploit_seg


Simulating online learning: 100%|██████████| 100000/100000 [00:19<00:00, 5078.01it/s]
Simulating online learning:   0%|          | 468/100000 [00:00<00:21, 4679.56it/s]

ts_naive


Simulating online learning: 100%|██████████| 100000/100000 [00:24<00:00, 4027.85it/s]
Simulating online learning:   0%|          | 342/100000 [00:00<00:29, 3419.10it/s]

ts_pessimistic


Simulating online learning: 100%|██████████| 100000/100000 [00:25<00:00, 3892.46it/s]
Simulating online learning:   0%|          | 370/100000 [00:00<00:26, 3699.84it/s]

ts_naive_seg


Simulating online learning: 100%|██████████| 100000/100000 [00:28<00:00, 3497.38it/s]
Simulating online learning:   0%|          | 387/100000 [00:00<00:25, 3867.90it/s]

ts_pessimistic_seg


Simulating online learning: 100%|██████████| 100000/100000 [00:27<00:00, 3610.35it/s]
Simulating online learning:   0%|          | 215/100000 [00:00<00:46, 2147.32it/s]

kl_ucb


Simulating online learning: 100%|██████████| 100000/100000 [00:47<00:00, 2105.52it/s]
Simulating online learning:   1%|          | 623/100000 [00:00<00:15, 6227.94it/s]

kl_ucb_seg


Simulating online learning: 100%|██████████| 100000/100000 [00:45<00:00, 2196.52it/s]


Now we've generated, for each policy, a dataset that contains the actions and rewards produced by an online experiment with that policy.

Using the `ReplayMethod` here isn't strictly necessary: since we did online learning, our logs always match our actions and so we could just as easily get the mean online reward directly, e.g. `feedback_dict[policy_name]["reward"].mean()`.

In [14]:
# Report, for every simulated policy, the replay estimate of its expected
# reward with a bootstrap 95% confidence interval, plus the ratio to the
# random baseline.
replay_estimator = ReplayMethod()

# The baseline mean does not depend on the policy, so compute it once.
baseline_mean_reward = random_deezer_feedback["reward"].mean()

for policy_name, feedback in feedback_dict.items():
    estimates = replay_estimator.estimate_interval(
        reward=feedback["reward"],
        action=feedback["action"],
        position=feedback["position"],
        action_dist=convert_to_action_dist(deezer_data.n_actions, feedback["selected_actions"]),
        # Seed the bootstrap so the reported mean and interval are reproducible.
        random_state=1,
    )

    mean_reward = np.round(estimates["mean"], 4)
    online_relative = np.round(estimates["mean"] / baseline_mean_reward, 2)

    print(f"Expected reward for {policy_name} trained online: {mean_reward}",
          f"({online_relative}x random baseline)!")

    lo_online_reward = np.round(estimates["95.0% CI (lower)"], 4)
    hi_online_reward = np.round(estimates["95.0% CI (upper)"], 4)
    print(f"95% confidence interval is {lo_online_reward}-{hi_online_reward}")
    print()

Expected reward for e_greedy_explore trained online: 0.0613 (2.27x random baseline)!
95% confidence interval is 0.0605-0.0621

Expected reward for e_greedy_exploit trained online: 0.1157 (4.28x random baseline)!
95% confidence interval is 0.1149-0.1166

Expected reward for e_greedy_explore_seg trained online: 0.0281 (1.04x random baseline)!
95% confidence interval is 0.0276-0.0287

Expected reward for e_greedy_exploit_seg trained online: 0.0283 (1.05x random baseline)!
95% confidence interval is 0.0278-0.0289

Expected reward for etc_explore trained online: 0.0261 (0.96x random baseline)!
95% confidence interval is 0.0256-0.0266

Expected reward for etc_exploit trained online: 0.1874 (6.93x random baseline)!
95% confidence interval is 0.1859-0.1885

Expected reward for etc_explore_seg trained online: 0.0267 (0.99x random baseline)!
95% confidence interval is 0.0262-0.0272

Expected reward for etc_exploit_seg trained online: 0.0267 (0.99x random baseline)!
95% confidence interval is 0.0

Results are interesting. The segment-based policies seem to almost always perform worse than the context-free ones.

Now we can try updating the policy parameters after every user (`users_per_batch=1`) instead of once per batch of 1,000 users.

In [16]:
# Experiment 2: same simulation, but the policy is updated after every single
# user (100,000 batches of 1 user) instead of once per batch of 1,000 users.
#
# Each run trains a deep copy, so this cell never mutates the shared `policies`
# objects and gives the same result when re-run.
# NOTE(review): a copy inherits whatever state `policy` already holds. If an
# earlier cell trained these same objects in place (and
# `obtain_batch_bandit_feedback` does not re-initialise the policy itself), this
# experiment does not start from scratch — confirm, or rebuild the policies
# before running this cell.
feedback_dict = {}      # policy name -> feedback returned by the simulation
trained_policies = {}   # policy name -> the copy that was trained in this run
for policy in policies:
    print(policy.policy_name)
    policy_run = deepcopy(policy)
    feedback = deezer_data.obtain_batch_bandit_feedback(
        policy=policy_run,
        n_batches=100000,
        users_per_batch=1,
        cascade=True,
        seed=1
    )
    feedback_dict[policy.policy_name] = feedback
    trained_policies[policy.policy_name] = policy_run

Simulating online learning:   0%|          | 324/100000 [00:00<00:30, 3234.22it/s]

e_greedy_explore


Simulating online learning: 100%|██████████| 100000/100000 [00:22<00:00, 4409.28it/s]
Simulating online learning:   0%|          | 375/100000 [00:00<00:26, 3743.60it/s]

e_greedy_exploit


Simulating online learning: 100%|██████████| 100000/100000 [00:21<00:00, 4672.39it/s]
Simulating online learning:   0%|          | 298/100000 [00:00<00:33, 2975.89it/s]

e_greedy_explore_seg


Simulating online learning: 100%|██████████| 100000/100000 [00:29<00:00, 3375.80it/s]
Simulating online learning:   0%|          | 176/100000 [00:00<00:56, 1752.84it/s]

e_greedy_exploit_seg


Simulating online learning: 100%|██████████| 100000/100000 [00:42<00:00, 2358.87it/s]
Simulating online learning:   0%|          | 227/100000 [00:00<00:44, 2265.65it/s]

etc_explore


Simulating online learning: 100%|██████████| 100000/100000 [00:28<00:00, 3501.78it/s]
Simulating online learning:   0%|          | 307/100000 [00:00<00:32, 3069.73it/s]

etc_exploit


Simulating online learning: 100%|██████████| 100000/100000 [00:26<00:00, 3802.88it/s]
Simulating online learning:   0%|          | 152/100000 [00:00<01:05, 1517.35it/s]

etc_explore_seg


Simulating online learning: 100%|██████████| 100000/100000 [00:35<00:00, 2805.83it/s]
Simulating online learning:   0%|          | 0/100000 [00:00<?, ?it/s]

etc_exploit_seg


Simulating online learning: 100%|██████████| 100000/100000 [00:36<00:00, 2727.87it/s]
Simulating online learning:   0%|          | 172/100000 [00:00<00:58, 1716.19it/s]

ts_naive


Simulating online learning: 100%|██████████| 100000/100000 [00:41<00:00, 2405.25it/s]
Simulating online learning:   0%|          | 282/100000 [00:00<00:35, 2816.20it/s]

ts_pessimistic


Simulating online learning: 100%|██████████| 100000/100000 [00:36<00:00, 2708.62it/s]
Simulating online learning:   0%|          | 91/100000 [00:00<01:50, 905.77it/s]

ts_naive_seg


Simulating online learning: 100%|██████████| 100000/100000 [00:48<00:00, 2049.66it/s]
Simulating online learning:   0%|          | 115/100000 [00:00<01:27, 1141.25it/s]

ts_pessimistic_seg


Simulating online learning: 100%|██████████| 100000/100000 [00:56<00:00, 1766.38it/s]
Simulating online learning:   0%|          | 109/100000 [00:00<01:32, 1082.97it/s]

kl_ucb


Simulating online learning: 100%|██████████| 100000/100000 [01:32<00:00, 1084.80it/s]
Simulating online learning:   0%|          | 23/100000 [00:00<07:21, 226.39it/s]

kl_ucb_seg


Simulating online learning: 100%|██████████| 100000/100000 [01:20<00:00, 1243.42it/s]


In [17]:
# Report, for every policy simulated with per-user updates, the replay estimate
# of its expected reward with a bootstrap 95% confidence interval, plus the
# ratio to the random baseline.
replay_estimator = ReplayMethod()

# The baseline mean does not depend on the policy, so compute it once.
baseline_mean_reward = random_deezer_feedback["reward"].mean()

for policy_name, feedback in feedback_dict.items():
    estimates = replay_estimator.estimate_interval(
        reward=feedback["reward"],
        action=feedback["action"],
        position=feedback["position"],
        action_dist=convert_to_action_dist(deezer_data.n_actions, feedback["selected_actions"]),
        # Seed the bootstrap so the reported mean and interval are reproducible.
        random_state=1,
    )

    mean_reward = np.round(estimates["mean"], 4)
    online_relative = np.round(estimates["mean"] / baseline_mean_reward, 2)

    print(f"Expected reward for {policy_name} trained online: {mean_reward}",
          f"({online_relative}x random baseline)!")

    lo_online_reward = np.round(estimates["95.0% CI (lower)"], 4)
    hi_online_reward = np.round(estimates["95.0% CI (upper)"], 4)
    print(f"95% confidence interval is {lo_online_reward}-{hi_online_reward}")
    print()

Expected reward for e_greedy_explore trained online: 0.0849 (3.14x random baseline)!
95% confidence interval is 0.0841-0.0855

Expected reward for e_greedy_exploit trained online: 0.1155 (4.27x random baseline)!
95% confidence interval is 0.1145-0.1165

Expected reward for e_greedy_explore_seg trained online: 0.0828 (3.06x random baseline)!
95% confidence interval is 0.0817-0.0839

Expected reward for e_greedy_exploit_seg trained online: 0.0488 (1.8x random baseline)!
95% confidence interval is 0.048-0.0494

Expected reward for etc_explore trained online: 0.2739 (10.13x random baseline)!
95% confidence interval is 0.2723-0.2754

Expected reward for etc_exploit trained online: 0.2815 (10.41x random baseline)!
95% confidence interval is 0.28-0.2829

Expected reward for etc_explore_seg trained online: 0.0231 (0.86x random baseline)!
95% confidence interval is 0.0226-0.0236

Expected reward for etc_exploit_seg trained online: 0.0298 (1.1x random baseline)!
95% confidence interval is 0.0293

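Updating after every user improves most of the policies shown above. The exceptions are etc_explore_seg, which gets worse (0.0267 → 0.0231), and e_greedy_exploit, which is essentially unchanged (0.1157 → 0.1155).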