In [1]:
import numpy as np
import pandas as pd

from obp.ope import ReplayMethod
from obp.policy import EpsilonGreedy, BernoulliTS
from sd_bandits.obp_extensions.policy import ExploreThenCommit, SegmentPolicy 
from obp.simulator import run_bandit_simulation
from obp.utils import convert_to_action_dist

from sd_bandits.obp_extensions.dataset import DeezerDataset

# Load Deezer Data

In [2]:
user_features_path = "../data/deezer_carousel_bandits/user_features.csv"
playlist_features_path = "../data/deezer_carousel_bandits/playlist_features.csv"

deezer_data = DeezerDataset(
    user_features_path,
    playlist_features_path,
    len_list=12,
    len_init=3,
)

Get random baseline

In [3]:
random_deezer_feedback = deezer_data.obtain_batch_bandit_feedback(
    n_batches=100,
    users_per_batch=1000,
    cascade=True,
    seed=1,
)

print("\ncascade is enabled, so we observe at least 3 items per user per user session")
print("min number of actions is thus 100 batches * 1000 users * 3 items = 300,000")
print("feedback dict:")
for key, value in random_deezer_feedback.items():
    if key[0:2] != "n_":
        print(f"  {key}: {type(value)}, {value.shape}")
    else:
        print(f"  {key}: {value}")

Calculating click probabilities: 100%|██████████| 100000/100000 [00:12<00:00, 7840.59it/s]
Generating feedback: 100%|██████████| 100000/100000 [00:02<00:00, 42668.57it/s]



cascade is enabled, so we observe at least 3 items per user per user session
min number of actions is thus 100 batches * 1000 users * 3 items = 300,000
feedback dict:
  action: <class 'numpy.ndarray'>, (333027,)
  reward: <class 'numpy.ndarray'>, (333027,)
  position: <class 'numpy.ndarray'>, (333027,)
  context: <class 'numpy.ndarray'>, (333027, 97)
  action_context: <class 'numpy.ndarray'>, (333027, 97)
  pscore: <class 'numpy.ndarray'>, (333027,)
  n_rounds: 333027
  n_actions: 862
  users: <class 'numpy.ndarray'>, (100000,)
  segments: <class 'numpy.ndarray'>, (333027,)


In [4]:
exp_rand_reward = round(random_deezer_feedback["reward"].mean(),4)
print(f"Expected reward for uniform random actions: {exp_rand_reward}")

Expected reward for uniform random actions: 0.027


## 2. Do online bandit learning on context-free policies and segment-based policies

Tried with 3 different policies options, each with two different parameter options (parameters are from Deezer paper). Each of the 6 policies is also tried as a segment-based policy

In [5]:
e_greedy_explore = EpsilonGreedy(
    n_actions=deezer_data.n_actions,
    len_list=12,
    # this batch_size setting will be ignored because supplying the policy
    # to `deezer_data.obtain_batch_bandit_feedback` will manually update
    # once per batch of *users*
    batch_size=1, 
    random_state=1,
    epsilon=0.1,
    policy_name='e_greedy_explore'
)

e_greedy_exploit = EpsilonGreedy(
    n_actions=deezer_data.n_actions,
    len_list=12,
    # this batch_size setting will be ignored because supplying the policy
    # to `deezer_data.obtain_batch_bandit_feedback` will manually update
    # once per batch of *users*
    batch_size=1, 
    random_state=1,
    epsilon=0.01,
    policy_name='e_greedy_exploit'
)

e_greedy_explore_seg = SegmentPolicy(e_greedy_explore, n_segments = 100)
e_greedy_exploit_seg = SegmentPolicy(e_greedy_exploit, n_segments = 100)

etc_explore = ExploreThenCommit(
    n_actions=deezer_data.n_actions,
    len_list=12,
    batch_size=1,
    random_state=1,
    min_n=100,
    policy_name='etc_explore'
)

etc_exploit = ExploreThenCommit(
    n_actions=deezer_data.n_actions,
    len_list=12,
    batch_size=1,
    random_state=1,
    min_n=20,
    policy_name='etc_exploit'
)

etc_explore_seg = SegmentPolicy(etc_explore, n_segments=100)
etc_exploit_seg = SegmentPolicy(etc_exploit, n_segments=100)

ts_naive = BernoulliTS(
    n_actions=deezer_data.n_actions,
    len_list=12,
    batch_size=1,
    random_state=1,
    alpha=np.ones(deezer_data.n_actions),
    beta=np.ones(deezer_data.n_actions),
    policy_name='ts_naive'
)

ts_pessimistic = BernoulliTS(
    n_actions=deezer_data.n_actions,
    len_list=12,
    batch_size=1,
    random_state=1,
    alpha=np.ones(deezer_data.n_actions),
    beta=np.ones(deezer_data.n_actions)*99,
    policy_name='ts_pessimistic')

ts_naive_seg = SegmentPolicy(ts_naive, n_segments=100)
ts_pessimistic_seg = SegmentPolicy(ts_pessimistic, n_segments=100)

policies = [e_greedy_explore, e_greedy_exploit, 
            e_greedy_explore_seg, e_greedy_exploit_seg,
            etc_explore, etc_exploit,
            etc_explore_seg, etc_exploit_seg,
            ts_naive, ts_pessimistic,
            ts_naive_seg, ts_pessimistic_seg]

policy_dict = [(policy.policy_name, policy) for policy in policies]

In [6]:
feedback_dict = {}
for policy in policies:
    print(policy.policy_name)
    feedback = deezer_data.obtain_batch_bandit_feedback(
        policy=policy,
        n_batches=100,
        users_per_batch=1000,
        cascade=True,
        seed=1
    )
    feedback_dict[policy.policy_name] = feedback

Simulating online learning:   1%|          | 800/100000 [00:00<00:12, 7996.27it/s]

e_greedy_explore


Simulating online learning: 100%|██████████| 100000/100000 [00:12<00:00, 8163.64it/s]
Simulating online learning:   0%|          | 0/100000 [00:00<?, ?it/s]

e_greedy_exploit


Simulating online learning: 100%|██████████| 100000/100000 [00:11<00:00, 9074.40it/s]
Simulating online learning:   1%|          | 723/100000 [00:00<00:13, 7224.75it/s]

e_greedy_explore_seg


Simulating online learning: 100%|██████████| 100000/100000 [00:14<00:00, 6836.19it/s]
Simulating online learning:   1%|          | 777/100000 [00:00<00:12, 7769.97it/s]

e_greedy_exploit_seg


Simulating online learning: 100%|██████████| 100000/100000 [00:15<00:00, 6448.12it/s]
Simulating online learning:   1%|          | 662/100000 [00:00<00:15, 6618.71it/s]

etc_explore


Simulating online learning: 100%|██████████| 100000/100000 [00:20<00:00, 4897.25it/s]
Simulating online learning:   1%|          | 600/100000 [00:00<00:16, 5995.03it/s]

etc_exploit


Simulating online learning: 100%|██████████| 100000/100000 [00:18<00:00, 5482.75it/s]
Simulating online learning:   1%|          | 645/100000 [00:00<00:15, 6449.27it/s]

etc_explore_seg


Simulating online learning: 100%|██████████| 100000/100000 [00:19<00:00, 5115.56it/s]
Simulating online learning:   1%|          | 547/100000 [00:00<00:18, 5469.35it/s]

etc_exploit_seg


Simulating online learning: 100%|██████████| 100000/100000 [00:19<00:00, 5223.44it/s]
Simulating online learning:   0%|          | 476/100000 [00:00<00:20, 4756.73it/s]

ts_naive


Simulating online learning: 100%|██████████| 100000/100000 [00:23<00:00, 4189.89it/s]
Simulating online learning:   0%|          | 0/100000 [00:00<?, ?it/s]

ts_pessimistic


Simulating online learning: 100%|██████████| 100000/100000 [00:23<00:00, 4275.19it/s]
Simulating online learning:   0%|          | 0/100000 [00:00<?, ?it/s]

ts_naive_seg


Simulating online learning: 100%|██████████| 100000/100000 [00:25<00:00, 3867.49it/s]
Simulating online learning:   0%|          | 362/100000 [00:00<00:27, 3617.18it/s]

ts_pessimistic_seg


Simulating online learning: 100%|██████████| 100000/100000 [00:25<00:00, 3949.46it/s]


Now we've generated a dataset that contains the actions and rewards generated by an online experiment with our epsilon-greedy bandit.

Using the `ReplayMethod` here isn't strictly necessary: since we did online learning, our logs always match our actions and so we could just as easily get `mean_eps_greedy_online_reward = eg_deezer_feedback["reward"].mean()`.

In [7]:
replay_estimator = ReplayMethod()
for policy_name in feedback_dict:
    feedback = feedback_dict[policy_name]
    estimates = replay_estimator.estimate_interval(
        reward=feedback["reward"],
        action=feedback["action"],
        position=feedback["position"],
        action_dist=convert_to_action_dist(deezer_data.n_actions, feedback["selected_actions"])
    )

    mean_reward = np.round(estimates["mean"], 4)
    online_relative = np.round(estimates["mean"] / random_deezer_feedback["reward"].mean(), 2)

    print(f"Expected reward for {policy_name} trained online: {mean_reward}",
          f"({online_relative}x random baseline)!")

    lo_online_reward = np.round(estimates["95.0% CI (lower)"], 4)
    hi_online_reward = np.round(estimates["95.0% CI (upper)"], 4)
    print(f"95% confidence interval is {lo_online_reward}-{hi_online_reward}")
    print()

Expected reward for e_greedy_explore trained online: 0.0614 (2.27x random baseline)!
95% confidence interval is 0.0607-0.062

Expected reward for e_greedy_exploit trained online: 0.1156 (4.28x random baseline)!
95% confidence interval is 0.1146-0.1166

Expected reward for e_greedy_explore_seg trained online: 0.0281 (1.04x random baseline)!
95% confidence interval is 0.0276-0.0287

Expected reward for e_greedy_exploit_seg trained online: 0.0282 (1.04x random baseline)!
95% confidence interval is 0.0278-0.0287

Expected reward for etc_explore trained online: 0.026 (0.96x random baseline)!
95% confidence interval is 0.0255-0.0265

Expected reward for etc_exploit trained online: 0.1874 (6.93x random baseline)!
95% confidence interval is 0.1862-0.1886

Expected reward for etc_explore_seg trained online: 0.0267 (0.99x random baseline)!
95% confidence interval is 0.0262-0.0272

Expected reward for etc_exploit_seg trained online: 0.0267 (0.99x random baseline)!
95% confidence interval is 0.026

Results are interesting. The segment-based policies seem to almost always perform worse than the context-free ones.

Now we can try updating parameters every round

In [8]:
feedback_dict = {}
for policy in policies:
    print(policy.policy_name)
    feedback = deezer_data.obtain_batch_bandit_feedback(
        policy=policy,
        n_batches=100000,
        users_per_batch=1,
        cascade=True,
        seed=1
    )
    feedback_dict[policy.policy_name] = feedback

e_greedy_explore


Simulating online learning: 100%|██████████| 100000/100000 [00:26<00:00, 3785.74it/s]
Simulating online learning:   1%|          | 719/100000 [00:00<00:13, 7185.71it/s]

e_greedy_exploit


Simulating online learning: 100%|██████████| 100000/100000 [00:12<00:00, 8111.96it/s]
Simulating online learning:   0%|          | 305/100000 [00:00<00:32, 3049.23it/s]

e_greedy_explore_seg


Simulating online learning: 100%|██████████| 100000/100000 [00:21<00:00, 4736.46it/s]
Simulating online learning:   0%|          | 402/100000 [00:00<00:24, 4014.33it/s]

e_greedy_exploit_seg


Simulating online learning: 100%|██████████| 100000/100000 [00:19<00:00, 5235.09it/s]
Simulating online learning:   0%|          | 461/100000 [00:00<00:21, 4606.71it/s]

etc_explore


Simulating online learning: 100%|██████████| 100000/100000 [00:19<00:00, 5019.44it/s]
Simulating online learning:   1%|          | 528/100000 [00:00<00:18, 5275.20it/s]

etc_exploit


Simulating online learning: 100%|██████████| 100000/100000 [00:19<00:00, 5216.35it/s]
Simulating online learning:   0%|          | 326/100000 [00:00<00:30, 3255.36it/s]

etc_explore_seg


Simulating online learning: 100%|██████████| 100000/100000 [00:25<00:00, 3948.46it/s]
Simulating online learning:   0%|          | 228/100000 [00:00<00:43, 2276.14it/s]

etc_exploit_seg


Simulating online learning: 100%|██████████| 100000/100000 [00:25<00:00, 3951.06it/s]
Simulating online learning:   0%|          | 0/100000 [00:00<?, ?it/s]

ts_naive


Simulating online learning: 100%|██████████| 100000/100000 [00:25<00:00, 3964.85it/s]
Simulating online learning:   0%|          | 342/100000 [00:00<00:29, 3418.41it/s]

ts_pessimistic


Simulating online learning: 100%|██████████| 100000/100000 [00:25<00:00, 3959.49it/s]
Simulating online learning:   0%|          | 170/100000 [00:00<00:58, 1697.76it/s]

ts_naive_seg


Simulating online learning: 100%|██████████| 100000/100000 [00:31<00:00, 3195.65it/s]
Simulating online learning:   0%|          | 123/100000 [00:00<01:21, 1227.46it/s]

ts_pessimistic_seg


Simulating online learning: 100%|██████████| 100000/100000 [00:30<00:00, 3229.34it/s]


In [9]:
replay_estimator = ReplayMethod()
for policy_name in feedback_dict:
    feedback = feedback_dict[policy_name]
    estimates = replay_estimator.estimate_interval(
        reward=feedback["reward"],
        action=feedback["action"],
        position=feedback["position"],
        action_dist=convert_to_action_dist(deezer_data.n_actions, feedback["selected_actions"])
    )

    mean_reward = np.round(estimates["mean"], 4)
    online_relative = np.round(estimates["mean"] / random_deezer_feedback["reward"].mean(), 2)

    print(f"Expected reward for {policy_name} trained online: {mean_reward}",
          f"({online_relative}x random baseline)!")

    lo_online_reward = np.round(estimates["95.0% CI (lower)"], 4)
    hi_online_reward = np.round(estimates["95.0% CI (upper)"], 4)
    print(f"95% confidence interval is {lo_online_reward}-{hi_online_reward}")
    print()

Expected reward for e_greedy_explore trained online: 0.1053 (3.89x random baseline)!
95% confidence interval is 0.1044-0.1063

Expected reward for e_greedy_exploit trained online: 0.1337 (4.94x random baseline)!
95% confidence interval is 0.1326-0.1347

Expected reward for e_greedy_explore_seg trained online: 0.0432 (1.6x random baseline)!
95% confidence interval is 0.0426-0.0439

Expected reward for e_greedy_exploit_seg trained online: 0.0456 (1.69x random baseline)!
95% confidence interval is 0.0449-0.0462

Expected reward for etc_explore trained online: 0.2739 (10.13x random baseline)!
95% confidence interval is 0.2726-0.2753

Expected reward for etc_exploit trained online: 0.2815 (10.41x random baseline)!
95% confidence interval is 0.2802-0.2831

Expected reward for etc_explore_seg trained online: 0.0231 (0.86x random baseline)!
95% confidence interval is 0.0227-0.0237

Expected reward for etc_exploit_seg trained online: 0.0298 (1.1x random baseline)!
95% confidence interval is 0.0

An improvement for all policies except for etc_explore_seg.