In [1]:
import logging
from pathlib import Path
import pickle
import sys

import matplotlib.pyplot as plt
from obp.policy import EpsilonGreedy, BernoulliTS, Random, LinEpsilonGreedy
from sklearn.linear_model import LogisticRegression


from sd_bandits.experiment import DeezerExperiment
from sd_bandits.obp_extensions.dataset import DeezerDataset
from sd_bandits.obp_extensions.policy import ExploreThenCommit, KLUpperConfidenceBound, SegmentPolicy

In [2]:
logging.basicConfig(
    format="%(asctime)s %(levelname)s: %(message)s",
    level=logging.INFO,
    handlers=[
        logging.StreamHandler(sys.stdout),
    ],
    datefmt="%-I:%M:%S",
)

# Ad hoc deezer experiments


In [3]:
deezer_dataset = DeezerDataset(
    "../data/deezer_carousel_bandits/user_features.csv",
    "../data/deezer_carousel_bandits/playlist_features.csv",
)

In [4]:
policies = [
    Random(
        n_actions=deezer_dataset.n_actions,
        len_list=deezer_dataset.len_list,
        batch_size=1,
        random_state=1,
    ),
    EpsilonGreedy(
        n_actions=deezer_dataset.n_actions,
        len_list=deezer_dataset.len_list,
        batch_size=1,
        random_state=1,
        epsilon=0.01,
        policy_name="egreedy_exploit",
    ),
    EpsilonGreedy(
        n_actions=deezer_dataset.n_actions,
        len_list=deezer_dataset.len_list,
        batch_size=1,
        random_state=1,
        epsilon=0.1,
        policy_name="egreedy_explore",
    ),
    BernoulliTS(
        n_actions=deezer_dataset.n_actions,
        len_list=deezer_dataset.len_list,
        batch_size=1,
        random_state=1,
        alpha=1,
        beta=1,
        policy_name="ts_naive",
    ),
    BernoulliTS(
        n_actions=deezer_dataset.n_actions,
        len_list=deezer_dataset.len_list,
        batch_size=1,
        random_state=1,
        alpha=1,
        beta=100,
        policy_name="ts_pessimistic",
    ),
    ExploreThenCommit(
        n_actions=deezer_dataset.n_actions,
        len_list=deezer_dataset.len_list,
        batch_size=1,
        random_state=1,
        min_n=20,
        policy_name="etc_exploit",
    ),
    ExploreThenCommit(
        n_actions=deezer_dataset.n_actions,
        len_list=deezer_dataset.len_list,
        batch_size=1,
        random_state=1,
        min_n=100,
        policy_name="etc_explore",
    ),
    LinEpsilonGreedy(
        n_actions=deezer_dataset.n_actions,
        len_list=deezer_dataset.len_list,
        batch_size=1,
        random_state=1,
        dim=96,
        epsilon=0.01,
        # policy_name="lin_egreedy_explore",
    ),
    LinEpsilonGreedy(
        n_actions=deezer_dataset.n_actions,
        len_list=deezer_dataset.len_list,
        batch_size=1,
        random_state=1,
        dim=96,
        epsilon=0.1,
        # policy_name="lin_egreedy_exploit",
    ),
    KLUpperConfidenceBound(
        n_actions=deezer_dataset.n_actions,
        len_list=deezer_dataset.len_list,
        batch_size=1,
        random_state=1,
        policy_name="kl_ucp",
    ),
    SegmentPolicy(
        EpsilonGreedy(
            n_actions=deezer_dataset.n_actions,
            len_list=deezer_dataset.len_list,
            batch_size=1,
            random_state=1,
            epsilon=0.01,
        ),
        n_segments=100,
        policy_name="seg_egreedy_exploit",
    ),
    SegmentPolicy(
        EpsilonGreedy(
            n_actions=deezer_dataset.n_actions,
            len_list=deezer_dataset.len_list,
            batch_size=1,
            random_state=1,
            epsilon=0.1,
        ),
        n_segments=100,
        policy_name="seg_egreedy_explore",
    ),
    SegmentPolicy(
        BernoulliTS(
            n_actions=deezer_dataset.n_actions,
            len_list=deezer_dataset.len_list,
            batch_size=1,
            random_state=1,
            alpha=1,
            beta=1,
        ),
        n_segments=100,
        policy_name="seg_ts_naive",
    ),
    SegmentPolicy(
        BernoulliTS(
            n_actions=deezer_dataset.n_actions,
            len_list=deezer_dataset.len_list,
            batch_size=1,
            random_state=1,
            alpha=1,
            beta=100,
        ),
        n_segments=100,
        policy_name="seg_ts_pessimistic",
    ),
    SegmentPolicy(
        ExploreThenCommit(
            n_actions=deezer_dataset.n_actions,
            len_list=deezer_dataset.len_list,
            batch_size=1,
            random_state=1,
            min_n=20,
        ),
        n_segments=100,
        policy_name="seg_etc_exploit",
    ),
    SegmentPolicy(
        ExploreThenCommit(
            n_actions=deezer_dataset.n_actions,
            len_list=deezer_dataset.len_list,
            batch_size=1,
            random_state=1,
            min_n=100,
        ),
        n_segments=100,
        policy_name="seg_etc_explore",
    ),
    SegmentPolicy(
        KLUpperConfidenceBound(
            n_actions=deezer_dataset.n_actions,
            len_list=deezer_dataset.len_list,
            batch_size=1,
            random_state=1,
            policy_name="kl_ucp",
        ),
        n_segments=100,
        policy_name="seg_kl_ucp",
    ),
]

In [5]:
deezer_experiment = DeezerExperiment(
    dataset=deezer_dataset,
    policies=[
        (policy, {"users_per_batch": 20000})
        if policy.policy_name[:3] != "lin"
        else (policy, {"users_per_batch": 5000})
        for policy in policies
    ],
)

(Accidentally ran with linear dimension set to 96 instead of 97; so stopped early and reran the rest as a separate experiment)

In [6]:
deezer_experiment.run_experiment()

6:41:51 INFO: Running experiment
6:41:51 INFO: Learning and obtaining policy feedback
6:41:51 INFO: [1 of 17] Learning and obtaining random feedback


Simulating online learning: 100%|██████████| 2000000/2000000 [06:05<00:00, 5471.77it/s]


6:48:57 INFO: [2 of 17] Learning and obtaining egreedy_exploit feedback


Simulating online learning: 100%|██████████| 2000000/2000000 [06:22<00:00, 5232.57it/s]


6:56:17 INFO: [3 of 17] Learning and obtaining egreedy_explore feedback


Simulating online learning: 100%|██████████| 2000000/2000000 [06:33<00:00, 5077.49it/s]


7:03:42 INFO: [4 of 17] Learning and obtaining ts_naive feedback


Simulating online learning: 100%|██████████| 2000000/2000000 [11:26<00:00, 2912.92it/s]


7:18:52 INFO: [5 of 17] Learning and obtaining ts_pessimistic feedback


Simulating online learning: 100%|██████████| 2000000/2000000 [10:51<00:00, 3071.47it/s]


7:30:34 INFO: [6 of 17] Learning and obtaining etc_exploit feedback


Simulating online learning: 100%|██████████| 2000000/2000000 [09:20<00:00, 3567.55it/s]


7:40:59 INFO: [7 of 17] Learning and obtaining etc_explore feedback


Simulating online learning: 100%|██████████| 2000000/2000000 [10:15<00:00, 3246.77it/s]


7:52:07 INFO: [8 of 17] Learning and obtaining linear_epsilon_greedy_0.01 feedback


Simulating online learning:   0%|          | 0/500000 [00:00<?, ?it/s]


ValueError: matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 96 is different from 97)

In [7]:
deezer_experiment.get_policy_rewards()

7:54:45 INFO: Estimating reward confidence interval for random baseline feedback
7:54:45 INFO: [1 of 17] Estimating reward confindence interval for random feedback
7:55:00 INFO: [2 of 17] Estimating reward confindence interval for egreedy_exploit feedback
7:55:18 INFO: [3 of 17] Estimating reward confindence interval for egreedy_explore feedback
7:55:37 INFO: [4 of 17] Estimating reward confindence interval for ts_naive feedback
7:55:55 INFO: [5 of 17] Estimating reward confindence interval for ts_pessimistic feedback
7:56:13 INFO: [6 of 17] Estimating reward confindence interval for etc_exploit feedback
7:56:32 INFO: [7 of 17] Estimating reward confindence interval for etc_explore feedback
7:56:47 INFO: [8 of 17] Estimating reward confindence interval for linear_epsilon_greedy_0.01 feedback


KeyError: 'reward'

In [9]:
import pickle
with open("../results_deezer_1_7.pickle","wb") as f:
    pickle.dump(deezer_experiment.output, f)

In [8]:
deezer_experiment.output

{'policy_feedback': {'random': {'reward': array([0, 0, 0, ..., 0, 0, 0]),
   'n_rounds': 6649827,
   'n_actions': 862,
   'segments': array([34, 34, 34, ..., 66, 66, 66], dtype=int8),
   'batches': array([ 0,  0,  0, ..., 99, 99, 99], dtype=int8)},
  'egreedy_exploit': {'reward': array([0, 0, 0, ..., 0, 0, 0]),
   'n_rounds': 6980548,
   'n_actions': 862,
   'segments': array([34, 34, 34, ..., 66, 66, 66], dtype=int8),
   'batches': array([ 0,  0,  0, ..., 99, 99, 99], dtype=int8)},
  'egreedy_explore': {'reward': array([0, 0, 0, ..., 0, 0, 0]),
   'n_rounds': 7269248,
   'n_actions': 862,
   'segments': array([34, 34, 34, ..., 66, 66, 66], dtype=int8),
   'batches': array([ 0,  0,  0, ..., 99, 99, 99], dtype=int8)},
  'ts_naive': {'reward': array([0, 0, 0, ..., 1, 0, 1]),
   'n_rounds': 6883744,
   'n_actions': 862,
   'segments': array([34, 34, 34, ..., 66, 66, 66], dtype=int8),
   'batches': array([ 0,  0,  0, ..., 99, 99, 99], dtype=int8)},
  'ts_pessimistic': {'reward': array([0, 

(Resuming from where we left off, but this time with correct dimension for linear policies)

In [10]:
policies_cont = [
    LinEpsilonGreedy(
        n_actions=deezer_dataset.n_actions,
        len_list=deezer_dataset.len_list,
        batch_size=1,
        random_state=1,
        dim=97,
        epsilon=0.01,
        # policy_name="lin_egreedy_explore",
    ),
    LinEpsilonGreedy(
        n_actions=deezer_dataset.n_actions,
        len_list=deezer_dataset.len_list,
        batch_size=1,
        random_state=1,
        dim=97,
        epsilon=0.1,
        # policy_name="lin_egreedy_exploit",
    ),
    KLUpperConfidenceBound(
        n_actions=deezer_dataset.n_actions,
        len_list=deezer_dataset.len_list,
        batch_size=1,
        random_state=1,
        policy_name="kl_ucp",
    ),
    SegmentPolicy(
        EpsilonGreedy(
            n_actions=deezer_dataset.n_actions,
            len_list=deezer_dataset.len_list,
            batch_size=1,
            random_state=1,
            epsilon=0.01,
        ),
        n_segments=100,
        policy_name="seg_egreedy_exploit",
    ),
    SegmentPolicy(
        EpsilonGreedy(
            n_actions=deezer_dataset.n_actions,
            len_list=deezer_dataset.len_list,
            batch_size=1,
            random_state=1,
            epsilon=0.1,
        ),
        n_segments=100,
        policy_name="seg_egreedy_explore",
    ),
    SegmentPolicy(
        BernoulliTS(
            n_actions=deezer_dataset.n_actions,
            len_list=deezer_dataset.len_list,
            batch_size=1,
            random_state=1,
            alpha=1,
            beta=1,
        ),
        n_segments=100,
        policy_name="seg_ts_naive",
    ),
    SegmentPolicy(
        BernoulliTS(
            n_actions=deezer_dataset.n_actions,
            len_list=deezer_dataset.len_list,
            batch_size=1,
            random_state=1,
            alpha=1,
            beta=100,
        ),
        n_segments=100,
        policy_name="seg_ts_pessimistic",
    ),
    SegmentPolicy(
        ExploreThenCommit(
            n_actions=deezer_dataset.n_actions,
            len_list=deezer_dataset.len_list,
            batch_size=1,
            random_state=1,
            min_n=20,
        ),
        n_segments=100,
        policy_name="seg_etc_exploit",
    ),
    SegmentPolicy(
        ExploreThenCommit(
            n_actions=deezer_dataset.n_actions,
            len_list=deezer_dataset.len_list,
            batch_size=1,
            random_state=1,
            min_n=100,
        ),
        n_segments=100,
        policy_name="seg_etc_explore",
    ),
    SegmentPolicy(
        KLUpperConfidenceBound(
            n_actions=deezer_dataset.n_actions,
            len_list=deezer_dataset.len_list,
            batch_size=1,
            random_state=1,
            policy_name="kl_ucp",
        ),
        n_segments=100,
        policy_name="seg_kl_ucp",
    ),
]

In [14]:
deezer_experiment = DeezerExperiment(
    dataset=deezer_dataset,
    policies=[
        (policy, {"users_per_batch": 20000})
        if policy.policy_name[:3] != "lin"
        else (policy, {"users_per_batch": 1000})
        for policy in policies_cont
    ],
)

In [15]:
deezer_experiment.run_experiment()

8:01:26 INFO: Running experiment
8:01:26 INFO: Learning and obtaining policy feedback
8:01:26 INFO: [1 of 10] Learning and obtaining linear_epsilon_greedy_0.01 feedback


Simulating online learning: 100%|██████████| 100000/100000 [28:56<00:00, 57.60it/s] 


8:30:23 INFO: [2 of 10] Learning and obtaining linear_epsilon_greedy_0.1 feedback


Simulating online learning: 100%|██████████| 100000/100000 [53:04<00:00, 31.40it/s]    

9:23:28 INFO: [3 of 10] Learning and obtaining kl_ucp feedback



Simulating online learning: 100%|██████████| 2000000/2000000 [22:28<00:00, 1483.60it/s]


9:47:06 INFO: [4 of 10] Learning and obtaining seg_egreedy_exploit feedback


Simulating online learning: 100%|██████████| 2000000/2000000 [06:37<00:00, 5029.07it/s]


9:54:24 INFO: [5 of 10] Learning and obtaining seg_egreedy_explore feedback


Simulating online learning: 100%|██████████| 2000000/2000000 [06:47<00:00, 4912.56it/s]


10:01:55 INFO: [6 of 10] Learning and obtaining seg_ts_naive feedback


Simulating online learning: 100%|██████████| 2000000/2000000 [10:49<00:00, 3078.94it/s]


10:13:28 INFO: [7 of 10] Learning and obtaining seg_ts_pessimistic feedback


Simulating online learning: 100%|██████████| 2000000/2000000 [10:33<00:00, 3156.04it/s]


10:24:44 INFO: [8 of 10] Learning and obtaining seg_etc_exploit feedback


Simulating online learning: 100%|██████████| 2000000/2000000 [09:36<00:00, 3470.69it/s]


10:35:06 INFO: [9 of 10] Learning and obtaining seg_etc_explore feedback


Simulating online learning: 100%|██████████| 2000000/2000000 [10:52<00:00, 3067.17it/s]


10:46:47 INFO: [10 of 10] Learning and obtaining seg_kl_ucp feedback


Simulating online learning: 100%|██████████| 2000000/2000000 [19:18<00:00, 1725.83it/s] 


11:06:54 INFO: Done in 9382.91 seconds
11:06:54 INFO: Estimating reward confidence interval for random baseline feedback
11:06:54 INFO: [1 of 10] Estimating reward confindence interval for linear_epsilon_greedy_0.01 feedback
11:06:55 INFO: [2 of 10] Estimating reward confindence interval for linear_epsilon_greedy_0.1 feedback
11:06:56 INFO: [3 of 10] Estimating reward confindence interval for kl_ucp feedback
11:07:12 INFO: [4 of 10] Estimating reward confindence interval for seg_egreedy_exploit feedback
11:07:28 INFO: [5 of 10] Estimating reward confindence interval for seg_egreedy_explore feedback
11:07:51 INFO: [6 of 10] Estimating reward confindence interval for seg_ts_naive feedback
11:08:11 INFO: [7 of 10] Estimating reward confindence interval for seg_ts_pessimistic feedback
11:08:27 INFO: [8 of 10] Estimating reward confindence interval for seg_etc_exploit feedback
11:08:43 INFO: [9 of 10] Estimating reward confindence interval for seg_etc_explore feedback
11:08:59 INFO: [10 of 

In [16]:
import pickle
with open("../results_deezer_8_17.pickle","wb") as f:
    pickle.dump(deezer_experiment.output, f)

In [17]:
deezer_experiment.output

{'policy_feedback': {'linear_epsilon_greedy_0.01': {'reward': array([0, 0, 0, ..., 1, 1, 1]),
   'n_rounds': 345584,
   'n_actions': 862,
   'segments': array([34, 34, 34, ..., 65, 65, 65], dtype=int8),
   'batches': array([ 0,  0,  0, ..., 99, 99, 99], dtype=int8)},
  'linear_epsilon_greedy_0.1': {'reward': array([0, 0, 0, ..., 1, 1, 1]),
   'n_rounds': 359584,
   'n_actions': 862,
   'segments': array([34, 34, 34, ..., 65, 65, 65], dtype=int8),
   'batches': array([ 0,  0,  0, ..., 99, 99, 99], dtype=int8)},
  'kl_ucp': {'reward': array([0, 0, 0, ..., 0, 0, 0]),
   'n_rounds': 7560780,
   'n_actions': 862,
   'segments': array([34, 34, 34, ..., 66, 66, 66], dtype=int8),
   'batches': array([ 0,  0,  0, ..., 99, 99, 99], dtype=int8)},
  'seg_egreedy_exploit': {'reward': array([0, 0, 0, ..., 0, 0, 0]),
   'n_rounds': 7870019,
   'n_actions': 862,
   'segments': array([34, 34, 34, ..., 66, 66, 66], dtype=int8),
   'batches': array([ 0,  0,  0, ..., 99, 99, 99], dtype=int8)},
  'seg_egre