In [2]:
import numpy as np
import pandas as pd
import altair as alt
from bandits.environment.cascade.context_free import CascadeContextFreeBandit
from bandits.policy.context_free import BernoulliTS, Random
from bandits.plotting import plot_beta_dist
import vegafusion as vf
from typing import TypedDict, Union
from dataclasses import dataclass
from functools import partial
import tqdm

In [3]:
alt.data_transformers.enable('json') # saves the data locally in .json file so notebook doesn't get large
# or can allow altair to keep the data in the notebook using alt.data_transformers.disable_max_rows() but it will create large notebooks!

DataTransformerRegistry.enable('json')

# helper functions

In [4]:
def plot_pdf_vs_actual(
    pdf_df: pd.DataFrame,
    actuals_df: pd.DataFrame,
    width: int =500,
    height: int =250
) -> alt.Chart:
        
    pdf_charts = alt.Chart(pdf_df).mark_line().encode(
        y=alt.Y('pdf'),
        x=alt.X('x', title='θ'),
        color=alt.Color('action:N', legend=None),
        tooltip = [
            'action',
            alt.Tooltip('pdf', format='0.4', title='θ'),
        ]
    )

    actual_charts = alt.Chart(actuals_df).mark_rule().encode(
        x=alt.X('w', title='θ'),
        color=alt.Color('action:N', legend=None),
        tooltip = [
            'action',
            alt.Tooltip('w', format='0.4', title='θ'),
        ]        
    )

    final_chart = (pdf_charts + actual_charts).properties(
        width=width, height=height
    )

    return final_chart

In [5]:
def plot_actual_vs_predicted(
    policy: BernoulliTS,
    env: CascadeContextFreeBandit
) -> alt.Chart:

    pred = (policy.reward_counts / policy.action_counts)
    act = env.weights

    chart_df = pd.DataFrame(dict(pred=pred, act=act)).assign(
        diff=lambda x: (x.act - x.pred).abs()
    ).assign(
        arm=lambda x: x.index,
        optimal_arm=lambda x: x.arm.isin(env.optimal_action),
        act_sort=lambda x: x.act,
    ).sort_values(
        ['diff'], ascending=False
    ).reset_index(drop=True)

    chart_ff_df = chart_df.melt(id_vars=['arm', 'optimal_arm', 'diff', 'act_sort'])

    return alt.Chart(chart_ff_df).mark_point().encode(
        y=alt.Y('arm:O', sort=alt.SortField("act_sort", "descending")),
        x=alt.X('value'),
        color=alt.Color('variable'),
    )

In [6]:
def plot_observed_optimal_action_prob(
    reporting_df: pd.DataFrame,
    height: int = 275,
    width: int = 675,    
) -> tuple[pd.DataFrame, alt.Chart]:
    policy_prob_df = reporting_df.assign(
        n_trials=1
    ).groupby(
        ['policy_batch_check'], as_index=False
    )[['n_trials','optimal_action_id']].sum().assign(
        prob_of_optimal_action=lambda x: x['optimal_action_id'] /  x['n_trials']
    )

    chart = alt.Chart(policy_prob_df).mark_line().encode(
        x='policy_batch_check', y='prob_of_optimal_action'
    ).properties(
        width=width, 
        height=height,
    )
    return policy_prob_df, chart

In [7]:
def plot_pdf_with_actuals(
    policy: BernoulliTS,
    env: CascadeContextFreeBandit,
    width: int = 500,
    height: int = 250,
) -> tuple[pd.DataFrame, pd.DataFrame, alt.Chart]:
    actuals_df = pd.DataFrame(dict(w=env.weights)).assign(action=lambda x: x.index)

    all_pdf = []
    for idx in range(policy.n_actions):
        pdf = plot_beta_dist(
            alpha=policy.alpha[idx] + policy.reward_counts[idx],
            beta=policy.beta[idx] + (policy.action_counts[idx] - policy.reward_counts[idx])
        ).assign(
            action=idx,
        )
        all_pdf.append(pdf)

    all_pdf_df = pd.concat(all_pdf, axis=0).reset_index(drop=True)    

    pdf_charts = alt.Chart(all_pdf_df).mark_line().encode(
        y=alt.Y('pdf'),
        x=alt.X('x', title='θ'),
        color=alt.Color('action:N', legend=None)
    )

    actual_charts = alt.Chart(actuals_df).mark_rule().encode(
        x=alt.X('w', title='θ'),
        color=alt.Color('action:N', legend=None)
    )

    final_chart = (pdf_charts + actual_charts).properties(
        width=width, height=height
    )

    return all_pdf_df, actuals_df, final_chart

In [8]:
class ActionRewardLogging(TypedDict):
    action: list[int]
    reward: float
    prob_of_click: float

def harmonise_reporting(
    reporting: list[ActionRewardLogging],
    env: CascadeContextFreeBandit,
    policy_batch_check: int = 500,
) -> pd.DataFrame:
    
    reporting_df = pd.DataFrame(reporting).assign(
        optimal_prob_of_click=env.optimal_reward,
        time_idx=lambda x: x.index
    )
    reporting_df['cumulative_reward'] = reporting_df['reward'].cumsum()
    reporting_df['avg_cumulative_reward'] = reporting_df['cumulative_reward'] / (reporting_df['time_idx'] + 1)
    reporting_df['avg_cumulative_reward'] = reporting_df['avg_cumulative_reward'].fillna(0)

    reporting_df['action_as_str'] = reporting_df['action'].apply(lambda x: '|'.join([str(y) for y in x]))

    reporting_df['optimal_action_id'] = reporting_df['action'].apply(lambda x: all(x == env.optimal_action))
    reporting_df['cumulative_optimal_action_taken'] = reporting_df['optimal_action_id'].cumsum()
    reporting_df['avg_cumulative_optimal_action_taken'] = reporting_df['cumulative_optimal_action_taken']/ (reporting_df['time_idx'] + 1)
    reporting_df['avg_cumulative_optimal_action_taken'] = reporting_df['avg_cumulative_optimal_action_taken'].fillna(0)
    
    reporting_df['policy_batch_check'] = (reporting_df['time_idx']  - reporting_df['time_idx'] % policy_batch_check)
    return reporting_df

# Env setup

In [9]:
N_ACTIONS = 50
LEN_LIST = 5

In [10]:
env = CascadeContextFreeBandit(
    weights=np.random.beta(a=1, b=99, size=N_ACTIONS),
    max_steps=1_000_000,
    len_list=LEN_LIST,
)

In [11]:
outputs_of_policies = {}

In [11]:
actuals_df = pd.DataFrame(dict(w=env.weights)).assign(action=lambda x: x.index)
actuals_df.head()

Unnamed: 0,w,action
0,0.00699,0
1,0.006181,1
2,0.010974,2
3,0.00254,3
4,0.023079,4


In [12]:
width = N_ACTIONS * 13
actual_charts = alt.Chart(actuals_df).mark_rule().encode(
    x=alt.X(
        'action:N',
        sort=alt.SortField("w", order='descending'),
        axis=alt.Axis(orient='bottom', labelAngle=0)
    ),
    color=alt.Color('action:N', legend=None),
    y=alt.Y('w', title='θ'),
    text=alt.Text('w', format='0.3'),
    tooltip=[
        'action',
        alt.Tooltip('w', format='0.3', title='θ')
    ]
)

final_chart = (
    actual_charts +
    actual_charts.mark_point(filled=True, size=50) + 
    actual_charts.mark_text(align='left', angle=45*7, dx=5)
).properties(
    width=width, height=225
)

final_chart

# Random Policy

In [13]:
policy = Random(
    n_actions=env.n_actions,
    len_list=env.len_list,
    random_state=1234,
    batch_size=1,
)

In [14]:
observation, info = env.reset(seed=1234)
action = policy.select_action()
action

array([37, 45,  6, 44, 13])

In [15]:
reporting = []

while True:
    _, reward, terminated, truncated, info = env.step(action=action)
    reporting.append(dict(
        action=action,
        reward=reward,
        prob_of_click=info["prob_of_click"],        
    ))

    policy.cascade_params_update(
        action=action,
        reward_position=info["position_of_click"]
    )

    if truncated:
        break    

    action = policy.select_action()

In [16]:
reporting_df = harmonise_reporting(
    reporting=reporting,
    env=env,
    policy_batch_check=500,
)

In [17]:
policy_name = 'random'
outputs_of_policies[policy_name] = reporting_df.assign(policy=policy_name)


In [18]:
reporting_ff_df = reporting_df.melt(
    id_vars=['time_idx', 'action'],
    value_vars=['prob_of_click', 'optimal_prob_of_click']
)

reporting_ff_df.head()

Unnamed: 0,time_idx,action,variable,value
0,0,"[37, 45, 6, 44, 13]",prob_of_click,0.024118
1,1,"[38, 35, 44, 13, 43]",prob_of_click,0.049109
2,2,"[46, 7, 38, 6, 17]",prob_of_click,0.060059
3,3,"[42, 43, 10, 3, 33]",prob_of_click,0.027847
4,4,"[11, 12, 45, 6, 44]",prob_of_click,0.019623


In [19]:
plot_every = 1_000
mask_df = (reporting_df['time_idx'] % plot_every) == 0
mask_ff_df = (reporting_ff_df['time_idx'] % plot_every) == 0

In [20]:
chart = alt.Chart(reporting_ff_df[mask_ff_df]).mark_line().encode(
    y=alt.Y('value'),
    x=alt.X('time_idx'),
    color='variable'
).properties(width=600)

vf.save_png(
    chart, 
    f"context_free_outputs/{policy_name}__prob_of_click.png",
)

chart


In [21]:
chart = alt.Chart(reporting_df[mask_df]).mark_line().encode(
    x='time_idx',
    y='avg_cumulative_reward',
).properties(
    width=700, 
)

vf.save_png(
    chart, 
    f"context_free_outputs/{policy_name}__avg_reward.png",
)

chart


In [22]:
chart = plot_actual_vs_predicted(policy=policy, env=env)
vf.save_png(
    chart, 
    f"context_free_outputs/{policy_name}__action_probs_actual_vs_predicted.png",
)

chart

In [23]:
policy_prob_df, chart = plot_observed_optimal_action_prob(
    reporting_df=reporting_df,
    height=275,
    width=675
)

chart

In [24]:
policy_prob_df['optimal_action_id'].value_counts()

optimal_action_id
0    2000
Name: count, dtype: int64

# Thompson Sampling Policy

## Uninformed Prior

In [25]:
policy = BernoulliTS(
    n_actions=env.n_actions,
    len_list=env.len_list,
    random_state=1234,
    batch_size=1,
)

In [26]:
policy

BernoulliTS(n_actions=50, len_list=5, batch_size=1, random_state=1234, alpha=array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]), beta=array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]))

In [27]:
prior_df = plot_beta_dist(alpha=1, beta=1)
actuals_df = pd.DataFrame(dict(w=env.weights)).assign(action=lambda x: x.index)

actuals_chart = alt.Chart(actuals_df).mark_rule(opacity=0.5).encode(
    x=alt.X('w', title='θ'),
)
priors_chart = alt.Chart(prior_df).mark_area(opacity=0.5).encode(
    x=alt.X('x', title='θ'),
    y=alt.Y('pdf')
)

final_chart = (
    priors_chart + actuals_chart
).properties(
    width=500, height=200
)

final_chart

In [28]:
observation, info = env.reset(seed=34325)
action = policy.select_action()
action

array([13,  3, 43, 16, 48])

In [29]:
reporting = []

while True:
    _, reward, terminated, truncated, info = env.step(action=action)
    reporting.append(dict(
        action=action,
        reward=reward,
        prob_of_click=info["prob_of_click"],        
    ))

    policy.cascade_params_update(
        action=action,
        reward_position=info["position_of_click"]
    )

    if truncated:
        break    
    
    action = policy.select_action()

In [30]:
reporting_df = harmonise_reporting(reporting=reporting, env=env, policy_batch_check=500)

In [31]:
policy_name = 'ts-priors-Beta(1, 1)'
outputs_of_policies[policy_name] = reporting_df.assign(policy=policy_name)

In [32]:
reporting_ff_df = reporting_df.melt(
    id_vars=['time_idx', 'action'],
    value_vars=['prob_of_click', 'optimal_prob_of_click']
)

reporting_ff_df.head()

Unnamed: 0,time_idx,action,variable,value
0,0,"[13, 3, 43, 16, 48]",prob_of_click,0.033725
1,1,"[35, 30, 14, 18, 39]",prob_of_click,0.046942
2,2,"[5, 47, 27, 19, 9]",prob_of_click,0.024355
3,3,"[8, 10, 24, 23, 2]",prob_of_click,0.041735
4,4,"[29, 23, 38, 0, 4]",prob_of_click,0.070604


In [33]:
plot_every = 100
mask_df = (reporting_df['time_idx'] % plot_every) == 0
mask_ff_df = (reporting_ff_df['time_idx'] % plot_every) == 0

In [34]:
chart = alt.Chart(reporting_ff_df[mask_ff_df]).mark_line().encode(
    y=alt.Y('value'),
    x=alt.X('time_idx'),
    color='variable'
).properties(width=600)

vf.save_png(
    chart, 
    f"context_free_outputs/{policy_name}__prob_of_click.png",
)

chart

In [35]:
chart = alt.Chart(reporting_df[mask_df]).mark_line().encode(
    x='time_idx',
    y='avg_cumulative_reward',
).properties(
    width=700, 
)

vf.save_png(
    chart, 
    f"context_free_outputs/{policy_name}__avg_reward.png",
)

chart

In [36]:
chart = plot_actual_vs_predicted(policy=policy, env=env)

vf.save_png(
    chart, 
    f"context_free_outputs/{policy_name}__action_probs_actual_vs_predicted.png",
)

chart

In [37]:
policy_prob_df, chart = plot_observed_optimal_action_prob(
    reporting_df=reporting_df,
    height=275,
    width=675
)


vf.save_png(
    chart, 
    f"context_free_outputs/{policy_name}__observed_optimal_action_distribution.png",
)


chart

In [38]:
all_pdf_df, actuals_df, chart = plot_pdf_with_actuals(env=env, policy=policy)


vf.save_png(
    chart, 
    f"context_free_outputs/{policy_name}__action_beta_distributions.png",
)


chart

## Pessimistic priors

In [39]:
policy = BernoulliTS(
    n_actions=env.n_actions,
    len_list=env.len_list,
    random_state=1234,
    batch_size=1,
    alpha=np.ones(env.n_actions) * 1,
    beta=np.ones(env.n_actions) * 99,
)

In [40]:
prior_df = plot_beta_dist(alpha=1, beta=99)
actuals_df = pd.DataFrame(dict(w=env.weights)).assign(action=lambda x: x.index)

actuals_chart = alt.Chart(actuals_df).mark_rule(opacity=0.5).encode(
    x=alt.X('w', title='θ'),
)
priors_chart = alt.Chart(prior_df).mark_area(opacity=0.5).encode(
    x=alt.X('x', title='θ'),
    y=alt.Y('pdf')
)

final_chart = (
    priors_chart + actuals_chart
).properties(
    width=500, height=200
)

final_chart

In [41]:
observation, info = env.reset(seed=1234)
action = policy.select_action()
action

array([41, 46, 43, 31,  7])

In [42]:
reporting = []

while True:
    _, reward, terminated, truncated, info = env.step(action=action)
    reporting.append(dict(
        action=action,
        reward=reward,
        prob_of_click=info["prob_of_click"],        
    ))

    policy.cascade_params_update(
        action=action,
        reward_position=info["position_of_click"]
    )

    if truncated:
        break    
    
    action = policy.select_action()

In [43]:
reporting_df = harmonise_reporting(reporting=reporting, env=env, policy_batch_check=500)

In [44]:
policy_name = 'ts-priors-Beta(1, 99)'
outputs_of_policies[policy_name] = reporting_df.assign(policy=policy_name)

reporting_ff_df = reporting_df.melt(
    id_vars=['time_idx', 'action'],
    value_vars=['prob_of_click', 'optimal_prob_of_click']
)

reporting_ff_df.head()

Unnamed: 0,time_idx,action,variable,value
0,0,"[41, 46, 43, 31, 7]",prob_of_click,0.034918
1,1,"[46, 12, 35, 0, 49]",prob_of_click,0.018589
2,2,"[42, 9, 31, 27, 21]",prob_of_click,0.021633
3,3,"[30, 39, 10, 19, 8]",prob_of_click,0.041203
4,4,"[17, 26, 48, 35, 15]",prob_of_click,0.062146


In [45]:
plot_every = 100
mask_df = (reporting_df['time_idx'] % plot_every) == 0
mask_ff_df = (reporting_ff_df['time_idx'] % plot_every) == 0


chart = alt.Chart(reporting_ff_df[mask_ff_df]).mark_line().encode(
    y=alt.Y('value'),
    x=alt.X('time_idx'),
    color='variable'
).properties(width=600)

vf.save_png(
    chart, 
    f"context_free_outputs/{policy_name}__prob_of_click.png",
)
chart


In [46]:
chart = alt.Chart(reporting_df[mask_df]).mark_line().encode(
    x='time_idx',
    y='avg_cumulative_reward',
).properties(
    width=700, 
)

vf.save_png(
    chart, 
    f"context_free_outputs/{policy_name}__avg_reward.png",
)

chart

In [47]:
chart = plot_actual_vs_predicted(policy=policy, env=env)

vf.save_png(
    chart, 
    f"context_free_outputs/{policy_name}__action_probs_actual_vs_predicted.png",
)

chart

In [48]:
policy_prob_df, chart = plot_observed_optimal_action_prob(
    reporting_df=reporting_df,
    height=275,
    width=675
)


vf.save_png(
    chart, 
    f"context_free_outputs/{policy_name}__observed_optimal_action_distribution.png",
)


chart

In [49]:
all_pdf_df, actuals_df, chart = plot_pdf_with_actuals(env=env, policy=policy)


vf.save_png(
    chart, 
    f"context_free_outputs/{policy_name}__action_beta_distributions.png",
)


chart

# Running for N episodes

In [12]:
def get_alpha(mean: float, beta: float) -> float:
    """
    mean of Beta distribution is given by: alpha/(alpha+beta)
    
    We have re-written the formula to give alpha
    as a function of the mean and beta

    Args:
        target_p (float): probability of success
        beta (float): 

    """
    return ((mean/(1-mean)) * beta) + ((1/(1-mean)) * (1-2 * mean))

In [13]:
def get_random_policy(
    seed: int,
    env: CascadeContextFreeBandit,
    batch_size: int = 1
) -> Random:

    return Random(
        n_actions=env.n_actions,
        len_list=env.len_list,
        random_state=seed,
        batch_size=batch_size,
    )    

def get_ts_policy(
    seed: int,
    env: CascadeContextFreeBandit,
    batch_size: int = 1,
    alpha: int = 1,
    beta: int = 1,
) -> BernoulliTS:
    return BernoulliTS(
        n_actions=env.n_actions,
        len_list=env.len_list,
        random_state=seed,
        batch_size=batch_size,
        alpha=np.ones(env.n_actions) * alpha,
        beta=np.ones(env.n_actions) * beta,
    )
    
def play_episode(
    env: CascadeContextFreeBandit,
    policy: Union[Random, BernoulliTS],
    seed: int,
) -> list[ActionRewardLogging]:
    observation, info = env.reset(seed=seed)
    action = policy.select_action()

    reporting = []

    while True:
        _, reward, terminated, truncated, info = env.step(action=action)
        reporting.append(dict(
            action=action,
            reward=reward,
            prob_of_click=info["prob_of_click"],        
        ))

        policy.cascade_params_update(
            action=action,
            reward_position=info["position_of_click"]
        )

        if truncated:
            break    
        
        action = policy.select_action()
    return reporting

In [17]:
# reduce time of an episode!
n_episodes = 20
episode_steps = 500_000
env = CascadeContextFreeBandit(
    weights=np.random.beta(a=1, b=99, size=N_ACTIONS),
    max_steps=episode_steps,
    len_list=LEN_LIST,
)

In [15]:
# I will simulate Beta(1, 1) and Beta(1, 99) but would like to consider another option as well.
# Below we plot PDFs for different combinations of alpha and beta given a fixed sum of alpha + beta = 10 or 100.
# NOTE: Beta (1, 1) assumes n_trials = successes + failures = 0 as by definition we have alpha = successes+1, beta = failures+1.
# However, we could think of alpha=beta=1 as being the same as one success and one failure.
# Also, the Beta distribution is defined for any alpha>0 and beta>0 - i.e. Beta(0.1, 0.1) would also create a valid distribution.
# THE LATER can be exploited when warm starting a Beta Distribution for observed conversion rates

pdfs = []
alpha_max = 10
for alpha in range(1, alpha_max):
    for base in [alpha_max, 100]:
        beta = base - alpha
        mean = alpha / (alpha + beta)
        title = f'Beta({alpha}, {beta})'
        pdf_case_df = plot_beta_dist(alpha=alpha, beta=beta, size=100)
        pdf_case_df = pdf_case_df.assign(
            mean=mean, alpha=alpha, beta=beta, title=title, n_trials=alpha+beta,# really minus 2!
        )
        pdfs.append(pdf_case_df)
pdfs_df = pd.concat(pdfs, axis=0).reset_index(drop=True)
pdfs_df.head()

Unnamed: 0,x,pdf,mean,alpha,beta,title,n_trials
0,0.000111,8.992,0.1,1,9,"Beta(1, 9)",10
1,0.005523,8.609976,0.1,1,9,"Beta(1, 9)",10
2,0.010934,8.24223,0.1,1,9,"Beta(1, 9)",10
3,0.016345,7.888302,0.1,1,9,"Beta(1, 9)",10
4,0.021757,7.547744,0.1,1,9,"Beta(1, 9)",10


In [16]:
chart = alt.Chart(pdfs_df).mark_line().encode(
    y=alt.Y('pdf'),
    x=alt.X('x', title='θ'),
    color=alt.Color(
        'title',
        scale=alt.Scale(
            scheme="lightgreyred", reverse=False
        )
    ),
    tooltip=['title']
).properties(width=700, height=200)
chart = chart + chart.mark_bar(opacity=0.05)
chart.facet(row='n_trials').resolve_scale(x='independent', y='independent')

In [18]:

POLICY_TYPE_TO_FUNCTION: dict[str, callable] = {
    #"random": get_random_policy,
    "ts-priors-Beta(1, 1)": partial(get_ts_policy, alpha=1, beta=1),
    "ts-priors-Beta(1, 9)": partial(get_ts_policy, alpha=1, beta=9),    
    "ts-priors-Beta(1, 99)": partial(get_ts_policy, alpha=1, beta=99),
}

episode_results = {
    policy_type: []
    for policy_type in POLICY_TYPE_TO_FUNCTION.keys()
}


for policy_type, policy_function in POLICY_TYPE_TO_FUNCTION.items():
    for episode in tqdm.tqdm(range(n_episodes)):
        policy = policy_function(env=env, seed=episode)
        reporting = play_episode(
            env=env,
            policy=policy,
            seed=episode,
        )
        episode_results[policy_type].append(reporting)

100%|██████████| 20/20 [05:15<00:00, 15.78s/it]
100%|██████████| 20/20 [05:15<00:00, 15.78s/it]
100%|██████████| 20/20 [05:11<00:00, 15.60s/it]


In [24]:
reporting = []

for policy_type, all_episode_reporting in episode_results.items():
    for episode, episode_reporting in enumerate(all_episode_reporting):
        print(f'processing: policy_type={policy_type}, episode={episode}', end='\r')
        
        episode_reporting_df = harmonise_reporting(
            reporting=episode_reporting,
            env=env,
            policy_batch_check=500
        ).assign(episode=episode, policy_type=policy_type)
        
        reporting.append(episode_reporting_df)
        
reporting_df = pd.concat(reporting, axis=0).reset_index(drop=True)
reporting_df.head()

processing: policy_type=ts-priors-Beta(1, 99), episode=19

Unnamed: 0,action,reward,prob_of_click,optimal_prob_of_click,time_idx,cumulative_reward,avg_cumulative_reward,action_as_str,optimal_action_id,cumulative_optimal_action_taken,avg_cumulative_optimal_action_taken,policy_batch_check,episode,policy_type
0,"[28, 27, 42, 17, 6]",0,0.054857,0.164943,0,0,0.0,28|27|42|17|6,False,0,0.0,0,0,"ts-priors-Beta(1, 1)"
1,"[26, 18, 5, 41, 12]",0,0.020569,0.164943,1,0,0.0,26|18|5|41|12,False,0,0.0,0,0,"ts-priors-Beta(1, 1)"
2,"[48, 13, 1, 29, 16]",0,0.043854,0.164943,2,0,0.0,48|13|1|29|16,False,0,0.0,0,0,"ts-priors-Beta(1, 1)"
3,"[34, 22, 9, 37, 20]",1,0.068668,0.164943,3,1,0.25,34|22|9|37|20,False,0,0.0,0,0,"ts-priors-Beta(1, 1)"
4,"[22, 3, 48, 49, 47]",0,0.04896,0.164943,4,1,0.2,22|3|48|49|47,False,0,0.0,0,0,"ts-priors-Beta(1, 1)"


In [28]:
reporting_df.groupby(['policy_type'], as_index=False)['episode'].agg(['min', 'max'])

Unnamed: 0,policy_type,min,max
0,"ts-priors-Beta(1, 1)",0,19
1,"ts-priors-Beta(1, 9)",0,19
2,"ts-priors-Beta(1, 99)",0,19


In [26]:
summary_fields = [
    'optimal_action_id',
    'reward',
    'cumulative_reward',
    'avg_cumulative_reward',
    'prob_of_click',
    'optimal_prob_of_click',
    'cumulative_optimal_action_taken',
    'avg_cumulative_optimal_action_taken'
]
reporting_df2 = reporting_df.groupby(['policy_type', 'time_idx', ], as_index=False)[summary_fields].mean()

In [27]:
reporting_df2.head()

Unnamed: 0,policy_type,time_idx,optimal_action_id,reward,cumulative_reward,avg_cumulative_reward,prob_of_click,optimal_prob_of_click,cumulative_optimal_action_taken,avg_cumulative_optimal_action_taken
0,"ts-priors-Beta(1, 1)",0,0.0,0.05,0.05,0.05,0.047813,0.164943,0.0,0.0
1,"ts-priors-Beta(1, 1)",1,0.0,0.05,0.1,0.05,0.049794,0.164943,0.0,0.0
2,"ts-priors-Beta(1, 1)",2,0.0,0.05,0.15,0.05,0.043061,0.164943,0.0,0.0
3,"ts-priors-Beta(1, 1)",3,0.0,0.1,0.25,0.0625,0.046309,0.164943,0.0,0.0
4,"ts-priors-Beta(1, 1)",4,0.0,0.0,0.25,0.05,0.049524,0.164943,0.0,0.0


In [29]:
plot_every = 100
mask_df = (reporting_df2['time_idx'] % plot_every) == 0
mask_df.sum(), reporting_df2.shape[0]

(15000, 1500000)

In [46]:
chart = alt.Chart(reporting_df2[mask_df].query('time_idx>100')).mark_line().encode(
    y=alt.Y('avg_cumulative_reward'),
    x=alt.X('time_idx'),
    color=alt.Color('policy_type')
).properties(width=400)


chart_zoom = alt.Chart(reporting_df2[mask_df].query('time_idx>100000')).mark_line().encode(
    y=alt.Y('avg_cumulative_reward', scale=alt.Scale(zero=False)),
    x=alt.X('time_idx'),
    color=alt.Color('policy_type')
).properties(width=400)
final_chart = chart | chart_zoom


vf.save_png(
    final_chart, 
    f"context_free_outputs/comparing_policies__avg_cumulative_reward.png",
)

final_chart

In [52]:

chart = alt.Chart(reporting_df2[mask_df]).mark_line().encode(
    y=alt.Y('prob_of_click', scale=alt.Scale(zero=False)),    
    x=alt.X('time_idx'),
    color=alt.Color('policy_type')
).properties(width=400)


chart_zoom = alt.Chart(reporting_df2[mask_df].query('time_idx>100000')).mark_line(opacity=0.8).encode(
    y=alt.Y('prob_of_click', scale=alt.Scale(zero=False)),    
    x=alt.X('time_idx'),
    color=alt.Color('policy_type')
).properties(width=400)

final_chart = chart | chart_zoom



vf.save_png(
    final_chart, 
    f"context_free_outputs/comparing_policies__prob_of_click.png",
)

final_chart

In [53]:
chart_1 = alt.Chart(reporting_df2[mask_df]).mark_line().encode(
    y=alt.Y('cumulative_optimal_action_taken'),
    x=alt.X('time_idx'),
    color=alt.Color('policy_type')
).properties(width=400)

chart_2 = alt.Chart(reporting_df2[mask_df]).mark_line().encode(
    y=alt.Y('avg_cumulative_optimal_action_taken'),
    x=alt.X('time_idx'),
    color=alt.Color('policy_type')
).properties(width=400)

chart = chart_1 | chart_2

vf.save_png(
    chart, 
    f"context_free_outputs/comparing_policies__avg_cumulative_optimal_action_prob.png",
)

chart

In [54]:
optimal_action_prob_df = reporting_df.assign(n=1).groupby(['episode', 'policy_type', 'policy_batch_check'], as_index=False)[['n','optimal_action_id']].sum()
optimal_action_prob_df.head()

Unnamed: 0,episode,policy_type,policy_batch_check,n,optimal_action_id
0,0,"ts-priors-Beta(1, 1)",0,500,0
1,0,"ts-priors-Beta(1, 1)",500,500,0
2,0,"ts-priors-Beta(1, 1)",1000,500,0
3,0,"ts-priors-Beta(1, 1)",1500,500,0
4,0,"ts-priors-Beta(1, 1)",2000,500,0


In [55]:
optimal_action_prob_df = optimal_action_prob_df.assign(prob_optimal_action=lambda x: x['optimal_action_id']/x['n'])
optimal_action_prob_df2 = optimal_action_prob_df.groupby(['policy_type', 'policy_batch_check', ], as_index=False)['prob_optimal_action'].mean()
optimal_action_prob_df2.head()

Unnamed: 0,policy_type,policy_batch_check,prob_optimal_action
0,"ts-priors-Beta(1, 1)",0,0.0
1,"ts-priors-Beta(1, 1)",500,0.0
2,"ts-priors-Beta(1, 1)",1000,0.0
3,"ts-priors-Beta(1, 1)",1500,0.0
4,"ts-priors-Beta(1, 1)",2000,0.0001


In [64]:
chart = alt.Chart(optimal_action_prob_df2).mark_line().encode(
    y=alt.Y('prob_optimal_action'),
    x=alt.X('policy_batch_check'),
    color=alt.Color('policy_type')
).properties(width=400)

chart_zoom = alt.Chart(optimal_action_prob_df2.query('policy_batch_check>200000')).mark_line().encode(
    y=alt.Y('prob_optimal_action', scale=alt.Scale(zero=False)),    
    x=alt.X('policy_batch_check'),
    color=alt.Color('policy_type')
).properties(width=400)

final_chart = chart | chart_zoom

vf.save_png(
    final_chart, 
    f"context_free_outputs/comparing_policies__batch_optimal_action_prob.png",
)

final_chart

# IGNORE FOR NOW:

A way to get an alpha and beta given you have
* an expected p/mean prob value
* a belief of what the CDF should be given a p case which you wish the prob to be close to zero at that point!

In [37]:
# now cdf of beta distribution with above being alpha
from scipy import stats
from scipy.optimize import brentq
from functools import partial

In [119]:
def observed_cdf_vs_expected(
    beta: float,
    mean: float,
    cdf_value_evaluated_at_p: float = 0.8,
    expected_cdf_value: float = 0.0001
):
    """

    Args:
        beta (float): _description_
        mean (float): _description_
        cdf_value_evaluated_at_p (float, optional): _description_. Defaults to 0.8.
        expected_cdf_value (float, optional): _description_. Defaults to 0.0001.

    Returns:
        _type_: _description_
    """
    alpha = get_alpha(mean=mean, beta=beta)
    observed_cdf = stats.beta.cdf(cdf_value_evaluated_at_p, alpha, beta)
    return  observed_cdf - expected_cdf_value


In [120]:
brute_force_approach = []
for beta in range(1, 100):
    error = observed_cdf_vs_expected(
        beta=beta, mean=0.95, cdf_value_evaluated_at_p=0.8, expected_cdf_value=0.0001
    )
    brute_force_approach.append(dict(beta=beta, error=error))

pd.DataFrame(brute_force_approach).sort_values('error').query('error>0').reset_index().head()

Unnamed: 0,index,beta,error
0,4,5,1.4e-05
1,3,4,0.00075
2,2,3,0.006546
3,1,2,0.057546
4,0,1,0.7999


In [121]:
brentq(
    partial(
        observed_cdf_vs_expected,
        mean=0.95,
        cdf_value_evaluated_at_p=0.8,
        expected_cdf_value=0.0001,
    ),
    1,
    100,
)

5.06516056109555