In [1]:
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from src.bandit.algorithms.standard import EpsilonGreedy
from src.bandit.arms.bernoulli import BernoulliArm
from src.bandit.testing.monte_carlo import test_algorithm

##### Initialize algorithm with the best known counts and values

In [29]:
arm_names=["10 then 40", "20 then 40", "10 only", "40 only"]

obs_conv_rates  = [0.10, 0.00, 0.03, 0.00]
obs_rewards     = [75.0, 70.0, 90.0, 60.0]
true_conv_rates = [0.15, 0.20, 0.14, 0.17]
true_rewards    = [85.0, 75.0, 90.0, 60.0]
starting_counts = [1, 1, 1, 1]

algo = EpsilonGreedy(
    epsilon=0.2,
    n_arms=4,
    rewards= obs_rewards,
    conv_rates=obs_conv_rates,
    counts=starting_counts
)

arms = [BernoulliArm(i) for i in true_conv_rates]

In [30]:
for i in getattr(algo, "values"):
    print(i)

7.5
0.0
2.6999999999999997
0.0


##### Assign discount for each user

In [19]:
latest_performance = users.groupby(["arm"]).mean(["conversion","reward"]).reset_index()[["arm","conversion","reward"]]
latest_performance["count"] = users.groupby(["arm"]).count().reset_index()["reward"]

latest_performance

Unnamed: 0,arm,conversion,reward,count
0,0,0.155556,13.222222,90
1,1,0.0,0.0,2
2,2,0.0,0.0,3
3,3,0.2,12.0,5


In [47]:
results = pd.DataFrame(columns=["sim","arm","conversion","reward"])
prev_counts = starting_counts
prev_rewards = obs_rewards
prev_conv_rates = obs_conv_rates

for i in range(10):
    # generate 100 users
    users = pd.DataFrame(
        data={
            "ID": range(0,100)
        }
    )

    # simulate user outcomes
    users["arm"] = [algo.select_arm() for i in users["ID"]]
    users["conversion"] = [arms[i].draw() for i in users["arm"]]
    users["reward"] = [true_rewards[i]*j for i,j in zip(users["arm"], users["conversion"])]
    latest_performance = users.groupby(["arm"]).mean(["conversion","reward"]).reset_index()[["arm","conversion"]]
    latest_performance["reward"] = users.query("reward > 0.0").groupby(["arm"]).mean(["reward"]).reset_index()[["reward"]]
    latest_performance["count"] = users.groupby(["arm"]).count().reset_index()["reward"]
    latest_performance["sim"] = i
    latest_performance.fillna(0, inplace=True)

    # update values
    next_counts = [a+b for a,b in zip(prev_counts, latest_performance["count"])]
    next_rewards = [(a/(a+b)*c)+(b/(a+b)*d) for a,b,c,d in zip(prev_counts, next_counts, prev_rewards, latest_performance["reward"])]
    next_conv_rates = [(a/(a+b)*c)+(b/(a+b)*d) for a,b,c,d in zip(prev_counts, next_counts, prev_conv_rates, latest_performance["conversion"])]

    # save latest performance
    results = pd.concat([results, latest_performance])
    
    # update assumptions for bandit
    algo = EpsilonGreedy(
        epsilon=0.2,
        n_arms=4,
        rewards= next_rewards,
        conv_rates=next_conv_rates,
        counts=next_counts
    )

In [51]:
results.query("arm==3")

Unnamed: 0,sim,arm,conversion,reward,count
3,0,3,0.2,60.0,5.0
3,1,3,0.0,0.0,5.0
3,2,3,0.0,0.0,4.0
3,3,3,0.6,0.0,5.0
3,4,3,0.5,60.0,2.0
3,5,3,0.0,0.0,6.0
3,6,3,0.666667,0.0,3.0
3,7,3,0.0,0.0,5.0
3,8,3,0.333333,0.0,3.0
3,9,3,0.142857,0.0,7.0


##### Gather conversion rates and update assumptions

new