In [103]:
import numpy as np
from scipy.stats import beta as beta_dist
from random import random
import itertools
from statsmodels.stats.proportion import proportion_confint
from statsmodels.stats.weightstats import ztest
%matplotlib inline
from matplotlib import pyplot as plt

In [111]:
class User:
    """Blank record for one simulated user.

    All fields start as ``None``; ``Treatment.treat`` fills them in:
    ``condition`` (the arm label), ``data`` (list of per-impression
    outcomes) and ``p`` (the user's sampled base donation probability).
    """

    def __init__(self):
        # Chained assignment keeps the attribute creation order:
        # condition, data, p.
        self.condition = self.data = self.p = None

In [134]:
class Treatment:
    """One arm of the simulated experiment.

    Parameters
    ----------
    condition : label written onto every user this arm treats.
    alpha, beta : shape parameters of the Beta distribution from which each
        user's base donation probability ``p`` is drawn.
    p_exit : probability that a user leaves after an impression on which
        they did not donate.
    p_donate_decay : multiplicative decay applied to ``p`` per impression;
        impression ``i`` (0-based) converts with probability
        ``p * p_donate_decay ** i``.
    max_impressions : upper bound on impressions shown to a single user.
        Defaults to 10, the value that was previously hard-coded.
    """

    def __init__(self, condition, alpha, beta, p_exit=0.3, p_donate_decay=0.9,
                 max_impressions=10):
        self.condition = condition
        self.alpha = alpha
        self.beta = beta
        self.p_exit = p_exit
        self.p_donate_decay = p_donate_decay
        self.max_impressions = max_impressions

    def treat(self, user):
        """Simulate this arm's impressions for ``user`` and return the same object.

        Sets ``user.condition``, ``user.p`` and ``user.data`` in place.
        ``user.data`` gets one entry per impression: ``0`` for no donation,
        ``1`` for a donation. The simulation stops at the first donation, so
        ``data`` contains at most one ``1`` and it is always the last entry.
        """
        user.condition = self.condition
        user.p = beta_dist.rvs(self.alpha, self.beta)
        user.data = []

        for i in range(self.max_impressions):
            # Donation chance shrinks geometrically with each extra impression.
            if random() < (user.p * self.p_donate_decay ** i):
                user.data.append(1)
                return user  # a donation ends the session
            user.data.append(0)
            # The user did not donate; they may leave before the next impression.
            if random() < self.p_exit:
                return user
        return user

In [135]:
def run_experiment(A, B, n_samples):
    """Simulate one experiment with ``2 * n_samples`` users.

    Each fresh ``User`` is assigned to arm ``A`` or ``B`` by a fair coin
    flip, so the two returned lists generally differ in length.

    Returns
    -------
    (users_A, users_B) : lists of whatever ``A.treat`` / ``B.treat`` return.
    """
    users_A, users_B = [], []
    total_users = 2 * n_samples

    for _ in range(total_users):
        newcomer = User()
        assigned_to_a = random() < 0.5
        if assigned_to_a:
            users_A.append(A.treat(newcomer))
            continue
        users_B.append(B.treat(newcomer))

    return users_A, users_B

In [160]:
def validate_test(test, n_runs, A, B, n_samples, alpha=0.95):
    """Estimate how often ``test`` flags a simulated experiment as significant.

    Runs ``run_experiment(A, B, n_samples)`` ``n_runs`` times, applies
    ``test(users_A, users_B)`` to each run, and counts the results that fall
    below ``1 - alpha``.

    Note that ``alpha`` is used here as a confidence level (default 0.95),
    so the comparison threshold is ``1 - alpha``.

    Returns
    -------
    The result of ``proportion_confint(n_significant, n_runs)`` called with
    its default settings.
    """
    threshold = 1 - alpha
    p_values = np.array(
        [test(*run_experiment(A, B, n_samples)) for _ in range(n_runs)]
    )
    n_significant = (p_values < threshold).sum()
    return proportion_confint(n_significant, n_runs)

In [136]:
def significance_test_1(users_A, users_B):
    """z-test on pooled per-impression outcomes.

    Every entry of every user's ``data`` list is treated as a separate
    observation: the lists are concatenated per arm and the two flat samples
    are handed to ``ztest``. Element ``[1]`` of its result is returned.
    """
    pooled_a = [outcome for member in users_A for outcome in member.data]
    pooled_b = [outcome for member in users_B for outcome in member.data]
    return ztest(pooled_a, pooled_b)[1]

In [148]:
# Shared settings for every validate_test call in the cells below.
# NOTE(review): no RNG seed is set in the visible cells, so the recorded
# intervals will not reproduce exactly on re-run.
alpha = 0.95     # passed as validate_test's `alpha`; results are compared to 1 - alpha
n_samples = 100  # run_experiment simulates 2 * n_samples users per run
n_runs = 10000   # simulated experiments per validate_test call

In [149]:
# significance_test_1 with identical arms: both use Beta(10, 10) and
# p_exit=1.0, so every user stops after one impression and contributes
# exactly one observation.
A = Treatment(condition='A', alpha=10, beta=10, p_exit=1.0)
B = Treatment(condition='B', alpha=10, beta=10, p_exit=1.0)
validate_test(test=significance_test_1, n_runs=n_runs, A=A, B=B, n_samples=n_samples, alpha=alpha)

(0.048801211998194885, 0.05759878800180511)

In [150]:
# significance_test_1 where the arms differ only in Beta shape
# (Beta(1, 1) vs Beta(10, 10), both symmetric around 0.5). With p_exit=1.0
# each user still yields a single observation.
A = Treatment(condition='A', alpha=1, beta=1, p_exit=1.0)
B = Treatment(condition='B', alpha=10, beta=10, p_exit=1.0)
validate_test(test=significance_test_1, n_runs=n_runs, A=A, B=B, n_samples=n_samples, alpha=alpha)

(0.049185638944810886, 0.058014361055189118)

In [151]:
# significance_test_1 with identical arms and p_exit=0.3: a user can now see
# several impressions, so a single user may contribute more than one of the
# pooled observations.
A = Treatment(condition='A', alpha=10, beta=10, p_exit=0.3)
B = Treatment(condition='B', alpha=10, beta=10, p_exit=0.3)
validate_test(test=significance_test_1, n_runs=n_runs, A=A, B=B, n_samples=n_samples, alpha=alpha)

(0.079242779179822032, 0.090157220820177963)

In [153]:
# significance_test_1 with identical arms and a lower exit probability
# (p_exit=0.2), so users tend to stay for more impressions.
A = Treatment(condition='A', alpha=10, beta=10, p_exit=0.2)
B = Treatment(condition='B', alpha=10, beta=10, p_exit=0.2)
validate_test(test=significance_test_1, n_runs=n_runs, A=A, B=B, n_samples=n_samples, alpha=alpha)

(0.093730575731756666, 0.10546942426824332)

In [166]:
def significance_test_2(users_A, users_B):
    """z-test on one binary observation per user.

    Each user is reduced to ``1.0`` if the sum of their ``data`` is positive
    and ``0.0`` otherwise; the two per-user samples are handed to ``ztest``
    and element ``[1]`` of its result is returned.
    """

    def donated_flags(group):
        # One float per user: did any impression produce a donation?
        return [1.0 if sum(member.data) > 0.0 else 0.0 for member in group]

    flags_a = donated_flags(users_A)
    flags_b = donated_flags(users_B)
    return ztest(flags_a, flags_b)[1]

In [167]:
# significance_test_2 with identical arms and p_exit=1.0 (one impression per
# user).
A = Treatment(condition='A', alpha=10, beta=10, p_exit=1.0)
B = Treatment(condition='B', alpha=10, beta=10, p_exit=1.0)
validate_test(test=significance_test_2, n_runs=n_runs, A=A, B=B, n_samples=n_samples, alpha=alpha)

(0.049377878196110218, 0.058222121803889783)

In [168]:
# significance_test_2 with identical arms and p_exit=0.3: users may see
# several impressions, but the test still uses one value per user.
A = Treatment(condition='A', alpha=10, beta=10, p_exit=0.3)
B = Treatment(condition='B', alpha=10, beta=10, p_exit=0.3)
validate_test(test=significance_test_2, n_runs=n_runs, A=A, B=B, n_samples=n_samples, alpha=alpha)

(0.048224701846025037, 0.056975298153974965)

In [169]:
# significance_test_2 with identical arms and p_exit=0.2.
A = Treatment(condition='A', alpha=10, beta=10, p_exit=0.2)
B = Treatment(condition='B', alpha=10, beta=10, p_exit=0.2)
validate_test(test=significance_test_2, n_runs=n_runs, A=A, B=B, n_samples=n_samples, alpha=alpha)

(0.045632406717193867, 0.054167593282806133)

In [172]:
def significance_test_3(users_A, users_B):
    """z-test on per-user totals.

    Each user contributes ``sum(user.data)``; the two per-user samples are
    handed to ``ztest`` and element ``[1]`` of its result is returned.
    """
    totals_a = [sum(member.data) for member in users_A]
    totals_b = [sum(member.data) for member in users_B]
    return ztest(totals_a, totals_b)[1]

In [173]:
# significance_test_3 with identical arms and p_exit=0.2.
A = Treatment(condition='A', alpha=10, beta=10, p_exit=0.2)
B = Treatment(condition='B', alpha=10, beta=10, p_exit=0.2)
validate_test(test=significance_test_3, n_runs=n_runs, A=A, B=B, n_samples=n_samples, alpha=alpha)

(0.048609024485726904, 0.057390975514273093)