In [30]:
# IPython extension to reload modules before executing user code.
# useful to see immediate results in notebook when modifying imported scripts
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [31]:
import sys
import os
import numpy as np
import pandas as pd

# Make the sibling `py` directory importable: resolve <cwd>/../py to an absolute
# path and register it on sys.path once, so its modules import like local ones.
py_dir_relative = os.path.join(os.getcwd(), os.pardir, 'py')
module_path = os.path.abspath(py_dir_relative)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

# Echo the resolved directory as a quick sanity check
print(f"Added to path: {module_path}")

Added to path: /Users/eyal/projects/elzurdo/precision-goal/py


In [None]:
import os
import sys

import numpy as np
import pytest
from scipy.stats import beta

from utils_experiments import (
    stop_decision_multiple_experiments_multiple_methods,
    BinomialSimulation,
    BinomialHypothesis,
    BinaryAccounting,
    create_decision_correctness_df
    )

from utils_stats import continuous_hdi_ci_limits


# HDI Calculations

## Binomial

## Continuous

In [None]:
# Example: mean=100, std=15, n=30
# The three values are passed positionally - presumably in (mean, std, n) order;
# confirm against utils_stats. The call returns a pair that is unpacked as the
# lower and upper interval limits.
hdi_min, hdi_max = continuous_hdi_ci_limits(100, 15, 30)
print(f"95% HDI: [{hdi_min:.2f}, {hdi_max:.2f}]")
# The width is simply upper limit minus lower limit
print(f"HDI width: {hdi_max - hdi_min:.2f}")

95% HDI: [94.40, 105.60]
HDI width: 11.20


In [None]:
# Beta density evaluated over a grid of candidate success rates.
# The original one-liner used `pp`, `successes` and `failures` without defining
# them, so the cell raised NameError on a fresh kernel; illustrative inputs are
# defined here so the cell runs on its own.
# NOTE(review): the counts are passed directly as the Beta shape parameters;
# confirm whether a prior offset (e.g. +1 for a uniform prior) is intended.
successes, failures = 80, 46      # example counts, for illustration only
pp = np.linspace(0.0, 1.0, 501)   # grid of success-rate values in [0, 1]
pdf = beta.pdf(pp, successes, failures)

Visualisation idea:
* data distribution
* Parameter values

# `stop_decision_multiple_experiments_multiple_methods`

## Hand Picked

In [8]:
# Sizes for the hand-picked case: one experiment made of a single fixed bit sequence
n_experiments = 1
n_samples = 810  # this is long enough to see the differences between the three methods.

sequence_ = "101101000110010000101111111110010101101110001111110010100110111111110111001111001110011110001010001011110101111110001111111111100000101001001100000001101000100010000000010010111001110100111000010010110011010000101011110011111111011100101011011100100101010011110101001111011100101110010011001010010001001011010101010100111100110011011011101110010100010110011001100101111001111101110101010001101110111100010110101010101010111100001000111011001010101100100110010001101101111100111000010011001000001010110010101101000001100101000110101110010101101000100110100100100110110100101011100001101000111111001001111100100011100011000101001010101110010000110111101111011100111011010010001001001111011100100000100011100000010010111111011110101000110110010001100101011110000001001101111100000001010011001001110001010100000101111100101110011011010111001000011110010011111110011111111100111011010000101110110001100111001000010011101100111000110010100000001101110000110011100111011100101001101010011001010100011000000011001100101100101000001101100111000000101010000110100100111110101101110010000100011101011011001110011100111011101010100101100001101100010111010010101000011000100111111010010111001100001001000110111011001011100100001001011111010011111101111001010000110011010101111001011110100001000100000010000011001110100110100100101000001100110111011011111010100111101111101010001010110010001000110111000101000010001011000100001101111011000000111010011000101001011110111101111010011101010111001111010101111011000110"
sequence_ = sequence_[:n_samples]  # Truncate to n_samples

# Slicing silently returns fewer characters when the literal is shorter than
# n_samples, and stray characters would only surface later as a parse or shape
# error; fail here with a clear message instead.
assert len(sequence_) == n_samples, (
    f"sequence has {len(sequence_)} samples, expected {n_samples}"
)
assert set(sequence_) <= {"0", "1"}, "sequence must contain only '0' and '1'"

len(sequence_)

810

In [9]:
# Integer array with one row per experiment; the single row is filled with the
# hand-picked sequence, one character ('0' or '1') converted to int per sample.
samples = np.zeros(shape=(n_experiments, n_samples), dtype=int)
samples[0, :] = list(map(int, sequence_))

In [10]:
# ROPE bounds, precision goal and minimum iteration (values as used in paper)
rope_min, rope_max = 0.45, 0.55
precision_goal = 0.08
min_iter = 30

# Apply the stopping rules to the hand-picked samples.
# binary_accounting is an optional argument and is left out of this basic test.
(method_stats,
 method_roperesult_iteration) = stop_decision_multiple_experiments_multiple_methods(
    samples,
    rope_min=rope_min,
    rope_max=rope_max,
    precision_goal=precision_goal,
    min_iter=min_iter,
    viz=False,
)

In [11]:
# Top-level keys of the returned stats dict (one entry per stopping method)
method_stats.keys()

dict_keys(['pitg', 'epitg', 'hdi_rope'])

In [12]:
# Check if keys exist
assert 0 in method_stats['pitg'], "Experiment 0 missing from PitG stats"
assert 0 in method_stats['epitg'], "Experiment 0 missing from ePitG stats"
assert 0 in method_stats['hdi_rope'], "Experiment 0 missing from HDI+ROPE stats"

In [13]:
# Result record of experiment 0 for each of the three methods
pitg_res, epitg_res, hdi_rope_res = (
    method_stats[method_name][0] for method_name in ('pitg', 'epitg', 'hdi_rope')
)

In [14]:
# Consistency checks
assert pitg_res['precision_goal_achieved'] == True, "PitG should have achieved precision goal"
assert pitg_res['hdi_max'] - pitg_res['hdi_min'] < precision_goal, "Calculated precision should be < goal"

assert epitg_res['decision_iteration'] >= pitg_res['decision_iteration'], "ePitG should not stop before PitG"

In [15]:
# Asserting known values
assert hdi_rope_res["decision_iteration"] == 126
assert hdi_rope_res["reject_above"] == True

assert pitg_res["decision_iteration"] == 598
assert pitg_res["inconclusive"] == True

assert epitg_res["decision_iteration"] == 804
assert epitg_res["accept"] == True

In [16]:
# HDI+ROPE result records, keyed by experiment index
method_stats['hdi_rope']

{0: {'decision_iteration': 126,
  'accept': False,
  'reject_below': False,
  'reject_above': True,
  'conclusive': True,
  'inconclusive': False,
  'successes': 80,
  'failures': 46,
  'hdi_min': 0.5508244626218101,
  'hdi_max': 0.717876210358378,
  'precision_goal_achieved': False}}

In [17]:
# PitG result records, keyed by experiment index
method_stats['pitg']

{0: {'decision_iteration': 598,
  'accept': False,
  'reject_below': False,
  'reject_above': False,
  'conclusive': False,
  'inconclusive': True,
  'successes': 314,
  'failures': 284,
  'hdi_min': 0.4850822706487851,
  'hdi_max': 0.5650347292260883,
  'precision_goal_achieved': True}}

In [18]:
# ePitG result records, keyed by experiment index
method_stats['epitg']

{0: {'decision_iteration': 804,
  'accept': True,
  'reject_below': False,
  'reject_above': False,
  'conclusive': True,
  'inconclusive': False,
  'successes': 414,
  'failures': 390,
  'hdi_min': 0.4803983603190735,
  'hdi_max': 0.5494290170209941,
  'precision_goal_achieved': True}}

## Multiple Sequences

In [19]:
# Settings for a small, seeded batch of simulated experiments
true_rate = 0.57
N_SAMPLES = 1500
N_EXPERIMENTS = 3
SEED = 42

# NOTE(review): the name `synth_0pt5` suggests a 0.5 success rate, but the data
# is generated with true_rate = 0.57. The name is kept because a later cell
# reads `synth_0pt5.experiments`.
synth_0pt5 = BinomialSimulation(
    success_rate=true_rate,
    n_samples=N_SAMPLES,
    n_experiments=N_EXPERIMENTS,
    seed=SEED,
)

Generating synthetic data with parameter values:
0.57: true success rate
3: experiments
1500: sample size per experiment


In [22]:
%%time

# Same stopping-rule evaluation as for the hand-picked sequence, now on the
# simulated experiments. rope_min, rope_max, precision_goal and min_iter are
# reused from the earlier parameter cell, which must have been run first.
# This rebinds method_stats / method_roperesult_iteration to the new results.
method_stats, method_roperesult_iteration = stop_decision_multiple_experiments_multiple_methods(
    synth_0pt5.experiments, 
    rope_min=rope_min, 
    rope_max=rope_max, 
    precision_goal=precision_goal,
    min_iter=min_iter,
    viz=False
)

CPU times: user 20.9 s, sys: 63.9 ms, total: 20.9 s
Wall time: 21 s


In [23]:
# ePitG result record for the first simulated experiment (index 0)
method_stats['epitg'][0]

{'decision_iteration': 1500,
 'accept': False,
 'reject_below': False,
 'reject_above': False,
 'conclusive': False,
 'inconclusive': True,
 'successes': 840,
 'failures': 660,
 'hdi_min': 0.534867547044455,
 'hdi_max': 0.5850835156227976,
 'precision_goal_achieved': True}

In [24]:
# Build the per-experiment decision table from the method results, the true
# rate and the ROPE bounds. The rendered columns include a
# `<method>_decision_correct` flag per method.
df_experiment_correctness = create_decision_correctness_df(method_stats, true_rate, rope_min, rope_max)

# Preview the first rows (at most 4)
df_experiment_correctness.head(4)

Unnamed: 0_level_0,hdi_rope_decision_iteration,hdi_rope_accept,hdi_rope_reject_below,hdi_rope_reject_above,hdi_rope_inconclusive,hdi_rope_success_rate,hdi_rope_decision_correct,pitg_decision_iteration,pitg_accept,pitg_reject_below,...,pitg_inconclusive,pitg_success_rate,pitg_decision_correct,epitg_decision_iteration,epitg_accept,epitg_reject_below,epitg_reject_above,epitg_inconclusive,epitg_success_rate,epitg_decision_correct
experiment_idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1500,False,False,False,True,0.56,True,593,False,False,...,True,0.55312,True,1500,False,False,False,True,0.56,True
1,1500,False,False,False,True,0.568667,True,584,False,False,...,True,0.578767,True,1500,False,False,False,True,0.568667,True
2,1500,False,False,False,True,0.572667,True,592,False,False,...,True,0.554054,True,1500,False,False,False,True,0.572667,True


In [25]:
# Per-experiment correctness flag for each of the three methods
df_experiment_correctness[['hdi_rope_decision_correct', 'pitg_decision_correct', 'epitg_decision_correct']]

Unnamed: 0_level_0,hdi_rope_decision_correct,pitg_decision_correct,epitg_decision_correct
experiment_idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,True,True,True
1,True,True,True
2,True,True,True


# Abstract Methods

In [26]:
# Hypothesis settings: null success rate, rate offset, and ROPE/precision fraction.
# With these values the constructor's printed summary reports ROPE [0.45, 0.55]
# and a precision goal of 0.08.
null_rate = 0.5
dsuccess_rate = 0.05
rope_precision_fraction = 0.8

hypothesis_ = BinomialHypothesis(
    success_rate_null=null_rate,
    dsuccess_rate=dsuccess_rate,
    rope_precision_fraction=rope_precision_fraction,
)

0.5: null hypothesis
0.45: ROPE min
0.55: ROPE max
--------------------
0.08: Precision Goal


In [27]:
# Larger seeded batch of simulated experiments for the class-based workflow
true_rate = 0.57
N_SAMPLES = 1500
N_EXPERIMENTS = 100
SEED = 42

synth_data = BinomialSimulation(
    success_rate=true_rate,
    n_samples=N_SAMPLES,
    n_experiments=N_EXPERIMENTS,
    seed=SEED,
)

Generating synthetic data with parameter values:
0.57: true success rate
100: experiments
1500: sample size per experiment


In [28]:
# Accounting object, constructed with defaults; it is passed to
# run_hypothesis_on_experiments in the next cell.
binary_accounting = BinaryAccounting()
binary_accounting

<utils_experiments.BinaryAccounting at 0x10eb229d0>

In [29]:
%%time
# Evaluate the hypothesis on every simulated experiment; the displayed result is
# a per-method summary table. Slow: the recorded run took about 3.5 minutes of
# wall time.
hypothesis_.run_hypothesis_on_experiments(synth_data.experiments, binary_accounting)

Unnamed: 0,accept,reject,conclusive,inconclusive,stop_iter_mean,stop_iter_std,success_rate_mean,success_rate_std
hdi_rope,0.0,0.58,0.58,0.42,819.72,642.815525,0.606467,0.057314
pitg,0.0,0.2,0.2,0.8,586.28,7.334821,0.570501,0.021422
epitg,0.0,0.47,0.47,0.53,1138.68,415.215993,0.57484,0.018304


CPU times: user 3min 38s, sys: 529 ms, total: 3min 38s
Wall time: 3min 40s


In [96]:
# Score the decisions against the true rate, then display the per-experiment
# table. Presumably decision_correctness() populates df_experiment_correctness;
# confirm in utils_experiments.
hypothesis_.decision_correctness(true_rate)
hypothesis_.df_experiment_correctness

Unnamed: 0_level_0,hdi_rope_decision_iteration,hdi_rope_accept,hdi_rope_reject_below,hdi_rope_reject_above,hdi_rope_inconclusive,hdi_rope_success_rate,hdi_rope_decision_correct,pitg_decision_iteration,pitg_accept,pitg_reject_below,...,pitg_inconclusive,pitg_success_rate,pitg_decision_correct,epitg_decision_iteration,epitg_accept,epitg_reject_below,epitg_reject_above,epitg_inconclusive,epitg_success_rate,epitg_decision_correct
experiment_idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1500,False,False,False,True,0.56,True,593,False,False,...,True,0.55312,True,1500,False,False,False,True,0.56,True
1,1500,False,False,False,True,0.568667,True,584,False,False,...,True,0.578767,True,1500,False,False,False,True,0.568667,True
2,1500,False,False,False,True,0.572667,True,592,False,False,...,True,0.554054,True,1500,False,False,False,True,0.572667,True
3,834,False,False,True,False,0.583933,True,584,False,False,...,True,0.578767,True,834,False,False,True,False,0.583933,True
4,165,False,False,True,False,0.624242,True,575,False,False,...,False,0.601739,True,575,False,False,True,False,0.601739,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,1500,False,False,False,True,0.540667,False,594,False,False,...,True,0.548822,False,1500,False,False,False,True,0.540667,False
96,40,False,False,True,False,0.7,True,596,False,False,...,True,0.538591,False,1500,False,False,False,True,0.557333,True
97,30,False,False,True,False,0.733333,True,587,False,False,...,True,0.572402,True,1500,False,False,False,True,0.566,True
98,1394,False,False,True,False,0.57604,True,587,False,False,...,True,0.570698,True,1394,False,False,True,False,0.57604,True


In [97]:
# Per-experiment correctness flag for each of the three methods
hypothesis_.df_experiment_correctness[['hdi_rope_decision_correct', 'pitg_decision_correct', 'epitg_decision_correct']]

Unnamed: 0_level_0,hdi_rope_decision_correct,pitg_decision_correct,epitg_decision_correct
experiment_idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,True,True,True
1,True,True,True
2,True,True,True
3,True,True,True
4,True,True,True
...,...,...,...
95,False,False,False
96,True,False,True
97,True,True,True
98,True,True,True


In [104]:
# Share of experiments each method flagged inconclusive (mean of the flags cast to int)
hypothesis_.df_experiment_correctness[['hdi_rope_inconclusive', 'pitg_inconclusive', 'epitg_inconclusive']].astype(int).mean()

hdi_rope_inconclusive    0.42
pitg_inconclusive        0.80
epitg_inconclusive       0.53
dtype: float64

In [99]:
# Share of experiments in which each method's decision was flagged correct
hypothesis_.df_experiment_correctness[['hdi_rope_decision_correct', 'pitg_decision_correct', 'epitg_decision_correct']].astype(int).mean()

hdi_rope_decision_correct    0.95
pitg_decision_correct        0.79
epitg_decision_correct       0.93
dtype: float64

In [107]:
# Keep only the experiments where PitG was flagged inconclusive
df_correctness_pitg_inc = hypothesis_.df_experiment_correctness.query("pitg_inconclusive == True")

# Inconclusive rate of each method within that subset (PitG is 1.0 by construction)
df_correctness_pitg_inc[['hdi_rope_inconclusive', 'pitg_inconclusive', 'epitg_inconclusive']].astype(int).mean()

hdi_rope_inconclusive    0.5250
pitg_inconclusive        1.0000
epitg_inconclusive       0.6625
dtype: float64

In [108]:
# Decision-correctness rate of each method within the PitG-inconclusive subset
df_correctness_pitg_inc[['hdi_rope_decision_correct', 'pitg_decision_correct', 'epitg_decision_correct']].astype(int).mean()

hdi_rope_decision_correct    0.9375
pitg_decision_correct        0.7375
epitg_decision_correct       0.9125
dtype: float64

In [4]:
# Scratch check: coerce a value to bool while letting None pass through unchanged
aa = None
aa if aa is None else bool(aa)

In [109]:
# Dump the whole nested results dict (method -> experiment index -> stats).
# NOTE(review): this shows whatever the most recently executed
# stop_decision_multiple_experiments_multiple_methods cell bound to
# `method_stats`, and the output is large.
method_stats

{'pitg': {0: {'decision_iteration': 593,
   'accept': False,
   'reject_below': False,
   'reject_above': False,
   'conclusive': False,
   'inconclusive': True,
   'successes': 328,
   'failures': 265,
   'hdi_min': 0.5131058955630885,
   'hdi_max': 0.5930386826397187,
   'precision_goal_achieved': True},
  1: {'decision_iteration': 584,
   'accept': False,
   'reject_below': False,
   'reject_above': False,
   'conclusive': False,
   'inconclusive': True,
   'successes': 338,
   'failures': 246,
   'hdi_min': 0.538693608192975,
   'hdi_max': 0.61868303981102,
   'precision_goal_achieved': True},
  2: {'decision_iteration': 592,
   'accept': False,
   'reject_below': False,
   'reject_above': False,
   'conclusive': False,
   'inconclusive': True,
   'successes': 328,
   'failures': 264,
   'hdi_min': 0.5139978978511637,
   'hdi_max': 0.5939817426861769,
   'precision_goal_achieved': True}},
 'epitg': {0: {'decision_iteration': 1500,
   'accept': False,
   'reject_below': False,
   'r