<a href="https://colab.research.google.com/github/chloeoconnell1/langchain-tutorials/blob/main/Test_set_power_calc.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
from sklearn.metrics import f1_score
from scipy.stats import ttest_rel

# Constants
positive_rate = 0.015      # 1.5% of test set is positive
n_trials = 1000
alpha = 0.05

def generate_predictions(y_true, precision, recall):
    """Simulate predictions given precision and recall on a fixed y_true."""
    y_pred = []

    for y in y_true:
        if y == 1:
            y_pred.append(1 if np.random.rand() < recall else 0)
        else:
            # Derive false positive rate from precision
            # precision = TP / (TP + FP) => FP = TP * (1 - prec) / prec
            # To get a false positive rate we can use:
            fp_rate = (1 - precision) / precision * positive_rate / (1 - positive_rate)
            y_pred.append(1 if np.random.rand() < fp_rate else 0)

    return np.array(y_pred)

def simulate_power(n, p1, r1, p2, r2):
    f1_a = []
    f1_b = []

    for _ in range(n_trials):
        # Shared true labels
        y_true = np.random.choice([1, 0], size=n, p=[positive_rate, 1 - positive_rate])
        y_pred_a = generate_predictions(y_true, p1, r1)
        y_pred_b = generate_predictions(y_true, p2, r2)

        f1_a.append(f1_score(y_true, y_pred_a, zero_division=0))
        f1_b.append(f1_score(y_true, y_pred_b, zero_division=0))

    # Paired t-test (more appropriate since each model sees the same data)
    _, p_val = ttest_rel(f1_a, f1_b)
    power = np.mean(np.array(f1_b) > np.array(f1_a)) * (p_val < alpha)
    mean_diff = np.mean(np.array(f1_b) - np.array(f1_a))
    return power, mean_diff

# Try different test set sizes
for n in [50, 100, 150, 200, 300, 400, 500, 550]:
    power, diff = simulate_power(
        n=n,
        p1=0.3, r1=0.3,    # Model A
        p2=0.45, r2=0.45   # Model B
    )
    print(f"Test size {n}: Power = {power:.3f}, Mean F1 diff = {diff:.3f}")

    # 0.015 prevalence is likely reasonable for all risk factors except:
      # Placenta accreta
      # Chorio - however this is likely underreported by nursing risk assessments
      # Low platelets


Test size 50: Power = 0.262, Mean F1 diff = 0.106
Test size 100: Power = 0.421, Mean F1 diff = 0.120
Test size 150: Power = 0.550, Mean F1 diff = 0.143
Test size 200: Power = 0.658, Mean F1 diff = 0.160
Test size 300: Power = 0.738, Mean F1 diff = 0.158
Test size 400: Power = 0.805, Mean F1 diff = 0.158
Test size 500: Power = 0.842, Mean F1 diff = 0.164
Test size 550: Power = 0.848, Mean F1 diff = 0.162
