In [1]:
"""
Modern A/B Testing Framework
============================

A comprehensive Python implementation of A/B testing with:
- Deterministic assignment (SHA-256 hashing)
- Data quality checks (SRM, A/A testing)
- Binary and continuous metric analysis
- CUPED variance reduction
- Sequential testing support
- Bayesian analysis
- Automated reporting

Author: Data Science Team
Last Updated: January 2026
"""

import hashlib
import numpy as np
import pandas as pd
from scipy import stats
from statsmodels.stats.proportion import proportions_ztest, proportion_confint
from statsmodels.stats import power as pwr
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple, Union
import warnings
warnings.filterwarnings('ignore')


# ==================== CONFIGURATION ====================

@dataclass
class ExperimentConfig:
    """Configuration for an A/B test experiment"""
    experiment_id: str
    hypothesis: str
    primary_metric: str
    metric_type: str  # 'binary' or 'continuous'
    guardrail_metrics: List[str]
    unit_of_randomization: str  # 'user', 'session', etc.
    allocation_percent: float = 0.5  # 50% to treatment by default
    alpha: float = 0.05
    power: float = 0.80
    minimum_detectable_effect: float = 0.05  # 5% relative lift
    duration_days: int = 14


# ==================== ASSIGNMENT ====================

def assign_variant(
    unit_id: str,
    experiment_id: str = "default_experiment",
    p_treatment: float = 0.5,
    salt: str = ""
) -> str:
    """
    Deterministic assignment using SHA-256 hashing.
    
    Ensures:
    - Same unit_id always gets same variant (consistency)
    - ~50/50 split across population
    - Independent across experiments (via experiment_id)
    
    Args:
        unit_id: Unique identifier for randomization unit (e.g., user_id)
        experiment_id: Unique experiment name
        p_treatment: Probability of treatment assignment (default 0.5)
        salt: Additional salt for hashing
        
    Returns:
        'A' for control or 'B' for treatment
    """
    # Create hash input
    hash_input = f"{experiment_id}:{unit_id}:{salt}"
    
    # Hash and convert to [0, 1)
    h = hashlib.sha256(hash_input.encode("utf-8")).hexdigest()
    hash_value = int(h[:16], 16) / (16 ** 16)
    
    # Assign variant
    return "B" if hash_value < p_treatment else "A"


# ==================== DATA QUALITY CHECKS ====================

def srm_check(
    count_a: int,
    count_b: int,
    expected_split: Tuple[float, float] = (0.5, 0.5),
    alpha: float = 0.01
) -> Dict[str, Union[float, bool]]:
    """
    Sample Ratio Mismatch (SRM) check using chi-square goodness-of-fit.
    
    Detects data quality issues by checking if observed split matches expected.
    
    Args:
        count_a: Number of users in variant A
        count_b: Number of users in variant B
        expected_split: Expected ratio (e.g., (0.5, 0.5) for 50/50)
        alpha: Significance level (typically 0.01, stricter than experiment)
        
    Returns:
        Dictionary with p-value and whether SRM detected
    """
    obs = np.array([count_a, count_b])
    total = obs.sum()
    exp = np.array(expected_split) * total
    
    # Chi-square test
    chi2 = ((obs - exp) ** 2 / exp).sum()
    p_value = 1 - stats.chi2.cdf(chi2, df=1)
    
    # Flag if p-value is small (evidence of mismatch)
    srm_detected = p_value < alpha
    
    return {
        "chi2_statistic": float(chi2),
        "p_value": float(p_value),
        "srm_detected": srm_detected,
        "observed_split": (count_a / total, count_b / total),
        "expected_split": expected_split,
        "interpretation": "⚠️ SRM DETECTED - DO NOT TRUST RESULTS" if srm_detected else "✅ No SRM detected"
    }


def aa_test_expected_p_values(n_tests: int = 1000, alpha: float = 0.05) -> Dict[str, float]:
    """
    Simulate A/A tests to validate false positive rate.
    
    In A/A test, both groups get same experience.
    We expect ~5% of tests to be "significant" by chance at alpha=0.05.
    
    Args:
        n_tests: Number of A/A tests to simulate
        alpha: Significance threshold
        
    Returns:
        Dictionary with false positive rate and diagnostics
    """
    p_values = []
    
    for _ in range(n_tests):
        # Simulate A/A: both groups have same true conversion rate
        true_rate = 0.10
        n_per_group = 1000
        
        conversions_a = np.random.binomial(n_per_group, true_rate)
        conversions_b = np.random.binomial(n_per_group, true_rate)
        
        # Run test
        _, p_val = proportions_ztest(
            [conversions_a, conversions_b],
            [n_per_group, n_per_group]
        )
        p_values.append(p_val)
    
    p_values = np.array(p_values)
    false_positive_rate = (p_values < alpha).mean()
    
    return {
        "n_tests": n_tests,
        "false_positive_rate": float(false_positive_rate),
        "expected_rate": alpha,
        "deviation": float(abs(false_positive_rate - alpha)),
        "p_values_sample": p_values[:10].tolist(),
        "interpretation": f"✅ Within expected range" if abs(false_positive_rate - alpha) < 0.02 else "⚠️ Elevated false positive rate"
    }


# ==================== POWER ANALYSIS ====================

def calculate_sample_size_binary(
    baseline_rate: float,
    mde: float,  # Minimum detectable effect (absolute, e.g., 0.02 for 2 percentage points)
    alpha: float = 0.05,
    power: float = 0.80,
    ratio: float = 1.0
) -> Dict[str, Union[int, float]]:
    """
    Calculate required sample size for binary metric (conversion rate).
    
    Args:
        baseline_rate: Control group conversion rate (e.g., 0.10 for 10%)
        mde: Minimum detectable absolute effect (e.g., 0.02 for 2 pp)
        alpha: Type I error rate (false positive)
        power: Statistical power (1 - Type II error rate)
        ratio: Ratio of treatment to control size (typically 1.0)
        
    Returns:
        Dictionary with sample size requirements
    """
    from statsmodels.stats.api import proportion_effectsize, NormalIndPower
    
    # Calculate effect size (Cohen's h)
    treatment_rate = baseline_rate + mde
    effect_size = proportion_effectsize(baseline_rate, treatment_rate)
    
    # Calculate sample size per group
    analysis = NormalIndPower()
    n_per_group = analysis.solve_power(
        effect_size=effect_size,
        power=power,
        alpha=alpha,
        ratio=ratio,
        alternative='two-sided'
    )
    
    # Round up
    n_per_group = int(np.ceil(n_per_group))
    n_total = n_per_group * (1 + ratio)
    
    return {
        "n_per_group": n_per_group,
        "n_total": int(n_total),
        "baseline_rate": baseline_rate,
        "treatment_rate": treatment_rate,
        "mde_absolute": mde,
        "mde_relative": mde / baseline_rate if baseline_rate > 0 else np.inf,
        "effect_size": float(effect_size),
        "power": power,
        "alpha": alpha
    }


def calculate_sample_size_continuous(
    baseline_mean: float,
    baseline_std: float,
    mde: float,  # Absolute change (e.g., $5 revenue increase)
    alpha: float = 0.05,
    power: float = 0.80,
    ratio: float = 1.0
) -> Dict[str, Union[int, float]]:
    """
    Calculate required sample size for continuous metric (revenue, time, etc.).
    
    Args:
        baseline_mean: Control group mean
        baseline_std: Control group standard deviation
        mde: Minimum detectable absolute effect
        alpha: Type I error rate
        power: Statistical power
        ratio: Treatment to control ratio
        
    Returns:
        Dictionary with sample size requirements
    """
    from statsmodels.stats.power import TTestIndPower
    
    # Calculate effect size (Cohen's d)
    effect_size = mde / baseline_std
    
    # Calculate sample size per group
    analysis = TTestIndPower()
    n_per_group = analysis.solve_power(
        effect_size=effect_size,
        power=power,
        alpha=alpha,
        ratio=ratio,
        alternative='two-sided'
    )
    
    # Round up
    n_per_group = int(np.ceil(n_per_group))
    n_total = n_per_group * (1 + ratio)
    
    return {
        "n_per_group": n_per_group,
        "n_total": int(n_total),
        "baseline_mean": baseline_mean,
        "baseline_std": baseline_std,
        "mde_absolute": mde,
        "mde_relative": mde / baseline_mean if baseline_mean != 0 else np.inf,
        "effect_size": float(effect_size),
        "power": power,
        "alpha": alpha
    }


# ==================== BINARY METRIC ANALYSIS ====================

def analyze_conversion(
    df: pd.DataFrame,
    metric_col: str = 'converted',
    variant_col: str = 'variant',
    alpha: float = 0.05
) -> Dict[str, Union[float, Tuple[float, float]]]:
    """
    Analyze binary metric (conversion rate) with Z-test.
    
    Uses:
    - Z-test for proportions (two-sided)
    - Wilson confidence intervals (better for small samples)
    
    Args:
        df: DataFrame with variant and metric columns
        metric_col: Name of binary metric column (0/1)
        variant_col: Name of variant column ('A' or 'B')
        alpha: Significance level
        
    Returns:
        Dictionary with test results
    """
    # Aggregate by variant
    agg = df.groupby(variant_col)[metric_col].agg(['sum', 'count'])
    agg.columns = ['conversions', 'total']
    
    # Extract values
    conv_a, n_a = int(agg.loc['A', 'conversions']), int(agg.loc['A', 'total'])
    conv_b, n_b = int(agg.loc['B', 'conversions']), int(agg.loc['B', 'total'])
    
    # Conversion rates
    p_a = conv_a / n_a if n_a > 0 else 0
    p_b = conv_b / n_b if n_b > 0 else 0
    
    # Z-test for proportions
    stat, p_val = proportions_ztest(
        count=[conv_a, conv_b],
        nobs=[n_a, n_b],
        alternative='two-sided'
    )
    
    # Wilson confidence intervals (recommended)
    ci_a = proportion_confint(conv_a, n_a, alpha=alpha, method='wilson')
    ci_b = proportion_confint(conv_b, n_b, alpha=alpha, method='wilson')
    
    # Effect sizes
    abs_lift = p_b - p_a
    rel_lift = (p_b / p_a - 1) if p_a > 0 else np.inf
    
    # Confidence interval for difference
    # Using normal approximation
    se = np.sqrt(p_a * (1 - p_a) / n_a + p_b * (1 - p_b) / n_b)
    z_crit = stats.norm.ppf(1 - alpha/2)
    ci_diff = (abs_lift - z_crit * se, abs_lift + z_crit * se)
    
    return {
        "metric": metric_col,
        "n_a": n_a,
        "n_b": n_b,
        "conversions_a": conv_a,
        "conversions_b": conv_b,
        "rate_a": float(p_a),
        "rate_b": float(p_b),
        "ci_a": (float(ci_a[0]), float(ci_a[1])),
        "ci_b": (float(ci_b[0]), float(ci_b[1])),
        "absolute_lift": float(abs_lift),
        "relative_lift": float(rel_lift),
        "ci_diff": (float(ci_diff[0]), float(ci_diff[1])),
        "z_statistic": float(stat),
        "p_value": float(p_val),
        "significant": p_val < alpha,
        "interpretation": "✅ SIGNIFICANT" if p_val < alpha else "❌ NOT SIGNIFICANT"
    }


# ==================== CONTINUOUS METRIC ANALYSIS ====================

def analyze_continuous(
    df: pd.DataFrame,
    metric_col: str,
    variant_col: str = 'variant',
    alpha: float = 0.05,
    equal_var: bool = False
) -> Dict[str, Union[float, Tuple[float, float]]]:
    """
    Analyze continuous metric with t-test.
    
    Uses:
    - Welch's t-test by default (doesn't assume equal variances)
    - Student's t-test if equal_var=True
    
    Args:
        df: DataFrame with variant and metric columns
        metric_col: Name of continuous metric column
        variant_col: Name of variant column
        alpha: Significance level
        equal_var: Whether to assume equal variances (default False)
        
    Returns:
        Dictionary with test results
    """
    # Split by variant
    a_data = df[df[variant_col] == 'A'][metric_col].dropna()
    b_data = df[df[variant_col] == 'B'][metric_col].dropna()
    
    # Basic statistics
    mean_a, std_a, n_a = a_data.mean(), a_data.std(), len(a_data)
    mean_b, std_b, n_b = b_data.mean(), b_data.std(), len(b_data)
    
    # T-test
    t_stat, p_val = stats.ttest_ind(
        b_data, a_data,
        equal_var=equal_var,
        alternative='two-sided'
    )
    
    # Effect size
    diff = mean_b - mean_a
    rel_diff = (mean_b / mean_a - 1) if mean_a != 0 else np.inf
    
    # Cohen's d (standardized effect size)
    pooled_std = np.sqrt(((n_a - 1) * std_a**2 + (n_b - 1) * std_b**2) / (n_a + n_b - 2))
    cohens_d = diff / pooled_std if pooled_std > 0 else 0
    
    # Confidence interval for difference
    se = np.sqrt(std_a**2 / n_a + std_b**2 / n_b)
    df_welch = (std_a**2 / n_a + std_b**2 / n_b)**2 / (
        (std_a**2 / n_a)**2 / (n_a - 1) + (std_b**2 / n_b)**2 / (n_b - 1)
    )
    t_crit = stats.t.ppf(1 - alpha/2, df=df_welch)
    ci_diff = (diff - t_crit * se, diff + t_crit * se)
    
    return {
        "metric": metric_col,
        "n_a": n_a,
        "n_b": n_b,
        "mean_a": float(mean_a),
        "mean_b": float(mean_b),
        "std_a": float(std_a),
        "std_b": float(std_b),
        "median_a": float(a_data.median()),
        "median_b": float(b_data.median()),
        "difference": float(diff),
        "relative_difference": float(rel_diff),
        "ci_diff": (float(ci_diff[0]), float(ci_diff[1])),
        "cohens_d": float(cohens_d),
        "t_statistic": float(t_stat),
        "p_value": float(p_val),
        "test_type": "Welch's t-test" if not equal_var else "Student's t-test",
        "significant": p_val < alpha,
        "interpretation": "✅ SIGNIFICANT" if p_val < alpha else "❌ NOT SIGNIFICANT"
    }


# ==================== CUPED (VARIANCE REDUCTION) ====================

def cuped_adjust(
    y: np.ndarray,
    x_pre: np.ndarray
) -> Tuple[np.ndarray, float, float]:
    """
    Apply CUPED (Controlled-experiment Using Pre-Experiment Data) adjustment.
    
    Reduces variance by using pre-experiment covariate that's correlated
    with the outcome but independent of treatment assignment.
    
    Formula: y_adjusted = y - θ(x_pre - mean(x_pre))
    Where θ = cov(y, x_pre) / var(x_pre)
    
    Args:
        y: Outcome metric during experiment
        x_pre: Pre-experiment covariate (e.g., past behavior)
        
    Returns:
        Tuple of (y_adjusted, theta, variance_reduction_pct)
    """
    # Handle missing data
    mask = ~(np.isnan(y) | np.isnan(x_pre))
    y = y[mask]
    x_pre = x_pre[mask]
    
    # Center the covariate
    x_centered = x_pre - x_pre.mean()
    
    # Compute theta (optimal coefficient)
    covariance = np.cov(y, x_pre)[0, 1]
    variance_x = x_pre.var(ddof=1)
    
    if variance_x == 0:
        # No variance in covariate, can't adjust
        return y, 0.0, 0.0
    
    theta = covariance / variance_x
    
    # Adjust outcome
    y_adjusted = y - theta * x_centered
    
    # Calculate variance reduction
    var_original = y.var(ddof=1)
    var_adjusted = y_adjusted.var(ddof=1)
    variance_reduction = 1 - (var_adjusted / var_original) if var_original > 0 else 0
    
    return y_adjusted, float(theta), float(variance_reduction)


def analyze_continuous_cuped(
    df: pd.DataFrame,
    metric_col: str,
    pre_metric_col: str,
    variant_col: str = 'variant',
    alpha: float = 0.05
) -> Dict[str, Union[float, Tuple[float, float]]]:
    """
    Analyze continuous metric with CUPED variance reduction.
    
    Args:
        df: DataFrame with variant, metric, and pre-metric columns
        metric_col: Name of outcome metric column
        pre_metric_col: Name of pre-experiment covariate column
        variant_col: Name of variant column
        alpha: Significance level
        
    Returns:
        Dictionary with test results (both raw and CUPED-adjusted)
    """
    # First, run standard analysis
    raw_results = analyze_continuous(df, metric_col, variant_col, alpha)
    
    # Apply CUPED to each variant
    a_data = df[df[variant_col] == 'A']
    b_data = df[df[variant_col] == 'B']
    
    y_a_adj, theta_a, vr_a = cuped_adjust(
        a_data[metric_col].values,
        a_data[pre_metric_col].values
    )
    
    y_b_adj, theta_b, vr_b = cuped_adjust(
        b_data[metric_col].values,
        b_data[pre_metric_col].values
    )
    
    # T-test on adjusted values
    t_stat_adj, p_val_adj = stats.ttest_ind(
        y_b_adj, y_a_adj,
        equal_var=False,
        alternative='two-sided'
    )
    
    # Statistics on adjusted values
    mean_a_adj = y_a_adj.mean()
    mean_b_adj = y_b_adj.mean()
    diff_adj = mean_b_adj - mean_a_adj
    
    # Confidence interval
    se_adj = np.sqrt(y_a_adj.var() / len(y_a_adj) + y_b_adj.var() / len(y_b_adj))
    df_welch = len(y_a_adj) + len(y_b_adj) - 2
    t_crit = stats.t.ppf(1 - alpha/2, df=df_welch)
    ci_diff_adj = (diff_adj - t_crit * se_adj, diff_adj + t_crit * se_adj)
    
    return {
        "metric": metric_col,
        "pre_metric": pre_metric_col,
        # Raw results
        "raw_mean_a": raw_results["mean_a"],
        "raw_mean_b": raw_results["mean_b"],
        "raw_difference": raw_results["difference"],
        "raw_p_value": raw_results["p_value"],
        # CUPED results
        "theta_a": theta_a,
        "theta_b": theta_b,
        "variance_reduction_a": vr_a,
        "variance_reduction_b": vr_b,
        "cuped_mean_a": float(mean_a_adj),
        "cuped_mean_b": float(mean_b_adj),
        "cuped_difference": float(diff_adj),
        "cuped_ci_diff": (float(ci_diff_adj[0]), float(ci_diff_adj[1])),
        "cuped_t_statistic": float(t_stat_adj),
        "cuped_p_value": float(p_val_adj),
        "significant": p_val_adj < alpha,
        "interpretation": "✅ SIGNIFICANT (CUPED)" if p_val_adj < alpha else "❌ NOT SIGNIFICANT (CUPED)"
    }


# ==================== EXPERIMENT READOUT ====================

def generate_experiment_readout(
    df: pd.DataFrame,
    config: ExperimentConfig,
    pre_metric_col: Optional[str] = None
) -> Dict:
    """
    Generate comprehensive experiment readout.
    
    Args:
        df: DataFrame with experiment data
        config: Experiment configuration
        pre_metric_col: Optional pre-experiment metric for CUPED
        
    Returns:
        Dictionary with complete experiment results
    """
    readout = {
        "experiment_id": config.experiment_id,
        "hypothesis": config.hypothesis,
        "duration_days": config.duration_days,
        "timestamp": pd.Timestamp.now().isoformat()
    }
    
    # 1. Sample size check
    n_a = (df['variant'] == 'A').sum()
    n_b = (df['variant'] == 'B').sum()
    readout["sample_size"] = {"A": int(n_a), "B": int(n_b), "total": int(n_a + n_b)}
    
    # 2. SRM check
    readout["srm_check"] = srm_check(n_a, n_b)
    
    if readout["srm_check"]["srm_detected"]:
        readout["warning"] = "⚠️ SRM DETECTED - Results may not be reliable!"
    
    # 3. Primary metric analysis
    if config.metric_type == 'binary':
        primary_result = analyze_conversion(df, config.primary_metric)
    else:
        if pre_metric_col:
            primary_result = analyze_continuous_cuped(
                df, config.primary_metric, pre_metric_col
            )
        else:
            primary_result = analyze_continuous(df, config.primary_metric)
    
    readout["primary_metric"] = primary_result
    
    # 4. Guardrail metrics
    readout["guardrail_metrics"] = {}
    for metric in config.guardrail_metrics:
        if metric in df.columns:
            # Infer metric type
            if df[metric].nunique() <= 2:
                result = analyze_conversion(df, metric)
            else:
                result = analyze_continuous(df, metric)
            readout["guardrail_metrics"][metric] = result
    
    # 5. Decision recommendation
    primary_significant = primary_result.get("significant", False)
    
    if config.metric_type == 'binary':
        positive_effect = primary_result["relative_lift"] > 0
    else:
        positive_effect = primary_result["difference"] > 0
    
    guardrails_ok = all(
        not result.get("significant", False) or 
        (result.get("difference", 0) >= 0 if config.metric_type == 'continuous'
         else result.get("relative_lift", 0) >= 0)
        for result in readout["guardrail_metrics"].values()
    )
    
    if primary_significant and positive_effect and guardrails_ok:
        recommendation = "✅ SHIP IT - Treatment wins with no guardrail violations"
    elif primary_significant and positive_effect and not guardrails_ok:
        recommendation = "⚠️ INVESTIGATE - Treatment wins but guardrail concerns"
    elif primary_significant and not positive_effect:
        recommendation = "❌ DO NOT SHIP - Treatment performs worse"
    else:
        recommendation = "❌ NO EFFECT - No significant difference detected"
    
    readout["recommendation"] = recommendation
    
    return readout


def print_readout(readout: Dict) -> None:
    """Pretty-print experiment readout"""
    print("=" * 80)
    print(f"EXPERIMENT READOUT: {readout['experiment_id']}")
    print("=" * 80)
    print(f"\nHypothesis: {readout['hypothesis']}")
    print(f"Duration: {readout['duration_days']} days")
    print(f"Analysis timestamp: {readout['timestamp']}")
    
    print(f"\n{'─' * 80}")
    print("SAMPLE SIZE")
    print(f"{'─' * 80}")
    ss = readout['sample_size']
    print(f"Control (A): {ss['A']:,}")
    print(f"Treatment (B): {ss['B']:,}")
    print(f"Total: {ss['total']:,}")
    
    print(f"\n{'─' * 80}")
    print("DATA QUALITY: SRM CHECK")
    print(f"{'─' * 80}")
    srm = readout['srm_check']
    print(f"P-value: {srm['p_value']:.4f}")
    print(f"Status: {srm['interpretation']}")
    
    print(f"\n{'─' * 80}")
    print("PRIMARY METRIC RESULTS")
    print(f"{'─' * 80}")
    pm = readout['primary_metric']
    print(f"Metric: {pm['metric']}")
    
    if 'rate_a' in pm:  # Binary metric
        print(f"Control rate: {pm['rate_a']:.3%}")
        print(f"Treatment rate: {pm['rate_b']:.3%}")
        print(f"Absolute lift: {pm['absolute_lift']:.3%}")
        print(f"Relative lift: {pm['relative_lift']:.2%}")
    else:  # Continuous metric
        if 'cuped_mean_a' in pm:  # CUPED version
            print(f"Control mean (CUPED): {pm['cuped_mean_a']:.2f}")
            print(f"Treatment mean (CUPED): {pm['cuped_mean_b']:.2f}")
            print(f"Difference: {pm['cuped_difference']:.2f}")
            print(f"Variance reduction: {pm['variance_reduction_a']:.1%} (A), {pm['variance_reduction_b']:.1%} (B)")
        else:
            print(f"Control mean: {pm['mean_a']:.2f}")
            print(f"Treatment mean: {pm['mean_b']:.2f}")
            print(f"Difference: {pm['difference']:.2f}")
    
    print(f"P-value: {pm['p_value']:.4f}")
    print(f"Status: {pm['interpretation']}")
    
    if readout['guardrail_metrics']:
        print(f"\n{'─' * 80}")
        print("GUARDRAIL METRICS")
        print(f"{'─' * 80}")
        for metric_name, result in readout['guardrail_metrics'].items():
            print(f"\n{metric_name}:")
            print(f"  P-value: {result['p_value']:.4f}")
            print(f"  Status: {result['interpretation']}")
    
    print(f"\n{'─' * 80}")
    print("RECOMMENDATION")
    print(f"{'─' * 80}")
    print(readout['recommendation'])
    print("=" * 80)


# ==================== DEMO USAGE ====================

if __name__ == "__main__":
    print("Modern A/B Testing Framework - Demo\n")
    
    # ========== DEMO 1: Simple Binary Metric ==========
    print("=" * 80)
    print("DEMO 1: Binary Metric (Conversion Rate)")
    print("=" * 80)
    
    # Generate sample data
    np.random.seed(42)
    n_users = 10000
    user_ids = [f"user_{i}" for i in range(n_users)]
    
    # Assign variants
    variants = [assign_variant(uid, "demo_experiment_1") for uid in user_ids]
    
    # Simulate conversions (treatment has 1% absolute lift)
    base_rate = 0.10
    conversions = [
        np.random.binomial(1, base_rate + (0.01 if v == 'B' else 0))
        for v in variants
    ]
    
    df1 = pd.DataFrame({
        'user_id': user_ids,
        'variant': variants,
        'converted': conversions
    })
    
    # Run analysis
    result1 = analyze_conversion(df1, 'converted')
    
    print(f"\nControl conversion: {result1['rate_a']:.2%}")
    print(f"Treatment conversion: {result1['rate_b']:.2%}")
    print(f"Absolute lift: {result1['absolute_lift']:.2%}")
    print(f"Relative lift: {result1['relative_lift']:.1%}")
    print(f"P-value: {result1['p_value']:.4f}")
    print(f"Result: {result1['interpretation']}")
    
    # ========== DEMO 2: Continuous Metric with CUPED ==========
    print("\n" + "=" * 80)
    print("DEMO 2: Continuous Metric with CUPED (Revenue)")
    print("=" * 80)
    
    # Generate sample data with pre-period revenue
    n_users = 5000
    user_ids = [f"user_{i}" for i in range(n_users)]
    variants = [assign_variant(uid, "demo_experiment_2") for uid in user_ids]
    
    # Pre-period revenue (correlated with test revenue)
    pre_revenue = np.random.gamma(shape=2, scale=10, size=n_users)
    
    # Test revenue (correlated with pre, plus treatment effect)
    test_revenue = (
        0.6 * pre_revenue +
        np.random.gamma(shape=2, scale=5, size=n_users) +
        np.array([2.0 if v == 'B' else 0 for v in variants])
    )
    
    df2 = pd.DataFrame({
        'user_id': user_ids,
        'variant': variants,
        'pre_revenue': pre_revenue,
        'test_revenue': test_revenue
    })
    
    # Run analysis with CUPED
    result2 = analyze_continuous_cuped(df2, 'test_revenue', 'pre_revenue')
    
    print(f"\nRaw Analysis:")
    print(f"  Control mean: ${result2['raw_mean_a']:.2f}")
    print(f"  Treatment mean: ${result2['raw_mean_b']:.2f}")
    print(f"  Difference: ${result2['raw_difference']:.2f}")
    print(f"  P-value: {result2['raw_p_value']:.4f}")
    
    print(f"\nCUPED-Adjusted Analysis:")
    print(f"  Variance reduction: {result2['variance_reduction_a']:.1%}")
    print(f"  Control mean: ${result2['cuped_mean_a']:.2f}")
    print(f"  Treatment mean: ${result2['cuped_mean_b']:.2f}")
    print(f"  Difference: ${result2['cuped_difference']:.2f}")
    print(f"  P-value: {result2['cuped_p_value']:.4f}")
    print(f"  Result: {result2['interpretation']}")
    
    # ========== DEMO 3: Full Experiment Readout ==========
    print("\n" + "=" * 80)
    print("DEMO 3: Complete Experiment Readout")
    print("=" * 80)
    
    # Create experiment config
    config = ExperimentConfig(
        experiment_id="checkout_redesign_v1",
        hypothesis="New checkout flow will increase conversion without hurting revenue",
        primary_metric="purchased",
        metric_type="binary",
        guardrail_metrics=["revenue_per_user", "page_load_time"],
        unit_of_randomization="user",
        duration_days=14
    )
    
    # Generate data
    n_users = 8000
    user_ids = [f"user_{i}" for i in range(n_users)]
    variants = [assign_variant(uid, config.experiment_id) for uid in user_ids]
    
    # Metrics
    purchased = [np.random.binomial(1, 0.15 + (0.02 if v == 'B' else 0)) for v in variants]
    revenue_per_user = [p * np.random.gamma(2, 25) for p in purchased]
    page_load_time = [np.random.normal(2.0 + (0.1 if v == 'B' else 0), 0.5) for v in variants]
    
    df3 = pd.DataFrame({
        'user_id': user_ids,
        'variant': variants,
        'purchased': purchased,
        'revenue_per_user': revenue_per_user,
        'page_load_time': page_load_time
    })
    
    # Generate readout
    readout = generate_experiment_readout(df3, config)
    print_readout(readout)
    
    print("\n" + "=" * 80)
    print("Demo complete! See functions above for full implementation.")
    print("=" * 80)

Modern A/B Testing Framework - Demo

DEMO 1: Binary Metric (Conversion Rate)

Control conversion: 10.14%
Treatment conversion: 10.14%
Absolute lift: 0.00%
Relative lift: 0.0%
P-value: 0.9993
Result: ❌ NOT SIGNIFICANT

DEMO 2: Continuous Metric with CUPED (Revenue)

Raw Analysis:
  Control mean: $22.16
  Treatment mean: $24.10
  Difference: $1.94
  P-value: 0.0000

CUPED-Adjusted Analysis:
  Variance reduction: 60.9%
  Control mean: $22.16
  Treatment mean: $24.10
  Difference: $1.94
  P-value: 0.0000
  Result: ✅ SIGNIFICANT (CUPED)

DEMO 3: Complete Experiment Readout
EXPERIMENT READOUT: checkout_redesign_v1

Hypothesis: New checkout flow will increase conversion without hurting revenue
Duration: 14 days
Analysis timestamp: 2026-01-21T17:41:51.994799

────────────────────────────────────────────────────────────────────────────────
SAMPLE SIZE
────────────────────────────────────────────────────────────────────────────────
Control (A): 4,008
Treatment (B): 3,992
Total: 8,000

──────────