In [1]:
# Two-proportion z-test: heart-attack risk (target != 0) by sex, Male vs. Female

# --- Core imports  ---
import pandas as pd
import numpy as np
from scipy import stats

# rich is used for all table output below. If it cannot be imported, install
# it into the running interpreter's environment and retry the import once.
try:
    from rich.console import Console
    from rich.table import Table
    from rich import box
except ImportError:
    # Only a missing package should trigger an install; any other failure
    # raised while importing rich is a real error and must surface unchanged.
    import subprocess
    import sys

    # Invoke pip through sys.executable so the package lands in the same
    # interpreter that is executing this code.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "rich"])
    from rich.console import Console
    from rich.table import Table
    from rich import box

# Shared console used by both table renderers; Jupyter rendering is forced on
# rather than left to auto-detection.
console = Console(force_jupyter=True)

# --- Load ---
df = pd.read_csv("Heart_Attack_Cleaned.csv")

# Identify columns: headers are compared after trimming whitespace and
# lower-casing, and the first header that matches is the one used.
_headers = [(name.strip().lower(), name) for name in df.columns]
sex_col = next((name for key, name in _headers if key == "sex"), None)
target_col = next((name for key, name in _headers if key == "target"), None)
if None in (sex_col, target_col):
    raise ValueError("Required columns not found: sex, target")

# Prepare: coerce both columns to numeric (unparseable entries become NaN),
# then keep only the rows where both values are present.
for _col in (sex_col, target_col):
    df[_col] = pd.to_numeric(df[_col], errors="coerce")
df = df.dropna(subset=[sex_col, target_col]).copy()

# Define: risk-positive := target != 0; risk-negative := target == 0
df["Risk positive"] = (df[target_col] != 0).astype(int)
# A sex code of exactly 1 is labelled "Male"; every other value is "Female".
df["Sex"] = np.where(df[sex_col] == 1, "Male", "Female")

# Counts and rates by sex: one row per "Sex" label with the group size (n),
# the number of risk-positive rows, the complement, and the rate in percent.
tab = (
    df.groupby("Sex")["Risk positive"]
      .agg(n="count", risk_pos="sum")
      .reset_index()
)
tab["risk_neg"] = tab["n"] - tab["risk_pos"]
tab["risk_rate"] = (tab["risk_pos"] / tab["n"]) * 100

# A two-proportion comparison needs both groups. Fail with an explicit
# message instead of the bare IndexError that .iloc[0] would raise on an
# empty selection.
_present_groups = set(tab["Sex"])
_missing_groups = [g for g in ("Male", "Female") if g not in _present_groups]
if _missing_groups:
    raise ValueError(
        "Cannot compare proportions: no rows for sex group(s) "
        + ", ".join(_missing_groups)
    )

# Extract male/female counts (n = group size, x = risk-positive count)
row_m = tab.loc[tab["Sex"] == "Male"].iloc[0]
row_f = tab.loc[tab["Sex"] == "Female"].iloc[0]
n_male, x_male = int(row_m["n"]), int(row_m["risk_pos"])
n_female, x_female = int(row_f["n"]), int(row_f["risk_pos"])
# Sample proportions as fractions (not percent), used by the tests below
p_male = x_male / n_male
p_female = x_female / n_female

# Two-proportion z-test: the test statistic uses the pooled proportion
# (both groups combined) for its standard error.
p_pool = (x_male + x_female) / (n_male + n_female)
pooled_var = p_pool * (1 - p_pool) * (1/n_male + 1/n_female)
se_pool = np.sqrt(pooled_var)
diff = p_male - p_female
z_stat = diff / se_pool
# Two-sided p-value from the survival function, which stays accurate in the
# far tail where 1 - cdf would round to zero.
p_val_two = 2 * stats.norm.sf(abs(z_stat))

# 95% CI for the difference: the interval uses the unpooled standard error,
# i.e. each group's own variance.
zcrit = stats.norm.ppf(0.975)
var_male = p_male*(1-p_male)/n_male
var_female = p_female*(1-p_female)/n_female
se_unpooled = np.sqrt(var_male + var_female)
half_width = zcrit*se_unpooled
ci_low = diff - half_width
ci_high = diff + half_width

# Risk ratio and odds ratio with 95% Wald CIs computed on the log scale.
# 2x2 cells: A/B = male risk-positive/negative, C/D = female positive/negative.
A, B = x_male, n_male - x_male
C, D = x_female, n_female - x_female
if min(A, B, C, D) == 0:
    # Continuity correction: add 0.5 to every cell when any cell is empty so
    # the ratios and their log-scale standard errors stay finite.
    A, B, C, D = (cell + 0.5 for cell in (A, B, C, D))

risk_m = A/(A+B)
risk_f = C/(C+D)
rr = risk_m / risk_f
se_log_rr = np.sqrt(1/A - 1/(A+B) + 1/C - 1/(C+D))
log_rr = np.log(rr)
rr_margin = zcrit*se_log_rr
rr_low = np.exp(log_rr - rr_margin)
rr_high = np.exp(log_rr + rr_margin)

odds_ratio = (A*D) / (B*C)
se_log_or = np.sqrt(1/A + 1/B + 1/C + 1/D)
log_or = np.log(odds_ratio)
or_margin = zcrit*se_log_or
or_low = np.exp(log_or - or_margin)
or_high = np.exp(log_or + or_margin)

# --- Helpers: safe formatting and minimal red/green accent ---
def fmt_p(p, min_show=1e-300):
    """Format a p-value in scientific notation without ever printing zero.

    A value that is zero or negative (e.g. after floating-point underflow) is
    reported as the bound ``< <min_show>``; any other value is printed with
    two decimals in scientific notation.
    """
    if p <= 0:
        return f"< {min_show:.0e}"
    return f"{p:.2e}"

def rg_num(value, positive_is_good=True, fmt="{:.4f}", threshold=None):
    """
    Format a number, wrapping it in [green]/[red] markup when it has a verdict.

    - With ``threshold`` given: green when the value is below it, red otherwise.
    - Without ``threshold``: the sign decides. Positive is green and negative
      is red when ``positive_is_good`` is true; the colours swap when false.
    - Values that are neither positive nor negative (e.g. zero) come back as
      plain formatted text, and anything that cannot be converted to float is
      returned as ``str(value)``.
    """
    try:
        number = float(value)
    except Exception:
        return str(value)

    # Decide the colour first, then format once at the end.
    if threshold is not None:
        colour = "green" if number < threshold else "red"
    elif number > 0:
        colour = "green" if positive_is_good else "red"
    elif number < 0:
        colour = "red" if positive_is_good else "green"
    else:
        colour = None

    rendered = fmt.format(number)
    if colour is None:
        return rendered
    return f"[{colour}]{rendered}[/]"

# --- Renderers ---
def render_table1(tab_df: pd.DataFrame):
    """Print Table 1: per-sex counts and risk rate.

    Reads the ``Sex``, ``n``, ``risk_pos``, ``risk_neg`` and ``risk_rate``
    columns of ``tab_df`` and prints one table row per DataFrame row to the
    module-level ``console``. Counts are shown as integers and the rate with
    two decimals.
    """
    table = Table(
        title="Table 1. Heart-attack risk by sex",
        box=box.SIMPLE_HEAVY,
        header_style="bold",        # black headers
        row_styles=["none", "dim"]  # zebra without color
    )
    table.add_column("Sex", justify="left", no_wrap=True)
    for heading in ("n", "Risk-positive", "Risk-negative", "Risk rate (%)"):
        table.add_column(heading, justify="right")

    # Work on just the displayed columns; the rate is rounded to two decimals
    # before it is formatted.
    shown = tab_df[["Sex", "n", "risk_pos", "risk_neg", "risk_rate"]]
    rates = shown["risk_rate"].round(2)

    for sex, total, positives, negatives, rate in zip(
        shown["Sex"], shown["n"], shown["risk_pos"], shown["risk_neg"], rates
    ):
        table.add_row(
            str(sex),
            f"{int(total)}",
            f"{int(positives)}",
            f"{int(negatives)}",
            f"{rate:.2f}",
        )
    console.print(table)

def render_table2(diff, ci_low, ci_high, z_stat, p_val_two, rr, rr_low, rr_high, odds_ratio, or_low, or_high, alpha=0.05):
    """Print Table 2 (test summary) followed by the decision line.

    Each metric is one row: the risk difference, its 95% CI, the z statistic,
    the two-tailed p-value, and the risk ratio and odds ratio with their
    intervals. Only the risk difference and the p-value are coloured; the
    p-value is green when it is below ``alpha`` and red otherwise. The same
    comparison picks the decision text printed after the table.
    """
    summary = Table(
        title="Table 2. Two-proportion z-test summary (Male − Female)",
        box=box.SIMPLE_HEAVY,
        header_style="bold",        # black headers
        row_styles=["none", "dim"]  # zebra without color
    )
    summary.add_column("Metric", justify="left", no_wrap=True)
    summary.add_column("Value", justify="right")

    significant = p_val_two < alpha
    p_colour = "green" if significant else "red"

    # Rows in display order, as (metric label, already-formatted value).
    rows = (
        ("Risk difference (Male − Female)", rg_num(diff, positive_is_good=True, fmt="{:.4f}")),
        ("95% CI (difference)", f"[{ci_low:.4f}, {ci_high:.4f}]"),
        ("z statistic", f"{z_stat:.2f}"),
        ("p-value (two-tailed)", f"[{p_colour}]{fmt_p(p_val_two)}[/]"),
        ("Risk ratio", f"{rr:.2f} [{rr_low:.2f}, {rr_high:.2f}]"),
        ("Odds ratio", f"{odds_ratio:.2f} [{or_low:.2f}, {or_high:.2f}]"),
    )
    for metric, shown_value in rows:
        summary.add_row(metric, shown_value)
    console.print(summary)

    # Conclusion
    if significant:
        decision = "Reject H0: different risk rates by sex"
    else:
        decision = "Fail to reject H0"
    console.print(f"[bold]Decision (alpha={alpha:.2f}): {decision}[/]")

# --- Render both tables ---
# alpha is the significance level used for the p-value colouring and the
# decision line in Table 2.
alpha = 0.05
render_table1(tab)
render_table2(
    diff=diff,
    ci_low=ci_low,
    ci_high=ci_high,
    z_stat=z_stat,
    p_val_two=p_val_two,
    rr=rr,
    rr_low=rr_low,
    rr_high=rr_high,
    odds_ratio=odds_ratio,
    or_low=or_low,
    or_high=or_high,
    alpha=alpha,
)