In [6]:
# Welch's t-test: Age vs Heart-attack Risk — clean console tables (no exports)
import pandas as pd
import numpy as np
from scipy import stats

# Load the dataset (path is resolved relative to the current working directory).
df = pd.read_csv("Heart_Attack_Cleaned.csv")

# Significance level. Defined once so that the confidence interval, its column
# label and the printed decision can never drift out of sync with each other.
alpha = 0.05

# Identify columns case-insensitively, tolerating stray whitespace in headers.
age_col = next((c for c in df.columns if c.strip().lower() == "age"), None)
target_col = next((c for c in df.columns if c.strip().lower() == "target"), None)
if age_col is None or target_col is None:
    raise ValueError("Required columns not found: age, target")

# Prepare fields: non-numeric entries become NaN and those rows are dropped,
# so every remaining row has a usable age and target value.
df[age_col] = pd.to_numeric(df[age_col], errors="coerce")
df[target_col] = pd.to_numeric(df[target_col], errors="coerce")
df = df.dropna(subset=[age_col, target_col]).copy()

# Define groups: risk-positive := target != 0; risk-negative := target == 0.
# Any non-zero target value (not only 1) is counted as risk-positive.
df["Risk group"] = np.where(df[target_col] != 0, "Risk positive", "Risk negative")

# Age arrays per group
pos = df.loc[df["Risk group"] == "Risk positive", age_col].to_numpy()
neg = df.loc[df["Risk group"] == "Risk negative", age_col].to_numpy()

# Welch's test needs a sample variance in each group, i.e. at least two
# observations per group. Fail early with a clear message instead of a
# ZeroDivisionError deep inside the degrees-of-freedom formula.
n_pos, n_neg = len(pos), len(neg)
if n_pos < 2 or n_neg < 2:
    raise ValueError(
        "Each risk group needs at least 2 observations for Welch's t-test "
        f"(risk positive: {n_pos}, risk negative: {n_neg})"
    )

# Table 1: Descriptive statistics ("std" is the sample SD, ddof=1)
desc = (
    df.groupby("Risk group")[age_col]
      .agg(n="count",
           mean="mean",
           sd="std",
           median="median",
           min="min",
           max="max")
      .reset_index()
      .rename(columns={
          "mean": "Mean age (years)",
          "sd": "SD (years)",
          "median": "Median (years)",
          "min": "Min (years)",
          "max": "Max (years)",
      })
)
rounded_cols = ["Mean age (years)", "SD (years)"]
desc[rounded_cols] = desc[rounded_cols].round(2)

# Welch's t-test (two-tailed, unequal variances)
t_stat, p_val = stats.ttest_ind(pos, neg, equal_var=False, alternative="two-sided")
t_stat, p_val = float(t_stat), float(p_val)

# Per-group sample variances and squared standard errors of the group means
s1_sq, s2_sq = float(np.var(pos, ddof=1)), float(np.var(neg, ddof=1))
v1, v2 = s1_sq / n_pos, s2_sq / n_neg
if v1 + v2 == 0:
    # Both groups are constant: the standard error is zero and neither the
    # t statistic nor the Welch degrees of freedom are defined.
    raise ValueError("Age has zero variance in both risk groups; Welch's t-test is undefined")

# Welch-Satterthwaite approximation of the degrees of freedom
df_welch = (v1 + v2) ** 2 / ((v1 ** 2) / (n_pos - 1) + (v2 ** 2) / (n_neg - 1))

# Mean difference (risk-positive minus risk-negative) and its (1 - alpha) CI
mean_pos, mean_neg = float(np.mean(pos)), float(np.mean(neg))
mean_diff = mean_pos - mean_neg
se_diff = float(np.sqrt(v1 + v2))
tcrit = stats.t.ppf(1 - alpha / 2, df_welch)
ci_low, ci_high = mean_diff - tcrit * se_diff, mean_diff + tcrit * se_diff

# Minimal effect size: Hedges' g = J * d, where d standardises the mean
# difference by the pooled SD and J = 1 - 3 / (4 * (n1 + n2 - 2) - 1) is the
# small-sample bias correction (written below as 4 * (n1 + n2) - 9).
sp2 = ((n_pos - 1) * s1_sq + (n_neg - 1) * s2_sq) / (n_pos + n_neg - 2)
sp = float(np.sqrt(sp2))
hedges_g = (1 - (3 / (4 * (n_pos + n_neg) - 9))) * (mean_diff / sp)

# Print clean, copy-ready tables
print("Table 1. Descriptive statistics of age by risk group\n")
print(desc[["Risk group", "n", "Mean age (years)", "SD (years)",
            "Median (years)", "Min (years)", "Max (years)"]]
      .to_string(index=False))

print("\nTable 2. Welch’s t-test results (Age: risk-positive − risk-negative)\n")
# Column label follows alpha (e.g. "95% CI (years)" for alpha = 0.05)
ci_label = f"{100 * (1 - alpha):g}% CI (years)"
res = pd.DataFrame([{
    "Mean difference (years)": round(mean_diff, 2),
    ci_label: f"[{ci_low:.2f}, {ci_high:.2f}]",
    "t (Welch)": round(t_stat, 2),
    "df (Welch)": round(df_welch, 2),
    "p-value (two-tailed)": f"{p_val:.2e}",
    "Hedges' g": round(hedges_g, 2),
}])
print(res.to_string(index=False))

# Optional one-line conclusion (kept simple for report)
decision = "Reject H0" if p_val < alpha else "Fail to reject H0"
print(f"\nDecision (alpha={alpha}): {decision}.")


Table 1. Descriptive statistics of age by risk group

   Risk group   n  Mean age (years)  SD (years)  Median (years)  Min (years)  Max (years)
Risk negative 574             51.13        9.50            52.0           28           76
Risk positive 917             55.58        8.75            57.0           29           77

Table 2. Welch’s t-test results (Age: risk-positive − risk-negative)

 Mean difference (years) 95% CI (years)  t (Welch)  df (Welch) p-value (two-tailed)  Hedges' g
                    4.45   [3.48, 5.41]       9.06     1142.07             5.55e-19       0.49

Decision (alpha=0.05): Reject H0.
