In [3]:
# Welch's t-test: Cholesterol vs Heart-attack occurrence — clean console tables (no exports)
import pandas as pd
import numpy as np
from scipy import stats

# Load data
df = pd.read_csv("Heart_Attack_Cleaned.csv")

# Identify columns
chol_col = next((c for c in df.columns if c.strip().lower() == "cholesterol"), None)
target_col = next((c for c in df.columns if c.strip().lower() == "target"), None)
if chol_col is None or target_col is None:
    raise ValueError("Required columns not found: cholesterol, target")

# Prepare fields
df[chol_col] = pd.to_numeric(df[chol_col], errors="coerce")
df[target_col] = pd.to_numeric(df[target_col], errors="coerce")
df = df.dropna(subset=[chol_col, target_col]).copy()

# Define groups: risk-positive := target != 0; risk-negative := target == 0
df["Risk group"] = np.where(df[target_col] != 0, "Risk positive", "Risk negative")

# Arrays for Welch
pos = df.loc[df["Risk group"] == "Risk positive", chol_col].to_numpy()
neg = df.loc[df["Risk group"] == "Risk negative", chol_col].to_numpy()

# Table 1: Descriptive statistics
desc = (
    df.groupby("Risk group")[chol_col]
      .agg(n="count",
           mean="mean",
           sd=lambda s: s.std(ddof=1),
           median="median",
           min="min",
           max="max")
      .reset_index()
      .rename(columns={
          "mean": "Mean Chol (mg/dL)",
          "sd": "SD (mg/dL)",
          "median": "Median (mg/dL)",
          "min": "Min (mg/dL)",
          "max": "Max (mg/dL)"
      })
)
for col in ["Mean Chol (mg/dL)", "SD (mg/dL)"]:
    desc[col] = desc[col].round(2)

# Welch’s t-test (two-tailed)
t_stat, p_val = stats.ttest_ind(pos, neg, equal_var=False, alternative="two-sided")
t_stat, p_val = float(t_stat), float(p_val)

n_pos, n_neg = len(pos), len(neg)
s1_sq, s2_sq = float(np.var(pos, ddof=1)), float(np.var(neg, ddof=1))
v1, v2 = s1_sq/n_pos, s2_sq/n_neg
df_welch = (v1 + v2)**2 / ((v1**2)/(n_pos-1) + (v2**2)/(n_neg-1))
mean_pos, mean_neg = float(np.mean(pos)), float(np.mean(neg))
mean_diff = mean_pos - mean_neg
se_diff = float(np.sqrt(v1 + v2))
tcrit = stats.t.ppf(0.975, df_welch)
ci_low, ci_high = mean_diff - tcrit*se_diff, mean_diff + tcrit*se_diff

# Minimal effect sizes
sp2 = ((n_pos-1)*s1_sq + (n_neg-1)*s2_sq) / (n_pos + n_neg - 2)
sp = float(np.sqrt(sp2))
hedges_g = (1 - (3/(4*(n_pos + n_neg) - 9))) * (mean_diff / sp)
r_pb = t_stat / np.sqrt(t_stat**2 + df_welch)

# Print clean, copy-ready tables
print("Table 1. Descriptive statistics of cholesterol by risk group\n")
print(desc[["Risk group","n","Mean Chol (mg/dL)","SD (mg/dL)","Median (mg/dL)","Min (mg/dL)","Max (mg/dL)"]]
      .to_string(index=False))

print("\nTable 2. Welch’s t-test results (Cholesterol: risk-positive − risk-negative)\n")
res = pd.DataFrame([{
    "Mean difference (mg/dL)": round(mean_diff, 2),
    "95% CI (mg/dL)": f"[{ci_low:.2f}, {ci_high:.2f}]",
    "t (Welch)": round(t_stat, 2),
    "df (Welch)": round(df_welch, 2),
    "p-value (two-tailed)": f"{p_val:.2e}",
    "Hedges' g": round(hedges_g, 2),
    "Point-biserial r": round(r_pb, 3)
}])
print(res.to_string(index=False))

# Simple decision line
alpha = 0.05
decision = "Reject H0: cholesterol differs by risk group" if p_val < alpha else "Fail to reject H0"
print(f"\nDecision (alpha=0.05): {decision}.")

Table 1. Descriptive statistics of cholesterol by risk group

   Risk group   n  Mean Chol (mg/dL)  SD (mg/dL)  Median (mg/dL)  Min (mg/dL)  Max (mg/dL)
Risk negative 574             239.16       50.46           235.0           85          409
Risk positive 917             248.09       48.17           240.0          100          409

Table 2. Welch’s t-test results (Cholesterol: risk-positive − risk-negative)

 Mean difference (mg/dL) 95% CI (mg/dL)  t (Welch)  df (Welch) p-value (two-tailed)  Hedges' g  Point-biserial r
                    8.93  [3.75, 14.11]       3.38     1174.21             7.39e-04       0.18             0.098

Decision (alpha=0.05): Reject H0: cholesterol differs by risk group.
