
# 21AIC401T – Case Study (UCI Heart)

This notebook performs the three required hypothesis tests on the **CleanedData** sheet in `heart_case_study_cleaned.xlsx`:

1. **One-sample t-test**: `resting_bp` vs 120 mmHg (excluding values ≤ 0)  
2. **Two-sample t-test**: `cholesterol` by `sex` (Male vs Female) with Levene's test and Welch correction if needed  
3. **One-way ANOVA**: `thalch` by `chest_pain` with **Tukey HSD** post-hoc



In [2]:
!pip install statsmodels


Collecting statsmodels
  Downloading statsmodels-0.14.5-cp311-cp311-win_amd64.whl.metadata (9.8 kB)
Collecting patsy>=0.5.6 (from statsmodels)
  Downloading patsy-1.0.1-py2.py3-none-any.whl.metadata (3.3 kB)
Downloading statsmodels-0.14.5-cp311-cp311-win_amd64.whl (9.6 MB)
   ---------------------------------------- 0.0/9.6 MB ? eta -:--:--
   ----------------------------- ---------- 7.1/9.6 MB 36.4 MB/s eta 0:00:01
   ---------------------------------------- 9.6/9.6 MB 30.1 MB/s eta 0:00:00
Downloading patsy-1.0.1-py2.py3-none-any.whl (232 kB)
Installing collected packages: patsy, statsmodels
Successfully installed patsy-1.0.1 statsmodels-0.14.5



[notice] A new release of pip is available: 25.0.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:

import pandas as pd
import numpy as np
from scipy import stats
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.stats.multicomp import pairwise_tukeyhsd
import matplotlib.pyplot as plt
import os

DATA_PATH = "heart_case_study_cleaned.xlsx"
SHEET = "CleanedData"

OUT_DIR = "python_outputs"
os.makedirs(OUT_DIR, exist_ok=True)

def cohens_d_one_sample(sample, mu0):
    s = np.std(sample, ddof=1)
    return (np.mean(sample) - mu0) / s if s > 0 else np.nan

def hedges_g_two_sample(x, y):
    nx, ny = len(x), len(y)
    sx2, sy2 = np.var(x, ddof=1), np.var(y, ddof=1)
    sp = np.sqrt(((nx-1)*sx2 + (ny-1)*sy2) / (nx+ny-2))
    d = (np.mean(x) - np.mean(y)) / sp if sp > 0 else np.nan
    J = 1 - (3 / (4*(nx+ny) - 9)) 
    return d * J

def mean_ci(sample, alpha=0.05):
    n = len(sample)
    m = np.mean(sample)
    s = np.std(sample, ddof=1)
    se = s/np.sqrt(n) if n>0 else np.nan
    tcrit = stats.t.ppf(1 - alpha/2, df=n-1) if n>1 else np.nan
    return (m - tcrit*se, m + tcrit*se) if n>1 else (np.nan, np.nan)

print("Environment ready.")

Environment ready.


## Load and prepare data

In [4]:

df = pd.read_excel(DATA_PATH, sheet_name=SHEET).copy()

for col in ["resting_bp", "cholesterol", "thalch"]:
    df[col] = pd.to_numeric(df[col], errors="coerce")

df["sex"] = df["sex"].astype(str).str.strip().str.capitalize()
df["chest_pain"] = df["chest_pain"].astype(str).str.strip()

print(df.head(3))
print("\nRows:", len(df))

   id  age   sex    dataset      chest_pain  resting_bp  cholesterol  \
0   1   63  Male  Cleveland  typical angina         145          233   
1   2   67  Male  Cleveland    asymptomatic         160          286   
2   3   67  Male  Cleveland    asymptomatic         120          229   

   fasting_bs        rest_ecg  thalch  ...  st_depression     st_slope  \
0         1.0  lv hypertrophy     150  ...            2.3  downsloping   
1         0.0  lv hypertrophy     108  ...            1.5         flat   
2         0.0  lv hypertrophy     129  ...            2.6         flat   

  vessels               thal target  age_group Unnamed: 17  Unnamed: 18  \
0     0.0       fixed defect      0        >60         NaN          NaN   
1     3.0             normal      2        >60         NaN          NaN   
2     2.0  reversable defect      1        >60         NaN          NaN   

   ONE SAMPLE T-TEST Unnamed: 20  
0                NaN         NaN  
1         Hypothesis         NaN  
2       

## 1) One-sample t-test — `resting_bp` vs 120 mmHg (exclude ≤ 0)

In [1]:
import pandas as pd
import numpy as np
from scipy import stats
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Load data
df = pd.read_excel("heart_case_study_cleaned.xlsx", sheet_name="CleanedData")
for col in ["resting_bp", "cholesterol", "thalch"]:
    df[col] = pd.to_numeric(df[col], errors="coerce")

df["sex"] = df["sex"].astype(str).str.strip().str.capitalize()
df["chest_pain"] = df["chest_pain"].astype(str).str.strip()

results = {}
summary_rows = []

# 1) One-sample t-test (resting_bp vs 120, exclude ≤0)
bp = df.loc[df["resting_bp"] > 0, "resting_bp"].dropna()
n1, mean1, sd1 = len(bp), bp.mean(), bp.std(ddof=1)
t1, p1 = stats.ttest_1samp(bp, 120)
df1 = n1 - 1

results["one_sample"] = {
    "n": n1, "mean": mean1, "sd": sd1, "df": df1,
    "t_stat": t1, "p_val": p1,
    "decision": "Reject H0" if p1 < 0.05 else "Fail to reject H0"
}
summary_rows.append({
    "Test": "One-sample t (resting_bp vs 120)",
    "Group/Levels": "-",
    "n": n1, "Mean": mean1, "Std Dev": sd1,
    "df": df1, "Statistic": t1, "p-value": p1,
    "Decision (α=0.05)": "Reject H0" if p1 < 0.05 else "Fail to reject H0"
})


## 2) Two-sample t-test — `cholesterol` by `sex` (Male vs Female)

In [None]:
g_m = df.loc[df["sex"]=="Male", "cholesterol"].dropna()
g_f = df.loc[df["sex"]=="Female", "cholesterol"].dropna()

n_m, n_f = len(g_m), len(g_f)
mean_m, mean_f = g_m.mean(), g_f.mean()
sd_m, sd_f = g_m.std(ddof=1), g_f.std(ddof=1)

t2, p2 = stats.ttest_ind(g_m, g_f, equal_var=False)
df2 = n_m + n_f - 2

results["two_sample"] = {
    "male_n": n_m, "female_n": n_f,
    "male_mean": mean_m, "female_mean": mean_f,
    "male_sd": sd_m, "female_sd": sd_f,
    "df": df2, "t_stat": t2, "p_val": p2,
    "decision": "Reject H0" if p2 < 0.05 else "Fail to reject H0"
}
summary_rows.append({
    "Test": "Two-sample t (cholesterol: Male vs Female)",
    "Group/Levels": "Male / Female",
    "n": f"{n_m} / {n_f}",
    "Mean": f"{mean_m:.2f} / {mean_f:.2f}",
    "Std Dev": f"{sd_m:.2f} / {sd_f:.2f}",
    "df": df2, "Statistic": t2, "p-value": p2,
    "Decision (α=0.05)": "Reject H0" if p2 < 0.05 else "Fail to reject H0"
})

## 3) One-way ANOVA — `thalch` by `chest_pain` + Tukey HSD

In [3]:
sub = df[["thalch", "chest_pain"]].dropna().copy()
sub["chest_pain"] = sub["chest_pain"].astype("category")

model = smf.ols("thalch ~ C(chest_pain)", data=sub).fit()
anova_tbl = sm.stats.anova_lm(model, typ=2)

F_val = float(anova_tbl.loc["C(chest_pain)", "F"])
p3 = float(anova_tbl.loc["C(chest_pain)", "PR(>F)"])
df_between = int(anova_tbl.loc["C(chest_pain)", "df"])
df_within = int(anova_tbl.loc["Residual", "df"])

results["anova"] = {
    "n": len(sub),
    "df_between": df_between, "df_within": df_within,
    "F_stat": F_val, "p_val": p3,
    "decision": "Reject H0" if p3 < 0.05 else "Fail to reject H0"
}
summary_rows.append({
    "Test": "One-way ANOVA (thalch ~ chest_pain)",
    "Group/Levels": f"{sub['chest_pain'].nunique()} chest_pain levels",
    "n": len(sub),
    "Mean": "-",
    "Std Dev": "-",
    "df": f"{df_between}, {df_within}",
    "Statistic": F_val, "p-value": p3,
    "Decision (α=0.05)": "Reject H0" if p3 < 0.05 else "Fail to reject H0"
})

In [4]:
summary_df = pd.DataFrame(summary_rows)

# Print detailed results
for test, vals in results.items():
    print(f"\n--- {test.upper()} ---")
    for k, v in vals.items():
        print(f"{k}: {v}")

print("\n=== Summary Table ===")
print(summary_df.to_string(index=False))


--- ONE_SAMPLE ---
n: 832
mean: 132.31129807692307
sd: 18.452670579130206
df: 831
t_stat: 19.244484441865392
p_val: 1.5412543274389598e-68
decision: Reject H0

--- TWO_SAMPLE ---
male_n: 647
female_n: 185
male_mean: 188.53013910355486
female_mean: 242.42702702702704
male_sd: 113.73159132018732
female_sd: 84.76583040766104
df: 830
t_stat: -7.026833533298425
p_val: 9.396303054527558e-12
decision: Reject H0

--- ANOVA ---
n: 832
df_between: 3
df_within: 828
F_stat: 44.66601508371315
p_val: 9.267302115755024e-27
decision: Reject H0

=== Summary Table ===
                                      Test        Group/Levels         n            Mean        Std Dev     df  Statistic      p-value Decision (α=0.05)
          One-sample t (resting_bp vs 120)                   -       832      132.311298      18.452671    831  19.244484 1.541254e-68         Reject H0
Two-sample t (cholesterol: Male vs Female)       Male / Female 647 / 185 188.53 / 242.43 113.73 / 84.77    830  -7.026834 9.396303e-12  