# 🧪 Clinical Trial: Hipponol Intervention Study

Simulated 2-year RCT of **Hipponol** vs **placebo** (n=1000). Primary: 2-year survival. Secondary: SBP change. Notebook follows a lean **CONSORT-like** workflow with frequentist and Bayesian analyses. Fully self-contained: it loads a CSV if present, otherwise **simulates** the dataset (seed=11088).

💡 *Teaching aims*: trial data structure, Table 1, survival analysis (KM, log-rank, Cox), logistic regression, ANCOVA/OLS, and Bayesian counterparts.

## Environment & Packages
Works in Colab or locally. If a package is missing, uncomment the installs below.

In [None]:
# Optional installs (uncomment if needed)
# %pip install pandas numpy matplotlib seaborn scipy lifelines statsmodels pymc arviz

from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import ttest_ind, chi2_contingency
from lifelines import KaplanMeierFitter, CoxPHFitter
from lifelines.statistics import logrank_test
import statsmodels.api as sm
import statsmodels.formula.api as smf
import pymc as pm
import arviz as az

# Plot styling applied to the figures drawn after this point.
sns.set_context("notebook")
sns.set_style("whitegrid")

# Fixed seed so the random draws below can be reproduced.
RANDOM_SEED = 11088
rng = np.random.default_rng(RANDOM_SEED)

# Folder and file for the trial CSV; the folder is created if it is missing.
DATA_DIR = Path("data")
DATA_DIR.mkdir(exist_ok=True)
DATA_PATH = DATA_DIR / "hipponol_trial_data.csv"

## Data Loading / Simulation
If `data/hipponol_trial_data.csv` exists, we load it. Otherwise we **simulate** the trial and save it to that path: simple 1:1 randomisation (arms approximately, not exactly, balanced); realistic age/SBP; group effect on survival and SBP; time-to-event with censoring at 24 months.

In [None]:
def simulate_trial(n=1000, seed=RANDOM_SEED):
    """Simulate a two-arm trial with a survival endpoint and an SBP endpoint.

    Parameters
    ----------
    n : int
        Number of participants to generate.
    seed : int
        Seed for the NumPy generator; the same seed gives the same dataset.

    Returns
    -------
    pandas.DataFrame
        One row per participant with columns ID, Group, Age, Sex,
        SmokingStatus, Baseline_SBP, Followup_SBP, Survival (1 = alive at
        24 months, 0 = died) and Time_to_Event (months, at most 24).
    """
    follow_up = 24.0  # administrative end of follow-up (months)
    rng = np.random.default_rng(seed)

    # Design: each participant is assigned independently with probability 1/2.
    group = rng.choice(["Control", "Hipponol"], size=n, replace=True)
    age = rng.normal(55, 8, size=n).clip(40, 80)
    sex = rng.choice(["Female", "Male"], size=n)
    smoking = rng.choice(["Non-smoker", "Smoker"], p=[0.7, 0.3], size=n)

    # Baseline SBP (mmHg)
    base_sbp = rng.normal(135, 15, size=n)

    # Follow-up SBP with treatment effect (−4 mmHg average)
    trt_effect_sbp = np.where(group == "Hipponol", -4.0, 0.0)
    follow_sbp = base_sbp + trt_effect_sbp + rng.normal(0, 8, size=n)

    # Survival: logit model for 2y survival probability
    # Higher age & smoker worsen survival; Hipponol improves odds modestly
    x_hip = (group == "Hipponol").astype(int)
    x_smoke = (smoking == "Smoker").astype(int)
    x_male = (sex == "Male").astype(int)
    logit_p = (2.0
               + 0.35 * x_hip
               - 0.035 * (age - 55)
               - 0.45 * x_smoke
               - 0.10 * x_male)
    p_surv = 1 / (1 + np.exp(-logit_p))
    survival = rng.binomial(1, p_surv, size=n)  # 1=survived, 0=died

    # Time-to-event (months). The hazard is modulated by the same risk
    # factors as the survival model.
    base_hazard = 0.04  # per month approximate
    linpred = (-0.35 * x_hip + 0.04*(age-55) + 0.4*x_smoke + 0.1*x_male)
    rate = np.maximum(base_hazard * np.exp(linpred), 1e-6)

    # A participant flagged as dead must die *inside* the follow-up window.
    # Clipping a plain exponential draw at 24 would pile a large share of
    # deaths exactly on the censoring time, so sample from the exponential
    # truncated to [0, follow_up) by inverting its CDF:
    #   t = -log(1 - u * (1 - exp(-rate * follow_up))) / rate
    u = rng.random(n)
    death_time = -np.log1p(u * np.expm1(-rate * follow_up)) / rate

    # Survivors are censored at the end of follow-up.
    time_to_event = np.where(survival == 1, follow_up, death_time)

    df = pd.DataFrame({
        "ID": np.arange(1, n+1),
        "Group": group,
        "Age": age.round(1),
        "Sex": sex,
        "SmokingStatus": smoking,
        "Baseline_SBP": base_sbp.round(1),
        "Followup_SBP": follow_sbp.round(1),
        "Survival": survival.astype(int),  # 1=alive at 24m
        "Time_to_Event": time_to_event.round(2) # months
    })
    return df

# Reuse the CSV when it is already on disk; otherwise simulate and cache it.
if not DATA_PATH.exists():
    df = simulate_trial()
    df.to_csv(DATA_PATH, index=False)
    print(f"Simulated dataset saved to {DATA_PATH}")
else:
    df = pd.read_csv(DATA_PATH)

# Stop early if any column used by the analyses below is absent.
required = {
    "Group", "Age", "Sex", "SmokingStatus",
    "Baseline_SBP", "Followup_SBP", "Survival", "Time_to_Event",
}
missing = required - set(df.columns)
assert not missing, f"Dataset missing columns: {missing}"

df.head()

## Baseline: Table 1
Summarise randomisation balance at baseline (Age, Baseline SBP; Sex, Smoking).

In [None]:
def create_table1(data, group_col="Group",
                  numeric_cols=("Age","Baseline_SBP"),
                  cat_cols=("Sex","SmokingStatus")):
    """Build a two-group baseline comparison table.

    Numeric columns get one row each: mean ± SD per group (pandas' default
    ``std``) and the p-value of a Welch t-test (``equal_var=False``).
    Categorical columns get one row per level: count and within-group
    percentage, with the chi-squared p-value shown on the first level only.

    Parameters
    ----------
    data : pandas.DataFrame
        Participant-level data.
    group_col : str
        Column holding the group label; must take exactly two values.
    numeric_cols, cat_cols : sequence of str
        Columns to summarise as continuous / categorical.

    Returns
    -------
    pandas.DataFrame
        One row per summary line, all cells formatted as strings.
    """
    arms = data[group_col].unique()
    assert len(arms)==2, "This Table1 assumes exactly two groups."
    first, second = sorted(arms)
    in_first = data[group_col] == first
    in_second = data[group_col] == second

    def mean_sd(values):
        return f"{values.mean():.1f} ± {values.std():.1f}"

    records = []

    # Continuous variables: missing values are dropped per group.
    for name in numeric_cols:
        a = data.loc[in_first, name].dropna()
        b = data.loc[in_second, name].dropna()
        p_value = ttest_ind(a, b, equal_var=False).pvalue
        records.append({
            "Variable": name,
            f"{first} (mean±SD)": mean_sd(a),
            f"{second} (mean±SD)": mean_sd(b),
            "p": f"{p_value:.3f}",
        })

    # Categorical variables: one test per variable, one row per level.
    for name in cat_cols:
        counts = pd.crosstab(data[name], data[group_col])
        p_value = chi2_contingency(counts)[1]
        totals = counts.sum()
        p_shown = f"{p_value:.3f}"
        for level in counts.index:
            n_first = counts.loc[level, first]
            n_second = counts.loc[level, second]
            records.append({
                "Variable": f"{name} = {level}",
                f"{first} (n,%)": f"{n_first} ({n_first/totals[first]*100:.1f}%)",
                f"{second} (n,%)": f"{n_second} ({n_second/totals[second]*100:.1f}%)",
                "p": p_shown,
            })
            p_shown = ""  # only the first level carries the p-value
    return pd.DataFrame(records)

table1 = create_table1(df)
table1

## Distributions (Age & Baseline SBP)
Visual checks support randomisation and test choices.

In [None]:
# Side-by-side baseline checks: age histogram (left), baseline SBP boxplot (right).
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(11, 4))
left, right = ax

sns.histplot(data=df, x="Age", hue="Group", kde=True, bins=20, ax=left)
left.set_title("Age by Group")

sns.boxplot(data=df, x="Group", y="Baseline_SBP", ax=right)
right.set_title("Baseline SBP by Group")

plt.tight_layout()
plt.show()

## Primary Endpoint: 2-year Survival
Binary survival indicator at 24 months (`Survival`: 1=alive, 0=died).

In [None]:
# Rows are trial arms; columns are the Survival flag (0=dead, 1=alive).
cont = pd.crosstab(index=df["Group"], columns=df["Survival"])
chi2, p, dof, exp = chi2_contingency(cont)

print("Contingency (Group x Survival)", cont, sep="\n")
print(f"Chi-squared p-value: {p:.4f}")

### Logistic Regression (unadjusted & adjusted)
Odds ratios with CIs; adjusted for age, sex, smoking.

In [None]:
# Treat the arm label as categorical before it enters the formulas.
df["Group"] = df["Group"].astype("category")

# m1: treatment only; m2: treatment plus age, sex and smoking status.
unadjusted_formula = "Survival ~ C(Group)"
adjusted_formula = "Survival ~ C(Group) + Age + C(Sex) + C(SmokingStatus)"
m1 = smf.logit(formula=unadjusted_formula, data=df).fit(disp=False)
m2 = smf.logit(formula=adjusted_formula, data=df).fit(disp=False)

def or_table(res):
    """Exponentiate a fitted model's coefficients and interval bounds.

    Parameters
    ----------
    res : object
        Fitted results exposing ``params`` (a pandas Series) and
        ``conf_int()`` (a DataFrame whose columns ``0`` and ``1`` hold the
        lower and upper bounds).

    Returns
    -------
    pandas.DataFrame
        Indexed like ``res.params`` with columns "OR", "2.5%" and "97.5%".
    """
    lower, upper = (np.exp(res.conf_int()[side]).to_numpy() for side in (0, 1))
    return pd.DataFrame(
        {
            "OR": np.exp(res.params).to_numpy(),
            "2.5%": lower,
            "97.5%": upper,
        },
        index=res.params.index,
    )

# Print the odds-ratio table for each fitted model under its heading.
for heading, model in (("Unadjusted ORs\n", m1), ("\nAdjusted ORs\n", m2)):
    print(heading, or_table(model))

### Bayesian Logistic Regression
Weakly-informative priors; posterior on treatment effect and OR.

In [None]:
# Treatment indicator (1 = Hipponol, 0 = Control) and outcome (1 = alive at 24 months).
X = (df["Group"]=="Hipponol").astype(int).values
y = df["Survival"].astype(int).values

with pm.Model() as blr:
    # Keep the handles returned by pm.Data and use them in the model.
    # pm.get_data() is PyMC's loader for packaged example files, not a
    # lookup of the model's named data containers, so it must not be used here.
    x_data = pm.Data("x", X)
    y_data = pm.Data("y", y)

    # Normal(0, 2.5) priors on the log-odds scale.
    intercept = pm.Normal("Intercept", 0, 2.5)
    beta = pm.Normal("Beta_Group", 0, 2.5)

    logit_p = intercept + beta * x_data
    p = pm.Deterministic("p", pm.math.sigmoid(logit_p))
    pm.Bernoulli("y_obs", p=p, observed=y_data)
    trace_b = pm.sample(2000, tune=1000, target_accept=0.9,
                        return_inferencedata=True, random_seed=RANDOM_SEED)

# Posterior of the treatment coefficient on the log-odds scale (0 = no effect).
az.plot_posterior(trace_b, var_names=["Beta_Group"], ref_val=0);
plt.title("Posterior (log-odds) for Hipponol vs Control"); plt.show()

# Exponentiate the draws to get the odds ratio (1 = no effect).
or_samples = np.exp(trace_b.posterior["Beta_Group"].values.flatten())
az.plot_posterior(or_samples, ref_val=1);
plt.title("Posterior Odds Ratio: Hipponol / Control"); plt.xlabel("OR"); plt.show()
print("95% HDI for OR:", az.hdi(or_samples, hdi_prob=0.95))

## Time-to-Event: KM, Log-rank, Cox
⚠️ `lifelines` expects `event_observed=1` for **event = death**. Our `Survival` is 1=alive, so we define `Event = 1 - Survival`.

In [None]:
df = df.copy()
df["Event"] = 1 - df["Survival"]  # 1=death

# One fitter and one data subset per arm.
kmf_c = KaplanMeierFitter()
kmf_h = KaplanMeierFitter()
dc = df[df["Group"] == "Control"]
dh = df[df["Group"] == "Hipponol"]

# Fit and draw each arm in turn on the same figure.
plt.figure(figsize=(8, 5))
for fitter, arm, label in ((kmf_c, dc, "Control"), (kmf_h, dh, "Hipponol")):
    fitter.fit(arm["Time_to_Event"], event_observed=arm["Event"], label=label)
    fitter.plot_survival_function(ci_show=True)
plt.title("Kaplan–Meier Survival")
plt.xlabel("Months")
plt.ylabel("S(t)")
plt.tight_layout()
plt.show()

# Log-rank comparison of the two arms.
res = logrank_test(
    dc["Time_to_Event"],
    dh["Time_to_Event"],
    event_observed_A=dc["Event"],
    event_observed_B=dh["Event"],
)
print(f"Log-rank p-value: {res.p_value:.4f}")

In [None]:
# Cox proportional hazards model. Categorical columns are one-hot encoded
# with drop_first=True, so the dropped level of each is the reference
# (Female, Non-smoker, Control).
cox_columns = ["Age", "Sex", "SmokingStatus", "Group", "Time_to_Event", "Event"]
cox_df = pd.get_dummies(df[cox_columns], drop_first=True)

cph = CoxPHFitter()
cph.fit(cox_df, duration_col="Time_to_Event", event_col="Event")
cph.print_summary()  # includes HRs and CIs

# Proportional-hazards assumption check (draws plots when run in a notebook).
cph.check_assumptions(cox_df, p_value_threshold=0.05, show_plots=True)

## Secondary: SBP Change
`SBP_Change = Followup_SBP − Baseline_SBP` (negative = reduction).

In [None]:
# Change score: negative values mean SBP fell between baseline and follow-up.
df["SBP_Change"] = df["Followup_SBP"] - df["Baseline_SBP"]
fig, ax = plt.subplots(1,3, figsize=(13,4))
sns.kdeplot(df, x="Baseline_SBP", hue="Group", fill=True, common_norm=False, alpha=0.4, ax=ax[0]); ax[0].set_title("Baseline SBP")
sns.kdeplot(df, x="Followup_SBP", hue="Group", fill=True, common_norm=False, alpha=0.4, ax=ax[1]); ax[1].set_title("Follow-up SBP")
sns.kdeplot(df, x="SBP_Change", hue="Group", fill=True, common_norm=False, alpha=0.4, ax=ax[2]); ax[2].axvline(0, ls="--", c="k"); ax[2].set_title("Change (Δ)")
plt.tight_layout(); plt.show()

hip = df.loc[df.Group=="Hipponol","SBP_Change"]
con = df.loc[df.Group=="Control","SBP_Change"]

# Welch's t-test: group variances are not assumed equal.
t_stat, p_val = stats.ttest_ind(hip, con, equal_var=False)
diff = hip.mean() - con.mean()

# Per-group variance of the mean; their sum is the squared Welch standard error.
var_hip = hip.var(ddof=1)/len(hip)
var_con = con.var(ddof=1)/len(con)
se = np.sqrt(var_hip + var_con)

# Welch–Satterthwaite degrees of freedom, so the interval matches the test
# above (the pooled n1+n2-2 only applies to the equal-variance t-test).
dfree = (var_hip + var_con)**2 / (var_hip**2/(len(hip)-1) + var_con**2/(len(con)-1))
ci = stats.t.interval(0.95, df=dfree, loc=diff, scale=se)
print(f"Mean difference (Hipponol − Control): {diff:.2f} mmHg")
print(f"95% CI: ({ci[0]:.2f}, {ci[1]:.2f}) | p={p_val:.4f}")

### ANCOVA / OLS
Estimate ΔSBP difference with adjustment (Age, Sex, Smoking).

In [None]:
# 0/1 treatment indicator (1 = Hipponol).
df["Treatment"] = df["Group"].eq("Hipponol").astype(int)

# Unadjusted model: change score on treatment only.
ols = smf.ols(formula="SBP_Change ~ Treatment", data=df).fit()
print(ols.summary())

# Adjusted model: adds age, sex and smoking status as covariates.
ancova = smf.ols(
    formula="SBP_Change ~ Treatment + Age + C(Sex) + C(SmokingStatus)",
    data=df,
).fit()
print(ancova.summary())

### Bayesian Regression (SBP Change)
Posterior on treatment effect with weakly-informative priors; age standardised.

In [None]:
# Design matrix: dummy-code the categorical columns (first level dropped)
# and standardise age.
df_enc = pd.get_dummies(df[["SBP_Change","Age","Sex","SmokingStatus","Group"]], drop_first=True)
df_enc["Age_std"] = (df_enc["Age"] - df_enc["Age"].mean())/df_enc["Age"].std()
X = df_enc[["Age_std","Sex_Male","SmokingStatus_Smoker","Group_Hipponol"]].astype(float).values
y = df_enc["SBP_Change"].astype(float).values

with pm.Model() as blm:
    # Keep the handles returned by pm.Data and use them in the model.
    # pm.get_data() is PyMC's loader for packaged example files, not a
    # lookup of the model's named data containers, so it must not be used here.
    X_data = pm.Data("X", X)
    y_data = pm.Data("y", y)

    intercept = pm.Normal("Intercept", 0, 5)
    beta = pm.Normal("Beta", 0, 2, shape=X.shape[1])  # one coefficient per column of X
    sigma = pm.HalfNormal("Sigma", 5)

    mu = intercept + pm.math.dot(X_data, beta)
    pm.Normal("y_obs", mu=mu, sigma=sigma, observed=y_data)
    trace_lm = pm.sample(2000, tune=1000, target_accept=0.9,
                         return_inferencedata=True, random_seed=RANDOM_SEED)

# Label the coefficient dimension with the column names of X (same order)
# so plots and summaries show names instead of integer positions.
names = ["Age_std","Sex_Male","SmokingStatus_Smoker","Group_Hipponol"]
trace_lm.posterior = trace_lm.posterior.rename({"Beta_dim_0":"coef"})
trace_lm.posterior = trace_lm.posterior.assign_coords({"coef": names})
az.plot_forest(trace_lm, var_names=["Beta"], combined=True);
plt.title("Posterior 95% HDIs for coefficients"); plt.show()
print(az.summary(trace_lm, var_names=["Intercept","Beta","Sigma"], hdi_prob=0.95))

## Summary (for the simulated data)
- **Survival**: Hipponol typically improves 2-year survival (logistic OR>1; KM separation; Cox HR<1).  
- **SBP**: Hipponol lowers SBP on average; the effect is evident by t-test/OLS/ANCOVA and confirmed under the Bayesian model.

### Notes
- Missing data: example uses listwise deletion.  
- Diagnostics to consider in real studies: influential points (OLS), residual checks, PH tests (Cox), prior sensitivity (Bayes).