In [5]:
# Load libraries
import pandas as pd
import numpy as np
from pathlib import Path
from scipy.special import expit
import statsmodels.api as sm

Loaded 9839 records.
Confirmed: final cohort with quit outcome saved to: C:\Users\hayde\Desktop\simulated-smoking-cessation-cohort\data\final_simulated_smoking_cessation_cohort.csv
                         Results: Logit
Model:              Logit            Method:           MLE       
Dependent Variable: quit_successful  Pseudo R-squared: 0.047     
Date:               2025-08-15 20:35 AIC:              2369.2455 
No. Observations:   1968             BIC:              2397.1694 
Df Model:           4                Log-Likelihood:   -1179.6   
Df Residuals:       1963             LL-Null:          -1237.6   
Converged:          1.0000           LLR p-value:      3.9159e-24
No. Iterations:     5.0000           Scale:            1.0000    
-----------------------------------------------------------------
                   Coef.  Std.Err.    z    P>|z|   [0.025  0.975]
-----------------------------------------------------------------
const             -1.2455   0.2160 -5.7654 0.0000 -1.

In [None]:
# Project directories, derived from the parent of the current working directory
ROOT = Path(".").resolve().parent
DATA_DIR = ROOT.joinpath("data")
RESULTS_DIR = ROOT.joinpath("results")
RESULTS_DIR.mkdir(exist_ok=True)  # no-op when the folder already exists

# Read the processed cohort CSV into memory and report its row count
processed_csv = DATA_DIR.joinpath("processed_simulated_smoking_cessation_cohort.csv")
df = pd.read_csv(processed_csv)
print(f"Loaded {len(df)} records.")

In [None]:
# Only simulate outcome for baseline smokers
df["eligible_for_quit"] = df["baseline_smoker"] == 1

# Seeded generator local to this cell: re-running the cell (or the whole
# notebook) reproduces exactly the same noise, and therefore the same
# quit probabilities. The seed value itself is arbitrary.
NOISE_SEED = 42
noise_rng = np.random.default_rng(NOISE_SEED)

# Base log-odds of quitting
log_odds = -1.8  # expit(-1.8) ≈ 0.14, i.e. ~14% base quit rate

# Boost from intervention (boolean mask * 0.8 -> +0.8 for program participants)
log_odds += (df["intervention_group"] == "Cessation Program") * 0.8

# Boost from education (scaled 0–5)
log_odds += df["education_code"] * 0.2

# Boost from income (scaled 0–4)
log_odds += df["income_code"] * 0.2

# Age adjustment: +0.3 for under-30s, -0.3 for over-65s, 0 otherwise
log_odds += np.where(df["age"] < 30, 0.3, 0)
log_odds += np.where(df["age"] > 65, -0.3, 0)

# Add per-record Gaussian noise (mean 0, sd 0.5) on the log-odds scale
log_odds += noise_rng.normal(0, 0.5, size=df.shape[0])

# Convert to probability (logistic)
quit_prob = expit(log_odds)

In [None]:
# Simulate quit outcome only for smokers.
# The mask is recomputed from baseline_smoker (the same condition that defines
# the eligible_for_quit helper column) so this cell still works when it is
# re-run after the helper column has already been dropped.
smoker_mask = df["baseline_smoker"] == 1

# Seeded generator local to this cell so the simulated outcomes, and the CSV
# written below, are identical on every run. The seed value is arbitrary.
OUTCOME_SEED = 2025
outcome_rng = np.random.default_rng(OUTCOME_SEED)

# Non-smokers keep 0; each smoker gets one Bernoulli draw with their own
# quit probability.
df["quit_successful"] = 0
df.loc[smoker_mask, "quit_successful"] = outcome_rng.binomial(
    1, np.asarray(quit_prob[smoker_mask])
)

# Drop helper column (errors="ignore" tolerates it being gone on a re-run)
df = df.drop(columns=["eligible_for_quit"], errors="ignore")

# Save final simulated dataset
output_path = DATA_DIR / "final_simulated_smoking_cessation_cohort.csv"
df.to_csv(output_path, index=False)
print(f"Confirmed: final cohort with quit outcome saved to: {output_path}")

In [None]:
# -------------------------------------------------------------------------
# Model: Predict quit_successful among baseline smokers 
# -------------------------------------------------------------------------
# Restrict to baseline smokers; .copy() keeps later edits off the main frame.
model_df = df[df["baseline_smoker"] == 1].copy()

# Define predictors: three numeric columns plus a 0/1 intervention indicator
X = model_df[["education_code", "income_code", "age"]].copy()
X["intervention"] = (model_df["intervention_group"] == "Cessation Program").astype(int)
X = sm.add_constant(X)  # adds the intercept column ("const")

# Define outcome
y = model_df["quit_successful"]

# Fit logistic regression (disp=False silences the optimizer's console output)
logit_model = sm.Logit(y, X).fit(disp=False)

# Render the summary once so the printed and saved text are guaranteed to
# match (each summary2() call stamps the current date/time).
summary_text = logit_model.summary2().as_text()

# Print summary in notebook
print(summary_text)

# Save summary to file; explicit UTF-8 avoids depending on the platform's
# default encoding.
summary_path = RESULTS_DIR / "model_summary.txt"
summary_path.write_text(summary_text, encoding="utf-8")

print(f"Model summary saved to: {summary_path}")