In [6]:
# Simulated Smoking Cessation Cohort
# Author: Hayden Hedman
# Project: simulated-smoking-cessation-cohort
# Description: Generate a synthetic dataset simulating retrospective smoking cessation behavior
# Data distributions informed by Leventhal et al. (2022), *JNCI*, 114(3), 381–390. doi: 10.1093/jnci/djab208
# --------------------------------------------------------------------------------------------------------------------------------
# load libraries
import os
import pandas as pd
import numpy as np
from pathlib import Path

In [12]:
# Fix the state of NumPy's global random generator so reruns draw the same values
np.random.seed(64)

# Project folders: ROOT is one level above the current working directory,
# and DATA_DIR sits directly under it (created on demand, parents included)
ROOT = Path(".").resolve().parent
DATA_DIR = ROOT.joinpath("data")
DATA_DIR.mkdir(exist_ok=True, parents=True)

# Number of simulated cohort members
n = 10000

# Covariate distributions: each dict maps a category label to its sampling weight.
# Key order is the order in which categories are handed to the sampler.

# Educational attainment (weights total 0.999)
education_dist = {
    "<HS":          0.154,
    "HS Grad":      0.380,
    "Some College": 0.220,
    "Associate":    0.105,
    "Bachelor":     0.106,
    "Graduate":     0.034,
}

# Income bracket
income_dist = {
    "<25k":    0.255,
    "25k-49k": 0.356,
    "50k-74k": 0.179,
    "75k-99k": 0.093,
    "100k+":   0.117,
}

# Disability status
disability_dist = {"No": 0.812, "Yes": 0.188}

# Employment status (weights total 1.001)
employment_dist = {
    "Full-time":        0.496,
    "Part-time":        0.104,
    "Unemployed":       0.059,
    "Not in workforce": 0.342,
}

# Metro residence
metro_dist = {"Yes": 0.805, "No": 0.195}

In [14]:
def sample_from_distribution(distribution, size):
    """Randomly draw category labels according to a weighted distribution.

    Parameters
    ----------
    distribution : dict
        Maps each category label to a non-negative weight. The weights do
        not have to sum to exactly 1; they are rescaled so that they do.
    size : int or tuple of ints
        Output shape, forwarded unchanged to ``np.random.choice``.

    Returns
    -------
    numpy.ndarray
        The sampled category labels.

    Raises
    ------
    ValueError
        If any weight is negative or non-finite, or if the weights do not
        add up to a positive total.

    Notes
    -----
    Draws come from NumPy's global random state, so seed it with
    ``np.random.seed`` beforehand when reproducible output is needed.
    """
    categories = list(distribution.keys())
    probs = np.asarray(list(distribution.values()), dtype=float)

    # Validate before dividing: an all-zero distribution would otherwise turn
    # into NaNs (with a RuntimeWarning) and fail later with an obscure message.
    # An empty distribution is passed through so np.random.choice handles it.
    if probs.size:
        if not np.all(np.isfinite(probs)) or np.any(probs < 0):
            raise ValueError(
                "distribution weights must be finite and non-negative"
            )
        total = probs.sum()
        if total <= 0:
            raise ValueError("distribution weights must sum to a positive value")
        probs = probs / total  # Rescale so the probabilities sum to 1.0

    return np.random.choice(categories, size=size, p=probs)

# Assemble the cohort one column at a time. Every draw below consumes NumPy's
# global random state, so the column order must stay fixed to reproduce the data.
cohort_columns = {"id": np.arange(1, n + 1)}

# Categorical covariates, each sampled from its weighted distribution
for column_name, column_dist in (
    ("education", education_dist),
    ("income", income_dist),
    ("disability", disability_dist),
    ("employment", employment_dist),
    ("metro", metro_dist),
):
    cohort_columns[column_name] = sample_from_distribution(column_dist, n)

# Age: normal draw (mean 45, sd 13) rounded to whole years.
# NOTE(review): values are not clipped, so very low or high ages can occur — confirm this is intended.
simulated_age = np.random.normal(loc=45, scale=13, size=n)
cohort_columns["age"] = simulated_age.round(0).astype(int)

# Sex: two equally likely categories
cohort_columns["sex"] = np.random.choice(["Male", "Female"], size=n, p=[0.5, 0.5])

# Race/ethnicity: four categories with fixed probabilities
cohort_columns["race_ethnicity"] = np.random.choice(
    ["White", "Black", "Hispanic", "Other"],
    size=n,
    p=[0.6, 0.15, 0.15, 0.1],
)

df = pd.DataFrame(cohort_columns)

In [15]:
# Write the simulated (raw) cohort to the data folder as CSV, without the index column
output_path = DATA_DIR / "raw_simulated_smoking_cessation_cohort.csv"
df.to_csv(output_path, index=False)
# Status message names the file as the raw dataset, matching the file name
print(f"Saved raw simulated dataset to: {output_path}")

Confimed simulated processeddataset to: C:\Users\hayde\Desktop\simulated-smoking-cessation-cohort\data\raw_simulated_smoking_cessation_cohort.csv
