In [1]:
# 02_prepare_dataset.ipynb

import pandas as pd
import numpy as np
from pathlib import Path

# Project layout: the notebook is expected to run from a sub-folder of the
# project, so the project root is the parent of the current working directory.
ROOT = Path().resolve().parent
DATA_DIR = ROOT.joinpath("data")

# Read the raw simulated cohort and report how many records were loaded.
raw_csv_path = DATA_DIR.joinpath("raw_simulated_smoking_cessation_cohort.csv")
df = pd.read_csv(raw_csv_path)
print(f"Loaded {len(df)} rows.")

Loaded 10000 rows.
Remaining after age filter: 9839
Saved cleaned dataset to: C:\Users\hayde\Desktop\simulated-smoking-cessation-cohort\data\processed_simulated_smoking_cessation_cohort.csv


In [None]:
# ----------------------------------------------------------------------------
# Data cleaning
# ----------------------------------------------------------------------------
# Drop records with implausible ages: keep 18 <= age <= 90 (`between` is
# inclusive on both ends by default; rows with a missing age are dropped too).
# `.copy()` detaches the filtered frame from the one it was sliced from, so the
# column assignments below write to an independent frame instead of triggering
# pandas' SettingWithCopyWarning.
df = df[df["age"].between(18, 90)].copy()
print(f"Remaining after age filter: {df.shape[0]}")

# --- Encode variables ---
# Binary encoding for sex: 1 where sex == "Female", 0 for every other value
# (including missing values).
df["sex_female"] = (df["sex"] == "Female").astype(int)

# Ordinal education encoding (lowest attainment = 0).
# NOTE: `.map` leaves NaN for any label that is not a key of the mapping.
education_order = {
    "<HS": 0,
    "HS Grad": 1,
    "Some College": 2,
    "Associate": 3,
    "Bachelor": 4,
    "Graduate": 5
}
df["education_code"] = df["education"].map(education_order)

# Ordinal income encoding (lowest bracket = 0); unlisted labels become NaN.
income_order = {
    "<25k": 0,
    "25k-49k": 1,
    "50k-74k": 2,
    "75k-99k": 3,
    "100k+": 4
}
df["income_code"] = df["income"].map(income_order)


In [None]:
# Race/ethnicity one-hot encoding: one `race_<category>` column per value.
race_dummies = pd.get_dummies(df["race_ethnicity"], prefix="race")
# Drop any dummy columns left over from a previous run of this cell before
# concatenating, so re-running the cell does not create duplicate columns.
df = pd.concat(
    [df.drop(columns=race_dummies.columns, errors="ignore"), race_dummies],
    axis=1,
)

# A locally seeded generator makes the simulated columns reproducible: the
# same input frame always yields the same draws, on a fresh kernel or when
# this cell is re-run on its own. Change the seed to get a different draw.
rng = np.random.default_rng(42)

# ----------------------------------------------------------------------------
# Simulate smoking baseline status
# ----------------------------------------------------------------------------
# ~20% of general US adults smoke (adjust if needed)
# Each row is an independent Bernoulli(0.2) draw: 1 = smoker, 0 = non-smoker.
df["baseline_smoker"] = rng.binomial(n=1, p=0.2, size=df.shape[0])

# ----------------------------------------------------------------------------
# Simulate intervention group
# ----------------------------------------------------------------------------
# Assign each participant independently with probability 0.5 per arm, so the
# two groups are approximately (not exactly) equal in size.
df["intervention_group"] = rng.choice(
    ["Usual Care", "Cessation Program"],
    size=df.shape[0],
    p=[0.5, 0.5],
)


In [2]:
# ----------------------------------------------------------------------------
# Save processed data
# ----------------------------------------------------------------------------
# Write the cleaned/encoded frame next to the raw file; the row index is not
# written to the CSV.
output_path = DATA_DIR.joinpath("processed_simulated_smoking_cessation_cohort.csv")
df.to_csv(path_or_buf=output_path, index=False)
print(f"Saved cleaned dataset to: {output_path}")

Saved cleaned dataset to: C:\Users\hayde\Desktop\simulated-smoking-cessation-cohort\data\processed_simulated_smoking_cessation_cohort.csv
