In [12]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Draw a boxplot of age for each education level, label every box with its
# median age, and save the figure as a PNG under <project root>/figures.

# --- Dynamically resolve paths ---
# If the working directory is named "notebooks", its parent is used as the
# project root; otherwise the working directory itself is the root.
CURRENT_DIR = Path().resolve()
ROOT_DIR = CURRENT_DIR.parent if CURRENT_DIR.name == "notebooks" else CURRENT_DIR
DATA_DIR = ROOT_DIR / "data"
FIGURE_DIR = ROOT_DIR / "figures"
FIGURE_DIR.mkdir(parents=True, exist_ok=True)

# --- Load the processed dataset ---
df = pd.read_csv(DATA_DIR / "processed_simulated_smoking_cessation_cohort.csv")

# --- Set categorical order for education ---
# NOTE: any value not listed in education_order becomes NaN in the
# categorical column, so those rows drop out of the plot and the medians.
education_order = ["<HS", "HS Grad", "Some College", "Associate", "Bachelor", "Graduate"]
df["education"] = pd.Categorical(df["education"], categories=education_order, ordered=True)

# --- Initialize the plot ---
# Apply the seaborn theme before the axes exist so they are created with it.
sns.set(style="whitegrid")
fig, ax = plt.subplots(figsize=(10, 6))

# --- Create boxplot ---
sns.boxplot(
    x="education",
    y="age",
    hue="education",
    data=df,
    legend=False,
    palette="viridis",
    ax=ax,
)

# --- Annotate median values on each box ---
# One grouped pass instead of re-filtering the frame for every level;
# observed=False keeps a (NaN) entry for levels that have no rows.
median_age_by_level = df.groupby("education", observed=False)["age"].median()
for position, level in enumerate(education_order):
    median = median_age_by_level.loc[level]
    if pd.isna(median):
        # No data for this level: there is no box to label, and a "nan"
        # label at a non-finite y position would be meaningless.
        continue
    ax.text(
        position,
        median,
        f"{median:.1f}",
        ha="center",
        va="center",
        color="black",
        fontsize=10,
        fontweight="bold",
    )

# --- Format plot ---
ax.set_title("Age Distribution by Education", fontsize=14)
ax.set_xlabel("Education", fontsize=12)
ax.set_ylabel("Age", fontsize=12)
ax.tick_params(axis="x", labelrotation=0)
fig.tight_layout()

# --- Save the figure ---
plot_path = FIGURE_DIR / "age_by_education_boxplot.png"
fig.savefig(plot_path, dpi=300, bbox_inches="tight")
# Close this specific figure rather than whatever pyplot considers "current".
plt.close(fig)

print(f"✅ Saved boxplot to: {plot_path}")


✅ Saved boxplot to: C:\Users\hayde\Desktop\simulated-smoking-cessation-cohort\figures\age_by_education_boxplot.png
