In [1]:
# -------------------------------------------------------------------
# GLP-1 NLP Pipeline Demo
# Author: Hayden Hedman
# Date: 2024-11-10
# -------------------------------------------------------------------
#Purpose:
#Demonstrate basic ingestion, cleaning, and integration of unstructured clinical text and structured EHR-style data for GLP-1–related analysis.
# -------------------------------------------------------------------
#Inputs:
#Synthetic data generated by `scripts/generate_synthetic_glp1_data.py`
# -------------------------------------------------------------------
#Outputs:
#Summary tables and a publication-quality time series figure

### Notes
#This demo intentionally focuses on lightweight data integration and summary analysis.
#NLP extraction is limited to simple pattern-based signals to reflect common production pipelines.


In [2]:
# Load libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
import sys

# -------------------------------------------------------------------
# Canonical project root resolution (DO NOT DUPLICATE ANYWHERE ELSE)
# -------------------------------------------------------------------
def find_project_root(start: Path) -> Path:
    """Walk upward from ``start`` to the nearest directory with the project layout.

    A directory qualifies as the project root when it contains both a
    ``data`` and a ``figures`` sub-directory.

    Args:
        start: Directory to begin searching from. Relative paths are anchored
            to the current working directory before the walk.

    Returns:
        ``start`` itself or its closest ancestor that holds both marker
        directories.

    Raises:
        RuntimeError: If no directory up to the filesystem root qualifies.
    """
    # A relative path such as Path(".") has no parents, so anchor it first.
    # absolute() is used instead of resolve() so symlinks and mapped drives
    # are reported exactly as the user sees them.
    start = Path(start).absolute()
    for candidate in (start, *start.parents):
        # is_dir() rather than exists(): a stray *file* named "data" or
        # "figures" must not be mistaken for the project layout.
        if (candidate / "data").is_dir() and (candidate / "figures").is_dir():
            return candidate
    raise RuntimeError("Project root not found")

# Resolve the project layout once, starting from the notebook's working directory.
PROJECT_ROOT = find_project_root(Path.cwd())
DATA_DIR = PROJECT_ROOT / "data"
FIG_DIR = PROJECT_ROOT / "figures"

# Echo the resolved locations so a wrong working directory is obvious in the output.
print("PROJECT_ROOT =", PROJECT_ROOT)
print("DATA_DIR     =", DATA_DIR)
print("FIG_DIR      =", FIG_DIR)

# HARD FAIL if something is wrong
# Compare whole path components instead of the raw string: a substring test
# would also trip on unrelated folder names that merely contain "scripts"
# (for example "transcripts").
if "scripts" in DATA_DIR.parts:
    sys.exit("Error: DATA_DIR incorrectly points inside /scripts — STOP")


PROJECT_ROOT = G:\My Drive\Forge\Personal Data Science Portfolio\Healthcare Analytics\glp1_nlp_demo
DATA_DIR     = G:\My Drive\Forge\Personal Data Science Portfolio\Healthcare Analytics\glp1_nlp_demo\data
FIG_DIR      = G:\My Drive\Forge\Personal Data Science Portfolio\Healthcare Analytics\glp1_nlp_demo\figures


In [3]:
# Read every input table from DATA_DIR (the files on disk are the source of truth).
# Structured EHR-style tables are comma-separated CSVs.
labels = pd.read_csv(DATA_DIR / "labels.csv")
dx = pd.read_csv(
    DATA_DIR / "diagnoses_icd.csv",
    parse_dates=["diagnosis_date"],
)
rx = pd.read_csv(
    DATA_DIR / "prescriptions.csv",
    parse_dates=["start_date"],
)
# The clinical notes file is pipe-delimited rather than comma-delimited.
notes = pd.read_csv(
    DATA_DIR / "clinical_notes.txt",
    sep="|",
    parse_dates=["note_date"],
)

print("[INFO] Loaded data")


[INFO] Loaded data


In [4]:
# cleaning before merge
# Keep a lower-cased copy of the note text so the keyword search below is
# case-insensitive without altering the original column.
notes["note_text_lower"] = notes["note_text"].str.lower()

# very basic NLP signal: flag notes that mention any of the GI keywords.
# na=False makes a missing note count as "no mention"; without it a missing
# note propagates as NaN and the flag stops being a clean boolean column,
# which distorts the downstream mean.
notes["mentions_gi"] = notes["note_text_lower"].str.contains(
    "nausea|gastrointestinal|gi intolerance",
    regex=True,
    na=False,
)


In [5]:
# Merge datasets (patient-level integration)
# Left joins keep every note / prescription row even when a patient has no
# label row. validate="many_to_one" makes pandas raise a MergeError if
# `labels` ever contains more than one row per patient_id, instead of
# silently duplicating notes and prescriptions and biasing later summaries.
notes_merged = notes.merge(
    labels,
    on="patient_id",
    how="left",
    validate="many_to_one",
)

rx_merged = rx.merge(
    labels,
    on="patient_id",
    how="left",
    validate="many_to_one",
)


In [6]:
# Table 1: Cohort summary
# The mean of each flag column is the share of label rows where the flag is set.
cohort_summary = labels.agg(
    dict.fromkeys(["has_diabetes", "glp1_exposed"], "mean")
)
cohort_summary = cohort_summary.rename("proportion")

cohort_summary


has_diabetes    0.825000
glp1_exposed    0.678125
Name: proportion, dtype: float64

In [7]:
# Table 2: GI symptom mentions by diabetes status
# Mean of the mentions_gi flag within each has_diabetes group, returned as a
# flat two-column frame (group key + rate).
gi_by_diabetes = notes_merged.groupby("has_diabetes", as_index=False)["mentions_gi"].mean()

gi_by_diabetes


Unnamed: 0,has_diabetes,mentions_gi
0,False,0.424779
1,True,0.499051


In [8]:
# Aggregate data for time series plot
# Count prescription rows per calendar week, keyed by the week's start timestamp.
# Note: the plotting cell further down rebinds `rx_ts` with its own aggregation.
rx_ts = (
    rx
    .groupby(rx["start_date"].dt.to_period("W").dt.start_time.rename("week"))
    .size()
    .reset_index(name="n_initiations")
)


In [9]:
# Plot time series glp1 first prescriptions
# -------------------------------------------------------------------
# Re-read from disk so the figure always reflects the file contents.
rx = pd.read_csv(
    DATA_DIR / "prescriptions.csv",
    parse_dates=["start_date"]
)


def _biweekly_period_start(dates: pd.Series) -> pd.Series:
    """Map each date to the start of its 14-day bin.

    Bins are consecutive two-week windows anchored on the start of the
    earliest calendar week present in ``dates``.

    Args:
        dates: Datetime-typed Series of event dates.

    Returns:
        Series (same index as ``dates``) holding the bin start timestamp for
        each row.
    """
    week_start = dates.dt.to_period("W").dt.start_time
    if week_start.empty:
        # Nothing to bin; return the (empty) datetime Series unchanged.
        return week_start
    origin = week_start.min()
    elapsed_days = (week_start - origin).dt.days
    # Floor the offset from the first week to a multiple of 14 days.
    return origin + pd.to_timedelta((elapsed_days // 14) * 14, unit="D")


# -------------------------------------------------------------------
# Bi-weekly aggregation
# -------------------------------------------------------------------
# dt.to_period("2W").dt.start_time does NOT produce 14-day bins: every
# calendar week still gets its own period start, so grouping on it yields
# weekly counts. The helper above bins into true two-week windows.
rx_ts = (
    rx
    .dropna(subset=["start_date"])
    .assign(period=lambda d: _biweekly_period_start(d["start_date"]))
    .groupby("period")
    .size()
    .reset_index(name="n_glp1_initiations")
)

# -------------------------------------------------------------------
# Plot (publication-quality bar chart)
# -------------------------------------------------------------------
fig, ax = plt.subplots(figsize=(7, 4))

# Bars are labelled with the start date of each two-week window.
ax.bar(
    rx_ts["period"].dt.strftime("%b %d"),
    rx_ts["n_glp1_initiations"]
)

# NOTE(review): the date range in the title is hard-coded; confirm it matches
# the data whenever the synthetic inputs are regenerated.
ax.set_title("Bi-Weekly GLP-1 Prescription Initiations (Oct–Nov 2022)")
ax.set_xlabel("Bi-Weekly Period")
ax.set_ylabel("Number of GLP-1 Initiations")

fig.tight_layout()

out_path = FIG_DIR / "glp1_biweekly_initiations.png"
fig.savefig(out_path, dpi=300)
# Close the figure explicitly so it is not also rendered inline or kept in memory.
plt.close(fig)

print(f"[INFO] Saved figure to {out_path}")


[INFO] Saved figure to G:\My Drive\Forge\Personal Data Science Portfolio\Healthcare Analytics\glp1_nlp_demo\figures\glp1_biweekly_initiations.png
