In [2]:
import pandas as pd
import re
import hashlib
from datetime import datetime

# ========================
# ðŸ”¹ Helper Functions
# ========================

def anonymize_id(val):
    """Hash identifiers to anonymize PHI"""
    if pd.isna(val):
        return None
    return hashlib.sha256(str(val).encode()).hexdigest()[:12]  # short surrogate

def standardize_date(val):
    """Convert date/time to ISO format"""
    if pd.isna(val):
        return None
    try:
        return pd.to_datetime(val, errors="coerce").strftime("%Y-%m-%d")
    except:
        return None

def remove_phi_from_text(text):
    """Remove PHI patterns from TEXT field"""
    if pd.isna(text):
        return ""

    # Regex patterns for PHI-like data
    text = re.sub(r"\b\d{3}-\d{2}-\d{4}\b", "[SSN]", text)  # SSN
    text = re.sub(r"\b\d{10}\b", "[PHONE]", text)            # 10-digit phone
    text = re.sub(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b", "[EMAIL]", text)  # email
    text = re.sub(r"\b\d{2}/\d{2}/\d{4}\b", "[DATE]", text)  # dates
    text = re.sub(r"\s+", " ", text).strip()  # normalize whitespace
    return text

def chunk_text(text, chunk_size=1024, overlap=100):
    """Split long text into ~1KB chunks with slight overlap"""
    if not text:
        return []
    tokens = text.split()
    chunks, start = [], 0
    while start < len(tokens):
        end = start + chunk_size // 6  # approx 6 chars per token
        chunk = " ".join(tokens[start:end])
        chunks.append(chunk)
        start = end - (overlap // 6)
    return chunks

# ========================
# ðŸ”¹ Load and Clean Data
# ========================

df = pd.read_csv("/content/ehr_records.csv")

# 1. Deduplication
df.drop_duplicates(inplace=True)

# 2. Anonymize identifiers
df["SUBJECT_ID"] = df["SUBJECT_ID"].apply(anonymize_id)
df["HADM_ID"] = df["HADM_ID"].apply(anonymize_id)
df["CGID"] = df["CGID"].apply(anonymize_id)

# 3. Standardize date fields
for col in ["CHARTDATE", "CHARTTIME", "STORETIME"]:
    if col in df.columns:
        df[col] = df[col].apply(standardize_date)

# 4. Normalize CATEGORY & DESCRIPTION
df["CATEGORY"] = df["CATEGORY"].str.strip().str.title()
df["DESCRIPTION"] = df["DESCRIPTION"].astype(str).str.strip()

# 5. Clean TEXT (PHI removal + whitespace normalization)
df["TEXT"] = df["TEXT"].apply(remove_phi_from_text)

# 6. Validate JSON-like fields
for col in ["diagnoses", "procedures", "cpt_codes"]:
    if col in df.columns:
        df[col] = df[col].fillna("").astype(str).str.replace("'", '"')

# 7. Chunk long notes
expanded_rows = []
for _, row in df.iterrows():
    chunks = chunk_text(row["TEXT"])
    if not chunks:
        expanded_rows.append(row.to_dict())
    else:
        for i, chunk in enumerate(chunks):
            r = row.to_dict()
            r["TEXT"] = chunk
            r["chunk_id"] = i
            expanded_rows.append(r)

df_cleaned = pd.DataFrame(expanded_rows)

# ========================
# ðŸ”¹ Save Cleaned Data
# ========================

df_cleaned.to_csv("cleaned_records.csv", index=False)
print(f"âœ… Cleaned dataset saved: {df_cleaned.shape[0]} rows")


âœ… Cleaned dataset saved: 4709 rows
