In [6]:
# Celda 0 — Extracción global (regex) + muestreo de alta confianza + GT final para IMC
# - Recorre TODO PMC-Patients.csv
# - Clasifica: alta confianza / dudosos / descartados
# - Toma un sample (50) SOLO de alta confianza (seed=42)
# - Genera:
#     gt_imc_auto_confident_full.csv    (todas las filas seguras de TODO el dataset, con evidencias)
#     gt_imc_auto_doubtful_full.csv     (todas las dudosas)
#     gt_imc_auto_discarded_full.csv    (todas las descartadas)
#     sample_notes_imc.csv              (notas SOLO del sample seguro)
#     sample_notes_imc.jsonl            (lo mismo en JSONL)
#     gt_imc_final.csv                  (GT limpio SOLO del sample seguro)
#     gt_imc_confident_sample.csv       (seguras del sample con evidencias)

import re, json
from pathlib import Path
import pandas as pd
import numpy as np

# ---------- Rutas y parámetros ----------
DATA_CSV   = r"C:\Users\hered\Desktop\TFM\TFM\data\PMC-Patients.csv"
OUTPUT_DIR = Path(r"C:\Users\hered\Desktop\TFM\TFM\TFM2")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

SAMPLE_SIZE = 50
SEED = 42
MAX_CHAR_DISTANCE = 300  # cercanía entre altura y peso para considerarlo consistente

# Salidas "FULL" (todo el dataset)
CONF_FULL = OUTPUT_DIR / "gt_imc_auto_confident_full.csv"
DOUBT_FULL = OUTPUT_DIR / "gt_imc_auto_doubtful_full.csv"
DROP_FULL = OUTPUT_DIR / "gt_imc_auto_discarded_full.csv"

# Salidas del SAMPLE seguro
SAMPLE_NOTES_CSV   = OUTPUT_DIR / "sample_notes_imc.csv"
SAMPLE_NOTES_JSONL = OUTPUT_DIR / "sample_notes_imc.jsonl"
GT_FINAL_CSV       = OUTPUT_DIR / "gt_imc_final.csv"
CONF_SAMPLE        = OUTPUT_DIR / "gt_imc_confident_sample.csv"

# ---------- Carga robusta ----------
df = pd.read_csv(
    DATA_CSV,
    dtype={"patient_id": str},
    encoding="utf-8",
    engine="python",
    on_bad_lines="skip"
)

required_cols = {"patient_id", "patient"}
missing = required_cols - set(df.columns)
if missing:
    raise ValueError(f"Faltan columnas requeridas: {missing}. Columnas disponibles: {list(df.columns)}")

df = (
    df[["patient_id", "patient"]]
    .dropna(subset=["patient_id", "patient"])
    .assign(patient=lambda x: x["patient"].astype(str).str.replace("\u2009"," ").str.replace("\u00A0"," ").str.strip())
    .query("patient.str.len() > 0", engine="python")
)

# ---------- Parámetros y patrones ----------
# Rangos (adultos)
H_MIN, H_MAX = 1.2, 2.2      # metros
W_MIN, W_MAX = 30.0, 300.0   # kg
BMI_MIN, BMI_MAX = 10.0, 80.0

CM_RE   = re.compile(r"\b(?P<val>\d{2,3}(?:\.\d+)?)\s*cm\b", re.I)
M_RE    = re.compile(r"\b(?P<val>\d(?:\.\d{1,3})?)\s*m\b", re.I)
FTIN_RE = re.compile(r"\b(?P<ft>[4-7])\s*(?:ft|feet|')\s*(?P<inch>\d{1,2})?\s*(?:in|inches|\")?\b", re.I)

KG_RE   = re.compile(r"\b(?P<val>\d{2,3}(?:\.\d+)?)\s*kg\b", re.I)
LB_RE   = re.compile(r"\b(?P<val>\d{2,3}(?:\.\d+)?)\s*(?:lb|lbs|pounds?)\b", re.I)

BMI_RE  = re.compile(r"\b(?:bmi|body\s*mass\s*index)\s*[:=]?\s*(?P<val>\d{1,2}(?:\.\d{1,2})?)\b", re.I)

LAB_UNITS_RE = re.compile(r"\b(mg/dl|µ?mol/?l|g/dl|mmol/?l|ng/ml|pg/ml|u/l|iu/l)\b", re.I)
BLACKLIST = {"mass","lesion","tumor","nodule","cyst","defect","stone","specimen"}

def inches_to_m(ft:int, inch:int|None) -> float:
    total = ft*12 + (inch or 0)
    return total * 0.0254

def plausible_height(m: float|None) -> bool:
    return m is not None and H_MIN <= m <= H_MAX

def plausible_weight(kg: float|None) -> bool:
    return kg is not None and W_MIN <= kg <= W_MAX

def plausible_bmi(b: float|None) -> bool:
    return b is not None and BMI_MIN <= b <= BMI_MAX

def has_blacklist(snippet: str) -> bool:
    s = snippet.lower()
    return any(tok in s for tok in BLACKLIST)

def extract_heights(text: str):
    res = []
    for m in M_RE.finditer(text):
        try:
            val = float(m.group("val"))
            if plausible_height(val):
                res.append(("m", val, m.start(), m.end()))
        except: pass
    for m in CM_RE.finditer(text):
        try:
            cm = float(m.group("val"))
            val = cm/100.0
            if plausible_height(val):
                res.append(("cm", val, m.start(), m.end()))
        except: pass
    for m in FTIN_RE.finditer(text):
        try:
            ft = int(m.group("ft"))
            inch = m.group("inch")
            inch = int(inch) if inch is not None else 0
            val = round(inches_to_m(ft, inch), 3)
            if plausible_height(val):
                res.append(("ftin", val, m.start(), m.end()))
        except: pass
    return res

def extract_weights(text: str):
    res = []
    for m in KG_RE.finditer(text):
        try:
            kg = float(m.group("val"))
            if plausible_weight(kg):
                res.append(("kg", kg, m.start(), m.end()))
        except: pass
    for m in LB_RE.finditer(text):
        try:
            lb = float(m.group("val"))
            kg = round(lb * 0.45359237, 1)
            if plausible_weight(kg):
                res.append(("lb", kg, m.start(), m.end()))
        except: pass
    return res

def extract_bmis(text: str):
    res = []
    for m in BMI_RE.finditer(text):
        try:
            val = float(m.group("val"))
            if plausible_bmi(val):
                win = text[max(0,m.start()-15):m.end()+15]
                if not LAB_UNITS_RE.search(win):
                    res.append((val, m.start(), m.end()))
        except: pass
    return res

def nearest_pair(heights, weights):
    best = None
    best_dist = 1e12
    for h in heights:
        for w in weights:
            dist = min(abs(h[2]-w[2]), abs(h[3]-w[2]), abs(h[2]-w[3]), abs(h[3]-w[3]))
            if dist < best_dist:
                best_dist = dist
                best = (h, w, best_dist)
    return best

def same_paragraph(text, i, j):
    a, b = sorted([i, j])
    return text[a:b].count("\n") <= 1

def compute_bmi(h, w):
    try:
        return round(w/(h*h), 2)
    except: return None

# ---------- Extracción sobre TODO el dataset ----------
conf_rows, doubt_rows, drop_rows = [], [], []

for _, r in df.iterrows():
    pid = r["patient_id"]
    text = r["patient"]

    Hs = extract_heights(text)
    Ws = extract_weights(text)
    Bs = extract_bmis(text)

    # Filtrar por blacklist en ventana local
    Hs2 = []
    for tag, val, s, e in Hs:
        snip = text[max(0, s-40):e+40]
        if not has_blacklist(snip):
            Hs2.append((tag, val, s, e))
    Ws2 = []
    for tag, val, s, e in Ws:
        snip = text[max(0, s-40):e+40]
        if not has_blacklist(snip):
            Ws2.append((tag, val, s, e))

    if not Hs2 and not Ws2 and not Bs:
        drop_rows.append({"patient_id": pid, "reason": "no_matches", "text_preview": text[:200]})
        continue

    if Hs2 and Ws2:
        h, w, dist = nearest_pair(Hs2, Ws2)
        h_m, w_kg = h[1], w[1]
        bmi_calc = compute_bmi(h_m, w_kg)

        bmi_explicit = Bs[0][0] if Bs else None
        explicit_ok = (bmi_explicit is None) or (bmi_calc is not None and abs(bmi_calc - bmi_explicit) <= 0.5)
        close_enough = (dist <= MAX_CHAR_DISTANCE) or same_paragraph(text, h[2], w[2])

        if plausible_height(h_m) and plausible_weight(w_kg) and close_enough and explicit_ok and plausible_bmi(bmi_calc):
            conf_rows.append({
                "patient_id": pid,
                "height_m_true": round(h_m, 3),
                "weight_kg_true": round(w_kg, 1),
                "BMI_true": bmi_calc,
                "bmi_explicit_in_note": bool(Bs),
                "evidence_height": text[max(0, h[2]-60):h[3]+60],
                "evidence_weight": text[max(0, w[2]-60):w[3]+60],
                "evidence_bmi": (text[max(0, Bs[0][1]-60):Bs[0][2]+60] if Bs else None),
                "char_distance_hw": int(dist),
                "patient": text  # guardamos la nota completa para poder muestrear y exportar
            })
        else:
            doubt_rows.append({
                "patient_id": pid,
                "height_m_candidate": round(h_m,3),
                "weight_kg_candidate": round(w_kg,1),
                "BMI_calc": bmi_calc,
                "BMI_explicit": bmi_explicit,
                "close_enough": close_enough,
                "explicit_ok": explicit_ok,
                "char_distance_hw": int(dist),
                "text_preview": text[:300]
            })
    else:
        bmi_explicit = Bs[0][0] if Bs else None
        doubt_rows.append({
            "patient_id": pid,
            "height_m_candidate": (round(Hs2[0][1],3) if Hs2 else None),
            "weight_kg_candidate": (round(Ws2[0][1],1) if Ws2 else None),
            "BMI_calc": None,
            "BMI_explicit": bmi_explicit,
            "close_enough": False,
            "explicit_ok": False,
            "char_distance_hw": None,
            "text_preview": text[:300]
        })

# ---------- Guardar FULL ----------
pd.DataFrame(conf_rows).to_csv(CONF_FULL, index=False, encoding="utf-8")
pd.DataFrame(doubt_rows).to_csv(DOUBT_FULL, index=False, encoding="utf-8")
pd.DataFrame(drop_rows).to_csv(DROP_FULL, index=False, encoding="utf-8")

# ---------- Muestreo SOLO de alta confianza ----------
conf_df = pd.DataFrame(conf_rows)
if len(conf_df) == 0:
    raise RuntimeError("No se han encontrado filas de alta confianza. Revisa las reglas/umbrales.")

rng = np.random.default_rng(SEED)
take = min(SAMPLE_SIZE, len(conf_df))
sample_idx = rng.choice(conf_df.index, size=take, replace=False)
conf_sample = conf_df.loc[sample_idx].reset_index(drop=True)

# Notas del sample (para inferencia posterior)
notes_sample = conf_sample[["patient_id","patient"]].copy()
notes_sample.to_csv(SAMPLE_NOTES_CSV, index=False)
with open(SAMPLE_NOTES_JSONL, "w", encoding="utf-8") as f:
    for _, row in notes_sample.iterrows():
        f.write(json.dumps({"patient_id": row["patient_id"], "patient": row["patient"]}, ensure_ascii=False) + "\n")

# GT limpio SOLO del sample
gt_final = conf_sample[["patient_id","height_m_true","weight_kg_true","BMI_true","bmi_explicit_in_note"]].copy()
gt_final.to_csv(GT_FINAL_CSV, index=False, encoding="utf-8")

# También guardamos las seguras del sample con evidencias
conf_sample.to_csv(CONF_SAMPLE, index=False, encoding="utf-8")

# ---------- Informe ----------
print("✅ Extracción global completada.")
print(f"- FULL seguras     : {CONF_FULL}")
print(f"- FULL dudosas     : {DOUBT_FULL}")
print(f"- FULL descartadas : {DROP_FULL}")

print("\n✅ Muestreo de alta confianza:")
print(f"- Muestra notas    : {SAMPLE_NOTES_CSV}")
print(f"- Muestra JSONL    : {SAMPLE_NOTES_JSONL}")
print(f"- GT FINAL (sample): {GT_FINAL_CSV}")
print(f"- Seguras (sample) : {CONF_SAMPLE}")

print("\nResumen (FULL):")
print(f"  Seguras   : {len(conf_rows)}")
print(f"  Dudosas   : {len(doubt_rows)}")
print(f"  Descartes : {len(drop_rows)}")

print("\nResumen (sample):")
print(f"  Tamaño sample seguro: {len(conf_sample)} (seed={SEED})")


✅ Extracción global completada.
- FULL seguras     : C:\Users\hered\Desktop\TFM\TFM\TFM2\gt_imc_auto_confident_full.csv
- FULL dudosas     : C:\Users\hered\Desktop\TFM\TFM\TFM2\gt_imc_auto_doubtful_full.csv
- FULL descartadas : C:\Users\hered\Desktop\TFM\TFM\TFM2\gt_imc_auto_discarded_full.csv

✅ Muestreo de alta confianza:
- Muestra notas    : C:\Users\hered\Desktop\TFM\TFM\TFM2\sample_notes_imc.csv
- Muestra JSONL    : C:\Users\hered\Desktop\TFM\TFM\TFM2\sample_notes_imc.jsonl
- GT FINAL (sample): C:\Users\hered\Desktop\TFM\TFM\TFM2\gt_imc_final.csv
- Seguras (sample) : C:\Users\hered\Desktop\TFM\TFM\TFM2\gt_imc_confident_sample.csv

Resumen (FULL):
  Seguras   : 3712
  Dudosas   : 6308
  Descartes : 157014

Resumen (sample):
  Tamaño sample seguro: 50 (seed=42)
