In [1]:
import re
import json
import math
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm import tqdm

# --- PATH TO YOUR DATASET ---
# Input CSV; it is loaded with pandas further down in this cell.
DATA_CSV = r"C:/Users/hered/Desktop/TFM/TFM/data/PMC-Patients.csv"
# Output directory for the generated CSV files. It is created here (including
# missing parents) and the call is a no-op when the directory already exists.
OUT_DIR  = Path(r"C:/Users/hered/Desktop/TFM/TFM/IMC2")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# ================= Normalización de texto =================
def norm_text(s: str) -> str:
    """Return *s* with exotic spaces and hyphens normalized.

    Thin space (U+2009), no-break space (U+00A0), carriage return and tab
    become a plain space; the non-breaking hyphen (U+2011) becomes ``-``.
    Runs of two or more spaces are then collapsed to one. Newlines are left
    untouched. A falsy input (``None``, ``""``) yields ``""``.
    """
    text = s if s else ""
    # Single table-driven pass; none of the replacement characters is itself
    # a key of the table, so this matches a chain of independent replaces.
    substitutions = {
        0x2009: " ",
        0x00A0: " ",
        0x2011: "-",
        0x000D: " ",
        0x0009: " ",
    }
    text = text.translate(substitutions)
    # collapse repeated spaces
    return re.sub(r"[ \u00a0]{2,}", " ", text)

# ================= Conversión de unidades =================
def to_meters(val: float, unit: str, inches_extra: float|None=None):
    u = (unit or "").strip().lower()
    if u in ("m","meter","meters"):
        return val if 0.3 <= val <= 3.0 else None
    if u in ("cm","centimeter","centimeters"):
        return val/100.0
    if u in ("ft","foot","feet"):
        inches = inches_extra or 0.0
        return val*0.3048 + inches*0.0254
    if u in ("in","inch","inches","\""):
        return val*0.0254
    return None

def to_kg(val: float, unit: str, lbs_extra: float|None=None):
    u = (unit or "").strip().lower()
    if u in ("kg","kgs","kilogram","kilograms"):
        return val
    if u in ("lb","lbs","pound","pounds"):
        return val*0.453592 + (lbs_extra or 0.0)*0.453592
    if u in ("st","stone","stones"):
        # 1 stone = 14 lb
        total_lb = val*14 + (lbs_extra or 0.0)
        return total_lb*0.453592
    return None

# ================= Plausibilidad =================
def plausible_height_m(h):
    """Return True when *h* is not None and lies in the closed range 1.3-2.2."""
    if h is None:
        return False
    return 1.3 <= h <= 2.2

def plausible_weight_kg(w):
    """Return True when *w* is not None and lies in the closed range 30-250."""
    if w is None:
        return False
    return 30 <= w <= 250

# ================= Blacklist / Filtros de contexto =================
blacklist_terms = {
    # avoid confusing "mass" (tumor) with "body mass"
    "mass", "lesion", "tumor", "nodule", "cyst", "defect", "stone", "specimen",
    # also laboratory / drug wording
    "mg ", "mcg", "ml ", "dose", "dosage",
}

# Phrases that contain a blacklisted substring but are legitimate in a
# height/weight context: "body mass (index)" and a weight given in stones.
_BODY_MASS_RE = re.compile(r"\bbody[\s-]*mass\b")
_STONE_UNIT_RE = re.compile(r"\b\d+(?:\.\d+)?\s*stones?\b")


def has_blacklist(snippet: str) -> bool:
    """Return True when *snippet* contains a blacklisted term.

    Matching is a case-insensitive substring test against ``blacklist_terms``.
    Before testing, two benign phrases are blanked out so they no longer
    trigger the terms they happen to contain:

    * "body mass" / "body-mass" (would otherwise hit "mass"), which is the
      very distinction the comment on ``blacklist_terms`` asks for;
    * "<number> stone(s)" used as a weight unit (would otherwise hit "stone").

    A falsy snippet (``None`` or ``""``) is never blacklisted.
    """
    t = (snippet or "").lower()
    # Both patterns start with \b, so blanking them cannot glue a preceding
    # word onto the inserted space and fabricate a new "mg " / "ml " hit.
    t = _BODY_MASS_RE.sub(" ", t)
    t = _STONE_UNIT_RE.sub(" ", t)
    return any(b in t for b in blacklist_terms)

# ================== REGEX Altura/Peso robustos ==================
# -- Height --
# 1) feet-and-inches formats: 5'7", 5 ft 7 in, 5 feet 7 inches, 5’7”, 5′7″
#    Group 1 = feet, group 2 = inches. The inch marker is optional. Word
#    units need a trailing \b; symbol units (", ’’, '', ”, ″) must not have
#    one, because a quote followed by a space has no word boundary after it.
ft_in_1 = re.compile(
    r"(?i)\b(\d)\s*(?:ft|foot|feet|’|'|′)\s*(\d{1,2})"
    r"(?:\s*(?:(?:in|inch|inches)\b|\"|’’|''|”|″)|\b)"
)
# 2) feet only (rare), or bare inches
ft_only = re.compile(r"(?i)\b(\d)\s*(?:ft|foot|feet)\b")
# Same boundary rule as above: \b only after the word units, so that 70"
# followed by whitespace or punctuation is actually matched.
in_only = re.compile(r"(?i)\b(\d{2})\s*(?:(?:in|inch|inches)\b|\"|’’|''|”|″)")
# 3) meters or centimeters (groups 1-2 = meters alternative, 3-4 = centimeters)
m_cm = re.compile(r"(?i)\b(\d{0,1}\.?\d{1,3})\s*(m|meter|meters)\b|\b(1\d{2}|[5-9]\d)\s*(cm|centimeter|centimeters)\b")

# -- Weight --
# 1) kg, lbs, stone introduced by a weight keyword
weight_main = re.compile(
    r"(?i)\b(?:wt|weight|weighs?)\s*[:=]?\s*(\d{2,3}(?:\.\d+)?)\s*(kg|kgs|kilograms?|lb|lbs|pounds?|st|stone|stones)\b"
)
# 2) number with unit but without the word "weight": “70 kg”, “154 lb”.
#    The lookbehind rejects digits that are the fractional part of a larger
#    number ("3.70 kg", "3,70 kg"), which would otherwise be read as 70 kg.
weight_loose = re.compile(r"(?i)(?<!\d[.,])\b(\d{2,3}(?:\.\d+)?)\s*(kg|kgs|kilograms?|lb|lbs|pounds?|st|stone|stones)\b")
# 3) explicit stone + lb: “11 st 3 lb”
stone_plus_lb = re.compile(r"(?i)\b(\d{1,2})\s*(?:st|stone|stones)\s*(\d{1,2})\s*(?:lb|lbs|pounds?)\b")

# Used to reject candidates whose surrounding snippet mentions blood pressure
# ("BP", "blood pressure") or mmHg, so that BP readings are not taken as
# height/weight.
bp_like = re.compile(r"(?i)\b(?:bp|blood\s*pressure)\b|mm\s*hg|mmhg")

# Keywords that help choose the right height/weight pair
context_words = re.compile(r"(?i)\b(height|stature|talla|weight|weighs?|wt|bmi|imc|vitals?|examination|admission|triage)\b")

# ==================== Extracción por nota ====================
def extract_heights(text: str):
    """Collect plausible body-height mentions from *text*.

    The text is normalized with ``norm_text`` and scanned with, in order,
    the feet+inches, feet-only, inches-only and meters/centimeters patterns.
    Each hit becomes a tuple
    ``("height", meters, raw, snippet, start, end)`` where *snippet* is the
    match plus up to 40 characters on each side and *start*/*end* index into
    the normalized text.

    A hit is dropped when its snippet looks like a blood-pressure reading
    (``bp_like``), when the converted value is not a plausible height, or when
    the snippet contains a blacklisted term.

    Feet-only and inches-only hits that overlap a plausible feet+inches hit
    are skipped: otherwise "5 ft 7 in" would also yield a bogus "5 ft"
    (1.524 m) candidate taken from the first half of the same expression.
    """
    t = norm_text(text)
    spans = []
    # (start, end) of feet+inches matches that converted to a plausible height
    compound_ranges = []

    def _record(m, h_m, raw):
        # Keep the hit unless conversion failed or the context looks like BP.
        s = t[max(0, m.start()-40): m.end()+40]
        if h_m is not None and not bp_like.search(s):
            spans.append(("height", h_m, raw, s, m.start(), m.end()))

    def _inside_compound(m):
        return any(m.start() < end and start < m.end()
                   for start, end in compound_ranges)

    # 5'7", 5 ft 7 in
    for m in ft_in_1.finditer(t):
        feet = float(m.group(1))
        inches = float(m.group(2))
        h_m = to_meters(feet, "ft", inches_extra=inches)
        # Only a sensible compound reading may suppress the simpler patterns.
        if plausible_height_m(h_m):
            compound_ranges.append((m.start(), m.end()))
        _record(m, h_m, f"{feet}ft {inches}in")

    # feet only
    for m in ft_only.finditer(t):
        if _inside_compound(m):
            continue
        feet = float(m.group(1))
        _record(m, to_meters(feet, "ft"), f"{feet}ft")

    # inches only
    for m in in_only.finditer(t):
        if _inside_compound(m):
            continue
        inches = float(m.group(1))
        _record(m, to_meters(inches, "in"), f"{inches}in")

    # meters/centimeters (one regex with two alternatives)
    for m in m_cm.finditer(t):
        if m.group(1) and m.group(2):
            # meters
            val = float(m.group(1))
            _record(m, to_meters(val, m.group(2)), f"{val} {m.group(2)}")
        else:
            # centimeters
            val = float(m.group(3))
            _record(m, to_meters(val, "cm"), f"{val} cm")

    # keep plausible values whose snippet has no blacklisted term
    return [sp for sp in spans
            if plausible_height_m(sp[1]) and not has_blacklist(sp[3])]

def extract_weights(text: str):
    """Collect plausible body-weight mentions from *text*.

    The text is normalized with ``norm_text`` and scanned with three patterns
    in priority order: stone+lb ("11 st 3 lb"), keyword-introduced weights
    ("weight 70 kg") and bare number+unit ("70 kg"). Each hit becomes a tuple
    ``("weight", kg, raw, snippet, start, end)`` where *snippet* is the match
    plus up to 40 characters on each side and *start*/*end* index into the
    normalized text.

    A lower-priority match that overlaps an already accepted higher-priority
    match is skipped. Without this, "weight 70 kg" is reported twice (once per
    pattern) and "11 st 3 lb" additionally produces a wrong "11 st"-only
    value.

    Hits are finally dropped when the value is not a plausible weight, the
    snippet contains a blacklisted term, or the snippet looks like a
    blood-pressure reading.
    """
    t = norm_text(text)
    spans = []
    claimed = []  # (start, end) ranges already consumed by an earlier pattern

    def _overlaps_claimed(m):
        return any(m.start() < end and start < m.end() for start, end in claimed)

    def _record(m, w_kg, raw):
        claimed.append((m.start(), m.end()))
        if w_kg is not None:
            s = t[max(0, m.start()-40): m.end()+40]
            spans.append(("weight", w_kg, raw, s, m.start(), m.end()))

    # stone + lb (11 st 3 lb)
    for m in stone_plus_lb.finditer(t):
        st = float(m.group(1))
        lb = float(m.group(2))
        _record(m, to_kg(st, "st", lbs_extra=lb), f"{st} st {lb} lb")

    # "weight 70 kg" first, then the looser "70 kg"
    for pattern in (weight_main, weight_loose):
        for m in pattern.finditer(t):
            if _overlaps_claimed(m):
                continue
            val = float(m.group(1))
            unit = m.group(2)
            _record(m, to_kg(val, unit), f"{val} {unit}")

    # keep plausible values, no blacklisted term, not a BP/mmHg context
    return [sp for sp in spans
            if plausible_weight_kg(sp[1])
            and not has_blacklist(sp[3])
            and not bp_like.search(sp[3])]

# ==================== Scoring para elegir el mejor par ====================
def pick_best_pair(h_spans, w_spans, text):
    """Choose the height/weight candidates that most likely belong together.

    Returns ``(h_span, w_span, score, reason)``. Each span has the layout
    ``(kind, value, raw, snippet, start, end)``. When either list is empty the
    result is ``(None, None, -1.0, "missing_h_or_w")``.

    Scoring of a pair, using the spans' start offsets:
      * closeness of the two mentions (distance capped at 200), weight 0.6;
      * summed distance of both mentions to their nearest context keyword
        (capped at 200; 9999 when the text has no keyword), weight 0.4;
      * +10 when the raw height contains "cm" or " m", +10 when the raw
        weight contains "kg".
    On equal scores the pair encountered first (heights outer, weights inner)
    wins.
    """
    if not (h_spans and w_spans):
        return None, None, -1.0, "missing_h_or_w"

    # start offsets of the context keywords in the normalized text
    normalized = norm_text(text)
    anchors = [hit.start() for hit in context_words.finditer(normalized)]

    def nearest_anchor(span):
        return min(abs(span[4] - a) for a in anchors)

    # The nearest-keyword distance depends on one span only, so compute it
    # once per span instead of once per pair.
    h_near = [nearest_anchor(h) for h in h_spans] if anchors else []
    w_near = [nearest_anchor(w) for w in w_spans] if anchors else []

    winner = (None, None, -1.0, "")
    for hi, h in enumerate(h_spans):
        h_raw = h[2].lower()
        for wi, w in enumerate(w_spans):
            # distance in characters between the two mentions
            d = abs(h[4] - w[4])
            d_ctx = h_near[hi] + w_near[wi] if anchors else 9999

            # penalize large distances, favor proximity to context
            score = 0.0
            score += max(0, 200 - min(d, 200)) * 0.6
            score += max(0, 200 - min(d_ctx, 200)) * 0.4
            # bonus for canonical formats (m/cm and kg)
            if "cm" in h_raw or " m" in h_raw:
                score += 10
            if "kg" in w[2].lower():
                score += 10

            if score > winner[2]:
                winner = (h, w, score, f"d={d}, d_ctx={d_ctx}")

    return winner

# ================ Lectura del CSV y setup =================
# Load every column as text; na_filter=False keeps empty cells as "" instead
# of NaN so the text helpers always receive strings.
df = pd.read_csv(
    DATA_CSV,
    dtype=str,
    encoding="utf-8",
    na_filter=False,
)
# Fall back to the row position as identifier when the file carries none.
if "patient_id" not in df:
    df["patient_id"] = np.arange(len(df)).astype(str)
# Normalized copy of the note text, used by the extraction loop.
df["patient_norm"] = df["patient"].map(norm_text)


In [2]:
valid, doubtful, discarded = [], [], []

# Fixed column layouts. Passing them to pd.DataFrame guarantees that a result
# list that happens to be empty is still written with a header row; a
# header-less file cannot be loaded back with pd.read_csv.
VALID_COLS = ["patient_id", "height_m", "height_raw", "weight_kg", "weight_raw",
              "BMI", "score", "reason", "snippet"]
DOUBTFUL_COLS = ["patient_id", "heights", "weights", "text_excerpt"]
DISCARDED_COLS = ["patient_id", "text_excerpt"]

# Compiled once instead of on every row.
_any_number = re.compile(r"\b\d[\d.,]*\b")

# Iterate only over the two columns that are needed.
rows = zip(df["patient_id"], df["patient_norm"])
for pid, text in tqdm(rows, total=len(df), desc="Procesando IMC (regex mejoradas)"):
    h_spans = extract_heights(text)
    w_spans = extract_weights(text)

    if h_spans and w_spans:
        h, w, score, info = pick_best_pair(h_spans, w_spans, text)
        if (h is not None) and (w is not None):
            h_m = h[1]; w_kg = w[1]
            bmi = round(w_kg / (h_m*h_m), 2) if h_m and w_kg else None
            if bmi is not None and math.isfinite(bmi):
                # snippet covering both mentions plus 60 characters of margin
                lo = max(0, min(h[4], w[4]) - 60)
                hi = min(len(text), max(h[5], w[5]) + 60)
                valid.append({
                    "patient_id": pid,
                    "height_m": round(h_m, 3),
                    "height_raw": h[2],
                    "weight_kg": round(w_kg, 1),
                    "weight_raw": w[2],
                    "BMI": bmi,
                    "score": round(score, 1),
                    "reason": info,
                    "snippet": text[lo:hi]
                })
                continue  # next patient

    # reaching this point means no complete height/weight pair was obtained
    if h_spans or w_spans:
        doubtful.append({
            "patient_id": pid,
            "heights": json.dumps([(round(h[1],3), h[2]) for h in h_spans], ensure_ascii=False),
            "weights": json.dumps([(round(w[1],1), w[2]) for w in w_spans], ensure_ascii=False),
            "text_excerpt": text[:320]
        })
    elif _any_number.search(text) is not None:
        # the note has numbers but none survived as a height/weight candidate;
        # keep it in `discarded` for traceability
        discarded.append({
            "patient_id": pid,
            "text_excerpt": text[:320]
        })

# --- Save CSVs ---
valid_df = pd.DataFrame(valid, columns=VALID_COLS)
valid_df.to_csv(OUT_DIR/"valid_imc.csv", index=False, encoding="utf-8")
pd.DataFrame(doubtful, columns=DOUBTFUL_COLS).to_csv(OUT_DIR/"doubtful_imc.csv", index=False, encoding="utf-8")
pd.DataFrame(discarded, columns=DISCARDED_COLS).to_csv(OUT_DIR/"discarded_imc.csv", index=False, encoding="utf-8")

print("✅ Guardado:")
print(f" - {OUT_DIR/'valid_imc.csv'} (GT con IMC calculado)")
print(f" - {OUT_DIR/'doubtful_imc.csv'} (a revisar manualmente)")
print(f" - {OUT_DIR/'discarded_imc.csv'} (irrelevante)")
print("Ejemplo valid (head):")
valid_df.head(5)


Procesando IMC (regex mejoradas): 100%|██████████| 167034/167034 [02:33<00:00, 1090.59it/s]


✅ Guardado:
 - C:\Users\hered\Desktop\TFM\TFM\IMC2\valid_imc.csv (GT con IMC calculado)
 - C:\Users\hered\Desktop\TFM\TFM\IMC2\doubtful_imc.csv (a revisar manualmente)
 - C:\Users\hered\Desktop\TFM\TFM\IMC2\discarded_imc.csv (irrelevante)
Ejemplo valid (head):


Unnamed: 0,patient_id,height_m,height_raw,weight_kg,weight_raw,BMI,score,reason,snippet
0,22,1.66,166.0 cm,48.4,48.4 kg,17.56,196.0,"d=24, d_ctx=24",", 20; and temperature, 37.1℃. The weight of th..."
1,72,1.75,1.75 m,76.0,76.0 kg,24.82,198.4,"d=16, d_ctx=30",A 20-year-old Caucasian male (1.75 m tall and ...
2,208,1.5,150.0 cm,41.0,41.0 kg,18.22,204.0,"d=16, d_ctx=16","An 88-year-old woman (height, 150 cm; weight, ..."
3,428,1.6,160.0 cm,53.0,53.0 kg,20.7,125.6,"d=24, d_ctx=848",rnal and Child Health Care Hospital. The pregn...
4,529,1.47,147.0 cm,50.0,50.0 kg,23.14,204.0,"d=16, d_ctx=16","A 36-year-old woman (height, 147 cm; weight, 5..."


In [4]:
# === Cell 1: build eval_imc_fullnotes.csv (100 samples) ===
import pandas as pd
import numpy as np
from pathlib import Path

# Paths
PMC_CSV   = r"C:/Users/hered/Desktop/TFM/TFM/data/PMC-Patients.csv"
VALID_CSV = r"C:/Users/hered/Desktop/TFM/TFM/IMC2/valid_imc.csv"
OUT_DIR   = Path(r"C:/Users/hered/Desktop/TFM/TFM/IMC2")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Load both tables as plain text, keeping empty cells as "".
read_opts = dict(dtype=str, encoding="utf-8", na_filter=False)
df_pmc = pd.read_csv(PMC_CSV, **read_opts)
df_val = pd.read_csv(VALID_CSV, **read_opts)

# Make sure the notes table has a patient_id (row position as fallback).
if "patient_id" not in df_pmc.columns:
    df_pmc["patient_id"] = np.arange(len(df_pmc)).astype(str)

# Ground-truth columns become numeric; unparsable cells turn into NaN.
numeric_gt = [c for c in ("height_m", "weight_kg", "BMI") if c in df_val.columns]
for c in numeric_gt:
    df_val[c] = pd.to_numeric(df_val[c], errors="coerce")

# Reproducible random sample of at most 100 rows (seed 42, no replacement).
rng = np.random.default_rng(42)
n_pick = min(100, len(df_val))
idx = rng.choice(df_val.index.to_numpy(), size=n_pick, replace=False)
sample = df_val.loc[idx].copy()

# Attach the full note text to every sampled row.
df_pmc["full_note"] = df_pmc["patient"]  # convenient alias
keep_cols = ["patient_id", "full_note"]
eval_df = sample.merge(df_pmc[keep_cols], on="patient_id", how="left")

# Ground-truth columns get a *_true suffix for MEDCALC.
true_names = {
    "height_m": "height_m_true",
    "weight_kg": "weight_kg_true",
    "BMI": "BMI_true",
}
eval_df = eval_df.rename(columns=true_names)

# Final column order; any column that is absent is added as NaN first.
cols = ["patient_id", "full_note", "height_m_true", "weight_kg_true", "BMI_true"]
absent = [c for c in cols if c not in eval_df.columns]
for c in absent:
    eval_df[c] = np.nan
eval_df = eval_df[cols]

# Save
OUT_EVAL = OUT_DIR / "eval_imc_fullnotes.csv"
eval_df.to_csv(OUT_EVAL, index=False, encoding="utf-8")
print(f"✅ Saved eval set: {OUT_EVAL}  | rows: {len(eval_df)}")
print(eval_df.head(3).to_string(index=False))


✅ Saved eval set: C:\Users\hered\Desktop\TFM\TFM\IMC2\eval_imc_fullnotes.csv  | rows: 100
patient_id                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    