In [1]:
# Preprocesar data/raw/{nivel}/{split} → data/processed/{nivel}/{split}/sentences.jsonl
# Por línea: doc_id, sent_id, level, split, text_norm, n_tokens, is_boundary (si existe)
# Pasos: normalización básica, segmentación en frases, tokenización simple, stemming opcional.

#### ***Imports y config***

In [2]:
from pathlib import Path
import re, json
import pandas as pd
import numpy as np

SEED = 42
rng = np.random.default_rng(SEED)

# Stemming opcional (NLTK). Si no está, sigue sin stem.
try:
    import nltk
    from nltk.stem.snowball import SnowballStemmer
    STEMMER = SnowballStemmer("spanish")
    USE_STEM = True
except Exception:
    STEMMER = None
    USE_STEM = False

pd.set_option("display.max_colwidth", 120)

#### ***Rutas***

In [3]:
def find_root():
    p = Path.cwd()
    for cand in [p, *p.parents]:
        if (cand / "data" / "raw").exists():
            return cand
    raise FileNotFoundError("No encuentro data/raw.")

ROOT = find_root()
RAW = ROOT / "data" / "raw"
PROC = ROOT / "data" / "processed"
REPORTS = ROOT / "reports"
for d in [PROC, REPORTS]:
    d.mkdir(parents=True, exist_ok=True)

NIVELES = ["easy","medium","hard"]
SPLITS = ["train","validation"]

#### ***Limpieza y Segmentaciion***

In [4]:
# Regex simples
RE_URL = re.compile(r"https?://\S+|www\.\S+", re.IGNORECASE)
RE_EMAIL = re.compile(r"\b[\w\.-]+@[\w\.-]+\.\w+\b")
RE_NUM = re.compile(r"\b\d+(?:[.,]\d+)*\b")
RE_TOKEN = re.compile(r"[A-Za-zÁÉÍÓÚÜÑáéíóúüñ]+", re.UNICODE)

# Segmentación por fin de frase con excepciones comunes
ABREV = {"sr.", "sra.", "dr.", "dra.", "ud.", "etc.", "p.ej.", "ee.uu."}
SENT_SPLIT = re.compile(r"(?<=[\.\?\!])\s+")

def normalizar(text: str) -> str:
    t = text.replace("\u00A0"," ").strip()
    t = RE_URL.sub("<URL>", t)
    t = RE_EMAIL.sub("<EMAIL>", t)
    t = RE_NUM.sub("<NUM>", t)
    t = t.lower()
    t = re.sub(r"\s+", " ", t)
    return t.strip()

def segmentar_frases(text: str) -> list[str]:
    # Heurística: divide y re-une abreviaturas simples
    if not text.strip():
        return []
    trozos = SENT_SPLIT.split(text.strip())
    frases = []
    buf = ""
    for s in trozos:
        s2 = s.strip()
        if not s2:
            continue
        if any(s2.endswith(ab) for ab in ABREV):
            buf = (buf + " " + s2).strip()
            continue
        if buf:
            frases.append((buf + " " + s2).strip())
            buf = ""
        else:
            frases.append(s2)
    if buf:
        frases.append(buf)
    return frases

def tokenizar(text: str) -> list[str]:
    toks = RE_TOKEN.findall(text)
    if USE_STEM:
        toks = [STEMMER.stem(t) for t in toks]
    return toks

def limpiar_y_tokenizar(frase: str) -> tuple[str,int]:
    f = normalizar(frase)
    toks = tokenizar(f)
    return (" ".join(toks), len(toks))

#### ***Preprocesado por fichero***

In [5]:
def doc_id_from_path(fp: Path) -> str:
    return fp.stem  # p.ej., problem-123

def procesar_txt(fp: Path, level: str, split: str):
    out = []
    raw = fp.read_text(encoding="utf-8", errors="replace")
    for i, fr in enumerate(segmentar_frases(raw)):
        text_norm, n_tok = limpiar_y_tokenizar(fr)
        out.append({
            "doc_id": doc_id_from_path(fp),
            "sent_id": i,
            "level": level,
            "split": split,
            "text_norm": text_norm,
            "n_tokens": n_tok,
            "is_boundary": False  # no viene anotado en .txt
        })
    return out

def procesar_jsonl(fp: Path, level: str, split: str):
    out = []
    with fp.open("r", encoding="utf-8", errors="replace") as f:
        for i, line in enumerate(f):
            line = line.strip()
            if not line:
                continue
            try:
                obj = json.loads(line)
            except json.JSONDecodeError:
                continue
            text = str(obj.get("text") or obj.get("sentence") or "")
            text_norm, n_tok = limpiar_y_tokenizar(text)
            is_b = bool(obj.get("is_boundary", False))
            out.append({
                "doc_id": obj.get("doc_id", doc_id_from_path(fp)),
                "sent_id": obj.get("sent_id", i),
                "level": level,
                "split": split,
                "text_norm": text_norm,
                "n_tokens": n_tok,
                "is_boundary": is_b
            })
    return out

#### ***Recorrido por niveles/splits y guardado***

In [6]:
total_lineas = 0
for level in NIVELES:
    for split in SPLITS:
        base = RAW / level / split
        if not base.exists():
            continue
        destino = PROC / level / split
        destino.mkdir(parents=True, exist_ok=True)
        out_path = destino / "sentences.jsonl"

        files = list(base.rglob("*.txt")) + list(base.rglob("*.jsonl"))
        if not files:
            continue

        with out_path.open("w", encoding="utf-8") as w:
            for fp in files:
                if fp.suffix.lower() == ".txt":
                    filas = procesar_txt(fp, level, split)
                else:
                    filas = procesar_jsonl(fp, level, split)
                for row in filas:
                    w.write(json.dumps(row, ensure_ascii=False) + "\n")
                total_lineas += len(filas)

print(f"Líneas escritas en processed: {total_lineas}")

Líneas escritas en processed: 208160


#### ***Checks y resumen***

In [7]:
# Carga una muestra para verificar
muestras = []
for level in NIVELES:
    for split in SPLITS:
        p = PROC / level / split / "sentences.jsonl"
        if not p.exists():
            continue
        with p.open("r", encoding="utf-8") as f:
            for _ in range(5):
                line = f.readline()
                if not line:
                    break
                muestras.append(json.loads(line))
pd.DataFrame(muestras)

Unnamed: 0,doc_id,sent_id,level,split,text_norm,n_tokens,is_boundary
0,problem-1734,0,easy,train,i learned this about ukraine a while back and i think it was mila kunis who said it and i knew about it since,24,False
1,problem-1734,1,easy,train,it s easy for some to make the mistake as back then i had no idea either,17,False
2,problem-1734,2,easy,train,but when ppl see others say it the respectfully correct them,11,False
3,problem-1734,3,easy,train,it s an offensive way to refer to ukraine and is an old soviet term and minimizes the legitimacy of them being a fre...,30,False
4,problem-1734,4,easy,train,similar to how ppl correct others when they use the russian soviet spelling of kyiv and say kiev,18,False
5,problem-678,0,easy,validation,it s bizarre that people think it s controversial,9,False
6,problem-678,1,easy,validation,courts in europe interpreted laws,5,False
7,problem-678,2,easy,validation,the king could put his foot down but the equivalent of that in our system is supposed to be a new law,22,False
8,problem-678,3,easy,validation,if it s a constitutional question then that new law is a constitutional amendment,14,False
9,problem-678,4,easy,validation,i ve yet to read anybody even try to argue that a question of constitutional interpretation doesn t fall under that ...,22,False


In [8]:
# Métricas por nivel/split en processed
reg = []
for level in NIVELES:
    for split in SPLITS:
        p = PROC / level / split / "sentences.jsonl"
        if not p.exists():
            continue
        n_lines = sum(1 for _ in p.open("r", encoding="utf-8"))
        # mediana de tokens por frase en una muestra
        toks = []
        with p.open("r", encoding="utf-8") as f:
            for i, line in enumerate(f):
                if i >= 20000:  # límite para rapidez
                    break
                toks.append(json.loads(line)["n_tokens"])
        reg.append({
            "level": level,
            "split": split,
            "n_frases_processed": n_lines,
            "med_tokens_frase_muestra": float(np.median(toks)) if toks else np.nan
        })

df_sum = pd.DataFrame(reg).sort_values(["level","split"])
display(df_sum)

# Guarda informe
df_sum.to_csv(REPORTS / "01_processed_resumen.csv", index=False)
(REPORTS / "01_preprocesamiento_ok.txt").write_text("OK", encoding="utf-8")
print("Resumen guardado en reports/")

Unnamed: 0,level,split,n_frases_processed,med_tokens_frase_muestra
0,easy,train,52701,13.0
1,easy,validation,11146,13.0
4,hard,train,55515,17.0
5,hard,validation,11649,18.0
2,medium,train,63386,16.0
3,medium,validation,13763,16.0


Resumen guardado en reports/


# Informe breve — `01_preprocesamiento.ipynb`

## Objetivo

Convertir RAW en frases normalizadas y reutilizables para EDA y extracción de features.

## Pipeline aplicado

* Lectura de `data/raw/{easy,medium,hard}/{train,validation}`.
* Segmentación por `.?!` con excepciones comunes.
* Normalización: minúsculas, `<URL>`, `<EMAIL>`, `<NUM>`, espacios colapsados.
* Tokenización alfabética y conteo de tokens.
* Stemming español opcional si está disponible.
* Escritura en `data/processed/{nivel}/{split}/sentences.jsonl` con:
  `doc_id, sent_id, level, split, text_norm, n_tokens, is_boundary`.

## Cobertura y tamaño

* Líneas totales escritas: **208 160**.

|  level |    split   | n_frases_processed |
| :----: | :--------: | -----------------: |
|  easy  |    train   |             52 701 |
|  easy  | validation |             11 146 |
| medium |    train   |             63 386 |
| medium | validation |             13 763 |
|  hard  |    train   |             55 515 |
|  hard  | validation |             11 649 |

## Estadísticas rápidas

Mediana de tokens por frase (muestra):

|  level | train | validation |
| :----: | ----: | ---------: |
|  easy  |  13.0 |       13.0 |
| medium |  16.0 |       16.0 |
|  hard  |  17.0 |       18.0 |

Coherente con el RAW: ~12–13 frases/doc y ~200–230 tokens/doc.

## Validaciones

* Estructura `processed/{nivel}/{split}` creada con `sentences.jsonl`.
* Campos y tipos correctos.
* Conteos no nulos en todos los splits.
* Resumen guardado en `reports/01_processed_resumen.csv`.

## Limitaciones

* Segmentación heurística.
* La normalización elimina mayúsculas y forma exacta de números.
* `is_boundary` solo aparece si venía en origen `.jsonl`.