In [1]:
# Extraer embeddings contextuales por frase desde BERT multilingüe.
# Mean pooling con attention_mask.
# Guardar matrices y metadatos para E3/E4.

#### ***Imports y config***

In [2]:
from pathlib import Path
import json
import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel

# Seed applied to torch's global RNG below.
SEED = 42
# Number of sentences tokenized and encoded per forward pass.
BATCH_SIZE = 128
# Maximum length handed to the tokenizer; longer inputs are truncated.
MAX_LEN = 64
MODEL_NAME = "bert-base-multilingual-cased"  # robust across languages
DTYPE_FP16 = True  # use FP16 on CUDA when available

torch.manual_seed(SEED)
# Show up to 120 characters per column when displaying DataFrames.
pd.set_option("display.max_colwidth", 120)

  from .autonotebook import tqdm as notebook_tqdm


#### ***Rutas***

In [3]:
def find_root():
    """Return the nearest directory (cwd or an ancestor) containing data/processed.

    The current working directory is checked first, then each parent in turn.

    Raises:
        FileNotFoundError: if no directory on that chain has a
            ``data/processed`` entry.
    """
    here = Path.cwd()
    hit = next(
        (d for d in (here, *here.parents) if d.joinpath("data", "processed").exists()),
        None,
    )
    if hit is None:
        raise FileNotFoundError("No encuentro data/processed.")
    return hit

# Project root: nearest directory (cwd or ancestor) that holds data/processed.
ROOT = find_root()
# Input tree with the processed sentences.
PROC = ROOT.joinpath("data", "processed")
# Output directory for the embedding matrices; created if missing.
FEAT = ROOT.joinpath("features", "embeddings_contextual")
FEAT.mkdir(parents=True, exist_ok=True)

# Difficulty levels and dataset splits.
NIVELES = ["easy", "medium", "hard"]
SPLITS = ["train", "validation"]

#### ***Carga processed***

In [4]:
def cargar_split(split):
    """Load the sentences of one split across all levels into a single DataFrame.

    For each level in NIVELES, reads PROC/<level>/<split>/sentences.jsonl when
    that file exists and tags its rows with ``level`` and ``split`` columns.

    Args:
        split: Split name used as a directory component (e.g. "train").

    Returns:
        DataFrame with columns level, split, doc_id, sent_id, text_norm, sorted
        by (level, doc_id, sent_id) with a fresh 0..n-1 index. When no file is
        found for any level, an empty DataFrame with those columns.
    """
    columns = ["level", "split", "doc_id", "sent_id", "text_norm"]
    frames = []
    for level in NIVELES:
        path = PROC / level / split / "sentences.jsonl"
        if not path.exists():
            # Levels without a file for this split are skipped.
            continue
        frame = pd.read_json(path, lines=True)
        frame["level"] = level
        frame["split"] = split
        frames.append(frame[columns])
    if len(frames) == 0:
        return pd.DataFrame(columns=columns)
    merged = pd.concat(frames, ignore_index=True)
    ordered = merged.sort_values(["level", "doc_id", "sent_id"])
    return ordered.reset_index(drop=True)

# Load both splits and print their (rows, columns) shapes as a sanity check.
df_tr = cargar_split("train")
df_va = cargar_split("validation")
print(df_tr.shape, df_va.shape)

(171602, 5) (36558, 5)


#### ***Modelo y tokenizer***

In [5]:
# Run on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)
# Half-precision weights are used only on CUDA and only when DTYPE_FP16 is set.
if device.type == "cuda" and DTYPE_FP16:
    model = model.half()
# Move the model to the chosen device and put it in eval (inference) mode.
model = model.to(device).eval()

# Hidden size reported by the model config; used as the embedding width.
HIDDEN = model.config.hidden_size
print("device:", device, "hidden:", HIDDEN)

device: cpu hidden: 768


#### ***Funciones de batching y pooling***

In [6]:
def batches(lst, bs):
    """Yield consecutive slices of ``lst`` holding at most ``bs`` items each.

    The last slice is shorter when ``len(lst)`` is not a multiple of ``bs``;
    an empty ``lst`` yields nothing.
    """
    starts = range(0, len(lst), bs)
    yield from (lst[start:start + bs] for start in starts)

@torch.no_grad()
def embed_texts(texts):
    """Embed each text as the attention-masked mean of the last hidden states.

    Texts are processed in chunks of BATCH_SIZE; the tokenizer pads each chunk
    and truncates to MAX_LEN.

    Args:
        texts: List of strings to embed.

    Returns:
        np.ndarray of shape (len(texts), HIDDEN) and dtype float32, where row i
        is the embedding of texts[i].
    """
    # Preallocate the output so each batch is written in place.
    out = np.zeros((len(texts), HIDDEN), dtype=np.float32)
    use_amp = device.type == "cuda" and DTYPE_FP16
    row = 0
    for chunk in batches(texts, BATCH_SIZE):
        enc = tokenizer(
            chunk,
            padding=True,
            truncation=True,
            max_length=MAX_LEN,
            return_tensors="pt",
        )
        enc = {name: tensor.to(device) for name, tensor in enc.items()}
        # torch.autocast replaces the deprecated torch.cuda.amp.autocast, which
        # emits a FutureWarning on recent PyTorch releases.
        with torch.autocast(device_type=device.type, enabled=use_amp):
            last = model(**enc).last_hidden_state  # [bs, seq, hidden]
        # Pool explicitly in float32 so the reduction does not depend on
        # autocast's op promotion when the model runs in half precision.
        mask = enc["attention_mask"].unsqueeze(-1).to(torch.float32)  # [bs, seq, 1]
        summed = (last.float() * mask).sum(dim=1)
        # Safe mean pooling: clamp guards against a division by zero.
        denom = mask.sum(dim=1).clamp(min=1)
        sent_emb = summed / denom
        n_rows = sent_emb.size(0)
        out[row:row + n_rows] = sent_emb.cpu().numpy()
        row += n_rows
    return out

#### ***Embeddings por split***

In [7]:
# TRAIN
# Cast to str so every entry handed to the tokenizer is a Python string.
texts_tr = df_tr["text_norm"].astype(str).tolist()
X_tr = embed_texts(texts_tr)
np.save(FEAT / "C_train.npy", X_tr)
# The index CSV keeps df_tr's row order, the same order used to build X_tr.
df_tr[["level","split","doc_id","sent_id"]].to_csv(FEAT / "C_train_index.csv", index=False)
print("TRAIN:", X_tr.shape, "guardado.")

# VALIDATION
# Same steps as for train, applied to the validation frame.
texts_va = df_va["text_norm"].astype(str).tolist()
X_va = embed_texts(texts_va)
np.save(FEAT / "C_validation.npy", X_va)
df_va[["level","split","doc_id","sent_id"]].to_csv(FEAT / "C_validation_index.csv", index=False)
print("VAL:", X_va.shape, "guardado.")

  with torch.cuda.amp.autocast(enabled=(device.type=="cuda" and DTYPE_FP16)):


TRAIN: (171602, 768) guardado.


  with torch.cuda.amp.autocast(enabled=(device.type=="cuda" and DTYPE_FP16)):


VAL: (36558, 768) guardado.


#### ***Resumen y guardado***

In [8]:
# Run metadata written next to the embedding matrices.
resumen = dict(
    model=MODEL_NAME,
    device=str(device),
    # FP16 is only in effect when requested and running on CUDA.
    dtype_fp16=bool(DTYPE_FP16 and device.type == "cuda"),
    hidden=int(HIDDEN),
    batch_size=int(BATCH_SIZE),
    max_len=int(MAX_LEN),
    train_shape=tuple(X_tr.shape),
    val_shape=tuple(X_va.shape),
)
resumen_json = json.dumps(resumen, indent=2)
Path(FEAT).joinpath("contextual_resumen.json").write_text(resumen_json, encoding="utf-8")
resumen

{'model': 'bert-base-multilingual-cased',
 'device': 'cpu',
 'dtype_fp16': False,
 'hidden': 768,
 'batch_size': 128,
 'max_len': 64,
 'train_shape': (171602, 768),
 'val_shape': (36558, 768)}

Sí. Vamos con el 04.

### Nombre del notebook

`04_embeddings_contextuales.ipynb`

### Secciones

1. Objetivo
2. Imports y config
3. Rutas
4. Carga processed
5. Modelo y tokenizer
6. Funciones de batching y pooling
7. Embeddings por split (train/validation)
8. Resumen y guardado

---

#### 1) Objetivo

```python
# Extraer embeddings contextuales por frase desde BERT multilingüe.
# Mean pooling con attention_mask.
# Guardar matrices y metadatos para E3/E4.
```

#### 2) Imports y config

```python
from pathlib import Path
import json
import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel

# Seed applied to torch's global RNG below.
SEED = 42
# Number of sentences tokenized and encoded per forward pass.
BATCH_SIZE = 128
# Maximum length handed to the tokenizer; longer inputs are truncated.
MAX_LEN = 64
MODEL_NAME = "bert-base-multilingual-cased"  # robust across languages
DTYPE_FP16 = True  # use FP16 on CUDA when available

torch.manual_seed(SEED)
# Show up to 120 characters per column when displaying DataFrames.
pd.set_option("display.max_colwidth", 120)
```

#### 3) Rutas

```python
def find_root():
    """Return the nearest directory (cwd or an ancestor) containing data/processed.

    The current working directory is checked first, then each parent in turn.

    Raises:
        FileNotFoundError: if no directory on that chain has a
            ``data/processed`` entry.
    """
    here = Path.cwd()
    hit = next(
        (d for d in (here, *here.parents) if d.joinpath("data", "processed").exists()),
        None,
    )
    if hit is None:
        raise FileNotFoundError("No encuentro data/processed.")
    return hit

# Project root: nearest directory (cwd or ancestor) that holds data/processed.
ROOT = find_root()
# Input tree with the processed sentences.
PROC = ROOT.joinpath("data", "processed")
# Output directory for the embedding matrices; created if missing.
FEAT = ROOT.joinpath("features", "embeddings_contextual")
FEAT.mkdir(parents=True, exist_ok=True)

# Difficulty levels and dataset splits.
NIVELES = ["easy", "medium", "hard"]
SPLITS = ["train", "validation"]
```

#### 4) Carga processed

```python
def cargar_split(split):
    """Load the sentences of one split across all levels into a single DataFrame.

    For each level in NIVELES, reads PROC/<level>/<split>/sentences.jsonl when
    that file exists and tags its rows with ``level`` and ``split`` columns.

    Args:
        split: Split name used as a directory component (e.g. "train").

    Returns:
        DataFrame with columns level, split, doc_id, sent_id, text_norm, sorted
        by (level, doc_id, sent_id) with a fresh 0..n-1 index. When no file is
        found for any level, an empty DataFrame with those columns.
    """
    columns = ["level", "split", "doc_id", "sent_id", "text_norm"]
    frames = []
    for level in NIVELES:
        path = PROC / level / split / "sentences.jsonl"
        if not path.exists():
            # Levels without a file for this split are skipped.
            continue
        frame = pd.read_json(path, lines=True)
        frame["level"] = level
        frame["split"] = split
        frames.append(frame[columns])
    if len(frames) == 0:
        return pd.DataFrame(columns=columns)
    merged = pd.concat(frames, ignore_index=True)
    ordered = merged.sort_values(["level", "doc_id", "sent_id"])
    return ordered.reset_index(drop=True)

# Load both splits and print their (rows, columns) shapes as a sanity check.
df_tr = cargar_split("train")
df_va = cargar_split("validation")
print(df_tr.shape, df_va.shape)
```

#### 5) Modelo y tokenizer

```python
# Run on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)
# Half-precision weights are used only on CUDA and only when DTYPE_FP16 is set.
if device.type == "cuda" and DTYPE_FP16:
    model = model.half()
# Move the model to the chosen device and put it in eval (inference) mode.
model = model.to(device).eval()

# Hidden size reported by the model config; used as the embedding width.
HIDDEN = model.config.hidden_size
print("device:", device, "hidden:", HIDDEN)
```

#### 6) Funciones de batching y pooling

```python
def batches(lst, bs):
    """Yield consecutive slices of ``lst`` holding at most ``bs`` items each.

    The last slice is shorter when ``len(lst)`` is not a multiple of ``bs``;
    an empty ``lst`` yields nothing.
    """
    starts = range(0, len(lst), bs)
    yield from (lst[start:start + bs] for start in starts)

@torch.no_grad()
def embed_texts(texts):
    """Embed each text as the attention-masked mean of the last hidden states.

    Texts are processed in chunks of BATCH_SIZE; the tokenizer pads each chunk
    and truncates to MAX_LEN.

    Args:
        texts: List of strings to embed.

    Returns:
        np.ndarray of shape (len(texts), HIDDEN) and dtype float32, where row i
        is the embedding of texts[i].
    """
    # Preallocate the output so each batch is written in place.
    out = np.zeros((len(texts), HIDDEN), dtype=np.float32)
    use_amp = device.type == "cuda" and DTYPE_FP16
    row = 0
    for chunk in batches(texts, BATCH_SIZE):
        enc = tokenizer(
            chunk,
            padding=True,
            truncation=True,
            max_length=MAX_LEN,
            return_tensors="pt",
        )
        enc = {name: tensor.to(device) for name, tensor in enc.items()}
        # torch.autocast replaces the deprecated torch.cuda.amp.autocast, which
        # emits a FutureWarning on recent PyTorch releases.
        with torch.autocast(device_type=device.type, enabled=use_amp):
            last = model(**enc).last_hidden_state  # [bs, seq, hidden]
        # Pool explicitly in float32 so the reduction does not depend on
        # autocast's op promotion when the model runs in half precision.
        mask = enc["attention_mask"].unsqueeze(-1).to(torch.float32)  # [bs, seq, 1]
        summed = (last.float() * mask).sum(dim=1)
        # Safe mean pooling: clamp guards against a division by zero.
        denom = mask.sum(dim=1).clamp(min=1)
        sent_emb = summed / denom
        n_rows = sent_emb.size(0)
        out[row:row + n_rows] = sent_emb.cpu().numpy()
        row += n_rows
    return out
```

#### 7) Embeddings por split

```python
# TRAIN
# Cast to str so every entry handed to the tokenizer is a Python string.
texts_tr = df_tr["text_norm"].astype(str).tolist()
X_tr = embed_texts(texts_tr)
np.save(FEAT / "C_train.npy", X_tr)
# The index CSV keeps df_tr's row order, the same order used to build X_tr.
df_tr[["level","split","doc_id","sent_id"]].to_csv(FEAT / "C_train_index.csv", index=False)
print("TRAIN:", X_tr.shape, "guardado.")

# VALIDATION
# Same steps as for train, applied to the validation frame.
texts_va = df_va["text_norm"].astype(str).tolist()
X_va = embed_texts(texts_va)
np.save(FEAT / "C_validation.npy", X_va)
df_va[["level","split","doc_id","sent_id"]].to_csv(FEAT / "C_validation_index.csv", index=False)
print("VAL:", X_va.shape, "guardado.")
```

#### 8) Resumen y guardado

```python
# Run metadata written next to the embedding matrices.
resumen = dict(
    model=MODEL_NAME,
    device=str(device),
    # FP16 is only in effect when requested and running on CUDA.
    dtype_fp16=bool(DTYPE_FP16 and device.type == "cuda"),
    hidden=int(HIDDEN),
    batch_size=int(BATCH_SIZE),
    max_len=int(MAX_LEN),
    train_shape=tuple(X_tr.shape),
    val_shape=tuple(X_va.shape),
)
resumen_json = json.dumps(resumen, indent=2)
Path(FEAT).joinpath("contextual_resumen.json").write_text(resumen_json, encoding="utf-8")
resumen
```

---

### Informe breve — `04_embeddings_contextuales.ipynb`

**Objetivo.** Extraer embeddings contextuales por frase con `bert-base-multilingual-cased` usando mean pooling y `attention_mask`.

**Config.** `MAX_LEN=64`, `BATCH_SIZE=128`, FP16 solo cuando hay CUDA disponible (la ejecución registrada corrió en CPU, con `dtype_fp16=False`), `hidden=768`.

**Salidas.**

* `features/embeddings_contextual/C_train.npy` y `C_validation.npy`.
* Índices: `C_{split}_index.csv`.
* Resumen: `contextual_resumen.json` con modelo, shapes y parámetros.

**Notas.**

* Reduce `BATCH_SIZE` si la memoria disponible (VRAM en GPU o RAM en CPU) es limitada.
* Si el corpus es mayoritariamente español, alternativa: `"dccuchile/bert-base-spanish-wwm-cased"`.
* Estas matrices sirven directamente para baselines por similitud o para modelos clásicos.