In [1]:
# Extraer embeddings contextuales por frase desde DistilBERT (distilbert-base-uncased, solo inglés).
# Mean pooling con attention_mask.
# Guardar matrices y metadatos para E3/E4.

#### ***Imports y config***

In [1]:
# Configuración y semillas
from pathlib import Path
import json, random
import numpy as np
import pandas as pd
import torch

# Reproducibility: seed every RNG used by this notebook (stdlib, NumPy, PyTorch).
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Extraction settings shared by the cells below.
MODEL_NAME = "distilbert-base-uncased"  # English, lightweight
MAX_LEN = 64       # max_length passed to the tokenizer
BATCH_SIZE = 128   # number of texts tokenized and embedded per forward pass

# Wider text columns when DataFrames are displayed.
pd.set_option("display.max_colwidth", 120)

#### ***Auto-device y carga de modelo/tokenizer***

In [2]:
# Auto-device y carga de modelo/tokenizer
from transformers import AutoTokenizer, AutoModel

# Backend preference: CUDA first, then Apple MPS, otherwise CPU.
# amp_dtype is the dtype later handed to torch.amp.autocast for that backend.
if torch.cuda.is_available():
    backend, amp_dtype = "cuda", torch.float16
elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
    backend, amp_dtype = "mps", torch.float16
else:
    # On CPU, embed_texts disables autocast, so this dtype is effectively unused.
    backend, amp_dtype = "cpu", torch.bfloat16
device = torch.device(backend)

# Load tokenizer and encoder; the model is moved to the device and put in eval mode.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)
model = model.to(device)
model.eval()
HIDDEN = model.config.hidden_size  # width of each sentence embedding

print(f"device={device}, hidden={HIDDEN}, max_len={MAX_LEN}, batch={BATCH_SIZE}")

  from .autonotebook import tqdm as notebook_tqdm


device=mps, hidden=768, max_len=64, batch=128


#### ***Rutas***

In [3]:
# Rutas y carga de processed → textos e índices
def find_root():
    """Locate the project root by walking up from the current working directory.

    The root is the first directory, starting at the cwd and moving through
    its parents, that contains a ``data/processed`` entry.

    Returns:
        pathlib.Path: the matching directory.

    Raises:
        FileNotFoundError: if neither the cwd nor any ancestor contains
            ``data/processed``.
    """
    here = Path.cwd()
    root = next(
        (folder for folder in (here, *here.parents)
         if (folder / "data" / "processed").exists()),
        None,
    )
    if root is None:
        raise FileNotFoundError("No encuentro data/processed.")
    return root

# Project layout, anchored at the directory returned by find_root().
ROOT = find_root()
PROC = ROOT.joinpath("data", "processed")                  # per-level sentence files are read from here
FEAT = ROOT.joinpath("features", "embeddings_contextual")  # matrices, indexes and summary are written here
FEAT.mkdir(parents=True, exist_ok=True)

# Level and split names used to build paths under PROC.
NIVELES = ["easy", "medium", "hard"]
SPLITS = ["train", "validation"]

#### ***Carga processed***

In [4]:
def cargar_split(split):
    """Load the sentences of one split for every level in NIVELES.

    For each level, reads ``PROC/<level>/<split>/sentences.jsonl`` (JSON Lines)
    when that file exists; levels without a file are skipped silently.

    Args:
        split: split name used as a path component and stored in the
            ``split`` column (the notebook passes "train" / "validation").

    Returns:
        pandas.DataFrame with columns level, split, doc_id, sent_id and
        text_norm, sorted by (level, doc_id, sent_id) with a fresh 0..n-1
        index. If no file was found, an empty frame with those columns.
    """
    columns = ["level", "split", "doc_id", "sent_id", "text_norm"]
    parts = []
    for lvl in NIVELES:
        path = PROC / lvl / split / "sentences.jsonl"
        if not path.exists():
            continue
        # Tag every row with where it came from before keeping the needed columns.
        loaded = pd.read_json(path, lines=True).assign(level=lvl, split=split)
        parts.append(loaded[columns])

    if not parts:
        return pd.DataFrame(columns=columns)

    merged = pd.concat(parts, ignore_index=True)
    merged = merged.sort_values(["level", "doc_id", "sent_id"])
    return merged.reset_index(drop=True)

# Load both splits; printing the shapes gives a quick check of the row counts.
df_tr, df_va = (cargar_split(name) for name in ("train", "validation"))
print(df_tr.shape, df_va.shape)

(171602, 5) (36558, 5)


#### ***Funciones de batching y pooling***

In [5]:
# Funciones: batching, embedding (mean-pooling + autocast) y métricas de truncado
def batches(lst, bs):
    """Yield consecutive slices of ``lst`` holding at most ``bs`` items each.

    Slices are produced in order; the last one may be shorter. An empty
    ``lst`` yields nothing.
    """
    yield from (lst[start:start + bs] for start in range(0, len(lst), bs))

@torch.no_grad()
def embed_texts(texts):
    """Embed each text as the attention-masked mean of the last hidden states.

    Texts are processed in groups of BATCH_SIZE. Each group is tokenized with
    padding and truncation to MAX_LEN, moved to ``device`` and run through
    ``model``; autocast with ``amp_dtype`` is enabled on every device except CPU.

    Args:
        texts: sequence of strings; it is sliced into batches and each slice
            is passed to the tokenizer.

    Returns:
        A tuple ``(embeddings, trunc_ratio, len_stats)``:
        - embeddings: float32 NumPy array of shape (len(texts), HIDDEN), rows
          in the same order as ``texts``.
        - trunc_ratio: fraction of texts whose tokenized length equals
          MAX_LEN (0.0 for empty input). This is an upper bound on the
          truncated fraction, since a text that fits in exactly MAX_LEN
          tokens is counted as well.
        - len_stats: dict with keys p50_len, p90_len and p95_len holding
          percentiles of the tokenized lengths (measured after truncation);
          all zeros for empty input.
    """
    n_texts = len(texts)
    embeddings = np.zeros((n_texts, HIDDEN), dtype=np.float32)
    at_limit = 0        # texts whose token count reached MAX_LEN
    token_counts = []   # per-text token counts, used for the percentiles
    use_amp = device.type != "cpu"

    row = 0  # next free row in `embeddings`
    for batch_texts in batches(texts, BATCH_SIZE):
        encoded = tokenizer(
            batch_texts,
            padding=True,
            truncation=True,
            max_length=MAX_LEN,
            return_tensors="pt",
        )
        # Summing the attention mask per row gives the number of non-padding tokens.
        n_tokens = encoded["attention_mask"].sum(dim=1)
        token_counts.extend(n_tokens.cpu().tolist())
        at_limit += int((n_tokens == MAX_LEN).sum().item())

        encoded = {name: tensor.to(device) for name, tensor in encoded.items()}
        with torch.amp.autocast(device_type=device.type, dtype=amp_dtype, enabled=use_amp):
            hidden = model(**encoded).last_hidden_state           # [bs, seq, hid]
            weights = encoded["attention_mask"].unsqueeze(-1)     # [bs, seq, 1]
            summed = (hidden * weights).sum(dim=1)                # padding positions contribute zero
            counts = weights.sum(dim=1).clamp(min=1)              # never divide by zero
            pooled = summed / counts

        n_rows = pooled.size(0)
        embeddings[row:row + n_rows] = pooled.detach().float().cpu().numpy()
        row += n_rows

    if n_texts:
        trunc_ratio = at_limit / n_texts
    else:
        trunc_ratio = 0.0

    lens = np.array(token_counts, dtype=np.int32)
    len_stats = {
        f"p{q}_len": int(np.percentile(lens, q)) if len(lens) else 0
        for q in (50, 90, 95)
    }
    return embeddings, trunc_ratio, len_stats

#### ***Embeddings TRAIN***

In [6]:
# Embed the TRAIN split and persist the matrix together with a row-aligned index.
texts_tr = df_tr["text_norm"].astype(str).tolist()
X_tr, trunc_tr, len_stats_tr = embed_texts(texts_tr)
np.save(FEAT / "C_train.npy", X_tr)
# The index is written from df_tr in the same row order as texts_tr, so row i of
# C_train.npy corresponds to row i of C_train_index.csv.
df_tr[["level","split","doc_id","sent_id"]].to_csv(FEAT / "C_train_index.csv", index=False)
# The label is derived from MAX_LEN so it stays correct if the limit is changed.
print("TRAIN:", X_tr.shape, f"trunc@{MAX_LEN}:", round(trunc_tr, 3), "len_stats:", len_stats_tr)

TRAIN: (171602, 768) trunc@64: 0.015 len_stats: {'p50_len': 18, 'p90_len': 38, 'p95_len': 46}


#### ***Embeddings VAL***

In [7]:
# Embed the VALIDATION split and persist the matrix together with a row-aligned index.
texts_va = df_va["text_norm"].astype(str).tolist()
X_va, trunc_va, len_stats_va = embed_texts(texts_va)
np.save(FEAT / "C_validation.npy", X_va)
# The index is written from df_va in the same row order as texts_va, so row i of
# C_validation.npy corresponds to row i of C_validation_index.csv.
df_va[["level","split","doc_id","sent_id"]].to_csv(FEAT / "C_validation_index.csv", index=False)
# The label is derived from MAX_LEN so it stays correct if the limit is changed.
print("VAL:", X_va.shape, f"trunc@{MAX_LEN}:", round(trunc_va, 3), "len_stats:", len_stats_va)

VAL: (36558, 768) trunc@64: 0.015 len_stats: {'p50_len': 19, 'p90_len': 38, 'p95_len': 47}


#### ***Resumen y guardado***

In [8]:
# Run summary: the configuration used and basic statistics of both splits,
# stored as JSON next to the embedding matrices.
resumen = dict(
    model=MODEL_NAME,
    device=str(device),
    hidden=int(HIDDEN),
    batch_size=int(BATCH_SIZE),
    max_len=int(MAX_LEN),
    train_shape=tuple(X_tr.shape),   # serialized by json as a list
    val_shape=tuple(X_va.shape),
    trunc_ratio_train=float(round(trunc_tr, 4)),
    trunc_ratio_val=float(round(trunc_va, 4)),
    len_stats_train=len_stats_tr,
    len_stats_val=len_stats_va,
    seed=SEED,
)
resumen_path = FEAT / "contextual_resumen.json"
resumen_path.write_text(json.dumps(resumen, indent=2), encoding="utf-8")
print("Resumen escrito en", resumen_path)

Resumen escrito en /Users/eeguskiza/Documents/Deusto/2025/NLP/multi-author-analysis/features/embeddings_contextual/contextual_resumen.json
