In [21]:
import json
import re
import warnings
from collections import Counter
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from langdetect import DetectorFactory, LangDetectException, detect

# Suppress every warning for the rest of the session, then apply the seaborn
# theme (white grid, slightly smaller fonts) used by all figures below.
warnings.filterwarnings(action="ignore")
sns.set_theme(font_scale=0.9, style="whitegrid")



In [22]:

# ---------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------
# Input dataset, read line by line as JSON.
FILE = Path("data/dp.jsonl")

# Output folders: one for figures, one for CSV/JSONL artefacts.
FIG_DIR = Path("figs_dp")
CSV_DIR = Path("csv_dp")

# Hard length cut: a record is discarded when its "chosen" or "rejected"
# text has MORE words than this.
MAX_WORDS_FILE = 512

# A text with AT LEAST this many words is flagged as hallucinated.
# NOTE(review): the previous comment on this constant said 1,000 words while
# the value is 512 -- confirm which threshold is intended.
HALLU_WORDS = 512

# Three or more back-to-back repetitions of "model que:", in any letter case.
HALLU_RE = re.compile(r"(model que:){3,}", flags=re.IGNORECASE)

# Make sure both output folders exist before anything is written to them.
for _out_dir in (FIG_DIR, CSV_DIR):
    _out_dir.mkdir(exist_ok=True)



In [23]:
# ------------------------- 1. Safe loading ---------------------------
# Parse the file one line at a time: blank lines are skipped, and lines that
# are not valid JSON are counted in `bad_json` instead of aborting the load.
rows, bad_json = [], 0
with FILE.open(encoding="utf-8") as fh:
    for raw_line in fh:
        payload = raw_line.strip()
        if payload:
            try:
                record = json.loads(payload)
            except json.JSONDecodeError:
                bad_json += 1
            else:
                rows.append(record)

df = pd.DataFrame(rows)
print(f"✔️  Registros válidos: {len(df):,} · JSON inválido: {bad_json:,}")

# -------------------- 2. Word counts ---------------------------------
# Whitespace-delimited token count per text; a missing value is replaced by
# an empty string first, so it counts as zero words.
for text_col in ("chosen", "rejected"):
    tokens = df[text_col].fillna("").str.split()
    df[f"{text_col}_word_count"] = tokens.str.len()

# Rows where either text is longer than the hard cut.
chosen_too_long = df["chosen_word_count"].gt(MAX_WORDS_FILE)
rejected_too_long = df["rejected_word_count"].gt(MAX_WORDS_FILE)
long_mask = chosen_too_long | rejected_too_long
df_long = df.loc[long_mask]


✔️  Registros válidos: 4,613 · JSON inválido: 0


In [24]:

# ---------------- 3. Hallucination detection -------------------------
def is_hallu(row) -> bool:
    """Tell whether a record looks hallucinated.

    A record is flagged when either of its texts matches ``HALLU_RE`` or has
    at least ``HALLU_WORDS`` words.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide the keys "chosen", "rejected", "chosen_word_count" and
        "rejected_word_count". The two texts are passed through ``str()``
        before matching, so non-string values do not raise.

    Returns
    -------
    bool
        Always a plain ``bool``. Returning the raw ``re.Match`` object (as a
        bare ``a or b`` would) turns the resulting column into object dtype,
        which breaks boolean indexing and the ``~`` operator downstream.
    """
    sides = ("chosen", "rejected")
    regex_flag = any(HALLU_RE.search(str(row[side])) is not None for side in sides)
    many_words = any(row[f"{side}_word_count"] >= HALLU_WORDS for side in sides)
    return bool(regex_flag or many_words)

# Flag every record. The result is cast to bool so the column is a genuine
# boolean mask even if is_hallu hands back a truthy/falsy non-bool (such as a
# regex match object or None); boolean indexing and `~` both need bool dtype.
df["hallucinated"] = df.apply(is_hallu, axis=1).astype(bool)
df_hallu = df[df["hallucinated"]]

# --------------------- 4. Spanish language check ---------------------
def es(txt):
    """Return True only when ``txt`` is a non-blank string detected as "es".

    Non-string values, empty or whitespace-only strings, and texts on which
    ``detect`` raises ``LangDetectException`` all yield False.
    """
    if not isinstance(txt, str) or not txt.strip():
        return False
    try:
        language = detect(txt)
    except LangDetectException:
        return False
    return language == "es"

# langdetect is non-deterministic by default, so short or ambiguous texts can
# get a different label on each run. Pinning the seed makes the Spanish filter
# (and therefore the final record count) reproducible.
DetectorFactory.seed = 0

df["chosen_es"]   = df["chosen"].apply(es)
df["rejected_es"] = df["rejected"].apply(es)
# A record counts as Spanish only when BOTH texts were detected as Spanish.
df["all_es"]      = df["chosen_es"] & df["rejected_es"]
df_not_es         = df[~df["all_es"]]


In [25]:

# ---------------------- 5. Initial metrics ---------------------------
# Quality summary of the raw frame, written to CSV as (metrica, valor) rows.
# The threshold shown in the two label keys is built from the configuration
# constants so the label can never disagree with the cut actually applied
# (long_mask uses `> MAX_WORDS_FILE`, is_hallu uses `>= HALLU_WORDS`).
summary = {
    # Non-blank lines seen in the input: parsed records plus invalid JSON lines.
    "total_lineas_leidas":        len(rows) + bad_json,
    "json_invalido":              bad_json,
    "registros_df":               len(df),
    "null_chosen":                df["chosen"].isna().sum(),
    "null_rejected":              df["rejected"].isna().sum(),
    f"exceso_longitud(>{MAX_WORDS_FILE})":  int(long_mask.sum()),
    f"alucinados(regex|>={HALLU_WORDS})":   len(df_hallu),
    "no_espanol":                 len(df_not_es),
    # -- Averages before cleaning --
    "mean_chosen_words_before":   df["chosen_word_count"].mean().round(2),
    "mean_rejected_words_before": df["rejected_word_count"].mean().round(2),
}

pd.DataFrame(list(summary.items()), columns=["metrica", "valor"])\
  .to_csv(CSV_DIR / "resumen_calidad_dp.csv", index=False)

# --------------------- 6. Final clean dataset ------------------------
# A record survives only if it is not over-long, not flagged as hallucinated,
# and both of its texts were detected as Spanish.
rejected_by_any_filter = long_mask | df["hallucinated"] | ~df["all_es"]
clean_mask = ~rejected_by_any_filter
df_clean = df.loc[clean_mask].copy()

# -- Averages after cleaning --
mean_chosen_after, mean_rejected_after = (
    df_clean[f"{side}_word_count"].mean().round(2) for side in ("chosen", "rejected")
)

print(f"✅ Registros finales: {len(df_clean):,}")
print(f"   • conversation_id únicos: {df_clean['conversation_id'].nunique():,}")
print(f"   • Media palabras CHOSEN   → antes: {summary['mean_chosen_words_before']} | después: {mean_chosen_after}")
print(f"   • Media palabras REJECTED → antes: {summary['mean_rejected_words_before']} | después: {mean_rejected_after}")

# Save descriptive statistics (count, mean, std, etc.) of both word-count
# columns: once for the raw frame and once for the cleaned one.
word_count_cols = ["chosen_word_count", "rejected_word_count"]
stats_before = df[word_count_cols].describe().transpose()
stats_after = df_clean[word_count_cols].describe().transpose()
stats_before.to_csv(CSV_DIR / "stats_before_clean.csv")
stats_after.to_csv(CSV_DIR / "stats_after_clean.csv")


✅ Registros finales: 2,103
   • conversation_id únicos: 854
   • Media palabras CHOSEN   → antes: 120.49 | después: 121.51
   • Media palabras REJECTED → antes: 22.34 | después: 34.88


In [26]:

# ----------------------- 7. Figures ---------------------------------
# Bar chart of discard reasons. Only the per-reason counts are plotted:
# the chart is titled "discarded records - reasons", so totals, null counts
# and mean word counts do not belong on it.
# The masks behind these counts are computed independently of each other, so
# a single record can be counted under more than one reason.
discard_counts = {
    "json_invalido": bad_json,
    f"exceso_longitud(>{MAX_WORDS_FILE})": int(long_mask.sum()),
    f"alucinados(regex|>={HALLU_WORDS})": len(df_hallu),
    "no_espanol": len(df_not_es),
}

fig, ax = plt.subplots(figsize=(8, 4))
ax.bar(list(discard_counts.keys()), list(discard_counts.values()), color="indianred")
ax.set_title("Registros descartados – motivos")
ax.set_ylabel("Cantidad")
# Tilt the category labels so the long names do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45, ha="right")
fig.tight_layout()
fig.savefig(FIG_DIR / "descartes_dp.png", dpi=150)
plt.close(fig)

# One histogram (with KDE overlay) per word-count column of the clean dataset;
# each figure is saved to disk and closed right away.
histogram_specs = {
    "chosen_word_count": "Longitud «chosen»",
    "rejected_word_count": "Longitud «rejected»",
}
for col, ttl in histogram_specs.items():
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.histplot(df_clean[col], bins=20, kde=True, color="steelblue", ax=ax)
    ax.set_title(f"{ttl} (palabras) – dataset limpio")
    ax.set_xlabel("Número de palabras")
    fig.tight_layout()
    fig.savefig(FIG_DIR / f"{col}_hist_dp.png", dpi=150)
    plt.close(fig)

# ------------------- 8. Export subsets ------------------------------
# Write the clean dataset and each discard subset as JSON Lines (one record
# per line, non-ASCII characters kept as-is) inside CSV_DIR.
exports = [
    (df_clean, "out_dp_clean.jsonl"),
    (df_long, "descartes_longitud_dp.jsonl"),
    (df_hallu, "descartes_hallu_dp.jsonl"),
    (df_not_es, "descartes_no_es_dp.jsonl"),
]
for frame, filename in exports:
    frame.to_json(CSV_DIR / filename, orient="records", lines=True, force_ascii=False)
