In [1]:
import json, re, warnings
from pathlib import Path
from collections import Counter

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from langdetect import DetectorFactory
from langdetect import detect, LangDetectException

# Suppress every warning so library chatter stays out of the cell outputs.
warnings.filterwarnings("ignore")
# Seaborn look for all figures: white grid background, fonts scaled to 0.9.
sns.set_theme(font_scale=0.9, style="whitegrid")



In [None]:

# ---------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------
# Input file and the folders that receive figures and CSV/JSONL outputs.
FILE = Path("data/dp.jsonl")
FIG_DIR = Path("figs_dp")
CSV_DIR = Path("csv_dp")

# Word-count threshold; texts longer than this are flagged as over-length.
MAX_WORDS = 500

# Three or more back-to-back copies of "model que:" (any letter case).
HALLU_RE = re.compile(r"(model que:){3,}", flags=re.IGNORECASE)

# Create the output folders up front; an already existing folder is fine.
for _out_dir in (FIG_DIR, CSV_DIR):
    _out_dir.mkdir(exist_ok=True)



In [6]:
# ---------------------------------------------------------------------
# 1. Load the JSONL file and count corrupt lines
# ---------------------------------------------------------------------
rows = []
bad_json = 0
with FILE.open(encoding="utf-8") as handle:
    # Whitespace-only lines are dropped without being counted as errors.
    for payload in filter(None, map(str.strip, handle)):
        try:
            rows.append(json.loads(payload))
        except json.JSONDecodeError:
            # Keep going; only tally the line that could not be parsed.
            bad_json += 1

print(f"✔️  Registros válidos: {len(rows):,} · JSON inválido: {bad_json:,}")

# One DataFrame row per successfully parsed record.
df = pd.DataFrame(rows)

# ---------------------------------------------------------------------
# 2. Word counts for chosen & rejected
# ---------------------------------------------------------------------
# Missing texts are treated as empty strings, i.e. zero words.
for col in ("chosen", "rejected"):
    tokens = df[col].fillna("").str.split()
    df[f"{col}_word_count"] = tokens.str.len()

# A record is over-length as soon as either side exceeds MAX_WORDS.
chosen_too_long = df["chosen_word_count"].gt(MAX_WORDS)
rejected_too_long = df["rejected_word_count"].gt(MAX_WORDS)
long_mask = chosen_too_long | rejected_too_long

df_long = df.loc[long_mask]
n_long = df_long.shape[0]



✔️  Registros válidos: 4,613 · JSON inválido: 0


In [7]:
# ---------------------------------------------------------------------
# 3. Detectar patrones alucinados
# ---------------------------------------------------------------------
def bad_pattern(txt: str | None) -> bool:
    if not isinstance(txt, str):
        return False
    return bool(HALLU_RE.search(txt))

# Flag a record when either of its two texts matches the pattern.
chosen_hit = df["chosen"].apply(bad_pattern)
rejected_hit = df["rejected"].apply(bad_pattern)
df["hallucinated"] = chosen_hit | rejected_hit

df_hallu = df.loc[df["hallucinated"]]
n_hallu = df_hallu.shape[0]



In [8]:
# ---------------------------------------------------------------------
# 4. Detección de idioma
# ---------------------------------------------------------------------
def lang_es(txt: str | None) -> bool:
    try:
        return detect(txt) == "es" if isinstance(txt, str) and txt.strip() else False
    except LangDetectException:
        return False

# langdetect's algorithm is non-deterministic: without a fixed seed, short or
# ambiguous texts can receive a different label on each run, which would make
# the discarded/clean split irreproducible. Pin the seed before detecting.
DetectorFactory.seed = 0

# A record passes the language check only if BOTH texts are detected as "es".
df["chosen_es"]   = df["chosen"].apply(lang_es)
df["rejected_es"] = df["rejected"].apply(lang_es)
df["all_es"]      = df["chosen_es"] & df["rejected_es"]

# Everything else (including records with a missing/blank text) is set aside.
df_not_es         = df[~df["all_es"]]
n_not_es          = len(df_not_es)



In [None]:
# ---------------------------------------------------------------------
# 5. Overall metrics
# ---------------------------------------------------------------------
# Insertion order is kept on purpose: it is the row order of the CSV below.
summary = {}
summary["total_líneas_leídas"] = len(rows) + bad_json   # parsed + unparseable lines
summary["json_inválido"] = bad_json
summary["registros_df"] = len(df)
summary["null_chosen"] = df["chosen"].isna().sum()
summary["null_rejected"] = df["rejected"].isna().sum()
summary["exceso_longitud"] = n_long
summary["patrones_alucinados"] = n_hallu
summary["no_español"] = n_not_es

# Persist the metrics as a two-column table (metric name, count).
summary_table = pd.DataFrame(list(summary.items()), columns=["métrica", "conteo"])
summary_table.to_csv(CSV_DIR / "resumen_calidad_dp.csv", index=False)


In [10]:

# ---------------------------------------------------------------------
# 6. Keep only the clean records
# ---------------------------------------------------------------------
# A record is kept when both texts are Spanish, neither shows the
# hallucination pattern, and neither exceeds the length limit.
is_spanish = df["all_es"]
not_hallucinated = ~df["hallucinated"]
within_length = ~long_mask
clean_mask = within_length & not_hallucinated & is_spanish

# Copy so later changes to df_clean never touch df.
df_clean = df.loc[clean_mask].copy()

n_conversations = df_clean["conversation_id"].nunique()
print(f"✅ Registros finales: {len(df_clean):,}  |  conversation_id únicos: {n_conversations:,}")


✅ Registros finales: 2,111  |  conversation_id únicos: 851


In [11]:

# ---------------------------------------------------------------------
# 7. Plots
# ---------------------------------------------------------------------
# 7-A. Discard bar chart
# `summary` also stores dataset totals. Those are not discard reasons and
# would dwarf the real bars, so they are left out of the "discarded by
# reason" chart.
NON_DISCARD_KEYS = {"total_líneas_leídas", "registros_df"}
discards = {
    reason: count
    for reason, count in summary.items()
    if reason not in NON_DISCARD_KEYS
}

fig, ax = plt.subplots(figsize=(8, 4))
ax.bar(list(discards.keys()), list(discards.values()), color="indianred")
ax.set_title("Registros descartados por motivo")
ax.set_ylabel("Cantidad")
# Slanted, right-anchored labels keep the long metric names readable.
plt.setp(ax.get_xticklabels(), rotation=45, ha="right")
fig.tight_layout()
fig.savefig(FIG_DIR / "descartes_dp.png", dpi=150)
# The figure is only needed on disk; close it to release its memory.
plt.close(fig)

# 7-B. Length histograms (clean records only), one PNG per text column
HIST_TITLES = {
    "chosen_word_count": "Longitud «chosen»",
    "rejected_word_count": "Longitud «rejected»",
}
for col, title in HIST_TITLES.items():
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.histplot(df_clean[col], bins=20, kde=True, color="steelblue", ax=ax)
    ax.set_title(f"{title} (palabras)")
    ax.set_xlabel("Número de palabras")
    fig.tight_layout()
    fig.savefig(FIG_DIR / f"{col}_hist_dp.png", dpi=150)
    # Close each figure once saved so figures do not pile up in memory.
    plt.close(fig)


In [12]:
# ---------------------------------------------------------------------
# 8. Export
# ---------------------------------------------------------------------
# The clean set first, then one file per discard reason. Every file is
# written as JSON Lines with non-ASCII characters kept as-is.
exports = [
    (df_clean, "out_dp_clean.jsonl"),
    (df_long, "descartes_longitud_dp.jsonl"),
    (df_hallu, "descartes_hallu_dp.jsonl"),
    (df_not_es, "descartes_no_es_dp.jsonl"),
]
for frame, filename in exports:
    frame.to_json(CSV_DIR / filename,
                  orient="records", lines=True, force_ascii=False)

