In [1]:
import re
import csv
import json
import spacy
import unicodedata
from tqdm import tqdm
from pathlib import Path
from collections import defaultdict


# Load the small Spanish spaCy pipeline once at module level so that
# eliminar_verbos can reuse the same object for every text it processes.
# The "es_core_news_sm" model package must be installed in the kernel's environment.
nlp = spacy.load("es_core_news_sm")


def normalizar_texto(texto: str) -> str:
    """Lower-case ``texto`` and reduce it to space-separated alphanumeric words.

    Only ``a-z``, ``0-9`` and the Spanish letters ``áéíóúüñ`` are kept. Every
    run of any other character (punctuation, symbols, tabs, newlines, ...) is
    replaced by a single space, and leading/trailing spaces are removed.

    Args:
        texto: Raw text to normalise.

    Returns:
        The normalised text; words are separated by exactly one space.
    """
    # Compose to NFC first: in decomposed (NFD) input an accented letter is a
    # base letter plus a combining mark, and the whitelist below would drop the
    # mark and silently lose the accent.
    texto = unicodedata.normalize('NFC', texto).lower()
    # Replace (rather than delete) disallowed runs so that words joined only by
    # punctuation ("salud/educación") stay separate, and so that removing a
    # symbol surrounded by spaces ("a - b") cannot leave a double space behind.
    texto = re.sub(r'[^a-z0-9áéíóúüñ]+', ' ', texto)
    return texto.strip()


def eliminar_verbos(texto: str) -> list[str]:
    """Return the accent-free lemmas of the content words found in ``texto``.

    The text is run through the module-level ``nlp`` pipeline. A token is
    discarded when it is tagged ``VERB``, is a stop word, or is not purely
    alphabetic. The lemma of each remaining token has its combining marks
    removed, and lemmas of three characters or fewer are dropped.

    Args:
        texto: Text to analyse.

    Returns:
        The surviving lemmas, in document order.
    """

    def _sin_marcas(palabra: str) -> str:
        # NFD splits each accented letter into base letter + combining mark
        # (category "Mn"); dropping the marks leaves the bare letters.
        # Gotcha: the tilde of "ñ" is such a mark too, so "ñ" becomes "n".
        descompuesta = unicodedata.normalize('NFD', palabra)
        return ''.join(ch for ch in descompuesta if unicodedata.category(ch) != 'Mn')

    lemas = []
    for token in nlp(texto):
        if token.pos_ == "VERB" or token.is_stop or not token.is_alpha:
            continue
        lema = _sin_marcas(token.lemma_)
        # Very short lemmas are filtered after accent stripping.
        if len(lema) > 3:
            lemas.append(lema)
    return lemas


def generar_ngrams(tokens: list[str], n: int) -> list[str]:
    """Build every contiguous ``n``-gram of ``tokens`` as a space-joined string.

    Args:
        tokens: Ordered sequence of tokens (must support ``len`` and slicing).
        n: Size of each n-gram; must be at least 1.

    Returns:
        The n-grams in order of appearance. Empty when ``tokens`` has fewer
        than ``n`` elements.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    # Without this guard n == 0 yields len(tokens) + 1 empty strings and a
    # negative n yields arbitrary slices, both of which would pollute counts.
    if n < 1:
        raise ValueError(f"n must be >= 1, got {n}")
    return [' '.join(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]

In [2]:
# Years to analyse. Each one is read from
# clusters/clusters_<year>_sustantivas_5_v3.json (path relative to the working directory).
YEARS = [2023, 2025]
NGRAM_SIZES = [1, 2, 3]

# apariciones[year][ngram] -> set of unit uuids whose text contains that n-gram.
apariciones = {anio: defaultdict(set) for anio in YEARS}
# ngram_counts[year][ngram] -> number of distinct units containing that n-gram.
ngram_counts = {anio: {} for anio in YEARS}

for YEAR in YEARS:
    ruta = Path(f"clusters/clusters_{YEAR}_sustantivas_5_v3.json")
    with ruta.open("r", encoding="utf-8") as f:
        data = json.load(f)

    # "objectives" is looked up by unit uuid; "clusters_data" is the list of
    # units, each read through its "name" and "uuid" keys.
    objectives = data.get("objectives", {})
    units = data.get("clusters_data", [])

    for unit in tqdm(units, total=len(units), desc=f"Procesando unidades {YEAR}", leave=False):
        unit_name = unit["name"]
        unit_uuid = unit["uuid"]
        # A unit with no objective (missing key or null value) is still indexed
        # by its name instead of raising KeyError or injecting the word "None".
        text = objectives.get(unit_uuid) or ""
        # N-grams are built from the already-filtered lemma list, so a bigram
        # may join two words that were separated by a removed verb/stop word.
        tokens = eliminar_verbos(normalizar_texto(f"{unit_name} {text}"))
        for n in NGRAM_SIZES:
            for ng in generar_ngrams(tokens, n):
                # Storing uuids in a set counts each unit once per n-gram,
                # however many times the n-gram repeats inside that unit.
                apariciones[YEAR][ng].add(unit_uuid)

    ngram_counts[YEAR] = {ng: len(ids) for ng, ids in apariciones[YEAR].items()}

                                                                           

In [4]:
# Export the n-grams of one year whose unit count lies in [MIN_TEXTS, MAX_TEXTS].
year = 2025
MIN_TEXTS = 5    # keep n-grams present in at least this many units
MAX_TEXTS = 20   # ...and in at most this many units

ngram_filtrados = {
    ngram: cantidad
    for ngram, cantidad in ngram_counts[year].items()
    if not (cantidad < MIN_TEXTS or cantidad > MAX_TEXTS)
}

# Most frequent first; ties are broken alphabetically by n-gram.
ngram_ordenados = sorted(
    ngram_filtrados.items(),
    key=lambda par: (-par[1], par[0]),
)
ruta_csv = Path(f"ngrams_{year}.csv")

# Write the CSV: a header row followed by one (n-gram, count) row per entry.
with ruta_csv.open("w", encoding="utf-8", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["lema", "cantidad"])
    writer.writerows(ngram_ordenados)

print(f"CSV guardado en {ruta_csv}")

CSV guardado en ngrams_2025.csv


In [5]:
# Build the combined vocabulary: every n-gram that passes the
# MIN_TEXTS..MAX_TEXTS unit-count filter in 2023 or in 2025.
ngram_filtrados_2023 = {
    ng for ng, c in ngram_counts[2023].items()
    if MIN_TEXTS <= c <= MAX_TEXTS
}

ngram_filtrados_2025 = {
    ng for ng, c in ngram_counts[2025].items()
    if MIN_TEXTS <= c <= MAX_TEXTS
}

# NOTE: in-place merge. From this line on ngram_filtrados_2023 holds the union
# of both years, not only 2023. Kept as-is in case later cells rely on it.
ngram_filtrados_2023.update(ngram_filtrados_2025)

ruta_csv = Path("ngrams_todos.csv")
with ruta_csv.open("w", encoding="utf-8", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["lema"])
    # A set of strings iterates in hash order, which changes between
    # interpreter runs; sort so the exported file is reproducible.
    for lema in sorted(ngram_filtrados_2023):
        writer.writerow([lema])