In [3]:
# Input files per year: the cluster file name (without directory or extension)
# and the path of the organisational-structure CSV used for that year.
data = {
    2023: {
        "cluster_file": "clusters_2023_sustantivas_5_v3",
        "tree_file": "../data/estructura/2023_12_09.csv",
    },
    2025: {
        "cluster_file": "clusters_2025_sustantivas_5_v3",
        "tree_file": "../data/estructura/2025_07_08.csv",
    },
}

In [4]:
# Key into `data` selecting which year's files the extraction cells work on.
year: int = 2025
# Forwarded to `partial_matrixes` when the heatmap matrices are built.
central_administration_only: bool = True
# Read by `statistics_for_cluster`: multiplied by the "objetivos" weight to get
# the minimum objective score a pair needs to be reported as a high overlap.
objective_threshold: float = 0.6

In [None]:
# Get the occurrences of a single cluster
from chainsaw.heatmaps.llm_extraction import LLMExtraction
from chainsaw.heatmaps.constants import DimensionName

cluster_id = 0
# Dimensions handed to `execute` through its `reuse` argument
# (presumably reused from a previous run instead of being recomputed — TODO confirm).
reused_dimensions = [
    DimensionName.OBJECTIVE,
    DimensionName.TARGET,
]
single_cluster_extraction = LLMExtraction(data[year]["cluster_file"])
# Kept as the last expression so the notebook still displays the returned value.
single_cluster_extraction.execute(cluster_id, reuse=reused_dimensions)

In [None]:
# Get the occurrences of every dimension so they can be used to build the heatmaps
import json
from tqdm import tqdm
from collections import Counter
from chainsaw.heatmaps.llm_extraction import LLMExtraction
from chainsaw.heatmaps.constants import DimensionName

# Cluster ids skipped by this cell.
clusters_to_omit = [0]

with open(f"clusters/{data[year]['cluster_file']}.json", "r", encoding="utf-8") as f:
    cluster_file_content = json.load(f)

# One cluster id per entry of the cluster file, omitted clusters excluded.
kept_cluster_ids = [
    entry["cluster"]
    for entry in cluster_file_content["clusters_data"]
    if entry["cluster"] not in clusters_to_omit
]
# (cluster id, number of entries) pairs, smallest cluster first.
clusters_by_size = sorted(Counter(kept_cluster_ids).items(), key=lambda pair: pair[1])

progress = tqdm(clusters_by_size, total=len(clusters_by_size), desc="Cluster", leave=False)
for cluster, size in progress:
    extraction = LLMExtraction(data[year]["cluster_file"])
    extraction.execute(
        cluster,
        reuse=[
            # Uncomment the dimensions that should be passed as `reuse`:
            # DimensionName.OBJECTIVE,
            # DimensionName.TARGET,
            # DimensionName.ENVIRONMENT,
        ],
    )

In [None]:
# Different weights for each dimension, producing several possible scenarios for the JSON outputs.
# Each tuple follows the order of `_WEIGHT_DIMENSIONS`.
_WEIGHT_DIMENSIONS = ("objetivos", "distancia", "destinatarios", "ambitos")
WEIGHTS_TO_EXECUTE = [
    dict(zip(_WEIGHT_DIMENSIONS, scenario))
    for scenario in (
        (0.6, 0.2, 0.1, 0.1),
        (0.5, 0.3, 0.1, 0.1),
        (0.4, 0.4, 0.1, 0.1),
        (0.5, 0.4, 0.1, 0),
        (0.5, 0.45, 0.05, 0),
        (0.6, 0.3, 0.1, 0),
    )
]

In [None]:
# Guardar archivo json final, con todos los heatmaps del año indicado
import json
from collections import defaultdict
from chainsaw.heatmaps.plot import (
    partial_matrixes,
    _final_matrix,
    _apply_dimension_weights,
)


def statistics_for_cluster(units_order, matrixes, weights, unit_similarity_threshold):
    """Summarise one cluster: unit counts plus the unit pairs with a high overlap.

    Reads the module-level ``objective_threshold`` defined in the configuration cell.

    Args:
        units_order: mapping whose values are dicts holding ``"idx"`` (the unit's
            position in the matrices) and ``"unit"`` (a dict providing ``"uuid"``,
            ``"name"``, ``"jurisdiction"`` and ``"path"``).
        matrixes: per-dimension matrices indexed ``[row][column]`` by unit position;
            the ``"objetivos"`` and ``"distancia"`` entries are read here and the
            whole mapping is forwarded to ``_final_matrix``.
        weights: weight per dimension; ``weights["objetivos"]`` scales the objective
            cut-off and the whole mapping is forwarded to ``_final_matrix``.
        unit_similarity_threshold: minimum value of the final matrix for a pair of
            units to be reported.

    Returns:
        dict with ``cant_unidades``, ``cant_diadas`` (units squared),
        ``cant_jurisdicciones``, ``cant_unidades_por_jurisdiccion`` and
        ``superposicion``; the latter has one entry for ``"estricto"`` and one for
        ``"no_estricto"`` with the sum/average of the final matrix and the list of
        reported pairs. For a cluster without units the averages and percentages
        are ``0.0`` instead of raising ``ZeroDivisionError``.
    """
    units_amount = len(units_order)
    units_with_position = {entry["idx"]: entry["unit"] for entry in units_order.values()}

    units_by_jurisdiction = defaultdict(list)
    for unit in units_with_position.values():
        units_by_jurisdiction[unit["jurisdiction"]].append(unit)
    # Every ordered pair is counted, including each unit paired with itself.
    cluster_size = units_amount * units_amount

    statistics = {
        "cant_unidades": units_amount,
        "cant_diadas": cluster_size,
        "cant_jurisdicciones": len(units_by_jurisdiction),
        "cant_unidades_por_jurisdiccion": {j: len(units) for j, units in units_by_jurisdiction.items()},
        "superposicion": {},
    }

    # Loop-invariant: the objective matrix is compared against the threshold
    # scaled by the objective weight.
    objective_cutoff = objective_threshold * weights["objetivos"]

    for strict in (True, False):
        matrix = _final_matrix(units_order, matrixes, weights, strict)
        total = sum(sum(row) for row in matrix)
        # An empty cluster has no pairs: report 0.0 rather than dividing by zero.
        average = total / cluster_size if cluster_size else 0.0

        highest_similarities = []
        highest_units = set()
        # Only the upper triangle is visited, so each unordered pair is reported once.
        for i in range(units_amount):
            for j in range(i + 1, units_amount):
                similarity = matrix[i][j]
                if (
                    similarity >= unit_similarity_threshold
                    and matrixes["objetivos"][i][j] > objective_cutoff
                    and matrixes["distancia"][i][j] > 0
                ):
                    row_unit = units_with_position[i]
                    column_unit = units_with_position[j]
                    highest_similarities.append({
                        "fila": i,
                        "columna": j,
                        "unidad_fila": row_unit["name"],
                        "unidad_columna": column_unit["name"],
                        "uuid_fila": row_unit["uuid"],
                        "uuid_columna": column_unit["uuid"],
                        "jurisdiccion_fila": row_unit["jurisdiction"],
                        "jurisdiccion_columna": column_unit["jurisdiction"],
                        "path_fila": row_unit["path"],
                        "path_columna": column_unit["path"],
                        "similitud_coseno": similarity,
                    })
                    highest_units.add(row_unit["uuid"])
                    highest_units.add(column_unit["uuid"])

        overlapped_percentage = len(highest_units) / units_amount * 100 if units_amount else 0.0
        key = "estricto" if strict else "no_estricto"
        statistics["superposicion"][key] = {
            "suma_superposicion": total,
            "promedio_superposicion": average,
            "cant_diadas_superposicion_alta": len(highest_similarities),
            "cant_unidades_superposicion_alta": len(highest_units),
            "porcentaje_unidades_superpuestas": overlapped_percentage,
            "diadas_superposicion_alta": highest_similarities,
        }
    return statistics


def global_statistics(
    global_units_amount,
    global_clustered_units_amount,
    global_pairs_amount,
    global_sum_all_similarities,
    global_highest_similarity_units_amount,
    unit_similarity_threshold,
    clusters_to_omit,
    weights,
):
    """Assemble the statistics that span all clusters of one output file.

    Args:
        global_units_amount: total number of units taken into account.
        global_clustered_units_amount: number of units belonging to the
            accumulated clusters.
        global_pairs_amount: number of unit pairs summed over those clusters.
        global_sum_all_similarities: mapping with ``"estricto"`` and
            ``"no_estricto"`` sums of the final matrices.
        global_highest_similarity_units_amount: mapping with ``"estricto"`` and
            ``"no_estricto"`` counts of units reported as highly overlapped.
        unit_similarity_threshold: threshold stored under ``umbral_similitud``.
        clusters_to_omit: stored as-is under ``clusters_omitidos``.
        weights: stored as-is under ``ponderadores``.

    Returns:
        dict with the unit counts, the threshold, the per-mode overlap figures
        under ``superposicion``, the omitted clusters and the weights. Averages
        and percentages whose denominator is zero are reported as ``0.0``
        instead of raising ``ZeroDivisionError``.
    """
    def _ratio(numerator, denominator):
        # Guarded division: nothing to average when no pairs/units were accumulated.
        return numerator / denominator if denominator else 0.0

    similarity_statistics = {}
    for key in ("estricto", "no_estricto"):
        overlapped_units = global_highest_similarity_units_amount[key]
        similarity_statistics[key] = {
            "cant_unidades_superpuestas": overlapped_units,
            "promedio_global_superposicion": _ratio(global_sum_all_similarities[key], global_pairs_amount),
            "porcentaje_superposicion_unidades_clusterizadas": _ratio(overlapped_units, global_clustered_units_amount) * 100,
            "porcentaje_superposicion_unidades_totales": _ratio(overlapped_units, global_units_amount) * 100,
        }

    return {
        "cant_global_unidades": global_units_amount,
        "cant_unidades_clusterizadas": global_clustered_units_amount,
        "cant_unidades_restantes": global_units_amount - global_clustered_units_amount,
        "umbral_similitud": unit_similarity_threshold,
        "superposicion": similarity_statistics,
        "clusters_omitidos": clusters_to_omit,
        "ponderadores": weights,
    }


# Build and write one final JSON (heatmaps plus statistics) for every combination of
# thresholds, year and weight scenario.
for cosine_similarity_threshold in [0.70]:
    for unit_similarity_threshold in [0.6]:
        # NOTE: `year` and `clusters_to_omit` rebind the notebook-level variables of the
        # same name set in earlier cells. `limit` is the exclusive upper bound of the
        # cluster ids processed for that year.
        for year, limit, clusters_to_omit in [(2023, 1, [-1]), (2025, 13, [-1])]:
            # Bound once so the f-strings below need no nested quotes (reusing the
            # enclosing quote type inside an f-string requires Python 3.12+).
            cluster_file = data[year]["cluster_file"]
            for weights in WEIGHTS_TO_EXECUTE:
                # Reloaded for every scenario because the loaded content is extended
                # in place below before being written out.
                with open(f"clusters/{cluster_file}.json", "r", encoding="utf-8") as f:
                    cluster_file_content = json.load(f)

                cluster_file_content["heatmaps"] = {}
                global_clustered_units_amount = 0
                global_pairs_amount = 0
                global_omited_units_amount = 0
                global_sum_all_similarities = defaultdict(float)
                global_highest_similarity_units_amount = defaultdict(int)

                # Partial matrices are computed for cluster -1 as well, so that it takes
                # part in `_apply_dimension_weights`; no heatmap is emitted for it below.
                all_matrixes = {}
                for cluster_id in range(-1, limit):
                    matrixes, units_order = partial_matrixes(
                        cluster_file,
                        cluster_id,
                        data[year]["tree_file"],
                        central_administration_only,
                        cosine_similarity_threshold,
                        list(weights.keys()),
                    )
                    all_matrixes[cluster_id] = {"matrixes": matrixes, "units_order": units_order}

                all_matrixes = _apply_dimension_weights(all_matrixes, weights)
                for cluster_id in range(0, limit):
                    matrixes = all_matrixes[cluster_id]["matrixes"]
                    units_order = all_matrixes[cluster_id]["units_order"]
                    cluster_data = {}
                    # Units listed by matrix position; entries whose "unit" is falsy are skipped.
                    cluster_data["orden"] = [
                        {"uuid": unit["uuid"], "nombre": unit["name"], "jurisdiccion": unit["jurisdiction"]}
                        for e in sorted(units_order.values(), key=lambda each: each["idx"])
                        if (unit := e["unit"])
                    ]
                    cluster_data.update(matrixes)
                    cluster_data["estadisticas"] = statistics_for_cluster(units_order, matrixes, weights, unit_similarity_threshold)
                    cluster_file_content["heatmaps"][str(cluster_id)] = cluster_data

                    # Accumulate the global statistics
                    cluster_statistics = cluster_data["estadisticas"]
                    if cluster_id in clusters_to_omit:
                        # NOTE(review): with clusters_to_omit == [-1] and ids taken from
                        # range(0, limit) this branch never runs, so the omitted-units
                        # count stays 0 — confirm that this is intended.
                        global_omited_units_amount += cluster_statistics["cant_unidades"]
                    else:
                        global_clustered_units_amount += cluster_statistics["cant_unidades"]
                        global_pairs_amount += cluster_statistics["cant_diadas"]
                        for key in ("estricto", "no_estricto"):
                            overlap = cluster_statistics["superposicion"][key]
                            global_sum_all_similarities[key] += overlap["suma_superposicion"]
                            global_highest_similarity_units_amount[key] += overlap["cant_unidades_superposicion_alta"]

                # Store the global statistics
                cluster_file_content["estadisticas"] = global_statistics(
                    len(cluster_file_content["objectives"]) - global_omited_units_amount,
                    global_clustered_units_amount,
                    global_pairs_amount,
                    global_sum_all_similarities,
                    global_highest_similarity_units_amount,
                    unit_similarity_threshold,
                    clusters_to_omit,
                    weights,
                )

                # NOTE(review): int() truncates, so a weight of 0.45 is written as 4 and
                # 0.05 as 0 in the file name; left unchanged because other code may
                # depend on the existing file names.
                sufix = (
                    f"obj{int(weights['objetivos']*10)}"
                    f"_dist{int(weights['distancia']*10)}"
                    f"_dest{int(weights['destinatarios']*10)}"
                    f"_amb{int(weights['ambitos']*10)}"
                    f"_threshold{int(unit_similarity_threshold*10)}"
                    f"_cos_sim{int(cosine_similarity_threshold*100)}"
                )
                with open(f"finals/{cluster_file}_{sufix}.json", "w", encoding="utf-8") as f:
                    json.dump(cluster_file_content, f, ensure_ascii=False, indent=4)