In [24]:
import argparse
import csv
import json
import heapq
import torch
import numpy as np
from pathlib import Path
from tqdm import tqdm

In [25]:
def _find_file(root: Path, name: str):
    direct = root / name
    if direct.exists(): return direct
    matches = list(root.rglob(name))
    return matches[0] if matches else None

In [26]:
def _load_nodes(path: Path):
    with path.open("r", encoding="utf-8") as f:
        reader = csv.DictReader(f)
        col = next((c for c in reader.fieldnames if c.lower() in ["node_id", "id", "asn"]), reader.fieldnames[0])
        ids = [int(row[col]) for row in reader if row[col].strip()]
        return set(ids)

In [27]:
def _dijkstra(adj, root, terminals, cost_map):
    dist = [float("inf")] * len(adj)
    prev = [-1] * len(adj)
    dist[root], heap = 0.0, [(0.0, root)]
    
    while heap:
        d, u = heapq.heappop(heap)
        if d > dist[u]: continue
        for v, w in adj[u]:
            if d + w < dist[v]:
                dist[v], prev[v] = d + w, u
                heapq.heappush(heap, (dist[v], v))
    
    edges = set()
    for t in terminals:
        curr = t
        while curr != root and curr != -1:
            p = prev[curr]
            if p == -1: break
            edges.add(tuple(sorted((p, curr))))
            curr = p
            
    return sum(cost_map.get(e, 0) for e in edges)

In [28]:
# Root folders: everything is read from and written to the same outputs tree.
input_dir = Path("../outputs")
output_dir = Path("../outputs")

# Sub-folders used by the rest of the notebook.
snapshot_dir = input_dir / "snapshots"
instance_dir = input_dir / "instances"
label_dir = input_dir / "labels"
validate_dir = output_dir / "validades"

# Create every folder up front (no error if it already exists).
for _folder in (snapshot_dir, instance_dir, label_dir, validate_dir):
    _folder.mkdir(parents=True, exist_ok=True)

In [29]:
# Resolve the four input files; each lookup yields None when the file is absent.
nodes_csv, snaps_txt = (
    _find_file(snapshot_dir, filename) for filename in ("nodes.csv", "snapshots.txt")
)
inst_jsonl = _find_file(instance_dir, "instances.jsonl")
lab_jsonl = _find_file(label_dir, "labels.jsonl")

prepared_dirs = ", ".join(str(folder) for folder in (snapshot_dir, instance_dir, label_dir))
print(f"  -> diretórios preparados: snapshots, instances, labels: {prepared_dirs}")


  -> diretórios preparados: snapshots, instances, labels: ../outputs/snapshots, ../outputs/instances, ../outputs/labels


In [30]:
# Fail early with the name of the missing input. `_find_file` returns None for
# a missing file, so each handle is compared against None before any Path
# method is called on it (calling `.exists()` on None raised AttributeError).
if nodes_csv is None:
    raise FileNotFoundError("nodes.csv")
if snaps_txt is None:
    raise FileNotFoundError("snapshots.txt")
if inst_jsonl is None or not inst_jsonl.exists():
    raise FileNotFoundError("instances.jsonl")
if lab_jsonl is None or not lab_jsonl.exists():
    raise FileNotFoundError("labels.jsonl")

In [31]:
def _nonblank_lines(path):
    """Return the lines of a UTF-8 text file that are not empty or whitespace-only."""
    return [line for line in path.read_text(encoding="utf-8").splitlines() if line.strip()]


# Node ids, snapshot ids (one per line) and the two JSON-lines files.
node_set = _load_nodes(nodes_csv)
snap_ids = [line.strip() for line in _nonblank_lines(snaps_txt)]
instances = list(map(json.loads, _nonblank_lines(inst_jsonl)))
labels = list(map(json.loads, _nonblank_lines(lab_jsonl)))

In [32]:
# Loaded graphs keyed by snapshot id, plus running node/edge totals.
graphs = dict()
total_n = total_e = 0

In [33]:
# Start from a clean slate so that re-running this cell does not double-count
# the node/edge totals or keep graphs of snapshots no longer listed.
graphs = {}
total_n = 0
total_e = 0

for sid in tqdm(snap_ids, desc="validate_snapshots"):
    # Preferred file names first, searched recursively under input_dir.
    p = _find_file(input_dir, f"as_graph_{sid}.pt") or _find_file(input_dir, f"{sid}.pt")
    if p is None:
        # Last resort: any .pt directly inside input_dir whose name contains
        # the snapshot id. Sorted so the pick is deterministic when several match.
        matches = sorted(input_dir.glob(f"*{sid}*.pt"))
        if not matches:
            raise FileNotFoundError(f"Não foi possível encontrar o grafo para o snapshot {sid}")
        p = matches[0]

    # NOTE: torch.load unpickles the file; only load snapshots from a trusted source.
    g = torch.load(p, map_location="cpu")
    graphs[sid] = g
    total_n += int(g.get("num_nodes", 0))
    total_e += int(g["edge_index"].size(1))

validate_snapshots: 100%|██████████| 120/120 [00:31<00:00,  3.79it/s]


In [34]:
# Accumulators filled while validating labels: counters are ints, per-label
# series are lists.
st = dict(
    conn=0,
    hits=0,
    total_edges=0,
    nodes=[],
    edges=[],
    saving=[],
    steiner=0,
)

In [35]:
def _build_snapshot_index(g, default_num_nodes):
    """Build the adjacency list and undirected edge-cost map of one snapshot.

    Args:
        g: Mapping holding an ``edge_index`` tensor whose rows 0 and 1 are the
            two endpoints of each edge, and optionally ``edge_type`` (one
            value per edge) and ``num_nodes``.
        default_num_nodes: Node count used when ``g`` has no ``num_nodes``.

    Returns:
        ``(adj, cost_map)``. ``adj[u]`` lists ``(v, cost)`` for every edge
        touching ``u`` (each edge is stored in both directions). ``cost_map``
        maps each sorted ``(u, v)`` pair to the cheapest cost seen for it.
        Edges of type 0 cost 1.0, all other types cost 1.2.
    """
    ei = g["edge_index"]
    et = g.get("edge_type", torch.zeros(ei.size(1), dtype=torch.long))

    # Convert to plain Python ints in bulk: indexing the tensor one element at
    # a time is far slower on large edge lists.
    src = ei[0].long().tolist()
    dst = ei[1].long().tolist()
    types = et.reshape(-1).long().tolist()

    # Never allocate fewer slots than the largest node id used by an edge.
    num_nodes = max(
        int(g.get("num_nodes", default_num_nodes)),
        max(src, default=-1) + 1,
        max(dst, default=-1) + 1,
    )

    adj = [[] for _ in range(num_nodes)]
    cost_map = {}
    for i, (u, v) in enumerate(zip(src, dst)):
        c = 1.0 if types[i] == 0 else 1.2
        pair = (u, v) if u <= v else (v, u)
        if pair not in cost_map or c < cost_map[pair]:
            cost_map[pair] = c
        adj[u].append((v, c))
        adj[v].append((u, c))
    return adj, cost_map


# Fresh accumulators so that re-running this cell does not double-count.
st = {
    "conn": 0,
    "hits": 0,
    "total_edges": 0,
    "nodes": [],
    "edges": [],
    "saving": [],
    "steiner": 0
}

# Index of the most recently used snapshot. Only one is kept in memory at a
# time; consecutive labels on the same snapshot reuse it instead of
# rebuilding it from the edge tensor.
cached_key = None
adj = cost_map = None

for lab in tqdm(labels, desc="validate_labels"):
    snap_key = lab.get("snapshot_next") or lab.get("snapshot")
    if snap_key not in graphs:
        raise KeyError(f"Snapshot {snap_key} não está carregado em graphs")
    if adj is None or snap_key != cached_key:
        adj, cost_map = _build_snapshot_index(graphs[snap_key], len(node_set))
        cached_key = snap_key

    tree_nodes = [int(n) for n in lab["tree_nodes"]]
    tree_edges = [(int(u), int(v)) for u, v in lab["tree_edges"]]
    terminals_out = [int(t) for t in lab["terminals_out"]]
    root = int(lab["root"])

    # Count label edges that exist in the snapshot and add up the tree cost.
    pcst_cost = 0.0
    for u, v in tree_edges:
        st["total_edges"] += 1
        pair = (u, v) if u <= v else (v, u)
        if pair in cost_map:
            st["hits"] += 1
            pcst_cost += cost_map[pair]

    # Adjacency of the labelled tree, restricted to edges between tree nodes.
    tree_node_set = set(tree_nodes)
    adj_tree = {n: [] for n in tree_node_set}
    for u, v in tree_edges:
        if u in adj_tree and v in adj_tree:
            adj_tree[u].append(v)
            adj_tree[v].append(u)

    # Reachability from the root; visiting order is irrelevant, so a stack
    # (O(1) pop) replaces the list-based queue.
    seen = {root}
    stack = [root]
    while stack:
        u = stack.pop()
        for v in adj_tree.get(u, ()):
            if v not in seen:
                seen.add(v)
                stack.append(v)

    # Connected: the root reaches exactly the tree nodes, terminals included.
    if seen == tree_node_set and all(t in seen for t in terminals_out):
        st["conn"] += 1

    # Baseline: union of shortest root-to-terminal paths in the full snapshot.
    base_cost = _dijkstra(adj, root, terminals_out, cost_map)
    st["saving"].append(1.0 - (pcst_cost / base_cost) if base_cost > 0 else 0.0)
    st["nodes"].append(len(tree_nodes))
    st["edges"].append(len(tree_edges))
    st["steiner"] += (len(tree_nodes) - len(terminals_out) - 1)

validate_labels: 100%|██████████| 595/595 [57:34<00:00,  5.81s/it]


In [36]:
# Denominators shared by several ratios in the report.
labels_count, nodes_sum = len(labels), sum(st["nodes"])

In [37]:
def _ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is not positive."""
    return (numerator / denominator) if denominator > 0 else 0.0


def _mean(values):
    """Return the mean of ``values`` as a float, or 0.0 when ``values`` is empty."""
    return float(np.mean(values)) if values else 0.0


graphs_checked = len(snap_ids)

# Summary of the validation run; key order is the order written to disk.
report = {
    "input_dir": str(input_dir),
    "output_dir": str(output_dir),
    "nodes_count": len(node_set),
    "graphs_found": sum(1 for _ in input_dir.rglob("*.pt")),
    "graphs_checked": graphs_checked,
    "avg_nodes_per_graph_checked": _ratio(total_n, graphs_checked),
    "avg_edges_per_graph_checked": _ratio(total_e, graphs_checked),
    "instances_found": len(instances),
    "instances_checked": labels_count,
    "labels_tree_connected": _ratio(st["conn"], labels_count),
    "labels_physical_edge_hit_ratio": _ratio(st["hits"], st["total_edges"]),
    "metrics_avg_tree_nodes": _mean(st["nodes"]),
    "metrics_avg_tree_edges": _mean(st["edges"]),
    "metrics_avg_steiner_ratio": _ratio(st["steiner"], nodes_sum),
    "metrics_avg_cost_saving": _mean(st["saving"]),
}

In [38]:
# Show the report as indented JSON, keeping non-ASCII characters readable.
report_pretty = json.dumps(report, indent=2, ensure_ascii=False)
print("  -> validações geradas:", report_pretty, sep="\n")

  -> validações geradas:
{
  "input_dir": "../outputs",
  "output_dir": "../outputs",
  "nodes_count": 101606,
  "graphs_found": 120,
  "graphs_checked": 120,
  "avg_nodes_per_graph_checked": 101606.0,
  "avg_edges_per_graph_checked": 1000871.0833333334,
  "instances_found": 595,
  "instances_checked": 595,
  "labels_tree_connected": 1.0,
  "labels_physical_edge_hit_ratio": 1.0,
  "metrics_avg_tree_nodes": 40.31596638655462,
  "metrics_avg_tree_edges": 39.31596638655462,
  "metrics_avg_steiner_ratio": 0.5090461897615475,
  "metrics_avg_cost_saving": 0.024164370262385593
}


In [39]:
# Persist the report as a single JSON line; the cell displays the number of
# characters written because the write call is its last expression.
report_path = validate_dir / "validate.jsonl"
report_line = json.dumps(report, ensure_ascii=False) + "\n"
report_path.write_text(report_line, encoding="utf-8")

522