In [25]:
import numpy as np
import networkx as nx
import json
from collections import defaultdict
import random
from typing import List, Tuple, Dict, Any
import os
from tqdm.auto import tqdm

In [26]:
# Gerador de Dataset Balanceado para Roteamento - Versão Notebook
# Execute cada célula em ordem para gerar seus datasets

import numpy as np
import networkx as nx
import json
from collections import defaultdict
import random
from typing import List, Tuple, Dict, Any
import os
from tqdm.auto import tqdm  # Progress bars bonitas no notebook

class RoutingDatasetGenerator:
    def __init__(self, seed=42):
        """
        Gerador de dataset balanceado para problemas de roteamento
        """
        np.random.seed(seed)
        random.seed(seed)
        print(f"Seed definida como: {seed}")
        
    def generate_graph(self, num_nodes: int, edge_prob: float = 0.3, 
                      min_edges_per_node: int = 2) -> nx.Graph:
        """
        Gera um grafo conectado com características de rede realistas
        """
        # Começar com um grafo aleatório
        G = nx.erdos_renyi_graph(num_nodes, edge_prob)
        
        # Garantir conectividade
        while not nx.is_connected(G):
            components = list(nx.connected_components(G))
            if len(components) > 1:
                node1 = random.choice(list(components[0]))
                node2 = random.choice(list(components[1]))
                G.add_edge(node1, node2)
        
        # Garantir grau mínimo para todos os nós
        for node in G.nodes():
            if G.degree(node) < min_edges_per_node:
                available_nodes = [n for n in G.nodes() if n != node and not G.has_edge(node, n)]
                if available_nodes:
                    targets_needed = min(min_edges_per_node - G.degree(node), len(available_nodes))
                    targets = random.sample(available_nodes, targets_needed)
                    for target in targets:
                        G.add_edge(node, target)
        
        return G
    
    def generate_edge_attributes(self, num_edges: int) -> List[int]:
        """
        Gera atributos realistas para as arestas (0-255)
        """
        edge_attrs = []
        
        for _ in range(num_edges):
            link_quality = np.random.choice(['good', 'medium', 'poor'], p=[0.4, 0.4, 0.2])
            
            if link_quality == 'good':
                attr = np.random.randint(200, 256)
            elif link_quality == 'medium':
                attr = np.random.randint(100, 200)
            else:
                attr = np.random.randint(0, 100)
                
            edge_attrs.append(attr)
            
        return edge_attrs
    
    def get_shortest_path(self, G: nx.Graph, source: int, target: int, 
                         edge_attrs: List[int], edge_index: List[List[int]]) -> List[int]:
        """
        Calcula caminho mais curto considerando pesos das arestas
        """
        weighted_G = nx.Graph()
        weighted_G.add_nodes_from(G.nodes())
        
        for i, (u, v) in enumerate(zip(edge_index[0][::2], edge_index[1][::2])):
            weight = 256 - edge_attrs[i*2]
            weighted_G.add_edge(u, v, weight=weight)
        
        try:
            path = nx.shortest_path(weighted_G, source, target, weight='weight')
            return path
        except nx.NetworkXNoPath:
            if weighted_G.has_edge(source, target):
                return [source, target]
            return []
    
    def generate_balanced_samples_for_graph(self, graph_id: int, G: nx.Graph, 
                                          samples_per_graph: int = 50) -> List[Dict[str, Any]]:
        """
        Gera amostras balanceadas para um único grafo
        """
        samples = []
        
        # Converter para formato edge_index
        edges = list(G.edges())
        edge_index = [[], []]
        for u, v in edges:
            edge_index[0].extend([u, v])
            edge_index[1].extend([v, u])
        
        edge_attrs = self.generate_edge_attributes(len(edge_index[0]))
        
        nodes = list(G.nodes())
        num_pairs = max(3, samples_per_graph // 15)
        
        positive_samples = []
        negative_samples = []
        
        for _ in range(num_pairs):
            source, target = random.sample(nodes, 2)
            optimal_path = self.get_shortest_path(G, source, target, edge_attrs, edge_index)
            
            if len(optimal_path) < 2:
                continue
                
            for i in range(len(optimal_path) - 1):
                current_node = optimal_path[i]
                next_optimal = optimal_path[i + 1]
                neighbors = list(G.neighbors(current_node))
                
                # Amostra positiva
                positive_sample = {
                    'graph_id': graph_id,
                    'pair': [source, target],
                    'path_len': len(optimal_path),
                    'num_nodes': len(nodes),
                    'edge_index': edge_index,
                    'edge_attr': edge_attrs,
                    'current_node': current_node,
                    'target_node': target,
                    'candidate_node': next_optimal,
                    'labels': [1]
                }
                positive_samples.append(positive_sample)
                
                # Amostras negativas
                wrong_neighbors = [n for n in neighbors if n != next_optimal]
                for wrong_neighbor in wrong_neighbors:
                    negative_sample = {
                        'graph_id': graph_id,
                        'pair': [source, target],
                        'path_len': len(optimal_path),
                        'num_nodes': len(nodes),
                        'edge_index': edge_index,
                        'edge_attr': edge_attrs,
                        'current_node': current_node,
                        'target_node': target,
                        'candidate_node': wrong_neighbor,
                        'labels': [0]
                    }
                    negative_samples.append(negative_sample)
        
        # Balanceamento
        min_samples = min(len(positive_samples), len(negative_samples))
        if min_samples == 0:
            return []
            
        min_samples = max(min_samples, samples_per_graph // 4)
        
        balanced_positive = random.sample(positive_samples, min(len(positive_samples), min_samples))
        balanced_negative = random.sample(negative_samples, min(len(negative_samples), min_samples))
        
        samples.extend(balanced_positive)
        samples.extend(balanced_negative)
        
        return samples
    
    def generate_dataset(self, num_graphs: int = 100, 
                        node_range: Tuple[int, int] = (8, 20),
                        samples_per_graph: int = 50) -> List[Dict[str, Any]]:
        """
        Gera dataset completo com progress bar
        """
        all_samples = []
        
        print(f"Gerando {num_graphs} grafos...")
        
        # Progress bar para notebooks
        for graph_id in tqdm(range(num_graphs), desc="Gerando grafos"):
            num_nodes = np.random.randint(node_range[0], node_range[1] + 1)
            edge_prob = max(0.2, min(0.6, 3.0 / num_nodes))
            
            G = self.generate_graph(num_nodes, edge_prob)
            graph_samples = self.generate_balanced_samples_for_graph(
                graph_id, G, samples_per_graph
            )
            all_samples.extend(graph_samples)
        
        random.shuffle(all_samples)
        
        # Estatísticas
        positive_count = sum(1 for s in all_samples if s['labels'][0] == 1)
        negative_count = len(all_samples) - positive_count
        
        print(f"\nDataset gerado com sucesso!")
        print(f"Total de amostras: {len(all_samples):,}")
        print(f"Amostras positivas: {positive_count:,} ({positive_count/len(all_samples)*100:.1f}%)")
        print(f"Amostras negativas: {negative_count:,} ({negative_count/len(all_samples)*100:.1f}%)")
        print(f"Balanceamento: {min(positive_count, negative_count) / max(positive_count, negative_count):.3f}")
        
        return all_samples
    
    def generate_train_test_val_datasets(self, 
                                        total_graphs: int = 1000,
                                        node_range: Tuple[int, int] = (8, 20),
                                        samples_per_graph: int = 50,
                                        train_ratio: float = 0.7,
                                        test_ratio: float = 0.2,
                                        val_ratio: float = 0.1,
                                        output_dir: str = ""):
        """
        Gera datasets de treino, teste e validação
        """
        # Verificações
        if abs(train_ratio + test_ratio + val_ratio - 1.0) > 1e-6:
            raise ValueError("Os ratios devem somar 1.0!")
        
        # Criar diretório
        os.makedirs(output_dir, exist_ok=True)
        
        print(f"Gerando dataset completo com {total_graphs:,} grafos")
        print(f"Diretório de saída: {output_dir}")
        print(f"Splits: Train={train_ratio:.1%} | Test={test_ratio:.1%} | Val={val_ratio:.1%}")
        
        # Gerar dataset completo
        all_samples = self.generate_dataset(
            num_graphs=total_graphs,
            node_range=node_range,
            samples_per_graph=samples_per_graph
        )
        
        # Organizar por graph_id
        samples_by_graph = defaultdict(list)
        for sample in all_samples:
            samples_by_graph[sample['graph_id']].append(sample)
        
        graph_ids = list(samples_by_graph.keys())
        random.shuffle(graph_ids)
        
        # Calcular splits
        num_train_graphs = int(len(graph_ids) * train_ratio)
        num_test_graphs = int(len(graph_ids) * test_ratio)
        num_val_graphs = len(graph_ids) - num_train_graphs - num_test_graphs
        
        train_graph_ids = graph_ids[:num_train_graphs]
        test_graph_ids = graph_ids[num_train_graphs:num_train_graphs + num_test_graphs]
        val_graph_ids = graph_ids[num_train_graphs + num_test_graphs:]
        
        # Criar datasets
        train_samples = []
        test_samples = []
        val_samples = []
        
        for graph_id in train_graph_ids:
            train_samples.extend(samples_by_graph[graph_id])
        for graph_id in test_graph_ids:
            test_samples.extend(samples_by_graph[graph_id])
        for graph_id in val_graph_ids:
            val_samples.extend(samples_by_graph[graph_id])
        
        # Shuffle
        random.shuffle(train_samples)
        random.shuffle(test_samples)
        random.shuffle(val_samples)
        
        # Salvar com progress
        datasets = [
            ("train", train_samples),
            ("test", test_samples), 
            ("validation", val_samples)
        ]
        
        print(f"\nSalvando datasets...")
        
        for name, samples in datasets:
            filepath = os.path.join(output_dir, f"{name}.jsonl")
            
            with open(filepath, 'w') as f:
                for sample in tqdm(samples, desc=f"Salvando {name}", leave=False):
                    f.write(json.dumps(sample) + '\n')
            
        # Relatório final
        print(f"DATASET")
        print("="*50)
        
        for name, samples in datasets:
            if samples:
                pos_samples = sum(1 for s in samples if s['labels'][0] == 1)
                neg_samples = len(samples) - pos_samples
                balance = min(pos_samples, neg_samples) / max(pos_samples, neg_samples)
                
                print(f"\n{name.upper()}:")
                print(f"   Amostras: {len(samples):,}")
                print(f"   Positivas: {pos_samples:,} ({pos_samples/len(samples)*100:.1f}%)")
                print(f"   Negativas: {neg_samples:,} ({neg_samples/len(samples)*100:.1f}%)")
                print(f"   Balanceamento: {balance:.3f}")
        
        print(f"\nDatasets gerados com sucesso!")
        
        return {
            'train': train_samples,
            'test': test_samples,
            'validation': val_samples
        }

In [27]:
generator = RoutingDatasetGenerator(seed=42)

Seed definida como: 42


In [28]:
# Gerar os 3 datasets
datasets = generator.generate_train_test_val_datasets(
    total_graphs=100,          # Tamanho do Dataset
    node_range=(8, 25),         # Tamanho dos grafos
    samples_per_graph=10,       # Densidade
    train_ratio=0.7,            # 70% treino
    test_ratio=0.2,             # 20% teste
    val_ratio=0.1,              # 10% validação
    output_dir="../datasets"    # Pasta local
)

Gerando dataset completo com 100 grafos
Diretório de saída: ../datasets
Splits: Train=70.0% | Test=20.0% | Val=10.0%
Gerando 100 grafos...


Gerando grafos: 100%|██████████| 100/100 [00:00<00:00, 713.07it/s]



Dataset gerado com sucesso!
Total de amostras: 1,656
Amostras positivas: 828 (50.0%)
Amostras negativas: 828 (50.0%)
Balanceamento: 1.000

Salvando datasets...


                                                            

DATASET

TRAIN:
   Amostras: 1,164
   Positivas: 582 (50.0%)
   Negativas: 582 (50.0%)
   Balanceamento: 1.000

TEST:
   Amostras: 318
   Positivas: 159 (50.0%)
   Negativas: 159 (50.0%)
   Balanceamento: 1.000

VALIDATION:
   Amostras: 174
   Positivas: 87 (50.0%)
   Negativas: 87 (50.0%)
   Balanceamento: 1.000

Datasets gerados com sucesso!


