In [18]:
from pathlib import Path
import shutil
import tarfile
import gzip

import tsplib95
import networkx as nx
import torch
from torch_geometric.data import Data

# Functions

In [19]:
def extract_tsp_archive(tar_path: Path, extract_path: Path):
    """
    Extract a TSPLIB .tar archive and decompress any .gz files inside it.

    Any existing ``extract_path`` directory is deleted first, so the result
    always reflects exactly the content of the archive.

    Args:
        tar_path: Path to the .tar archive.
        extract_path: Directory to (re)create and extract into.

    Returns:
        Sorted list of the .tsp file paths found directly in ``extract_path``.

    Raises:
        FileNotFoundError: If ``tar_path`` does not exist.
        tarfile.TarError: If a member would be written outside ``extract_path``
            (or is otherwise rejected by the extraction filter).
    """

    if not tar_path.exists():
        raise FileNotFoundError(f"Archive not found: {tar_path}")

    if extract_path.exists():
        shutil.rmtree(extract_path)
    extract_path.mkdir(parents=True, exist_ok=True)

    with tarfile.open(tar_path, "r") as tar:
        if hasattr(tarfile, "data_filter"):
            # Extraction filters reject members that escape the destination
            # (e.g. "../x" or absolute paths) and avoid the 3.12+ warning.
            tar.extractall(path=extract_path, filter="data")
        else:
            # Older interpreters without extraction filters: validate manually.
            root = extract_path.resolve()
            for member in tar.getmembers():
                destination = (root / member.name).resolve()
                if destination != root and root not in destination.parents:
                    raise tarfile.TarError(
                        f"Refusing to extract outside {extract_path}: {member.name}"
                    )
            tar.extractall(path=extract_path)

    # Materialize the listing first: the loop creates and deletes files in the
    # very directory being globbed.
    for gz_file in sorted(extract_path.glob("*.gz")):
        output_file = extract_path / gz_file.stem  # "x.tsp.gz" -> "x.tsp"
        with gzip.open(gz_file, "rb") as f_in, open(output_file, "wb") as f_out:
            shutil.copyfileobj(f_in, f_out)
        gz_file.unlink()

    print("Archive extracted and .gz files decompressed.")

    return sorted(extract_path.glob("*.tsp"))

In [6]:
def prepare_graph(G):
    """
    Prepare a TSPLIB graph loaded with tsplib95:
    - ensure undirected
    - remove self-loops
    - convert 0-based node ids to 1-based (the .opt.tour files use 1-based ids)
    - keep only edge weight
    - keep only node attributes initial/current/target

    NOTE: a later cell defines ``prepare_graph`` again. This definition is kept
    behaviourally in line with it so the result does not depend on which of
    the two cells was executed last.

    Args:
        G: NetworkX graph as returned by ``problem.get_graph()``.

    Returns:
        A new ``nx.Graph``; the input graph is not modified.
    """

    # Ensure undirected structure (also copies, so G itself is left untouched)
    G = nx.Graph(G)

    # Remove self-loops; snapshot the edges first so nothing is iterated
    # while the graph is being modified
    G.remove_edges_from(list(nx.selfloop_edges(G)))

    # Some instances come out of tsplib95 with ids 0..n-1 while their optimal
    # tours use 1..n; shift those graphs so both agree
    nodes = sorted(G.nodes())
    if nodes and nodes[0] == 0:
        G = nx.relabel_nodes(G, {old: old + 1 for old in nodes}, copy=True)

    # Initialize node attributes: the smallest id is both start and current node
    first_node = min(G.nodes, default=None)
    for node in G.nodes:
        G.nodes[node].clear()
        G.nodes[node]["initial"] = int(node == first_node)
        G.nodes[node]["current"] = int(node == first_node)
        G.nodes[node]["target"] = 0

    # Keep only edge weight (None when the source edge had no weight)
    for u, v, attrs in G.edges(data=True):
        w = attrs.get("weight", None)
        attrs.clear()
        attrs["weight"] = w

    return G


In [20]:
def prepare_graph(G):
    """
    Normalize a graph obtained from tsplib95 for the training pipeline.

    Steps, in order:
    1. copy into an undirected ``nx.Graph``
    2. drop self-loops
    3. if the smallest node id is 0, shift every id by +1
    4. replace all node attributes with initial/current/target
       (the smallest id is flagged as both initial and current, target is 0)
    5. replace all edge attributes with just ``weight``

    Returns the prepared graph; the graph passed in is left untouched.
    """

    graph = nx.Graph(G)
    graph.remove_edges_from(nx.selfloop_edges(graph))

    # 0-based ids -> 1-based ids (0→1, 1→2, ..., n-1→n)
    ordered_ids = sorted(graph.nodes())
    if ordered_ids[0] == 0:
        shift = {nid: nid + 1 for nid in ordered_ids}
        graph = nx.relabel_nodes(graph, shift, copy=True)

    # Node attributes: wipe everything, then flag the starting node
    start = min(graph.nodes)
    for nid, node_attrs in graph.nodes(data=True):
        is_start = int(nid == start)
        node_attrs.clear()
        node_attrs.update(initial=is_start, current=is_start, target=0)

    # Edge attributes: wipe everything except the weight
    for _, _, edge_attrs in graph.edges(data=True):
        weight = edge_attrs.get("weight")
        edge_attrs.clear()
        edge_attrs["weight"] = weight

    return graph


In [21]:
def nx_to_pyg(G):
    """
    Convert a prepared NetworkX TSP graph into a PyTorch Geometric Data object.

    Nodes are re-indexed 0..n-1 following their sorted ids.

    Keeps:
      - x: [initial, current] per node, float, shape [num_nodes, 2]
      - edge_index: both directions of every edge, long, shape [2, 2 * num_edges]
      - edge_attr: weight of each directed edge, float, shape [2 * num_edges, 1]
      - node_id: the node ids of G in sorted order, long
      - y: 0-based index of the node whose "target" attribute is 1,
           or -1 when no node is flagged

    The shapes above also hold for graphs without edges (or without nodes).
    """

    # Sorted node list for consistent indexing
    nodes = sorted(G.nodes())
    mapping = {node: i for i, node in enumerate(nodes)}

    # Node features; the explicit reshape keeps [0, 2] for an empty graph
    x = torch.tensor(
        [[G.nodes[node]["initial"], G.nodes[node]["current"]] for node in nodes],
        dtype=torch.float,
    ).reshape(len(nodes), 2)

    # Node ids as found in G
    node_id = torch.tensor(nodes, dtype=torch.long)

    # Target node (converted to PyTorch index)
    target_node = next((node for node in nodes if G.nodes[node]["target"] == 1), None)
    y = torch.tensor(
        mapping[target_node] if target_node is not None else -1,
        dtype=torch.long,
    )

    # Edges (bidirectional): each undirected edge contributes (i, j) and (j, i)
    edge_index_list = []
    edge_attr_list = []

    for u, v, attrs in G.edges(data=True):
        i, j = mapping[u], mapping[v]
        w = attrs["weight"]

        edge_index_list.extend(([i, j], [j, i]))
        edge_attr_list.extend(([w], [w]))

    # torch.tensor([]) is 1-D, so fix the 2-D layout explicitly; otherwise an
    # edgeless graph yields edge_index of shape [0] instead of [2, 0]
    num_directed = len(edge_index_list)
    edge_index = (
        torch.tensor(edge_index_list, dtype=torch.long)
        .reshape(num_directed, 2)
        .t()
        .contiguous()
    )
    edge_attr = torch.tensor(edge_attr_list, dtype=torch.float).reshape(num_directed, 1)

    # Build Data object
    return Data(
        x=x,
        edge_index=edge_index,
        edge_attr=edge_attr,
        node_id=node_id,
        y=y,
    )

In [22]:
def generate_training_graphs(G, tour):
    """
    Given a prepared graph G and a tour (list of node ids), generate one
    graph per decision: ``len(tour) - 2`` graphs in total.

    For decision ``step`` the graph contains every node of G except the nodes
    visited before ``tour[step]`` (the initial node ``tour[0]`` is always
    kept), with the node attributes set to:
      - initial: 1 for ``tour[0]``
      - current: 1 for ``tour[step]``
      - target:  1 for ``tour[step + 1]``

    Args:
        G: Prepared graph; it is not modified.
        tour: Sequence of node ids in visiting order.

    Returns:
        List of graphs, one per decision. Empty when the tour has fewer than
        three nodes (including an empty tour).
    """

    graphs = []
    if not tour:
        return graphs

    initial = tour[0]

    # Shrinking working copy: each step only has to drop the single node that
    # was left behind since the previous step, instead of re-removing every
    # visited node from a fresh full copy.
    working = G.copy()

    for step in range(len(tour) - 2):
        current = tour[step]
        target = tour[step + 1]

        if step > 0:
            left_behind = tour[step - 1]
            # The initial node stays in the graph for the whole tour
            if left_behind != initial and left_behind in working:
                working.remove_node(left_behind)

        # Independent snapshot for this decision
        H = working.copy()

        # Reset attributes
        for node in H.nodes:
            attrs = H.nodes[node]
            attrs["initial"] = int(node == initial)
            attrs["current"] = int(node == current)
            attrs["target"] = int(node == target)

        graphs.append(H)

    return graphs

In [23]:
def load_opt_tour(tour_path: Path):
    """
    Load a TSPLIB .opt.tour file and return the tour as a list of node IDs.

    Everything before the ``TOUR_SECTION`` line is ignored. After it, node ids
    are read until a ``-1`` or ``EOF`` token (or the end of the file).

    Handles:
      - one node per line
      - multiple nodes per line
      - a ``-1`` / ``EOF`` terminator on its own line or at the end of a line
        of node ids
      - a closing node that repeats the first one (it is dropped)

    Args:
        tour_path: Path to the .opt.tour file.

    Returns:
        List of integer node ids; empty if the file has no tour section.

    Raises:
        ValueError: If a token inside the tour section is not an integer.
    """
    terminators = ("-1", "EOF")
    tour = []
    reading = False

    with open(tour_path, "r") as f:
        for line in f:
            line = line.strip()

            if line == "TOUR_SECTION":
                reading = True
                continue

            if not reading:
                continue

            # Check every token, not just the whole line: some files put the
            # terminator right after the last node id on the same line.
            finished = False
            for token in line.split():
                if token in terminators:
                    finished = True
                    break
                tour.append(int(token))

            if finished:
                break

    # Remove possible duplicated last node
    if len(tour) > 1 and tour[0] == tour[-1]:
        tour = tour[:-1]

    return tour

In [10]:
def save_graph(data: Data, pt_path: Path, count: int):
    """
    Write one PyTorch Geometric Data object into ``pt_path`` and advance the counter.

    The file name is ``count`` zero-padded to five digits plus ``.pt``.
    Returns ``count + 1`` so the caller can pass it to the next call.
    """
    destination = pt_path / f"{count:05d}.pt"
    torch.save(data, destination)
    next_count = count + 1
    return next_count

# PLAY

In [24]:
# Prepare output directory: start from an empty Datasets/train_pyg,
# deleting whatever a previous run left there
pt_path = Path("Datasets/train_pyg")
if pt_path.exists():
    shutil.rmtree(pt_path)
pt_path.mkdir(parents=True)

# Running counter for saved graphs (initialised here, not used in this cell)
count = 0

# Extract archive; tsp_files is the sorted list of extracted .tsp paths
tar_path = Path("Datasets/ALL_tsp.tar")
extract_path = Path("Datasets/ALL_tsp")
tsp_files = extract_tsp_archive(tar_path, extract_path)

Archive extracted and .gz files decompressed.


In [25]:
# Inspect a single instance: index 10 of the sorted file list
# (brg180 in the recorded output below)
tsp_file = tsp_files[10]
name = tsp_file.stem
# The optimal tour is looked up next to the .tsp file as <name>.opt.tour
tour_path = extract_path / f"{name}.opt.tour"

problem = tsplib95.load(tsp_file)
G = prepare_graph(problem.get_graph())
tour = load_opt_tour(tour_path)

# Show the tour and the prepared graph's node ids side by side
print("TSP:", name)
print("Tour length:", len(tour), tour)
print("Nodes in G:", len(G.nodes), G.nodes)

TSP: brg180
Tour length: 180 [1, 12, 11, 10, 9, 8, 162, 163, 164, 165, 166, 167, 168, 157, 158, 159, 160, 161, 129, 128, 127, 126, 125, 124, 123, 122, 121, 132, 131, 130, 49, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 109, 37, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 99, 98, 97, 108, 107, 106, 105, 104, 103, 102, 101, 100, 145, 156, 155, 154, 153, 152, 151, 150, 149, 148, 147, 146, 64, 65, 66, 67, 68, 69, 70, 71, 72, 61, 62, 63, 29, 28, 27, 26, 25, 36, 35, 34, 33, 32, 31, 30, 133, 144, 143, 142, 141, 140, 139, 138, 137, 136, 135, 134, 89, 88, 87, 86, 85, 96, 95, 94, 93, 92, 91, 90, 17, 16, 15, 14, 13, 24, 23, 22, 21, 20, 19, 18, 75, 74, 73, 84, 83, 82, 81, 80, 79, 78, 77, 76, 179, 178, 177, 176, 175, 174, 173, 172, 171, 170, 169, 180, 7, 6, 5, 4, 3, 2]
Nodes in G: 180 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43

In [17]:
from pathlib import Path
import tsplib95

# Diagnostic: for every instance that ships with an optimal tour, compare the
# id range used by the tour with the id range of the raw tsplib95 graph.
# Expects the archive to be already extracted under Datasets/ALL_tsp.
root = Path("Datasets/ALL_tsp")

for tsp_file in sorted(root.glob("*.tsp")):
    name = tsp_file.stem
    tour_path = root / f"{name}.opt.tour"
    if not tour_path.exists():
        continue

    # Load graph and tour
    problem = tsplib95.load(tsp_file)
    G = problem.get_graph()
    # Raw ids as produced by tsplib95 (no prepare_graph here): they start at 0
    # for some instances and at 1 for others, which is what this cell reveals
    nodes = sorted(G.nodes())
    # Reuse the notebook's tour parser instead of duplicating it inline
    tour = load_opt_tour(tour_path)

    # min()/max() would raise on an empty sequence
    if not tour or not nodes:
        print(f"{name}: empty tour or graph, skipped")
        continue

    # Check indexing
    min_tour = min(tour)
    max_tour = max(tour)
    min_graph = min(nodes)
    max_graph = max(nodes)

    print(f"{name}: tour [{min_tour}, {max_tour}] vs graph [{min_graph}, {max_graph}]")


a280: tour [1, 280] vs graph [1, 280]
att48: tour [1, 48] vs graph [1, 48]
bayg29: tour [1, 29] vs graph [1, 29]
bays29: tour [1, 29] vs graph [1, 29]
berlin52: tour [1, 52] vs graph [1, 52]
brg180: tour [1, 180] vs graph [0, 179]
ch130: tour [1, 130] vs graph [1, 130]
ch150: tour [1, 150] vs graph [1, 150]
eil101: tour [1, 101] vs graph [1, 101]
eil51: tour [1, 51] vs graph [1, 51]
eil76: tour [1, 76] vs graph [1, 76]
fri26: tour [1, 26] vs graph [0, 25]
gr120: tour [1, 120] vs graph [1, 120]
gr202: tour [1, 202] vs graph [1, 202]
gr24: tour [1, 24] vs graph [0, 23]
gr48: tour [1, 48] vs graph [0, 47]
gr666: tour [1, 666] vs graph [1, 666]
gr96: tour [1, 96] vs graph [1, 96]
kroA100: tour [1, 100] vs graph [1, 100]
kroC100: tour [1, 100] vs graph [1, 100]
kroD100: tour [1, 100] vs graph [1, 100]
lin105: tour [1, 105] vs graph [1, 105]
pa561: tour [1, 561] vs graph [1, 561]
pcb442: tour [1, 442] vs graph [1, 442]
pr1002: tour [1, 1002] vs graph [1, 1002]
pr2392: tour [1, 2392] vs graph

# Main

In [None]:

# Main pipeline: extract the TSPLIB archive, turn every eligible instance that
# has an optimal tour into per-decision training graphs, convert them to
# PyTorch Geometric and save them as sequentially numbered .pt files.

# Prepare output directory (a dataset from a previous run is discarded)
pt_path = Path("Datasets/train_pyg")
if pt_path.exists():
    shutil.rmtree(pt_path)
pt_path.mkdir(parents=True)

# Number of graphs saved so far; also the next file index
count = 0

# Extract archive
tar_path = Path("Datasets/ALL_tsp.tar")
extract_path = Path("Datasets/ALL_tsp")
tsp_files = extract_tsp_archive(tar_path, extract_path)

# Process each TSP instance
valid_i = 0  # counts only the instances that were fully processed
for tsp_file in tsp_files:
    name = tsp_file.stem
    tour_path = extract_path / f"{name}.opt.tour"

    # Skip if no optimal tour
    if not tour_path.exists():
        continue

    problem = tsplib95.load(tsp_file)
    print(f"\n---Graph {valid_i}: {problem.name}---")

    # Process only symmetric TSP instances
    if problem.type != "TSP":
        print(f"⚠️ Skipped (TYPE: {problem.type})")
        continue

    # Skip large instances
    if problem.dimension > 1000:
        print(f"⚠️ Skipped (DIMENSION: {problem.dimension})")
        continue

    # Load graph
    print("  Loading graph...")
    G = problem.get_graph()

    # Clean graph
    print("  Preparing graph...")
    G = prepare_graph(G)

    # Load tour
    print("  Loading tour...")
    tour = load_opt_tour(tour_path)

    # The tour must visit every node of the graph exactly once. Otherwise the
    # generated samples would silently carry wrong or missing targets
    # (nx_to_pyg encodes a missing target as y = -1).
    graph_nodes = set(G.nodes)
    if len(tour) != len(graph_nodes) or set(tour) != graph_nodes:
        print("⚠️ Skipped (tour does not match graph nodes)")
        continue

    # Generate train graphs
    print("  Generating training graphs...")
    graphs = generate_training_graphs(G, tour)

    # Convert to PyTorch Geometric and save Data Object
    print("  Converting to PyTorch Geometric format and saving...")
    for H in graphs:
        data = nx_to_pyg(H)
        count = save_graph(data, pt_path, count)

    print("✅ Success!")
    valid_i += 1

print(f"\nFinished. Saved {count} graphs to {pt_path}")
# The extracted files are only an intermediate artefact
shutil.rmtree(extract_path)

# Others

In [63]:
# Re-extract the archive for the scratch cells below
# (extract_tsp_archive recreates extract_path from scratch on every call)
tar_path = Path("Datasets/ALL_tsp.tar")
extract_path = Path("Datasets/ALL_tsp")
tsp_files = extract_tsp_archive(tar_path, extract_path)

Archive extracted and .gz files decompressed.


In [69]:
# Smoke test of the conversion: prepare one instance (no tour applied) and
# look at the resulting node features and node ids.
problem = tsplib95.load(tsp_files[4])
G = problem.get_graph()
# Optional dumps of the raw graph (uncomment while debugging)
#print(G.graph)
#print(G.__dict__)
#print(G.nodes(data=True))
#print(G.edges(data=True))

G = prepare_graph(G)
# Optional dumps of the prepared graph (uncomment while debugging)
#print(G.graph)
#print(G.__dict__)
#print(G.nodes(data=True))
#print(G.nodes()[1])
#print(G.edges(data=True))
#print(G.edges()[1, 2])

data = nx_to_pyg(G)
#print(data)
# x holds [initial, current] per node; after prepare_graph only the smallest
# node id is flagged in both columns
print(data.x)
#print(data.edge_index)
#print(data.edge_attr)
print(data.node_id)

tensor([[1., 1.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.]])
tensor([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
        19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29])


In [67]:
import matplotlib.pyplot as plt

def draw_graph(G, title=""):
    """
    Draw G with matplotlib, coloring nodes by their role.

    Layout: the "display" attribute is used when every node has it, otherwise
    "coord" when every node has it, otherwise a seeded spring layout.

    Colors (first matching role wins): initial → green, current → blue,
    target → red, anything else → light gray.
    """

    def _layout():
        # Priority: display → coord → spring_layout
        for key in ("display", "coord"):
            if all(key in G.nodes[n] for n in G.nodes):
                return {n: G.nodes[n][key] for n in G.nodes}
        return nx.spring_layout(G, seed=42)

    pos = _layout()

    # Roles are tested lazily and in order, so a later attribute is only read
    # when all earlier ones are falsy
    role_colors = (("initial", "green"), ("current", "blue"), ("target", "red"))
    colors = [
        next((color for role, color in role_colors if G.nodes[n][role]), "lightgray")
        for n in G.nodes
    ]

    nx.draw(G, pos, with_labels=True, node_color=colors, node_size=600)
    plt.title(title)
    plt.gca().invert_yaxis()  # TSPLIB coordinates have inverted Y
    plt.show()


In [72]:
# Walk through every training graph of one instance and show, per decision,
# the surviving node ids and the index of the target node.
tsp_file = tsp_files[4]
problem = tsplib95.load(tsp_file)
G = problem.get_graph()
G = prepare_graph(G)
print(problem.name)

name = tsp_file.stem
tour_path = extract_path / f"{name}.opt.tour"
tour = load_opt_tour(tour_path)
print(tour)

graphs = generate_training_graphs(G, tour)
# Converted graphs are collected here so they stay available after the loop
# (the list used to be created but never filled)
pyg_graphs = []

for i, g in enumerate(graphs):
    #draw_graph(g, title=f"Graph {i}")
    pyg_graph = nx_to_pyg(g)
    pyg_graphs.append(pyg_graph)
    print(pyg_graph.node_id)
    print(pyg_graph.y)


bayg29
[1, 28, 6, 12, 9, 26, 3, 29, 5, 21, 2, 20, 10, 4, 15, 18, 14, 17, 22, 11, 19, 25, 7, 23, 8, 27, 16, 13, 24]
tensor([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
        19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29])
tensor(27)
tensor([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
        19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29])
tensor(5)
tensor([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
        19, 20, 21, 22, 23, 24, 25, 26, 27, 29])
tensor(11)
tensor([ 1,  2,  3,  4,  5,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
        20, 21, 22, 23, 24, 25, 26, 27, 29])
tensor(7)
tensor([ 1,  2,  3,  4,  5,  7,  8,  9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20,
        21, 22, 23, 24, 25, 26, 27, 29])
tensor(23)
tensor([ 1,  2,  3,  4,  5,  7,  8, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21,
        22, 23, 24, 25, 26, 27, 29])
tensor(2)
tensor([ 1,  2,  3,  4,  5,  7,  8, 10, 11, 13, 14, 15, 16, 17, 