# NAME SURNAME

## Packages

In [3]:
import networkx as nx # For graphs
import pickle # For data parsing
import math
import torch
import torch.nn as nn
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader, random_split, Dataset
from networkx.algorithms.approximation import greedy_tsp # For approx TSP

## Helper functions

In [4]:

def tour_length(G, tour):
    """
    Return the total weight of a closed tour on G.

    A tour is a sequence of nodes whose first and last elements are equal.
    The weight of each consecutive pair (u, v) is read from
    G[u][v]['weight'] and the weights are added up in tour order.
    """
    assert tour[0] == tour[-1], "Not valid tour"
    total = 0
    # Walk the tour edge by edge: (tour[0], tour[1]), (tour[1], tour[2]), ...
    for u, v in zip(tour, tour[1:]):
        total += G[u][v]['weight']
    return total

def greedy_algorithm(G):
    """
    Return the length of the tour built on G by networkx's greedy TSP
    approximation (edge attribute 'weight' is used as the distance).
    """
    approx_tour = greedy_tsp(G, weight='weight')
    return tour_length(G, approx_tour)

def random_tour(G, seed = 42):
    """
    Return the length of a random tour on G that starts and ends at node 0.

    Nodes are assumed to be labelled 0..n-1. The remaining nodes are visited
    in a random order drawn with numpy's global generator, which is re-seeded
    with `seed` on every call (so the call is reproducible, but it also resets
    numpy's global random state).
    """
    # Imported locally: numpy is used here but is not among the notebook's
    # visible imports.
    import numpy as np

    np.random.seed(seed)
    n = G.number_of_nodes()
    tour = [0]
    # Candidates are kept in ascending order so that every draw sees the same
    # list it would get by re-filtering range(n) against the tour.
    unvisited = list(range(1, n))
    while unvisited:
        next_node = int(np.random.choice(unvisited))
        unvisited.remove(next_node)
        tour.append(next_node)
    tour.append(0)
    return tour_length(G, tour)
    

def transformer_tsp(G, model, DEVICE = 'cpu'):
    """
    Greedily decode a tour on G with a (trained) model and return its length.

    The model is called as
    ``model(src, trg, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask)``,
    i.e. with the forward signature of the TSPTransformer defined in this
    notebook, using the same kind of masks as during training: no source
    masking, a causal target mask, and no padding.

    Decoding starts from node 0. At every step the logits of the LAST decoder
    position are used and the highest-scoring node not yet in the tour is
    appended. Nodes are assumed to be labelled 0..n-1 and to carry a 'pos'
    attribute.

    Raises:
        ValueError: if the model produces fewer than n logits per position.
    """
    # Set the model in evaluation mode
    model.eval()

    n = G.number_of_nodes()

    # Node coordinates ordered by node index, stacked and given a batch axis
    attr = nx.get_node_attributes(G, 'pos')
    x = torch.stack([torch.tensor(attr[i], dtype=torch.float32) for i in range(n)])
    x = x.to(DEVICE).unsqueeze(0)

    # Source-side masks never change during decoding
    src_mask = torch.zeros((n, n), dtype=torch.bool, device=DEVICE)
    src_padding_mask = torch.zeros((1, n), dtype=torch.bool, device=DEVICE)

    tour = [0]  # plain Python ints only, so it can be re-tensorised cheaply

    # Inference only: no autograd graph is needed
    with torch.no_grad():
        while len(tour) < n:
            y = torch.tensor(tour, dtype=torch.long, device=DEVICE).unsqueeze(0)
            seq_len = y.shape[1]

            # Causal mask: 0 on/below the diagonal, -inf strictly above it
            tgt_mask = torch.triu(
                torch.full((seq_len, seq_len), float('-inf'), device=DEVICE),
                diagonal=1,
            )
            tgt_padding_mask = torch.zeros((1, seq_len), dtype=torch.bool, device=DEVICE)

            out = model(x, y, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask)
            if out.shape[-1] < n:
                raise ValueError(
                    f"Model outputs {out.shape[-1]} logits per step but the graph has {n} nodes"
                )

            # Prediction for the next node comes from the last decoder position;
            # already-visited nodes are excluded before taking the arg-max.
            logits = out[0, -1, :n].clone()
            logits[tour] = float('-inf')
            tour.append(int(torch.argmax(logits)))

    # Close the tour by returning to the start node
    return tour_length(G, tour + [0])



def gap(G, model = None, model_GA = None, random_seed = 42, device = 'cpu'):
    """
    Compute, for every analyzed method, the percentage gap between its tour
    length on G and the optimal tour length.

    Returns a dict with keys 'greedy', 'random', 'transformer_tsp' and
    'transformer_tsp_acc_grad'. A transformer entry is float('inf') when the
    corresponding model is None.
    """
    # Optimal value (hard-coded in the graph): the edge attribute 'tour'
    # multiplies the edge weight, so only flagged edges contribute.
    TSP = sum(G[i][j]['weight'] * G[i][j]['tour'] for (i, j) in G.edges())

    def _percent_gap(length):
        # Relative excess over the optimum, in percent
        return 100 * (length - TSP) / TSP

    gaps = {}
    gaps['greedy'] = _percent_gap(greedy_algorithm(G))
    gaps['random'] = _percent_gap(random_tour(G, random_seed))

    if model is None:
        gaps['transformer_tsp'] = float('inf')
    else:
        gaps['transformer_tsp'] = _percent_gap(transformer_tsp(G, model, DEVICE=device))

    if model_GA is None:
        gaps['transformer_tsp_acc_grad'] = float('inf')
    else:
        gaps['transformer_tsp_acc_grad'] = _percent_gap(transformer_tsp(G, model_GA, DEVICE=device))

    return gaps
    

## Dataset & Dataloader

### Dataset Point 1

In [None]:
# Load the dummy dataset
# NOTE: pickle.load can execute arbitrary code while deserialising; only open
# dataset files that come from a trusted source.
with open('/kaggle/input/tsp-dataset/dummy_20_DLL_ass4.pkl', 'rb') as file:
    dummy = pickle.load(file)

# Print type of overall dataset and a single item
print(f"Type of dummy dataset is: {type(dummy)}")
single_item = dummy[0]
print(f"\nType of a single item is: {type(single_item)}")

# Print types of elements in the tuple
print("\nTypes of elements in the tuple:")
print(f"First element type: {type(single_item[0])}") # Graph
print(f"Second element type: {type(single_item[1])}") # List

# "\S" was an invalid escape sequence; a leading newline was intended,
# as in the other section headers printed above.
print("\nSingle Item structure:")
print(single_item)

### Dataset Point 2

In [None]:
# Describe the edge attributes tour and weight, as well as the node attribute pos.

# A dataset item is indexed as (graph, tour); the tour here is a list
graph, tour = single_item[0], single_item[1]

# Node attribute 'pos': one entry per node
node_positions = nx.get_node_attributes(graph, 'pos')
print("\nNode positions (pos):")
for node in node_positions:
    pos = node_positions[node]
    print(f"Node {node}: Position {pos}")

# Edge attribute 'weight': printed as None when an edge does not carry it
print("\nEdge weights:")
for u, v in graph.edges():
    weight = graph[u][v].get('weight')
    print(f"Edge ({u}, {v}): Weight {weight}")

# The tour that accompanies the graph in the dataset item
print("\nTour:")
print(f"Tour as per solution: {tour}")





### Dataset Point 4

In [None]:
# Dataset Class


class TSPDataset(Dataset):
    """Dataset over (graph, tour) pairs, served as (coordinates, tour) tensors."""

    def __init__(self, data):
        """
        Args:
            data (list): One (graph, tour) tuple per sample.
                         - graph: graph whose nodes carry a 'pos' attribute.
                         - tour: list of node indices describing the tour.
        """
        self.data = data

    def __len__(self):
        """Number of samples held by the dataset."""
        return len(self.data)

    def __getitem__(self, idx):
        """
        Args:
            idx (int): Position of the sample to fetch.

        Returns:
            X (torch.Tensor): float32 tensor with one row of coordinates per
                node, rows ordered by ascending node key.
            y (torch.Tensor): int64 tensor with the tour, starting and ending at 0.

        Raises:
            ValueError: if the stored tour does not start at node 0.
        """
        graph, tour = self.data[idx]

        # Coordinates are read from the 'pos' node attribute, in sorted node order
        coords = nx.get_node_attributes(graph, 'pos')
        X = torch.tensor([coords[node] for node in sorted(coords)], dtype=torch.float32)

        if tour[0] != 0:
            raise ValueError("Tour must start at node 0.")

        # Close the tour on the start node when the stored list is still open
        closed_tour = tour if tour[-1] == 0 else tour + [0]

        return X, torch.tensor(closed_tour, dtype=torch.long)


### Dataset Point 5

In [None]:
# Create Dataset objects for training, validation, and testing, along with their respective Dataloader

# tspdataset = TSPDataset(dummy)
# train_dataset, validation_dataset, test_dataset = random_split(tspdataset, [0.8, 0.1, 0.1])

# batch_size = 32

# trainloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# valloader = DataLoader(validation_dataset, batch_size=batch_size, shuffle=True)
# testloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)





In [None]:
# Helper to load pickle files
def load_pickle(file_path):
    """Open `file_path` in binary mode and return the object unpickled from it."""
    with open(file_path, 'rb') as handle:
        loaded = pickle.load(handle)
    return loaded

# Load the data splits
data_paths = {
    'train': '/kaggle/input/tsp-dataset/train_20_DLL_ass4.pkl',
    'valid': '/kaggle/input/tsp-dataset/valid_20_DLL_ass4.pkl',
    'test': '/kaggle/input/tsp-dataset/test_20_DLL_ass4.pkl'
}

data = {key: load_pickle(path) for key, path in data_paths.items()}

# Build one dataset per split
batch_size = 32
datasets = {key: TSPDataset(data[key]) for key in data_paths.keys()}

# Build one DataLoader per split. Only the training split is shuffled, so that
# batches differ between epochs; validation and test keep a fixed order so
# their metrics are computed over identical batches on every run.
dataloaders = {
    key: DataLoader(datasets[key], batch_size=batch_size, shuffle=(key == 'train'))
    for key in datasets.keys()
}

# Convenience handles on the individual DataLoaders
trainloader = dataloaders['train']
valloader = dataloaders['valid']
testloader = dataloaders['test']


## Model

In [None]:
import torch
import torch.nn as nn
from torch import Tensor
import math

class TSPTransformer(nn.Module):
    """
    Encoder-decoder transformer for TSP tours.

    The encoder consumes 2-feature node inputs; the decoder consumes node
    indices and emits, for every target position, one logit per node.
    All transformer layers are batch-first and pre-norm.
    """

    def __init__(self, n, num_encoder, num_decoder, de, dd, n_head, dropout, dim_feedforward=1024):
        """
        Args:
            n: size of the node vocabulary (embedding rows and output logits).
            num_encoder: number of encoder layers.
            num_decoder: number of decoder layers.
            de: encoder model dimension.
            dd: decoder model dimension.
            n_head: attention heads, used for both encoder and decoder layers.
            dropout: dropout rate of the layers and of the positional encoding.
            dim_feedforward: hidden size of the feed-forward sub-layers.
        """
        super().__init__()
        # Settings shared by the encoder and decoder layers
        shared = dict(
            nhead=n_head,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            batch_first=True,
            norm_first=True,  # layer norm applied before each sub-layer
        )

        # Encoder: 2 -> d_e projection, encoder stack, d_e -> d_d projection
        self.linear1 = nn.Linear(2, de)
        enc_layer = nn.TransformerEncoderLayer(d_model=de, **shared)
        self.encoder = nn.TransformerEncoder(enc_layer, num_layers=num_encoder)
        self.linear2 = nn.Linear(de, dd)

        # Decoder: node embedding (n x d_d), positional encoding, decoder stack,
        # and a final d_d -> n projection to per-node logits
        self.embedding = nn.Embedding(n, dd)
        self.posEncoding = PositionalEncoding(dd, dropout)
        dec_layer = nn.TransformerDecoderLayer(d_model=dd, **shared)
        self.decoder = nn.TransformerDecoder(dec_layer, num_layers=num_decoder)
        self.ffnn = nn.Linear(dd, n)

        # Weight initialisation
        self._init_weights()

    def _init_weights(self):
        """Xavier-uniform initialise every parameter with more than one dimension."""
        for weight in (p for p in self.parameters() if p.dim() > 1):
            nn.init.xavier_uniform_(weight)

    def forward(self, src, trg, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask):
        """
        Encode `src`, decode `trg` against the encoder memory and return the logits.

        `src_mask` / `src_padding_mask` are forwarded to the encoder and
        `tgt_mask` / `tgt_padding_mask` to the decoder.
        """
        # Encoding
        encoded = self.encoder(
            self.linear1(src),
            mask=src_mask,
            src_key_padding_mask=src_padding_mask,
        )
        memory = self.linear2(encoded)

        # Decoding
        tgt_emb = self.posEncoding(self.embedding(trg))
        decoded = self.decoder(
            tgt_emb,
            memory,
            tgt_mask=tgt_mask,
            tgt_key_padding_mask=tgt_padding_mask,
        )
        return self.ffnn(decoded)

class PositionalEncoding(nn.Module):
    """
    Sinusoidal positional encoding followed by dropout.

    The encoding is added along the second-to-last axis of the input, i.e. the
    sequence axis of a batch-first ``(batch, seq, emb)`` tensor (or of an
    unbatched ``(seq, emb)`` tensor), and is broadcast over the batch.
    Indexing the table by ``size(0)`` would make the encoding depend on the
    batch index instead of the token position for batch-first inputs.

    The registered buffer keeps the shape ``(maxlen, 1, emb_size)``.
    """

    def __init__(self, emb_size: int, dropout: float, maxlen: int = 5000):
        """
        Args:
            emb_size: embedding dimension of the tokens.
            dropout: dropout probability applied after adding the encoding.
            maxlen: number of positions precomputed in the table.
        """
        super().__init__()
        pos = torch.arange(0, maxlen).reshape(maxlen, 1)
        den = torch.exp(- torch.arange(0, emb_size, 2) * math.log(10000) / emb_size)
        angles = pos * den  # (maxlen, ceil(emb_size / 2))

        pos_embedding = torch.zeros((maxlen, emb_size))
        pos_embedding[:, 0::2] = torch.sin(angles)
        # For odd emb_size there is one cosine column fewer than sine columns
        pos_embedding[:, 1::2] = torch.cos(angles)[:, :emb_size // 2]
        pos_embedding = pos_embedding.unsqueeze(-2)
        self.dropout = nn.Dropout(dropout)
        self.register_buffer('pos_embedding', pos_embedding)

    def forward(self, token_embedding: Tensor):
        """Add the encoding of positions 0..seq_len-1 to `token_embedding` and apply dropout."""
        seq_len = token_embedding.size(-2)
        # (seq_len, emb) slice of the table; broadcasts over any leading batch axis
        return self.dropout(token_embedding + self.pos_embedding[:seq_len, 0, :])

In [None]:
def generate_square_subsequent_mask(sequence_length: int, device=None) -> Tensor:
    """
    Build a causal attention mask of shape (sequence_length, sequence_length).

    Entries on and below the diagonal are 0.0; entries strictly above it are
    -inf, so position i cannot attend to positions after i.

    Args:
        sequence_length: side of the square mask.
        device: device to create the mask on. When omitted, the notebook-level
            DEVICE is used (the behaviour callers without this argument had).
    """
    if device is None:
        device = DEVICE
    return torch.triu(
        torch.full((sequence_length, sequence_length), float('-inf'), device=device),
        diagonal=1,
    )


def create_mask(src, tgt, DEVICE):
    """
    Build the four masks passed to the model for a batch.

    Sequence lengths are read from axis 1 and batch sizes from axis 0 of
    `src` and `tgt`. All masks are created on `DEVICE`.

    Returns:
        src_mask: all-False bool mask, (src_len, src_len) - nothing is masked.
        tgt_mask: causal float mask, (tgt_len, tgt_len).
        src_padding_mask: all-False bool mask, (batch, src_len) - no padding.
        tgt_padding_mask: all-False bool mask, (batch, tgt_len) - no padding.
    """
    src_seq_len = src.shape[1]
    tgt_seq_len = tgt.shape[1]

    # The causal mask must live on the same device as the other masks, so the
    # device is passed explicitly instead of relying on the global.
    tgt_mask = generate_square_subsequent_mask(tgt_seq_len, device=DEVICE)
    src_mask = torch.zeros((src_seq_len, src_seq_len), device=DEVICE).type(torch.bool)

    src_padding_mask = torch.zeros((src.shape[0], src_seq_len), device=DEVICE).bool()
    tgt_padding_mask = torch.zeros((tgt.shape[0], tgt_seq_len), device=DEVICE).bool()

    return src_mask, tgt_mask, src_padding_mask, tgt_padding_mask

In [None]:
# Pick the compute device in order of preference: CUDA, then Apple MPS, then CPU
if torch.cuda.is_available():
    DEVICE = torch.device('cuda')
elif torch.backends.mps.is_available():
    DEVICE = torch.device('mps')
else:
    DEVICE = torch.device('cpu')

In [None]:

# Model hyperparameters
n = 20                  # node vocabulary size handed to the model
n_enc = n_dec = 6       # encoder / decoder layers
de = dd = 128           # encoder / decoder model dimensions
N_HEAD = 8              # attention heads
DROPOUT = 0.3
dim_feedforward = 1024  # matches the model's default when left unspecified

# Build the model and move it to the selected device
model = TSPTransformer(
    n,
    n_enc,
    n_dec,
    de,
    dd,
    N_HEAD,
    DROPOUT,
    dim_feedforward,
).to(DEVICE)

## Training

In [None]:
import math

def evaluate(model, valloader, loss_fn, DEVICE):
    """
    Return the average per-batch loss of `model` over `valloader`.

    The model is switched to evaluation mode and the whole pass runs under
    torch.no_grad(), so no autograd graph is built for validation batches.

    Args:
        model: network called as model(src, tgt_input, *masks).
        valloader: iterable of (src, tgt) batches; must support len().
        loss_fn: criterion applied to flattened logits and targets.
        DEVICE: device the batches and masks are moved to / created on.
    """
    # Set the model to evaluation mode
    model.eval()
    losses = 0

    with torch.no_grad():
        # Iterate over batches in the validation loader
        for src, tgt in valloader:
            # Move data to the correct device (CPU/GPU)
            src = src.to(DEVICE)  # Node coordinates (input to the encoder)
            tgt = tgt.to(DEVICE)  # Target sequence (tour)

            # Teacher forcing: the decoder sees the tour without its last
            # token and must predict the tour without its first token
            tgt_input = tgt[:, :-1]
            tgt_out = tgt[:, 1:]

            # Generate masks for attention
            src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input, DEVICE)

            # Forward pass: get model predictions
            logits = model(src, tgt_input, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask)

            # Flatten (batch, position) so every position is one classification
            loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
            losses += loss.item()

    # Return the average loss
    return losses / len(valloader)

In [None]:
def train_epoch(model, optimizer, trainloader, loss_fn, DEVICE):
    """
    Run one optimisation pass over `trainloader` and return the mean batch loss.

    Each batch is a (src, tgt) pair. The decoder is fed `tgt` without its last
    token and trained to predict `tgt` without its first token; the optimizer
    takes one step per batch.
    """
    model.train()
    running_loss = 0

    for src, tgt in trainloader:
        # Bring the batch onto the training device
        src, tgt = src.to(DEVICE), tgt.to(DEVICE)

        # Shifted views of the tour: decoder input vs. expected output
        decoder_in = tgt[:, :-1]
        expected = tgt[:, 1:]

        # Start the step from clean gradients
        optimizer.zero_grad()

        # Attention masks come back in the order the model's forward expects
        masks = create_mask(src, decoder_in, DEVICE)
        logits = model(src, decoder_in, *masks)

        # One classification per (sample, position): flatten both sides
        num_classes = logits.shape[-1]
        loss = loss_fn(logits.reshape(-1, num_classes), expected.reshape(-1))

        # Backpropagate and update the weights
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Mean loss over the batches of this epoch
    return running_loss / len(trainloader)

### Training WITHOUT gradient accumulation

In [None]:
import torch
from torch.optim import AdamW
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts
from torch.nn import CrossEntropyLoss
from time import time
from torch.utils.data import DataLoader

# Hyperparameters: optimiser
LEARNING_RATE = 0.0002
BETAS = (0.9, 0.98)
EPSILON = 1e-9
WEIGHT_DECAY = 0.01

# Hyperparameters: run control
NUM_EPOCHS = 15
PRINT_EVERY = 3
EARLY_STOPPING_PATIENCE = 5

# Hyperparameters: cosine annealing with warm restarts
COSINE_T0 = 5
COSINE_T_MULT = 2
COSINE_ETA_MIN = 1e-6

# Optimizer, learning-rate scheduler and loss function
optimizer = AdamW(
    model.parameters(),
    lr=LEARNING_RATE,
    betas=BETAS,
    eps=EPSILON,
    weight_decay=WEIGHT_DECAY,
)
scheduler = CosineAnnealingWarmRestarts(
    optimizer,
    T_0=COSINE_T0,
    T_mult=COSINE_T_MULT,
    eta_min=COSINE_ETA_MIN,
)
loss_fn = CrossEntropyLoss()

# Bookkeeping for the run
train_loss_history = []
val_loss_history = []
best_val_loss = float('inf')
best_epoch = 0
no_improvement_counter = 0  # epochs in a row without a new best validation loss

# Wall-clock reference for the 10-minute budget
training_start_time = time()

for epoch in range(1, NUM_EPOCHS + 1):
    epoch_start_time = time()

    # One pass over the training data
    epoch_train_loss = train_epoch(model, optimizer, trainloader, loss_fn, DEVICE)
    train_loss_history.append(epoch_train_loss)

    # Validation loss for this epoch
    epoch_val_loss = evaluate(model, valloader, loss_fn, DEVICE)
    val_loss_history.append(epoch_val_loss)

    # The scheduler advances once per epoch
    scheduler.step()

    if not (epoch_val_loss < best_val_loss):
        # No new best: one more epoch without improvement
        no_improvement_counter += 1
    else:
        # New best validation loss: remember it and checkpoint the model
        best_val_loss, best_epoch = epoch_val_loss, epoch
        no_improvement_counter = 0
        torch.save(
            dict(
                epoch=epoch,
                model_state_dict=model.state_dict(),
                optimizer_state_dict=optimizer.state_dict(),
                train_loss=epoch_train_loss,
                val_loss=epoch_val_loss,
            ),
            'best_model.pt',
        )

    # Periodic progress report
    if epoch % PRINT_EVERY == 0:
        print(
            f"Epoch: {epoch}",
            f"   Training Loss: {epoch_train_loss:.3f}, Validation Loss: {epoch_val_loss:.3f}",
            f"   Learning Rate: {scheduler.get_last_lr()[0]:.6f}",
            f"   Epoch Time: {time() - epoch_start_time:.3f} seconds",
            sep='\n',
        )

    # Early stopping once the patience is exhausted
    if no_improvement_counter >= EARLY_STOPPING_PATIENCE:
        print(f'Early stopping at epoch {epoch} after {EARLY_STOPPING_PATIENCE} epochs without improvement.')
        break

    # Hard stop once the run has lasted 10 minutes (600 seconds)
    if time() - training_start_time >= 600:
        print(f'Reached ten minutes of training without overfitting at epoch {epoch}')
        break

# Total wall-clock time of the run
total_training_time = time() - training_start_time

# Summary of the run
print(f'Total Training Time: {total_training_time / 60:.2f} minutes')
print(f"The best model was obtained at epoch {best_epoch} with:")
print(f"   Training Loss: {train_loss_history[best_epoch - 1]:.3f}")
print(f"   Validation Loss: {best_val_loss:.3f}")

In [None]:


# NOTE(review): `plt` and `pd` are not imported in any cell visible here -
# presumably matplotlib.pyplot and pandas; confirm they are imported before
# this cell runs.

# Create the figure
plt.figure(figsize=(10, 6))

# One curve per loss history, with epochs numbered from 1
for history, label, color in (
    (train_loss_history, 'Training Loss', 'blue'),
    (val_loss_history, 'Validation Loss', 'orange'),
):
    plt.plot(range(1, len(history) + 1), history, label=label, marker='o', linestyle='-', color=color)

# Axis labels and title
plt.xlabel('Epoch', fontsize=12)
plt.ylabel('Loss', fontsize=12)
plt.title('Training and Validation Loss Over Epochs', fontsize=14, fontweight='bold')

# Grid for readability
plt.grid(True, linestyle='--', alpha=0.6)

# Legend
plt.legend(fontsize=12)

# Show the plot
plt.show()

# Tabulate the losses: one row per epoch
loss_table = pd.DataFrame({
    'Epoch': range(1, len(train_loss_history) + 1),
    'Training Loss': train_loss_history,
    'Validation Loss': val_loss_history
})

# Print the table
print(loss_table)

### Training WITH gradient accumulation

In [None]:
# Function to train for one epoch with gradient accumulation
def train_epoch_with_accumulation(model, optimizer, trainloader, loss_fn, DEVICE, accumulation_steps):
    """
    Train for one epoch, stepping the optimizer once every `accumulation_steps` batches.

    Gradients of consecutive batches are accumulated and averaged over the
    group before each optimizer step. When the number of batches is not a
    multiple of `accumulation_steps`, the trailing shorter group is also
    applied (averaged over its actual size) instead of being discarded.

    Args:
        model: network called as model(src, tgt_input, *masks).
        optimizer: optimizer updating the model parameters.
        trainloader: iterable of (src, tgt) batches; must support len().
        loss_fn: criterion applied to flattened logits and targets.
        DEVICE: device the batches and masks are moved to / created on.
        accumulation_steps: number of batches per optimizer step.

    Returns:
        The mean (unscaled) batch loss over the epoch.
    """
    model.train()
    total_loss = 0.0
    num_batches = len(trainloader)
    # Number of batches in the trailing incomplete group (0 when none)
    leftover = num_batches % accumulation_steps
    optimizer.zero_grad()  # Clear gradients at the start of the epoch

    for step, (src, tgt) in enumerate(trainloader, start=1):
        src = src.to(DEVICE)  # Move data to the correct device (CPU/GPU)
        tgt = tgt.to(DEVICE)

        # Prepare input and output for the decoder
        tgt_input = tgt[:, :-1]
        tgt_out = tgt[:, 1:]

        # Forward pass
        src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input, DEVICE)
        logits = model(src, tgt_input, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask)

        # Compute the loss
        loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))

        # Scale by the size of the group this batch belongs to, so the
        # accumulated gradient is the group's average
        in_trailing_group = leftover and step > num_batches - leftover
        group_size = leftover if in_trailing_group else accumulation_steps
        (loss / group_size).backward()  # Accumulate gradients

        # Step at the end of every full group and after the very last batch
        if step % accumulation_steps == 0 or step == num_batches:
            optimizer.step()  # Update weights
            optimizer.zero_grad()  # Clear gradients for the next accumulation

        # Track the unscaled loss
        total_loss += loss.item()

    # Return the average loss for the epoch
    return total_loss / num_batches

In [None]:
import torch
from torch.optim import AdamW
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts
from torch.nn import CrossEntropyLoss
from time import time
from torch.utils.data import DataLoader

# NOTE(review): this cell uses the same `model` object that the cell without
# gradient accumulation trains, and writes to the same 'best_model.pt' path.
# If the two runs are meant to be compared independently, a freshly
# initialised model and a separate checkpoint file are needed - confirm.

# Hyperparameters: optimiser
LEARNING_RATE = 0.0002
BETAS = (0.9, 0.98)
EPSILON = 1e-9
WEIGHT_DECAY = 0.01

# Hyperparameters: run control
NUM_EPOCHS = 15
PRINT_EVERY = 3
EARLY_STOPPING_PATIENCE = 5
GRADIENT_ACCUMULATION_STEPS = 4  # batches accumulated per optimizer step

# Hyperparameters: cosine annealing with warm restarts
COSINE_T0 = 5
COSINE_T_MULT = 2
COSINE_ETA_MIN = 1e-6

# Optimizer, learning-rate scheduler and loss function
optimizer = AdamW(
    model.parameters(),
    lr=LEARNING_RATE,
    betas=BETAS,
    eps=EPSILON,
    weight_decay=WEIGHT_DECAY,
)
scheduler = CosineAnnealingWarmRestarts(
    optimizer,
    T_0=COSINE_T0,
    T_mult=COSINE_T_MULT,
    eta_min=COSINE_ETA_MIN,
)
loss_fn = CrossEntropyLoss()

# Bookkeeping for the run
train_loss_history = []
val_loss_history = []
best_val_loss = float('inf')
best_epoch = 0
no_improvement_counter = 0  # epochs in a row without a new best validation loss

# Wall-clock reference for the 10-minute budget
training_start_time = time()

for epoch in range(1, NUM_EPOCHS + 1):
    epoch_start_time = time()

    # One pass over the training data, accumulating gradients across batches
    epoch_train_loss = train_epoch_with_accumulation(
        model, optimizer, trainloader, loss_fn, DEVICE, GRADIENT_ACCUMULATION_STEPS
    )
    train_loss_history.append(epoch_train_loss)

    # Validation loss for this epoch
    epoch_val_loss = evaluate(model, valloader, loss_fn, DEVICE)
    val_loss_history.append(epoch_val_loss)

    # The scheduler advances once per epoch
    scheduler.step()

    if not (epoch_val_loss < best_val_loss):
        # No new best: one more epoch without improvement
        no_improvement_counter += 1
    else:
        # New best validation loss: remember it and checkpoint the model
        best_val_loss, best_epoch = epoch_val_loss, epoch
        no_improvement_counter = 0
        torch.save(
            dict(
                epoch=epoch,
                model_state_dict=model.state_dict(),
                optimizer_state_dict=optimizer.state_dict(),
                train_loss=epoch_train_loss,
                val_loss=epoch_val_loss,
            ),
            'best_model.pt',
        )

    # Periodic progress report
    if epoch % PRINT_EVERY == 0:
        print(
            f"Epoch: {epoch}",
            f"   Training Loss: {epoch_train_loss:.3f}, Validation Loss: {epoch_val_loss:.3f}",
            f"   Learning Rate: {scheduler.get_last_lr()[0]:.6f}",
            f"   Epoch Time: {time() - epoch_start_time:.3f} seconds",
            sep='\n',
        )

    # Early stopping once the patience is exhausted
    if no_improvement_counter >= EARLY_STOPPING_PATIENCE:
        print(f'Early stopping at epoch {epoch} after {EARLY_STOPPING_PATIENCE} epochs without improvement.')
        break

    # Hard stop once the run has lasted 10 minutes (600 seconds)
    if time() - training_start_time >= 600:
        print(f'Reached ten minutes of training without overfitting at epoch {epoch}')
        break

# Total wall-clock time of the run
total_training_time = time() - training_start_time

# Summary of the run
print(f'Total Training Time: {total_training_time / 60:.2f} minutes')
print(f"The best model was obtained at epoch {best_epoch} with:")
print(f"   Training Loss: {train_loss_history[best_epoch - 1]:.3f}")
print(f"   Validation Loss: {best_val_loss:.3f}")

## Testing