Se compara el desempeño entre los siguientes modelos/solvers:

* Modelo base
* Modelo bloque Transformer
* Modelo bloque Transformer * 6
* Modelo con encoding posicional
* Greedy
* OR-Tools


## Descarga de librerías

In [2]:
# Descarga de librerías

# Linux only, doesn't work on windows
# ! python -c "import ortools" 2>/dev/null  && echo "OR-Tools is already installed" || pip install ortools -q
# ! [[ ! -d eda ]]  && echo "Downloading eda repo" && curl -L  https://github.com/rilianx/eda/archive/refs/heads/main.tar.gz | tar xzvf - && mv eda-main eda
# import torch
# !pip install tqdm

In [3]:
# ! curl -LO https://github.com/rilianx/eda/archive/refs/heads/main.tar.gz
# ! 7z x main.tar.gz
# ! 7z x main.tar
# ! move eda-main eda
# ! del main.tar.gz
# ! del main.tar

## state2vecSeq


In [1]:
# Generación de datos


from copy import deepcopy
import random
import math

import numpy as np
from torch.nn.functional import one_hot
from eda.TSP import TSP_Instance, TSP_Environment, TSP_State
from eda.solveTSP_v2 import solve
env = TSP_Environment

def distance(punto1, punto2):
    return math.sqrt((punto1[0] - punto2[0])**2 + (punto1[1] - punto2[1])**2)

# función para transformar un estado tsp en una secuencia de vectores
# para el modelo basado en capas de atención
def state2vecSeq(self):
    # creamos dos diccionarios para mantenre un mapeo de los
    # movimientos con los índices de la secuencia del modelo de aprendizaje

    city_locations = self.inst_info.city_locations

    idx2move = dict()
    move2idx = dict()
    origin = city_locations[self.visited[-1]]
    destination = city_locations[self.visited[0]]

    origin_dist = 0.0
    dest_dist = distance(origin, destination)

    seq = [list(origin) + [1,0] + [origin_dist, dest_dist], # Última ciudad visitada (origen)
           list(destination) + [0, 1] + [dest_dist, 0.0]]  # Ciudad final

    idx2move[0] = None
    idx2move[1] = ("constructive", self.visited[0])
    move2idx[self.visited[0]] = 1

    idx = 2
    for i in self.not_visited:
        point = list(city_locations[i])
        origin_dist = distance( point, origin)
        dest_dist = distance( point, destination)
        city_vector = point + [0, 0] + [origin_dist, dest_dist] # Otras ciudades

        seq.append(city_vector)
        idx2move[idx] = ("constructive", i)
        move2idx[i] = idx
        idx += 1

    return seq, idx2move, move2idx




## Meta clase para definir modelos y ciclo de entrenamiento

Se definen hiperparámetros de los modelos a evaluar

In [2]:
## Todos los modelos serán entrenados con el mismo dataset
# X: [20000, 11, 6], Y: [20000, 11]
# donde X: (nb_sample, max_cities + 1, param_count), Y: (nb_sample, max_cities+1)
import torch
from tqdm import tqdm
class Model:
    # El modelo se genera en el constructor y se guarda en self.model
    def __init__(self, 
         input_dim = 6,
         num_heads = 10,
         head_dim = 64,
         city_count = 50,
                 
         batch_size = 512,
         train_split = 0.5,
         nb_samples = 20000,
         epochs = 10):

        self.city_count = city_count # Número de ciudades a evaluar
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        ## Parámetros modelo
        self.input_dim = input_dim  # Dimensión de la entrada
        self.num_heads = num_heads  # Número de cabezas en la atención multi-cabeza
        self.head_dim = head_dim  # Dimensión de cada cabeza
        ## Parámetros entrenamiento
        self.batch_size = batch_size
        self.train_split = train_split
        self.nb_samples = nb_samples
        self.epochs = 10

        
        self.model = None
    
    
    def load_model(self):
        raise NotImplementedError("La función 'load_model' debe ser declarada")
    def unload_model(self):
        raise NotImplementedError("La función 'unload_model' debe ser implementada");

    def train(self, x, y):
        raise NotImplementedError("La función 'train' debe ser declarada")
      

    def generate_data(self, use_progress_bar=False):
        X = []  # Lista para almacenar las secuencias de entrada
        Y = []  # Lista para almacenar las etiquetas objetivo (las siguientes ciudades a visitar)
        seq_len = self.city_count + 1  # Longitud de la secuencia, ajustada para incluir una ciudad extra
        
        # If the flag is set, initialize the progress bar
        pbar = tqdm(total=self.nb_samples, desc="Generating data", unit="sample", position=0, leave=True) if use_progress_bar else None
        
        # Bucle para generar datos hasta alcanzar el número deseado de muestras
        while True:
            # 1. Generamos instancia aleatoria
            n_cities = self.city_count
            dim = 2  # Dimensión para las coordenadas de la ciudad (2D: x, y)
            city_points = np.random.rand(n_cities, dim)  # Generar puntos aleatorios para las ciudades
            inst_info = TSP_Instance(city_points)
    
            # 2. Resolvemos TSP usando algoritmo tradicional
            solution = solve(city_points)  # Resolver el TSP y obtener un estado final
    
            # 3. Iteramos sobre los movimientos de la solución final para generar varias muestras:
            # estado (X) -> movimiento (Y)
            current_state = TSP_State(inst_info)
            env.state_transition(current_state, ("constructive", solution.visited[0]))
            samples_per_sol = self.city_count - 1  # Número máximo de muestras por solución
            
            for move in [("constructive", city) for city in solution.visited[1:]]:
                seq, _, move2idx = state2vecSeq(current_state)  # Convertir el estado actual a secuencia vectorizada
    
                X.append(torch.tensor(seq))  # Añadir la secuencia a X
                Y.append(one_hot(torch.tensor(move2idx[move[1]]), num_classes=seq_len))
                #Y.append(to_categorical(move2idx[move[1]], num_classes=seq_len))  # Añadir el movimiento como categoría a Y
    
                env.state_transition(current_state, move)  # Hacer la transición al siguiente estado
    
                # Actualizar el progreso de la barra si se está usando
                if use_progress_bar:
                    pbar.update(1)
    
                # Condiciones de parada basadas en el número de ciudades visitadas/no visitadas o muestras generadas
                if len(current_state.visited) > samples_per_sol or len(X) >= self.nb_samples:
                    break
    
            # Romper el bucle externo si se ha alcanzado el número deseado de muestras
            if len(X) >= self.nb_samples:
                break
        
        # Close the progress bar if it was used
        if use_progress_bar:
            pbar.close()
    
        X_padded = torch.nn.utils.rnn.pad_sequence(X, batch_first=True)
        
        return X_padded, torch.stack(Y)


Model()

<__main__.Model at 0x26209d7a5d0>

In [3]:
## Definición base de modelo para usar el mismo entrenamiento en los distintos modelos
import pandas as pd
import gc
class TrainableModel(Model):
    def __init__(self):
        super().__init__()
        # Parámetros del modelo
    def unload_model(self):
        del model.model
        torch.cuda.empty_cache()
        gc.collect()

    def train(self, xt, yt, xv, yv, num_iter=-1, use_progress_bar=False):
        # self.load_model()
        # Asumiendo que X_padded y Y_stacked ya están definidos y son tensores de PyTorch
        trd = TensorDataset(xt, yt)
        ted = TensorDataset(xv, yv)
    
        # # Dividir el dataset en entrenamiento y prueba
        # train_size = int(self.train_split * len(dataset))
        # test_size = len(dataset) - train_size
        train_dataset, test_dataset = trd, ted
    
        train_loader = DataLoader(train_dataset, batch_size=self.batch_size, shuffle=True)
        test_loader = DataLoader(test_dataset, batch_size=self.batch_size, shuffle=False)
    
        # Definir el modelo, la función de pérdida y el optimizador
        loss_function = torch.nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(self.model.parameters())
    
        # Initialize the DataFrame to store training results
        df = pd.DataFrame(columns=["Model Name", "cities", "iter", "Epoch",
                                   "Training Loss", "Training Accuracy",
                                   "Validation Loss", "Validation Accuracy"])
    
        # Initialize the progress bar for epochs if required
        epoch_range = range(self.epochs)
        if use_progress_bar:
            epoch_range = tqdm(epoch_range, desc="Training Epochs", unit="epoch", position = 0, leave = True)
        
        print("Entrenando modelo...")
        for epoch in epoch_range:
            self.model.train()
            train_loss = 0
            correct = 0
            total = 0
            for X_batch, y_batch in train_loader:
                optimizer.zero_grad()  # Limpia los gradientes
                outputs = self.model(X_batch)  # Obtenemos logits
                loss = loss_function(outputs, y_batch.argmax(dim=1))  # Calcular la pérdida
                loss.backward()  # Backward pass
                optimizer.step()  # Actualizar parámetros
                train_loss += loss.item() * X_batch.size(0)
                _, predicted = torch.max(outputs.data, 1)
                total += y_batch.size(0)
                correct += (predicted == y_batch.argmax(dim=1)).sum().item()
    
            train_loss /= len(train_loader.dataset)
            train_accuracy = 100 * correct / total
    
            # Validación
            self.model.eval()
            validation_loss = 0
            correct = 0
            total = 0
            with torch.no_grad():
                for X_batch, y_batch in test_loader:
                    outputs = self.model(X_batch)
                    loss = loss_function(outputs, y_batch.argmax(dim=1))
                    validation_loss += loss.item() * X_batch.size(0)
                    _, predicted = torch.max(outputs.data, 1)
                    total += y_batch.size(0)
                    correct += (predicted == y_batch.argmax(dim=1)).sum().item()
    
            validation_loss /= len(test_loader.dataset)
            validation_accuracy = 100 * correct / total
    
            # Log results to DataFrame
            df = pd.concat([df, pd.DataFrame([{
                "Model Name": type(self).__name__,
                "cities": self.city_count,
                "iter": num_iter,
                "Epoch": epoch + 1,
                "Training Loss": train_loss,
                "Training Accuracy": train_accuracy,
                "Validation Loss": validation_loss,
                "Validation Accuracy": validation_accuracy
            }])], ignore_index=True)
    
            # print(f'Epoch {epoch+1}, Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.2f}%')
            # print(f'Epoch {epoch+1}, Val Loss: {validation_loss:.4f}, Val Accuracy: {validation_accuracy:.2f}%')
    
        # If tqdm was used, close the progress bar
        if use_progress_bar:
            epoch_range.close()
    
        # self.unload_model()
        return df;


In [4]:
## Modelo base

import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset, random_split

class BaseModel(TrainableModel):

    
    def load_model(self):
        self.model = BaseModel.CustomModel(self.input_dim, self.num_heads, self.head_dim)
        
    class CustomModel(nn.Module):
        def __init__(self, input_dim, num_heads, head_dim, dropout_rate=0.2):
            super(BaseModel.CustomModel, self).__init__()
            #self.seq_length = seq_length  # Asumiendo una longitud fija de secuencia para simplificar
            self.num_heads = num_heads
            self.head_dim = head_dim

            # Proyección de entrada
            self.input_projection = nn.Linear(input_dim, num_heads * head_dim)

            # Capa de atención multi-cabeza
            self.multihead_attention = nn.MultiheadAttention(embed_dim=num_heads * head_dim,
                                                            num_heads=num_heads,
                                                            dropout=dropout_rate)

            # Capas lineales individuales para cada posición de la secuencia
            # Esto es un cambio respecto al código original para aplicar una capa lineal por posición de salida
            self.positionwise_linear = nn.Linear(num_heads * head_dim, 1)

            # Capa de salida final, después de un flatten, para aplicar Softmax
            # Nota: Softmax se aplica después del flatten, por lo tanto no se define aquí como una capa pero sí en el forward

        def generate_attention_mask(self, x, padding_value=0):
            # Identificar posiciones de padding en x
            mask = (x.sum(dim=-1) == padding_value)  # Asumiendo que el padding se puede identificar sumando los valores de la característica y comparando con 0
            mask = mask.to(dtype=torch.bool)  # Convierte a bool para usar como máscara
            # PyTorch espera una máscara con True y False donde True indica donde aplicar la máscara
            return mask


        def forward(self, x, seq_lengths=10, return_probabilities=False):
            # x: [batch_size, seq_length, input_dim]
            x = x.float()

            max_len = x.shape[1]

            # Generar máscara de atención basada en las longitudes de las secuencias
            attn_mask = None

            # Aplicar proyección de entrada
            x_proj = self.input_projection(x)
            x_proj = x_proj.permute(1, 0, 2)  # Reordenar para multihead_attention: [seq_length, batch_size, num_heads*head_dim]


            # Aplicar atención multi-cabeza
            attn_output, _ = self.multihead_attention(x_proj, x_proj, x_proj, attn_mask=attn_mask)
            attn_output = attn_output.permute(1, 0, 2)  # Reordenar de vuelta: [batch_size, seq_length, num_heads*head_dim]

            # Aplicar la capa lineal posición por posición
            # Usamos una capa lineal que se aplica a cada vector de salida de la atención de forma independiente
            positionwise_output = self.positionwise_linear(attn_output)

            # Flatten
            flat_output = positionwise_output.view(positionwise_output.size(0), -1)  # [batch_size, seq_length]

            # Softmax
            if return_probabilities:
                output = F.softmax(flat_output, dim=-1)
                return output
            else: #return logits
                return flat_output


    
    def __init__(self):
        super().__init__()
    def load_model(self):
        self.model = BaseModel.CustomModel(input_dim=self.input_dim, num_heads=self.num_heads, head_dim=self.head_dim)
        self.model.to(self.device)



In [5]:
class TransformerBlockModel(TrainableModel):

  class CustomModel(nn.Module):
    class TransformerBlock(nn.Module):
        def __init__(self, input_dim, num_heads, head_dim, ff_dim, dropout_rate=0.2):
            super(TransformerBlockModel.CustomModel.TransformerBlock, self).__init__()
            self.attention = nn.MultiheadAttention(embed_dim=num_heads * head_dim, num_heads=num_heads, dropout=dropout_rate)
            self.norm1 = nn.LayerNorm(input_dim)  # LayerNorm based on input_dim
            self.ff = nn.Sequential(
                nn.Linear(input_dim, ff_dim),
                nn.ReLU(),
            )
            self.norm2 = nn.LayerNorm(input_dim)  # LayerNorm based on input_dim
            self.dropout = nn.Dropout(dropout_rate)
    
        def forward(self, x):
            # Attention block
            attn_output, _ = self.attention(x, x, x)  # x: (seq_len, batch_size, input_dim)
            x = self.norm1(x + self.dropout(attn_output))  # Residual + Norm
            
            # Feed-forward block
            ff_output = self.ff(x)
            x = self.norm2(x + self.dropout(ff_output))  # Residual + Norm
        
            return x

    def __init__(self, input_dim, num_heads, head_dim, dropout_rate=0.2):
        super(TransformerBlockModel.CustomModel, self).__init__()
        #self.seq_length = seq_length  # Asumiendo una longitud fija de secuencia para simplificar
        self.num_heads = num_heads
        self.head_dim = head_dim
        
        # Proyección de entrada
        self.input_projection = nn.Linear(input_dim, num_heads * head_dim)
        
        self.attention_blocks = nn.ModuleList([
          TransformerBlockModel.CustomModel.TransformerBlock(input_dim=num_heads * head_dim,
                                        num_heads=num_heads,
                                        head_dim=head_dim,
                                        ff_dim=num_heads * head_dim,
                                        dropout_rate=dropout_rate)
        ])
    
        # Capas lineales individuales para cada posición de la secuencia
        # Esto es un cambio respecto al código original para aplicar una capa lineal por posición de salida
        self.positionwise_linear = nn.Linear(num_heads * head_dim, 1)
        
        # Capa de salida final, después de un flatten, para aplicar Softmax
        # Nota: Softmax se aplica después del flatten, por lo tanto no se define aquí como una capa pero sí en el forward

    def generate_attention_mask(self, x, padding_value=0):
      # Identificar posiciones de padding en x
      mask = (x.sum(dim=-1) == padding_value)  # Asumiendo que el padding se puede identificar sumando los valores de la característica y comparando con 0
      mask = mask.to(dtype=torch.bool)  # Convierte a bool para usar como máscara
      # PyTorch espera una máscara con True y False donde True indica donde aplicar la máscara
      return mask
    
    
    def forward(self, x, seq_lengths=10, return_probabilities=False):
      # x: [batch_size, seq_length, input_dim]
      x = x.float()
    
      max_len = x.shape[1]
    
      # Generar máscara de atención basada en las longitudes de las secuencias
      attn_mask = None
    
      # Aplicar proyección de entrada
      x_proj = self.input_projection(x)
      attn_output = x_proj
      x_proj = x_proj.permute(1, 0, 2)  # Reordenar para multihead_attention: [seq_length, batch_size, num_heads*head_dim]
    
    
      # Aplicar atención multi-cabeza
      attn_output = self.attention_blocks[0](x_proj)
      attn_output = attn_output.permute(1, 0, 2)  # Reordenar de vuelta: [batch_size, seq_length, num_heads*head_dim]
    
      # Aplicar la capa lineal posición por posición
      # Usamos una capa lineal que se aplica a cada vector de salida de la atención de forma independiente
      positionwise_output = self.positionwise_linear(attn_output)
    
      # Flatten
      flat_output = positionwise_output.view(positionwise_output.size(0), -1)  # [batch_size, seq_length]
    
      # Softmax
      if return_probabilities:
        output = F.softmax(flat_output, dim=-1)
        return output
      else: #return logits
        return flat_output

  def load_model(self):
    self.model = TransformerBlockModel.CustomModel(input_dim=self.input_dim, num_heads=self.num_heads, head_dim=self.head_dim)
    self.model = self.model.to(self.device)


In [6]:
## Transformer Block Multicapa

In [7]:
class MultilayerTransformerBlockModel(TrainableModel):
  class CustomModel(TransformerBlockModel.CustomModel):
      def __init__(self, input_dim, num_heads, head_dim, dropout_rate=0.2):
          super().__init__(input_dim, num_heads, head_dim, dropout_rate)
          self.attention_blocks = nn.ModuleList([TransformerBlockModel.CustomModel.TransformerBlock(input_dim=num_heads * head_dim,
                                            num_heads=num_heads,
                                            head_dim=head_dim,
                                            ff_dim=num_heads * head_dim,
                                            dropout_rate=dropout_rate)]*6)
      def forward(self, x, seq_lengths=10, return_probabilities=False):
          # x: [batch_size, seq_length, input_dim]
          x = x.float()

          max_len = x.shape[1]

          # Generar máscara de atención basada en las longitudes de las secuencias
          attn_mask = None

          # Aplicar proyección de entrada
          x_proj = self.input_projection(x)
          attn_output = x_proj
          x_proj = x_proj.permute(1, 0, 2)  # Reordenar para multihead_attention: [seq_length, batch_size, num_heads*head_dim]


          # Aplicar atención multi-cabeza
          for i in range(len(self.attention_blocks)):
              attn_output = self.attention_blocks[i](x_proj)
          
          attn_output = attn_output.permute(1, 0, 2)  # Reordenar de vuelta: [batch_size, seq_length, num_heads*head_dim]

          # Aplicar la capa lineal posición por posición
          # Usamos una capa lineal que se aplica a cada vector de salida de la atención de forma independiente
          positionwise_output = self.positionwise_linear(attn_output)

          # Flatten
          flat_output = positionwise_output.view(positionwise_output.size(0), -1)  # [batch_size, seq_length]

          # Softmax
          if return_probabilities:
            output = F.softmax(flat_output, dim=-1)
            return output
          else: #return logits
            return flat_output
  def __init__(self):
    super().__init__()
  def load_model(self):
    self.model = self.CustomModel(input_dim=self.input_dim, num_heads=self.num_heads, head_dim=self.head_dim)
    self.model = self.model.to(self.device)

In [8]:
# import pandas as pd
# df = pd.DataFrame(columns=["Model Name", "cities", "iter", "Epoch", "Training Loss", "Training Accuracy", "Validation Loss", "Validation Accuracy"])
# df

# dfb = pd.DataFrame(columns=["Model Name", "cities", "iter", "avg path cost"])

# import gc
# models = [BaseModel(), TransformerBlockModel(), MultilayerTransformerBlockModel()]
# # for city_count in [10, 50, 100, 500]:
# num_iters=10
# eval_count = 50
# # for city_count in [50]:

# from eda.TSP import TSP_Instance, TSP_Environment, TSP_State, evalConstructiveActions, plot_tour
# from eda.agents import SingleAgentSolver, GreedyAgent
# import numpy as np


# class ModelEvalActions():
#   def __init__(self, model):
#     self.model=model

#   # permite evaluar acctiones de varios estados a la vez
#   # para optimizar los cáluclos del modelo
#   def __call__(self, states, env):
#     single_state = False
#     if not isinstance(states, list):
#       single_state=True
#       states = [states]

#     evals = [list() for _ in states]
#     vecSeqs=[]; move2idx =[]

#     for state in states:
#       vecSeq, _, mov2idx = state.state2vecSeq()
#       vecSeqs.append(vecSeq)
#       move2idx.append(mov2idx)
#     device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
#     predictions = self.model(torch.tensor(vecSeqs).to(device), return_probabilities=True)

#     for k in range(len(states)):
#       state = states[k]
#       for action in env.gen_actions(state, "constructive"):
#           idx = move2idx[k][action[1]] #mapping from move to output index (model)
#           evals[k].append((action,predictions[k][idx]))

#     if single_state: return evals[0]
#     else: return evals

# for iter in range(num_iters):
#     X = None
#     Y = None
#     for model in models:
#         if X == None and Y == None:
#             print(f"Generando datos iteración i={iter}")
#             X, Y = model.generate_data(use_progress_bar=True)
#             X = X.to(model.device)
#             Y = Y.to(model.device)
        
#         name = type(model).__name__
#         print("Entrenando", name, "con n=", model.city_count)
#         trained_model_df = model.train(X, Y, num_iter=iter, use_progress_bar=True)
#         pd.concat([df, trained_model_df])
#         # Evaluación post entrenamiento
#         print("Evaluando modelo: Generando 20 instancias")
#         instances = [
#             TSP_Instance(np.random.rand(model.city_count, 2)) for _ in tqdm(
#                 range(eval_count), desc="Instancias", unit="instance", position=0, leave=True)
#         ]
#         greedy = SingleAgentSolver (env,GreedyAgent(ModelEvalActions(model.model)))
#         solutions = []
#         for instance in tqdm(instances, desc="Solving Instances", unit="instance", position=0, leave=True):
#             solution, *_ = greedy.solve(TSP_State(instance, visited=[0]))
#             solutions.append(solution.cost)
#         model.unload_model()

#         solutions_prom = sum(solutions) / len(solutions)
#         dfb = pd.concat([dfb, pd.DataFrame([{
#             "Model Name": type(model).__name__,
#             "iter": iter,
#             "cities": model.city_count,
#             "avg path cost": solutions_prom
#         }])])
        
#     del X
#     del Y
#     X = None
#     Y = None


# df

In [9]:
df.to_csv("out2.csv")

NameError: name 'df' is not defined

In [None]:
dfb.groupby(["Model Name", "cities"]).mean()

In [10]:
df

NameError: name 'df' is not defined

In [11]:
train_df = pd.DataFrame(columns=["model_name", "fold", "epoch", "tr_loss", "tr_acc", "val_loss", "val_acc"])
train_df

Unnamed: 0,model_name,fold,epoch,tr_loss,tr_acc,val_loss,val_acc


In [12]:
eval_df = pd.DataFrame(columns=["model_name", "fold", "cost"])

In [13]:
from sklearn.model_selection import KFold

from eda.TSP import TSP_Instance, TSP_Environment, TSP_State, evalConstructiveActions, plot_tour
from eda.agents import SingleAgentSolver, GreedyAgent
nb_cities = 50
nb_samples=20000
nb_eval = 50
k_folds=10
iter=0
models = [BaseModel(), TransformerBlockModel(), MultilayerTransformerBlockModel()]
class ModelEvalActions():
  def __init__(self, model):
    self.model=model

  # permite evaluar acctiones de varios estados a la vez
  # para optimizar los cáluclos del modelo
  def __call__(self, states, env):
    single_state = False
    if not isinstance(states, list):
      single_state=True
      states = [states]

    evals = [list() for _ in states]
    vecSeqs=[]; move2idx =[]

    for state in states:
      vecSeq, _, mov2idx = state.state2vecSeq()
      vecSeqs.append(vecSeq)
      move2idx.append(mov2idx)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    predictions = self.model(torch.tensor(vecSeqs).to(device), return_probabilities=True)

    for k in range(len(states)):
      state = states[k]
      for action in env.gen_actions(state, "constructive"):
          idx = move2idx[k][action[1]] #mapping from move to output index (model)
          evals[k].append((action,predictions[k][idx]))

    if single_state: return evals[0]
    else: return evals


for model in models:
        print(f"Iteration {iter}: Preparing dataset")

        # Generate data once per iteration
        print(f"Generating data for iteration {iter}")
        X, Y = model.generate_data(use_progress_bar=True)
        X = X.to(model.device)
        Y = Y.to(model.device)

        kfold = KFold(n_splits=k_folds, shuffle=True, random_state=iter)
        fold = 0

        for train_idx, val_idx in kfold.split(X):
            fold += 1

            print(f"Training {type(model).__name__} on fold {fold}")

            # Split data into train and validation sets
            X_train, X_val = X[train_idx], X[val_idx]
            Y_train, Y_val = Y[train_idx], Y[val_idx]

            model.load_model()
            # Train the model
            trained_model_df = model.train(X_train, Y_train, X_val, Y_val, use_progress_bar=True)
                        # Log training metrics
            for _, row in trained_model_df.iterrows():
                train_df = pd.concat([train_df, pd.DataFrame([{
                    "model_name": type(model).__name__,
                    "fold": fold,
                    "epoch": row["Epoch"],
                    "tr_loss": row["Training Loss"],
                    "tr_acc": row["Training Accuracy"],
                    "val_loss": row["Validation Loss"],
                    "val_acc": row["Validation Accuracy"]
                }])])

            print(f"Evaluating {type(model).__name__} on fold {fold}")
            instances = [
                TSP_Instance(np.random.rand(model.city_count, 2)) for _ in tqdm(
                    range(nb_eval), desc="Instances", unit="instance", position=0, leave=True
                )
            ]
            greedy = SingleAgentSolver(env, GreedyAgent(ModelEvalActions(model.model)))
            solutions = []

            for instance in tqdm(instances, desc="Solving Instances", unit="instance", position=0, leave=True):
                solution, *_ = greedy.solve(TSP_State(instance, visited=[0]))
                eval_df = pd.concat([eval_df, pd.DataFrame([{
                    "model_name" : type(model).__name__,
                    "fold": fold,
                    "cost": solution.cost,                
            }]) ])

            model.unload_model()







Iteration 0: Preparing dataset
Generating data for iteration 0


Generating data: 100%|█████████████████████████████████████████████████████| 20000/20000 [00:10<00:00, 1862.58sample/s]


Training BaseModel on fold 1


Training Epochs:   0%|                                                                       | 0/10 [00:00<?, ?epoch/s]

Entrenando modelo...


  df = pd.concat([df, pd.DataFrame([{
Training Epochs: 100%|██████████████████████████████████████████████████████████████| 10/10 [00:17<00:00,  1.75s/epoch]
  train_df = pd.concat([train_df, pd.DataFrame([{


Evaluating BaseModel on fold 1


Instances: 100%|██████████████████████████████████████████████████████████████| 50/50 [00:00<00:00, 24927.52instance/s]
  eval_df = pd.concat([eval_df, pd.DataFrame([{
Solving Instances: 100%|█████████████████████████████████████████████████████████| 50/50 [00:05<00:00,  9.42instance/s]


Training BaseModel on fold 2


Training Epochs:   0%|                                                                       | 0/10 [00:00<?, ?epoch/s]

Entrenando modelo...


  df = pd.concat([df, pd.DataFrame([{
Training Epochs: 100%|██████████████████████████████████████████████████████████████| 10/10 [00:14<00:00,  1.50s/epoch]


Evaluating BaseModel on fold 2


Instances: 100%|██████████████████████████████████████████████████████████████| 50/50 [00:00<00:00, 16677.15instance/s]
Solving Instances: 100%|█████████████████████████████████████████████████████████| 50/50 [00:04<00:00, 10.42instance/s]


Training BaseModel on fold 3


Training Epochs:   0%|                                                                       | 0/10 [00:00<?, ?epoch/s]

Entrenando modelo...


  df = pd.concat([df, pd.DataFrame([{
Training Epochs: 100%|██████████████████████████████████████████████████████████████| 10/10 [00:15<00:00,  1.51s/epoch]


Evaluating BaseModel on fold 3


Instances: 100%|██████████████████████████████████████████████████████████████| 50/50 [00:00<00:00, 19933.01instance/s]
Solving Instances: 100%|█████████████████████████████████████████████████████████| 50/50 [00:05<00:00,  8.94instance/s]


Training BaseModel on fold 4


Training Epochs:   0%|                                                                       | 0/10 [00:00<?, ?epoch/s]

Entrenando modelo...


  df = pd.concat([df, pd.DataFrame([{
Training Epochs: 100%|██████████████████████████████████████████████████████████████| 10/10 [00:16<00:00,  1.62s/epoch]


Evaluating BaseModel on fold 4


Instances: 100%|██████████████████████████████████████████████████████████████| 50/50 [00:00<00:00, 25007.77instance/s]
Solving Instances: 100%|█████████████████████████████████████████████████████████| 50/50 [00:05<00:00,  9.40instance/s]


Training BaseModel on fold 5


Training Epochs:   0%|                                                                       | 0/10 [00:00<?, ?epoch/s]

Entrenando modelo...


  df = pd.concat([df, pd.DataFrame([{
Training Epochs: 100%|██████████████████████████████████████████████████████████████| 10/10 [00:15<00:00,  1.52s/epoch]


Evaluating BaseModel on fold 5


Instances: 100%|██████████████████████████████████████████████████████████████| 50/50 [00:00<00:00, 25631.29instance/s]
Solving Instances: 100%|█████████████████████████████████████████████████████████| 50/50 [00:04<00:00, 10.16instance/s]


Training BaseModel on fold 6


Training Epochs:   0%|                                                                       | 0/10 [00:00<?, ?epoch/s]

Entrenando modelo...


  df = pd.concat([df, pd.DataFrame([{
Training Epochs: 100%|██████████████████████████████████████████████████████████████| 10/10 [00:15<00:00,  1.53s/epoch]


Evaluating BaseModel on fold 6


Instances: 100%|██████████████████████████████████████████████████████████████| 50/50 [00:00<00:00, 24963.12instance/s]
Solving Instances: 100%|█████████████████████████████████████████████████████████| 50/50 [00:05<00:00,  9.24instance/s]


Training BaseModel on fold 7


Training Epochs:   0%|                                                                       | 0/10 [00:00<?, ?epoch/s]

Entrenando modelo...


  df = pd.concat([df, pd.DataFrame([{
Training Epochs: 100%|██████████████████████████████████████████████████████████████| 10/10 [00:15<00:00,  1.51s/epoch]


Evaluating BaseModel on fold 7


Instances: 100%|██████████████████████████████████████████████████████████████| 50/50 [00:00<00:00, 27335.14instance/s]
Solving Instances: 100%|█████████████████████████████████████████████████████████| 50/50 [00:04<00:00, 10.01instance/s]


Training BaseModel on fold 8


Training Epochs:   0%|                                                                       | 0/10 [00:00<?, ?epoch/s]

Entrenando modelo...


  df = pd.concat([df, pd.DataFrame([{
Training Epochs: 100%|██████████████████████████████████████████████████████████████| 10/10 [00:16<00:00,  1.64s/epoch]


Evaluating BaseModel on fold 8


Instances: 100%|██████████████████████████████████████████████████████████████| 50/50 [00:00<00:00, 19988.11instance/s]
Solving Instances: 100%|█████████████████████████████████████████████████████████| 50/50 [00:05<00:00,  9.14instance/s]


Training BaseModel on fold 9


Training Epochs:   0%|                                                                       | 0/10 [00:00<?, ?epoch/s]

Entrenando modelo...


  df = pd.concat([df, pd.DataFrame([{
Training Epochs: 100%|██████████████████████████████████████████████████████████████| 10/10 [00:16<00:00,  1.63s/epoch]


Evaluating BaseModel on fold 9


Instances: 100%|██████████████████████████████████████████████████████████████| 50/50 [00:00<00:00, 25034.64instance/s]
Solving Instances: 100%|█████████████████████████████████████████████████████████| 50/50 [00:04<00:00, 10.20instance/s]


Training BaseModel on fold 10


Training Epochs:   0%|                                                                       | 0/10 [00:00<?, ?epoch/s]

Entrenando modelo...


  df = pd.concat([df, pd.DataFrame([{
Training Epochs: 100%|██████████████████████████████████████████████████████████████| 10/10 [00:15<00:00,  1.52s/epoch]


Evaluating BaseModel on fold 10


Instances: 100%|██████████████████████████████████████████████████████████████| 50/50 [00:00<00:00, 21879.52instance/s]
Solving Instances: 100%|█████████████████████████████████████████████████████████| 50/50 [00:05<00:00,  9.26instance/s]


Iteration 0: Preparing dataset
Generating data for iteration 0


Generating data: 100%|█████████████████████████████████████████████████████| 20000/20000 [00:10<00:00, 1822.80sample/s]


Training TransformerBlockModel on fold 1


Training Epochs:   0%|                                                                       | 0/10 [00:00<?, ?epoch/s]

Entrenando modelo...


  df = pd.concat([df, pd.DataFrame([{
Training Epochs: 100%|██████████████████████████████████████████████████████████████| 10/10 [00:19<00:00,  1.97s/epoch]


Evaluating TransformerBlockModel on fold 1


Instances: 100%|██████████████████████████████████████████████████████████████| 50/50 [00:00<00:00, 25037.63instance/s]
Solving Instances: 100%|█████████████████████████████████████████████████████████| 50/50 [00:05<00:00,  9.08instance/s]


Training TransformerBlockModel on fold 2


Training Epochs:   0%|                                                                       | 0/10 [00:00<?, ?epoch/s]

Entrenando modelo...


  df = pd.concat([df, pd.DataFrame([{
Training Epochs: 100%|██████████████████████████████████████████████████████████████| 10/10 [00:20<00:00,  2.01s/epoch]


Evaluating TransformerBlockModel on fold 2


Instances: 100%|██████████████████████████████████████████████████████████████| 50/50 [00:00<00:00, 25001.81instance/s]
Solving Instances: 100%|█████████████████████████████████████████████████████████| 50/50 [00:05<00:00,  8.55instance/s]


Training TransformerBlockModel on fold 3


Training Epochs:   0%|                                                                       | 0/10 [00:00<?, ?epoch/s]

Entrenando modelo...


  df = pd.concat([df, pd.DataFrame([{
Training Epochs: 100%|██████████████████████████████████████████████████████████████| 10/10 [00:20<00:00,  2.02s/epoch]


Evaluating TransformerBlockModel on fold 3


Instances: 100%|██████████████████████████████████████████████████████████████| 50/50 [00:00<00:00, 19326.81instance/s]
Solving Instances: 100%|█████████████████████████████████████████████████████████| 50/50 [00:05<00:00,  8.92instance/s]


Training TransformerBlockModel on fold 4


Training Epochs:   0%|                                                                       | 0/10 [00:00<?, ?epoch/s]

Entrenando modelo...


  df = pd.concat([df, pd.DataFrame([{
Training Epochs: 100%|██████████████████████████████████████████████████████████████| 10/10 [00:19<00:00,  1.96s/epoch]


Evaluating TransformerBlockModel on fold 4


Instances: 100%|██████████████████████████████████████████████████████████████| 50/50 [00:00<00:00, 25019.71instance/s]
Solving Instances: 100%|█████████████████████████████████████████████████████████| 50/50 [00:06<00:00,  8.13instance/s]


Training TransformerBlockModel on fold 5


Training Epochs:   0%|                                                                       | 0/10 [00:00<?, ?epoch/s]

Entrenando modelo...


  df = pd.concat([df, pd.DataFrame([{
Training Epochs: 100%|██████████████████████████████████████████████████████████████| 10/10 [00:20<00:00,  2.10s/epoch]


Evaluating TransformerBlockModel on fold 5


Instances: 100%|██████████████████████████████████████████████████████████████| 50/50 [00:00<00:00, 19206.45instance/s]
Solving Instances: 100%|█████████████████████████████████████████████████████████| 50/50 [00:05<00:00,  8.60instance/s]


Training TransformerBlockModel on fold 6


Training Epochs:   0%|                                                                       | 0/10 [00:00<?, ?epoch/s]

Entrenando modelo...


  df = pd.concat([df, pd.DataFrame([{
Training Epochs: 100%|██████████████████████████████████████████████████████████████| 10/10 [00:19<00:00,  1.95s/epoch]


Evaluating TransformerBlockModel on fold 6


Instances: 100%|██████████████████████████████████████████████████████████████| 50/50 [00:00<00:00, 25013.74instance/s]
Solving Instances: 100%|█████████████████████████████████████████████████████████| 50/50 [00:05<00:00,  8.95instance/s]


Training TransformerBlockModel on fold 7


Training Epochs:   0%|                                                                       | 0/10 [00:00<?, ?epoch/s]

Entrenando modelo...


  df = pd.concat([df, pd.DataFrame([{
Training Epochs: 100%|██████████████████████████████████████████████████████████████| 10/10 [00:19<00:00,  1.98s/epoch]


Evaluating TransformerBlockModel on fold 7


Instances: 100%|██████████████████████████████████████████████████████████████| 50/50 [00:00<00:00, 19122.39instance/s]
Solving Instances: 100%|█████████████████████████████████████████████████████████| 50/50 [00:05<00:00,  8.80instance/s]


Training TransformerBlockModel on fold 8


Training Epochs:   0%|                                                                       | 0/10 [00:00<?, ?epoch/s]

Entrenando modelo...


  df = pd.concat([df, pd.DataFrame([{
Training Epochs: 100%|██████████████████████████████████████████████████████████████| 10/10 [00:19<00:00,  1.96s/epoch]


Evaluating TransformerBlockModel on fold 8


Instances: 100%|██████████████████████████████████████████████████████████████| 50/50 [00:00<00:00, 19380.39instance/s]
Solving Instances: 100%|█████████████████████████████████████████████████████████| 50/50 [00:05<00:00,  8.41instance/s]


Training TransformerBlockModel on fold 9


Training Epochs:   0%|                                                                       | 0/10 [00:00<?, ?epoch/s]

Entrenando modelo...


  df = pd.concat([df, pd.DataFrame([{
Training Epochs: 100%|██████████████████████████████████████████████████████████████| 10/10 [00:19<00:00,  1.95s/epoch]


Evaluating TransformerBlockModel on fold 9


Instances: 100%|██████████████████████████████████████████████████████████████| 50/50 [00:00<00:00, 25028.67instance/s]
Solving Instances: 100%|█████████████████████████████████████████████████████████| 50/50 [00:05<00:00,  8.87instance/s]


Training TransformerBlockModel on fold 10


Training Epochs:   0%|                                                                       | 0/10 [00:00<?, ?epoch/s]

Entrenando modelo...


  df = pd.concat([df, pd.DataFrame([{
Training Epochs: 100%|██████████████████████████████████████████████████████████████| 10/10 [00:20<00:00,  2.01s/epoch]


Evaluating TransformerBlockModel on fold 10


Instances: 100%|██████████████████████████████████████████████████████████████| 50/50 [00:00<00:00, 25028.67instance/s]
Solving Instances: 100%|█████████████████████████████████████████████████████████| 50/50 [00:06<00:00,  8.24instance/s]


Iteration 0: Preparing dataset
Generating data for iteration 0


Generating data: 100%|█████████████████████████████████████████████████████| 20000/20000 [00:10<00:00, 1824.76sample/s]


Training MultilayerTransformerBlockModel on fold 1


Training Epochs:   0%|                                                                       | 0/10 [00:00<?, ?epoch/s]

Entrenando modelo...


  df = pd.concat([df, pd.DataFrame([{
Training Epochs: 100%|██████████████████████████████████████████████████████████████| 10/10 [00:51<00:00,  5.13s/epoch]


Evaluating MultilayerTransformerBlockModel on fold 1


Instances: 100%|██████████████████████████████████████████████████████████████| 50/50 [00:00<00:00, 16667.87instance/s]
Solving Instances: 100%|█████████████████████████████████████████████████████████| 50/50 [00:12<00:00,  4.01instance/s]


Training MultilayerTransformerBlockModel on fold 2


Training Epochs:   0%|                                                                       | 0/10 [00:00<?, ?epoch/s]

Entrenando modelo...


  df = pd.concat([df, pd.DataFrame([{
Training Epochs: 100%|██████████████████████████████████████████████████████████████| 10/10 [00:50<00:00,  5.02s/epoch]


Evaluating MultilayerTransformerBlockModel on fold 2


Instances: 100%|██████████████████████████████████████████████████████████████| 50/50 [00:00<00:00, 25004.79instance/s]
Solving Instances: 100%|█████████████████████████████████████████████████████████| 50/50 [00:12<00:00,  4.10instance/s]


Training MultilayerTransformerBlockModel on fold 3


Training Epochs:   0%|                                                                       | 0/10 [00:00<?, ?epoch/s]

Entrenando modelo...


  df = pd.concat([df, pd.DataFrame([{
Training Epochs: 100%|██████████████████████████████████████████████████████████████| 10/10 [00:52<00:00,  5.28s/epoch]


Evaluating MultilayerTransformerBlockModel on fold 3


Instances: 100%|██████████████████████████████████████████████████████████████| 50/50 [00:00<00:00, 25010.76instance/s]
Solving Instances: 100%|█████████████████████████████████████████████████████████| 50/50 [00:11<00:00,  4.17instance/s]


Training MultilayerTransformerBlockModel on fold 4


Training Epochs:   0%|                                                                       | 0/10 [00:00<?, ?epoch/s]

Entrenando modelo...


  df = pd.concat([df, pd.DataFrame([{
Training Epochs: 100%|██████████████████████████████████████████████████████████████| 10/10 [00:49<00:00,  4.98s/epoch]


Evaluating MultilayerTransformerBlockModel on fold 4


Instances: 100%|███████████████████████████████████████████████████████████████| 50/50 [00:00<00:00, 7089.76instance/s]
Solving Instances: 100%|█████████████████████████████████████████████████████████| 50/50 [00:12<00:00,  4.13instance/s]


Training MultilayerTransformerBlockModel on fold 5


Training Epochs:   0%|                                                                       | 0/10 [00:00<?, ?epoch/s]

Entrenando modelo...


  df = pd.concat([df, pd.DataFrame([{
Training Epochs: 100%|██████████████████████████████████████████████████████████████| 10/10 [00:50<00:00,  5.07s/epoch]


Evaluating MultilayerTransformerBlockModel on fold 5


Instances: 100%|██████████████████████████████████████████████████████████████| 50/50 [00:00<00:00, 16671.85instance/s]
Solving Instances: 100%|█████████████████████████████████████████████████████████| 50/50 [00:11<00:00,  4.18instance/s]


Training MultilayerTransformerBlockModel on fold 6


Training Epochs:   0%|                                                                       | 0/10 [00:00<?, ?epoch/s]

Entrenando modelo...


  df = pd.concat([df, pd.DataFrame([{
Training Epochs: 100%|██████████████████████████████████████████████████████████████| 10/10 [00:51<00:00,  5.13s/epoch]


Evaluating MultilayerTransformerBlockModel on fold 6


Instances: 100%|██████████████████████████████████████████████████████████████| 50/50 [00:00<00:00, 24989.90instance/s]
Solving Instances: 100%|█████████████████████████████████████████████████████████| 50/50 [00:12<00:00,  4.07instance/s]


Training MultilayerTransformerBlockModel on fold 7


Training Epochs:   0%|                                                                       | 0/10 [00:00<?, ?epoch/s]

Entrenando modelo...


  df = pd.concat([df, pd.DataFrame([{
Training Epochs: 100%|██████████████████████████████████████████████████████████████| 10/10 [00:50<00:00,  5.04s/epoch]


Evaluating MultilayerTransformerBlockModel on fold 7


Instances: 100%|██████████████████████████████████████████████████████████████| 50/50 [00:00<00:00, 16325.33instance/s]
Solving Instances: 100%|█████████████████████████████████████████████████████████| 50/50 [00:11<00:00,  4.19instance/s]


Training MultilayerTransformerBlockModel on fold 8


Training Epochs:   0%|                                                                       | 0/10 [00:00<?, ?epoch/s]

Entrenando modelo...


  df = pd.concat([df, pd.DataFrame([{
Training Epochs: 100%|██████████████████████████████████████████████████████████████| 10/10 [00:50<00:00,  5.08s/epoch]


Evaluating MultilayerTransformerBlockModel on fold 8


Instances: 100%|██████████████████████████████████████████████████████████████| 50/50 [00:00<00:00, 19881.99instance/s]
Solving Instances: 100%|█████████████████████████████████████████████████████████| 50/50 [00:12<00:00,  4.03instance/s]


Training MultilayerTransformerBlockModel on fold 9


Training Epochs:   0%|                                                                       | 0/10 [00:00<?, ?epoch/s]

Entrenando modelo...


  df = pd.concat([df, pd.DataFrame([{
Training Epochs: 100%|██████████████████████████████████████████████████████████████| 10/10 [00:51<00:00,  5.17s/epoch]


Evaluating MultilayerTransformerBlockModel on fold 9


Instances: 100%|██████████████████████████████████████████████████████████████| 50/50 [00:00<00:00, 13437.25instance/s]
Solving Instances: 100%|█████████████████████████████████████████████████████████| 50/50 [00:14<00:00,  3.53instance/s]


Training MultilayerTransformerBlockModel on fold 10


Training Epochs:   0%|                                                                       | 0/10 [00:00<?, ?epoch/s]

Entrenando modelo...


  df = pd.concat([df, pd.DataFrame([{
Training Epochs: 100%|██████████████████████████████████████████████████████████████| 10/10 [00:59<00:00,  5.96s/epoch]


Evaluating MultilayerTransformerBlockModel on fold 10


Instances: 100%|██████████████████████████████████████████████████████████████| 50/50 [00:00<00:00, 16256.99instance/s]
Solving Instances: 100%|█████████████████████████████████████████████████████████| 50/50 [00:14<00:00,  3.47instance/s]


In [18]:
train_df

Unnamed: 0,model_name,fold,epoch,tr_loss,tr_acc,val_loss,val_acc
0,BaseModel,1,1,1.555729,58.283333,0.599872,82.35
0,BaseModel,1,2,0.618517,80.038889,0.530862,83.05
0,BaseModel,1,3,0.587735,80.783333,0.556028,83.35
0,BaseModel,1,4,0.577756,81.072222,0.525841,83.35
0,BaseModel,1,5,0.566840,81.750000,0.510448,83.95
...,...,...,...,...,...,...,...
0,MultilayerTransformerBlockModel,10,6,0.550962,81.950000,0.490693,85.20
0,MultilayerTransformerBlockModel,10,7,0.547259,81.861111,0.526866,84.80
0,MultilayerTransformerBlockModel,10,8,0.551965,81.983333,0.508395,85.45
0,MultilayerTransformerBlockModel,10,9,0.540237,82.166667,0.488472,85.15


In [19]:
eval_df

Unnamed: 0,model_name,fold,cost
0,BaseModel,1,6.797305
0,BaseModel,1,6.305213
0,BaseModel,1,6.239691
0,BaseModel,1,6.563191
0,BaseModel,1,7.003203
...,...,...,...
0,MultilayerTransformerBlockModel,10,6.766856
0,MultilayerTransformerBlockModel,10,6.136618
0,MultilayerTransformerBlockModel,10,6.017674
0,MultilayerTransformerBlockModel,10,6.733908


In [35]:
ex_traindf = train_df.reset_index().drop('index', axis=1)
ex_evaldf = eval_df.reset_index().drop('index', axis=1)
ex_evaldf


Unnamed: 0,model_name,fold,cost
0,BaseModel,1,5.927755
1,BaseModel,1,6.535351
2,BaseModel,1,6.602594
3,BaseModel,1,6.763597
4,BaseModel,1,7.058743
...,...,...,...
1495,MultilayerTransformerBlockModel,10,7.180407
1496,MultilayerTransformerBlockModel,10,6.208302
1497,MultilayerTransformerBlockModel,10,5.976801
1498,MultilayerTransformerBlockModel,10,6.413471


In [36]:
ex_traindf.to_csv("train_df.csv")
ex_evaldf.to_csv("eval_df.csv")