# Imports

In [1]:
from unsloth import FastLanguageModel
import torch
from tqdm import tqdm
from typing import List, Dict, Any
import json
import re

import sys
import os
current_dir = os.path.abspath('')
project_root = os.path.dirname(current_dir)
sys.path.insert(0, project_root)

from grpo.grpo import SYSTEM_PROMPT, extract_trace, optimal_solution_reward_func, improvement_reward_func, valid_response_reward_func, strict_format_reward_func, soft_format_reward_func
from tsp.tsp_llm import load_tsp_problem_dataset, coordinates_to_tsp
from tsp.tsp import calculate_tsp_distance

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 04-06 00:23:27 [__init__.py:239] Automatically detected platform cuda.
Standard import failed for UnslothOnlineDPOTrainer: No module named 'UnslothOnlineDPOTrainer'. Using tempfile instead!


# Constants

In [2]:
MODEL = "unsloth/Qwen2.5-7B-Instruct-bnb-4bit"
BENCHMARK_DATASET = "tsp_benchmark_dataset.json"
BENCHMARK_RESULTS = "tsp_benchmark_results_qwen2.5-7B-2.json"

# Load Model

In [3]:
# Load model and tokenizer
model, tokenizer = FastLanguageModel.from_pretrained(
  model_name=MODEL,
  max_seq_length=4096,
  load_in_4bit=True,
  dtype=torch.bfloat16
)

# Prepare model for inference
FastLanguageModel.for_inference(model)

==((====))==  Unsloth 2025.3.19: Fast Qwen2 patching. Transformers: 4.50.3. vLLM: 0.8.2.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.381 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(152064, 3584, padding_idx=151654)
    (layers): ModuleList(
      (0-27): 28 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear4bit(in_features=3584, out_features=3584, bias=True)
          (k_proj): Linear4bit(in_features=3584, out_features=512, bias=True)
          (v_proj): Linear4bit(in_features=3584, out_features=512, bias=True)
          (o_proj): Linear4bit(in_features=3584, out_features=3584, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear4bit(in_features=3584, out_features=18944, bias=False)
          (up_proj): Linear4bit(in_features=3584, out_features=18944, bias=False)
          (down_proj): Linear4bit(in_features=18944, out_features=3584, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((

# Generate

In [4]:
def extract_assistant_response_regex(text: str) -> str:
    """
    Extract the assistant's response from the decoded text output of Qwen.
    
    Args:
        text: The decoded text output from the model
        
    Returns:
        str: The extracted assistant response or empty string if no match found
    """
    # Common patterns for assistant responses in chat models
    patterns = [
        r"(?:^|\n)(?:assistant|assistant:)(.*?)(?:$|\n\s*(?:user|user:|system|system:|<\|im_end\|>))",  # Matches cases with various endings
        r"<\|im_start\|>assistant\s*(.*?)(?:<\|im_end\|>|$)",  # Matches with special tokens
        r"(?:^|\n)assistant:\s*(.*?)(?:$|\n\s*(?:user|user:|system|system:))",  # Standard chat format
    ]
    
    # Try each pattern until we find a match
    for pattern in patterns:
        matches = re.search(pattern, text, re.DOTALL | re.IGNORECASE)
        if matches:
            return matches.group(1).strip()
    
    # If no structured format is found, return everything after the last "user:" or system prompt
    last_user = re.search(r"(?:^|\n)user:(?!.*\nuser:).*?\n(.*?)$", text, re.DOTALL | re.IGNORECASE)
    if last_user:
        return last_user.group(1).strip()
    
    # If all else fails, return the original text (this is a fallback)
    return text.strip()

def generate(prompt: str, max_new_tokens: int = 2500, num_samples: int = 3, temperature: float = 0.7, top_p: float = 0.9) -> Dict[str, Any]:
    """
    Generate a response from the model based on the prompt.

    Args:
        prompt: The input prompt
        max_new_tokens: Max number of tokens to generate
        num_samples: Number of samples per prompt
        temperature: Model temperature
        top_p: Model top_p
    
    Returns:
        Dict[str, Any]: Summary dict containing input token count, average output token count, and
                        model responses
    """

    # Format chat message for Qwen
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": prompt}
    ]

    # Tokenize input and move input tensors to GPU
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to("cuda")

    # Get input token count
    input_token_count = inputs.shape[1]

    # Generate outputs
    outputs = model.generate(
        input_ids=inputs,
        max_new_tokens=max_new_tokens,
        num_return_sequences=num_samples,
        do_sample=True,
        temperature=temperature,
        top_p=top_p,
        return_dict_in_generate=True,
        output_scores=True
    )
        
    # Get output token counts
    output_sequences = outputs.sequences
    avg_output_token_count = sum([len(o) - input_token_count for o in output_sequences]) / len(output_sequences)
        
    # Process outputs
    responses = [tokenizer.decode(o, skip_special_tokens=True) for o in output_sequences]
    responses = [extract_assistant_response_regex(r) for r in responses]

    # Return token counts and responses
    out = {
        "input token count": input_token_count,
        "average output token count": avg_output_token_count,
        "responses": responses
    }

    return out

# Apply Reward Functions

In [5]:
def apply_reward_functions(problem: Dict, responses: List[str]) -> Dict:
    n = len(responses)
    tsp = coordinates_to_tsp(problem['coordinates'])

    # Extract traces and distances from model responses
    traces = [extract_trace(response) for response in responses]
    distances = [round(calculate_tsp_distance(tsp, trace)) for trace in traces]
    
    # Construct args for reward functions
    completions = [[{'content': response}] for response in responses]
    kwargs = {
        'tsp': [tsp for _ in responses],
        'answer': [problem['solution'] for _ in responses],
        'reference_distance': [problem['reference_distance'] for _ in responses]
    }

    # Invoke reward functions
    optimal_solution_rewards = optimal_solution_reward_func(completions, **kwargs)
    improvement_rewards = improvement_reward_func(completions, **kwargs)
    valid_response_rewards = valid_response_reward_func(completions, **kwargs)
    strict_format_rewards = strict_format_reward_func(completions, **kwargs)
    soft_format_rewards = soft_format_reward_func(completions, **kwargs)
    
    print("NEW PROBLEM")
    print(f"target_distance: {problem['solution']['distance'] * 1.1}")
    for i in range(len(responses)):
        print(f"sample_{i}")
        print(f"model distance: {distances[i]}, optimal solution reward: {optimal_solution_rewards[i]}, improvement reward: {improvement_rewards[i]}, valid response reward: {valid_response_rewards[i]}, strict format: {strict_format_rewards[i]}, soft format: {soft_format_rewards[i]}")

    # Calculate problem averages
    problem_summary = {
        "problem_id": problem["problem_id"],
        "solution": problem["solution"],
        "average optimal solution reward": sum(optimal_solution_rewards) / n,
        "average improvement reward": sum(improvement_rewards) / n,
        "average valid response reward": sum(valid_response_rewards) / n,
        "average strict format reward": sum(strict_format_rewards) / n,
        "average soft format reward": sum(soft_format_rewards) / n
    }

    # Add summary for each sample
    for i in range(n):
        sample_key = f"sample_{i}"
        problem_summary[sample_key] = {
            "response": responses[i],
            "solution": {
                "path": traces[i],
                "distance": distances[i]
            },
            "optimal solution reward": optimal_solution_rewards[i],
            "improvement reward": improvement_rewards[i],
            "valid response reward": valid_response_rewards[i],
            "strict format reward": strict_format_rewards[i],
            "soft format reward": soft_format_rewards[i],
        }
    
    return problem_summary

# Load Benchmark Dataset and Solve

In [6]:
# Load benchmark dataset
dataset = load_tsp_problem_dataset(BENCHMARK_DATASET)

pbar = tqdm(total=len(dataset) * len(dataset["size_5"]), desc="Solving TSP Benchmark")

results = {}

# Solve benchmark dataset
for size_key in dataset:
  problems = dataset[size_key]

  size_results = []

  for problem in problems:
    prompt = problem['prompt']
    solution = problem['solution']

    # Generate 3 completions per prompt
    generate_out = generate(prompt, num_samples=3)

    # Calculate average rewards across 3 samples
    reward_out = apply_reward_functions(problem, generate_out['responses'])

    # Remove completions from gen out
    del generate_out['responses']

    # Append problem summary to results
    generate_out.update(reward_out)
    problem_summary = generate_out
    size_results.append(problem_summary)

    # Update progress bar
    pbar.update(1)
  
  results[size_key] = size_results

pbar.close()

Solving TSP Benchmark:   0%|          | 0/50 [00:00<?, ?it/s]The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Solving TSP Benchmark:   2%|▏         | 1/50 [01:57<1:36:11, 117.78s/it]

NEW PROBLEM
target_distance: 602.8000000000001
sample_0
model distance: 561, optimal solution reward: 2.5, improvement reward: 0.0, valid response reward: 0.5, strict format: 0.0, soft format: 0.0
sample_1
model distance: 561, optimal solution reward: 2.5, improvement reward: 0.0, valid response reward: 0.5, strict format: 0.0, soft format: 0.0
sample_2
model distance: 559, optimal solution reward: 2.5, improvement reward: 2.5, valid response reward: 0.5, strict format: 0.0, soft format: 0.0


Solving TSP Benchmark:   4%|▍         | 2/50 [04:03<1:37:53, 122.37s/it]

NEW PROBLEM
target_distance: 498.30000000000007
sample_0
model distance: 0, optimal solution reward: 0.0, improvement reward: 0.0, valid response reward: 0.0, strict format: 0.0, soft format: 0.0
sample_1
model distance: 471, optimal solution reward: 2.5, improvement reward: 0.0, valid response reward: 0.5, strict format: 0.0, soft format: 0.0
sample_2
model distance: 453, optimal solution reward: 2.5, improvement reward: 2.5, valid response reward: 0.5, strict format: 0.0, soft format: 0.0


Solving TSP Benchmark:   6%|▌         | 3/50 [06:08<1:36:59, 123.81s/it]

NEW PROBLEM
target_distance: 478.50000000000006
sample_0
model distance: 353, optimal solution reward: 0.0, improvement reward: 0.0, valid response reward: 0.0, strict format: 0.0, soft format: 0.0
sample_1
model distance: 456, optimal solution reward: 2.5, improvement reward: 0.0, valid response reward: 0.5, strict format: 0.0, soft format: 0.0
sample_2
model distance: 0, optimal solution reward: 0.0, improvement reward: 0.0, valid response reward: 0.0, strict format: 0.0, soft format: 0.0


Solving TSP Benchmark:   8%|▊         | 4/50 [07:30<1:22:06, 107.09s/it]

NEW PROBLEM
target_distance: 419.1
sample_0
model distance: 477, optimal solution reward: 0.0, improvement reward: 0.0, valid response reward: 0.5, strict format: 0.25, soft format: 0.25
sample_1
model distance: 403, optimal solution reward: 2.5, improvement reward: 0.0, valid response reward: 0.5, strict format: 0.0, soft format: 0.0
sample_2
model distance: 381, optimal solution reward: 2.5, improvement reward: 0.0, valid response reward: 0.5, strict format: 0.0, soft format: 0.0


Solving TSP Benchmark:  10%|█         | 5/50 [09:21<1:21:21, 108.48s/it]

NEW PROBLEM
target_distance: 456.50000000000006
sample_0
model distance: 460, optimal solution reward: 0.0, improvement reward: 2.5, valid response reward: 0.5, strict format: 0.0, soft format: 0.0
sample_1
model distance: 495, optimal solution reward: 0.0, improvement reward: 0.0, valid response reward: 0.5, strict format: 0.0, soft format: 0.0
sample_2
model distance: 415, optimal solution reward: 2.5, improvement reward: 2.5, valid response reward: 0.5, strict format: 0.0, soft format: 0.0


Solving TSP Benchmark:  12%|█▏        | 6/50 [10:52<1:15:12, 102.57s/it]

NEW PROBLEM
target_distance: 490.6
sample_0
model distance: 458, optimal solution reward: 2.5, improvement reward: 0.0, valid response reward: 0.5, strict format: 0.0, soft format: 0.0
sample_1
model distance: 458, optimal solution reward: 2.5, improvement reward: 0.0, valid response reward: 0.5, strict format: 0.0, soft format: 0.0
sample_2
model distance: 458, optimal solution reward: 2.5, improvement reward: 0.0, valid response reward: 0.5, strict format: 0.25, soft format: 0.25


Solving TSP Benchmark:  14%|█▍        | 7/50 [12:02<1:05:57, 92.04s/it] 

NEW PROBLEM
target_distance: 524.7
sample_0
model distance: 485, optimal solution reward: 2.5, improvement reward: 0.0, valid response reward: 0.5, strict format: 0.0, soft format: 0.0
sample_1
model distance: 477, optimal solution reward: 2.5, improvement reward: 0.0, valid response reward: 0.5, strict format: 0.0, soft format: 0.0
sample_2
model distance: 648, optimal solution reward: 0.0, improvement reward: 0.0, valid response reward: 0.5, strict format: 0.0, soft format: 0.0


Solving TSP Benchmark:  16%|█▌        | 8/50 [14:08<1:11:52, 102.69s/it]

NEW PROBLEM
target_distance: 435.6
sample_0
model distance: 561, optimal solution reward: 0.0, improvement reward: 0.0, valid response reward: 0.5, strict format: 0.25, soft format: 0.25
sample_1
model distance: 451, optimal solution reward: 0.0, improvement reward: 0.0, valid response reward: 0.5, strict format: 0.0, soft format: 0.0
sample_2
model distance: 0, optimal solution reward: 0.0, improvement reward: 0.0, valid response reward: 0.0, strict format: 0.0, soft format: 0.0


Solving TSP Benchmark:  18%|█▊        | 9/50 [15:35<1:06:55, 97.94s/it] 

NEW PROBLEM
target_distance: 498.30000000000007
sample_0
model distance: 487, optimal solution reward: 2.5, improvement reward: 2.5, valid response reward: 0.5, strict format: 0.0, soft format: 0.0
sample_1
model distance: 532, optimal solution reward: 0.0, improvement reward: 0.0, valid response reward: 0.5, strict format: 0.0, soft format: 0.0
sample_2
model distance: 453, optimal solution reward: 2.5, improvement reward: 2.5, valid response reward: 0.5, strict format: 0.0, soft format: 0.0


Solving TSP Benchmark:  20%|██        | 10/50 [16:30<56:20, 84.51s/it] 

NEW PROBLEM
target_distance: 573.1
sample_0
model distance: 522, optimal solution reward: 2.5, improvement reward: 2.5, valid response reward: 0.5, strict format: 0.0, soft format: 0.0
sample_1
model distance: 522, optimal solution reward: 2.5, improvement reward: 2.5, valid response reward: 0.5, strict format: 0.25, soft format: 0.25
sample_2
model distance: 522, optimal solution reward: 2.5, improvement reward: 2.5, valid response reward: 0.5, strict format: 0.0, soft format: 0.0


Solving TSP Benchmark:  22%|██▏       | 11/50 [18:04<56:55, 87.58s/it]

NEW PROBLEM
target_distance: 523.6
sample_0
model distance: 549, optimal solution reward: 0.0, improvement reward: 2.5, valid response reward: 0.5, strict format: 0.0, soft format: 0.0
sample_1
model distance: 530, optimal solution reward: 0.0, improvement reward: 2.5, valid response reward: 0.5, strict format: 0.25, soft format: 0.25
sample_2
model distance: 571, optimal solution reward: 0.0, improvement reward: 2.5, valid response reward: 0.5, strict format: 0.25, soft format: 0.25


Solving TSP Benchmark:  24%|██▍       | 12/50 [19:30<55:05, 86.99s/it]

NEW PROBLEM
target_distance: 409.20000000000005
sample_0
model distance: 409, optimal solution reward: 2.5, improvement reward: 2.5, valid response reward: 0.5, strict format: 0.0, soft format: 0.0
sample_1
model distance: 478, optimal solution reward: 0.0, improvement reward: 0.0, valid response reward: 0.5, strict format: 0.0, soft format: 0.0
sample_2
model distance: 441, optimal solution reward: 0.0, improvement reward: 0.0, valid response reward: 0.5, strict format: 0.25, soft format: 0.25


Solving TSP Benchmark:  26%|██▌       | 13/50 [20:43<51:08, 82.93s/it]

NEW PROBLEM
target_distance: 607.2
sample_0
model distance: 0, optimal solution reward: 0.0, improvement reward: 0.0, valid response reward: 0.0, strict format: 0.25, soft format: 0.25
sample_1
model distance: 592, optimal solution reward: 2.5, improvement reward: 0.0, valid response reward: 0.5, strict format: 0.25, soft format: 0.25
sample_2
model distance: 603, optimal solution reward: 2.5, improvement reward: 0.0, valid response reward: 0.5, strict format: 0.25, soft format: 0.25


Solving TSP Benchmark:  28%|██▊       | 14/50 [22:02<48:59, 81.66s/it]

NEW PROBLEM
target_distance: 522.5
sample_0
model distance: 475, optimal solution reward: 2.5, improvement reward: 0.0, valid response reward: 0.5, strict format: 0.0, soft format: 0.0
sample_1
model distance: 604, optimal solution reward: 0.0, improvement reward: 0.0, valid response reward: 0.5, strict format: 0.25, soft format: 0.25
sample_2
model distance: 601, optimal solution reward: 0.0, improvement reward: 0.0, valid response reward: 0.5, strict format: 0.25, soft format: 0.25


Solving TSP Benchmark:  30%|███       | 15/50 [22:36<39:12, 67.21s/it]

NEW PROBLEM
target_distance: 403.70000000000005
sample_0
model distance: 539, optimal solution reward: 0.0, improvement reward: 0.0, valid response reward: 0.5, strict format: 0.0, soft format: 0.0
sample_1
model distance: 455, optimal solution reward: 0.0, improvement reward: 0.0, valid response reward: 0.5, strict format: 0.25, soft format: 0.25
sample_2
model distance: 416, optimal solution reward: 0.0, improvement reward: 0.0, valid response reward: 0.5, strict format: 0.0, soft format: 0.0


Solving TSP Benchmark:  32%|███▏      | 16/50 [23:46<38:30, 67.96s/it]

NEW PROBLEM
target_distance: 508.20000000000005
sample_0
model distance: 462, optimal solution reward: 2.5, improvement reward: 2.5, valid response reward: 0.5, strict format: 0.0, soft format: 0.0
sample_1
model distance: 690, optimal solution reward: 0.0, improvement reward: 0.0, valid response reward: 0.5, strict format: 0.25, soft format: 0.25
sample_2
model distance: 462, optimal solution reward: 2.5, improvement reward: 2.5, valid response reward: 0.5, strict format: 0.0, soft format: 0.0


Solving TSP Benchmark:  34%|███▍      | 17/50 [25:12<40:22, 73.40s/it]

NEW PROBLEM
target_distance: 444.40000000000003
sample_0
model distance: 425, optimal solution reward: 2.5, improvement reward: 0.0, valid response reward: 0.5, strict format: 0.0, soft format: 0.0
sample_1
model distance: 412, optimal solution reward: 2.5, improvement reward: 0.0, valid response reward: 0.5, strict format: 0.25, soft format: 0.25
sample_2
model distance: 413, optimal solution reward: 2.5, improvement reward: 0.0, valid response reward: 0.5, strict format: 0.0, soft format: 0.0


Solving TSP Benchmark:  36%|███▌      | 18/50 [27:04<45:26, 85.20s/it]

NEW PROBLEM
target_distance: 458.70000000000005
sample_0
model distance: 637, optimal solution reward: 0.0, improvement reward: 0.0, valid response reward: 0.5, strict format: 0.0, soft format: 0.0
sample_1
model distance: 451, optimal solution reward: 2.5, improvement reward: 0.0, valid response reward: 0.5, strict format: 0.0, soft format: 0.0
sample_2
model distance: 442, optimal solution reward: 2.5, improvement reward: 0.0, valid response reward: 0.5, strict format: 0.0, soft format: 0.0


Solving TSP Benchmark:  38%|███▊      | 19/50 [28:12<41:15, 79.85s/it]

NEW PROBLEM
target_distance: 575.3000000000001
sample_0
model distance: 637, optimal solution reward: 0.0, improvement reward: 0.0, valid response reward: 0.5, strict format: 0.0, soft format: 0.0
sample_1
model distance: 605, optimal solution reward: 0.0, improvement reward: 0.0, valid response reward: 0.5, strict format: 0.0, soft format: 0.0
sample_2
model distance: 523, optimal solution reward: 2.5, improvement reward: 2.5, valid response reward: 0.5, strict format: 0.0, soft format: 0.0


Solving TSP Benchmark:  40%|████      | 20/50 [30:17<46:46, 93.56s/it]

NEW PROBLEM
target_distance: 382.8
sample_0
model distance: 388, optimal solution reward: 0.0, improvement reward: 0.0, valid response reward: 0.5, strict format: 0.25, soft format: 0.25
sample_1
model distance: 348, optimal solution reward: 2.5, improvement reward: 0.0, valid response reward: 0.5, strict format: 0.0, soft format: 0.0
sample_2
model distance: 0, optimal solution reward: 0.0, improvement reward: 0.0, valid response reward: 0.0, strict format: 0.0, soft format: 0.0


Solving TSP Benchmark:  42%|████▏     | 21/50 [32:23<49:50, 103.13s/it]

NEW PROBLEM
target_distance: 554.4000000000001
sample_0
model distance: 0, optimal solution reward: 0.0, improvement reward: 0.0, valid response reward: 0.0, strict format: 0.0, soft format: 0.0
sample_1
model distance: 504, optimal solution reward: 2.5, improvement reward: 2.5, valid response reward: 0.5, strict format: 0.25, soft format: 0.25
sample_2
model distance: 609, optimal solution reward: 0.0, improvement reward: 0.0, valid response reward: 0.5, strict format: 0.0, soft format: 0.0


Solving TSP Benchmark:  44%|████▍     | 22/50 [34:28<51:15, 109.84s/it]

NEW PROBLEM
target_distance: 436.70000000000005
sample_0
model distance: 630, optimal solution reward: 0.0, improvement reward: 2.5, valid response reward: 0.5, strict format: 0.0, soft format: 0.0
sample_1
model distance: 0, optimal solution reward: 0.0, improvement reward: 0.0, valid response reward: 0.0, strict format: 0.0, soft format: 0.0
sample_2
model distance: 467, optimal solution reward: 0.0, improvement reward: 2.5, valid response reward: 0.5, strict format: 0.0, soft format: 0.0


Solving TSP Benchmark:  46%|████▌     | 23/50 [35:59<46:54, 104.23s/it]

NEW PROBLEM
target_distance: 592.9000000000001
sample_0
model distance: 600, optimal solution reward: 0.0, improvement reward: 2.5, valid response reward: 0.5, strict format: 0.0, soft format: 0.0
sample_1
model distance: 702, optimal solution reward: 0.0, improvement reward: 0.0, valid response reward: 0.5, strict format: 0.0, soft format: 0.0
sample_2
model distance: 550, optimal solution reward: 2.5, improvement reward: 2.5, valid response reward: 0.5, strict format: 0.0, soft format: 0.0


Solving TSP Benchmark:  48%|████▊     | 24/50 [38:05<47:56, 110.62s/it]

NEW PROBLEM
target_distance: 531.3000000000001
sample_0
model distance: 0, optimal solution reward: 0.0, improvement reward: 0.0, valid response reward: 0.0, strict format: 0.0, soft format: 0.0
sample_1
model distance: 646, optimal solution reward: 0.0, improvement reward: 0.0, valid response reward: 0.5, strict format: 0.0, soft format: 0.0
sample_2
model distance: 658, optimal solution reward: 0.0, improvement reward: 0.0, valid response reward: 0.5, strict format: 0.25, soft format: 0.25


Solving TSP Benchmark:  50%|█████     | 25/50 [40:10<47:57, 115.08s/it]

NEW PROBLEM
target_distance: 556.6
sample_0
model distance: 533, optimal solution reward: 2.5, improvement reward: 2.5, valid response reward: 0.5, strict format: 0.25, soft format: 0.25
sample_1
model distance: 506, optimal solution reward: 2.5, improvement reward: 2.5, valid response reward: 0.5, strict format: 0.0, soft format: 0.0
sample_2
model distance: 0, optimal solution reward: 0.0, improvement reward: 0.0, valid response reward: 0.0, strict format: 0.0, soft format: 0.0


Solving TSP Benchmark:  52%|█████▏    | 26/50 [41:59<45:19, 113.30s/it]

NEW PROBLEM
target_distance: 313.5
sample_0
model distance: 315, optimal solution reward: 0.0, improvement reward: 0.0, valid response reward: 0.5, strict format: 0.0, soft format: 0.0
sample_1
model distance: 286, optimal solution reward: 2.5, improvement reward: 0.0, valid response reward: 0.5, strict format: 0.25, soft format: 0.25
sample_2
model distance: 258, optimal solution reward: 0.0, improvement reward: 0.0, valid response reward: 0.0, strict format: 0.25, soft format: 0.25


Solving TSP Benchmark:  54%|█████▍    | 27/50 [42:47<35:51, 93.56s/it] 

NEW PROBLEM
target_distance: 444.40000000000003
sample_0
model distance: 460, optimal solution reward: 0.0, improvement reward: 0.0, valid response reward: 0.0, strict format: 0.25, soft format: 0.25
sample_1
model distance: 404, optimal solution reward: 2.5, improvement reward: 2.5, valid response reward: 0.5, strict format: 0.25, soft format: 0.25
sample_2
model distance: 498, optimal solution reward: 0.0, improvement reward: 0.0, valid response reward: 0.5, strict format: 0.0, soft format: 0.0


Solving TSP Benchmark:  56%|█████▌    | 28/50 [44:52<37:48, 103.13s/it]

NEW PROBLEM
target_distance: 618.2
sample_0
model distance: 0, optimal solution reward: 0.0, improvement reward: 0.0, valid response reward: 0.0, strict format: 0.0, soft format: 0.0
sample_1
model distance: 562, optimal solution reward: 2.5, improvement reward: 0.0, valid response reward: 0.5, strict format: 0.25, soft format: 0.25
sample_2
model distance: 651, optimal solution reward: 0.0, improvement reward: 0.0, valid response reward: 0.5, strict format: 0.0, soft format: 0.0


Solving TSP Benchmark:  58%|█████▊    | 29/50 [46:41<36:39, 104.73s/it]

NEW PROBLEM
target_distance: 479.6
sample_0
model distance: 527, optimal solution reward: 0.0, improvement reward: 0.0, valid response reward: 0.5, strict format: 0.0, soft format: 0.0
sample_1
model distance: 453, optimal solution reward: 2.5, improvement reward: 2.5, valid response reward: 0.5, strict format: 0.0, soft format: 0.0
sample_2
model distance: 544, optimal solution reward: 0.0, improvement reward: 0.0, valid response reward: 0.5, strict format: 0.0, soft format: 0.0


Solving TSP Benchmark:  60%|██████    | 30/50 [48:14<33:46, 101.33s/it]

NEW PROBLEM
target_distance: 349.8
sample_0
model distance: 352, optimal solution reward: 0.0, improvement reward: 2.5, valid response reward: 0.5, strict format: 0.0, soft format: 0.0
sample_1
model distance: 372, optimal solution reward: 0.0, improvement reward: 0.0, valid response reward: 0.5, strict format: 0.0, soft format: 0.0
sample_2
model distance: 361, optimal solution reward: 0.0, improvement reward: 0.0, valid response reward: 0.5, strict format: 0.0, soft format: 0.0


Solving TSP Benchmark:  62%|██████▏   | 31/50 [49:17<28:23, 89.66s/it] 

NEW PROBLEM
target_distance: 493.90000000000003
sample_0
model distance: 473, optimal solution reward: 2.5, improvement reward: 2.5, valid response reward: 0.5, strict format: 0.0, soft format: 0.0
sample_1
model distance: 484, optimal solution reward: 2.5, improvement reward: 2.5, valid response reward: 0.5, strict format: 0.0, soft format: 0.0
sample_2
model distance: 484, optimal solution reward: 2.5, improvement reward: 2.5, valid response reward: 0.5, strict format: 0.0, soft format: 0.0


Solving TSP Benchmark:  64%|██████▍   | 32/50 [51:18<29:45, 99.19s/it]

NEW PROBLEM
target_distance: 675.4000000000001
sample_0
model distance: 884, optimal solution reward: 0.0, improvement reward: 0.0, valid response reward: 0.5, strict format: 0.0, soft format: 0.0
sample_1
model distance: 624, optimal solution reward: 2.5, improvement reward: 2.5, valid response reward: 0.5, strict format: 0.25, soft format: 0.25
sample_2
model distance: 675, optimal solution reward: 2.5, improvement reward: 0.0, valid response reward: 0.5, strict format: 0.0, soft format: 0.0


Solving TSP Benchmark:  66%|██████▌   | 33/50 [53:08<29:01, 102.42s/it]

NEW PROBLEM
target_distance: 466.40000000000003
sample_0
model distance: 520, optimal solution reward: 0.0, improvement reward: 2.5, valid response reward: 0.5, strict format: 0.0, soft format: 0.0
sample_1
model distance: 561, optimal solution reward: 0.0, improvement reward: 0.0, valid response reward: 0.5, strict format: 0.25, soft format: 0.25
sample_2
model distance: 0, optimal solution reward: 0.0, improvement reward: 0.0, valid response reward: 0.0, strict format: 0.0, soft format: 0.25


Solving TSP Benchmark:  68%|██████▊   | 34/50 [55:14<29:09, 109.34s/it]

NEW PROBLEM
target_distance: 273.90000000000003
sample_0
model distance: 335, optimal solution reward: 0.0, improvement reward: 0.0, valid response reward: 0.5, strict format: 0.0, soft format: 0.0
sample_1
model distance: 0, optimal solution reward: 0.0, improvement reward: 0.0, valid response reward: 0.0, strict format: 0.0, soft format: 0.0
sample_2
model distance: 279, optimal solution reward: 0.0, improvement reward: 2.5, valid response reward: 0.5, strict format: 0.0, soft format: 0.0


Solving TSP Benchmark:  70%|███████   | 35/50 [57:07<27:37, 110.52s/it]

NEW PROBLEM
target_distance: 534.6
sample_0
model distance: 534, optimal solution reward: 2.5, improvement reward: 2.5, valid response reward: 0.5, strict format: 0.0, soft format: 0.0
sample_1
model distance: 587, optimal solution reward: 0.0, improvement reward: 2.5, valid response reward: 0.5, strict format: 0.25, soft format: 0.25
sample_2
model distance: 563, optimal solution reward: 0.0, improvement reward: 2.5, valid response reward: 0.5, strict format: 0.0, soft format: 0.0


Solving TSP Benchmark:  72%|███████▏  | 36/50 [58:24<23:26, 100.44s/it]

NEW PROBLEM
target_distance: 622.6
sample_0
model distance: 622, optimal solution reward: 2.5, improvement reward: 0.0, valid response reward: 0.5, strict format: 0.25, soft format: 0.25
sample_1
model distance: 566, optimal solution reward: 2.5, improvement reward: 2.5, valid response reward: 0.5, strict format: 0.25, soft format: 0.25
sample_2
model distance: 622, optimal solution reward: 2.5, improvement reward: 0.0, valid response reward: 0.5, strict format: 0.25, soft format: 0.25


Solving TSP Benchmark:  74%|███████▍  | 37/50 [59:21<18:58, 87.58s/it] 

NEW PROBLEM
target_distance: 522.5
sample_0
model distance: 497, optimal solution reward: 2.5, improvement reward: 2.5, valid response reward: 0.5, strict format: 0.0, soft format: 0.0
sample_1
model distance: 475, optimal solution reward: 2.5, improvement reward: 2.5, valid response reward: 0.5, strict format: 0.0, soft format: 0.0
sample_2
model distance: 994, optimal solution reward: 0.0, improvement reward: 0.0, valid response reward: 0.0, strict format: 0.25, soft format: 0.25


Solving TSP Benchmark:  76%|███████▌  | 38/50 [1:01:03<18:20, 91.69s/it]

NEW PROBLEM
target_distance: 405.90000000000003
sample_0
model distance: 369, optimal solution reward: 2.5, improvement reward: 2.5, valid response reward: 0.5, strict format: 0.0, soft format: 0.0
sample_1
model distance: 369, optimal solution reward: 2.5, improvement reward: 2.5, valid response reward: 0.5, strict format: 0.0, soft format: 0.0
sample_2
model distance: 541, optimal solution reward: 0.0, improvement reward: 0.0, valid response reward: 0.5, strict format: 0.25, soft format: 0.25


Solving TSP Benchmark:  78%|███████▊  | 39/50 [1:02:27<16:23, 89.42s/it]

NEW PROBLEM
target_distance: 547.8000000000001
sample_0
model distance: 504, optimal solution reward: 2.5, improvement reward: 0.0, valid response reward: 0.5, strict format: 0.0, soft format: 0.25
sample_1
model distance: 506, optimal solution reward: 2.5, improvement reward: 0.0, valid response reward: 0.5, strict format: 0.0, soft format: 0.0
sample_2
model distance: 504, optimal solution reward: 2.5, improvement reward: 0.0, valid response reward: 0.5, strict format: 0.0, soft format: 0.0


Solving TSP Benchmark:  80%|████████  | 40/50 [1:04:15<15:52, 95.21s/it]

NEW PROBLEM
target_distance: 704.0
sample_0
model distance: 640, optimal solution reward: 2.5, improvement reward: 0.0, valid response reward: 0.5, strict format: 0.0, soft format: 0.0
sample_1
model distance: 640, optimal solution reward: 2.5, improvement reward: 0.0, valid response reward: 0.5, strict format: 0.0, soft format: 0.0
sample_2
model distance: 0, optimal solution reward: 0.0, improvement reward: 0.0, valid response reward: 0.0, strict format: 0.0, soft format: 0.0


Solving TSP Benchmark:  82%|████████▏ | 41/50 [1:06:03<14:49, 98.83s/it]

NEW PROBLEM
target_distance: 534.6
sample_0
model distance: 486, optimal solution reward: 2.5, improvement reward: 0.0, valid response reward: 0.5, strict format: 0.0, soft format: 0.0
sample_1
model distance: 504, optimal solution reward: 2.5, improvement reward: 0.0, valid response reward: 0.5, strict format: 0.25, soft format: 0.25
sample_2
model distance: 649, optimal solution reward: 0.0, improvement reward: 0.0, valid response reward: 0.5, strict format: 0.0, soft format: 0.0


Solving TSP Benchmark:  84%|████████▍ | 42/50 [1:08:08<14:14, 106.82s/it]

NEW PROBLEM
target_distance: 608.3000000000001
sample_0
model distance: 0, optimal solution reward: 0.0, improvement reward: 0.0, valid response reward: 0.0, strict format: 0.0, soft format: 0.0
sample_1
model distance: 598, optimal solution reward: 2.5, improvement reward: 0.0, valid response reward: 0.5, strict format: 0.0, soft format: 0.0
sample_2
model distance: 553, optimal solution reward: 2.5, improvement reward: 2.5, valid response reward: 0.5, strict format: 0.0, soft format: 0.0


Solving TSP Benchmark:  86%|████████▌ | 43/50 [1:09:43<12:02, 103.28s/it]

NEW PROBLEM
target_distance: 449.90000000000003
sample_0
model distance: 409, optimal solution reward: 2.5, improvement reward: 2.5, valid response reward: 0.5, strict format: 0.0, soft format: 0.0
sample_1
model distance: 639, optimal solution reward: 0.0, improvement reward: 0.0, valid response reward: 0.5, strict format: 0.25, soft format: 0.25
sample_2
model distance: 417, optimal solution reward: 2.5, improvement reward: 2.5, valid response reward: 0.5, strict format: 0.0, soft format: 0.0


Solving TSP Benchmark:  88%|████████▊ | 44/50 [1:11:29<10:24, 104.03s/it]

NEW PROBLEM
target_distance: 313.5
sample_0
model distance: 285, optimal solution reward: 2.5, improvement reward: 2.5, valid response reward: 0.5, strict format: 0.0, soft format: 0.0
sample_1
model distance: 285, optimal solution reward: 2.5, improvement reward: 2.5, valid response reward: 0.5, strict format: 0.0, soft format: 0.0
sample_2
model distance: 292, optimal solution reward: 2.5, improvement reward: 0.0, valid response reward: 0.5, strict format: 0.0, soft format: 0.0


Solving TSP Benchmark:  90%|█████████ | 45/50 [1:12:55<08:12, 98.52s/it] 

NEW PROBLEM
target_distance: 368.50000000000006
sample_0
model distance: 357, optimal solution reward: 2.5, improvement reward: 2.5, valid response reward: 0.5, strict format: 0.0, soft format: 0.0
sample_1
model distance: 335, optimal solution reward: 2.5, improvement reward: 2.5, valid response reward: 0.5, strict format: 0.25, soft format: 0.25
sample_2
model distance: 454, optimal solution reward: 0.0, improvement reward: 0.0, valid response reward: 0.5, strict format: 0.25, soft format: 0.25


Solving TSP Benchmark:  92%|█████████▏| 46/50 [1:15:00<07:06, 106.61s/it]

NEW PROBLEM
target_distance: 442.20000000000005
sample_0
model distance: 0, optimal solution reward: 0.0, improvement reward: 0.0, valid response reward: 0.0, strict format: 0.0, soft format: 0.0
sample_1
model distance: 429, optimal solution reward: 2.5, improvement reward: 0.0, valid response reward: 0.5, strict format: 0.0, soft format: 0.0
sample_2
model distance: 0, optimal solution reward: 0.0, improvement reward: 0.0, valid response reward: 0.0, strict format: 0.0, soft format: 0.0


Solving TSP Benchmark:  94%|█████████▍| 47/50 [1:15:48<04:27, 89.11s/it] 

NEW PROBLEM
target_distance: 465.3
sample_0
model distance: 423, optimal solution reward: 2.5, improvement reward: 2.5, valid response reward: 0.5, strict format: 0.0, soft format: 0.0
sample_1
model distance: 456, optimal solution reward: 2.5, improvement reward: 2.5, valid response reward: 0.5, strict format: 0.0, soft format: 0.0
sample_2
model distance: 456, optimal solution reward: 2.5, improvement reward: 2.5, valid response reward: 0.5, strict format: 0.0, soft format: 0.0


Solving TSP Benchmark:  96%|█████████▌| 48/50 [1:17:39<03:11, 95.61s/it]

NEW PROBLEM
target_distance: 526.9000000000001
sample_0
model distance: 510, optimal solution reward: 2.5, improvement reward: 2.5, valid response reward: 0.5, strict format: 0.0, soft format: 0.0
sample_1
model distance: 510, optimal solution reward: 2.5, improvement reward: 2.5, valid response reward: 0.5, strict format: 0.0, soft format: 0.0
sample_2
model distance: 708, optimal solution reward: 0.0, improvement reward: 0.0, valid response reward: 0.5, strict format: 0.0, soft format: 0.0


Solving TSP Benchmark:  98%|█████████▊| 49/50 [1:18:28<01:21, 81.53s/it]

NEW PROBLEM
target_distance: 342.1
sample_0
model distance: 311, optimal solution reward: 2.5, improvement reward: 2.5, valid response reward: 0.5, strict format: 0.0, soft format: 0.0
sample_1
model distance: 314, optimal solution reward: 2.5, improvement reward: 0.0, valid response reward: 0.5, strict format: 0.25, soft format: 0.25
sample_2
model distance: 349, optimal solution reward: 0.0, improvement reward: 0.0, valid response reward: 0.5, strict format: 0.25, soft format: 0.25


Solving TSP Benchmark: 100%|██████████| 50/50 [1:20:00<00:00, 96.01s/it]

NEW PROBLEM
target_distance: 522.5
sample_0
model distance: 0, optimal solution reward: 0.0, improvement reward: 0.0, valid response reward: 0.0, strict format: 0.0, soft format: 0.0
sample_1
model distance: 521, optimal solution reward: 2.5, improvement reward: 0.0, valid response reward: 0.5, strict format: 0.0, soft format: 0.0
sample_2
model distance: 542, optimal solution reward: 0.0, improvement reward: 0.0, valid response reward: 0.0, strict format: 0.0, soft format: 0.0





# Process Results and Save

In [11]:
summary = {}
del results['summary']

for size_key in results:
  size_results = results[size_key]
  size_summary = {}
  n = len(size_results)

  # Calculate summary per size
  average_input_token_count = sum([problem['input token count'] for problem in size_results]) / n
  average_output_token_count = sum([problem['average output token count'] for problem in size_results]) / n
  average_optimal_solution_reward = sum([problem['average optimal solution reward'] for problem in size_results]) / n
  average_improved_reward = sum([problem['average improvement reward'] for problem in size_results]) / n
  average_valid_response_reward = sum([problem['average valid response reward'] for problem in size_results]) / n
  average_strict_format_reward = sum([problem['average strict format reward'] for problem in size_results]) / n
  average_soft_format_reward = sum([problem['average soft format reward'] for problem in size_results]) / n

  # Load size summary
  size_summary['average input token count'] = average_input_token_count
  size_summary['average output token count'] = average_output_token_count
  size_summary['average optimal solution reward'] = average_optimal_solution_reward
  size_summary['average improvement reward'] = average_improved_reward
  size_summary['average valid response reward'] = average_valid_response_reward
  size_summary['average strict format reward'] = average_strict_format_reward
  size_summary['average soft format reward'] = average_soft_format_reward

  # Append size summary to summary
  summary[size_key] = size_summary

# Append summary to results
results['summary'] = summary

# Save benchmark results
with open("benchmark_results2.json", "w") as file:
  json.dump(results, file, indent=4)