# Environment

In [None]:
!pip install -q h5py typing-extensions wheel
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q datasets

In [None]:
!nvidia-smi

In [None]:
!mkdir -p model_outputs


# Load and Prepare

In [None]:
import random
import json
import torch
import os
import numpy as np
import gc
import time
from tqdm import tqdm
from datasets import load_dataset, Dataset, concatenate_datasets
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments, Trainer
import re
from jinja2 import Template
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model, PeftModel
import matplotlib.pyplot as plt
from transformers import DataCollatorForLanguageModeling
from huggingface_hub import login
login()

In [None]:
# Random seed for reproducibility
def set_seed(seed):
    """Seed the Python, NumPy and PyTorch random generators with one value.

    Args:
        seed: Seed forwarded unchanged to every generator.
    """
    # Same call order as before: stdlib, NumPy, torch CPU, then all CUDA devices.
    for apply_seed in (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    ):
        apply_seed(seed)

# Apply the fixed seed (42) to the Python, NumPy and PyTorch generators.
set_seed(42)

def clear_memory():
    """Run the garbage collector and empty PyTorch's CUDA cache.

    When a CUDA device is available the call also synchronizes with it.
    A confirmation line is printed at the end.
    """
    gc.collect()
    torch.cuda.empty_cache()
    gpu_present = torch.cuda.is_available()
    if gpu_present:
        # Wait for queued GPU work before reporting completion.
        torch.cuda.synchronize()
    print("Memory cleared")

In [None]:
# Set environment variables to limit memory usage
# NOTE(review): presumably the allocator reads this setting when CUDA is first
# initialised; torch is already imported by this point - confirm it still takes effect.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"

# Clear all memory before starting
# (garbage-collects, empties the CUDA cache, and prints "Memory cleared").
clear_memory()

In [None]:
def load_combined_math_datasets(sample_size=10000):
    """Assemble a shuffled training pool from NuminaMath and GSM8K.

    Up to 5000 NuminaMath rows and up to 3000 GSM8K train rows are drawn
    (each after a seed-42 shuffle), concatenated, reshuffled and trimmed to
    ``sample_size``. A source that fails to load is reported and skipped.
    When neither source loads, each is retried on its own: NuminaMath capped
    at ``sample_size`` first, then the full GSM8K train split.

    Args:
        sample_size: Upper bound on the number of rows returned on the
            combined path and on the NuminaMath-only fallback.

    Returns:
        The combined dataset, or a single-source dataset on the fallback path.

    Raises:
        ValueError: If no dataset could be loaded at all.

    NOTE(review): the two sources presumably expose different column names;
    confirm how ``concatenate_datasets`` handles the mismatch.
    """
    print(f"Loading multiple math datasets (up to {sample_size} examples total)...")

    def _draw(split, limit):
        # Seeded shuffle followed by taking the first `limit` rows (or all of them).
        return split.shuffle(seed=42).select(range(min(limit, len(split))))

    collected = []

    # Source 1: NuminaMath (primary).
    try:
        numina_subset = _draw(load_dataset("PrimeIntellect/NuminaMath-QwQ-CoT-5M")["train"], 5000)
        collected.append(numina_subset)
        print(f"Added {len(numina_subset)} examples from NuminaMath")
    except Exception as numina_error:
        print(f"Error loading NuminaMath: {numina_error}")

    # Source 2: GSM8K train split, capped at 3000 rows.
    try:
        gsm8k_subset = _draw(load_dataset("gsm8k", "main")["train"], 3000)
        collected.append(gsm8k_subset)
        print(f"Added {len(gsm8k_subset)} examples from GSM8K train set")
    except Exception as gsm8k_error:
        print(f"Error loading GSM8K: {gsm8k_error}")

    if len(collected) == 0:
        # Nothing loaded above: retry each source individually.
        print("Falling back to original dataset loading function")
        try:
            numina_only = _draw(load_dataset("PrimeIntellect/NuminaMath-QwQ-CoT-5M")["train"], sample_size)
            print(f"Successfully loaded {len(numina_only)} examples from NuminaMath dataset")
            return numina_only
        except Exception as retry_error:
            print(f"Error loading PrimeIntellect dataset: {retry_error}")
            try:
                print("Falling back to GSM8k dataset...")
                gsm8k_full_train = load_dataset("gsm8k", "main")["train"]
                print(f"Successfully loaded GSM8k dataset with {len(gsm8k_full_train)} examples")
                return gsm8k_full_train
            except Exception as last_error:
                print(f"Error loading GSM8k dataset: {last_error}")
                raise ValueError("Could not load any mathematics dataset. Please check your connection and try again.")

    # Merge, reshuffle, and enforce the overall cap.
    merged = concatenate_datasets(collected).shuffle(seed=42)
    if len(merged) > sample_size:
        merged = merged.select(range(sample_size))

    print(f"Final combined dataset size: {len(merged)} examples")

    # Preview the first row so the column layout is visible in the cell output.
    first_row = merged[0]
    print("\nSample data structure:")
    for field in first_row:
        value = first_row[field]
        if isinstance(value, str):
            print(f"{field}: {value[:100]}...")
        else:
            print(f"{field}: {value}")

    return merged


def load_gsm8k_evaluation():
    """Load GSM8K (config ``"main"``) and return its test split.

    Prints a progress line before loading and the test-split size afterwards.
    """
    print("Loading GSM8k for evaluation...")
    test_split = load_dataset("gsm8k", "main")["test"]
    print(f"GSM8k dataset loaded. Test size: {len(test_split)}")
    return test_split

In [None]:
# Training pool capped at 8000 rows. Despite the variable name, the loader
# draws from both NuminaMath and the GSM8K train split.
primeintellect_data = load_combined_math_datasets(sample_size=8000)

# GSM8K test split, used below to build the evaluation set.
gsm8k_eval = load_gsm8k_evaluation()

# Evaluation Dataset

In [None]:
def _gsm8k_reference_answer(solution):
    """Return the final answer stated in a GSM8K-style solution string.

    Numbers may carry thousands separators ("1,200"); separators are removed
    from the returned value so it parses with ``float``.
    """
    # Integer/decimal with optional sign and optional ",ddd" groups, or a bare ".5".
    number = r"[-+]?(?:\d+(?:,\d{3}(?!\d))*(?:\.\d+)?|\.\d+)"

    # Preferred: the number that follows the "####" marker (last marker wins).
    marked = re.findall(r"####\s*\$?\s*(" + number + ")", solution)
    if marked:
        return marked[-1].replace(",", "")

    # Otherwise: the last number anywhere in the solution.
    loose = re.findall(number, solution)
    if loose:
        return loose[-1].replace(",", "")

    # No number at all: fall back to the text after the marker, else the last line.
    after_marker = solution.split("####")
    if len(after_marker) > 1:
        return after_marker[-1].strip()
    return solution.strip().split("\n")[-1].strip()


def create_evaluation_set(gsm8k_eval, size=100):
    """Build the evaluation records from the first ``size`` GSM8K test rows.

    Each record holds the question, the full reference solution and the
    extracted final answer. The list is also written to
    ``evaluation_set.json`` in the working directory.

    Fix over the previous version: reference answers containing thousands
    separators (e.g. ``#### 1,200``) were truncated at the comma ("1");
    they are now captured whole and stored without the comma ("1200").

    Args:
        gsm8k_eval: Dataset exposing ``select`` and ``len``, whose rows have
            ``"question"`` and ``"answer"`` fields.
        size: Maximum number of rows to take from the start of the dataset.

    Returns:
        List of dicts with keys ``category``, ``question``, ``solution``
        and ``answer``.
    """
    print("Creating evaluation dataset...")

    # Take the leading rows; never more than the dataset holds.
    gsm8k_samples = gsm8k_eval.select(range(min(size, len(gsm8k_eval))))

    eval_set = [
        {
            "category": "math_reasoning",
            "question": sample["question"],
            "solution": sample["answer"],
            "answer": _gsm8k_reference_answer(sample["answer"]),
        }
        for sample in gsm8k_samples
    ]

    # Save the evaluation set for consistency
    with open("evaluation_set.json", "w", encoding="utf-8") as f:
        json.dump(eval_set, f)

    print(f"Created evaluation set with {len(eval_set)} samples")
    return eval_set

# Create the evaluation set from the first 100 rows of the GSM8K test split.
# Side effect: create_evaluation_set also writes evaluation_set.json to the working directory.
evaluation_set = create_evaluation_set(gsm8k_eval, size=100)

# Model Evaluation Functions

In [None]:
def load_model_and_tokenizer(model_name, load_in_4bit=True):
    """Load a causal LM and its tokenizer, falling back to lighter settings on failure.

    Loading is attempted in three stages: the requested configuration
    (4-bit NF4 with double quantization when ``load_in_4bit`` is true,
    unquantized float16 otherwise), then 8-bit quantization, then plain
    float16 with ``low_cpu_mem_usage``. Each failed stage is reported before
    the next is tried; an error in the last stage propagates to the caller.

    Args:
        model_name: Identifier passed to ``from_pretrained``.
        load_in_4bit: Whether the first attempt uses 4-bit quantization.

    Returns:
        Tuple ``(model, tokenizer)``. The tokenizer's pad token is set to its
        EOS token when it has none.
    """
    first_attempt = "4-bit quantization" if load_in_4bit else "float16 weights"
    print(f"Loading {model_name} with {first_attempt}...")

    # Only build a quantization config when quantization was actually requested.
    quantization_config = None
    if load_in_4bit:
        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4"
        )

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Handle tokenizer peculiarities
    if not tokenizer.pad_token:
        tokenizer.pad_token = tokenizer.eos_token

    print("Loading with CPU offloading enabled for memory efficiency...")

    # Cap GPU 0 at 85% of its *total* memory (not the currently free amount).
    try:
        total_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
        max_memory = {0: f"{int(total_gb * 0.85)}GB"}
        print(f"GPU memory available: {total_gb:.2f} GB, allocating: {max_memory}")
    except Exception:
        # No usable CUDA device (or the query failed): let the loader decide.
        max_memory = None
        print("Could not determine GPU memory, using default allocation")

    try:
        # First try the requested configuration
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            device_map="auto",
            quantization_config=quantization_config,
            torch_dtype=torch.float16,
            offload_folder="offload_folder",
            max_memory=max_memory,
            offload_state_dict=True
        )
    except Exception as e:
        print(f"Initial loading attempt failed: {e}")
        print("Trying alternative loading strategy...")
        try:
            # 8-bit is requested through BitsAndBytesConfig; the bare
            # `load_in_8bit=True` keyword is deprecated in transformers.
            model = AutoModelForCausalLM.from_pretrained(
                model_name,
                device_map="auto",
                quantization_config=BitsAndBytesConfig(load_in_8bit=True),
                torch_dtype=torch.float16,
                offload_folder="offload_folder"
            )
        except Exception as e2:
            print(f"8-bit loading also failed: {e2}")
            print("Trying with minimal configuration...")
            # Last resort - try loading with most conservative settings
            model = AutoModelForCausalLM.from_pretrained(
                model_name,
                device_map="auto",
                torch_dtype=torch.float16,
                low_cpu_mem_usage=True
            )

    return model, tokenizer

def format_math_prompt(question):
    """Wrap a math question in the step-by-step solving instructions.

    Args:
        question: Problem text inserted after ``"Problem: "``.

    Returns:
        The full prompt string, ending with a newline.
    """
    prompt_lines = (
        "Solve this math problem by breaking it down into small, logical steps.",
        "",
        f"Problem: {question}",
        "",
        "Follow these steps:",
        "1. Understand what the problem is asking for",
        "2. Identify the key variables and relationships",
        "3. Plan your approach step-by-step",
        "4. Execute each calculation carefully, showing your work",
        "5. Check your answer for reasonableness",
        "6. State the final numerical answer after ####",
        "",
        "Remember to maintain clear reasoning throughout and verify your calculations.",
        "",  # trailing empty entry yields the final newline
    )
    return "\n".join(prompt_lines)

def get_chat_template(tokenizer, model_name):
    """Return a Jinja2 ``Template`` for rendering a two-message chat.

    The tokenizer's own ``chat_template`` is used when it has a non-empty
    one. Otherwise a built-in two-turn template is chosen: the ``[INST]``
    format when ``model_name`` contains "mistral" (case-insensitive), and
    the ``<|im_start|>`` format for everything else.
    """
    own_template = getattr(tokenizer, 'chat_template', None)
    if own_template:
        return Template(own_template)

    # Built-in fallbacks; both expect messages[0] (user) and messages[1] (assistant).
    uses_inst_format = "mistral" in model_name.lower()
    fallback_source = (
        "<s>[INST] {{ messages[0]['content'] }} [/INST] {{ messages[1]['content'] }}</s>"
        if uses_inst_format
        else "<|im_start|>user\n{{ messages[0]['content'] }}<|im_end|>\n<|im_start|>assistant\n{{ messages[1]['content'] }}<|im_end|>"
    )
    return Template(fallback_source)

# IMPROVED: Enhanced answer extraction with multiple strategies
def extract_answer(response, question_type):
    """Extract the final answer from a model response.

    For ``question_type == "math_reasoning"`` the strategies are tried in
    order and the first hit wins:

    1. the number after the first ``###``/``####`` marker,
    2. the number after "the answer is", "final answer is" or "answer:"/"answer =",
    3. the number directly after "therefore", "thus", "hence" or "so",
    4. the last number in the response.

    Numbers may carry a leading ``$`` and thousands separators; the returned
    string has the separators removed (``"1,200"`` -> ``"1200"``). If the
    response contains no number, the stripped response is returned.

    For any other type the last non-empty sentence is returned, trimmed to
    its final 50 characters when longer.

    Fixes over the previous version:
    - "the answer is 5" never matched, because the pattern allowed no
      whitespace between "is" and the number.
    - "so" matched inside other words ("also 5").
    - "1,200" was captured as "1", and a "$" before the number defeated
      the marker pattern.
    - A response ending in "." yielded an empty string in the non-math branch.

    Args:
        response: Raw model output text.
        question_type: Category label; only ``"math_reasoning"`` gets
            numeric extraction.

    Returns:
        The extracted answer as a string.
    """
    if question_type == "math_reasoning":
        # Signed integer/decimal with optional ",ddd" groups, or a bare ".5".
        number = r"[-+]?(?:\d+(?:,\d{3}(?!\d))*(?:\.\d+)?|\.\d+)"
        # Optional currency sign and padding between a cue and the number.
        lead = r"\s*\$?\s*"

        cue_patterns = (
            # 1. Marker (first occurrence: later ones may be run-on generations).
            r"#{3,}" + lead + "(" + number + ")",
            # 2. Explicit answer phrases.
            r"(?:the\s+answer\s+is|final\s+answer\s+is|answer\s*[=:])\s*[=:]?" + lead + "(" + number + ")",
            # 3. Conclusion words, matched as whole words only.
            r"\b(?:therefore|thus|hence|so)\b,?" + lead + "(" + number + ")",
        )
        for pattern in cue_patterns:
            found = re.search(pattern, response, re.IGNORECASE)
            if found:
                return found.group(1).replace(",", "")

        # 4. Fallback to the last number in the response.
        numbers = re.findall(number, response)
        if numbers:
            return numbers[-1].replace(",", "")

        return response.strip()

    # Non-math: last non-empty sentence, capped at 50 characters.
    text = response.strip()
    sentences = [part.strip() for part in text.split(".") if part.strip()]
    candidate = sentences[-1] if sentences else text
    if len(candidate) > 50:
        candidate = candidate[-50:].strip()
    return candidate

In [None]:
# -*- coding: utf-8 -*-
"""NLP_ass3 Improved

Enhancements to fine-tuning process for better math reasoning abilities
"""

# Keeping the original imports and setup
import random
import json
import torch
import os
import numpy as np
import gc
import time
from tqdm import tqdm
from datasets import load_dataset, Dataset, concatenate_datasets
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments, Trainer
import re
from jinja2 import Template
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model, PeftModel
import matplotlib.pyplot as plt
from transformers import DataCollatorForLanguageModeling
from huggingface_hub import login
login()

# Random seed for reproducibility
def set_seed(seed):
    """Seed the Python, NumPy and PyTorch random generators with one value.

    Args:
        seed: Seed forwarded unchanged to every generator.
    """
    # Same call order as before: stdlib, NumPy, torch CPU, then all CUDA devices.
    for apply_seed in (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    ):
        apply_seed(seed)

# Apply the fixed seed (42) to the Python, NumPy and PyTorch generators.
# NOTE(review): an earlier cell already defines set_seed and calls it with 42.
set_seed(42)

def clear_memory():
    """Run the garbage collector and empty PyTorch's CUDA cache.

    When a CUDA device is available the call also synchronizes with it.
    A confirmation line is printed at the end.
    """
    gc.collect()
    torch.cuda.empty_cache()
    gpu_present = torch.cuda.is_available()
    if gpu_present:
        # Wait for queued GPU work before reporting completion.
        torch.cuda.synchronize()
    print("Memory cleared")

# Set environment variables to limit memory usage
# NOTE(review): presumably the allocator reads this setting when CUDA is first
# initialised; torch is already imported by this point - confirm it still takes effect.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"

# Clear all memory before starting
# (garbage-collects, empties the CUDA cache, and prints "Memory cleared").
clear_memory()

def load_combined_math_datasets(sample_size=10000):
    """Assemble a shuffled training pool from NuminaMath and GSM8K.

    Up to 5000 NuminaMath rows and up to 3000 GSM8K train rows are drawn
    (each after a seed-42 shuffle), concatenated, reshuffled and trimmed to
    ``sample_size``. A source that fails to load is reported and skipped.
    When neither source loads, each is retried on its own: NuminaMath capped
    at ``sample_size`` first, then the full GSM8K train split.

    Args:
        sample_size: Upper bound on the number of rows returned on the
            combined path and on the NuminaMath-only fallback.

    Returns:
        The combined dataset, or a single-source dataset on the fallback path.

    Raises:
        ValueError: If no dataset could be loaded at all.

    NOTE(review): the two sources presumably expose different column names;
    confirm how ``concatenate_datasets`` handles the mismatch.
    """
    print(f"Loading multiple math datasets (up to {sample_size} examples total)...")

    def _draw(split, limit):
        # Seeded shuffle followed by taking the first `limit` rows (or all of them).
        return split.shuffle(seed=42).select(range(min(limit, len(split))))

    collected = []

    # Source 1: NuminaMath (primary).
    try:
        numina_subset = _draw(load_dataset("PrimeIntellect/NuminaMath-QwQ-CoT-5M")["train"], 5000)
        collected.append(numina_subset)
        print(f"Added {len(numina_subset)} examples from NuminaMath")
    except Exception as numina_error:
        print(f"Error loading NuminaMath: {numina_error}")

    # Source 2: GSM8K train split, capped at 3000 rows.
    try:
        gsm8k_subset = _draw(load_dataset("gsm8k", "main")["train"], 3000)
        collected.append(gsm8k_subset)
        print(f"Added {len(gsm8k_subset)} examples from GSM8K train set")
    except Exception as gsm8k_error:
        print(f"Error loading GSM8K: {gsm8k_error}")

    if len(collected) == 0:
        # Nothing loaded above: retry each source individually.
        print("Falling back to original dataset loading function")
        try:
            numina_only = _draw(load_dataset("PrimeIntellect/NuminaMath-QwQ-CoT-5M")["train"], sample_size)
            print(f"Successfully loaded {len(numina_only)} examples from NuminaMath dataset")
            return numina_only
        except Exception as retry_error:
            print(f"Error loading PrimeIntellect dataset: {retry_error}")
            try:
                print("Falling back to GSM8k dataset...")
                gsm8k_full_train = load_dataset("gsm8k", "main")["train"]
                print(f"Successfully loaded GSM8k dataset with {len(gsm8k_full_train)} examples")
                return gsm8k_full_train
            except Exception as last_error:
                print(f"Error loading GSM8k dataset: {last_error}")
                raise ValueError("Could not load any mathematics dataset. Please check your connection and try again.")

    # Merge, reshuffle, and enforce the overall cap.
    merged = concatenate_datasets(collected).shuffle(seed=42)
    if len(merged) > sample_size:
        merged = merged.select(range(sample_size))

    print(f"Final combined dataset size: {len(merged)} examples")

    # Preview the first row so the column layout is visible in the cell output.
    first_row = merged[0]
    print("\nSample data structure:")
    for field in first_row:
        value = first_row[field]
        if isinstance(value, str):
            print(f"{field}: {value[:100]}...")
        else:
            print(f"{field}: {value}")

    return merged

# Also load GSM8k for evaluation
def load_gsm8k_evaluation():
    """Load GSM8K (config ``"main"``) and return its test split.

    Prints a progress line before loading and the test-split size afterwards.
    """
    print("Loading GSM8k for evaluation...")
    test_split = load_dataset("gsm8k", "main")["test"]
    print(f"GSM8k dataset loaded. Test size: {len(test_split)}")
    return test_split

# Training pool capped at 8000 rows. Despite the variable name, the loader
# draws from both NuminaMath and the GSM8K train split.
# NOTE(review): an earlier cell performs this same load with the same arguments.
primeintellect_data = load_combined_math_datasets(sample_size=8000)

# Load the GSM8K test split, used below to build the evaluation set.
gsm8k_eval = load_gsm8k_evaluation()

def _gsm8k_reference_answer(solution):
    """Return the final answer stated in a GSM8K-style solution string.

    Numbers may carry thousands separators ("1,200"); separators are removed
    from the returned value so it parses with ``float``.
    """
    # Integer/decimal with optional sign and optional ",ddd" groups, or a bare ".5".
    number = r"[-+]?(?:\d+(?:,\d{3}(?!\d))*(?:\.\d+)?|\.\d+)"

    # Preferred: the number that follows the "####" marker (last marker wins).
    marked = re.findall(r"####\s*\$?\s*(" + number + ")", solution)
    if marked:
        return marked[-1].replace(",", "")

    # Otherwise: the last number anywhere in the solution.
    loose = re.findall(number, solution)
    if loose:
        return loose[-1].replace(",", "")

    # No number at all: fall back to the text after the marker, else the last line.
    after_marker = solution.split("####")
    if len(after_marker) > 1:
        return after_marker[-1].strip()
    return solution.strip().split("\n")[-1].strip()


def create_evaluation_set(gsm8k_eval, size=100):
    """Build the evaluation records from the first ``size`` GSM8K test rows.

    Each record holds the question, the full reference solution and the
    extracted final answer. The list is also written to
    ``evaluation_set.json`` in the working directory.

    Fix over the previous version: reference answers containing thousands
    separators (e.g. ``#### 1,200``) were truncated at the comma ("1");
    they are now captured whole and stored without the comma ("1200").

    Args:
        gsm8k_eval: Dataset exposing ``select`` and ``len``, whose rows have
            ``"question"`` and ``"answer"`` fields.
        size: Maximum number of rows to take from the start of the dataset.

    Returns:
        List of dicts with keys ``category``, ``question``, ``solution``
        and ``answer``.
    """
    print("Creating evaluation dataset...")

    # Take the leading rows; never more than the dataset holds.
    gsm8k_samples = gsm8k_eval.select(range(min(size, len(gsm8k_eval))))

    eval_set = [
        {
            "category": "math_reasoning",
            "question": sample["question"],
            "solution": sample["answer"],
            "answer": _gsm8k_reference_answer(sample["answer"]),
        }
        for sample in gsm8k_samples
    ]

    # Save the evaluation set for consistency
    with open("evaluation_set.json", "w", encoding="utf-8") as f:
        json.dump(eval_set, f)

    print(f"Created evaluation set with {len(eval_set)} samples")
    return eval_set

# Create the evaluation set from the first 100 rows of the GSM8K test split.
# Side effect: create_evaluation_set also writes evaluation_set.json to the working directory.
evaluation_set = create_evaluation_set(gsm8k_eval, size=100)

def load_model_and_tokenizer(model_name, load_in_4bit=True):
    """Load a causal LM and its tokenizer, falling back to lighter settings on failure.

    Loading is attempted in three stages: the requested configuration
    (4-bit NF4 with double quantization when ``load_in_4bit`` is true,
    unquantized float16 otherwise), then 8-bit quantization, then plain
    float16 with ``low_cpu_mem_usage``. Each failed stage is reported before
    the next is tried; an error in the last stage propagates to the caller.

    Args:
        model_name: Identifier passed to ``from_pretrained``.
        load_in_4bit: Whether the first attempt uses 4-bit quantization.

    Returns:
        Tuple ``(model, tokenizer)``. The tokenizer's pad token is set to its
        EOS token when it has none.
    """
    first_attempt = "4-bit quantization" if load_in_4bit else "float16 weights"
    print(f"Loading {model_name} with {first_attempt}...")

    # Only build a quantization config when quantization was actually requested.
    quantization_config = None
    if load_in_4bit:
        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4"
        )

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Handle tokenizer peculiarities
    if not tokenizer.pad_token:
        tokenizer.pad_token = tokenizer.eos_token

    print("Loading with CPU offloading enabled for memory efficiency...")

    # Cap GPU 0 at 85% of its *total* memory (not the currently free amount).
    try:
        total_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
        max_memory = {0: f"{int(total_gb * 0.85)}GB"}
        print(f"GPU memory available: {total_gb:.2f} GB, allocating: {max_memory}")
    except Exception:
        # No usable CUDA device (or the query failed): let the loader decide.
        max_memory = None
        print("Could not determine GPU memory, using default allocation")

    try:
        # First try the requested configuration
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            device_map="auto",
            quantization_config=quantization_config,
            torch_dtype=torch.float16,
            offload_folder="offload_folder",
            max_memory=max_memory,
            offload_state_dict=True
        )
    except Exception as e:
        print(f"Initial loading attempt failed: {e}")
        print("Trying alternative loading strategy...")
        try:
            # 8-bit is requested through BitsAndBytesConfig; the bare
            # `load_in_8bit=True` keyword is deprecated in transformers.
            model = AutoModelForCausalLM.from_pretrained(
                model_name,
                device_map="auto",
                quantization_config=BitsAndBytesConfig(load_in_8bit=True),
                torch_dtype=torch.float16,
                offload_folder="offload_folder"
            )
        except Exception as e2:
            print(f"8-bit loading also failed: {e2}")
            print("Trying with minimal configuration...")
            # Last resort - try loading with most conservative settings
            model = AutoModelForCausalLM.from_pretrained(
                model_name,
                device_map="auto",
                torch_dtype=torch.float16,
                low_cpu_mem_usage=True
            )

    return model, tokenizer

# IMPROVED: Enhanced prompt engineering with more explicit reasoning guidance
def format_math_prompt(question):
    """Wrap a math question in the step-by-step solving instructions.

    Args:
        question: Problem text inserted after ``"Problem: "``.

    Returns:
        The full prompt string, ending with a newline.
    """
    prompt_lines = (
        "Solve this math problem by breaking it down into small, logical steps.",
        "",
        f"Problem: {question}",
        "",
        "Follow these steps:",
        "1. Understand what the problem is asking for",
        "2. Identify the key variables and relationships",
        "3. Plan your approach step-by-step",
        "4. Execute each calculation carefully, showing your work",
        "5. Check your answer for reasonableness",
        "6. State the final numerical answer after ####",
        "",
        "Remember to maintain clear reasoning throughout and verify your calculations.",
        "",  # trailing empty entry yields the final newline
    )
    return "\n".join(prompt_lines)

def get_chat_template(tokenizer, model_name):
    """Return a Jinja2 ``Template`` for rendering a two-message chat.

    The tokenizer's own ``chat_template`` is used when it has a non-empty
    one. Otherwise a built-in two-turn template is chosen: the ``[INST]``
    format when ``model_name`` contains "mistral" (case-insensitive), and
    the ``<|im_start|>`` format for everything else.
    """
    own_template = getattr(tokenizer, 'chat_template', None)
    if own_template:
        return Template(own_template)

    # Built-in fallbacks; both expect messages[0] (user) and messages[1] (assistant).
    uses_inst_format = "mistral" in model_name.lower()
    fallback_source = (
        "<s>[INST] {{ messages[0]['content'] }} [/INST] {{ messages[1]['content'] }}</s>"
        if uses_inst_format
        else "<|im_start|>user\n{{ messages[0]['content'] }}<|im_end|>\n<|im_start|>assistant\n{{ messages[1]['content'] }}<|im_end|>"
    )
    return Template(fallback_source)

# IMPROVED: Enhanced answer extraction with multiple strategies
def extract_answer(response, question_type):
    """Extract the final answer from a model response.

    For ``question_type == "math_reasoning"`` the strategies are tried in
    order and the first hit wins:

    1. the number after the first ``###``/``####`` marker,
    2. the number after "the answer is", "final answer is" or "answer:"/"answer =",
    3. the number directly after "therefore", "thus", "hence" or "so",
    4. the last number in the response.

    Numbers may carry a leading ``$`` and thousands separators; the returned
    string has the separators removed (``"1,200"`` -> ``"1200"``). If the
    response contains no number, the stripped response is returned.

    For any other type the last non-empty sentence is returned, trimmed to
    its final 50 characters when longer.

    Fixes over the previous version:
    - "the answer is 5" never matched, because the pattern allowed no
      whitespace between "is" and the number.
    - "so" matched inside other words ("also 5").
    - "1,200" was captured as "1", and a "$" before the number defeated
      the marker pattern.
    - A response ending in "." yielded an empty string in the non-math branch.

    Args:
        response: Raw model output text.
        question_type: Category label; only ``"math_reasoning"`` gets
            numeric extraction.

    Returns:
        The extracted answer as a string.
    """
    if question_type == "math_reasoning":
        # Signed integer/decimal with optional ",ddd" groups, or a bare ".5".
        number = r"[-+]?(?:\d+(?:,\d{3}(?!\d))*(?:\.\d+)?|\.\d+)"
        # Optional currency sign and padding between a cue and the number.
        lead = r"\s*\$?\s*"

        cue_patterns = (
            # 1. Marker (first occurrence: later ones may be run-on generations).
            r"#{3,}" + lead + "(" + number + ")",
            # 2. Explicit answer phrases.
            r"(?:the\s+answer\s+is|final\s+answer\s+is|answer\s*[=:])\s*[=:]?" + lead + "(" + number + ")",
            # 3. Conclusion words, matched as whole words only.
            r"\b(?:therefore|thus|hence|so)\b,?" + lead + "(" + number + ")",
        )
        for pattern in cue_patterns:
            found = re.search(pattern, response, re.IGNORECASE)
            if found:
                return found.group(1).replace(",", "")

        # 4. Fallback to the last number in the response.
        numbers = re.findall(number, response)
        if numbers:
            return numbers[-1].replace(",", "")

        return response.strip()

    # Non-math: last non-empty sentence, capped at 50 characters.
    text = response.strip()
    sentences = [part.strip() for part in text.split(".") if part.strip()]
    candidate = sentences[-1] if sentences else text
    if len(candidate) > 50:
        candidate = candidate[-50:].strip()
    return candidate

# IMPROVED: Better evaluation with more robust error handling
def evaluate_model(model, tokenizer, evaluation_data, model_name):
    """Generate an answer for every evaluation item and score it.

    Each item's question is wrapped with ``format_math_prompt``, rendered
    through the chat template, and passed to ``model.generate``. The answer
    extracted from the generated text is compared with the reference; for
    ``"math_reasoning"`` items both sides are reduced to digits, ``.`` and
    ``-`` and compared as floats with a 1e-6 tolerance (string comparison
    when either side does not parse).

    Items that raise are reported, counted as failures and left out of
    ``detailed_results``; accuracy is therefore computed over the items
    that completed, not over all of ``evaluation_data``.

    Changes over the previous version:
    - The model's reply is isolated by slicing off the prompt *tokens*
      rather than by decoded-string length, which mis-cuts whenever
      decoding the prompt does not reproduce the input text exactly.
    - ``attention_mask`` is passed to ``generate``.
    - The template is rendered with ``add_generation_prompt=True`` and the
      ChatML fallback now ends with the assistant header, so the model is
      asked to produce the assistant turn.
    - Bare ``except:`` clauses are narrowed.

    Args:
        model: Model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer matching ``model``.
        evaluation_data: Iterable of dicts with ``question``, ``answer``
            and ``category`` keys.
        model_name: Name used for progress output and template selection.

    Returns:
        Dict with ``model_name``, ``overall_accuracy`` and ``detailed_results``.
    """
    template = get_chat_template(tokenizer, model_name)
    results = []

    # Track successful and failed evaluations
    success_count = 0
    failure_count = 0

    for idx, item in enumerate(tqdm(evaluation_data, desc=f"Evaluating {model_name}")):
        try:
            question = item["question"]
            true_answer = item["answer"]
            category = item["category"]

            prompt = format_math_prompt(question)
            messages = [{"role": "user", "content": prompt}]

            try:
                input_text = template.render(messages=messages, add_generation_prompt=True)
            except Exception:
                # Rendering fails e.g. for the built-in two-turn templates,
                # which index an assistant message that does not exist yet.
                if "mistral" in model_name.lower():
                    input_text = f"<s>[INST] {prompt} [/INST]"
                else:
                    input_text = f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"

            inputs = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=1024)
            input_ids = inputs.input_ids.to(model.device)
            attention_mask = inputs.attention_mask.to(model.device)

            with torch.no_grad():
                outputs = model.generate(
                    input_ids,
                    attention_mask=attention_mask,
                    max_new_tokens=512,  # Longer generation for complete reasoning
                    temperature=0.2,     # Lower temperature for more deterministic outputs
                    top_p=0.92,          # Narrower distribution for higher quality
                    do_sample=True,      # Still use sampling for some diversity
                    num_beams=2,         # Simple beam search for better outputs
                    repetition_penalty=1.1, # Discourage repetitive loops
                    pad_token_id=tokenizer.pad_token_id
                )

            # The returned sequence starts with the prompt tokens; keep only what follows.
            prompt_length = input_ids.shape[1]
            response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()

            extracted_answer = extract_answer(response, category)

            is_correct = False
            if category == "math_reasoning":
                # Drop everything except digits, "." and "-" (commas, currency symbols, units).
                extracted_clean = re.sub(r'[^\d.-]', '', extracted_answer)
                true_clean = re.sub(r'[^\d.-]', '', true_answer)

                try:
                    # Compare as floats with small tolerance for rounding errors
                    is_correct = abs(float(extracted_clean) - float(true_clean)) < 1e-6
                except ValueError:
                    # If conversion fails, fall back to string comparison
                    is_correct = extracted_clean == true_clean

            results.append({
                "category": category,
                "question": question,
                "true_answer": true_answer,
                "model_response": response,
                "extracted_answer": extracted_answer,
                "is_correct": is_correct
            })

            success_count += 1

            # Show the first two examples
            if idx < 2:
                print(f"\nExample {idx+1}:")
                print(f"Question: {question[:100]}..." if len(question) > 100 else f"Question: {question}")
                print(f"True answer: {true_answer}")
                print(f"Model extracted answer: {extracted_answer}")
                print(f"Correct: {'✓' if is_correct else '✗'}")

        except Exception as e:
            print(f"Error evaluating example {idx}: {e}")
            failure_count += 1
            continue

    # Accuracy over the items that completed (failed items are excluded).
    correct = sum(1 for r in results if r["is_correct"])
    accuracy = correct / len(results) if results else 0

    print(f"\n{model_name} Evaluation Results:")
    print(f"Overall Accuracy: {accuracy:.4f} ({correct}/{len(results)})")
    print(f"Successful evaluations: {success_count}, Failed: {failure_count}")

    return {
        "model_name": model_name,
        "overall_accuracy": accuracy,
        "detailed_results": results
    }

# Process PrimeIntellect Dataset for Training

In [None]:
def _select_question_answer(item, key_pairs):
    """Return the first ``(question, answer)`` pair of strings found in ``item``, or None."""
    for question_key, answer_key in key_pairs:
        try:
            question, answer = item[question_key], item[answer_key]
        except KeyError:
            continue
        if isinstance(question, str) and isinstance(answer, str):
            return question, answer
    return None


def process_numina_data_for_training(data, tokenizer, model_name):
    """Format and tokenize question/answer rows for supervised fine-tuning.

    For every row the question is wrapped with ``format_math_prompt``, a
    ``#### <number>`` line is appended to answers that contain a number but
    no ``####`` marker and do not already end with that number, the pair is
    rendered through the chat template, and the text is tokenized with
    truncation at 1536 tokens. Rows are processed in chunks of 100 with
    ``clear_memory`` called after each chunk.

    Fix over the previous version: the question/answer columns were chosen
    once from row 0 and applied to every row, so rows whose text lives in a
    different known column pair (as happens when sources with different
    schemas are combined) were skipped as "not a string". Each row now
    falls back to the other known pairs before being skipped.

    Args:
        data: Indexable dataset of mapping rows; row 0 is used to pick the
            primary column pair.
        tokenizer: Tokenizer used for templating and tokenization.
        model_name: Name used for template selection.

    Returns:
        ``Dataset`` with ``input_ids`` and ``attention_mask`` columns.

    Raises:
        ValueError: If the question/answer fields cannot be identified, or
            if no row could be processed.
    """
    print("Processing dataset for training...")

    template = get_chat_template(tokenizer, model_name)
    processed_data = []

    # Column pairs recognised as (question, answer), in priority order.
    known_pairs = [('prompt', 'response'), ('question', 'answer'), ('input', 'output')]

    # Pick the primary pair from the first row.
    sample = data[0]
    for question_key, answer_key in known_pairs:
        if question_key in sample and answer_key in sample:
            break
    else:
        # No known pair present: guess from the first two string-valued fields.
        keys = list(sample.keys())
        str_keys = [k for k in keys if isinstance(sample[k], str)]

        if len(str_keys) >= 2:
            question_key, answer_key = str_keys[0], str_keys[1]
            print(f"Using inferred fields - Question: '{question_key}', Answer: '{answer_key}'")
        else:
            raise ValueError(f"Cannot identify question and answer fields. Keys: {keys}")

    print(f"Using fields - Question: '{question_key}', Answer: '{answer_key}'")

    # Primary pair first, then the remaining known pairs as per-row fallbacks.
    primary_pair = (question_key, answer_key)
    key_pairs = [primary_pair] + [pair for pair in known_pairs if pair != primary_pair]

    # Process the data in smaller chunks for better memory management
    chunk_size = 100
    for chunk_start in range(0, len(data), chunk_size):
        chunk_end = min(chunk_start + chunk_size, len(data))
        print(f"Processing chunk {chunk_start} to {chunk_end-1}...")

        for i in range(chunk_start, chunk_end):
            try:
                item = data[i]
                selected = _select_question_answer(item, key_pairs)

                # Skip invalid entries
                if selected is None:
                    print(f"Skipping item {i}: question or answer is not a string")
                    continue
                question, answer = selected

                formatted_question = format_math_prompt(question)

                # Add the "####" marker when the answer has a number but no marker
                # and does not already end with that number.
                if '####' not in answer:
                    numbers = re.findall(r"([-+]?\d*\.?\d+)", answer)
                    if numbers:
                        final_number = numbers[-1].strip()
                        if not answer.strip().endswith(final_number):
                            answer = answer.strip() + f"\n\n#### {final_number}"

                # Format into chat template
                try:
                    messages = [
                        {"role": "user", "content": formatted_question},
                        {"role": "assistant", "content": answer}
                    ]
                    formatted_text = template.render(messages=messages)
                except Exception:
                    # Fallback for models with different templating
                    if "mistral" in model_name.lower():
                        formatted_text = f"<s>[INST] {formatted_question} [/INST] {answer}</s>"
                    else:
                        formatted_text = f"<|im_start|>user\n{formatted_question}<|im_end|>\n<|im_start|>assistant\n{answer}<|im_end|>"

                # Tokenize with truncation to control length
                tokenized = tokenizer(formatted_text, truncation=True, max_length=1536)

                processed_data.append({
                    "input_ids": tokenized["input_ids"],
                    "attention_mask": tokenized["attention_mask"],
                })

                # Show the first row of each chunk for debugging
                if i == chunk_start:
                    print(f"\nSample processed data (item {i}):")
                    print(f"Original question: {question[:100]}..." if len(question) > 100 else f"Original question: {question}")
                    print(f"Original answer: {answer[:100]}..." if len(answer) > 100 else f"Original answer: {answer}")
                    print(f"Tokenized length: {len(tokenized['input_ids'])}")

            except Exception as e:
                print(f"Error processing item {i}: {e}")
                continue

        # Clear memory after each chunk
        clear_memory()
        print(f"Processed {len(processed_data)} examples so far")

    # Final check
    if len(processed_data) == 0:
        raise ValueError("No examples were successfully processed. Please check the dataset format.")

    print(f"Successfully processed {len(processed_data)} examples")

    # Create dataset-like structure
    return Dataset.from_dict({
        "input_ids": [item["input_ids"] for item in processed_data],
        "attention_mask": [item["attention_mask"] for item in processed_data]
    })

In [None]:
def create_labels_from_input_ids(batched_input_ids, tokenizer, model_name):
    """Build loss labels that supervise only the assistant's response.

    Each sequence is decoded back to text, the text is cut right after the
    first assistant marker, and that prompt text is re-tokenized to count how
    many leading tokens belong to the prompt. Those positions become -100
    (ignored by the loss); the remaining positions keep their token ids.

    The marker matching the model family is tried first; if it is absent the
    other known chat markers are tried, so a sequence rendered with a
    different chat template is not silently masked in full.

    Args:
        batched_input_ids: Iterable of token-id sequences, one per example.
        tokenizer: Object providing ``decode(ids)`` and
            ``__call__(text, add_special_tokens=False)["input_ids"]``.
        model_name: Model identifier; decides which marker is preferred.

    Returns:
        A list with one label list per input sequence, each as long as its
        input sequence.
    """
    # Preferred marker first, then fallbacks for other chat formats.
    if "mistral" in model_name.lower():
        markers = ("[/INST]", "<|im_start|>assistant", "<|assistant|>")
    else:
        markers = ("<|im_start|>assistant", "[/INST]", "<|assistant|>")

    labels = []
    unmatched = 0

    for input_ids in batched_input_ids:
        full_text = tokenizer.decode(input_ids)

        # Without any marker the whole sequence is treated as prompt.
        user_text = full_text
        for marker in markers:
            position = full_text.find(marker)
            if position != -1:
                user_text = full_text[:position + len(marker)]
                break
        else:
            unmatched += 1

        # NOTE(review): decoding and re-tokenizing is not guaranteed to
        # reproduce the original token boundaries, so the prompt length is an
        # estimate; the min() keeps it within the sequence.
        user_ids = tokenizer(user_text, add_special_tokens=False)["input_ids"]
        prompt_len = min(len(user_ids), len(input_ids))

        labels.append([-100] * prompt_len + list(input_ids[prompt_len:]))

    if unmatched:
        print(f"Warning: {unmatched} of {len(labels)} examples contained no assistant marker and were fully masked")

    return labels

# Two-Stage Fine-tuning - SFT followed by RLHF

In [None]:
def fine_tune_model(model_name, training_data, tokenizer=None, output_dir=None):
    """Fine-tune ``model_name`` with LoRA adapters on a quantized base model.

    The base model is loaded in 4-bit (falling back to 8-bit, then to an
    unquantized bfloat16 load), wrapped with LoRA adapters and trained with
    the Hugging Face ``Trainer``. Labels come from
    ``create_labels_from_input_ids`` so that prompt tokens are excluded from
    the loss.

    Args:
        model_name: Hugging Face model id or local path of the base model.
        training_data: A ``datasets.Dataset`` with an ``input_ids`` column
            (and the ``attention_mask`` produced alongside it), or raw data
            that is first passed to ``process_numina_data_for_training``.
        tokenizer: Optional tokenizer; loaded from ``model_name`` if omitted.
        output_dir: Directory for checkpoints and the final adapter. Defaults
            to a name derived from ``model_name``.

    Returns:
        The output directory holding the saved adapter and tokenizer.
    """
    # `transformers` is already a notebook dependency; importing the collator
    # here keeps this cell self-contained.
    from transformers import DataCollatorForSeq2Seq

    if output_dir is None:
        output_dir = f"fine_tuned_{model_name.split('/')[-1].lower().replace('-', '_')}"

    os.makedirs(output_dir, exist_ok=True)

    print(f"\n{'='*50}")
    print(f"Starting fine-tuning for {model_name}")
    print(f"{'='*50}")

    # Configure quantization for memory efficiency
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16
    )

    # Load the tokenizer if the caller did not supply one
    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
    # Batches are padded below, so a pad token is required for supplied
    # tokenizers too. Compare with None: a pad id of 0 is valid but falsy.
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token = tokenizer.eos_token

    print("Loading model with memory efficiency settings...")
    try:
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            quantization_config=bnb_config,
            device_map="auto",
            torch_dtype=torch.bfloat16
        )
    except Exception as e:
        print(f"Error loading with 4-bit: {e}")
        try:
            # Fallback to 8-bit if 4-bit fails. The quantization settings go
            # through `quantization_config`; the bare `load_in_8bit` keyword
            # is deprecated.
            model = AutoModelForCausalLM.from_pretrained(
                model_name,
                quantization_config=BitsAndBytesConfig(load_in_8bit=True),
                device_map="auto",
                torch_dtype=torch.bfloat16
            )
        except Exception as e2:
            print(f"Error loading with 8-bit: {e2}")
            # Last resort - try loading with minimal settings
            model = AutoModelForCausalLM.from_pretrained(
                model_name,
                device_map="auto",
                torch_dtype=torch.bfloat16,
                low_cpu_mem_usage=True
            )

    # Prepare model for training
    model.gradient_checkpointing_enable()
    model = prepare_model_for_kbit_training(model)

    # Process data if needed
    if not isinstance(training_data, Dataset):
        training_data = process_numina_data_for_training(training_data, tokenizer, model_name)

    # Add a `labels` column in which prompt tokens are masked with -100
    def add_labels(examples):
        examples["labels"] = create_labels_from_input_ids(examples["input_ids"], tokenizer, model_name)
        return examples

    print("Preparing dataset with labels...")
    training_data = training_data.map(add_labels, batched=True)

    # LoRA on all attention and MLP projections
    peft_config = LoraConfig(
        r=32,              # Adapter rank
        lora_alpha=64,     # Scaling factor (alpha / r = 2)
        lora_dropout=0.1,  # Dropout on the adapter path
        bias="none",
        task_type="CAUSAL_LM",
        target_modules=[
            "q_proj", "k_proj", "v_proj", "o_proj",
            "gate_proj", "up_proj", "down_proj",
        ]
    )

    model = get_peft_model(model, peft_config)
    print(f"Model prepared with LoRA adapters - rank={peft_config.r}, alpha={peft_config.lora_alpha}")

    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=2,             # Epoch-based rather than step-based schedule
        per_device_train_batch_size=2,
        gradient_accumulation_steps=8,  # Effective batch of 16 sequences per device
        learning_rate=1e-4,
        weight_decay=0.05,
        warmup_ratio=0.05,
        max_grad_norm=0.5,              # Tighter gradient clipping than the 1.0 default
        logging_steps=10,
        save_strategy="epoch",          # One checkpoint per epoch
        save_total_limit=3,             # Keep at most 3 checkpoints on disk
        optim="paged_adamw_32bit",      # Memory-efficient optimizer
        # NOTE(review): fp16 autocast is combined with a bfloat16 compute
        # dtype above; consider bf16=True on hardware that supports it.
        fp16=True,
        gradient_checkpointing=True,    # Memory efficiency
        lr_scheduler_type="cosine",
        report_to="none",               # Disable wandb/tensorboard
    )

    # DataCollatorForLanguageModeling would rebuild `labels` from `input_ids`
    # (throwing away the prompt masking) and cannot pad a ragged `labels`
    # column. This collator pads `labels` with -100 alongside the inputs.
    data_collator = DataCollatorForSeq2Seq(
        tokenizer=tokenizer,
        padding=True,
        label_pad_token_id=-100,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=training_data,
        data_collator=data_collator,
    )

    print("Starting training...")
    trainer.train()

    # Save the final adapter weights and the tokenizer next to the checkpoints
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    print(f"Model saved to {output_dir}")

    # Drop references before asking CUDA to release cached blocks
    del model
    del trainer
    torch.cuda.empty_cache()

    return output_dir

# Evaluate Fine-tuned Models

In [None]:
def evaluate_finetuned_model(base_model_name, adapter_path, evaluation_set):
    """Load a fine-tuned model (full model or LoRA adapter) and evaluate it.

    If ``adapter_path`` does not exist, the highest-numbered ``checkpoint-*``
    directory next to it is used instead. The path is first loaded as a
    complete model; if that fails, the base model is loaded and the adapter
    is attached with ``PeftModel``. Results are written to
    ``ft_eval_results.json`` in the parent directory of the adapter path.

    Args:
        base_model_name: Model id/path of the base model (also used for the
            tokenizer).
        adapter_path: Directory containing the fine-tuned weights.
        evaluation_set: Passed through unchanged to ``evaluate_model``.

    Returns:
        Whatever ``evaluate_model`` returns, or ``None`` if no usable path was
        found or any step failed.
    """
    import glob
    import traceback

    print(f"\nEvaluating fine-tuned model from {adapter_path}...")

    # Make sure the adapter path exists
    if not os.path.exists(adapter_path):
        print(f"Error: Adapter path {adapter_path} not found")

        # dirname() is "" for a bare directory name; without the fallback the
        # pattern would become "/checkpoint-*" and search the filesystem root.
        parent_dir = os.path.dirname(adapter_path) or "."

        def checkpoint_step(path):
            # Non-numeric suffixes sort before every numbered checkpoint.
            match = re.search(r"checkpoint-(\d+)$", path)
            return int(match.group(1)) if match else -1

        # glob returns matches in arbitrary order, so sort by step number
        # before taking the last one as "latest".
        possible_paths = sorted(
            glob.glob(os.path.join(parent_dir, "checkpoint-*")),
            key=checkpoint_step,
        )
        if possible_paths:
            adapter_path = possible_paths[-1]
            print(f"Using alternative adapter path: {adapter_path}")
        else:
            print("No checkpoints found.")
            return None

    adapter_model = None
    try:
        # Load tokenizer from base model
        print("Loading tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained(base_model_name)
        if not tokenizer.pad_token:
            tokenizer.pad_token = tokenizer.eos_token

        # Same defaults the deprecated `load_in_4bit=True` keyword produced.
        quant_config = BitsAndBytesConfig(load_in_4bit=True)

        # First try to load the path as a complete model
        try:
            print(f"Attempting to load as full model from {adapter_path}...")
            adapter_model = AutoModelForCausalLM.from_pretrained(
                adapter_path,
                device_map="auto",
                quantization_config=quant_config,
                torch_dtype=torch.float16
            )
            print("Successfully loaded as full model")
        except Exception as e:
            print(f"Could not load as full model: {e}")
            print("Loading base model first...")

            base_model = AutoModelForCausalLM.from_pretrained(
                base_model_name,
                device_map="auto",
                quantization_config=quant_config,
                torch_dtype=torch.float16
            )

            # Attach the adapter weights to the base model
            print(f"Loading adapter from {adapter_path}...")
            adapter_model = PeftModel.from_pretrained(base_model, adapter_path)

            # Drop the extra reference to the base model
            del base_model
            clear_memory()

        # Evaluate model
        results = evaluate_model(adapter_model, tokenizer, evaluation_set, f"Fine-tuned {base_model_name}")

        # Save results
        with open(os.path.join(os.path.dirname(adapter_path), "ft_eval_results.json"), "w") as f:
            json.dump(results, f)

        return results

    except Exception as e:
        print(f"Error during fine-tuned model evaluation: {e}")
        traceback.print_exc()

        return None

    finally:
        # Release the model whether evaluation succeeded or raised.
        if adapter_model is not None:
            adapter_model = None
            clear_memory()

# Compare Results and Report

In [None]:
def find_best_checkpoint(output_dir):
    """Pick the checkpoint directory to evaluate from ``output_dir``.

    Selection rules:
      * no ``checkpoint-*`` directories: return ``output_dir`` itself;
      * otherwise, among checkpoints that have a ``trainer_state.json``,
        return the one whose most recent logged ``loss`` is lowest (ties go
        to the higher step number; an unreadable state file counts as an
        infinite loss);
      * if no checkpoint yields a score, return the highest-numbered one.

    Args:
        output_dir: Training output directory that contains the checkpoints.

    Returns:
        Path of the selected checkpoint directory (or ``output_dir``).
    """
    import glob

    def checkpoint_step(path):
        # Non-numeric suffixes sort before every numbered checkpoint
        # instead of raising.
        match = re.search(r"checkpoint-(\d+)$", path)
        return int(match.group(1)) if match else -1

    print(f"Looking for checkpoints in {output_dir}...")

    # glob gives no ordering guarantee, so order by step number up front.
    checkpoints = sorted(glob.glob(f"{output_dir}/checkpoint-*"), key=checkpoint_step)

    if not checkpoints:
        print("No checkpoints found, using the main output directory")
        return output_dir

    # Score each checkpoint by the last training loss in its trainer state.
    scored_checkpoints = []
    for cp in checkpoints:
        state_file = os.path.join(cp, "trainer_state.json")
        if not os.path.exists(state_file):
            continue
        try:
            with open(state_file, "r") as f:
                state = json.load(f)
            if "log_history" in state and state["log_history"]:
                last_loss = next(
                    (entry["loss"] for entry in reversed(state["log_history"]) if "loss" in entry),
                    float("inf"),
                )
                scored_checkpoints.append((cp, last_loss))
        except Exception as e:
            # Best effort: keep the checkpoint as a candidate, ranked last.
            print(f"Could not read {state_file}: {e}")
            scored_checkpoints.append((cp, float("inf")))

    if not scored_checkpoints:
        print(f"Using latest checkpoint by step: {checkpoints[-1]}")
        return checkpoints[-1]

    # Lowest loss first; among equal losses prefer the later step.
    scored_checkpoints.sort(key=lambda item: (item[1], -checkpoint_step(item[0])))
    best_path, best_loss = scored_checkpoints[0]
    print(f"Using best checkpoint with loss {best_loss}: {best_path}")
    return best_path

In [None]:
def visualize_results(base_results, ft_results, output_path="model_comparison.png"):
    """Report and plot base vs. fine-tuned accuracy.

    Prints an accuracy summary, saves a bar chart to ``output_path`` and lists
    up to three questions that the fine-tuned model answered correctly while
    the base model did not. The chart and the example listing are attempted
    independently, so a plotting failure does not suppress the examples.

    Args:
        base_results: Dict with ``overall_accuracy`` and ``detailed_results``
            (a list of dicts with ``question``, ``is_correct``,
            ``extracted_answer`` and ``true_answer``) for the base model.
        ft_results: Same structure for the fine-tuned model.
        output_path: File the chart is written to.

    Returns:
        None. Nothing is produced if either result set is missing/empty.
    """
    # If we don't have both results, skip visualization
    if not base_results or not ft_results:
        print("Cannot create visualization - missing results")
        return

    try:
        base_acc = base_results["overall_accuracy"]
        ft_acc = ft_results["overall_accuracy"]

        # Relative change is undefined for a zero baseline; report it as inf.
        abs_improvement = ft_acc - base_acc
        rel_improvement = (abs_improvement / base_acc * 100) if base_acc > 0 else float('inf')

        print(f"\n===== MODEL COMPARISON =====")
        print(f"Base model accuracy: {base_acc:.4f}")
        print(f"Fine-tuned model accuracy: {ft_acc:.4f}")
        print(f"Absolute improvement: {abs_improvement:.4f}")
        print(f"Relative improvement: {rel_improvement:.1f}%")
    except Exception as e:
        print(f"Error creating visualization: {e}")
        return

    try:
        # `plt` is matplotlib.pyplot from the notebook's import cell.
        plt.figure(figsize=(10, 6))
        models = ["Base Model", "Fine-tuned Model"]
        accuracies = [base_acc, ft_acc]
        bars = plt.bar(models, accuracies, color=["blue", "green"])

        # Add values on bars
        for bar in bars:
            height = bar.get_height()
            plt.text(bar.get_x() + bar.get_width()/2., height + 0.01,
                   f'{height:.4f}', ha='center', va='bottom')

        # The "+" format flag emits the correct sign, so a regression is
        # labelled "-0.0100" rather than "+-0.0100".
        plt.annotate(f"{abs_improvement:+.4f} ({rel_improvement:+.1f}%)",
                   xy=(1, ft_acc),
                   xytext=(1.3, (base_acc + ft_acc)/2),
                   arrowprops=dict(arrowstyle="->", color="red"))

        plt.title('Model Accuracy Comparison on GSM8K Math Reasoning')
        plt.ylabel('Accuracy')
        plt.ylim(0, max(accuracies) + 0.1)
        plt.grid(axis='y', linestyle='--', alpha=0.7)
        plt.tight_layout()

        # Save figure
        plt.savefig(output_path)
        print(f"Comparison visualization saved to {output_path}")
    except Exception as e:
        print(f"Error creating visualization: {e}")

    try:
        print("\n===== EXAMPLE IMPROVEMENTS =====")

        # Index base results by question once instead of rescanning the list
        # for every fine-tuned result; setdefault keeps the first occurrence
        # when a question appears more than once.
        base_by_question = {}
        for base_entry in base_results["detailed_results"]:
            base_by_question.setdefault(base_entry["question"], base_entry)

        # Questions where fine-tuned was correct but base was wrong
        ft_improved = []
        for ft_r in ft_results["detailed_results"]:
            if not ft_r["is_correct"]:
                continue
            base_r = base_by_question.get(ft_r["question"])
            if base_r and not base_r["is_correct"]:
                ft_improved.append((base_r, ft_r))

        print(f"Found {len(ft_improved)} questions where fine-tuning improved the result")

        # Show a few examples
        for i, (base_r, ft_r) in enumerate(ft_improved[:3]):
            print(f"\nExample {i+1}:")
            print(f"Question: {base_r['question'][:150]}..." if len(base_r['question']) > 150 else f"Question: {base_r['question']}")
            print(f"Base model answer: {base_r['extracted_answer']} (❌)")
            print(f"Fine-tuned model answer: {ft_r['extracted_answer']} (✅)")
            print(f"True answer: {base_r['true_answer']}")

    except Exception as e:
        print(f"Error listing example improvements: {e}")

# Main

In [None]:
def main():
    """Run the complete fine-tuning and evaluation pipeline.

    Steps: evaluate the base model, tokenize the training data, fine-tune
    with LoRA, pick a checkpoint, evaluate it, and report the comparison.
    Relies on ``evaluation_set``, ``primeintellect_data`` and
    ``load_model_and_tokenizer`` being defined earlier in the notebook.

    Raises:
        Exception: Any failure is logged and then re-raised so the caller can
            react to it (for example by running a fallback evaluation)
            instead of the error being swallowed here.
    """
    try:
        # Model name
        model_name = "HuggingFaceH4/mistral-7b-sft-beta"
        output_dir = "fine_tuned_mistral_math_improved"

        # 1. Evaluate base model first for proper comparison
        print("\n===== EVALUATING BASE MODEL =====")
        base_model, base_tokenizer = load_model_and_tokenizer(model_name)
        base_results = evaluate_model(base_model, base_tokenizer, evaluation_set, model_name)

        # Save base model results
        with open("base_model_results.json", "w") as f:
            json.dump(base_results, f)

        # Free memory before the training model is loaded
        del base_model
        clear_memory()

        # 2. Process data for fine-tuning
        print("\n===== PROCESSING TRAINING DATA =====")
        processed_data = process_numina_data_for_training(primeintellect_data, base_tokenizer, model_name)

        # 3. Fine-tune the model
        print("\n===== STARTING FINE-TUNING =====")
        fine_tuned_dir = fine_tune_model(
            model_name,
            processed_data,
            tokenizer=base_tokenizer,
            output_dir=output_dir
        )

        # 4. Find best checkpoint
        best_checkpoint = find_best_checkpoint(fine_tuned_dir)

        # 5. Evaluate fine-tuned model
        print("\n===== EVALUATING FINE-TUNED MODEL =====")
        ft_results = evaluate_finetuned_model(model_name, best_checkpoint, evaluation_set)

        # 6. Create comparison visualization
        visualize_results(base_results, ft_results, os.path.join(output_dir, "model_comparison.png"))

        print("\n===== FINE-TUNING PIPELINE COMPLETED =====")
        print(f"Base model accuracy: {base_results['overall_accuracy']:.4f}")
        # ft_results is None when the fine-tuned evaluation failed
        if ft_results:
            print(f"Fine-tuned model accuracy: {ft_results['overall_accuracy']:.4f}")
            improvement = ft_results['overall_accuracy'] - base_results['overall_accuracy']
            print(f"Absolute improvement: {improvement:.4f}")
            rel_improvement = (improvement / base_results['overall_accuracy'] * 100) if base_results['overall_accuracy'] > 0 else float('inf')
            print(f"Relative improvement: {rel_improvement:.1f}%")

    except Exception as e:
        # Log where the failure happened, then propagate it: swallowing the
        # exception here would hide the failure from whoever called main().
        print(f"Error in main pipeline: {e}")
        raise

In [None]:
def test_model_on_problem(model_path, problem_text):
    """Test a model on a specific problem to verify it works"""
    print(f"\n===== TESTING MODEL ON EXAMPLE PROBLEM =====")

    try:
        # Determine if this is a base model or adapter
        is_adapter = os.path.exists(os.path.join(model_path, "adapter_config.json"))

        # Load tokenizer from either the model path or base model
        tokenizer_path = model_path
        if is_adapter:
            # For adapters, get base model from adapter config
            with open(os.path.join(model_path, "adapter_config.json"), "r") as f:
                adapter_config = json.load(f)
            if "base_model_name_or_path" in adapter_config:
                tokenizer_path = adapter_config["base_model_name_or_path"]
            else:
                # Default to Mistral if we can't determine base model
                tokenizer_path = "HuggingFaceH4/mistral-7b-sft-beta"

        # Load tokenizer
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
        if not tokenizer.pad_token:
            tokenizer.pad_token = tokenizer.eos_token

        # Format the prompt
        prompt = format_math_prompt(problem_text)

        # Format for the model
        input_text = f"<s>[INST] {prompt} [/INST]"
        input_ids = tokenizer(input_text, return_tensors="pt").input_ids

        # Load the model
        if is_adapter:
            # Load base model first
            base_model = AutoModelForCausalLM.from_pretrained(
                tokenizer_path,
                device_map="auto",
                load_in_4bit=True,
                torch_dtype=torch.float16
            )
            # Then load adapter
            model = PeftModel.from_pretrained(base_model, model_path)
            del base_model  # Free memory
        else:
            # Load as full model
            model = AutoModelForCausalLM.from_pretrained(
                model_path,
                device_map="auto",
                load_in_4bit=True,
                torch_dtype=torch.float16
            )

        # Generate response
        with torch.no_grad():
            outputs = model.generate(
                input_ids.to(model.device),
                max_new_tokens=512,
                temperature=0.2,
                top_p=0.95,
                do_sample=True,
                num_beams=2,
                repetition_penalty=1.05,
                pad_token_id=tokenizer.pad_token_id
            )

        # Decode and extract response
        full_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
        response = full_output[len(tokenizer.decode(input_ids[0], skip_special_tokens=True)):].strip()

        # Extract answer
        extracted_answer = extract_answer(response, "math_reasoning")

        print(f"\nProblem: {problem_text}")
        print(f"\nModel response:")
        print(response)
        print(f"\nExtracted answer: {extracted_answer}")

        # Save the response
        with open(os.path.join(os.path.dirname(model_path), "example_response.txt"), "w") as f:
            f.write(f"Problem: {problem_text}\n\n")
            f.write(f"Model response:\n{response}\n\n")
            f.write(f"Extracted answer: {extracted_answer}\n")

        # Free memory
        del model
        clear_memory()

        return extracted_answer

    except Exception as e:
        print(f"Error testing model: {e}")
        import traceback
        traceback.print_exc()
        return None

# Run evaluation on a single example if full evaluation is too heavy
def minimal_eval(model_path):
    """Run a one-problem smoke test against the model at ``model_path``.

    The model's extracted answer and the reference answer are both stripped
    down to digits, ``.`` and ``-`` and compared numerically; if either does
    not parse as a number, the cleaned strings are compared instead.

    Args:
        model_path: Passed through to ``test_model_on_problem``.

    Returns:
        ``True`` or ``False`` for a correct/incorrect answer, or ``None`` when
        the model produced no answer to compare.
    """
    test_example = {
        "question": "Janet's ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?",
        "answer": "18"
    }

    print("\n===== RUNNING MINIMAL EVALUATION =====")
    result = test_model_on_problem(model_path, test_example["question"])

    is_correct = None
    if result:
        # Drop currency symbols, units and other non-numeric characters.
        result_clean = re.sub(r'[^\d.-]', '', str(result))
        answer_clean = re.sub(r'[^\d.-]', '', test_example["answer"])

        try:
            is_correct = float(result_clean) == float(answer_clean)
        except ValueError:
            # Not parseable as a number (e.g. "" or "1.2.3"): compare text.
            is_correct = result_clean == answer_clean

        print(f"Correct answer: {test_example['answer']}")
        print(f"Model answer correct: {'✓' if is_correct else '✗'}")
    else:
        print("No answer could be extracted from the model output")

    print("\n===== MINIMAL EVALUATION COMPLETE =====")
    return is_correct

In [None]:
# Entry point: run the full pipeline; if it raises, fall back to a
# single-problem smoke test on the most recently written checkpoint.
if __name__ == "__main__":
    try:
        print("Starting improved fine-tuning pipeline...")

        main()

    except Exception as e:
        print(f"Error in main execution: {e}")
        import traceback
        traceback.print_exc()

        # If full pipeline fails, try minimal evaluation on a checkpoint
        print("\nFull pipeline failed. Trying minimal evaluation...")

        # Look for any checkpoints
        import glob
        checkpoints = glob.glob("fine_tuned_mistral_math*/checkpoint-*")
        if checkpoints:
            # glob returns matches in arbitrary order and the pattern can span
            # several output directories, so "latest" is decided by
            # modification time rather than by list position.
            checkpoint = max(checkpoints, key=os.path.getmtime)
            print(f"Found checkpoint: {checkpoint}")
            minimal_eval(checkpoint)
        else:
            print("No checkpoints found for minimal evaluation.")