In [1]:
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig, AutoTokenizer
import re 
from peft import PeftModel, LoraConfig, TaskType, LoftQConfig, prepare_model_for_kbit_training
from datasets import load_dataset
from tqdm import tqdm
import os 

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
torch.cuda.is_available()

In [None]:
torch.cuda.get_device_name(0)

In [2]:
MODEL_ID = "LoftQ/Mistral-7B-v0.1-4bit-64rank"

base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID, 
    dtype=torch.bfloat16,  # you may change it with different models
    # quantization_config=BitsAndBytesConfig(
    #    load_in_4bit=True,
    #    bnb_4bit_compute_dtype=torch.bfloat16, 
    #    bnb_4bit_use_double_quant=False,
    #    bnb_4bit_quant_type='nf4',
    # ),
)
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
model = prepare_model_for_kbit_training(base_model) # TODO: Appears to be bugged. Fix it
peft_model = PeftModel.from_pretrained(
    model,
    MODEL_ID,
    subfolder="loftq_init",
    is_trainable=True
)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
tokenizer.save_pretrained("./tokenizers/mistral7b64rank", from_pt=True)
model.save_pretrained("./models/mistral7b64rank", from_pt=True)
peft_model.save_pretrained("./peftmodels/mistral7b64rank", from_pt=True)




In [3]:
# Load MetaMathQA dataset
dataset = load_dataset("meta-math/MetaMathQA")

# Take a subset for faster experimentation (optional)
train_dataset = dataset["train"].select(range(10000))  # Adjust size as needed

# Preview the data
print(train_dataset[0])

{'type': 'MATH_AnsAug', 'query': "Gracie and Joe are choosing numbers on the complex plane. Joe chooses the point $1+2i$. Gracie chooses $-1+i$. How far apart are Gracie and Joe's points?", 'original_question': "Gracie and Joe are choosing numbers on the complex plane. Joe chooses the point $1+2i$. Gracie chooses $-1+i$. How far apart are Gracie and Joe's points?", 'response': "The distance between two points $(x_1,y_1)$ and $(x_2,y_2)$ in the complex plane is given by the formula $\\sqrt{(x_2-x_1)^2+(y_2-y_1)^2}$.\nIn this case, Joe's point is $(1,2)$ and Gracie's point is $(-1,1)$.\nSo the distance between their points is $\\sqrt{((-1)-(1))^2+((1)-(2))^2}=\\sqrt{(-2)^2+(-1)^2}=\\sqrt{4+1}=\\sqrt{5}$.\nTherefore, Gracie and Joe's points are $\\boxed{\\sqrt{5}}$ units apart.\nThe answer is: \\sqrt{5}"}


In [None]:
eval = dataset["train"].select(range(150000, 160000))
eval

In [4]:
def format_instruction(sample):
    """Format the data into a prompt template"""
    return f"""Below is a math problem. Write a response that appropriately solves the problem.

    ### Problem:
    {sample['query']}

    ### Solution:
    {sample['response']}"""

def format_instruction_eval(sample):
    """Format the data into a prompt template"""
    return f"""Below is a math problem. Write a response that appropriately solves the problem.

    ### Problem:
    {sample['query']}

    ### Solution:"""


In [5]:
peft_model.print_trainable_parameters()

trainable params: 167,772,160 || all params: 7,409,504,256 || trainable%: 2.2643


In [6]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./metamath-qlora",
    num_train_epochs=1,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,     # Effective batch size = 16
    gradient_checkpointing=True,       # Reduces memory usage
    optim="paged_adamw_32bit",        # Optimizer for QLoRA
    logging_steps=10,
    save_strategy="epoch",
    learning_rate=2e-4,
    bf16=True,                         # Use bfloat16
    tf32=True,
    max_grad_norm=0.3,
    warmup_ratio=0.03,
    lr_scheduler_type="constant",
    disable_tqdm=False,
    report_to="tensorboard"                   # or "wandb" if you use it
)

  self.setter(val)


In [8]:
from trl import SFTTrainer

# Initialize trainer
trainer = SFTTrainer(
    model=peft_model,
    train_dataset=train_dataset,
    processing_class=tokenizer,
    args=training_args,
    formatting_func=format_instruction
)

# Start training
trainer.train()

trainer.model.save_pretrained("./metamath-qlora-final2")
tokenizer.save_pretrained("./metamath-qlora-final2")

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 2}.


Step,Training Loss
10,0.6975
20,0.5581
30,0.5243
40,0.4929
50,0.4845
60,0.4799
70,0.4694
80,0.4448
90,0.4609
100,0.4602


('./metamath-qlora-final2/tokenizer_config.json',
 './metamath-qlora-final2/special_tokens_map.json',
 './metamath-qlora-final2/tokenizer.json')

In [7]:
trainer.model.save_pretrained("./metamath-qlora-final2")
tokenizer.save_pretrained("./metamath-qlora-final2")

NameError: name 'trainer' is not defined

In [None]:
model = AutoModelForCausalLM.from_pretrained("./metamath-qlora-final")
tokenizer = AutoTokenizer.from_pretrained("./metamath-qlora-final")

In [None]:
import re 
def extract_answer(text):
    """
    Extract the final numerical answer from generated text.
    Looks for patterns like: "The answer is X" or boxed answers.
    
    Args:
        text (str): Generated response text
        
    Returns:
        str: Extracted answer or the last number found
    """
    # Common patterns in math solutions
    patterns = [
        r"####\s*([+-]?\d+\.?\d*)",           # #### 42 format
        r"the answer is[:\s]+([+-]?\d+\.?\d*)", # "the answer is: 42"
        r"\\boxed\{([^}]+)\}",                 # LaTeX \boxed{42}
        r"answer:\s*([+-]?\d+\.?\d*)",         # "answer: 42"
    ]
    
    text_lower = text.lower()
    
    for pattern in patterns:
        match = re.search(pattern, text_lower)
        if match:
            return match.group(1).strip()
    
    # Fallback: return last number in text
    numbers = re.findall(r"[+-]?\d+\.?\d*", text)
    if numbers:
        return numbers[-1]
    
    return "NO_ANSWER"

def extract_ground_truth(response):
    """
    Extract ground truth answer from the dataset response.
    MetaMathQA uses #### to mark the final answer.
    
    Args:
        response (str): Ground truth response from dataset
        
    Returns:
        str: The correct answer
    """
    match = re.search(r"####\s*([+-]?\d+\.?\d*)", response)
    if match:
        return match.group(1).strip()
    
    # Fallback: return last number
    numbers = re.findall(r"[+-]?\d+\.?\d*", response)
    if numbers:
        return numbers[-1]
    
    return "NO_ANSWER"

def compare_answers(predicted, ground_truth):
    """
    Compare predicted answer with ground truth.
    Handles numerical comparison with some tolerance for floats.
    
    Args:
        predicted (str): Predicted answer
        ground_truth (str): Correct answer
        
    Returns:
        bool: True if answers match
    """
    try:
        # Try numerical comparison
        pred_num = float(predicted)
        truth_num = float(ground_truth)
        
        # Allow small floating point differences
        return abs(pred_num - truth_num) < 0.01
    except (ValueError, TypeError):
        # Fall back to string comparison
        return predicted.strip() == ground_truth.strip()

In [None]:

print("Starting evaluation...\n")
NUM_SAMPLES = 100        # How many validation samples to test (set to None for all)
MAX_NEW_TOKENS = 256     # Maximum length of generated answer
TEMPERATURE = 0.1        # Lower = more deterministic (good for math)
BATCH_SIZE = 1           # Process one at a time for simplicity

results = []
correct = 0
total = 0

# Iterate through validation samples
for idx, sample in enumerate(tqdm(eval, desc="Evaluating")):
    
    # Get question and ground truth
    question = sample["query"]
    ground_truth_response = sample["response"]
    ground_truth_answer = extract_ground_truth(ground_truth_response)
    
    # Format prompt (same as training!)
    prompt = format_instruction_eval(sample)
    
    # Tokenize input
    inputs = tokenizer(
        prompt,
        return_tensors="pt",      # Return PyTorch tensors
        truncation=True,          # Truncate if too long
        max_length=512,           # Match training max length
    ).to(model.device)
    
    # Generate answer
    with torch.no_grad():  # Don't compute gradients (saves memory)
        outputs = model.generate(
            **inputs,
            max_new_tokens=MAX_NEW_TOKENS,  # Maximum tokens to generate
            temperature=TEMPERATURE,         # Sampling temperature (lower = more deterministic)
            do_sample=True if TEMPERATURE > 0 else False,  # Use sampling if temp > 0
            top_p=0.95,                     # Nucleus sampling
            repetition_penalty=1.1,         # Penalize repetition
            pad_token_id=tokenizer.eos_token_id,  # Padding token
        )
    
    # Decode generated text
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    # Extract just the generated answer (remove the prompt)
    generated_answer_text = generated_text[len(prompt):].strip()
    
    # Extract numerical answer from generated text
    predicted_answer = extract_answer(generated_answer_text)
    
    # Compare with ground truth
    is_correct = compare_answers(predicted_answer, ground_truth_answer)
    
    if is_correct:
        correct += 1
    total += 1
    
    # Store result
    results.append({
        "question": question,
        "predicted_answer": predicted_answer,
        "ground_truth_answer": ground_truth_answer,
        "correct": is_correct,
        "full_generation": generated_answer_text,
    })
    
    # Print first few examples for inspection
    if idx < 3:
        print(f"\n{'='*80}")
        print(f"Example {idx + 1}:")
        print(f"\nQuestion: {question}")
        print(f"\nGenerated: {generated_answer_text[:200]}...")
        print(f"\nPredicted Answer: {predicted_answer}")
        print(f"Ground Truth: {ground_truth_answer}")
        print(f"Correct: {'✓' if is_correct else '✗'}")
        print(f"{'='*80}\n")

# ==============================================================================
# CALCULATE METRICS
# ==============================================================================

accuracy = (correct / total) * 100 if total > 0 else 0

print("\n" + "="*80)
print("EVALUATION RESULTS")
print("="*80)
print(f"Total samples evaluated: {total}")
print(f"Correct answers: {correct}")
print(f"Incorrect answers: {total - correct}")
print(f"Accuracy: {accuracy:.2f}%")
print("="*80)

# ==============================================================================
# SAVE RESULTS
# ==============================================================================

import json

output_file = "evaluation_results.json"
with open(output_file, "w") as f:
    json.dump({
        "summary": {
            "total": total,
            "correct": correct,
            "accuracy": accuracy,
        },
        "results": results,
    }, f, indent=2)

print(f"\nDetailed results saved to: {output_file}")

# ==============================================================================
# ERROR ANALYSIS (Optional)
# ==============================================================================

print("\n" + "="*80)
print("ERROR ANALYSIS - Sample Wrong Answers")
print("="*80)

wrong_answers = [r for r in results if not r["correct"]][:5]  # Show first 5 errors

for i, error in enumerate(wrong_answers, 1):
    print(f"\nError {i}:")
    print(f"Question: {error['question'][:100]}...")
    print(f"Predicted: {error['predicted_answer']}")
    print(f"Correct: {error['ground_truth_answer']}")
    print("-" * 40)

In [None]:
inputs = tokenizer(format_instruction_eval(train_dataset[0]), return_tensors="pt").to("cuda")
output = model.generate(**inputs)

print(tokenizer.decode(output[0]))

In [2]:
from vllm import LLM, SamplingParams
from vllm.lora.request import LoRARequest

INFO 11-13 22:35:33 [__init__.py:216] Automatically detected platform cuda.


In [11]:
llm = LLM(model="./models/mistral7b64rank", tokenizer="./tokenizers/mistral7b64rank", trust_remote_code=True)

INFO 11-14 00:37:47 [utils.py:233] non-default args: {'tokenizer': './tokenizers/mistral7b64rank', 'trust_remote_code': True, 'disable_log_stats': True, 'model': './models/mistral7b64rank'}


The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.


INFO 11-14 00:37:47 [model.py:547] Resolved architecture: MistralForCausalLM
INFO 11-14 00:37:47 [model.py:1730] Downcasting torch.float32 to torch.bfloat16.
INFO 11-14 00:37:47 [model.py:1510] Using max model len 32768
INFO 11-14 00:37:47 [scheduler.py:205] Chunked prefill is enabled with max_num_batched_tokens=8192.
[1;36m(EngineCore_DP0 pid=38199)[0;0m INFO 11-14 00:37:47 [core.py:644] Waiting for init message from front-end.
[1;36m(EngineCore_DP0 pid=38199)[0;0m INFO 11-14 00:37:47 [core.py:77] Initializing a V1 LLM engine (v0.11.0) with config: model='./models/mistral7b64rank', speculative_config=None, tokenizer='./tokenizers/mistral7b64rank', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=32768, download_dir=None, load_format=bitsandbytes, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=bitsandbytes, enforce_e

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:01<00:00,  1.66it/s]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:01<00:00,  1.66it/s]
[1;36m(EngineCore_DP0 pid=38199)[0;0m 
Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]


[1;36m(EngineCore_DP0 pid=38199)[0;0m ERROR 11-14 00:37:57 [core.py:708] EngineCore failed to start.
[1;36m(EngineCore_DP0 pid=38199)[0;0m ERROR 11-14 00:37:57 [core.py:708] Traceback (most recent call last):
[1;36m(EngineCore_DP0 pid=38199)[0;0m ERROR 11-14 00:37:57 [core.py:708]   File "/home/ell/playground/env/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 699, in run_engine_core
[1;36m(EngineCore_DP0 pid=38199)[0;0m ERROR 11-14 00:37:57 [core.py:708]     engine_core = EngineCoreProc(*args, **kwargs)
[1;36m(EngineCore_DP0 pid=38199)[0;0m ERROR 11-14 00:37:57 [core.py:708]   File "/home/ell/playground/env/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 498, in __init__
[1;36m(EngineCore_DP0 pid=38199)[0;0m ERROR 11-14 00:37:57 [core.py:708]     super().__init__(vllm_config, executor_class, log_stats,
[1;36m(EngineCore_DP0 pid=38199)[0;0m ERROR 11-14 00:37:57 [core.py:708]   File "/home/ell/playground/env/lib/python3.10/site-packages/vllm/v1/engine

[1;36m(EngineCore_DP0 pid=38199)[0;0m Process EngineCore_DP0:
[1;36m(EngineCore_DP0 pid=38199)[0;0m Traceback (most recent call last):
[1;36m(EngineCore_DP0 pid=38199)[0;0m   File "/usr/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
[1;36m(EngineCore_DP0 pid=38199)[0;0m     self.run()
[1;36m(EngineCore_DP0 pid=38199)[0;0m   File "/usr/lib/python3.10/multiprocessing/process.py", line 108, in run
[1;36m(EngineCore_DP0 pid=38199)[0;0m     self._target(*self._args, **self._kwargs)
[1;36m(EngineCore_DP0 pid=38199)[0;0m   File "/home/ell/playground/env/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 712, in run_engine_core
[1;36m(EngineCore_DP0 pid=38199)[0;0m     raise e
[1;36m(EngineCore_DP0 pid=38199)[0;0m   File "/home/ell/playground/env/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 699, in run_engine_core
[1;36m(EngineCore_DP0 pid=38199)[0;0m     engine_core = EngineCoreProc(*args, **kwargs)
[1;36m(EngineCore_DP0 pid=3819

RuntimeError: Engine core initialization failed. See root cause above. Failed core proc(s): {}

In [4]:
model_id = "unsloth/tinyllama-bnb-4bit"
llm = LLM(model=model_id, dtype=torch.bfloat16, quantization="bitsandbytes", load_format="bitsandbytes", gpu_memory_utilization=.5)

INFO 11-13 22:37:35 [utils.py:233] non-default args: {'load_format': 'bitsandbytes', 'dtype': torch.bfloat16, 'gpu_memory_utilization': 0.5, 'disable_log_stats': True, 'quantization': 'bitsandbytes', 'model': 'unsloth/tinyllama-bnb-4bit'}
INFO 11-13 22:37:36 [model.py:547] Resolved architecture: LlamaForCausalLM
INFO 11-13 22:37:36 [model.py:1510] Using max model len 2048
INFO 11-13 22:37:36 [scheduler.py:205] Chunked prefill is enabled with max_num_batched_tokens=8192.
[1;36m(EngineCore_DP0 pid=4125)[0;0m INFO 11-13 22:37:37 [core.py:644] Waiting for init message from front-end.
[1;36m(EngineCore_DP0 pid=4125)[0;0m INFO 11-13 22:37:37 [core.py:77] Initializing a V1 LLM engine (v0.11.0) with config: model='unsloth/tinyllama-bnb-4bit', speculative_config=None, tokenizer='unsloth/tinyllama-bnb-4bit', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=2048, download_dir=None, load_format=bi

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00, 38.27it/s]
[1;36m(EngineCore_DP0 pid=4125)[0;0m 
Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  1.74it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  1.73it/s]
[1;36m(EngineCore_DP0 pid=4125)[0;0m 


[1;36m(EngineCore_DP0 pid=4125)[0;0m INFO 11-13 22:38:07 [gpu_model_runner.py:2653] Model loading took 0.7738 GiB and 26.713282 seconds
[1;36m(EngineCore_DP0 pid=4125)[0;0m INFO 11-13 22:38:13 [backends.py:548] Using cache directory: /home/ell/.cache/vllm/torch_compile_cache/38b5f3c097/rank_0_0/backbone for vLLM's torch.compile
[1;36m(EngineCore_DP0 pid=4125)[0;0m INFO 11-13 22:38:13 [backends.py:559] Dynamo bytecode transform time: 5.53 s
[1;36m(EngineCore_DP0 pid=4125)[0;0m ERROR 11-13 22:38:14 [core.py:708] EngineCore failed to start.
[1;36m(EngineCore_DP0 pid=4125)[0;0m ERROR 11-13 22:38:14 [core.py:708] Traceback (most recent call last):
[1;36m(EngineCore_DP0 pid=4125)[0;0m ERROR 11-13 22:38:14 [core.py:708]   File "/home/ell/playground/env/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 699, in run_engine_core
[1;36m(EngineCore_DP0 pid=4125)[0;0m ERROR 11-13 22:38:14 [core.py:708]     engine_core = EngineCoreProc(*args, **kwargs)
[1;36m(EngineCore_DP0 pi

[1;36m(EngineCore_DP0 pid=4125)[0;0m Process EngineCore_DP0:
[1;36m(EngineCore_DP0 pid=4125)[0;0m Traceback (most recent call last):
[1;36m(EngineCore_DP0 pid=4125)[0;0m   File "/usr/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
[1;36m(EngineCore_DP0 pid=4125)[0;0m     self.run()
[1;36m(EngineCore_DP0 pid=4125)[0;0m   File "/usr/lib/python3.10/multiprocessing/process.py", line 108, in run
[1;36m(EngineCore_DP0 pid=4125)[0;0m     self._target(*self._args, **self._kwargs)
[1;36m(EngineCore_DP0 pid=4125)[0;0m   File "/home/ell/playground/env/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 712, in run_engine_core
[1;36m(EngineCore_DP0 pid=4125)[0;0m     raise e
[1;36m(EngineCore_DP0 pid=4125)[0;0m   File "/home/ell/playground/env/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 699, in run_engine_core
[1;36m(EngineCore_DP0 pid=4125)[0;0m     engine_core = EngineCoreProc(*args, **kwargs)
[1;36m(EngineCore_DP0 pid=4125)[0;0m   

RuntimeError: Engine core initialization failed. See root cause above. Failed core proc(s): {}