In [2]:
!pip install -q --upgrade transformers datasets peft bitsandbytes trl
!pip install -q accelerate

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.7/9.7 MB[0m [31m71.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m374.8/374.8 kB[0m [31m18.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.1/69.1 MB[0m [31m25.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m293.4/293.4 kB[0m [31m16.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m450.7/450.7 kB[0m [31m19.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m68.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
from accelerate.utils import write_basic_config
write_basic_config()

PosixPath('/root/.cache/huggingface/accelerate/default_config.yaml')

In [4]:
import os
import torch
import numpy as np
import pandas as pd
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
    TrainingArguments,
    AutoConfig
)
from transformers import EarlyStoppingCallback 
from datasets import load_dataset, concatenate_datasets
from huggingface_hub import login
from peft import LoraConfig, get_peft_model,PeftModel
from trl import SFTTrainer
import transformers
from torch.utils.data import DataLoader
from tqdm import tqdm

In [5]:
# HF_TOKEN =  "your_hugging_face_token"
PUSH_REPO_NAME = ""     
SAVE_DIR = ""
MODEL_ID = "Qwen/Qwen2.5-Math-1.5B-Instruct"
# "Qwen/Qwen2.5-0.5B-Instruct" for Qwen 0.5

SEED = 7
torch.manual_seed(SEED)
np.random.seed(SEED)

login(token=HF_TOKEN)

In [6]:
lora_config = LoraConfig(
    r=32,
    lora_alpha=64,
    lora_dropout=0.2,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj"
    ],
    bias="none",
    task_type="CAUSAL_LM",
)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True
)

In [7]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=HF_TOKEN)

DEFAULT_PAD_TOKEN = "[PAD]"
DEFAULT_EOS_TOKEN = "</s>"
# DEFAULT_BOS_TOKEN = "<s>"
special_tokens_dict = {}
if tokenizer.pad_token is None:
    special_tokens_dict["pad_token"] = DEFAULT_PAD_TOKEN
if tokenizer.eos_token is None:
    special_tokens_dict["eos_token"] = DEFAULT_EOS_TOKEN
# if tokenizer.bos_token is None:
#     special_tokens_dict["bos_token"] = DEFAULT_BOS_TOKEN
if special_tokens_dict:
    tokenizer.add_special_tokens(special_tokens_dict)

tokenizer_config.json:   0%|          | 0.00/7.32k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

In [8]:
print("Loading base Qwen model + LoRA adapters...")
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    quantization_config=bnb_config,
    device_map="auto",
    use_auth_token=HF_TOKEN
)


base_model.resize_token_embeddings(len(tokenizer))


model = get_peft_model(base_model, lora_config)

Loading base Qwen model + LoRA adapters...




config.json:   0%|          | 0.00/656 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]

In [9]:
print("Loading MATH dataset (all splits)...")
dataset = load_dataset("Maxwell-Jia/MATH", trust_remote_code=True)

def preprocess_function(examples):
    inputs = [
        "<|im_start|>system"
        "You are expert math assistant.<|im_end|>"
        "<|im_start|>user"
        "Solve the following math problem: "
        f"{problem}\n"
        "Show all intermediate steps and include the final answer in LaTeX format "
        "in a box like \\boxed{{}}."
        "<|im_start|>assistant "
        for problem in examples["problem"]
        ]
    targets = [
        f"{solution}{tokenizer.eos_token}"
        for solution in examples["solution"]
    ]
    full_texts = [inp + tgt for inp, tgt in zip(inputs, targets)]
    model_inputs = tokenizer(
        full_texts,
        max_length=512,
        truncation=True,
        padding="longest",
        return_tensors="pt"
    )
    labels = model_inputs["input_ids"].clone()
    for i in range(len(labels)):
        input_ids = tokenizer(inputs[i], add_special_tokens=False).input_ids
        input_len = len(input_ids)
        labels[i][:input_len] = -100

    model_inputs["labels"] = labels
    return model_inputs

Loading MATH dataset (all splits)...


README.md:   0%|          | 0.00/5.32k [00:00<?, ?B/s]

competition_math.py:   0%|          | 0.00/2.57k [00:00<?, ?B/s]

MATH.zip:   0%|          | 0.00/7.91M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7500 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [10]:
print("Tokenizing data...")
tokenized_train = dataset["train"].map(
    preprocess_function, batched=True, remove_columns=dataset["train"].column_names
)
tokenized_test = dataset["test"].map(
    preprocess_function, batched=True, remove_columns=dataset["test"].column_names
)

Tokenizing data...


Map:   0%|          | 0/7500 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [11]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False, 
    pad_to_multiple_of=8
)

In [12]:
training_args = TrainingArguments(
    output_dir="./fine_tuned_model_qwen",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    warmup_ratio=0.1,
    num_train_epochs=5,             
    learning_rate=5e-5,
    fp16=True,
    logging_steps=100,
    optim="paged_adamw_8bit",
    evaluation_strategy="steps",
    eval_steps=1000,
    save_steps=1000,
    save_total_limit=2,
    report_to="none",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    run_name="qwen_finetuning_lora32",
    ddp_find_unused_parameters=False,
)

early_stopping_callback = EarlyStoppingCallback(
    early_stopping_patience=2
)



In [13]:
trainer = SFTTrainer(
    model=model,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    peft_config=lora_config,
    args=training_args,
    data_collator=data_collator,
    callbacks=[early_stopping_callback]
)

print("Starting training...")
trainer.train()

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Starting training...


Step,Training Loss,Validation Loss
1000,0.5312,0.65118
2000,0.4689,0.64105
3000,0.4565,0.630251
4000,0.4219,0.657645
5000,0.3992,0.652556




TrainOutput(global_step=5000, training_loss=0.512893522644043, metrics={'train_runtime': 9859.2634, 'train_samples_per_second': 3.804, 'train_steps_per_second': 0.951, 'total_flos': 8.277627764736e+16, 'train_loss': 0.512893522644043, 'epoch': 2.6666666666666665})

In [14]:
model.save_pretrained("./fine_tuned_model_qwen")
tokenizer.save_pretrained("./fine_tuned_model_qwen")

print("Pushing to the Hub...")
model.push_to_hub(PUSH_REPO_NAME, use_auth_token=HF_TOKEN)
tokenizer.push_to_hub(PUSH_REPO_NAME, use_auth_token=HF_TOKEN)

print(trainer.state.log_history)



Pushing to the Hub...




adapter_model.safetensors:   0%|          | 0.00/1.08G [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

[{'loss': 1.4293, 'grad_norm': 0.980175793170929, 'learning_rate': 5.330490405117271e-06, 'epoch': 0.05333333333333334, 'step': 100}, {'loss': 0.9938, 'grad_norm': 0.6112407445907593, 'learning_rate': 1.0660980810234541e-05, 'epoch': 0.10666666666666667, 'step': 200}, {'loss': 0.6837, 'grad_norm': 0.6570835113525391, 'learning_rate': 1.5991471215351813e-05, 'epoch': 0.16, 'step': 300}, {'loss': 0.5857, 'grad_norm': 0.7227174639701843, 'learning_rate': 2.1321961620469083e-05, 'epoch': 0.21333333333333335, 'step': 400}, {'loss': 0.6093, 'grad_norm': 0.6997278332710266, 'learning_rate': 2.6652452025586356e-05, 'epoch': 0.26666666666666666, 'step': 500}, {'loss': 0.5666, 'grad_norm': 0.8497457504272461, 'learning_rate': 3.1982942430703626e-05, 'epoch': 0.32, 'step': 600}, {'loss': 0.561, 'grad_norm': 0.6438538432121277, 'learning_rate': 3.73134328358209e-05, 'epoch': 0.37333333333333335, 'step': 700}, {'loss': 0.5564, 'grad_norm': 0.6159977316856384, 'learning_rate': 4.2643923240938166e-05

In [15]:
print("Running batch inference on the test set with DataLoader...")

model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

test_samples = []
for idx in range(len(dataset["test"])):
    sample = dataset["test"][idx]
    # Using the same or similar prompt style as in training
    input_text = (
        "<|im_start|>system"
        "You are expert math assistant.<|im_end|>"
        "<|im_start|>user"
        "Solve the following math problem: "
        f"{sample['problem']}\n"
        "Show all intermediate steps and include the final answer in LaTeX format "
        "in a box like \\boxed{{}}."
        "<|im_start|>assistant "
    )
    test_samples.append({
        "input_text": input_text,
        "problem": sample["problem"],
        "level": sample["level"],
        "type": sample["type"],
        "ground_truth": sample["solution"]
    })


def collate_fn(batch):
    input_texts = [sample["input_text"] for sample in batch]
    model_inputs = tokenizer(
        input_texts,
        padding=True,
        truncation=True,
        max_length=1024,
        return_tensors="pt"
    )
    model_inputs = {k: v.to(device) for k, v in model_inputs.items()}
    return model_inputs, batch

batch_size = 32
test_dataloader = DataLoader(test_samples, batch_size=batch_size, collate_fn=collate_fn)

results_list = []
for batch_idx, (model_inputs, batch) in enumerate(tqdm(test_dataloader, desc="Evaluating")):
    current_batch_size = model_inputs["input_ids"].size(0)
    try:
        with torch.no_grad():
            output_ids = model.generate(
                input_ids=model_inputs["input_ids"],
                attention_mask=model_inputs["attention_mask"],
                max_new_tokens=512,
                do_sample=False,
                eos_token_id=tokenizer.eos_token_id,
                pad_token_id=tokenizer.pad_token_id
            )
        for i in range(current_batch_size):
            predicted_text = tokenizer.decode(output_ids[i], skip_special_tokens=True)
            results_list.append({
                "problem": batch[i]["problem"],
                "level": batch[i]["level"],
                "type": batch[i]["type"],
                "ground_truth": batch[i]["ground_truth"],
                "predicted_solution": predicted_text
            })
    except Exception as e:
        print(f"Error in batch {batch_idx+1}: {e}")
        for i in range(current_batch_size):
            results_list.append({
                "problem": batch[i]["problem"],
                "level": batch[i]["level"],
                "type": batch[i]["type"],
                "ground_truth": batch[i]["ground_truth"],
                "predicted_solution": ""
            })
        continue

    # Optional intermediate saves
    if (batch_idx + 1) % 100 == 0:
        df_intermediate = pd.DataFrame(results_list)
        filename = os.path.join(SAVE_DIR, f"qwen_test_results_batch_{batch_idx+1}.csv")
        df_intermediate.to_csv(filename, index=False)
        print(f"Intermediate results saved at batch {batch_idx+1}: {filename}")

results_df = pd.DataFrame(results_list)
final_path = os.path.join(SAVE_DIR, "qwen_test_results_overall.csv")
results_df.to_csv(final_path, index=False)
print(f"Final test results saved to {final_path}")

Running batch inference on the test set with DataLoader...


Evaluating:   0%|          | 0/157 [00:00<?, ?it/s]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Evaluating:   1%|          | 1/157 [01:09<3:00:34, 69.45s/it]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Evaluating:   1%|▏         | 2/157 [02:07<2:41:21, 62.46s/it]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Evaluating:   2%|▏         | 3/157 [03:16<2:48:14, 65.55s/it]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Evaluating:   3%|▎         | 4/157 [04:26<2:52:18, 67.57s/it]A decoder-onl

Intermediate results saved at batch 100: /kaggle/working/qwen_test_results_batch_100.csv


Evaluating:  64%|██████▍   | 101/157 [1:55:33<1:04:16, 68.87s/it]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Evaluating:  65%|██████▍   | 102/157 [1:56:41<1:03:05, 68.82s/it]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Evaluating:  66%|██████▌   | 103/157 [1:57:49<1:01:43, 68.59s/it]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Evaluating:  66%|██████▌   | 104/157 [1:58:58<1:00:34, 68.58s/it]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Evaluating:  67%|██████▋   | 105/157 [2:00:07<59

Final test results saved to /kaggle/working/qwen_test_results_overall.csv



