In [None]:
!pip install -q transformers datasets peft bitsandbytes accelerate trl wandb

In [None]:
!pip install -q flash-attn --no-build-isolation

In [None]:
# Colab only: make Google Drive reachable from this runtime under /content/drive/.
import os  # also needed by the later cell that sets WANDB_PROJECT

from google.colab import drive

drive.mount("/content/drive/")

In [4]:
# Authenticate this session against the Hugging Face Hub.
from google.colab import userdata
from huggingface_hub import login

# The access token is looked up by name in the Colab environment's stored
# user data, so it never has to be pasted into the notebook itself.
login(
    token=userdata.get('Add your token here'),
)

In [5]:
# Name the Weights & Biases project that this run's logs are sent to.
# (`os` is imported in the Drive-mount cell above.)
os.environ["WANDB_PROJECT"] = "Add your project name here"

In [None]:
# Import required libraries
import torch
from trl import SFTTrainer
from datasets import load_dataset
from peft import get_peft_model, LoraConfig
from transformers import TrainingArguments, AutoModelForCausalLM, AutoTokenizer, EarlyStoppingCallback, get_scheduler, BitsAndBytesConfig

# Load the dataset in regular (non-streaming) mode.
# Switch to streaming=True if the dataset is too large to fit in memory.
dataset = load_dataset(
    "Floppanacci/QWQ-LongCOT-AIMO",
    streaming=False,
)

In [7]:
# bitsandbytes settings used when loading the base model for QLoRA
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                      # load weights in 4-bit
    bnb_4bit_quant_type="nf4",              # 4-bit quantization type
    bnb_4bit_use_double_quant=True,         # enable double quantization
    bnb_4bit_compute_dtype=torch.bfloat16,  # dtype used for compute
)

In [None]:
# Base checkpoint to fine-tune, plus the directory where downloaded files are cached
model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
hf_cache_dir = "Add your cache directory here"

# Load the model with the 4-bit settings from `bnb_config`.
# bfloat16 gives better performance, but only on appropriate hardware (e.g. A100, L4, H100).
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    cache_dir=hf_cache_dir,
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    low_cpu_mem_usage=True,
)

# Matching tokenizer: fast implementation, padding added on the right.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    cache_dir=hf_cache_dir,
    padding_side="right",
    use_fast=True,
)

In [18]:
# Format each question/solution pair as chat-template text (returns strings, not token ids)

# The default prompt is a raw string on purpose: in a normal string literal the "\b"
# of "\boxed" is the backspace escape, which silently corrupts the instruction.
def format_chat(examples, prompt=r"Please reason step by step, and put your final answer within \boxed{}."): # Add your prompt here
    """Render a batch of question/solution pairs as chat-formatted training text.

    Args:
        examples: Batch mapping with parallel "question" and "solution" sequences,
            as passed by `Dataset.map(..., batched=True)`.
        prompt: Instruction placed before each question in the user turn,
            separated from the question by a blank line.

    Returns:
        A dict with a single "text" key holding one rendered conversation
        string per input example, in input order.
    """
    texts = []
    for problem, solution in zip(examples["question"], examples["solution"]):
        messages = [
            {"role": "user", "content": prompt + '\n\n' + problem},
            # NOTE(review): some DeepSeek-R1 chat templates are reported to drop assistant
            # text up to and including "</think>"; print one rendered sample to confirm
            # the solution actually survives templating.
            {"role": "assistant", "content": "<think>\n" + solution + "\n</think>"}
        ]
        # tokenize=False: return the rendered string rather than token ids.
        texts.append(tokenizer.apply_chat_template(messages, tokenize=False))
    return {"text": texts}

# Pick the dataset's own "train" and "validation" splits
train, val = dataset["train"], dataset["validation"]

In [None]:
# Apply format_chat to both splits in batches.
# Despite the variable names, this step adds a formatted "text" column;
# format_chat renders strings (tokenize=False) and does not produce token ids.
tokenized_train, tokenized_val = (
    split.map(format_chat, batched=True) for split in (train, val)
)

In [20]:
# Model tweaks that must be applied before training starts
model.enable_input_require_grads()
model.config.use_cache = False  # cache is disabled because gradient checkpointing is used

In [21]:
# LoRA adapter settings for QLoRA.
# Adjust the target modules for your model and the time you can afford to train.
attention_projections = ["q_proj", "k_proj", "v_proj", "o_proj"]
mlp_projections = ["gate_proj", "up_proj", "down_proj"]

lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="lora_only",
    target_modules=attention_projections + mlp_projections,
)

In [22]:
# Attach the LoRA adapters described by `lora_config` to the quantized model.
# This rebinds `model`, so run the cell only once per freshly loaded base model.
model = get_peft_model(model=model, peft_config=lora_config)

In [23]:
# Training hyper-parameters, grouped by concern
training_args = TrainingArguments(
    # Output and logging
    output_dir="Add your output directory here",
    report_to="wandb",
    # Schedule: evaluate and checkpoint once per epoch, keep the lowest-eval_loss model
    num_train_epochs=3,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    # Batching: 4 sequences per device, gradients accumulated over 4 steps
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    # Memory and precision
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    bf16=True,
    tf32=True,
    # Optimizer and learning-rate schedule
    optim="paged_adamw_8bit",
    learning_rate=1e-5,
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,
    weight_decay=0.01,
    max_grad_norm=1.0,
)

In [24]:
# Early-stopping callback: patience of 3 evaluations, improvement threshold of 0.005.
# NOTE(review): training is configured for 3 epochs with one evaluation per epoch,
# so a patience of 3 looks unlikely to ever stop the run early; confirm this is intended.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3, early_stopping_threshold=0.005)

In [None]:
# Collator that pads each batch dynamically; mlm=False turns off masked-LM masking
import numpy as np
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Assemble the supervised fine-tuning trainer from the pieces built in earlier cells
trainer = SFTTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    callbacks=[early_stopping],
)

In [None]:
# Train the model as a fresh run (resume_from_checkpoint=False: do not resume from a checkpoint)
trainer.train(resume_from_checkpoint=False)

In [27]:
# Persist the fine-tuned PEFT model; a later cell loads this directory as the LoRA adapter to merge.
model.save_pretrained("Add your model save directory here")

In [None]:
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

# Reload the base checkpoint in BF16 with no quantization config, ready for merging.
# NOTE(review): the 4-bit training model is still referenced by `model` and `trainer`
# at this point; free them first if GPU memory is tight.
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="auto",
)

In [None]:
# Step 1: attach the saved LoRA adapter to the BF16 base model
peft_model = PeftModel.from_pretrained(base_model, "Add your model save directory here")

# Step 2: merge the adapter and unload the PEFT wrapper
merged_model = peft_model.merge_and_unload()

In [None]:
# Write the merged BF16 model (safetensors format) and the tokenizer side by side.
# NOTE(review): this placeholder is the same string as the adapter directory used
# earlier; consider a separate directory so adapter files and merged weights do not mix.
merged_dir = "Add your model save directory here"
merged_model.save_pretrained(merged_dir, safe_serialization=True)
tokenizer.save_pretrained(merged_dir)

In [None]:
# Upload the merged model to the Hugging Face Hub as a private repository.
merged_model.push_to_hub("Your HuggingFace username/Your model name", private=True)

In [None]:
# Upload the tokenizer to the same private Hub repository id as the merged model.
tokenizer.push_to_hub("Your HuggingFace username/Your model name", private=True)