Distributed Finetuning of LLM using HF accelerate

In [None]:
# --- Install Dependencies ---
# !pip install -q transformers datasets accelerate bitsandbytes peft

In [None]:
# --- Imports ---
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, AdamW
from peft import LoraConfig, get_peft_model
from accelerate import Accelerator
from torch.utils.data import DataLoader

accelerator = Accelerator()
device = accelerator.device

print(f"Using device: {device}")

Using device: mps


In [None]:
# Mac specific
print(torch.backends.mps.is_available())
torch.backends.mps.is_built()

True


True

In [None]:
MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

In [4]:
print("Loading model and tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
    device_map=None,  # We let accelerate handle device placement
)

Loading model and tokenizer...


In [5]:
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

'NoneType' object has no attribute 'cadam32bit_grad_fp32'
trainable params: 1,126,400 || all params: 1,101,174,784 || trainable%: 0.1023


  warn("The installed version of bitsandbytes was compiled without GPU support. "


In [7]:
dataset = load_dataset("squad", split="train[:1%]")  # Tiny subset

def format_qa(example):
    prompt = f"Question: {example['question']} Context: {example['context']} Answer:"
    answer = example['answers']['text'][0] if len(example['answers']['text']) > 0 else ''
    full_text = prompt + " " + answer
    tokenized = tokenizer(full_text, truncation=True, padding="max_length", max_length=512)
    labels = tokenized["input_ids"].copy()
    prompt_len = len(tokenizer(prompt, truncation=True, max_length=512)["input_ids"])
    labels[:prompt_len] = [-100] * prompt_len  # Mask prompt in loss
    tokenized["labels"] = labels
    return tokenized

tokenized_dataset = dataset.map(format_qa, remove_columns=dataset.column_names)
tokenized_dataset.set_format(type='torch')

Map:   0%|          | 0/876 [00:00<?, ? examples/s]

In [8]:
train_loader = DataLoader(tokenized_dataset, shuffle=True, batch_size=2)
optimizer = AdamW(model.parameters(), lr=2e-4)



In [9]:
model, optimizer, train_loader = accelerator.prepare(model, optimizer, train_loader)

In [None]:
EPOCHS = 1
MAX_TRAIN_STEPS = 50

step = 0
model.train()

for epoch in range(EPOCHS):
    for batch in train_loader:
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)
        optimizer.step()
        optimizer.zero_grad()

        step += 1
        if step % 5 == 0:
            accelerator.print(f"Step {step} - Loss: {loss.item():.4f}")
        
        if step >= MAX_TRAIN_STEPS:
            break
    if step >= MAX_TRAIN_STEPS:
        break

Step 5 - Loss: 2.0729
Step 10 - Loss: 0.6383
Step 15 - Loss: 0.0666
Step 20 - Loss: 0.0497
Step 25 - Loss: 0.0249
Step 30 - Loss: 0.0350
Step 35 - Loss: 0.0385
Step 40 - Loss: 0.0115
Step 45 - Loss: 0.0286
Step 50 - Loss: 0.0190


In [11]:
accelerator.wait_for_everyone()
unwrapped_model = accelerator.unwrap_model(model)

unwrapped_model.save_pretrained("./finetuned_tinyllama_qa")
tokenizer.save_pretrained("./finetuned_tinyllama_qa")

print("Model saved successfully!")

Model saved successfully!


In [12]:
prompt = "Question: What is the capital of France? Context: France is a country in Europe. Answer:"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

outputs = unwrapped_model.generate(**inputs, max_new_tokens=50)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Question: What is the capital of France? Context: France is a country in Europe. Answer: Paris
