# LLM Fine-tuning Quick Start

This notebook demonstrates how to fine-tune a causal language model (`microsoft/phi-2`) with LoRA adapters on a 100-example slice of the Alpaca dataset using our framework.

In [None]:
import sys
sys.path.append('..')

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig, get_peft_model, TaskType
from datasets import load_dataset
import warnings
warnings.filterwarnings('ignore')

## 1. Load Model and Tokenizer

In [None]:
model_name = "microsoft/phi-2"  # Using a smaller model for demo

# Tokenizer: reuse the end-of-sequence token as the padding token and pad on
# the right-hand side of each sequence.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

# Model: request float16 weights and let `device_map="auto"` decide where the
# weights are placed.
model_load_options = {
    "torch_dtype": torch.float16,
    "device_map": "auto",
    "trust_remote_code": True,
}
model = AutoModelForCausalLM.from_pretrained(model_name, **model_load_options)

# Report which checkpoint was loaded and how many parameters it has.
print(f"Model loaded: {model_name}")
parameter_count = model.num_parameters()
print(f"Model size: {parameter_count:,} parameters")

## 2. Configure PEFT

In [None]:
# LoRA settings for causal language modeling: rank 8, alpha 16, dropout 0.05,
# applied to the modules named "q_proj" and "v_proj", with bias="none".
lora_settings = {
    "task_type": TaskType.CAUSAL_LM,
    "target_modules": ["q_proj", "v_proj"],
    "r": 8,
    "lora_alpha": 16,
    "lora_dropout": 0.05,
    "bias": "none",
}
peft_config = LoraConfig(**lora_settings)

# Wrap the loaded model with the adapters, then print the trainable-parameter
# summary.
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

## 3. Prepare Dataset

In [None]:
# Load a small dataset for demo: only the first 100 rows of the train split.
dataset = load_dataset(
    "tatsu-lab/alpaca",
    split="train[:100]",
)

def format_instruction(sample):
    """Render one dataset record as a single prompt/response training string.

    Args:
        sample: Mapping with ``'instruction'`` and ``'output'`` entries and an
            optional ``'input'`` entry.

    Returns:
        Text made of an ``### Instruction:`` section and a ``### Response:``
        section, with an ``### Input:`` section between them when the
        record's ``'input'`` value is present and truthy.

    Raises:
        KeyError: If ``'instruction'`` or ``'output'`` is missing.
    """
    instruction = sample['instruction']
    input_text = sample.get('input', '')
    output = sample['output']

    # Guard clause: records without extra context use the two-section layout.
    if not input_text:
        return f"### Instruction:\n{instruction}\n\n### Response:\n{output}"

    return f"### Instruction:\n{instruction}\n\n### Input:\n{input_text}\n\n### Response:\n{output}"

# Format dataset: turn every record into its prompt/response training string.
formatted_dataset = list(map(format_instruction, dataset))
dataset_size = len(formatted_dataset)
print(f"Dataset size: {dataset_size}")

# Preview the first 200 characters of the first formatted example.
first_example_preview = formatted_dataset[0][:200]
print(f"\nExample:\n{first_example_preview}...")

## 4. Training Setup

In [None]:
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling

# Tokenize dataset
def tokenize_function(text):
    """Tokenize ``text`` with the notebook-level ``tokenizer``.

    The call requests truncation, ``max_length`` padding with a length of 256,
    and PyTorch tensors as the return format.

    Args:
        text: Passed through to the tokenizer unchanged.

    Returns:
        Whatever the tokenizer call returns for these options.
    """
    encode_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 256,
        "return_tensors": "pt",
    }
    return tokenizer(text, **encode_options)

# Create dataset
from torch.utils.data import Dataset

class TextDataset(Dataset):
    """Map-style dataset that tokenizes raw text samples on access.

    Each item is a dict with ``input_ids``, ``attention_mask`` and ``labels``,
    every one a 1-D tensor of length ``max_length``.

    Args:
        texts: Indexable collection of strings.
        tokenizer: Callable accepting ``truncation``, ``padding``,
            ``max_length`` and ``return_tensors`` keyword arguments and
            returning a mapping with ``"input_ids"`` and ``"attention_mask"``
            tensors that carry a leading batch dimension of size 1.
        max_length: Length every sample is truncated/padded to.
    """

    # Label value skipped by PyTorch's cross-entropy loss (its default
    # ``ignore_index``).
    IGNORE_INDEX = -100

    def __init__(self, texts, tokenizer, max_length=256):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of text samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return its model inputs and labels."""
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt"
        )
        # squeeze(0) removes only the batch dimension; a bare squeeze() would
        # also collapse the sequence dimension when max_length == 1.
        input_ids = encoding["input_ids"].squeeze(0)
        attention_mask = encoding["attention_mask"].squeeze(0)

        # Labels get their own storage (no aliasing with input_ids), and
        # padded positions are excluded from the loss. The attention mask is
        # used rather than the pad token id so that a genuine token sharing
        # the pad id (e.g. EOS reused as padding) is still learned.
        labels = input_ids.clone()
        labels[attention_mask == 0] = self.IGNORE_INDEX

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels,
        }

# The first 80 formatted examples are used for training; everything after
# them is held out for evaluation.
train_texts = formatted_dataset[:80]
eval_texts = formatted_dataset[80:]
train_dataset = TextDataset(train_texts, tokenizer)
eval_dataset = TextDataset(eval_texts, tokenizer)

In [None]:
# Training configuration: one epoch, per-device batch size 2 with 2 gradient
# accumulation steps, step-based evaluation and checkpointing.
# NOTE(review): fp16=True presumably needs a CUDA device — confirm the target
# environment before running on CPU-only machines.
training_args = TrainingArguments(
    output_dir="./outputs",
    num_train_epochs=1,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=2,
    warmup_steps=10,
    logging_steps=10,
    # load_best_model_at_end requires the save strategy to match the eval
    # strategy and save_steps to be a round multiple of eval_steps; the
    # previous save_steps=50 with eval_steps=20 makes TrainingArguments raise
    # a ValueError at construction time.
    save_strategy="steps",
    save_steps=20,
    eval_strategy="steps",
    eval_steps=20,
    save_total_limit=2,
    load_best_model_at_end=True,
    fp16=True,
    report_to="none"
)

# mlm=False selects the causal (next-token) language-modeling objective
# instead of masked language modeling.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

# Tie the model, arguments, datasets, tokenizer and collator together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)

## 5. Train Model

In [None]:
# Start training
trainer.train()

# Save the model and the tokenizer into the same output directory
fine_tuned_dir = "./fine_tuned_model"
trainer.save_model(fine_tuned_dir)
tokenizer.save_pretrained(fine_tuned_dir)

## 6. Test Inference

In [None]:
def generate_response(instruction, model, tokenizer):
    """Generate the model's answer to a single instruction.

    The instruction is wrapped in the ``### Instruction:`` / ``### Response:``
    prompt layout, up to 100 new tokens are sampled at temperature 0.7, and
    only the newly generated tokens are decoded and returned.

    Args:
        instruction: Instruction text to insert into the prompt.
        model: Model exposing ``device``, ``training``, ``eval()``,
            ``train(mode)`` and ``generate(...)``. ``generate`` is expected to
            return sequences that start with the prompt tokens.
        tokenizer: Tokenizer used to encode the prompt and decode the output;
            must expose ``pad_token_id`` and ``eos_token_id``.

    Returns:
        The decoded continuation, without the prompt and without special
        tokens.
    """
    prompt = f"### Instruction:\n{instruction}\n\n### Response:\n"

    inputs = tokenizer(prompt, return_tensors="pt")
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    prompt_length = inputs["input_ids"].shape[-1]

    # Generate in eval mode so dropout layers are inactive, then put the
    # model back into whatever mode the caller left it in.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=100,
                temperature=0.7,
                do_sample=True,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id
            )
    finally:
        model.train(was_training)

    # Drop the prompt by token count rather than by character count: decoding
    # does not always reproduce the prompt string exactly, so slicing the
    # decoded text with len(prompt) can cut into the response.
    generated_tokens = outputs[0][prompt_length:]
    return tokenizer.decode(generated_tokens, skip_special_tokens=True)

# Test the model with a single sample instruction
test_instruction = "What are the benefits of exercise?"
response = generate_response(
    instruction=test_instruction,
    model=model,
    tokenizer=tokenizer,
)
print(f"Instruction: {test_instruction}")
print(f"\nResponse: {response}")

## 7. Evaluation Metrics

In [None]:
import numpy as np

# Evaluate the model on the evaluation dataset given to the trainer
eval_results = trainer.evaluate()

# Print every reported metric with four decimal places
print("Evaluation Results:")
for metric_name, metric_value in eval_results.items():
    print(f"{metric_name}: {metric_value:.4f}")

# Calculate perplexity as exp(eval loss) when a loss was reported
if 'eval_loss' in eval_results:
    eval_loss = eval_results['eval_loss']
    perplexity = np.exp(eval_loss)
    print(f"Perplexity: {perplexity:.2f}")

## Summary

This notebook demonstrated:
1. Loading a pre-trained model
2. Applying PEFT (LoRA) for efficient fine-tuning
3. Preparing and formatting a dataset
4. Training the model
5. Running inference with the fine-tuned model
6. Evaluating model performance

For production use, refer to the main training script which includes:
- Advanced configuration options
- Multi-GPU support
- Experiment tracking
- Comprehensive evaluation metrics