# 🧙‍♂️ Welcome to Fine-Tuning with LLaMA3.2 🦙

In this notebook, we'll explore how to fine-tune the **LLaMA3.2** language model using **parameter-efficient fine-tuning** (PEFT) techniques such as LoRA. The notebook also demonstrates how to build a retrieval-augmented generation (RAG) system and how to evaluate the fine-tuned models using popular NLP metrics.

Let's embark on a magical journey, much like Harry Potter's adventures! 🎩✨

![Harry Potter](https://upload.wikimedia.org/wikipedia/en/7/7a/Harry_Potter_and_the_Philosopher%27s_Stone_banner.jpg)

In [None]:
# Cell 2: Install Required Packages
# Code Cell:
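# Uncomment to install the dependencies (the import cell below also needs evaluate, langchain-community and langchain-ollama):
# %pip install transformers torch peft datasets evaluate scikit-learn nltk rouge-score meteor-score langchain langchain-community langchain-ollama faiss-cpu matplotlib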

### Step 1: Import Necessary Libraries
Below, we import the libraries required for data processing, model fine-tuning, and evaluation.

In [None]:
import os
import re
import json
import torch
import evaluate
import numpy as np
import matplotlib.pyplot as plt

from datasets import load_dataset, DatasetDict
from sklearn.metrics import f1_score
from rouge_score import rouge_scorer
from langchain.schema import Document
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from torch.utils.data import DataLoader
from langchain_ollama import ChatOllama
from langchain.document_loaders import PyPDFLoader
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.meteor_score import meteor_score
from peft import get_peft_model, LoraConfig, TaskType, prepare_model_for_kbit_training, PeftModel, PeftConfig
from langchain_community.embeddings import OllamaEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter

from transformers import (
    AutoModelForCausalLM, 
    AutoTokenizer, 
    Trainer, 
    TrainingArguments, 
    BitsAndBytesConfig
)

In [None]:
# `os` is already imported in the import cell above; repeated here so the cell runs on its own.
import os
# Configure the PyTorch MPS (Apple GPU) allocator's high-watermark ratio.
# NOTE(review): per the PyTorch MPS environment-variable docs, '0.0' is understood to
# disable the allocator's upper memory limit (risking system-wide OOM) — confirm for
# the installed PyTorch version, and confirm it is set before the first MPS allocation.
os.environ['PYTORCH_MPS_HIGH_WATERMARK_RATIO'] = '0.0'

### Step 2: Define Constants
Here, we specify paths and configuration parameters for the model, datasets, and output directories.


In [None]:
# Root directory under which each fine-tuned model is saved.
OUTPUT_DIR = "models"
# JSONL file of question/answer records used for fine-tuning and for building the vector store.
TRAINING_DATA_PATH = "data/train.jsonl"
# JSONL file of question/answer records used for evaluation.
EVALUATION_DATA_PATH = "data/test.jsonl"
# Ollama model name passed to OllamaEmbeddings when building the vector store.
# NOTE(review): the notebook title refers to LLaMA3.2 while this selects "mistral" — confirm which is intended.
MODEL = "mistral"


### Step 3: Utility Functions
We define helper functions for text preprocessing, data saving, and evaluation.

In [None]:
from typing import Any, Dict, List

def preprocess_text(page_text):
    """Clean raw text by dropping non-ASCII runs and normalising whitespace.

    Every run of non-ASCII characters is replaced by one space; afterwards each
    run of whitespace (spaces, tabs, newlines) collapses to a single space and
    the result is trimmed at both ends.
    """
    ascii_only = re.sub(r"[^\x00-\x7F]+", " ", page_text)
    single_spaced = re.sub(r"\s+", " ", ascii_only)
    return single_spaced.strip()

### Step 4: Load Training and Evaluation Data
We load the training and evaluation datasets for fine-tuning and testing.

In [None]:

def load_jsonl_data(file_path: str) -> List[Dict[str, Any]]:
    """
    Load data from a JSONL file (one JSON document per line).

    The file is always decoded as UTF-8 so results do not depend on the
    platform's default encoding, and blank / whitespace-only lines (for
    example a trailing newline at the end of the file) are skipped instead
    of raising a JSON decode error.

    Args:
        file_path (str): Path to the JSONL file

    Returns:
        List[Dict[str, Any]]: List of data entries, in file order

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        return [json.loads(line) for line in f if line.strip()]
    
def prepare_data(file_path: str) -> List[Dict[str, str]]:
    """
    Read a JSONL file and convert its records into prompt/response pairs.

    Each record's 'question' becomes 'prompt' and its 'answer' becomes
    'response'; both are passed through preprocess_text first.

    Args:
        file_path (str): Path to a JSONL file whose records carry
            'question' and 'answer' fields

    Returns:
        List[Dict[str, str]]: One {'prompt', 'response'} dict per record,
        in file order
    """
    return [
        {
            'prompt': preprocess_text(record['question']),
            'response': preprocess_text(record['answer']),
        }
        for record in load_jsonl_data(file_path=file_path)
    ]

# Materialise both splits as lists of {'prompt', 'response'} dicts (see prepare_data).
training_dataset = prepare_data(TRAINING_DATA_PATH)
evaluation_dataset = prepare_data(EVALUATION_DATA_PATH)

In [None]:
# Sanity check: display the first two prepared prompt/response records.
training_dataset[:2]


### Step 5: Create Vector Store
Here, we use the `FAISS` library to create a vector store for retrieval-augmented generation (RAG).

In [None]:
# Embedding model and chunker shared by the whole retrieval index.
embeddings = OllamaEmbeddings(model=MODEL)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

# One text per training record, holding both the question and its answer so that
# a retrieved chunk carries the answer alongside the question it belongs to.
combined_texts = [
    f"question: {data['prompt']}\n answer: {data['response']}"
    for data in training_dataset
]

# Chunk each question/answer text separately and collect all chunks in order.
split_docs = []
for combined_text in combined_texts:
    qa_document = Document(page_content=combined_text)
    split_docs.extend(text_splitter.split_documents([qa_document]))

# Index the chunks in FAISS and expose the index as a retriever for the RAG chain.
vector_store = FAISS.from_documents(split_docs, embeddings)
retriever = vector_store.as_retriever()

### Step 6: Fine-Tune the Model
We fine-tune the LLaMA3.2 model using the following methods:
1. **LoRA**: Lightweight fine-tuning for adapters.
2. **Full Fine-Tuning**: Training all model parameters.

In [None]:
# Never paste an access token into the notebook: read it from the environment instead.
# HF_ACCESS_TOKEN is the name this notebook used originally; HF_TOKEN is the variable
# the Hugging Face Hub client reads, so either one is accepted.
hf_token = os.environ.get("HF_ACCESS_TOKEN") or os.environ.get("HF_TOKEN", "")
if hf_token:
    # Keep both names populated so existing code and the Hub client agree.
    os.environ["HF_ACCESS_TOKEN"] = hf_token
    os.environ.setdefault("HF_TOKEN", hf_token)


def fine_tune_model(
    training_file, 
    output_dir, 
    method="LoRA", 
    model_name="EleutherAI/gpt-neo-125m",
    **kwargs
):
    """
    Fine-tune a causal language model on question/answer pairs.

    The training file is split 90/10 into train/validation sets, every record
    is rendered into a fixed "Q: ... A: ..." prompt, and the model is trained
    with the Hugging Face Trainer. The final model (and tokenizer) is saved to
    ``output_dir``.

    Args:
    - training_file: Path to a JSON/JSONL file whose records have 'question'
      and 'answer' fields
    - output_dir: Directory to save checkpoints, logs and the fine-tuned model
    - method: Fine-tuning method. "LoRA" (case-insensitive, default) trains
      LoRA adapters on the q_proj/v_proj modules; any other value trains all
      model parameters
    - model_name: Base model to load from the Hugging Face Hub or a local path
    - **kwargs: Extra TrainingArguments options; these override the defaults
      chosen below (e.g. ``num_train_epochs=1``)

    Returns:
    - (model, tokenizer): the trained model and the tokenizer used for training
    """
    use_lora = str(method).lower() == "lora"

    # NOTE(review): trust_remote_code=True allows the model repository to run its own
    # Python code while loading the tokenizer; only use it with repositories you trust.
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    # The tokenizer is given the EOS token as its padding token.
    tokenizer.pad_token = tokenizer.eos_token

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16,
    )

    if use_lora:
        # Freezes the base weights and upcasts them for stable adapter training.
        model = prepare_model_for_kbit_training(model)
    else:
        # prepare_model_for_kbit_training freezes every parameter, which would leave
        # nothing to train for full fine-tuning; only upcast the weights instead.
        model = model.float()

    if torch.backends.mps.is_available():
        device = torch.device("mps")
    elif torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")
    model.to(device)

    if use_lora:
        lora_config = LoraConfig(
            r=16,
            lora_alpha=32,
            lora_dropout=0.1,
            bias="none",
            task_type="CAUSAL_LM",
            target_modules=["q_proj", "v_proj"]  # Adapt only these attention projections
        )
        model = get_peft_model(model, lora_config)

    def preprocess_function(examples):
        """
        Render a batch of question/answer pairs into prompts and tokenize them.

        Labels are a copy of the input ids with padded positions set to -100 so
        the language-modeling loss ignores padding.
        """
        inputs = [
            f"Harry Potter Knowledge Base\n"
            f"Task: Provide an accurate and concise answer.\n"
            f"Q: {preprocess_text(question)}\n"
            f"A: {preprocess_text(answer)}{tokenizer.eos_token}"
            for question, answer in zip(examples['question'], examples['answer'])
        ]

        tokenized_inputs = tokenizer(
            inputs, 
            truncation=True, 
            padding="max_length", 
            max_length=512,
            return_tensors="pt"
        )

        # pad_token == eos_token, so mask by attention_mask (not by token id): this
        # hides the padding from the loss while keeping the real end-of-answer EOS.
        labels = tokenized_inputs['input_ids'].clone()
        labels[tokenized_inputs['attention_mask'] == 0] = -100
        tokenized_inputs['labels'] = labels

        return tokenized_inputs

    # Load the data and hold out 10% for validation (fixed seed for reproducibility).
    dataset = load_dataset("json", data_files=training_file)
    split_data = dataset['train'].train_test_split(test_size=0.1, seed=42)
    tokenized_data = DatasetDict({
        'train': split_data['train'],
        'validation': split_data['test']
    }).map(
        preprocess_function, 
        batched=True, 
        remove_columns=['question', 'answer']
    )
    eval_dataset = tokenized_data['validation']

    # Defaults first, then caller overrides, so passing e.g. num_train_epochs through
    # **kwargs replaces the default instead of raising a duplicate-keyword TypeError.
    training_options = dict(
        output_dir=output_dir,
        num_train_epochs=5,
        per_device_train_batch_size=4,
        gradient_accumulation_steps=2,
        learning_rate=2e-5,
        weight_decay=0.01,
        warmup_steps=100,
        logging_dir=f"{output_dir}/logs",
        logging_steps=50,
        save_strategy="steps",
        save_steps=500,
        eval_steps=500,
        load_best_model_at_end=True,
        metric_for_best_model="loss",
    )
    if "eval_strategy" not in kwargs and "evaluation_strategy" not in kwargs:
        # Newer transformers releases rename `evaluation_strategy` to `eval_strategy`;
        # use whichever field this installed version declares.
        declared_fields = getattr(TrainingArguments, "__dataclass_fields__", {})
        strategy_key = "eval_strategy" if "eval_strategy" in declared_fields else "evaluation_strategy"
        training_options[strategy_key] = "steps"
    training_options.update(kwargs)
    training_args = TrainingArguments(**training_options)

    # NOTE(review): recent transformers versions prefer `processing_class=` over
    # `tokenizer=` here — confirm against the installed version.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_data['train'],
        eval_dataset=eval_dataset,
        tokenizer=tokenizer
    )

    trainer.train()

    # Persist the final model to output_dir.
    trainer.save_model(output_dir)

    return model, tokenizer

# Helper function for text preprocessing
def preprocess_text(text: str) -> str:
    """
    Normalise text by lower-casing it and trimming surrounding whitespace.

    Note: an earlier cell in this notebook defines a function with the same
    name; once this cell runs, this definition replaces it in the notebook
    namespace.
    """
    lowered = text.lower()
    return lowered.strip()

In [None]:
# Fine-tuning runs to execute: display label -> keyword arguments for fine_tune_model.
methods = {
        "LoRA": {
            "output_dir": os.path.join(OUTPUT_DIR, "lora_fine_tuned_model_new"),
            "method": "LoRA",
        },

        # Disabled configurations, kept for reference.
        # NOTE(review): fine_tune_model forwards unrecognised keys (such as "peft_config")
        # to TrainingArguments through **kwargs, so the "PEFT" entry needs rework before
        # it is re-enabled.
        # "PEFT": {
        #     "output_dir": os.path.join(OUTPUT_DIR, "peft_fine_tuned_model"),
        #     "method": "peft",
        #     "peft_config": {"task_type": "seq2seq", "adapter_hidden_size": 64},
        # },
        # "Full": {
        #     "output_dir": os.path.join(OUTPUT_DIR, "full_fine_tuned_model"),
        #     "method": "full",
        # }
    }

In [None]:
# fine_tune_model returns a (model, tokenizer) pair, so unpack it; assigning the whole
# tuple to `model` would leave later cells holding a tuple rather than a model.
# After the loop, `model` / `tokenizer` refer to the last method that was trained.
model, tokenizer = None, None
for method_name, params in methods.items():
    model, tokenizer = fine_tune_model(training_file=TRAINING_DATA_PATH, **params)

### Step 7: Evaluate the Fine-Tuned Models
We evaluate each fine-tuned model and compare the results using metrics such as BLEU, ROUGE, and F1 scores.

In [None]:

from collections import Counter
from typing import Any, Dict, List, Optional


def _token_f1(prediction: str, reference: str) -> float:
    """Token-overlap F1 between one prediction and one reference (case-insensitive, whitespace tokens)."""
    pred_tokens = prediction.lower().split()
    ref_tokens = reference.lower().split()
    if not pred_tokens or not ref_tokens:
        # Two empty strings agree; an empty string against a non-empty one does not.
        return float(pred_tokens == ref_tokens)
    overlap = sum((Counter(pred_tokens) & Counter(ref_tokens)).values())
    if overlap == 0:
        return 0.0
    precision = overlap / len(pred_tokens)
    recall = overlap / len(ref_tokens)
    return 2 * precision * recall / (precision + recall)


def evaluate_model(
    model_path: str, 
    evaluation_file: str, 
    model_type: str = "LoRA", 
    evaluation_metrics: Optional[List[str]] = None,
    max_eval_samples: Optional[int] = None,
    max_new_tokens: int = 128
) -> Dict[str, Any]:
    """
    Evaluate a fine-tuned model on a question/answer file with several metrics.

    Each question is rendered with the same prompt layout and the same
    preprocess_text normalisation that fine_tune_model uses for training,
    the model generates a continuation greedily, and only the newly generated
    text (not the prompt) is compared with the normalised reference answer.

    Args:
        model_path (str): Path to the fine-tuned model directory
        evaluation_file (str): Path to a JSON/JSONL file whose records have
            'question' and 'answer' fields
        model_type (str): 'LoRA' (case-insensitive) loads a PEFT adapter on
            top of its base model; anything else loads model_path directly
        evaluation_metrics (List[str], optional): Metrics to compute; any of
            'exact_match', 'f1_score', 'bleu', 'rouge', 'meteor'
            (all of them by default)
        max_eval_samples (int, optional): Limit number of evaluation samples
        max_new_tokens (int): Maximum number of tokens generated per answer

    Returns:
        Dict[str, Any]: Metric name -> score. 'rouge' maps to a dict with
        'rouge1', 'rouge2' and 'rougeL'; 'f1_score' is the mean token-overlap
        F1. Metrics that fail to compute are omitted. An empty dict is
        returned when the model or the dataset cannot be loaded.
    """
    if evaluation_metrics is None:
        evaluation_metrics = [
            'exact_match', 
            'f1_score', 
            'bleu', 
            'rouge', 
            'meteor'
        ]

    # Determine the best available device
    device = torch.device(
        "mps" if torch.backends.mps.is_available() 
        else "cuda" if torch.cuda.is_available() 
        else "cpu"
    )
    print(f"Using device: {device}")

    # Half precision is not reliably supported for CPU inference, so only use it on accelerators.
    load_dtype = torch.float32 if device.type == "cpu" else torch.float16
    is_lora = model_type.lower() == "lora"

    # Load tokenizer and model. Any failure here is reported and yields an empty
    # result: silently falling back to a different base model or to a freshly
    # initialised adapter would report scores for a model that was never fine-tuned.
    try:
        if is_lora:
            base_model_name = PeftConfig.from_pretrained(model_path).base_model_name_or_path
        else:
            base_model_name = model_path

        tokenizer = AutoTokenizer.from_pretrained(base_model_name)
        tokenizer.pad_token = tokenizer.eos_token

        base_model = AutoModelForCausalLM.from_pretrained(
            base_model_name, 
            torch_dtype=load_dtype,
            device_map={"": device}
        )

        if is_lora:
            model = PeftModel.from_pretrained(
                base_model, 
                model_path,
                torch_dtype=load_dtype,
                device_map={"": device}
            )
            # Merging is an optimisation only; the unmerged PEFT model still works.
            try:
                model = model.merge_and_unload()
            except Exception as merge_e:
                print(f"Warning: Could not merge LoRA weights: {merge_e}")
        else:
            model = base_model

        model.to(device)
        model.eval()

        # Verify model can generate
        print("Checking model generation capability...")
        test_input = tokenizer("Test input", return_tensors="pt").to(device)
        try:
            _ = model.generate(**test_input, max_length=10)
            print("Model generation verified successfully.")
        except Exception as gen_e:
            print(f"Generation test failed: {gen_e}")
            raise

    except Exception as e:
        print(f"Comprehensive model loading error: {e}")
        return {}

    # Load evaluation dataset
    try:
        eval_dataset = load_dataset("json", data_files=evaluation_file)['train']
    except Exception as e:
        print(f"Error loading evaluation dataset: {e}")
        return {}

    # Limit evaluation samples if specified
    if max_eval_samples:
        eval_dataset = eval_dataset.select(range(min(len(eval_dataset), max_eval_samples)))

    def generate_response(input_text: str) -> str:
        """
        Generate the model's answer for one formatted question.

        Args:
            input_text (str): The "Q: ..." line to answer

        Returns:
            str: Only the newly generated text, stripped ("" on failure)
        """
        # Same layout as the training prompts, ending with the answer cue.
        formatted_input = (
            f"Harry Potter Knowledge Base\n"
            f"Task: Provide an accurate and concise answer.\n"
            f"{input_text}\n"
            f"A:"
        )

        inputs = tokenizer(
            formatted_input, 
            return_tensors="pt", 
            padding=True, 
            truncation=True, 
            max_length=512
        ).to(device)

        with torch.no_grad():
            try:
                # max_new_tokens (rather than max_length) guarantees room to generate
                # even for long prompts. Decoding stays greedy, which is what the
                # previous sampling arguments amounted to without do_sample=True.
                output = model.generate(
                    **inputs,
                    max_new_tokens=max_new_tokens, 
                    num_return_sequences=1, 
                    pad_token_id=tokenizer.eos_token_id,
                    no_repeat_ngram_size=2
                )
            except Exception as e:
                print(f"Error generating response: {e}")
                return ""

        # The output repeats the prompt tokens first; score only the continuation.
        prompt_length = inputs["input_ids"].shape[1]
        return tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True).strip()

    predictions = []
    references = []

    # Evaluation loop
    for item in eval_dataset:
        # Apply the normalisation used when the training prompts were built.
        question = preprocess_text(item['question'])
        pred = generate_response(f"Q: {question}")
        ref = preprocess_text(item['answer'])

        print(f"Question: {item['question']}")
        print(f"Prediction: {pred}")
        print(f"Reference: {ref}\n")

        predictions.append(pred)
        references.append(ref)

    # Compute metrics; each metric is loaded only when requested so that one
    # unavailable metric cannot prevent the others from being reported.
    results = {}
    for metric_name in evaluation_metrics:
        try:
            if metric_name == 'exact_match':
                results['exact_match'] = evaluate.load("exact_match").compute(
                    predictions=predictions, 
                    references=references
                )['exact_match']

            elif metric_name == 'f1_score':
                # The Hub "f1" metric scores class labels, not free text, so use
                # token-overlap F1 averaged over the evaluation examples.
                f1_values = [_token_f1(p, r) for p, r in zip(predictions, references)]
                results['f1_score'] = sum(f1_values) / len(f1_values) if f1_values else 0.0

            elif metric_name == 'bleu':
                results['bleu'] = evaluate.load("bleu").compute(
                    predictions=predictions, 
                    references=references
                )['bleu']

            elif metric_name == 'rouge':
                rouge_results = evaluate.load("rouge").compute(
                    predictions=predictions, 
                    references=references
                )
                results['rouge'] = {
                    'rouge1': rouge_results['rouge1'],
                    'rouge2': rouge_results['rouge2'],
                    'rougeL': rouge_results['rougeL']
                }

            elif metric_name == 'meteor':
                results['meteor'] = evaluate.load("meteor").compute(
                    predictions=predictions, 
                    references=references
                )['meteor']

            else:
                print(f"Unknown metric '{metric_name}' skipped.")

        except Exception as e:
            print(f"Error computing {metric_name} metric: {e}")

    return results

In [None]:
# Saved model directories to evaluate, keyed by a label that is also passed as model_type.
models = {
    "LoRA": os.path.join(OUTPUT_DIR, "lora_fine_tuned_model_new"),
    #"PEFT": os.path.join(OUTPUT_DIR, "peft_fine_tuned_model"),
    #"Full": os.path.join(OUTPUT_DIR, "full_fine_tuned_model")
}

# Evaluate every saved model on the evaluation file; results are stored per label.
evaluation_results = {}
for method_name, model_path in models.items():
    evaluation_results[method_name] = evaluate_model(
        model_path=model_path,
        evaluation_file=EVALUATION_DATA_PATH,
        model_type=method_name,    
    )

In [None]:
# Display the metrics computed for the LoRA fine-tuned model.
evaluation_results["LoRA"]

In [None]:
# Build the RAG baseline. ChatOllama expects the *name* of an Ollama model, so pass
# MODEL rather than the fine-tuned model object returned by the training cell.
# The context window option is `num_ctx`.
# NOTE(review): the previous `device="cpu"` argument was dropped because ChatOllama is
# not known to expose a device option — confirm against the installed langchain-ollama.
rag_llm = ChatOllama(model=MODEL, num_ctx=2048, temperature=0.7)
rag_qa_chain = RetrievalQA.from_chain_type(llm=rag_llm, retriever=retriever, return_source_documents=True)

# evaluate_model() loads a checkpoint from disk and therefore cannot score a chain, so
# query the chain directly. evaluation_dataset holds {'prompt', 'response'} records.
rag_predictions = []
rag_references = []
for rag_example in evaluation_dataset:
    rag_output = rag_qa_chain.invoke({"query": rag_example["prompt"]})
    rag_predictions.append(rag_output["result"].strip())
    rag_references.append(rag_example["response"])

# Score with the same metric names that evaluate_model reports.
rag_metrics = {}
for rag_metric_name in ("exact_match", "bleu", "meteor"):
    try:
        rag_metrics[rag_metric_name] = evaluate.load(rag_metric_name).compute(
            predictions=rag_predictions,
            references=rag_references
        )[rag_metric_name]
    except Exception as rag_metric_error:
        print(f"Error computing {rag_metric_name} metric: {rag_metric_error}")

try:
    rag_rouge = evaluate.load("rouge").compute(
        predictions=rag_predictions,
        references=rag_references
    )
    rag_metrics["rouge"] = {
        "rouge1": rag_rouge["rouge1"],
        "rouge2": rag_rouge["rouge2"],
        "rougeL": rag_rouge["rougeL"]
    }
except Exception as rag_metric_error:
    print(f"Error computing rouge metric: {rag_metric_error}")

evaluation_results["RAG"] = rag_metrics

### Step 8: Visualize Results
Finally, we display the evaluation metrics in a clear and concise format.

In [None]:
# Print each method's metrics: numbers with four decimals, everything else as-is
# (for example the nested ROUGE dictionary).
for method, metrics in evaluation_results.items():
    print(f"\nMethod: {method}")
    for metric, value in metrics.items():
        if isinstance(value, (int, float)):
            print(f"{metric}: {value:.4f}")
        else:
            print(f"{metric}: {value}")

In [None]:
# Bar chart per metric, comparing every evaluated method.
# The metric keys match what evaluate_model stores ("bleu", not "bleu_score").
metrics = ["exact_match", "f1_score", "bleu"]
for metric in metrics:
    # Take the methods from evaluation_results (not from the fine-tuning `methods`
    # config) so labels and heights always line up, and skip any method for which
    # this metric could not be computed.
    metric_scores = {
        method_label: method_results[metric]
        for method_label, method_results in evaluation_results.items()
        if isinstance(method_results.get(metric), (int, float))
    }
    if not metric_scores:
        print(f"No {metric} scores available to plot.")
        continue

    fig, ax = plt.subplots()
    ax.bar(list(metric_scores.keys()), list(metric_scores.values()))
    ax.set_title(f"Comparison of {metric} Scores")
    ax.set_xlabel("Method")
    ax.set_ylabel(metric)
    plt.show()

### Step 9: Sample Question-Answer Showcase
To illustrate the differences between the models, we test them on the same sample questions and compare their responses.


In [None]:
from langchain_community.llms import HuggingFacePipeline
from transformers import pipeline

sample_questions = [
    "Who are friends of Harry potter?",
    "Who wrote 'Harry Potter'?",
    "who are Ron Weasely's parents?",
]

# Generate answers from each model
for method, model_path in models.items():
    print(f"\n=== {method} Model Responses ===")
    # Separate names keep this showcase from overwriting the `model` / `tokenizer`
    # produced by the training cell.
    sample_model = AutoModelForCausalLM.from_pretrained(model_path)
    sample_tokenizer = AutoTokenizer.from_pretrained(model_path)

    # RetrievalQA needs a LangChain LLM, not a raw transformers model, so wrap the
    # model in a text-generation pipeline and adapt it with HuggingFacePipeline.
    text_generator = pipeline(
        "text-generation",
        model=sample_model,
        tokenizer=sample_tokenizer,
        max_new_tokens=128,
    )
    qa_chain = RetrievalQA.from_chain_type(
        llm=HuggingFacePipeline(pipeline=text_generator),
        retriever=retriever,
        return_source_documents=True,
    )

    for question in sample_questions:
        # With return_source_documents=True the chain has two outputs, which .run()
        # does not support; invoke it and read the answer from "result".
        response = qa_chain.invoke({"query": question})["result"]
        print(f"Q: {question}\nA: {response}\n")