# 🧪 LoRA & Fine-Tuning Benchmark Suite

**Objective**: Compare LoRA, QLoRA, and other fine-tuning methods on Banking77 intent classification.

**Duration**: ~2-3 hours per full run

**GPU Required**: Yes (T4 or better)

Run cells sequentially. Save results to your Google Drive.

## 📦 Part 1: Setup & Dependencies

In [None]:
# Mount Google Drive (optional, for saving results)
# The mount is only attempted when the Colab helper package is importable, so
# this cell also runs outside Colab.
try:
    from google.colab import drive
except ImportError:
    print("Google Colab not detected; skipping Drive mount.")
else:
    drive.mount('/content/drive')

# Check GPU
import torch

gpu_available = torch.cuda.is_available()
print(f"GPU Available: {gpu_available}")
if gpu_available:
    # Device properties can only be queried when a CUDA device exists.
    gpu_props = torch.cuda.get_device_properties(0)
    print(f"GPU Name: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {gpu_props.total_memory / 1024 / 1024 / 1024:.1f} GB")
else:
    print("GPU Name: None")
    print("GPU Memory: N/A")

In [None]:
# Install all required packages
!pip install -q peft transformers datasets evaluate tensorboard bitsandbytes scikit-learn pandas numpy matplotlib seaborn

In [None]:
import torch
import numpy as np
import pandas as pd
import time
from datetime import datetime
from typing import Dict, List
import matplotlib.pyplot as plt
import seaborn as sns

from peft import LoraConfig, get_peft_model, TaskType
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
)
from datasets import load_dataset, DatasetDict
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

print("‚úÖ All imports successful!")

## 📥 Part 2: Load Banking77 Dataset

In [None]:
# Fetch the Banking77 intent-classification dataset
print("üì• Loading Banking77 dataset...")
dataset = load_dataset("banking77")

# Show the overall structure, one raw example and the size of the label set
train_split = dataset["train"]
label_feature = train_split.features["label"]

print(f"Dataset structure: {dataset}")
print(f"\nSample:")
print(train_split[0])
print(f"\nNumber of intent classes: {label_feature.num_classes}")

In [None]:
# Split into train/val/test: 80% / 10% / 10% of the original "train" split.
# Only splits derived from "train" are kept; any other split the loaded
# dataset had is dropped when `dataset` is rebuilt below.
# NOTE(review): confirm that discarding the dataset's own held-out split is intended.
#
# The guard makes the cell safe to re-run: without it a second run would split
# the already-reduced train split again and silently shrink the data.
if "validation" not in dataset:
    train_val = dataset["train"].train_test_split(test_size=0.2, seed=42)
    val_test = train_val["test"].train_test_split(test_size=0.5, seed=42)

    dataset = DatasetDict({
        "train": train_val["train"],
        "validation": val_test["train"],
        "test": val_test["test"],
    })
else:
    print("Dataset is already split into train/validation/test; skipping re-split.")

print(f"‚úÖ Train: {len(dataset['train'])}, Val: {len(dataset['validation'])}, Test: {len(dataset['test'])}")

In [None]:
# Tokenize dataset
# `model_name` is the checkpoint identifier; the benchmark loop further down
# reuses it to load the classification model that matches this tokenizer.
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(batch):
    """Encode the ``"text"`` entries of ``batch`` with the module-level tokenizer.

    Every sequence is padded to, and truncated at, 128 tokens so that all
    encoded examples have the same length.
    """
    encode_options = {
        "padding": "max_length",
        "max_length": 128,
        "truncation": True,
    }
    return tokenizer(batch["text"], **encode_options)

print("üîß Tokenizing dataset...")
# Apply `tokenize` to every split in batches; the raw "text" column is removed
# from the result.
map_options = {"batched": True, "remove_columns": ["text"]}
tokenized_dataset = dataset.map(tokenize, **map_options)

print(f"‚úÖ Tokenization complete!")
first_train_example = tokenized_dataset["train"][0]
print(f"Sample tokenized: {first_train_example}")

## 🔬 Part 3: Benchmark LoRA with Different Ranks

In [None]:
# Define LoRA configurations to test
import gc
import inspect

# One entry per benchmark run; lora_alpha is kept at 2 * r throughout.
lora_configs = [
    {"r": 4, "lora_alpha": 8, "lora_dropout": 0.05, "name": "LoRA-r4"},
    {"r": 8, "lora_alpha": 16, "lora_dropout": 0.05, "name": "LoRA-r8"},
    {"r": 16, "lora_alpha": 32, "lora_dropout": 0.1, "name": "LoRA-r16"},
    {"r": 32, "lora_alpha": 64, "lora_dropout": 0.1, "name": "LoRA-r32"},
]

results = []
device = "cuda" if torch.cuda.is_available() else "cpu"
use_cuda = device == "cuda"

# Size the classification head from the dataset instead of hard-coding 77.
num_labels = tokenized_dataset["train"].features["label"].num_classes

# Ground-truth test labels are identical for every run, so read them once.
test_labels = dataset["test"]["label"]

# Newer transformers releases call this argument `eval_strategy`; older ones
# only accept `evaluation_strategy`. Use whichever the installed version has.
eval_strategy_arg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

print(f"üöÄ Starting LoRA benchmarks on {device.upper()}\n")

model = trainer = None
for i, config in enumerate(lora_configs):
    print(f"\n{'='*60}")
    print(f"Benchmark {i+1}/{len(lora_configs)}: {config['name']}")
    print(f"{'='*60}")

    # Drop the previous run's model/trainer before loading the next one, so
    # leftovers do not inflate this run's peak-memory figure. The objects from
    # the final run stay bound to `model` / `trainer` after the loop.
    model = trainer = None
    gc.collect()
    if use_cuda:
        torch.cuda.empty_cache()

    # Load base model
    print("Loading model...")
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=num_labels,
    ).to(device)

    # Apply LoRA
    # DistilBERT names its attention projections q_lin / k_lin / v_lin /
    # out_lin; "q_proj" / "v_proj" match no module in this model.
    # NOTE(review): DistilBERT's classification head also has a
    # `pre_classifier` layer; confirm whether it should be listed in
    # `modules_to_save` so that it is trained as well.
    lora_config = LoraConfig(
        r=config["r"],
        lora_alpha=config["lora_alpha"],
        target_modules=["q_lin", "v_lin"],
        lora_dropout=config["lora_dropout"],
        bias="none",
        task_type=TaskType.SEQ_CLS,
    )

    model = get_peft_model(model, lora_config)

    # Count parameters
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total = sum(p.numel() for p in model.parameters())
    param_ratio = (trainable / total) * 100

    print(f"Parameters: {trainable:,} trainable / {total:,} total ({param_ratio:.2f}%)")

    # Setup trainer
    training_args = TrainingArguments(
        output_dir=f"./lora_{config['name']}",
        per_device_train_batch_size=32,
        per_device_eval_batch_size=32,
        num_train_epochs=3,
        learning_rate=2e-4,
        logging_steps=50,
        save_strategy="epoch",
        load_best_model_at_end=True,
        **{eval_strategy_arg: "epoch"},
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["validation"],
    )

    # Train with timing
    # CUDA memory statistics only exist when a GPU is in use.
    if use_cuda:
        torch.cuda.reset_peak_memory_stats()
    start_time = time.perf_counter()

    print("Training...")
    trainer.train()

    training_time = time.perf_counter() - start_time
    # Reported as 0.0 on CPU, where no CUDA memory is allocated.
    peak_memory = (
        torch.cuda.max_memory_allocated() / 1024 / 1024 / 1024 if use_cuda else 0.0
    )

    # Evaluate
    print("Evaluating...")
    predictions = trainer.predict(tokenized_dataset["test"])
    preds = np.argmax(predictions.predictions, axis=1)
    accuracy = accuracy_score(test_labels, preds)
    f1 = f1_score(test_labels, preds, average="weighted")

    # Store results
    result = {
        "method": "LoRA",
        "rank": config["r"],
        "lora_alpha": config["lora_alpha"],
        "dropout": config["lora_dropout"],
        "trainable_params": trainable,
        "total_params": total,
        "param_ratio": param_ratio,
        "training_time_sec": training_time,
        "peak_memory_gb": peak_memory,
        "accuracy": accuracy,
        "f1_score": f1,
    }
    results.append(result)

    print(f"\n‚úÖ Complete!")
    print(f"   Accuracy: {accuracy:.2%}")
    print(f"   F1 Score: {f1:.4f}")
    print(f"   Time: {training_time:.1f}s")
    print(f"   Memory: {peak_memory:.1f}GB")

    # Cleanup
    if use_cuda:
        torch.cuda.empty_cache()

print(f"\n{'='*60}")
print("‚úÖ All benchmarks complete!")

## 📊 Part 4: Results & Visualization

In [None]:
# Build the results table, ordered by LoRA rank
results_df = pd.DataFrame(results).sort_values("rank")

summary_columns = [
    "rank",
    "accuracy",
    "f1_score",
    "training_time_sec",
    "peak_memory_gb",
    "param_ratio",
]
print("\nüìã Benchmark Results:")
print(results_df[summary_columns].to_string())

# Write the full table (all columns) to CSV in the working directory
results_df.to_csv("lora_benchmark_results.csv", index=False)
print("\nüíæ Saved to lora_benchmark_results.csv")

In [None]:
# Draw a 2x2 summary figure: three line charts against rank plus one bar chart
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
fig.suptitle("LoRA Benchmark Results - Banking77", fontsize=16, fontweight="bold")

rank_values = results_df["rank"]

# Plots 1-3 share the same layout and differ only in data, marker/colour and
# labels: (axis, y-values, line style, y-label, title).
line_panels = [
    # Plot 1: Accuracy vs Rank (default line colour)
    (axes[0, 0], results_df["accuracy"] * 100, {"marker": "o"},
     "Accuracy (%)", "Accuracy vs LoRA Rank"),
    # Plot 2: Memory vs Rank
    (axes[0, 1], results_df["peak_memory_gb"], {"marker": "s", "color": "orange"},
     "Peak Memory (GB)", "Memory Usage vs LoRA Rank"),
    # Plot 3: Training Time vs Rank
    (axes[1, 0], results_df["training_time_sec"], {"marker": "^", "color": "green"},
     "Training Time (seconds)", "Training Time vs LoRA Rank"),
]

for panel, y_values, line_style, y_label, panel_title in line_panels:
    panel.plot(rank_values, y_values, linewidth=2, markersize=8, **line_style)
    panel.set_xlabel("LoRA Rank", fontsize=11)
    panel.set_ylabel(y_label, fontsize=11)
    panel.set_title(panel_title, fontsize=12, fontweight="bold")
    panel.grid(True, alpha=0.3)
    panel.set_xticks(rank_values)

# Plot 4: Parameter Efficiency (ranks shown as categorical bar labels)
bar_panel = axes[1, 1]
bar_panel.bar(rank_values.astype(str), results_df["param_ratio"], color="purple", alpha=0.7)
bar_panel.set_xlabel("LoRA Rank", fontsize=11)
bar_panel.set_ylabel("Trainable Parameters (%)", fontsize=11)
bar_panel.set_title("Parameter Efficiency", fontsize=12, fontweight="bold")
bar_panel.grid(True, alpha=0.3, axis="y")

plt.tight_layout()
plt.savefig("benchmark_results.png", dpi=150, bbox_inches="tight")
plt.show()

print("‚úÖ Visualization saved as benchmark_results.png")

## 🔍 Part 5: Analysis & Recommendations

In [None]:
print("\nüìä BENCHMARK ANALYSIS\n")

acc_series = results_df["accuracy"]
mem_series = results_df["peak_memory_gb"]
time_series = results_df["training_time_sec"]

# Run with the highest test accuracy
best_acc_idx = acc_series.idxmax()
best_acc = results_df.loc[best_acc_idx]
print(f"ü•á Best Accuracy: Rank {best_acc['rank']} with {best_acc['accuracy']:.2%}")

# Run with the best efficiency, defined as accuracy per MB of peak GPU memory
results_df["efficiency"] = acc_series / (mem_series * 1024)
best_eff_idx = results_df["efficiency"].idxmax()
best_eff = results_df.loc[best_eff_idx]
print(f"‚ö° Best Efficiency: Rank {best_eff['rank']} with {best_eff['accuracy']:.2%} accuracy at {best_eff['peak_memory_gb']:.1f}GB")

# Run with the shortest training time
fastest_idx = time_series.idxmin()
fastest = results_df.loc[fastest_idx]
print(f"üöÄ Fastest Training: Rank {fastest['rank']} in {fastest['training_time_sec']:.1f}s")

# Spread of peak memory across runs
min_mem, max_mem = mem_series.min(), mem_series.max()
print(f"\nüíæ Memory Range: {min_mem:.1f}GB - {max_mem:.1f}GB")

# Spread of accuracy across runs; the gain is the max-min difference times 100
min_acc, max_acc = acc_series.min(), acc_series.max()
acc_gain = (max_acc - min_acc) * 100
print(f"üìà Accuracy Range: {min_acc:.2%} - {max_acc:.2%} (gain: {acc_gain:.1f}%)")

# The recommendation is the most memory-efficient run found above
recommendation_lines = [
    "\n‚úÖ RECOMMENDATION:",
    f"   Use Rank {best_eff['rank']} for best balance of accuracy and efficiency",
    f"   - Achieves {best_eff['accuracy']:.2%} accuracy",
    f"   - Uses only {best_eff['peak_memory_gb']:.1f}GB GPU memory",
    f"   - Trains in {best_eff['training_time_sec']:.0f} seconds",
]
for line in recommendation_lines:
    print(line)

## 🗄️ Part 6: Vector Database Comparison (Optional)

In [None]:
# Install vector DB packages
!pip install -q faiss-cpu chromadb sentence-transformers

from sentence_transformers import SentenceTransformer
import time

# Load the sentence-embedding model
print("Loading embedding model...")
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Build the test corpus: five distinct banking queries, repeated
base_queries = [
    "I want to transfer money to my account",
    "What's my account balance",
    "How do I set up a loan",
    "Check my recent transactions",
    "I need to report a fraud",
]
repeat_count = 200  # 5 queries x 200 repeats = 1000 queries
banking_queries = base_queries * repeat_count

print(f"Generating {len(banking_queries)} embeddings...")
embeddings = embedding_model.encode(banking_queries, show_progress_bar=True)
print(f"‚úÖ Embeddings shape: {embeddings.shape}")

In [None]:
# Benchmark FAISS
import faiss

print("\nüî¨ Benchmarking FAISS...")

# Convert once to a contiguous float32 array and reuse it for both indexing
# and querying.
vectors = np.ascontiguousarray(embeddings, dtype=np.float32)

index = faiss.IndexFlatL2(vectors.shape[1])

# Indexing
# perf_counter is a monotonic, high-resolution clock, which suits these short
# measurements better than wall-clock time.time().
start = time.perf_counter()
index.add(vectors)
indexing_time = time.perf_counter() - start

# Query
# The ten query vectors are taken from the indexed vectors themselves.
queries = vectors[0:10]
start = time.perf_counter()
distances, indices = index.search(queries, k=5)
query_time = (time.perf_counter() - start) / len(queries) * 1000  # ms

print(f"‚úÖ FAISS Results:")
print(f"   Indexing: {indexing_time:.2f}s for {len(embeddings):,} vectors")
print(f"   Query: {query_time:.1f}ms per query")
print(f"   Memory: ~{embeddings.nbytes / 1024 / 1024:.1f}MB")

## 💾 Save Results to Drive

In [None]:
# Save files to Google Drive
import os
import shutil

drive_path = "/content/drive/MyDrive/Banking_LLM_Benchmarks"
drive_root = "/content/drive/MyDrive"
result_files = ["lora_benchmark_results.csv", "benchmark_results.png"]
saved_files = []

# Only report success for files that were actually copied. If the Drive folder
# is absent, creating the target path would just write to local disk.
# NOTE(review): assumes a mounted Drive always exposes /content/drive/MyDrive — confirm.
if not os.path.isdir(drive_root):
    print(f"WARNING: {drive_root} not found; mount Google Drive first. Nothing was saved.")
else:
    os.makedirs(drive_path, exist_ok=True)

    # Copy results
    for file_name in result_files:
        if os.path.isfile(file_name):
            shutil.copy(file_name, drive_path)
            saved_files.append(file_name)
        else:
            print(f"WARNING: {file_name} not found; run the cell that creates it first.")

    if saved_files:
        print(f"‚úÖ Results saved to Drive: {drive_path}")
        for file_name in saved_files:
            print(f"   - {file_name}")