# Model Comparison: Base vs Fine-tuned Gemma-2-2B

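This notebook compares responses from the original `google/gemma-2-2b` model with those from our fine-tuned `djohnston5/gemma-2-2b-sft` checkpoint (the exact training run is selected by `FINETUNED_MODEL_NAME` in the configuration cell) on 10 Alpaca-style queries.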

In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from IPython.display import display, Markdown

# Report which accelerator is visible to PyTorch. This value is informational:
# load_model() passes device_map="auto" and does not read DEVICE.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print(f"Using device: {DEVICE}")

  from .autonotebook import tqdm as notebook_tqdm


Using device: cpu


  return torch._C._cuda_getDeviceCount() > 0


## Model Configuration

In [3]:
import gc

# Model identifiers handed to `from_pretrained` by load_model():
# the untouched base checkpoint and the fine-tuned run it is compared against.
BASE_MODEL_NAME = "google/gemma-2-2b"
FINETUNED_MODEL_NAME = "djohnston5/gemma-2-2b-sft_crisp-armadillo-20"
# Alternative fine-tuned run; swap it with the assignment above to compare that one instead.
# FINETUNED_MODEL_NAME = "djohnston5/gemma-2-2b-sft_magic-sea-21"


def load_model(model_name: str):
    """Load a causal language model and its tokenizer.

    Args:
        model_name: Identifier (or local path) passed to ``from_pretrained``
            for both the model and the tokenizer.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        # `torch_dtype` is deprecated in the installed transformers release
        # (it emits "`torch_dtype` is deprecated! Use `dtype` instead!").
        # NOTE(review): releases that predate the rename only accept
        # `torch_dtype`; pin transformers accordingly.
        dtype=torch.bfloat16,
        device_map="auto",
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    return model, tokenizer


def unload_model(model):
    """Drop this function's reference to ``model`` and reclaim memory.

    Runs a garbage-collection pass and, when CUDA is available, empties
    PyTorch's CUDA cache, then prints a confirmation line.

    Note: ``del model`` only removes the local binding. If the caller still
    holds a name bound to the same object, the object stays alive until that
    name is deleted or rebound as well.
    """
    del model
    gc.collect()
    cuda_present = torch.cuda.is_available()
    if cuda_present:
        torch.cuda.empty_cache()
    print("Model unloaded, memory cleared.")

## Load Alpaca-Style Queries

Load 10 queries from `data/prompts/comparison_queries.json`.

In [4]:
import json
from pathlib import Path

# Location of the comparison prompts, relative to the notebook's working directory.
PROMPTS_PATH = Path("data/prompts/comparison_queries.json")

# Decode explicitly as UTF-8: without an encoding argument read_text() uses the
# platform's locale encoding, which can mangle non-ASCII prompts.
QUERIES = json.loads(PROMPTS_PATH.read_text(encoding="utf-8"))

# Fail here, with a clear message, rather than with a KeyError in the middle
# of a long inference run.
assert isinstance(QUERIES, list), f"{PROMPTS_PATH} must contain a JSON list of queries"
assert all(
    isinstance(query, dict) and "instruction" in query for query in QUERIES
), 'every query must be an object with an "instruction" key'

print(f"Loaded {len(QUERIES)} queries from {PROMPTS_PATH}")

Loaded 10 queries from data/prompts/comparison_queries.json


## Format Prompts & Generate Responses

In [5]:
def format_alpaca_prompt(instruction: str, input_text: str = "") -> str:
    """Render an Alpaca-style prompt ending in an open ``### Response:`` section.

    Args:
        instruction: Task description placed under ``### Instruction:``.
        input_text: Optional context. When falsy, the ``### Input:`` section
            is omitted and the shorter preamble is used.

    Returns:
        The prompt text, terminated by ``"### Response:\\n"``.
    """
    if not input_text:
        return (
            "Below is an instruction that describes a task. "
            "Write a response that appropriately completes the request.\n\n"
            f"### Instruction:\n{instruction}\n\n"
            "### Response:\n"
        )
    return (
        "Below is an instruction that describes a task, "
        "paired with an input that provides further context. "
        "Write a response that appropriately completes the request.\n\n"
        f"### Instruction:\n{instruction}\n\n"
        f"### Input:\n{input_text}\n\n"
        "### Response:\n"
    )


def generate_response(
    model: AutoModelForCausalLM,
    tokenizer: AutoTokenizer,
    prompt: str,
    max_new_tokens: int = 256,
    temperature: float = 0.7,
) -> str:
    """Generate the model's continuation of ``prompt``.

    Args:
        model: Causal LM used for generation.
        tokenizer: Tokenizer used to encode the prompt and decode the output.
        prompt: Full prompt text.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature. Values ``<= 0`` select greedy
            decoding, since sampling requires a strictly positive temperature.

    Returns:
        Only the newly generated text (prompt excluded), with special tokens
        removed and surrounding whitespace stripped.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_token_count = inputs["input_ids"].shape[-1]

    if temperature > 0:
        sampling_kwargs = {"do_sample": True, "temperature": temperature}
    else:
        sampling_kwargs = {"do_sample": False}

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            pad_token_id=tokenizer.eos_token_id,
            **sampling_kwargs,
        )

    # For a causal LM the returned sequence is the prompt tokens followed by
    # the new tokens. Slice by token count rather than by len(prompt) on the
    # decoded string: decoding is not guaranteed to reproduce the prompt
    # character-for-character, so a character offset can cut in the wrong place.
    generated_tokens = outputs[0][prompt_token_count:]
    return tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

## Run Comparison

Load one model at a time to conserve memory.

In [None]:
# Generation samples tokens, so fix the seed: each model starts its pass from
# the same RNG state and a re-run reproduces the same outputs.
SEED = 42


def _collect_responses(model_name: str, tag: str) -> list:
    """Load ``model_name``, answer every query in QUERIES, then release the model.

    Args:
        model_name: Identifier passed to ``load_model``.
        tag: Short label used in the progress messages.

    Returns:
        One generated response per query, in QUERIES order.
    """
    model, tokenizer = load_model(model_name)
    try:
        torch.manual_seed(SEED)
        responses = []
        for i, query in enumerate(QUERIES, 1):
            print(f"[{tag}] Processing query {i}/{len(QUERIES)}...")
            # "input" is optional in Alpaca-style records.
            prompt = format_alpaca_prompt(query["instruction"], query.get("input", ""))
            responses.append(generate_response(model, tokenizer, prompt))
        return responses
    finally:
        # Runs on interrupt/error too, so a stopped cell does not leave a
        # model resident when the cell is executed again.
        unload_model(model)
        # unload_model() can only drop its own reference; the weights are
        # actually released once these local names are gone as well.
        del model, tokenizer
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()


# --- Base model inference ---
print(f"Loading {BASE_MODEL_NAME}...")
base_responses = _collect_responses(BASE_MODEL_NAME, "Base")

# --- Fine-tuned model inference ---
print(f"\nLoading {FINETUNED_MODEL_NAME}...")
finetuned_responses = _collect_responses(FINETUNED_MODEL_NAME, "Fine-tuned")

# --- Combine results ---
results = [
    {
        "query_num": i,
        "instruction": query["instruction"],
        "input": query.get("input", ""),
        "base_response": base_resp,
        "finetuned_response": ft_resp,
    }
    for i, (query, base_resp, ft_resp) in enumerate(zip(QUERIES, base_responses, finetuned_responses), 1)
]

print("\nDone!")

Loading google/gemma-2-2b...


`torch_dtype` is deprecated! Use `dtype` instead!
Loading checkpoint shards: 100%|██████████| 3/3 [00:02<00:00,  1.11it/s]


[Base] Processing query 1/10...


KeyboardInterrupt: 

: 

## Display Results

In [None]:
# Render one Markdown block per query: the instruction, the optional input,
# then the two model responses under their own sub-headings.
for entry in results:
    sections = [
        f"\n---\n## Query {entry['query_num']}\n\n**Instruction:** {entry['instruction']}\n",
    ]
    # The input section only appears for queries that carry extra context.
    if entry["input"]:
        sections.append(f"\n**Input:** {entry['input']}\n")
    sections.append(
        f"\n### Base Model (`{BASE_MODEL_NAME}`)\n{entry['base_response']}\n"
        f"\n### Fine-tuned Model (`{FINETUNED_MODEL_NAME}`)\n{entry['finetuned_response']}\n"
    )
    display(Markdown("".join(sections)))

## Save Results to CSV (Optional)

In [None]:
import pandas as pd
from pathlib import Path

# Write one row per query (columns follow the keys of each result record),
# without the DataFrame index, into the current working directory.
output_path = Path("comparison_results.csv")
df = pd.DataFrame(results)
df.to_csv(output_path, index=False)
print(f"Results saved to {output_path}")