In [1]:
!pip install transformers datasets==3.5.1 torch bitsandbytes accelerate hf_xet --q
!pip install peft trl wandb --q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.4/491.4 kB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m23.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m25.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m29.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel, LoraConfig

from datasets import load_dataset
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import json

In [3]:
PATH_DIR = "/content/drive/MyDrive/coding/projects/LLM Finetuning + Benchmarking"
CACHE_DIR = PATH_DIR + "/hf_cache"
LORA_CHECKPOINT = PATH_DIR + "/finetune_output/checkpoint-800"

model_name = "HuggingFaceTB/SmolLM2-1.7B-Instruct"

In [4]:
# Check GPU availability
print(f"GPU: {torch.cuda.get_device_name() if torch.cuda.is_available() else 'None'}")
print(f"Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f}GB" if torch.cuda.is_available() else "No GPU")

GPU: Tesla T4
Memory: 15.8GB


In [5]:
# Configure 4-bit Quantization
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

In [6]:
# Load Base Model and Tokenizer
print(f"Loading base model: {model_name}...")
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=CACHE_DIR)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    device_map="auto", # Ensures efficient device placement
    torch_dtype=torch.float16,
    cache_dir=CACHE_DIR
)
print("Base model loaded.")

Loading base model: HuggingFaceTB/SmolLM2-1.7B-Instruct...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Base model loaded.


In [7]:
# Load LoRA Adapters from Checkpoint
print(f"Loading LoRA adapters from: {LORA_CHECKPOINT}...")
# This loads the LoRA adapters on top of the 4-bit base model
model = PeftModel.from_pretrained(model, LORA_CHECKPOINT)
print("LoRA adapters loaded successfully!")

Loading LoRA adapters from: /content/drive/MyDrive/coding/projects/LLM Finetuning + Benchmarking/finetune_output/checkpoint-800...
LoRA adapters loaded successfully!


In [8]:
print(model)

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(49152, 2048, padding_idx=2)
        (layers): ModuleList(
          (0-23): 24 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=2048, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              

In [9]:
# Load BoolQ dataset
print("Loading BoolQ dataset...")
boolq = load_dataset("boolq", split="validation", cache_dir=CACHE_DIR)  # Use validation split for faster testing
print(f"Loaded {len(boolq)} examples")

Loading BoolQ dataset...


README.md:   0%|          | 0.00/6.57k [00:00<?, ?B/s]

Loaded 3270 examples


In [10]:
def format_boolq_prompt(passage, question):
    """Format BoolQ example using proper chat template"""
    messages = [
        {
            "role": "user",
            "content": f"Read this passage and answer the question with only 'Yes' or 'No'.\n\nPassage: {passage}\n\nQuestion: {question}"
        }
    ]

    # Use the model's chat template
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    return prompt

print(format_boolq_prompt(boolq[0]["passage"], boolq[0]["question"]))

<|im_start|>system
You are a helpful AI assistant named SmolLM, trained by Hugging Face<|im_end|>
<|im_start|>user
Read this passage and answer the question with only 'Yes' or 'No'.

Passage: All biomass goes through at least some of these steps: it needs to be grown, collected, dried, fermented, distilled, and burned. All of these steps require resources and an infrastructure. The total amount of energy input into the process compared to the energy released by burning the resulting ethanol fuel is known as the energy balance (or ``energy returned on energy invested''). Figures compiled in a 2007 report by National Geographic Magazine point to modest results for corn ethanol produced in the US: one unit of fossil-fuel energy is required to create 1.3 energy units from the resulting ethanol. The energy balance for sugarcane ethanol produced in Brazil is more favorable, with one unit of fossil-fuel energy required to create 8 from the ethanol. Energy balance estimates are not easily prod

In [11]:
def evaluate_boolq_sample(example):
    """Evaluate a single BoolQ example"""
    passage = example['passage']
    question = example['question']
    true_answer = "Yes" if example['answer'] else "No"

    prompt = format_boolq_prompt(passage, question)

    # Tokenize input
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    # Generate response
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=10,  # We only need "Yes" or "No"
            temperature=0.1,   # Low temperature for consistent answers
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    # Decode response
    response = tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True)
    response = response.strip()

    # Extract Yes/No from response
    predicted_answer = None
    if "yes" in response.lower():
        predicted_answer = "Yes"
    elif "no" in response.lower():
        predicted_answer = "No"
    else:
        predicted_answer = "Unknown"

    return {
        'question': question,
        'true_answer': true_answer,
        'predicted_answer': predicted_answer,
        'raw_response': response,
        'correct': predicted_answer == true_answer
    }

In [12]:
# Run evaluation on subset (to save time and compute)
print("Starting evaluation...")
num_samples = 1000  # Evaluate on first 1000 examples
results = []

for i in tqdm(range(min(num_samples, len(boolq))), desc="Evaluating"):
    try:
        result = evaluate_boolq_sample(boolq[i])
        results.append(result)
    except Exception as e:
        print(f"Error on sample {i}: {e}")
        continue

Starting evaluation...


Evaluating:   0%|          | 0/1000 [00:00<?, ?it/s]

In [13]:
results_df = pd.DataFrame(results)
results_df.predicted_answer.value_counts()

Unnamed: 0_level_0,count
predicted_answer,Unnamed: 1_level_1
Yes,603
No,391
Unknown,6


In [14]:
# Calculate accuracy
correct_predictions = sum(1 for r in results if r['correct'])
total_predictions = len(results)
accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0

print(f"\n=== Finetune RESULTS ===")
print(f"Model: {model_name} (4-bit quantized QLORA)")
print(f"Dataset: BoolQ validation set")
print(f"Samples evaluated: {total_predictions}")
print(f"Correct predictions: {correct_predictions}")
print(f"Accuracy: {accuracy:.3f} ({accuracy*100:.1f}%)")


=== Finetune RESULTS ===
Model: HuggingFaceTB/SmolLM2-1.7B-Instruct (4-bit quantized QLORA)
Dataset: BoolQ validation set
Samples evaluated: 1000
Correct predictions: 794
Accuracy: 0.794 (79.4%)


In [15]:
# Show some examples
print(f"\n=== SAMPLE PREDICTIONS ===")
for i, result in enumerate(results[:5]):
    print(f"\nExample {i+1}:")
    print(f"Question: {result['question']}")
    print(f"True Answer: {result['true_answer']}")
    print(f"Predicted: {result['predicted_answer']}")
    print(f"Raw Response: '{result['raw_response']}'")
    print(f"Correct: {result['correct']}")


=== SAMPLE PREDICTIONS ===

Example 1:
Question: does ethanol take more energy make that produces
True Answer: No
Predicted: Yes
Raw Response: 'Yes'
Correct: False

Example 2:
Question: is house tax and property tax are same
True Answer: Yes
Predicted: No
Raw Response: 'No'
Correct: False

Example 3:
Question: is pain experienced in a missing body part or paralyzed area
True Answer: Yes
Predicted: Yes
Raw Response: 'Yes'
Correct: True

Example 4:
Question: is harry potter and the escape from gringotts a roller coaster ride
True Answer: Yes
Predicted: Yes
Raw Response: 'Yes'
Correct: True

Example 5:
Question: is there a difference between hydroxyzine hcl and hydroxyzine pam
True Answer: Yes
Predicted: Yes
Raw Response: 'Yes'
Correct: True


In [17]:
# Save results for comparison later
finetuned_results = {
    'model': model_name,
    'quantization': '4-bit',
    'dataset': 'BoolQ',
    'samples': total_predictions,
    'accuracy': accuracy,
    'results': results
}

with open('finetuned_results.json', 'w') as f:
    json.dump(finetuned_results, f, indent=2)

print(f"Finetuned results saved to 'finetuned_results.json'")
print(f"Finetune accuracy: {accuracy:.3f}")

Finetuned results saved to 'finetuned_results.json'
Finetune accuracy: 0.794
