In [None]:
!pip install "gdown"
!pip install "unsloth"

import subprocess
import sys
import os
import warnings
import gdown
from datetime import datetime
import json
import time
import pandas as pd
from IPython.display import display, HTML, clear_output
import ipywidgets as widgets

warnings.filterwarnings('ignore')

In [2]:
# CUDA check
try:
    import torch
    print(f"✅ PyTorch: {torch.__version__}")
    
    if torch.cuda.is_available():
        gpu_name = torch.cuda.get_device_name(0)
        vram_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
        print(f"✅ GPU: {gpu_name}")
        print(f"✅ VRAM: {vram_gb:.1f} GB")
        
        if vram_gb < 8:
            print("⚠️ Warning: > 8GB VRAM causes fragmentation issues at current.")
    else:
        print("❌ No CUDA GPU detected!")
        
except ImportError:
    print("❌ PyTorch not found")

try:
    from unsloth import FastLanguageModel
    from peft import PeftModel
    print("✅ Unsloth and PEFT available")
except ImportError as e:
    print(f"❌ Missing dependencies: {e}")
    print("Installing required packages...")

    for package in packages:
        try:
            subprocess.check_call([sys.executable, "-m", "pip", "install", package])
        except:
            print(f"⚠️ Failed to install {package}")

✅ PyTorch: 2.5.1+cu121
✅ GPU: NVIDIA GeForce RTX 4090
✅ VRAM: 25.4 GB
🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
✅ Unsloth and PEFT available


## Inference config

In [3]:
CONFIG = {
    #defaults
    "base_model_name": "unsloth/llama-3-8b-Instruct",
    "finetuned_model_path": "llm",
    "max_seq_length": 2048,
    "max_new_tokens": 300,
    "temperature": 0.7,
    "top_p": 0.9,
    "repetition_penalty": 1.1,
    "load_in_4bit": False, 
}

# Auto-adjusts based on GPU
if torch.cuda.is_available():
    vram_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
    
    if vram_gb >= 20:  
        CONFIG.update({
            "max_seq_length": 4096,
            "max_new_tokens": 500,
            "load_in_4bit": False,
        })
    elif vram_gb < 10: 
        CONFIG.update({
            "max_seq_length": 512,
            "max_new_tokens": 200,
        })

In [4]:
# Helpers
def clear_gpu_memory():
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.synchronize()

def create_hansard_prompt(question):
    return f"""<|start_header_id|>system<|end_header_id|>
You are a helpful assistant specializing in parliamentary procedures and government documentation.<|eot_id|><|start_header_id|>user<|end_header_id|>
{question}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
"""

def format_time(seconds):
    if seconds < 1:
        return f"{seconds*1000:.0f}ms"
    else:
        return f"{seconds:.2f}s"

print("✅ Helper functions loaded")

✅ Helper functions loaded


## Model

In [5]:
clear_gpu_memory()

print("🔄 Loading base Llama-3-8B model...")
start_time = time.time()

try:
    # Load base model
    base_model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=CONFIG["base_model_name"],
        max_seq_length=CONFIG["max_seq_length"],
        dtype=torch.bfloat16 if not CONFIG["load_in_4bit"] else None,
        load_in_4bit=CONFIG["load_in_4bit"],
        device_map="auto"
    )
    
    FastLanguageModel.for_inference(base_model)
    
    load_time = time.time() - start_time
    memory_used = torch.cuda.memory_allocated() / 1e9 if torch.cuda.is_available() else 0
    
    print(f"✅ Base model loaded in {format_time(load_time)}")
    print(f"📊 GPU Memory: {memory_used:.1f} GB")
    
    # Set pad token
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    
except Exception as e:
    print(f"❌ Base model loading failed: {e}")
    raise

🔄 Loading base Llama-3-8B model...
==((====))==  Unsloth 2025.6.6: Fast Llama patching. Transformers: 4.52.4.
   \\   /|    NVIDIA GeForce RTX 4090. Num GPUs = 1. Max memory: 23.65 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 8.9. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/220 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/345 [00:00<?, ?B/s]

✅ Base model loaded in 206.19s
📊 GPU Memory: 16.1 GB


In [6]:
print("🔄 Loading fine-tuned Hansard model...")
start_time = time.time()

try:
    finetuned_model = PeftModel.from_pretrained(
        base_model, 
        CONFIG["finetuned_model_path"]
    )
    
    load_time = time.time() - start_time
    total_memory = torch.cuda.memory_allocated() / 1e9 if torch.cuda.is_available() else 0
    
    print(f"✅ Fine-tuned model loaded in {format_time(load_time)}")
    print(f"📊 Total GPU Memory: {total_memory:.1f} GB")
    
    model_loaded = True
    
except Exception as e:
    print(f"❌ Fine-tuned model loading failed: {e}")
    print("ℹ️ Will proceed with base model only for comparison")
    finetuned_model = None
    model_loaded = False

🔄 Loading fine-tuned Hansard model...
✅ Fine-tuned model loaded in 1.64s
📊 Total GPU Memory: 16.2 GB


## Single Question Comparison

In [7]:
def generate_response(model, prompt, model_name="Model"):
    """Generate response from a specific model"""
    try:
        inputs = tokenizer(
            [prompt], 
            return_tensors="pt", 
            truncation=True,
            max_length=CONFIG["max_seq_length"] - CONFIG["max_new_tokens"]
        ).to("cuda")
        
        start_time = time.time()
        
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=CONFIG["max_new_tokens"],
                do_sample=True,
                temperature=CONFIG["temperature"],
                top_p=CONFIG["top_p"],
                repetition_penalty=CONFIG["repetition_penalty"],
                pad_token_id=tokenizer.eos_token_id,
                use_cache=True
            )
        
        generation_time = time.time() - start_time
        
        response = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
        
        # Extract assistant response
        if "<|start_header_id|>assistant<|end_header_id|>" in response:
            response = response.split("<|start_header_id|>assistant<|end_header_id|>")[-1].strip()
        else:
            response = response[len(prompt):].strip()
        
        return {
            "response": response,
            "generation_time": generation_time,
            "input_tokens": inputs['input_ids'].shape[1],
            "output_tokens": outputs.shape[1],
            "success": True,
            "model_name": model_name
        }
        
    except Exception as e:
        return {
            "response": f"Error: {str(e)}",
            "generation_time": 0,
            "success": False,
            "model_name": model_name
        }

def compare_single_question(question):
    """Compare both models on a single question"""
    prompt = create_hansard_prompt(question)
    
    print(f"❓ Question: {question}")
    print("=" * 80)
    
    # Base model
    print("\n🔸 STANDARD LLAMA-3 RESPONSE:")
    print("-" * 40)
    base_result = generate_response(base_model, prompt, "Base Llama-3")
    
    if base_result["success"]:
        print(base_result["response"])
        print(f"\n⏱️ Time: {format_time(base_result['generation_time'])} | Tokens: {base_result.get('output_tokens', 0)}")
    else:
        print(f"❌ {base_result['response']}")
    
    # Fine-tuned model
    if model_loaded and finetuned_model is not None:
        print("\n🔹 FINE-TUNED HANSARD MODEL RESPONSE:")
        print("-" * 40)
        ft_result = generate_response(finetuned_model, prompt, "Fine-tuned Hansard")
        
        if ft_result["success"]:
            print(ft_result["response"])
            print(f"\n⏱️ Time: {format_time(ft_result['generation_time'])} | Tokens: {ft_result.get('output_tokens', 0)}")
            
            # Speed comparison
            speed_diff = ft_result['generation_time'] - base_result['generation_time']
            print(f"📈 Speed difference: {format_time(speed_diff)} ({'faster' if speed_diff < 0 else 'slower'})")
        else:
            print(f"❌ {ft_result['response']}")
            ft_result = None
    else:
        print("\n⚠️ Fine-tuned model not available")
        ft_result = None
    
    return {
        "question": question,
        "base_result": base_result,
        "finetuned_result": ft_result
    }

## Batch Comparison

Now let's run a comprehensive comparison across multiple parliamentary questions.

In [8]:
TEST_QUESTIONS = [
    "Trace the evolution of arguments concerning the Northern Ireland border during the Brexit debates in the House of Commons.",
    "Summarize the exchanges between the Prime Minister and the Leader of the Opposition regarding the 'Partygate' scandal in PMQs sessions from 2022.",
    "Analyze the arguments made for and against the Police, Crime, Sentencing and Courts Bill during its second reading in the House of Commons.",
    "Identify the key arguments put forth by the Scottish National Party (SNP) regarding a second independence referendum in post-2020 debates.",
    "Provide an example of a Secretary of State for Health and Social Care being questioned on NHS waiting times in a 2023 debate.",
    "How does the Speaker intervene to maintain order during particularly contentious debates? Provide examples from the transcripts.",
    "What were the primary economic arguments presented by the Shadow Chancellor during the response to the 2023 Autumn Statement?",
    "Find instances of MPs using rhetorical devices, such as anaphora or chiasmus, to strengthen their arguments in a major debate.",
    "What were the main justifications provided by the government for the implementation of the furlough scheme during early COVID-19 debates?",
    "Compare and contrast the positions of the Conservative and Labour front benches on green energy policy as detailed in 2023 debates."
]

In [9]:
def run_batch_comparison(questions):
    """Run comparison on a list of questions and collect results"""
    results = []
    total_questions = len(questions)
    start_time = time.time()
    
    print(f"🚀 Starting batch comparison with {total_questions} questions...")
    print("This may take several minutes depending on your GPU.")
    
    for i, question in enumerate(questions):
        print(f"==================== Question {i+1}/{total_questions} ====================")
        result = compare_single_question(question)
        results.append(result)
        
        elapsed_time = time.time() - start_time
        avg_time_per_q = elapsed_time / (i + 1)
        eta = avg_time_per_q * (total_questions - (i + 1))
        
        print(f"\n⏱️ Progress: {i+1}/{total_questions} | Elapsed: {format_time(elapsed_time)} | ETA: {format_time(eta)}")
        
    return results

# Run batch comparison
batch_results = run_batch_comparison(TEST_QUESTIONS)

# Print summary
print("\n" + "="*80)
print("🏁 BATCH COMPARISON COMPLETE")
print("="*80)

for i, result in enumerate(batch_results):
    base_time = result['base_result']['generation_time'] if result['base_result']['success'] else 0
    ft_time = result['finetuned_result']['generation_time'] if result['finetuned_result'] and result['finetuned_result']['success'] else 0
    
    print(f"Question {i+1}: Base {format_time(base_time)} | Fine-tuned {format_time(ft_time)}")

🚀 Starting batch comparison with 10 questions...
This may take several minutes depending on your GPU.
❓ Question: Trace the evolution of arguments concerning the Northern Ireland border during the Brexit debates in the House of Commons.

🔸 STANDARD LLAMA-3 RESPONSE:
----------------------------------------
Commons. The evolution of arguments concerning this issue can be traced back to the 2016 referendum on the United Kingdom's (UK) membership in the European Union (EU). Here is a summary of the key points and arguments made by various parties:

1. **Pre-referendum (2015-2016)**: The then-Prime Minister David Cameron and other pro-EU politicians argued that the UK would not leave the EU without a deal, and that the Northern Ireland border would remain largely unchanged. They claimed that the Good Friday Agreement (GFA) would be respected, and that the border would not be a hard border.
2. **Post-referendum (2016-2017)**: After the Brexit vote, the UK Government, led by Prime Minister T