# Simple BERTScore Test - Local RoBERTa Model

This notebook loads RoBERTa-large directly from a flat directory with 4 files:
- config.json
- merges.txt  
- pytorch_model.bin
- vocab.json

No cache, no complex structure. Just works.

In [None]:
# Imports
from pathlib import Path
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F
import numpy as np

# Confirm that every dependency above imported cleanly.
# "\u2713" is the check mark; it is written as an escape so the glyph cannot
# be corrupted by an encoding round-trip of this file.
print("\u2713 Imports successful")

In [None]:
# Check that the model directory exists with required files
model_dir = Path("roberta-large")

required_files = ["config.json", "merges.txt", "pytorch_model.bin", "vocab.json"]

print(f"Checking directory: {model_dir.absolute()}")
print()

# Report the status of every file before failing, so that a single run shows
# everything that is missing instead of only the first gap.
# "\u2713" / "\u2717" are the check mark and the ballot X, written as escapes so
# the glyphs cannot be corrupted by an encoding round-trip of this file.
missing_files = []
for file_name in required_files:
    file_path = model_dir / file_name
    # is_file() also rejects a directory that merely carries the expected name.
    if file_path.is_file():
        size_mb = file_path.stat().st_size / (1024 * 1024)
        print(f"\u2713 {file_name:25s} ({size_mb:.1f} MB)")
    else:
        print(f"\u2717 {file_name:25s} MISSING")
        missing_files.append(file_name)

if missing_files:
    # Same exception type as before; with one missing file the message is
    # identical to the previous one.
    raise FileNotFoundError(f"Required file missing: {', '.join(missing_files)}")

print("\n\u2713 All required files found!")

In [None]:
# Load tokenizer directly from local directory
print("Loading tokenizer from local directory...")
tokenizer = AutoTokenizer.from_pretrained(
    str(model_dir),
    local_files_only=True,  # Don't try to download anything
)
# "\u2713" is the check mark, written as an escape so the glyph cannot be
# corrupted by an encoding round-trip of this file.
print("\u2713 Tokenizer loaded")
print(f"  Vocab size: {tokenizer.vocab_size}")

In [None]:
# Load model directly from local directory
print("Loading RoBERTa model from local directory...")
print("(This may take 10-30 seconds)")
model = AutoModel.from_pretrained(
    str(model_dir),
    local_files_only=True,  # Don't try to download anything
)
# "\u2713" is the check mark, written as an escape so the glyph cannot be
# corrupted by an encoding round-trip of this file.
print("\u2713 Model loaded")
# Sum of the element counts of all parameter tensors, with thousands separators.
print(f"  Parameters: {sum(p.numel() for p in model.parameters()):,}")

In [None]:
# Test with sample text: one sentence through the tokenizer, then the model.
test_text = "The company reported strong quarterly earnings with revenue growth of 15%."

print("Testing tokenization...")
tokens = tokenizer(test_text, return_tensors="pt")
# The batch holds a single sentence, so row 0 is the whole input.
input_ids = tokens["input_ids"][0]
# "\u2713" is the check mark, written as an escape so the glyph cannot be
# corrupted by an encoding round-trip of this file.
print(f"\u2713 Tokenized: {len(input_ids)} tokens")
# Only the first ten tokens are shown to keep the output short.
print(f"  Tokens: {tokenizer.convert_ids_to_tokens(input_ids[:10])}...")

print("\nTesting model inference...")
# Inference only: no_grad() avoids recording operations for autograd.
with torch.no_grad():
    outputs = model(**tokens)

print("\u2713 Model inference successful")
print(f"  Output shape: {outputs.last_hidden_state.shape}")
print(f"  Embedding dim: {outputs.last_hidden_state.shape[-1]}")

## BERTScore Calculation

Now let's compute BERTScore manually using our local model:

We compute the score ourselves instead of calling the `bert_score` library's `score()` function, because older versions of that function do not accept a model as a parameter.

In [None]:
# Compute BERTScore manually using our local model
# (bert_score library doesn't accept model parameter in older versions)

def _content_embeddings(text, model, tokenizer):
    """Return the L2-normalised embeddings of the content tokens of ``text``.

    Padding positions (when the tokenizer output carries an attention mask)
    and the first/last special tokens are dropped. The result has shape
    [num_content_tokens, hidden_dim] and has zero rows for an empty text.
    """
    tokens = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)

    # Inference only: no_grad() avoids recording operations for autograd.
    with torch.no_grad():
        hidden = model(**tokens).last_hidden_state[0]  # [seq_len, hidden_dim]

    # Drop padding positions so they can never be scored as tokens. A boolean
    # mask is used instead of a length slice so the padding side is irrelevant.
    attention_mask = tokens.get("attention_mask")
    if attention_mask is not None:
        hidden = hidden[attention_mask[0].bool()]

    # Remove CLS and SEP tokens (first and last)
    hidden = hidden[1:-1]

    return F.normalize(hidden, p=2, dim=1)


def compute_bertscore(reference, candidate, model, tokenizer):
    """
    Compute BERTScore manually using local model.

    Each text is embedded with the model's last hidden state, the special and
    padding tokens are removed, and every token is greedily matched to its
    most similar token (cosine similarity) in the other text.

    Args:
        reference: The reference text.
        candidate: The candidate text scored against the reference.
        model: Callable taking the tokenizer output as keyword arguments and
            returning an object with a ``last_hidden_state`` tensor.
        tokenizer: Callable turning a text into a mapping of tensors that
            contains at least ``input_ids``.

    Returns: (precision, recall, f1) as Python floats. All three are 0.0 when
    either text has no content tokens.
    """
    ref_embeds = _content_embeddings(reference, model, tokenizer)
    cand_embeds = _content_embeddings(candidate, model, tokenizer)

    # With no content tokens on one side there is nothing to match; reducing
    # over the empty axis below would raise or yield NaN.
    if ref_embeds.shape[0] == 0 or cand_embeds.shape[0] == 0:
        return 0.0, 0.0, 0.0

    # Compute cosine similarity matrix (rows are already unit length)
    sim_matrix = torch.mm(cand_embeds, ref_embeds.t())  # [cand_len, ref_len]

    # Precision: for each candidate token, find max similarity with reference
    precision = sim_matrix.max(dim=1).values.mean().item()

    # Recall: for each reference token, find max similarity with candidate
    recall = sim_matrix.max(dim=0).values.mean().item()

    # F1: harmonic mean, defined as 0 when it would divide by a non-positive sum
    if precision + recall > 0:
        f1 = 2 * (precision * recall) / (precision + recall)
    else:
        f1 = 0.0

    return precision, recall, f1


# Test sentences: a reference and a paraphrased candidate.
reference = "The company reported strong quarterly earnings with revenue growth of 15%."
candidate = "Quarterly results showed solid performance, with revenues up 15%."

print("Computing BERTScore manually with local model...")
print(f"Reference: {reference}")
print(f"Candidate: {candidate}")
print()

P, R, F1 = compute_bertscore(reference, candidate, model, tokenizer)

# "\u2713" is the check mark, written as an escape so the glyph cannot be
# corrupted by an encoding round-trip of this file.
print("\u2713 BERTScore calculated successfully!")
print()
print(f"Precision: {P:.4f}")
print(f"Recall:    {R:.4f}")
print(f"F1 Score:  {F1:.4f}")

## Multiple Examples

In [None]:
# Test with multiple examples using our manual BERTScore function
test_cases = [
    {
        "ref": "Tesla reported record deliveries in Q4 2024, exceeding analyst expectations.",
        "cand": "Tesla's Q4 2024 deliveries surpassed predictions from analysts.",
    },
    {
        "ref": "The Federal Reserve maintained interest rates at current levels.",
        "cand": "Fed kept rates unchanged at today's meeting.",
    },
    {
        "ref": "Amazon Web Services announced new AI infrastructure capabilities.",
        "cand": "AWS unveiled enhanced artificial intelligence infrastructure.",
    },
]


def _preview(text, limit=60):
    """Return ``text`` cut to ``limit`` characters, with three dots appended when it was cut."""
    return f"{text[:limit]}..." if len(text) > limit else text


print("Testing multiple sentence pairs:\n")
print("=" * 80)

for i, case in enumerate(test_cases, 1):
    P, R, F1 = compute_bertscore(case["ref"], case["cand"], model, tokenizer)

    print(f"\nExample {i}:")
    print(f"  Ref:  {_preview(case['ref'])}")
    print(f"  Cand: {_preview(case['cand'])}")
    # "\u2192" is the right arrow; like the glyphs below it is written as an
    # escape so it cannot be corrupted by an encoding round-trip of this file.
    print(f"  \u2192 BERTScore F1: {F1:.4f} (P: {P:.4f}, R: {R:.4f})")

print("\n" + "=" * 80)
# "\u2713" is the check mark, "\U0001F4A1" the light-bulb emoji.
print("\n\u2713 All tests completed successfully!")
print("\n\U0001F4A1 Your local RoBERTa model is working perfectly!")

## Summary

This notebook demonstrates that you can use RoBERTa-large for BERTScore with just 4 files in a flat directory:

1. ✅ Load tokenizer from local directory with `local_files_only=True`
2. ✅ Load model from local directory with `local_files_only=True`
3. ✅ Compute BERTScore manually using the local model
4. ✅ No caching, no complex directory structure needed
5. ✅ No internet required after initial setup

**Key Steps:**
```python
# Load from local flat directory
tokenizer = AutoTokenizer.from_pretrained("roberta-large", local_files_only=True)
model = AutoModel.from_pretrained("roberta-large", local_files_only=True)

# Compute BERTScore manually
def compute_bertscore(reference, candidate, model, tokenizer):
    """Return (precision, recall, f1) of ``candidate`` scored against ``reference``."""
    # Tokenize both texts
    ref_tokens = tokenizer(reference, return_tensors="pt")
    cand_tokens = tokenizer(candidate, return_tensors="pt")

    # Get embeddings from model
    with torch.no_grad():
        ref_embeds = model(**ref_tokens).last_hidden_state[0]
        cand_embeds = model(**cand_tokens).last_hidden_state[0]

    # Normalize and compute similarity
    ref_embeds = F.normalize(ref_embeds[1:-1], p=2, dim=1)  # Remove CLS/SEP
    cand_embeds = F.normalize(cand_embeds[1:-1], p=2, dim=1)

    # An empty text leaves no tokens to match
    if ref_embeds.shape[0] == 0 or cand_embeds.shape[0] == 0:
        return 0.0, 0.0, 0.0

    # Cosine similarity matrix
    sim_matrix = torch.mm(cand_embeds, ref_embeds.t())

    # Greedy matching
    precision = sim_matrix.max(dim=1)[0].mean().item()
    recall = sim_matrix.max(dim=0)[0].mean().item()

    # Avoid dividing by zero when nothing matches
    total = precision + recall
    f1 = 2 * (precision * recall) / total if total > 0 else 0.0

    return precision, recall, f1
```

**No dependency on bert_score library's score() function** - we compute it ourselves using just transformers and torch!