# Language Model Fine-tuning

This notebook demonstrates fine-tuning a language model for a generation task.

<!-- Task: Given a name, generate its reverse (e.g., emma → amme) -->
Task: Given a news description, classify the dataset for a generation task

In [1]:
import os
import sys

hf_token = os.getenv("HF_TOKEN")

# Workaround for colab environments to prevent interactive token input
if 'google.colab' in sys.modules:
    # Mock the get function to return our token
    try:
        import google.colab.userdata
        original_get = google.colab.userdata.get
        
        def mock_get(key):
            if key == 'HF_TOKEN':
                return hf_token
            return original_get(key)
        
        google.colab.userdata.get = mock_get
        print("Patched Colab userdata to use environment token")
    except Exception as e:
        print(f"Could not patch Colab userdata: {e}")

Patched Colab userdata to use environment token


In [2]:
# data = open('names.txt').read().splitlines()
# print(f"Total names: {len(data)}")
# data[0:10]

from datasets import load_dataset
# dataset = load_dataset("sh0416/ag_news")
dataset = load_dataset("ag_news")

## Load the model and tokenizer

In [3]:
import transformers
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

model_name = "HuggingFaceTB/SmolLM2-135M"
# model_name = "facebook/opt-350m"


tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Add special tokens
tokenizer.add_special_tokens({
    "pad_token": "<|pad|>",
    "bos_token": "<|startoftext|>",
})

# Resize embeddings to account for new tokens
# This call resizes the embedding matrix, initializes new rows randomly , and keeps old embeddings unchanged
model.resize_token_embeddings(len(tokenizer))

print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")

Model parameters: 134,516,160


## Test the model before fine-tuning

In [4]:
# Get 5 random samples from the dataset
samples = dataset['train'].shuffle(seed=333).select(range(3))
for sample in samples:
    print(sample)

{'text': 'No Strawberries and Cream: Sharapova Ousted by Pierce Maria Sharapova suffered a harsh introduction Saturday to the expectations that will stalk her for the rest of her career. After a third-round loss at the US ', 'label': 1}
{'text': 'Bush, Kerry Forces Woo the Undecided (AP) AP - Put Barbara White down as undecided in the race for the White House.', 'label': 0}
{'text': 'Electronic Arts Gets an Exclusive NFL Deal The video game maker Electronic Arts announced an exclusive five-year deal with the National Football League and its players yesterday to design games using the NFL brand, stadiums, player names and uniforms.', 'label': 1}


In [5]:
# test_names = ['emma', 'noah', 'olivia']

print("Before fine-tuning:")
print("="*50)

for sample in samples:
    prompt = f"<|startoftext|>Classify the following news text: {sample['text']}. \nClass:"
    inputs = tokenizer(prompt, return_tensors="pt")

    with torch.no_grad():
        outputs = model.generate(
            inputs["input_ids"],
            max_new_tokens=20,
            temperature=1.0,
            do_sample=True,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id
        )

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(f"{response}")
    print("-"*50)

Before fine-tuning:


Classify the following news text: No Strawberries and Cream: Sharapova Ousted by Pierce Maria Sharapova suffered a harsh introduction Saturday to the expectations that will stalk her for the rest of her career. After a third-round loss at the US . 
Class: Football <br><br> Date: 17. Feburary <br><br>
--------------------------------------------------
Classify the following news text: Bush, Kerry Forces Woo the Undecided (AP) AP - Put Barbara White down as undecided in the race for the White House.. 
Class: 20th - 23rd November 2005 This is the beginning of
--------------------------------------------------
Classify the following news text: Bush, Kerry Forces Woo the Undecided (AP) AP - Put Barbara White down as undecided in the race for the White House.. 
Class: 20th - 23rd November 2005 This is the beginning of
--------------------------------------------------
Classify the following news text: Electronic Arts Gets an Exclusive NFL Deal The video game maker Electronic Arts announced a

## Create dataset

Format: `<|startoftext|>Reverse the name: emma. Answer: amme<|endoftext|>`

In [6]:
# import torch
# from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
import torch

# Define the class mapping
class_names = {0: "World", 1: "Sports", 2: "Business", 3: "Sci/Tech"}

class NewsClassificationDataset(Dataset):
    def __init__(self, dataset_split, tokenizer, max_length=256):
        self.data = dataset_split
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.class_names = {0: "World", 1: "Sports", 2: "Business", 3: "Sci/Tech"}

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        text = item['text']
        label_num = item['label']
        label_text = self.class_names[label_num]
        
        # Create the training format: input text -> class name
        full_text = f"{self.tokenizer.bos_token}Classify the following news: {text}\nClass: {label_text}{self.tokenizer.eos_token}"
        prompt = f"{self.tokenizer.bos_token}Classify the following news: {text}\nClass:"
        
        return {
            'full_text': full_text,
            'prompt': prompt,
            'text': text,
            'label_text': label_text
        }

    def collate_fn(self, batch):
        full_texts = [item['full_text'] for item in batch]
        prompts = [item['prompt'] for item in batch]

        # Tokenize full texts
        tokenized = self.tokenizer(
            full_texts,
            truncation=True,
            padding=True,
            max_length=self.max_length,
            return_tensors="pt"
        )
        
        # Create labels - copy input_ids but mask the prompt part
        labels = tokenized['input_ids'].clone()
        
        # Mask the prompt tokens in the labels (set to -100 so they're ignored in loss)
        for i, prompt in enumerate(prompts):
            prompt_tokens = self.tokenizer(prompt, add_special_tokens=False)['input_ids']
            prompt_len = len(prompt_tokens)
            if prompt_len < labels.shape[1]:
                labels[i, :prompt_len] = -100  # Ignore prompt tokens in loss calculation

        return {
            'input_ids': tokenized['input_ids'],
            'attention_mask': tokenized['attention_mask'],
            'labels': labels
        }

# Create datasets
original_train = dataset['train'].shuffle(seed=42)
split_data = original_train.train_test_split(test_size=0.3, seed=42)
train_data = split_data['train']
val_data = split_data['test']  # Note: train_test_split returns 'test' for the validation split

train_dataset = NewsClassificationDataset(train_data, tokenizer)
val_dataset = NewsClassificationDataset(val_data, tokenizer)
test_dataset = NewsClassificationDataset(dataset['test'], tokenizer)

# Create dataloaders
batch_size = 16
train_loader = DataLoader(
    train_dataset, 
    batch_size=batch_size, 
    shuffle=True, 
    collate_fn=train_dataset.collate_fn
)

val_loader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=val_dataset.collate_fn
)

test_loader = DataLoader(
    test_dataset, 
    batch_size=batch_size, 
    shuffle=False, 
    collate_fn=test_dataset.collate_fn
)

# Verify the dataloaders work
print(f"Train batches: {len(train_loader)}")
print(f"Val batches: {len(val_loader)}")
print(f"Test batches: {len(test_loader)}")

# Example of how the data looks
sample_item = train_dataset[0]
print(f"\nSample training format:")
print(f"Full text: {sample_item['full_text']}")
print()
print(f"Prompt: {sample_item['prompt']}")
print()
print(f"Expected label: {sample_item['label_text']}")

# Check a batch
sample_batch = next(iter(train_loader))
print(f"\nBatch shapes:")
print(f"Input IDs: {sample_batch['input_ids'].shape}")
print(f"Labels: {sample_batch['labels'].shape}")
print(f"Attention mask: {sample_batch['attention_mask'].shape}")

Train batches: 5250
Val batches: 2250
Test batches: 475

Sample training format:
Full text: <|startoftext|>Classify the following news: Agents search Will County office in permit probe More than two dozen federal agents with subpoenas spent five hours Wednesday searching the Will County land use department #39;s computer records in an investigation related to building permits.
Class: Sci/Tech<|endoftext|>

Prompt: <|startoftext|>Classify the following news: Agents search Will County office in permit probe More than two dozen federal agents with subpoenas spent five hours Wednesday searching the Will County land use department #39;s computer records in an investigation related to building permits.
Class:

Expected label: Sci/Tech

Batch shapes:
Input IDs: torch.Size([16, 97])
Labels: torch.Size([16, 97])
Attention mask: torch.Size([16, 97])


## Examine a batch

In [7]:
# Look at a sample batch
sample_batch = next(iter(train_loader))
print("Sample batch shape:")
print(f"  input_ids: {sample_batch['input_ids'].shape}")
print(f"  attention_mask: {sample_batch['attention_mask'].shape}")
print(f"  labels: {sample_batch['labels'].shape}")

# Decode first example
print("\nFirst example:")
input_text = tokenizer.decode(sample_batch['input_ids'][0], skip_special_tokens=True)
print(f"Input: {input_text}")

# Show the labels (what the model should generate)
labels = sample_batch['labels'][0]
# Replace -100 with pad token for visualization
labels_viz = labels.clone()
labels_viz[labels_viz == -100] = tokenizer.pad_token_id
label_text = tokenizer.decode(labels_viz, skip_special_tokens=True)
print(f"Target (what model should generate): {label_text}")

# Show which tokens are ignored in loss (-100)
ignored_positions = (labels == -100).sum().item()
total_positions = labels.shape[0]
print(f"Tokens ignored in loss: {ignored_positions}/{total_positions}")
print(f"Tokens used for loss: {total_positions - ignored_positions}/{total_positions}")

Sample batch shape:
  input_ids: torch.Size([16, 108])
  attention_mask: torch.Size([16, 108])
  labels: torch.Size([16, 108])

First example:
Input: Classify the following news: Microsoft Entering the Anti-Spyware Market Microsoft Entering the Anti-Spyware Market\\Microsoft decided to enter the Anti-Virus market sometime back when they bought Romanian antivirus firm GeCad in 2003. And they have been testing the application privately since sometime. No details are out on when and how they will release it. And now, they are contemplating entering ...
Class: Sci/Tech
Target (what model should generate):  Sci/Tech
Tokens ignored in loss: 91/108
Tokens used for loss: 17/108


## Training loop

In [8]:
import torch.optim as optim
from tqdm import tqdm

learning_rate = 5e-5
num_epochs = 5
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = model.to(device)
optimizer = optim.AdamW(model.parameters(), lr=learning_rate)

# Learning rate scheduler
total_steps = len(train_loader) * num_epochs
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=total_steps)

print(f"Training on device: {device}")
print(f"Total training steps: {total_steps}")

# Training loop
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}")

    for batch_idx, batch in enumerate(progress_bar):
        # if batch_idx >= 1000:
        #     break  # Limit batches per epoch
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        # Forward pass
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        loss = outputs.loss
        loss.backward()

        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        optimizer.step()
        scheduler.step()

        total_loss += loss.item()

        # Update progress bar
        if batch_idx % 50 == 0:
            avg_loss = total_loss / (batch_idx + 1)
            progress_bar.set_postfix({'loss': f'{avg_loss:.4f}'})

    avg_train_loss = total_loss / len(train_loader)

    # Validation
    model.eval()
    total_val_loss = 0
    correct_predictions = 0
    total_predictions = 0

    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Validation"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )

            total_val_loss += outputs.loss.item()

    avg_val_loss = total_val_loss / len(val_loader)

    print(f"\nEpoch {epoch+1}/{num_epochs}")
    print(f"  Average training loss: {avg_train_loss:.4f}")
    print(f"  Average validation loss: {avg_val_loss:.4f}")
    print(f"  Learning rate: {scheduler.get_last_lr()[0]:.2e}")
    print("-" * 50)

Training on device: cuda
Total training steps: 26250


Epoch 1/5: 100%|██████████| 5250/5250 [19:53<00:00,  4.40it/s, loss=0.0268]
Epoch 1/5: 100%|██████████| 5250/5250 [19:53<00:00,  4.40it/s, loss=0.0268]
Validation: 100%|██████████| 2250/2250 [02:57<00:00, 12.71it/s]
Validation: 100%|██████████| 2250/2250 [02:57<00:00, 12.71it/s]



Epoch 1/5
  Average training loss: 0.0266
  Average validation loss: 0.0060
  Learning rate: 4.52e-05
--------------------------------------------------


Epoch 2/5: 100%|██████████| 5250/5250 [19:55<00:00,  4.39it/s, loss=0.0037]
Epoch 2/5: 100%|██████████| 5250/5250 [19:55<00:00,  4.39it/s, loss=0.0037]
Validation: 100%|██████████| 2250/2250 [02:55<00:00, 12.81it/s]
Validation: 100%|██████████| 2250/2250 [02:55<00:00, 12.81it/s]



Epoch 2/5
  Average training loss: 0.0037
  Average validation loss: 0.0061
  Learning rate: 3.27e-05
--------------------------------------------------


Epoch 3/5: 100%|██████████| 5250/5250 [19:51<00:00,  4.41it/s, loss=0.0016]
Epoch 3/5: 100%|██████████| 5250/5250 [19:51<00:00,  4.41it/s, loss=0.0016]
Validation: 100%|██████████| 2250/2250 [02:57<00:00, 12.71it/s]
Validation: 100%|██████████| 2250/2250 [02:57<00:00, 12.71it/s]



Epoch 3/5
  Average training loss: 0.0016
  Average validation loss: 0.0064
  Learning rate: 1.73e-05
--------------------------------------------------


Epoch 4/5: 100%|██████████| 5250/5250 [19:51<00:00,  4.41it/s, loss=0.0005]
Epoch 4/5: 100%|██████████| 5250/5250 [19:51<00:00,  4.41it/s, loss=0.0005]
Validation: 100%|██████████| 2250/2250 [02:55<00:00, 12.80it/s]
Validation: 100%|██████████| 2250/2250 [02:55<00:00, 12.80it/s]



Epoch 4/5
  Average training loss: 0.0005
  Average validation loss: 0.0077
  Learning rate: 4.77e-06
--------------------------------------------------


Epoch 5/5: 100%|██████████| 5250/5250 [19:51<00:00,  4.41it/s, loss=0.0001]
Epoch 5/5: 100%|██████████| 5250/5250 [19:51<00:00,  4.41it/s, loss=0.0001]
Validation: 100%|██████████| 2250/2250 [02:56<00:00, 12.74it/s]


Epoch 5/5
  Average training loss: 0.0001
  Average validation loss: 0.0089
  Learning rate: 0.00e+00
--------------------------------------------------





## Generate and evaluate

In [9]:
def classify_news(model, tokenizer, text, device, max_new_tokens=10):
    """Generate classification for a news text"""
    prompt = f"{tokenizer.bos_token}Classify the following news: {text}\nClass:"
    
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=200).to(device)
    
    model.eval()
    with torch.no_grad():
        outputs = model.generate(
            inputs["input_ids"],
            max_new_tokens=max_new_tokens,
            temperature=0.1,
            do_sample=True,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id
        )
    
    # Decode the full response
    full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    # Extract just the generated class
    if "Class:" in full_response:
        generated = full_response.split("Class:")[-1].strip()
        # Get just the first word (the class name)
        generated = generated.split()[0] if generated else ""
        return generated
    return ""

# Test on some examples
print("Testing classification after fine-tuning:")
print("="*70)

class_names = {0: "World", 1: "Sports", 2: "Business", 3: "Sci/Tech"}
test_samples = dataset['test'].shuffle(seed=42).select(range(10))

correct = 0
total = 0

for sample in test_samples:
    text = sample['text']
    true_label = sample['label']
    true_class = class_names[true_label]
    
    generated_class = classify_news(model, tokenizer, text, device)
    
    is_correct = generated_class.lower() == true_class.lower()
    correct += is_correct
    total += 1
    
    symbol = "✓" if is_correct else "✗"
    
    print(f"Text: {text}")
    print(f"True class: {true_class}")
    print(f"Generated: {generated_class} {symbol}")
    print("-" * 70)

print(f"\nAccuracy: {correct}/{total} = {correct/total*100:.1f}%")

Testing classification after fine-tuning:
Text: Indian board plans own telecast of Australia series The Indian cricket board said on Wednesday it was making arrangements on its own to broadcast next month #39;s test series against Australia, which is under threat because of a raging TV rights dispute.
True class: Sports
Generated: Sports ✓
----------------------------------------------------------------------
Text: Indian board plans own telecast of Australia series The Indian cricket board said on Wednesday it was making arrangements on its own to broadcast next month #39;s test series against Australia, which is under threat because of a raging TV rights dispute.
True class: Sports
Generated: Sports ✓
----------------------------------------------------------------------
Text: Stocks Higher on Drop in Jobless Claims A sharp drop in initial unemployment claims and bullish forecasts from Nokia and Texas Instruments sent stocks slightly higher in early trading Thursday.
True class: Busi

## Evaluate on test set

In [10]:
def evaluate_classification_accuracy(model, tokenizer, test_dataset, device, sample_size=200):
    """Evaluate classification accuracy on test set."""
    
    class_names = {0: "World", 1: "Sports", 2: "Business", 3: "Sci/Tech"}
    
    # Sample from test set - convert to list of indices and sample randomly
    total_samples = min(sample_size, len(test_dataset))
    indices = list(range(len(test_dataset)))
    import random
    random.seed(42)
    sampled_indices = random.sample(indices, total_samples)
    
    correct = 0
    errors = []
    class_correct = {name: 0 for name in class_names.values()}
    class_total = {name: 0 for name in class_names.values()}

    for i in tqdm(sampled_indices, desc="Evaluating"):
        sample = test_dataset[i]
        text = sample['text']
        true_label = sample['label'] 
        true_class = class_names[true_label]
        
        generated_class = classify_news(model, tokenizer, text[:300], device)
        
        class_total[true_class] += 1
        
        if generated_class.lower() == true_class.lower():
            correct += 1
            class_correct[true_class] += 1
        else:
            errors.append((text[:100], true_class, generated_class))

    overall_accuracy = correct / total_samples
    
    # Calculate per-class accuracy
    class_accuracies = {}
    for class_name in class_names.values():
        if class_total[class_name] > 0:
            class_accuracies[class_name] = class_correct[class_name] / class_total[class_name]

    return {
        'overall_accuracy': overall_accuracy,
        'class_accuracies': class_accuracies,
        'class_counts': class_total,
        'errors': errors[:10]  # Show only first 10 errors
    }

# Evaluate on test set
print("Evaluating on test set...")
results = evaluate_classification_accuracy(model, tokenizer, dataset['test'], device, sample_size=500)

print(f"\nOverall Accuracy: {results['overall_accuracy']*100:.2f}%")
print("\nPer-class accuracy:")
for class_name, accuracy in results['class_accuracies'].items():
    count = results['class_counts'][class_name]
    print(f"  {class_name}: {accuracy*100:.1f}% ({count} samples)")

print(f"\nSome examples of errors:")
print("Text (first 100 chars) | True | Predicted")
print("="*60)
for text, true_class, predicted in results['errors']:
    print(f"{text}... | {true_class} | {predicted}")

Evaluating on test set...


Evaluating: 100%|██████████| 500/500 [01:03<00:00,  7.93it/s]


Overall Accuracy: 90.80%

Per-class accuracy:
  World: 91.7% (133 samples)
  Sports: 97.3% (112 samples)
  Business: 86.9% (137 samples)
  Sci/Tech: 88.1% (118 samples)

Some examples of errors:
Text (first 100 chars) | True | Predicted
Bryant Makes First Appearance at Trial (AP) AP - NBA star Kobe Bryant arrived at his sexual assault ... | Sci/Tech | Sports
Vodafone targets Japan with 3G offensive Vodafone has unveiled plans for 10 new third-generation han... | Business | Sci/Tech
Our mobile margins will fall: Telstra TELSTRA chief financial officer John Stanhope has admitted Tel... | Sci/Tech | Business
Aussies battle EU over cheese, champagne AP - The United States and Australia have prevailed in an i... | Business | World
Shares of Video Game Makers Rise Sharply Shares of video game makers rose sharply Friday after analy... | Business | Sci/Tech
Summer Box Office Hits a High, Despite Lows In a summer when many of the studios' biggest bets faile... | Business | World
Level 3 Acquir




## Analysis by name length

In [11]:
# import matplotlib.pyplot as plt
# from collections import defaultdict

# # Group test names by length
# names_by_length = defaultdict(list)
# for name in test_data[:500]:  # Use first 500 for speed
#     names_by_length[len(name)].append(name)

# # Evaluate accuracy by length
# length_accuracies = {}
# lengths = sorted(names_by_length.keys())

# for length in lengths:
#     if len(names_by_length[length]) >= 5:  # Only evaluate if we have enough samples
#         names_subset = names_by_length[length][:20]  # Limit to 20 per length for speed
#         results = evaluate_accuracy(model, tokenizer, names_subset, device)
#         length_accuracies[length] = results['exact_match_accuracy']

# # Plot
# if length_accuracies:
#     plt.figure(figsize=(10, 6))
#     lengths = list(length_accuracies.keys())
#     accuracies = list(length_accuracies.values())

#     plt.bar(lengths, accuracies)
#     plt.xlabel('Name Length')
#     plt.ylabel('Exact Match Accuracy')
#     plt.title('Reversal Accuracy by Name Length')
#     plt.ylim(0, 1.0)

#     # Add percentage labels on bars
#     for i, (l, acc) in enumerate(zip(lengths, accuracies)):
#         plt.text(l, acc + 0.01, f'{acc*100:.0f}%', ha='center')

#     plt.show()

#     # Print summary
#     print("Accuracy by name length:")
#     for length, acc in sorted(length_accuracies.items()):
#         count = len(names_by_length[length])
#         print(f"  Length {length}: {acc*100:.1f}% ({count} names in test set)")

## Test with different prompt formats

In [12]:
# # Try different prompt variations to see if the model generalizes
# test_prompts = [
#     ("Classify the following news: {text}\nClass:", "Training Format"),
#     ("What category is this news: {text}\nAnswer:", "Variation 1"),
#     ("This news article is about: {text}\nCategory:", "Variation 2"),
#     ("{text}\nTopic:", "Variation 3"),
# ]

# # Get a test sample
# test_sample = dataset['test'][42]
# test_text = test_sample['text'][:200]  # Limit text length
# true_label = test_sample['label']
# class_names = {0: "World", 1: "Sports", 2: "Business", 3: "Sci/Tech"}
# expected_class = class_names[true_label]

# print(f"Testing different prompts for news classification")
# print(f"Text: {test_text[:100]}...")
# print(f"Expected class: {expected_class}")
# print("="*80)

# for prompt_template, prompt_type in test_prompts:
#     prompt = tokenizer.bos_token + prompt_template.format(text=test_text)
    
#     inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=300).to(device)
    
#     model.eval()
#     with torch.no_grad():
#         outputs = model.generate(
#             inputs["input_ids"],
#             max_new_tokens=10,
#             temperature=0.1,
#             do_sample=True,
#             pad_token_id=tokenizer.pad_token_id,
#             eos_token_id=tokenizer.eos_token_id
#         )
    
#     full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
#     # Extract the generated part
#     prompt_text = prompt_template.format(text=test_text)
#     if len(full_response) > len(prompt_text):
#         generated = full_response[len(prompt_text):].strip()
#         generated = generated.split()[0] if generated else ""
#     else:
#         generated = ""
    
#     is_correct = generated.lower() == expected_class.lower()
#     symbol = "✓" if is_correct else "✗"
    
#     print(f"{prompt_type}:")
#     print(f"  Generated: '{generated}' {symbol}")
#     print()