# Fine-tuning GPT for Summarization using Hugging Face Trainer

In [None]:
# Install required packages
!pip install transformers datasets accelerate evaluate rouge_score -q

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m50.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m30.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m34.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m9.0 MB/s[0m eta [36m0:00

In [None]:
import torch # ✅
from transformers import (
    GPT2LMHeadModel,
    GPT2Tokenizer,
    Trainer, # ✅
    TrainingArguments, # ✅
    DataCollatorForLanguageModeling # ✅
)
from datasets import load_dataset, Dataset # ✅
import numpy as np # ✅
from evaluate import load # ✅
import pandas as pd # ✅

# Use the GPU when PyTorch can see one; otherwise run on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


In [None]:
# 1. LOAD AND PREPARE THE MODEL
print("Loading model and tokenizer...")
model_name = "distilbert/distilgpt2"  # Small, fast model perfect for Colab

# Tokenizer first, then the weights, both from the same checkpoint id
tokenizer, model = (
    GPT2Tokenizer.from_pretrained(model_name),
    GPT2LMHeadModel.from_pretrained(model_name),
)

# GPT2 has no padding token by default, so the end-of-text token doubles as padding
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = model.config.eos_token_id

print(
    f"Model loaded: {model_name}",
    f"Model parameters: {model.num_parameters():,}",
    sep="\n",
)

Loading model and tokenizer...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Model loaded: distilbert/distilgpt2
Model parameters: 81,912,576


In [None]:
# 2. LOAD AND PREPARE DATASET
print("\nLoading dataset...")
# Small slices of CNN/DailyMail keep training quick:
# 1000 training rows and 100 validation rows (used as the test set)
dataset, test_dataset = (
    load_dataset("cnn_dailymail", "3.0.0", split=split_spec)
    for split_spec in ("train[:1000]", "validation[:100]")
)

print(f"Training samples: {len(dataset)}")
print(f"Test samples: {len(test_dataset)}")

# Preview the first training pair
first_example = dataset[0]
print("\nExample from dataset:")
print(f"Article (first 200 chars): {first_example['article'][:200]}...")
print(f"Summary: {first_example['highlights']}")


Loading dataset...


README.md: 0.00B [00:00, ?B/s]

train-00000-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00001-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00002-of-00003.parquet:   0%|          | 0.00/259M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/34.7M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/30.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

Training samples: 1000
Test samples: 100

Example from dataset:
Article (first 200 chars): LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won't cast a spell on ...
Summary: Harry Potter star Daniel Radcliffe gets £20M fortune as he turns 18 Monday .
Young actor says he has no plans to fritter his cash away .
Radcliffe's earnings from first five Potter films have been held in trust fund .


In [None]:
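# Each training example is a single text sequence:
# Article: LONDON, England (Reuters) -- Harry Potter s.... Summary: Harry Potter star Daniel ....

# Causal LM: every position is a prediction target ({...} marks the token being predicted)
# Article
# Article {:}
# Article: {LONDON}
# Article: LONDON, {England}

# SFT: only the tokens after "Summary:" are prediction targets
# Article: LONDON, England (Reuters) -- Harry Potter s.... Summary: {Harry}
# Article: LONDON, England (Reuters) -- Harry Potter s.... Summary: Harry Harry : High loss (wrong next token)
# Article: LONDON, England (Reuters) -- Harry Potter s.... Summary: Harry {Potter} : Low loss (correct next token)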

In [None]:
# 3. TOKENIZATION AND DATA PREPROCESSING
def preprocess_function(examples):
    """
    Convert a batch of article-summary pairs into tokenized training text.

    Each pair is rendered as
    "Article: [article text] Summary: [summary text]<|endoftext|>"
    and tokenized as one sequence.

    Args:
        examples: Batch mapping with parallel 'article' and 'highlights' lists.

    Returns:
        The tokenizer output (truncated to at most 512 tokens and padded to the
        longest sequence in the batch) with an added "labels" entry that is a
        copy of "input_ids".
    """
    # Articles are cut to 500 characters to keep each sequence short
    texts = [
        f"Article: {article[:500]} Summary: {summary}{tokenizer.eos_token}"
        for article, summary in zip(examples['article'], examples['highlights'])
    ]

    # Tokenize
    model_inputs = tokenizer(
        texts,
        truncation=True,
        padding=True,
        max_length=512,  # Keep it manageable for Colab
        return_tensors="pt"
    )

    # Plain causal language modeling: every token is a target, so labels mirror input_ids.
    # NOTE: SFT-style masking (setting the tokens before the summary to -100 so that
    # only the summary contributes to the loss) is intentionally not applied here.
    model_inputs["labels"] = model_inputs["input_ids"].clone()

    return model_inputs

print("\nPreprocessing training data...")
# Tokenize both splits the same way, dropping the raw text columns afterwards
tokenized_train, tokenized_test = (
    raw_split.map(
        preprocess_function,
        batched=True,
        remove_columns=raw_split.column_names,
    )
    for raw_split in (dataset, test_dataset)
)

print("Data preprocessing complete!")


Preprocessing training data...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Data preprocessing complete!


In [None]:
# 4. SETUP DATA COLLATOR
# Batches tokenized examples for the Trainer using the language-modeling collator.
# NOTE(review): presumably this collator builds the batch "labels" itself — confirm
# how that interacts with the "labels" column produced during preprocessing.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,  # We're doing causal language modeling, not masked
)

In [None]:
# 5. EVALUATION METRICS
rouge = load("rouge")


def _extract_summary(text, fallback):
    """Return the text after the last "Summary: " marker, or `fallback` if that is empty."""
    # split()[-1] yields the whole text when the marker is absent
    summary = text.split("Summary: ")[-1].strip()
    return summary or fallback


def compute_metrics(eval_preds):
    """
    Compute ROUGE between teacher-forced predictions and the reference text.

    Args:
        eval_preds: (predictions, labels) pair. Predictions are either logits
            [batch, seq_len, vocab_size] or token IDs [batch, seq_len]; labels
            are token IDs in which -100 marks positions to ignore.

    Returns:
        Dict with "rouge1", "rouge2" and "rougeL" scores (all 0.0 when the
        predictions do not have a usable shape).
    """
    predictions, labels = eval_preds

    # Convert logits [batch, seq_len, vocab_size] to token IDs
    if len(predictions.shape) == 3:
        predictions = np.argmax(predictions, axis=-1)

    # Ensure we have a 2D array of token IDs
    if len(predictions.shape) != 2:
        print(f"Warning: Unexpected prediction shape {predictions.shape}")
        return {"rouge1": 0.0, "rouge2": 0.0, "rougeL": 0.0}

    if predictions.shape == labels.shape:
        # A causal LM's output at position i is its guess for token i + 1, so drop
        # the last prediction and the first label to line the two sequences up.
        predictions = predictions[:, :-1]
        labels = labels[:, 1:]
        # Whatever the model emits at ignored positions is noise; blank it out
        # so it does not leak into the decoded prediction.
        predictions = np.where(labels == -100, tokenizer.pad_token_id, predictions)

    # Replace -100 in labels with pad_token_id so they can be decoded
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    # Decode to text
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Keep only the summary part of "Article: ... Summary: [summary]";
    # ROUGE needs non-empty strings, hence the placeholders.
    pred_summaries = [
        _extract_summary(pred, "No summary generated") for pred in decoded_preds
    ]
    label_summaries = [
        _extract_summary(label, "No summary available") for label in decoded_labels
    ]

    # Compute ROUGE scores
    result = rouge.compute(
        predictions=pred_summaries,
        references=label_summaries,
        use_stemmer=True
    )

    return {
        "rouge1": result["rouge1"],
        "rouge2": result["rouge2"],
        "rougeL": result["rougeL"]
    }

Downloading builder script: 0.00B [00:00, ?B/s]

In [None]:
# 6. TRAINING ARGUMENTS
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=2,  # Keep it low for quick training
    per_device_train_batch_size=4,  # Small batch size for Colab
    per_device_eval_batch_size=4,
    warmup_steps=50,
    logging_steps=1,
    eval_strategy="steps",
    eval_steps=10,
    save_strategy="steps",
    save_steps=200,  # Must stay a multiple of eval_steps for load_best_model_at_end
    load_best_model_at_end=True,
    metric_for_best_model="rouge1",
    greater_is_better=True,
    gradient_accumulation_steps=2,  # Effective batch size = 4 * 2 = 8
    learning_rate=5e-5,
    weight_decay=0.01,
    logging_first_step=True,
    remove_unused_columns=False,
    push_to_hub=False,
    # The string "none" disables wandb/tensorboard; the Python value None does not
    # (it falls back to the library default, which may enable every installed integration)
    report_to="none",
)

print("Training arguments configured!")

Training arguments configured!


In [None]:
!wandb login

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mgndp[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
# 7. CREATE TRAINER
# Wire the model, data, collator and metric function into one Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    data_collator=data_collator,
    # `tokenizer=` is deprecated on Trainer; `processing_class=` is its replacement
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

print("Trainer created!")

  trainer = Trainer(


Trainer created!


In [None]:
# 8. TRAINING
print("\n" + "="*50, "STARTING TRAINING", "="*50, sep="\n")

# Run the fine-tuning loop configured above
trainer.train()

print("\n" + "="*50, "TRAINING COMPLETED!", "="*50, sep="\n")


STARTING TRAINING


[34m[1mwandb[0m: Currently logged in as: [33mgndp[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel
10,3.4864,3.622026,0.244365,0.075139,0.178481
20,3.7855,3.524376,0.248416,0.076497,0.178334
30,3.7713,3.470621,0.246974,0.07466,0.176293
40,3.281,3.416454,0.248562,0.075814,0.178364
50,3.4406,3.374845,0.253725,0.0776,0.180926
60,3.2786,3.349658,0.250896,0.075903,0.180631
70,3.4347,3.331536,0.250157,0.077084,0.17904
80,3.1675,3.325768,0.251206,0.075611,0.1798
90,3.3294,3.317469,0.255491,0.080988,0.188032
100,3.2569,3.308981,0.27015,0.084676,0.201106


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].



TRAINING COMPLETED!


In [None]:
# 9. EVALUATION
print("\nEvaluating model...")
eval_results = trainer.evaluate()

# One "name: value" line per reported metric, four decimal places each
print("\nEvaluation Results:")
for metric_name, metric_value in eval_results.items():
    print(f"{metric_name}: {metric_value:.4f}")


Evaluating model...



Evaluation Results:
eval_loss: 3.3026
eval_rouge1: 0.3147
eval_rouge2: 0.1035
eval_rougeL: 0.2446
eval_runtime: 5.4586
eval_samples_per_second: 18.3200
eval_steps_per_second: 4.5800
epoch: 2.0000


In [None]:
# 10. SAVE MODEL
# Write the model weights and the tokenizer files into the same directory
for artifact in (model, tokenizer):
    artifact.save_pretrained("./fine-tuned-gpt2-summarizer")
print("\nModel saved to './fine-tuned-gpt2-summarizer'")


Model saved to './fine-tuned-gpt2-summarizer'


In [None]:
# 11. INFERENCE EXAMPLE
print("\n" + "="*50, "TESTING THE FINE-TUNED MODEL", "="*50, sep="\n")

# Run inference on the GPU when one is available, otherwise on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = model.to(device)  # Keep the model on the same device as its inputs

def generate_summary(article_text, max_length=100):
    """Generate a summary for an article with the fine-tuned model.

    Args:
        article_text: Raw article text; only the first 500 characters are used.
        max_length: Maximum number of tokens to generate after the prompt.

    Returns:
        The generated continuation of the prompt, stripped of surrounding
        whitespace.
    """
    input_text = f"Article: {article_text[:500]} Summary:"
    # Calling the tokenizer (instead of .encode) also returns the attention mask;
    # generate() warns that it cannot infer the mask when pad and EOS share an ID.
    encoded = tokenizer(input_text, return_tensors="pt", max_length=400, truncation=True)
    encoded = encoded.to(device)  # Move inputs to device
    prompt_length = encoded["input_ids"].shape[1]

    with torch.no_grad():
        outputs = model.generate(
            **encoded,
            max_new_tokens=max_length,  # same budget as prompt length + max_length
            num_return_sequences=1,
            temperature=0.7,
            pad_token_id=tokenizer.eos_token_id,
            do_sample=True,
            top_p=0.9
        )

    # Decode only the newly generated tokens. Splitting the full text on "Summary:"
    # goes wrong when the article or the generation contains that marker again.
    new_tokens = outputs[0][prompt_length:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

# Try the model on the first held-out example
sample = test_dataset[0]
test_article, original_summary = sample['article'], sample['highlights']

print("Original Article (first 300 chars):", test_article[:300] + "...\n", sep="\n")
print("Original Summary:", original_summary + "\n", sep="\n")

print("Generated Summary:")
generated_summary = generate_summary(test_article)
print(generated_summary)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.



TESTING THE FINE-TUNED MODEL
Original Article (first 300 chars):
(CNN)Share, and your gift will be multiplied. That may sound like an esoteric adage, but when Zully Broussard selflessly decided to give one of her kidneys to a stranger, her generosity paired up with big data. It resulted in six patients receiving transplants. That surprised and wowed her. "I thoug...

Original Summary:
Zully Broussard decided to give a kidney to a stranger .
A new computer program helped her donation spur transplants for six kidney patients .

Generated Summary:
Zully Broussard: "I can do everything I can to help her"
Zully Broussard is the first woman in the world to donate kidneys to a stranger .
She is the first woman in the world to donate kidneys to a stranger .
The transplant is the first to use human kidneys in the world .
A man named Zully Broussard, whose kidneys were donated to a stranger in Thailand .
The donor was born in San Francisco, California


In [None]:
# 12. COMPARISON WITH BASE MODEL
print("\n" + "="*50)
print("COMPARISON WITH BASE MODEL")
print("="*50)

# Load a fresh, untouched copy of the starting checkpoint for comparison.
# Reusing model_name keeps the baseline tied to the checkpoint that was fine-tuned.
base_model = GPT2LMHeadModel.from_pretrained(model_name)

def generate_summary_base(article_text, max_length=100):
    """Generate a summary for an article with the base (not fine-tuned) model.

    Args:
        article_text: Raw article text; only the first 500 characters are used.
        max_length: Maximum number of tokens to generate after the prompt.

    Returns:
        The generated continuation of the prompt, stripped of surrounding
        whitespace.
    """
    input_text = f"Article: {article_text[:500]} Summary:"
    # Calling the tokenizer (instead of .encode) also returns the attention mask,
    # which generate() needs because the pad token ID is the EOS token ID.
    encoded = tokenizer(input_text, return_tensors="pt", max_length=400, truncation=True)
    # Follow the model's device so this keeps working if base_model is moved
    encoded = encoded.to(base_model.device)
    prompt_length = encoded["input_ids"].shape[1]

    with torch.no_grad():
        outputs = base_model.generate(
            **encoded,
            max_new_tokens=max_length,  # same budget as prompt length + max_length
            num_return_sequences=1,
            temperature=0.7,
            pad_token_id=tokenizer.eos_token_id,
            do_sample=True,
            top_p=0.9
        )

    # Decode only the newly generated tokens. Splitting the full text on "Summary:"
    # goes wrong when the article or the generation contains that marker again.
    new_tokens = outputs[0][prompt_length:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

# Base model output is generated here; the other two texts already exist
print("Base Model Output:")
base_summary = generate_summary_base(test_article)
print(base_summary)

print("\nFine-tuned Model Output:", generated_summary, sep="\n")

print("\nOriginal Summary:", original_summary, sep="\n")

print("\n" + "="*50, "TUTORIAL COMPLETE!", "="*50, sep="\n")

# Closing recap of the first half of the tutorial
recap = """
SUMMARY OF WHAT WE DID:

1. ✅ Loaded DistilGPT2 (small, fast model)
2. ✅ Prepared CNN/DailyMail summarization dataset
3. ✅ Formatted data for instruction following
4. ✅ Set up Hugging Face Trainer
5. ✅ Fine-tuned the model (SFT)
6. ✅ Evaluated with ROUGE metrics
7. ✅ Compared base vs fine-tuned model

KEY CONCEPTS DEMONSTRATED:
- Supervised Fine-Tuning (SFT)
- Data formatting for instruction following
- Language modeling objective
- Evaluation metrics for summarization
- Model comparison

The model should now be better at following the summarization instruction format!
"""
print(recap)


COMPARISON WITH BASE MODEL
Base Model Output:
"I have a lot of friends who have a life extension, and I'm excited to see the results from this, because I think that they're going to be able to use this data to help them get to the point where they want to live."
Broussard said she is still looking for more information about the transplant, but the results are not clear.
She said she is not sure what the donation will be for her.
She also said she has

Fine-tuned Model Output:
NEW: Zully Broussard: "I'm so grateful for this one person who I don't know."
"I'm so grateful for this one person who I don't know," she says .
"I'm so grateful for this one person who I don't know," says Broussard .
"I'm so thankful for this one person who I don't know," says Broussard .
Broussard has been waiting for years to see whether

Original Summary:
Zully Broussard decided to give a kidney to a stranger .
A new computer program helped her donation spur transplants for six kidney patients .

TUTORIAL COM

# Part 2: DPO


In [None]:
print("\n" + "="*60, "PART 2: DIRECT PREFERENCE OPTIMIZATION (DPO)", "="*60, sep="\n")

# Outline of the preference-learning stage
dpo_overview = """
Now we'll use DPO to improve the model further by:
1. Using reference summaries as "preferred" responses
2. Using model-generated summaries as "rejected" responses
3. Training with preference learning to reduce repetition
"""
print(dpo_overview)

# Install TRL for DPO
!pip install trl -q

from trl import DPOTrainer, DPOConfig
from transformers import AutoTokenizer, AutoModelForCausalLM
import random

# 13. PREPARE DPO DATASET
print("\nPreparing DPO dataset...")

def create_dpo_dataset(dataset, model, tokenizer, num_samples=200):
    """
    Create a preference dataset:
    - Chosen: Reference summaries (high quality)
    - Rejected: Model-generated summaries (potentially repetitive)

    Args:
        dataset: Dataset whose rows have 'article' and 'highlights' fields.
        model: Causal LM used to generate the rejected summaries.
        tokenizer: Tokenizer matching `model`.
        num_samples: Number of leading rows to use; capped at len(dataset).

    Returns:
        A Dataset with "prompt", "chosen" and "rejected" columns, where
        "chosen" and "rejected" hold only the completion (not the prompt).
    """
    # Asking for more rows than exist would make select() fail
    num_samples = min(num_samples, len(dataset))
    dpo_data = []

    print(f"Generating model outputs for {num_samples} samples...")

    for i, example in enumerate(dataset.select(range(num_samples))):
        if i % 50 == 0:
            print(f"Processing sample {i}/{num_samples}")

        article = example['article']
        reference_summary = example['highlights']

        # Generate summary with current model (these will be "rejected")
        prompt = f"Article: {article[:500]} Summary:"
        # Calling the tokenizer (instead of .encode) also returns the attention mask;
        # the inputs must live on the same device as the model.
        encoded = tokenizer(prompt, return_tensors="pt", max_length=400, truncation=True)
        encoded = encoded.to(model.device)
        prompt_length = encoded["input_ids"].shape[1]

        with torch.no_grad():
            outputs = model.generate(
                **encoded,
                max_new_tokens=80,  # same budget as prompt length + 80
                num_return_sequences=1,
                temperature=0.8,  # Higher temperature for more variation
                pad_token_id=tokenizer.eos_token_id,
                do_sample=True,
                top_p=0.9,
                repetition_penalty=1.0  # No repetition penalty initially
            )

        # Decode only the newly generated tokens to get the model summary
        model_summary = tokenizer.decode(
            outputs[0][prompt_length:], skip_special_tokens=True
        ).strip()

        # Create DPO format. The prompt is stored once; chosen/rejected are pure
        # completions, otherwise the prompt would be duplicated when the trainer
        # joins prompt and completion. The leading space mirrors the
        # "Summary: <text>" layout used during fine-tuning.
        dpo_data.append({
            "prompt": prompt,
            "chosen": f" {reference_summary}",  # Reference summary (preferred)
            "rejected": f" {model_summary}",    # Model summary (to be discouraged)
        })

    return Dataset.from_list(dpo_data)

# Build the preference pairs: 200 from the training split, 50 from the held-out split
dpo_dataset = create_dpo_dataset(dataset, model, tokenizer, num_samples=200)
dpo_eval_dataset = create_dpo_dataset(test_dataset, model, tokenizer, num_samples=50)

print(
    f"DPO training dataset size: {len(dpo_dataset)}",
    f"DPO eval dataset size: {len(dpo_eval_dataset)}",
    sep="\n",
)

# Preview the first preference pair, trimmed to 100 characters per field
first_pair = dpo_dataset[0]
print("\nDPO Dataset Example:")
print("Prompt:", first_pair['prompt'][:100] + "...")
print("Chosen (Reference):", first_pair['chosen'].split("Summary:")[-1][:100] + "...")
print("Rejected (Model):", first_pair['rejected'].split("Summary:")[-1][:100] + "...")

# 14. SETUP DPO TRAINING
print("\nSetting up DPO training...")

# DPO compares the policy against a frozen reference model. The reference is the
# SFT checkpoint saved in step 10, i.e. the same weights the policy starts from.
reference_model = GPT2LMHeadModel.from_pretrained("./fine-tuned-gpt2-summarizer")
dpo_model = model  # Our fine-tuned model (same object; DPO trains it in place)

# DPO Configuration
dpo_config = DPOConfig(
    output_dir="./dpo_results",
    num_train_epochs=1,  # Quick training
    per_device_train_batch_size=2,  # Smaller batch for DPO
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,
    learning_rate=1e-6,  # Lower learning rate for DPO
    logging_steps=25,
    eval_steps=50,
    save_steps=100,
    warmup_steps=10,
    eval_strategy="steps",  # same spelling as the SFT TrainingArguments; `evaluation_strategy` is deprecated
    save_strategy="steps",
    load_best_model_at_end=True,
    remove_unused_columns=False,
    beta=0.1,  # DPO temperature parameter
)

# 15. CREATE DPO TRAINER
print("Creating DPO trainer...")

dpo_trainer = DPOTrainer(
    model=dpo_model,
    ref_model=reference_model,
    args=dpo_config,  # DPOTrainer takes its DPOConfig as `args`; it has no `config` parameter
    train_dataset=dpo_dataset,
    eval_dataset=dpo_eval_dataset,
    processing_class=tokenizer,  # current name of the former `tokenizer` argument
)

print("DPO Trainer created!")

# 16. DPO TRAINING
print("\n" + "="*50, "STARTING DPO TRAINING", "="*50, sep="\n")

# Run preference optimization
dpo_trainer.train()

print("\n" + "="*50, "DPO TRAINING COMPLETED!", "="*50, sep="\n")

# 17. SAVE DPO MODEL
# Model weights and tokenizer files go into the same directory
for artifact in (dpo_model, tokenizer):
    artifact.save_pretrained("./dpo-fine-tuned-gpt2-summarizer")
print("\nDPO model saved!")

# 18. COMPARE ALL THREE MODELS
print("\n" + "="*60)
print("COMPARING ALL THREE MODELS")
print("="*60)

# Load all models for comparison
base_model = GPT2LMHeadModel.from_pretrained(model_name)  # untouched starting checkpoint
# `model` and `dpo_model` are the same object, which DPO training has just updated,
# so the SFT baseline must be reloaded from the checkpoint saved in step 10.
# Aliasing `model` here would compare the DPO model with itself.
sft_model = GPT2LMHeadModel.from_pretrained("./fine-tuned-gpt2-summarizer")
# dpo_model already refers to the DPO-trained model; nothing to load

def _repetition_ratio(words):
    """Return the share of repeated words in `words` (0.0 for an empty list)."""
    if not words:
        # No words means nothing was repeated
        return 0.0
    return 1 - len(set(words)) / len(words)


def generate_and_analyze_summary(model, article_text, model_name, max_length=100):
    """Generate a summary with `model`, print it, and report word repetition.

    Args:
        model: Causal LM used for generation.
        article_text: Raw article text; only the first 500 characters are used.
        model_name: Label shown in the printed report.
        max_length: Maximum number of tokens to generate after the prompt.

    Returns:
        Tuple (summary, repetition_ratio). The ratio is
        1 - unique_words / total_words over whitespace-separated words,
        and 0.0 when the summary is empty.
    """
    input_text = f"Article: {article_text[:500]} Summary:"
    # Calling the tokenizer (instead of .encode) also returns the attention mask;
    # the inputs must live on the same device as the model being evaluated.
    encoded = tokenizer(input_text, return_tensors="pt", max_length=400, truncation=True)
    encoded = encoded.to(model.device)
    prompt_length = encoded["input_ids"].shape[1]

    # A model that has just been trained may still be in training mode;
    # generation should run without dropout.
    model.eval()
    with torch.no_grad():
        outputs = model.generate(
            **encoded,
            max_new_tokens=max_length,  # same budget as prompt length + max_length
            num_return_sequences=1,
            temperature=0.7,
            pad_token_id=tokenizer.eos_token_id,
            do_sample=True,
            top_p=0.9
        )

    # Decode only the newly generated tokens
    summary = tokenizer.decode(
        outputs[0][prompt_length:], skip_special_tokens=True
    ).strip()

    # Analyze repetition
    words = summary.split()
    unique_words = len(set(words))
    total_words = len(words)
    repetition_ratio = _repetition_ratio(words)

    print(f"\n{model_name} Model:")
    print(f"Summary: {summary}")
    print(f"Length: {total_words} words")
    print(f"Unique words: {unique_words}")
    print(f"Repetition ratio: {repetition_ratio:.3f}")

    return summary, repetition_ratio

# Reuse the first held-out example so all three models see the same article
comparison_sample = test_dataset[0]
test_article, original_summary = comparison_sample['article'], comparison_sample['highlights']

print("Original Article (first 300 chars):", test_article[:300] + "...\n", sep="\n")

print("Reference Summary:", original_summary, sep="\n")

# Run each model in turn: base, then SFT, then DPO
base_summary, base_rep = generate_and_analyze_summary(base_model, test_article, "Base")
sft_summary, sft_rep = generate_and_analyze_summary(sft_model, test_article, "SFT")
dpo_summary, dpo_rep = generate_and_analyze_summary(dpo_model, test_article, "DPO")

# 19. REPETITION ANALYSIS
print("\n" + "="*50, "REPETITION ANALYSIS SUMMARY", "="*50, sep="\n")

print(
    f"Base Model Repetition: {base_rep:.3f}",
    f"SFT Model Repetition: {sft_rep:.3f}",
    f"DPO Model Repetition: {dpo_rep:.3f}",
    sep="\n",
)

# DPO counts as a success only when its ratio is strictly below the SFT ratio
verdict = (
    "✅ DPO successfully reduced repetition!"
    if dpo_rep < sft_rep
    else "⚠️  DPO didn't reduce repetition (may need more training)"
)
print(verdict)

print("\n" + "="*60, "COMPLETE TUTORIAL FINISHED!", "="*60, sep="\n")

# Closing recap of the whole pipeline
pipeline_recap = """
FULL PIPELINE COMPLETED:

SUPERVISED FINE-TUNING (SFT):
✅ Fine-tuned base model on summarization task
✅ Improved instruction following
✅ Model learned summarization format

DIRECT PREFERENCE OPTIMIZATION (DPO):
✅ Created preference pairs (reference vs model outputs)
✅ Used contrastive learning to improve quality
✅ Reduced repetition and improved coherence

KEY CONCEPTS DEMONSTRATED:
- Supervised Fine-Tuning (SFT)
- Direct Preference Optimization (DPO)
- Preference learning vs instruction tuning
- Repetition analysis and quality metrics
- Complete RLHF pipeline simulation

ADVANCED TECHNIQUES SHOWN:
- Using model outputs as negative examples
- Reference summaries as positive examples
- Contrastive preference learning
- Multi-stage training pipeline
"""
print(pipeline_recap)