# Deep NLP Project - Phase 1

## Prompt-Based Abstractive Summarization with Semantic Coverage Control

### Phase 1: Data Loading & Semantic Extraction Pipeline

This notebook implements:
1. Dataset loading and exploration (CNN/DailyMail)
2. Ground truth coverage analysis
3. SigExt-based phrase extraction
4. Semantic grouping (WHO, WHAT, WHEN, WHERE, NUMERIC)
5. Improved WHAT extraction
6. Extraction statistics and gap analysis

In [None]:
# -*- coding: utf-8 -*-
"""
Deep NLP Project - Phase 1: Data Loading & Semantic Extraction Pipeline

This notebook implements:
1. Dataset loading and exploration (CNN/DailyMail)
2. Ground truth coverage analysis
3. SigExt-based phrase extraction
4. Semantic grouping (WHO, WHAT, WHEN, WHERE, NUMERIC)
5. Improved WHAT extraction with better verb phrase capture
6. Extraction statistics and gap analysis
"""


'\nDeep NLP Project - Phase 1: Data Loading & Semantic Extraction Pipeline\n\nThis notebook implements:\n1. Dataset loading and exploration (CNN/DailyMail)\n2. Ground truth coverage analysis\n3. SigExt-based phrase extraction\n4. Semantic grouping (WHO, WHAT, WHEN, WHERE, NUMERIC)\n5. Improved WHAT extraction with better verb phrase capture\n6. Extraction statistics and gap analysis\n'

## SETUP & DEPENDENCIES

In [None]:
!pip install -q datasets transformers spacy scikit-learn rouge-score tqdm
!python -m spacy download en_core_web_sm -q

import os
import getpass
import numpy as np
import json
import re
import statistics
from collections import defaultdict
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

os.makedirs('/content/data', exist_ok=True)


  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m55.2 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


## CONFIGURATION

In [None]:
SUBSET_SIZE = 200  # Number of samples to process

print("✅ Setup complete!")
print(f"   SUBSET_SIZE = {SUBSET_SIZE}")


✅ Setup complete!
   SUBSET_SIZE = 200


## PHASE 1.1: LOAD DATASET

In [None]:
from datasets import load_dataset

print("Loading CNN/DailyMail dataset...")
dataset = load_dataset("cnn_dailymail", "3.0.0")

samples = []
for ex in dataset['validation'].select(range(SUBSET_SIZE)):
    samples.append({
        'id': ex['id'],
        'article': ex['article'],
        'highlights': ex['highlights']
    })

with open('/content/data/validation_samples.json', 'w') as f:
    json.dump(samples, f)

print(f"✅ Loaded {len(samples)} samples")

# Dataset statistics
article_lengths = [len(s['article']) for s in samples]
highlight_lengths = [len(s['highlights']) for s in samples]

print(f"\n📊 Dataset Statistics:")
print(f"   Articles:   avg {statistics.mean(article_lengths):.0f} chars, "
      f"min {min(article_lengths)}, max {max(article_lengths)}")
print(f"   Highlights: avg {statistics.mean(highlight_lengths):.0f} chars, "
      f"min {min(highlight_lengths)}, max {max(highlight_lengths)}")


Loading CNN/DailyMail dataset...


README.md: 0.00B [00:00, ?B/s]

3.0.0/train-00000-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

3.0.0/train-00001-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

3.0.0/train-00002-of-00003.parquet:   0%|          | 0.00/259M [00:00<?, ?B/s]

3.0.0/validation-00000-of-00001.parquet:   0%|          | 0.00/34.7M [00:00<?, ?B/s]

3.0.0/test-00000-of-00001.parquet:   0%|          | 0.00/30.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

✅ Loaded 200 samples

📊 Dataset Statistics:
   Articles:   avg 3346 chars, min 534, max 9259
   Highlights: avg 193 chars, min 77, max 339


## PHASE 1.2: GROUND TRUTH COVERAGE ANALYSIS

In [None]:
print("\n" + "="*60)
print("GROUND TRUTH COVERAGE ANALYSIS")
print("="*60)

# Define patterns for each semantic category
PATTERNS = {
    'who': [
        re.compile(r'\b[A-Z][a-z]+\s+[A-Z][a-z]+\b'),  # Proper names
        re.compile(r'\b(president|ceo|minister|police|officials|doctor|judge)\b', re.I)
    ],
    'what': [
        re.compile(r'\b(said|announced|reported|killed|arrested|won|lost|died)\b', re.I),
        re.compile(r'\b(launched|signed|passed|approved|released|claimed)\b', re.I)
    ],
    'when': [
        re.compile(r'\b(monday|tuesday|wednesday|thursday|friday|saturday|sunday)\b', re.I),
        re.compile(r'\b\d{4}\b'),  # Years
        re.compile(r'\b(yesterday|today|last|next)\s+\w+', re.I)
    ],
    'where': [
        re.compile(r'\bin\s+[A-Z][a-z]+'),  # "in Location"
        re.compile(r'\b(city|country|hospital|court|school|building)\b', re.I)
    ],
    'numeric': [
        re.compile(r'\$[\d,]+'),  # Money
        re.compile(r'\b\d+%'),    # Percentages
        re.compile(r'\b\d{2,}\b') # Numbers with 2+ digits
    ]
}

def check_coverage(text):
    """Report, per semantic category, whether any of its regexes matches `text`.

    Returns a dict keyed by the category names of PATTERNS; each value is
    True when at least one compiled pattern of that category is found.
    """
    presence = {}
    for category, regexes in PATTERNS.items():
        presence[category] = any(rx.search(text) for rx in regexes)
    return presence

# Analyze ground truth summaries
# For every category, count the reference summaries (highlights) in which
# check_coverage finds at least one matching pattern.
coverage_counts = defaultdict(int)
for sample in samples:
    coverage = check_coverage(sample['highlights'])
    for cat, present in coverage.items():
        if present:
            coverage_counts[cat] += 1

print("\nCategory presence in REFERENCE summaries:\n")
# gt_analysis maps category -> percentage of samples containing that category.
gt_analysis = {}
for cat in ['who', 'what', 'when', 'where', 'numeric']:
    # NOTE: divides by len(samples); an empty sample list would raise ZeroDivisionError.
    pct = coverage_counts[cat] / len(samples) * 100
    gt_analysis[cat] = pct
    # One bar cell per 2 percentage points, so the bar is always 50 characters wide.
    bar = '█' * int(pct / 2) + '░' * (50 - int(pct / 2))
    print(f"  {cat.upper():<8} {bar} {pct:.1f}%")

# Persist the percentages alongside the other files written under /content/data.
with open('/content/data/ground_truth_analysis.json', 'w') as f:
    json.dump(gt_analysis, f, indent=2)

print("\n✅ Ground truth analysis saved")



GROUND TRUTH COVERAGE ANALYSIS

Category presence in REFERENCE summaries:

  WHO      ███████████████████████████████████████░░░░░░░░░░░ 79.0%
  WHAT     ██████████░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░ 20.0%
  WHEN     █████████████░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░ 27.5%
  WHERE    █████████████░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░ 26.0%
  NUMERIC  ███████████████████░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░ 38.5%

✅ Ground truth analysis saved


## PHASE 1.3: PHRASE EXTRACTION (SigExt Baseline)

In [None]:
print("\n" + "="*60)
print("PHRASE EXTRACTION (SigExt)")
print("="*60)

import spacy
nlp = spacy.load('en_core_web_sm')

# Entity types to extract
ENTITY_TYPES = {
    'PERSON', 'ORG', 'GPE', 'LOC', 'DATE', 'TIME',
    'MONEY', 'PERCENT', 'CARDINAL', 'NORP', 'EVENT'
}

def extract_phrases(text, doc_id):
    """Collect candidate key phrases from an article with spaCy.

    Only the first 10000 characters of `text` are parsed. Three passes run in
    order -- named entities (restricted to ENTITY_TYPES), noun chunks of two
    or more words, and "ROOT-verb lemma + direct object" pairs -- sharing one
    case-insensitive de-duplication set. The combined list is then cut to its
    first 30 entries.

    Returns:
        dict: {'doc_id': doc_id, 'phrases': [{'text', 'type', 'entity_label'}, ...]}
    """
    parsed = nlp(text[:10000])
    collected = []
    seen_keys = set()

    def _record(dedup_source, stored_text, kind, label=''):
        # Append the phrase unless its lower-cased form was already recorded.
        key = dedup_source.lower()
        if key in seen_keys:
            return
        seen_keys.add(key)
        collected.append({
            'text': stored_text,
            'type': kind,
            'entity_label': label
        })

    # Pass 1: named entities of the whitelisted types.
    for entity in parsed.ents:
        if entity.label_ not in ENTITY_TYPES:
            continue
        _record(entity.text, entity.text.strip(), 'entity', entity.label_)

    # Pass 2: multi-word noun chunks.
    for noun_chunk in parsed.noun_chunks:
        if len(noun_chunk.text.split()) < 2:
            continue
        _record(noun_chunk.text, noun_chunk.text.strip(), 'noun_phrase')

    # Pass 3: sentence-root verbs paired with each of their direct objects.
    for tok in parsed:
        if tok.pos_ != 'VERB' or tok.dep_ != 'ROOT':
            continue
        for dependent in tok.children:
            if dependent.dep_ != 'dobj':
                continue
            verb_phrase = f"{tok.lemma_} {dependent.text}"
            _record(verb_phrase, verb_phrase, 'verb_phrase')

    # Entities come first, so the 30-item cap mostly trims later passes.
    return {'doc_id': doc_id, 'phrases': collected[:30]}

print("\nExtracting phrases from articles...")
extracted = [extract_phrases(s['article'], s['id']) for s in tqdm(samples)]

with open('/content/data/extracted_phrases.json', 'w') as f:
    json.dump(extracted, f)

# Statistics
total_phrases = sum(len(e['phrases']) for e in extracted)
avg_phrases = total_phrases / len(extracted)
print(f"\n✅ Extracted {total_phrases} phrases from {len(extracted)} documents")
print(f"   Average: {avg_phrases:.1f} phrases/document")



PHRASE EXTRACTION (SigExt)

Extracting phrases from articles...


100%|██████████| 200/200 [00:24<00:00,  8.30it/s]


✅ Extracted 5959 phrases from 200 documents
   Average: 29.8 phrases/document





## PHASE 1.4: SEMANTIC GROUPING

In [None]:
print("\n" + "="*60)
print("SEMANTIC GROUPING")
print("="*60)

# Map entity labels to semantic categories
CAT_MAP = {
    'PERSON': 'who', 'ORG': 'who', 'NORP': 'who',
    'GPE': 'where', 'LOC': 'where', 'FAC': 'where',
    'DATE': 'when', 'TIME': 'when',
    'MONEY': 'numeric', 'PERCENT': 'numeric', 'CARDINAL': 'numeric',
    'EVENT': 'what'
}

def group_phrases(doc):
    """Sort a document's extracted phrases into semantic buckets.

    A phrase whose 'entity_label' appears in CAT_MAP goes to the mapped
    category; otherwise verb phrases go to 'what' and everything else to
    'other'. Every stored entry carries a fixed confidence of 0.85.

    Returns:
        dict: 'doc_id' plus one list per bucket
        ('who', 'what', 'when', 'where', 'numeric', 'other').
    """
    bucket_names = ('who', 'what', 'when', 'where', 'numeric', 'other')
    grouped = {'doc_id': doc['doc_id'], **{name: [] for name in bucket_names}}

    for phrase in doc['phrases']:
        entity_label = phrase.get('entity_label', '')
        if entity_label in CAT_MAP:
            bucket = CAT_MAP[entity_label]
        else:
            # No mapped entity label: fall back on the phrase type.
            bucket = 'what' if phrase['type'] == 'verb_phrase' else 'other'

        grouped[bucket].append({'text': phrase['text'], 'confidence': 0.85})

    return grouped

print("\nGrouping phrases into semantic categories...")
grouped_data = [group_phrases(doc) for doc in tqdm(extracted, desc="Grouping")]

with open('/content/data/grouped_phrases.json', 'w') as f:
    json.dump(grouped_data, f)

grouped_map = {g['doc_id']: g for g in grouped_data}
print(f"✅ Grouped {len(grouped_data)} documents")



SEMANTIC GROUPING

Grouping phrases into semantic categories...


Grouping: 100%|██████████| 200/200 [00:00<00:00, 35879.42it/s]

✅ Grouped 200 documents





## PHASE 1.5: IMPROVED WHAT EXTRACTION

In [None]:
print("\n" + "="*60)
print("IMPROVED WHAT EXTRACTION")
print("="*60)

def extract_and_group_improved(text, doc_id):
    """
    Improved extraction with better WHAT (verb/event) capture.
    Addresses the low WHAT extraction rate in baseline SigExt.

    Only the first 10000 characters of `text` are parsed. Named entities of
    any label are bucketed through CAT_MAP (unmapped labels go to 'other');
    WHAT is then filled from verb-based phrases and event-related noun chunks.
    All passes share one case-insensitive de-duplication set.

    Parameters:
    - text (str): article text
    - doc_id: identifier copied into the result

    Returns:
    - dict: 'doc_id' plus lists for 'who', 'what', 'when', 'where',
      'numeric' and 'other'; each entry is {'text': ...} and each list is
      capped at 10 entries.
    """
    doc = nlp(text[:10000])
    grouped = {
        'doc_id': doc_id,
        'who': [], 'what': [], 'when': [],
        'where': [], 'numeric': [], 'other': []
    }
    seen = set()

    # 1. Named Entity Recognition
    for ent in doc.ents:
        if ent.text.lower() not in seen:
            seen.add(ent.text.lower())
            cat = CAT_MAP.get(ent.label_, 'other')
            grouped[cat].append({'text': ent.text.strip()})

    # 2. IMPROVED: Better verb phrase extraction
    # Skip common light verbs that don't carry meaning
    LIGHT_VERBS = {'be', 'have', 'do', 'say', 'get', 'make', 'go', 'know', 'take', 'see'}

    for token in doc:
        if token.pos_ == 'VERB' and token.lemma_ not in LIGHT_VERBS:

            # Method A: Verb + Direct Object / Prepositional Object
            for child in token.children:
                if child.dep_ in ('dobj', 'pobj', 'attr'):
                    vp = f"{token.lemma_} {child.text}"
                    if vp.lower() not in seen and len(vp) > 5:
                        seen.add(vp.lower())
                        grouped['what'].append({'text': vp})

            # Method B: Verb + Particle (phrasal verbs)
            particles = [c for c in token.children if c.dep_ == 'prt']
            if particles:
                vp = f"{token.lemma_} {particles[0].text}"
                if vp.lower() not in seen:
                    seen.add(vp.lower())
                    grouped['what'].append({'text': vp})

            # Method C: Passive constructions
            if token.dep_ == 'ROOT' and any(c.dep_ == 'auxpass' for c in token.children):
                vp = token.lemma_
                if vp.lower() not in seen and len(vp) > 3:
                    seen.add(vp.lower())
                    grouped['what'].append({'text': vp})

    # 3. EVENT-related noun phrases
    EVENT_KEYWORDS = {
        'attack', 'election', 'investigation', 'trial', 'crash', 'shooting',
        'murder', 'death', 'fire', 'explosion', 'protest', 'vote', 'debate',
        'announcement', 'decision', 'agreement', 'deal', 'war', 'conflict'
    }

    for chunk in doc.noun_chunks:
        chunk_lower = chunk.text.lower()
        # Compare whole tokens (lemma or lower-cased surface form) against the
        # keyword set. A raw substring test would also fire on unrelated words
        # that merely contain a keyword, e.g. "war" in "software"/"award" or
        # "deal" in "dealer". Lemmas keep plurals such as "attacks" matching.
        is_event_chunk = any(
            tok.lemma_.lower() in EVENT_KEYWORDS or tok.lower_ in EVENT_KEYWORDS
            for tok in chunk
        )
        if is_event_chunk and chunk_lower not in seen:
            seen.add(chunk_lower)
            grouped['what'].append({'text': chunk.text.strip()})

    # Limit phrases per category
    for cat in grouped:
        if cat != 'doc_id':
            grouped[cat] = grouped[cat][:10]

    return grouped

print("\nRe-extracting with improved WHAT detection...")
grouped_data_improved = [
    extract_and_group_improved(s['article'], s['id'])
    for s in tqdm(samples)
]
grouped_map_improved = {g['doc_id']: g for g in grouped_data_improved}

with open('/content/data/grouped_phrases_improved.json', 'w') as f:
    json.dump(grouped_data_improved, f)

print(f"✅ Improved extraction complete")



IMPROVED WHAT EXTRACTION

Re-extracting with improved WHAT detection...


100%|██████████| 200/200 [00:21<00:00,  9.29it/s]

✅ Improved extraction complete





## PHASE 1.6: EXTRACTION STATISTICS & GAP ANALYSIS

In [None]:
print("\n" + "="*60)
print("EXTRACTION CATEGORY PRESENCE ANALYSIS")
print("="*60)

categories = ['who', 'what', 'when', 'where', 'numeric']

# Original extraction stats
extraction_stats = {}
for cat in categories:
    docs_with_cat = sum(1 for g in grouped_data if len(g.get(cat, [])) >= 1)
    pct = docs_with_cat / len(grouped_data) * 100
    extraction_stats[cat] = {
        'docs_with_extraction': docs_with_cat,
        'percentage': pct,
        'avg_phrases_per_doc': sum(len(g.get(cat, [])) for g in grouped_data) / len(grouped_data)
    }

print("\n📊 ORIGINAL Extraction (% of docs with ≥1 phrase):\n")
for cat in categories:
    pct = extraction_stats[cat]['percentage']
    avg = extraction_stats[cat]['avg_phrases_per_doc']
    bar = '█' * int(pct / 2) + '░' * (50 - int(pct / 2))
    print(f"  {cat.upper():<8} {bar} {pct:.1f}%  (avg: {avg:.1f}/doc)")

# Improved extraction stats
extraction_stats_improved = {}
for cat in categories:
    docs_with = sum(1 for g in grouped_data_improved if len(g.get(cat, [])) >= 1)
    extraction_stats_improved[cat] = {
        'percentage': docs_with / len(grouped_data_improved) * 100,
        'avg_phrases_per_doc': sum(len(g.get(cat, [])) for g in grouped_data_improved) / len(grouped_data_improved)
    }

print("\n📊 IMPROVED Extraction:\n")
for cat in categories:
    old_pct = extraction_stats[cat]['percentage']
    new_pct = extraction_stats_improved[cat]['percentage']
    change = new_pct - old_pct
    status = "✅" if change > 5 else ("⚠️" if change > 0 else "")
    print(f"  {cat.upper():<8}: {old_pct:.1f}% → {new_pct:.1f}% ({change:+.1f}%) {status}")

# Save stats
with open('/content/data/extraction_stats.json', 'w') as f:
    json.dump(extraction_stats, f, indent=2)
with open('/content/data/extraction_stats_improved.json', 'w') as f:
    json.dump(extraction_stats_improved, f, indent=2)

# Gap Analysis
print("\n" + "-"*60)
print("EXTRACTION GAP ANALYSIS:")
print("-"*60)

for cat in categories:
    pct = extraction_stats_improved[cat]['percentage']
    if pct < 50:
        print(f"  ⚠️  {cat.upper()}: Only {pct:.1f}% coverage - SIGNIFICANT GAP")
    elif pct < 80:
        print(f"  📊 {cat.upper()}: {pct:.1f}% coverage - moderate")
    else:
        print(f"  ✅ {cat.upper()}: {pct:.1f}% coverage - good")

print("\n✅ All extraction statistics saved")



EXTRACTION CATEGORY PRESENCE ANALYSIS

📊 ORIGINAL Extraction (% of docs with ≥1 phrase):

  WHO      █████████████████████████████████████████████████░ 99.5%  (avg: 12.1/doc)
  WHAT     █████████░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░ 18.0%  (avg: 0.2/doc)
  WHEN     █████████████████████████████████████████████████░ 98.0%  (avg: 5.9/doc)
  WHERE    ██████████████████████████████████████████████░░░░ 93.0%  (avg: 3.7/doc)
  NUMERIC  ████████████████████████████████████████████░░░░░░ 89.0%  (avg: 3.4/doc)

📊 IMPROVED Extraction:

  WHO     : 99.5% → 99.5% (+0.0%) 
  WHAT    : 18.0% → 100.0% (+82.0%) ✅
  WHEN    : 98.0% → 98.0% (+0.0%) 
  WHERE   : 93.0% → 93.5% (+0.5%) ⚠️
  NUMERIC : 89.0% → 90.0% (+1.0%) ⚠️

------------------------------------------------------------
EXTRACTION GAP ANALYSIS:
------------------------------------------------------------
  ✅ WHO: 99.5% coverage - good
  ✅ WHAT: 100.0% coverage - good
  ✅ WHEN: 98.0% coverage - good
  ✅ WHERE: 93.5% coverage - good
  ✅ 

## PHASE 1 SUMMARY

In [None]:
print("\n" + "="*60)
print("PHASE 1 COMPLETE - SUMMARY")
print("="*60)

print(f"""
📁 Files Generated:
   • validation_samples.json      - {len(samples)} articles
   • ground_truth_analysis.json   - Reference coverage stats
   • extracted_phrases.json       - SigExt baseline extraction
   • grouped_phrases.json         - Semantic grouping (original)
   • grouped_phrases_improved.json - Semantic grouping (improved)
   • extraction_stats.json        - Original extraction rates
   • extraction_stats_improved.json - Improved extraction rates

📊 Key Findings:
   • Dataset: {len(samples)} CNN/DailyMail validation samples
   • WHAT extraction improved: {extraction_stats['what']['percentage']:.1f}% → {extraction_stats_improved['what']['percentage']:.1f}%
   • All categories now have good extraction coverage

🔜 Next Steps (Phase 2):
   • Build coverage-aware prompts
   • Generate summaries with GPT-3.5 and BART
   • Evaluate with ROUGE and beyond-ROUGE metrics
""")

# Download data
!cd /content && zip -r phase1_results.zip data/
from google.colab import files
files.download('/content/phase1_results.zip')

print("✅ Phase 1 data downloaded!")



PHASE 1 COMPLETE - SUMMARY

📁 Files Generated:
   • validation_samples.json      - 200 articles
   • ground_truth_analysis.json   - Reference coverage stats
   • extracted_phrases.json       - SigExt baseline extraction
   • grouped_phrases.json         - Semantic grouping (original)
   • grouped_phrases_improved.json - Semantic grouping (improved)
   • extraction_stats.json        - Original extraction rates
   • extraction_stats_improved.json - Improved extraction rates

📊 Key Findings:
   • Dataset: 200 CNN/DailyMail validation samples
   • WHAT extraction improved: 18.0% → 100.0%
   • All categories now have good extraction coverage

🔜 Next Steps (Phase 2):
   • Build coverage-aware prompts
   • Generate summaries with GPT-3.5 and BART
   • Evaluate with ROUGE and beyond-ROUGE metrics

  adding: data/ (stored 0%)
  adding: data/ground_truth_analysis.json (deflated 38%)
  adding: data/grouped_phrases.json (deflated 84%)
  adding: data/extraction_stats_improved.json (deflated 66%)
 

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

✅ Phase 1 data downloaded!


# Phase 2 — Coverage-Aware Summarization & Evaluation
Goal:
1. Build coverage-aware prompts using SigExt signals
2. Generate summaries with GPT-3.5 and BART
3. Evaluate summaries with ROUGE and beyond-ROUGE metrics

## PHASE 2.1: BUILDING COVERAGE-AWARE PROMPTS

In [None]:
def build_coverage_prompt(article, signals, max_signals=8, max_article_chars=3500):
    """
    Build a coverage-aware summarization prompt from extracted signals.

    Parameters:
    - article (str): article text
    - signals (dict): category -> list of {'text': ...} entries; missing
      categories are treated as empty
    - max_signals (int): maximum number of phrases listed per category
    - max_article_chars (int or None): number of leading article characters
      appended to the prompt (default 3500, the previous fixed limit);
      None appends the whole article

    Returns:
    - str: instruction line, a "Key facts to cover" section with one bullet
      per non-empty category, then the (truncated) article
    """
    prompt = "Summarize the following news article.\n\n"

    prompt += "Key facts to cover:\n"
    for cat in ['who', 'what', 'when', 'where', 'numeric']:
        phrases = [p['text'] for p in signals.get(cat, [])[:max_signals]]
        if phrases:
            prompt += f"- {cat.upper()}: {', '.join(phrases)}\n"

    prompt += "\nArticle:\n"
    prompt += article[:max_article_chars]  # safety limit on prompt size

    return prompt

In [None]:
example = samples[0]
signals = grouped_map_improved[example['id']]
print(build_coverage_prompt(example['article'], signals))

Summarize the following news article.

Key facts to cover:
- WHO: Zully Broussard, Broussard, CNN, KGO, California Pacific Medical Center, David Jacobs, Jacobs, MatchGrid
- WHAT: give one, pair up, receive transplants, wow her, help person, tell KGO, verify authenticity, multiply gift
- WHEN: 70, Friday, late March, years, Thursday, a few years ago, about three weeks
- WHERE: San Francisco
- NUMERIC: six, five, more than 40, 12, 26, three, one, two

Article:
(CNN)Share, and your gift will be multiplied. That may sound like an esoteric adage, but when Zully Broussard selflessly decided to give one of her kidneys to a stranger, her generosity paired up with big data. It resulted in six patients receiving transplants. That surprised and wowed her. "I thought I was going to help this one person who I don't know, but the fact that so many people can have a life extension, that's pretty big," Broussard told CNN affiliate KGO. She may feel guided in her generosity by a higher power. "Thanks f

## PHASE 2.2: SUMMARY GENERATION
### GPT-3.5

In [None]:
import openai
# Never store the key in the notebook. Reuse a key already exported in the
# environment; otherwise prompt for it without echoing. (Assigning '' here
# would overwrite a valid exported key and make every API call fail.)
if not os.environ.get('OPENAI_API_KEY'):
    os.environ['OPENAI_API_KEY'] = getpass.getpass('OpenAI API key: ')
openai.api_key = os.environ['OPENAI_API_KEY']

def generate_gpt35(prompt, max_tokens=300, temperature=0.7):
    """
    Ask gpt-3.5-turbo (OpenAI chat completions API) to summarize `prompt`.

    Parameters:
    - prompt (str): text sent as the user message
    - max_tokens (int): upper bound on generated tokens
    - temperature (float): sampling temperature

    Returns:
    - str: content of the first returned choice, stripped of surrounding whitespace
    """
    chat_messages = [
        {"role": "system", "content": "You are a helpful assistant that summarizes news articles."},
        {"role": "user", "content": prompt},
    ]

    completion = openai.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=chat_messages,
        max_tokens=max_tokens,
        temperature=temperature,
    )

    first_choice = completion.choices[0]
    return first_choice.message.content.strip()

In [None]:
gpt_summaries = {}

for i, s in enumerate(samples[:2], start=1):  # only the first 2 articles
    print("\n" + "="*80)
    print(f"TEST ARTICLE {i} | ID: {s['id']}")
    print("="*80)

    prompt = build_coverage_prompt(
        s['article'],
        grouped_map_improved[s['id']]
    )

    print("\n--- PROMPT ---\n")
    print(prompt[:1500])  # 1500 chars
    print("\n--- GENERATING SUMMARY ---\n")

    summary = generate_gpt35(prompt)
    gpt_summaries[s['id']] = summary

    print("--- SUMMARY ---\n")
    print(summary)


TEST ARTICLE 1 | ID: a4942dd663020ca54575471657a0af38d82897d6

--- PROMPT ---

Summarize the following news article.

Key facts to cover:
- WHO: Zully Broussard, Broussard, CNN, KGO, California Pacific Medical Center, David Jacobs, Jacobs, MatchGrid
- WHAT: give one, pair up, receive transplants, wow her, help person, tell KGO, verify authenticity, multiply gift
- WHEN: 70, Friday, late March, years, Thursday, a few years ago, about three weeks
- WHERE: San Francisco
- NUMERIC: six, five, more than 40, 12, 26, three, one, two

Article:
(CNN)Share, and your gift will be multiplied. That may sound like an esoteric adage, but when Zully Broussard selflessly decided to give one of her kidneys to a stranger, her generosity paired up with big data. It resulted in six patients receiving transplants. That surprised and wowed her. "I thought I was going to help this one person who I don't know, but the fact that so many people can have a life extension, that's pretty big," Broussard told CNN a

### BART

In [None]:
def build_bart_prefix(grouped, max_per_cat=8):
    """
    Render extracted phrases as a "Key facts:" header to prepend to an article.

    For each of WHO/WHAT/WHEN/WHERE/NUMERIC, at most `max_per_cat` phrase
    texts are joined with "; " on their own line; categories with no phrases
    are omitted. The result always ends with a blank line.
    """
    prefix = "Key facts:"

    for category in ('who', 'what', 'when', 'where', 'numeric'):
        texts = [entry['text'] for entry in grouped.get(category, [])[:max_per_cat]]
        if not texts:
            continue
        prefix += "\n" + category.upper() + ": " + "; ".join(texts)

    return prefix + "\n\n"

In [None]:
from transformers import BartTokenizer, BartForConditionalGeneration
tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")

def generate_bart_prefix_guided(article, grouped, max_length=300):
    """
    Summarize `article` with BART, feeding it the key-facts prefix first.

    The model input is the output of build_bart_prefix(grouped), then
    "Article:\\n", then the article; it is tokenized with truncation at 1024
    tokens and decoded from a 4-beam search of at most `max_length` tokens.

    Returns:
    - str: decoded summary with special tokens removed
    """
    model_input = build_bart_prefix(grouped) + "Article:\n" + article

    encoded = tokenizer(
        model_input,
        return_tensors="pt",
        truncation=True,
        max_length=1024
    )

    generated = model.generate(
        encoded["input_ids"],
        max_length=max_length,
        num_beams=4,
        early_stopping=True
    )

    best_sequence = generated[0]
    return tokenizer.decode(best_sequence, skip_special_tokens=True)



vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

In [None]:
# Summaries produced by BART, keyed by article id (read later by the evaluation cells).
bart_summaries = {}

for i, s in enumerate(samples[:2], start=1):  # first 2 articles
    print("\n" + "="*80)
    print(f"BART PREFIX-GUIDED | ARTICLE {i} | ID: {s['id']}")
    print("="*80)

    # input build
    # Rebuilt here only for display: generate_bart_prefix_guided assembles the
    # same prefix + "Article:\n" + article string internally.
    prefix = build_bart_prefix(grouped_map_improved[s['id']])
    bart_input = prefix + "Article:\n" + s['article']


    print("\n--- BART INPUT (PREFIX + ARTICLE) ---\n")
    print(bart_input[:1500])  # print only the first 1500 characters for readability
    if len(bart_input) > 1500:
        print("\n... [truncated] ...\n")

    # summary
    summary = generate_bart_prefix_guided(
        s['article'],
        grouped_map_improved[s['id']]
    )
    bart_summaries[s['id']] = summary

    print("\n--- BART SUMMARY ---\n")
    print(summary)


BART PREFIX-GUIDED | ARTICLE 1 | ID: a4942dd663020ca54575471657a0af38d82897d6

--- BART INPUT (PREFIX + ARTICLE) ---

Key facts:
WHO: Zully Broussard; Broussard; CNN; KGO; California Pacific Medical Center; David Jacobs; Jacobs; MatchGrid
WHAT: give one; pair up; receive transplants; wow her; help person; tell KGO; verify authenticity; multiply gift
WHEN: 70; Friday; late March; years; Thursday; a few years ago; about three weeks
WHERE: San Francisco
NUMERIC: six; five; more than 40; 12; 26; three; one; two

Article:
(CNN)Share, and your gift will be multiplied. That may sound like an esoteric adage, but when Zully Broussard selflessly decided to give one of her kidneys to a stranger, her generosity paired up with big data. It resulted in six patients receiving transplants. That surprised and wowed her. "I thought I was going to help this one person who I don't know, but the fact that so many people can have a life extension, that's pretty big," Broussard told CNN affiliate KGO. She m

## PHASE 2.3: EVALUATION
### ROUGE

In [None]:
from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(
    ['rouge1', 'rouge2', 'rougeL'],
    use_stemmer=True
)

def rouge_scores(hyps, refs, n_docs=None):
    """
    Average ROUGE F-measures over paired hypotheses and references.

    Parameters:
    - hyps (list[str]): generated summaries
    - refs (list[str]): reference summaries, aligned with `hyps`
    - n_docs (int or None): if given, only the first `n_docs` pairs are scored

    Returns:
    - dict: metric name -> mean F-measure across the scored pairs

    Raises:
    - AssertionError: if the (possibly sliced) lists differ in length
    """
    if n_docs is not None:
        hyps, refs = hyps[:n_docs], refs[:n_docs]

    assert len(hyps) == len(refs), "Mismatch between hypotheses and references"

    per_metric = defaultdict(list)

    for hypothesis, reference in zip(hyps, refs):
        # The scorer takes the reference (target) first, then the prediction.
        for metric, value in scorer.score(reference, hypothesis).items():
            per_metric[metric].append(value.fmeasure)

    return {metric: np.mean(values) for metric, values in per_metric.items()}

In [None]:
hyps_gpt = [gpt_summaries[s['id']] for s in samples[:2]]
hyps_bart = [bart_summaries[s['id']] for s in samples[:2]]
refs = [s['highlights'] for s in samples[:2]]

rouge_gpt = rouge_scores(hyps_gpt, refs)
rouge_bart = rouge_scores(hyps_bart, refs)

print("\n" + "="*50)
print("ROUGE RESULTS (2 DOCUMENTS)")
print("="*50)

for k in ['rouge1', 'rouge2', 'rougeL']:
    g = rouge_gpt.get(k, 0.0)
    b = rouge_bart.get(k, 0.0)
    print(f"{k.upper():<8} | GPT: {g:.4f} | BART: {b:.4f}")

print("="*50)


ROUGE RESULTS (2 DOCUMENTS)
ROUGE1   | GPT: 0.1580 | BART: 0.2758
ROUGE2   | GPT: 0.0456 | BART: 0.0556
ROUGEL   | GPT: 0.1130 | BART: 0.2012


### BEYOND-ROUGE - SEMANTIC COVERAGE

In [None]:
!pip install rapidfuzz
from rapidfuzz import fuzz

def normalize(text):
    """
    Canonicalize text for matching.

    Lower-cases the input, removes every character that is not a-z, 0-9 or
    whitespace, then collapses whitespace runs to a single space and trims
    both ends.

    Returns:
    - str: normalized text (possibly empty)
    """
    text = text.lower()
    text = re.sub(r"[^a-z0-9\s]", "", text)
    # The collapsed/trimmed string must be returned: re.sub does not modify
    # `text` in place, so discarding its result leaves the whitespace untouched.
    return re.sub(r"\s+", " ", text).strip()

def semantic_coverage(summary, article_signals,
                      fuzzy_threshold=70,
                      min_keyword_overlap=0.5):
    """
    Fraction of extracted signals, per category, that appear in a summary.

    A signal counts as covered when, after normalization, any of these holds:
    - exact match: the signal is a substring of the summary
    - fuzzy match: fuzz.partial_ratio(signal, summary) >= fuzzy_threshold
    - keyword overlap (for multi-word signals, e.g. WHAT): at least
      `min_keyword_overlap` of the signal's distinct tokens occur in the summary

    Parameters:
    - summary (str): generated summary
    - article_signals (dict): category -> list of {'text': ...} entries
    - fuzzy_threshold (int): minimum partial_ratio score for a fuzzy match
    - min_keyword_overlap (float): minimum token-overlap ratio

    Returns:
    - dict: category -> covered fraction in [0, 1], or None when the
      category has no usable signals
    """
    summary_norm = normalize(summary)
    # Loop-invariant: tokenize the summary once instead of once per signal.
    summary_tokens = set(summary_norm.split())
    coverage = {}

    for cat in ['who', 'what', 'when', 'where', 'numeric']:
        signals = [normalize(p['text']) for p in article_signals.get(cat, [])]
        # Drop signals that normalize to nothing (e.g. pure punctuation): an
        # empty string is a substring of every summary and would always be
        # counted as covered, inflating the score.
        signals = [sig for sig in signals if sig.strip()]

        if not signals:
            coverage[cat] = None
            continue

        matched = 0

        for sig in signals:
            # 1. Exact match
            if sig in summary_norm:
                matched += 1
                continue

            # 2. Fuzzy match
            if fuzz.partial_ratio(sig, summary_norm) >= fuzzy_threshold:
                matched += 1
                continue

            # 3. Keyword overlap (especially for WHAT)
            sig_tokens = set(sig.split())
            if len(sig_tokens) >= 2:
                overlap = len(sig_tokens & summary_tokens) / len(sig_tokens)
                if overlap >= min_keyword_overlap:
                    matched += 1

        coverage[cat] = matched / len(signals)

    return coverage



In [None]:
coverage_gpt = []
coverage_bart = []

for i, s in enumerate(samples[:2], start=1):  # first 2 samples
    doc_id = s['id']
    sig = grouped_map_improved[doc_id]

    print(f"\n=== Documento {i} | ID: {doc_id} ===")

    # GPT
    gpt_cov = semantic_coverage(gpt_summaries[doc_id], sig)
    coverage_gpt.append(gpt_cov)
    print("GPT semantic coverage:")
    for k, v in gpt_cov.items():
        print(f"  - {k.upper()}: {v}")

    # BART
    bart_cov = semantic_coverage(bart_summaries[doc_id], sig)
    coverage_bart.append(bart_cov)
    print("BART semantic coverage:")
    for k, v in bart_cov.items():
        print(f"  - {k.upper()}: {v}")


=== Documento 1 | ID: a4942dd663020ca54575471657a0af38d82897d6 ===
GPT semantic coverage:
  - WHO: 0.6
  - WHAT: 0.6
  - WHEN: 0.42857142857142855
  - WHERE: 1.0
  - NUMERIC: 0.5
BART semantic coverage:
  - WHO: 0.2
  - WHAT: 0.7
  - WHEN: 0.14285714285714285
  - WHERE: 0.0
  - NUMERIC: 0.2

=== Documento 2 | ID: 4157bc4da185971e2742f349d69a037343bc0d95 ===
GPT semantic coverage:
  - WHO: 0.4
  - WHAT: 0.4
  - WHEN: 0.8
  - WHERE: 0.9
  - NUMERIC: 0.7
BART semantic coverage:
  - WHO: 0.4
  - WHAT: 0.0
  - WHEN: 0.7
  - WHERE: 0.3
  - NUMERIC: 0.7


### BEYOND-ROUGE - HALLUCINATION RATE

In [None]:
def is_supported(claim, article_signals, fuzzy_threshold=70):
    """
    Check whether a claim in the summary is supported by article signals.

    The claim is supported as soon as one normalized signal contains it, is
    contained in it, or reaches `fuzzy_threshold` under fuzz.partial_ratio.

    Parameters:
    - claim (str): one summary sentence
    - article_signals (dict): category -> list of {'text': ...} entries
    - fuzzy_threshold (int): minimum partial_ratio score for a fuzzy match

    Returns:
    - bool: True if any non-blank signal supports the claim, else False
    """
    claim_norm = normalize(claim)

    for cat in ['who', 'what', 'when', 'where', 'numeric']:
        for p in article_signals.get(cat, []):
            sig = normalize(p['text'])

            # A signal that normalizes to nothing (e.g. pure punctuation) is a
            # substring of every claim; skip it so it cannot mark every
            # sentence as supported.
            if not sig.strip():
                continue

            # Exact match (containment in either direction)
            if sig in claim_norm or claim_norm in sig:
                return True

            # Fuzzy match
            if fuzz.partial_ratio(sig, claim_norm) >= fuzzy_threshold:
                return True

    return False

In [None]:
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
from nltk.tokenize import sent_tokenize

def hallucination_rate(summary, article_signals):
    """
    Share of summary sentences that is_supported does not accept.

    Returns:
    - tuple: (rate, unsupported_count, sentence_count); (0.0, 0, 0) when the
      summary splits into no sentences
    """
    sentences = sent_tokenize(summary)
    total = len(sentences)
    if total == 0:
        return 0.0, 0, 0

    unsupported = sum(
        1 for sentence in sentences
        if not is_supported(sentence, article_signals)
    )

    return unsupported / total, unsupported, total

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [None]:
print("\n" + "="*60)
print("HALLUCINATION RATE ANALYSIS (FIRST 2 DOCUMENTS)")
print("="*60)

for i, s in enumerate(samples[:2], start=1):
    doc_id = s['id']
    signals = grouped_map_improved[doc_id]

    print(f"\n=== Article {i} | ID: {doc_id} ===")

    # GPT
    rate_gpt, unsup_gpt, total_gpt = hallucination_rate(
        gpt_summaries[doc_id], signals
    )
    print(f"GPT hallucination rate: {rate_gpt:.2f} ({unsup_gpt}/{total_gpt} sentences)")

    # BART
    rate_bart, unsup_bart, total_bart = hallucination_rate(
        bart_summaries[doc_id], signals
    )
    print(f"BART hallucination rate: {rate_bart:.2f} ({unsup_bart}/{total_bart} sentences)")


HALLUCINATION RATE ANALYSIS (FIRST 2 DOCUMENTS)

=== Article 1 | ID: a4942dd663020ca54575471657a0af38d82897d6 ===
GPT hallucination rate: 0.00 (0/6 sentences)
BART hallucination rate: 0.00 (0/5 sentences)

=== Article 2 | ID: 4157bc4da185971e2742f349d69a037343bc0d95 ===
GPT hallucination rate: 0.00 (0/6 sentences)
BART hallucination rate: 0.00 (0/4 sentences)
