<a href="https://colab.research.google.com/github/faheembadar26272-netizen/AI-Summary-Generation-Implementation/blob/main/AAI_Implementation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!pip install rank-bm25 transformers torch

Collecting rank-bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank-bm25
Successfully installed rank-bm25-0.2.2


In [4]:
"""
Baseline Prototype: Search + Retrieval-Augmented Summarization + Simple Factuality Check

This script implements a minimal version of the idea discussed in the report:
- Retrieve passages using BM25
- Generate a summary using a small T5 model
- Compute a simple factuality / grounding score (token overlap)
- Include a basic freshness signal from document dates

Dependencies:
    pip install rank-bm25 transformers torch sentencepiece

This is only a small prototype, not an optimized production system.
"""

import math
from datetime import datetime
from dataclasses import dataclass
from typing import List, Dict, Tuple

from rank_bm25 import BM25Okapi
from transformers import T5Tokenizer, T5ForConditionalGeneration


# -------------------------
# Data structures
# -------------------------

@dataclass
class Document:
    """One corpus entry: an identifier, a title, body text and a publication date."""

    doc_id: str     # identifier string, e.g. "doc1"
    title: str      # indexed for retrieval and fed to the summarizer with the content
    content: str    # body text
    published: str  # ISO format "YYYY-MM-DD"


# -------------------------
# Toy corpus (you can replace with your own)
# -------------------------

# Three short in-memory documents (LLMs in search, factuality, freshness).
# Each entry carries a title, one paragraph of content and an ISO publication
# date; the demo at the bottom of this cell builds the pipeline from this list.
CORPUS: List[Document] = [
    Document(
        doc_id="doc1",
        title="Large Language Models in Web Search",
        content=(
            "Large language models are increasingly used in web search engines to generate "
            "natural language answers. They are often combined with retrieval systems to form "
            "retrieval-augmented generation pipelines. A key challenge is factual accuracy, "
            "since models may hallucinate information not present in the retrieved documents."
        ),
        published="2024-03-15",
    ),
    Document(
        doc_id="doc2",
        title="Factuality and Hallucinations in Text Generation",
        content=(
            "Factuality in text generation refers to how well the generated statements are "
            "supported by evidence. Evaluation methods often rely on reference documents or "
            "question answering over the output. Hallucinations occur when a model produces "
            "plausible but unsupported claims. Reducing hallucinations is important for "
            "trustworthy AI systems."
        ),
        published="2023-11-02",
    ),
    Document(
        doc_id="doc3",
        title="Freshness and Recency in Search Results",
        content=(
            "Search engines rank documents not only by relevance but also by freshness. "
            "Freshness can be estimated from publication dates or update times. For some "
            "topics, such as health or finance, outdated information may be unsafe or misleading. "
            "Combining recency signals with retrieval-augmented generation can improve answer quality."
        ),
        published="2024-05-20",
    ),
]


# -------------------------
# Retrieval component (BM25)
# -------------------------

class BM25Retriever:
    """BM25 ranking over an in-memory list of documents.

    Every document is indexed on its title plus its content. Text is
    lower-cased and split into word tokens, so punctuation attached to a
    word (for example the "?" in "models?") does not prevent a match.
    """

    def __init__(self, documents: List["Document"]):
        """Tokenize all documents and build the BM25 index.

        Args:
            documents: Documents to index; their order is kept and is the
                order the BM25 scores refer to.
        """
        self.documents = documents
        self.tokenized_docs = [self._tokenize(d.title + " " + d.content) for d in documents]
        self.bm25 = BM25Okapi(self.tokenized_docs)

    @staticmethod
    def _tokenize(text: str) -> List[str]:
        """Return the lower-cased word tokens of ``text`` with punctuation removed."""
        import re  # local import: `re` is not part of the file's import block

        return re.findall(r"\w+", text.lower())

    def retrieve(self, query: str, k: int = 3) -> List[Tuple["Document", float]]:
        """Return up to ``k`` ``(document, score)`` pairs, highest score first.

        Args:
            query: Free-text query; tokenized the same way as the documents.
            k: Maximum number of results. Values <= 0 yield an empty list.

        Returns:
            The best-scoring documents paired with their BM25 scores.
        """
        if k <= 0:
            # A negative k would otherwise slice from the end of the ranking
            # and silently drop the worst hits instead of limiting the count.
            return []
        scores = self.bm25.get_scores(self._tokenize(query))
        ranked = sorted(zip(self.documents, scores), key=lambda pair: pair[1], reverse=True)
        return ranked[:k]


# -------------------------
# Summarization component (T5)
# -------------------------

class T5Summarizer:
    """Abstractive summarizer backed by a T5 checkpoint."""

    def __init__(self, model_name: str = "t5-small", device: str = "cpu"):
        """Load the tokenizer and model and move the model to ``device``.

        Args:
            model_name: Checkpoint name passed to ``from_pretrained``.
            device: Device string the model and inputs are moved to.
        """
        self.tokenizer = T5Tokenizer.from_pretrained(model_name)
        self.model = T5ForConditionalGeneration.from_pretrained(model_name)
        self.device = device
        self.model.to(self.device)

    def summarize(self, text: str, max_input_tokens: int = 512,
                  max_output_tokens: int = 120) -> str:
        """Summarize ``text`` with beam search (4 beams).

        Args:
            text: Text to summarize.
            max_input_tokens: Inputs longer than this many tokens are truncated.
            max_output_tokens: Passed to ``generate`` as ``max_length``.

        Returns:
            The decoded summary, or ``""`` when ``text`` is empty or only
            whitespace (there is nothing to summarize, and the model would
            otherwise generate from the bare task prefix).
        """
        if not text or not text.strip():
            return ""

        # prepend "summarize:" for T5
        input_text = "summarize: " + text
        encoded = self.tokenizer(
            input_text,
            return_tensors="pt",
            truncation=True,
            max_length=max_input_tokens,
        )
        encoded = {name: tensor.to(self.device) for name, tensor in encoded.items()}

        # Forward every tokenizer output (input ids and attention mask), not
        # just the ids, so generation does not have to guess the mask.
        output_ids = self.model.generate(
            **encoded,
            max_length=max_output_tokens,
            num_beams=4,
            early_stopping=True,
        )
        return self.tokenizer.decode(output_ids[0], skip_special_tokens=True)


# -------------------------
# Simple factuality and freshness scoring
# -------------------------

def simple_factuality_score(summary: str, docs: List["Document"]) -> float:
    """Return the fraction of summary word tokens that occur in the evidence.

    Rough grounding score based on token overlap; only an approximate signal,
    not a proper metric.

    Both sides are lower-cased and split into word tokens, so trailing
    punctuation ("documents." vs "documents") does not count as a mismatch.
    The evidence is each document's title plus its content, because the
    summarizer input in this file is built from both.

    Args:
        summary: Generated summary text.
        docs: Documents used as evidence.

    Returns:
        ``overlap / number_of_summary_tokens`` in [0, 1]; 0.0 when the
        summary contains no word tokens.
    """
    import re  # local import: `re` is not part of the file's import block

    def words(text: str) -> List[str]:
        return re.findall(r"\w+", text.lower())

    sum_tokens = words(summary)
    if not sum_tokens:
        return 0.0

    evidence_tokens = set()
    for d in docs:
        evidence_tokens.update(words(d.title))
        evidence_tokens.update(words(d.content))

    overlap = sum(1 for t in sum_tokens if t in evidence_tokens)
    return overlap / len(sum_tokens)


def freshness_score(docs: List["Document"], now: "datetime | None" = None) -> float:
    """Return the mean recency score of ``docs``.

    Each document scores ``1 / (1 + age_days / 30)``: 1.0 when published on
    the reference day (or later), 0.5 at 30 days, and decreasing after that.
    A document whose ``published`` value cannot be parsed as an ISO date
    scores 0.0.

    Args:
        docs: Documents whose ``published`` field holds an ISO date string.
        now: Reference time for the age computation. Defaults to
            ``datetime.now()``; pass a fixed value for reproducible scores.

    Returns:
        The average per-document score, or 0.0 for an empty list.
    """

    def to_naive_local(moment: datetime) -> datetime:
        # Naive and timezone-aware datetimes cannot be subtracted from each
        # other, so aware values are converted to naive local time first.
        if moment.tzinfo is None:
            return moment
        return moment.astimezone().replace(tzinfo=None)

    reference = to_naive_local(now if now is not None else datetime.now())

    def recency(doc) -> float:
        try:
            pub_date = datetime.fromisoformat(doc.published)
        except (TypeError, ValueError):
            # Not a string, or not a valid ISO date.
            return 0.0
        delta_days = (reference - to_naive_local(pub_date)).days
        # decay: newer docs get higher scores, older docs lower;
        # future dates are clamped to an age of 0 days
        return 1.0 / (1.0 + max(delta_days, 0) / 30.0)

    if not docs:
        return 0.0

    return sum(recency(d) for d in docs) / len(docs)


# -------------------------
# Orchestrator: GRASP-S baseline
# -------------------------

class GraspSBaseline:
    """Baseline pipeline: BM25 retrieval, T5 summary, overlap and freshness scores."""

    def __init__(self, documents: List[Document]):
        """Build the retriever over ``documents`` and load the default summarizer."""
        self.retriever = BM25Retriever(documents)
        self.summarizer = T5Summarizer()

    def answer_query(self, query: str, k: int = 3,
                     factuality_threshold: float = 0.3) -> Dict:
        """Answer ``query`` from the top-``k`` retrieved documents.

        Args:
            query: User query.
            k: Number of documents to retrieve.
            factuality_threshold: The result is flagged as abstaining when the
                factuality score is below this value.

        Returns:
            A dict with the keys ``query``, ``summary``, ``retrieved_docs``,
            ``factuality_score``, ``freshness_score`` and ``abstain``.
        """
        # Retrieval: keep only the documents, the BM25 scores are not used further.
        hits = self.retriever.retrieve(query, k=k)
        docs = [doc for doc, _score in hits]

        # Summarizer input: "<title>. <content> " for every document, in rank order.
        combined = "".join(doc.title + ". " + doc.content + " " for doc in docs)
        summary = self.summarizer.summarize(combined)

        # Heuristic quality signals computed on the same evidence set.
        fact_score = simple_factuality_score(summary, docs)
        fresh_score = freshness_score(docs)

        return {
            "query": query,
            "summary": summary,
            "retrieved_docs": docs,
            "factuality_score": fact_score,
            "freshness_score": fresh_score,
            # Abstain when the summary is not sufficiently grounded.
            "abstain": fact_score < factuality_threshold,
        }


# -------------------------
# Demo / initial experiment
# -------------------------

def print_result(result: Dict):
    """Print a result dict from ``answer_query`` as a plain-text report.

    When ``result["abstain"]`` is true, the summary is replaced by an
    abstention notice; the scores and the document list are printed either way.
    """
    thin_rule = "-" * 80

    print("=" * 80)
    print("Query:")
    print(result["query"])
    print(thin_rule)

    if result["abstain"]:
        print("System abstains from summarizing due to low factuality score.")
        docs_heading = "\nTop retrieved documents:\n"
    else:
        print("Generated summary:\n")
        print(result["summary"])
        print(thin_rule)
        docs_heading = "\nEvidence documents:\n"

    # Scores and the document list share one layout in both modes.
    print(f"Factuality score: {result['factuality_score']:.3f}")
    print(f"Freshness score:  {result['freshness_score']:.3f}")
    print(docs_heading)
    for doc in result["retrieved_docs"]:
        print(f"- {doc.title} (published {doc.published})")


# Demo entry point: runs three example queries through the pipeline and
# prints a report for each one.
if __name__ == "__main__":
    # Instantiate baseline system (builds the BM25 index and loads the T5 model)
    system = GraspSBaseline(CORPUS)

    # A few example queries, one per topic covered by the toy corpus
    example_queries = [
        "How do search engines use large language models?",
        "What are hallucinations in text generation?",
        "Why is freshness important in search results?",
    ]

    # k=3 retrieves the whole three-document corpus for every query
    for q in example_queries:
        res = system.answer_query(q, k=3, factuality_threshold=0.3)
        print_result(res)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Query:
How do search engines use large language models?
--------------------------------------------------------------------------------
Generated summary:

large language models are increasingly used in web search engines to generate natural language answers. a key challenge is factual accuracy, since models may hallucinate information not present in the retrieved documents.
--------------------------------------------------------------------------------
Factuality score: 1.000
Freshness score:  0.044

Evidence documents:

- Large Language Models in Web Search (published 2024-03-15)
- Factuality and Hallucinations in Text Generation (published 2023-11-02)
- Freshness and Recency in Search Results (published 2024-05-20)
Query:
What are hallucinations in text generation?
--------------------------------------------------------------------------------
Generated summary:

large language models are increasingly used in web search engines to generate natural language answers. they are often

In [None]:
import math
from datetime import datetime
from dataclasses import dataclass
from typing import List, Dict, Tuple

from rank_bm25 import BM25Okapi
from transformers import T5Tokenizer, T5ForConditionalGeneration


# -------------------------
# Data structures
# -------------------------

@dataclass
class Document:
    """One corpus entry: an identifier, a title, body text and a publication date."""

    doc_id: str     # identifier string, e.g. "doc1"
    title: str      # indexed for retrieval and fed to the summarizer with the content
    content: str    # body text
    published: str  # ISO format "YYYY-MM-DD"


# -------------------------
# Toy corpus (you can replace with your own)
# -------------------------

# Three short in-memory documents (LLMs in search, factuality, freshness).
# Each entry carries a title, one paragraph of content and an ISO publication
# date; the demo at the bottom of this cell builds the pipeline from this list.
CORPUS: List[Document] = [
    Document(
        doc_id="doc1",
        title="Large Language Models in Web Search",
        content=(
            "Large language models are increasingly used in web search engines to generate "
            "natural language answers. They are often combined with retrieval systems to form "
            "retrieval-augmented generation pipelines. A key challenge is factual accuracy, "
            "since models may hallucinate information not present in the retrieved documents."
        ),
        published="2024-03-15",
    ),
    Document(
        doc_id="doc2",
        title="Factuality and Hallucinations in Text Generation",
        content=(
            "Factuality in text generation refers to how well the generated statements are "
            "supported by evidence. Evaluation methods often rely on reference documents or "
            "question answering over the output. Hallucinations occur when a model produces "
            "plausible but unsupported claims. Reducing hallucinations is important for "
            "trustworthy AI systems."
        ),
        published="2023-11-02",
    ),
    Document(
        doc_id="doc3",
        title="Freshness and Recency in Search Results",
        content=(
            "Search engines rank documents not only by relevance but also by freshness. "
            "Freshness can be estimated from publication dates or update times. For some "
            "topics, such as health or finance, outdated information may be unsafe or misleading. "
            "Combining recency signals with retrieval-augmented generation can improve answer quality."
        ),
        published="2024-05-20",
    ),
]


# -------------------------
# Retrieval component (BM25)
# -------------------------

class BM25Retriever:
    """BM25 ranking over an in-memory list of documents.

    Every document is indexed on its title plus its content. Text is
    lower-cased and split into word tokens, so punctuation attached to a
    word (for example the "?" in "models?") does not prevent a match.
    """

    def __init__(self, documents: List["Document"]):
        """Tokenize all documents and build the BM25 index.

        Args:
            documents: Documents to index; their order is kept and is the
                order the BM25 scores refer to.
        """
        self.documents = documents
        self.tokenized_docs = [self._tokenize(d.title + " " + d.content) for d in documents]
        self.bm25 = BM25Okapi(self.tokenized_docs)

    @staticmethod
    def _tokenize(text: str) -> List[str]:
        """Return the lower-cased word tokens of ``text`` with punctuation removed."""
        import re  # local import: `re` is not part of the file's import block

        return re.findall(r"\w+", text.lower())

    def retrieve(self, query: str, k: int = 3) -> List[Tuple["Document", float]]:
        """Return up to ``k`` ``(document, score)`` pairs, highest score first.

        Args:
            query: Free-text query; tokenized the same way as the documents.
            k: Maximum number of results. Values <= 0 yield an empty list.

        Returns:
            The best-scoring documents paired with their BM25 scores.
        """
        if k <= 0:
            # A negative k would otherwise slice from the end of the ranking
            # and silently drop the worst hits instead of limiting the count.
            return []
        scores = self.bm25.get_scores(self._tokenize(query))
        ranked = sorted(zip(self.documents, scores), key=lambda pair: pair[1], reverse=True)
        return ranked[:k]


# -------------------------
# Summarization component (T5)
# -------------------------

class T5Summarizer:
    """Abstractive summarizer backed by a T5 checkpoint."""

    def __init__(self, model_name: str = "t5-small", device: str = "cpu"):
        """Load the tokenizer and model and move the model to ``device``.

        Args:
            model_name: Checkpoint name passed to ``from_pretrained``.
            device: Device string the model and inputs are moved to.
        """
        self.tokenizer = T5Tokenizer.from_pretrained(model_name)
        self.model = T5ForConditionalGeneration.from_pretrained(model_name)
        self.device = device
        self.model.to(self.device)

    def summarize(self, text: str, max_input_tokens: int = 512,
                  max_output_tokens: int = 120) -> str:
        """Summarize ``text`` with beam search (4 beams).

        Args:
            text: Text to summarize.
            max_input_tokens: Inputs longer than this many tokens are truncated.
            max_output_tokens: Passed to ``generate`` as ``max_length``.

        Returns:
            The decoded summary, or ``""`` when ``text`` is empty or only
            whitespace (there is nothing to summarize, and the model would
            otherwise generate from the bare task prefix).
        """
        if not text or not text.strip():
            return ""

        # prepend "summarize:" for T5
        input_text = "summarize: " + text
        encoded = self.tokenizer(
            input_text,
            return_tensors="pt",
            truncation=True,
            max_length=max_input_tokens,
        )
        encoded = {name: tensor.to(self.device) for name, tensor in encoded.items()}

        # Forward every tokenizer output (input ids and attention mask), not
        # just the ids, so generation does not have to guess the mask.
        output_ids = self.model.generate(
            **encoded,
            max_length=max_output_tokens,
            num_beams=4,
            early_stopping=True,
        )
        return self.tokenizer.decode(output_ids[0], skip_special_tokens=True)


# -------------------------
# Simple factuality and freshness scoring
# -------------------------

def simple_factuality_score(summary: str, docs: List["Document"]) -> float:
    """Return the fraction of summary word tokens that occur in the evidence.

    Rough grounding score based on token overlap; only an approximate signal,
    not a proper metric.

    Both sides are lower-cased and split into word tokens, so trailing
    punctuation ("documents." vs "documents") does not count as a mismatch.
    The evidence is each document's title plus its content, because the
    summarizer input in this file is built from both.

    Args:
        summary: Generated summary text.
        docs: Documents used as evidence.

    Returns:
        ``overlap / number_of_summary_tokens`` in [0, 1]; 0.0 when the
        summary contains no word tokens.
    """
    import re  # local import: `re` is not part of the file's import block

    def words(text: str) -> List[str]:
        return re.findall(r"\w+", text.lower())

    sum_tokens = words(summary)
    if not sum_tokens:
        return 0.0

    evidence_tokens = set()
    for d in docs:
        evidence_tokens.update(words(d.title))
        evidence_tokens.update(words(d.content))

    overlap = sum(1 for t in sum_tokens if t in evidence_tokens)
    return overlap / len(sum_tokens)


def freshness_score(docs: List["Document"], now: "datetime | None" = None) -> float:
    """Return the mean recency score of ``docs``.

    Each document scores ``1 / (1 + age_days / 30)``: 1.0 when published on
    the reference day (or later), 0.5 at 30 days, and decreasing after that.
    A document whose ``published`` value cannot be parsed as an ISO date
    scores 0.0.

    Args:
        docs: Documents whose ``published`` field holds an ISO date string.
        now: Reference time for the age computation. Defaults to
            ``datetime.now()``; pass a fixed value for reproducible scores.

    Returns:
        The average per-document score, or 0.0 for an empty list.
    """

    def to_naive_local(moment: datetime) -> datetime:
        # Naive and timezone-aware datetimes cannot be subtracted from each
        # other, so aware values are converted to naive local time first.
        if moment.tzinfo is None:
            return moment
        return moment.astimezone().replace(tzinfo=None)

    reference = to_naive_local(now if now is not None else datetime.now())

    def recency(doc) -> float:
        try:
            pub_date = datetime.fromisoformat(doc.published)
        except (TypeError, ValueError):
            # Not a string, or not a valid ISO date.
            return 0.0
        delta_days = (reference - to_naive_local(pub_date)).days
        # decay: newer docs get higher scores, older docs lower;
        # future dates are clamped to an age of 0 days
        return 1.0 / (1.0 + max(delta_days, 0) / 30.0)

    if not docs:
        return 0.0

    return sum(recency(d) for d in docs) / len(docs)


# -------------------------
# Orchestrator: GRASP-S baseline
# -------------------------

class GraspSBaseline:
    """Baseline pipeline: BM25 retrieval, T5 summary, overlap and freshness scores."""

    def __init__(self, documents: List[Document]):
        """Build the retriever over ``documents`` and load the default summarizer."""
        self.retriever = BM25Retriever(documents)
        self.summarizer = T5Summarizer()

    def answer_query(self, query: str, k: int = 3,
                     factuality_threshold: float = 0.3) -> Dict:
        """Answer ``query`` from the top-``k`` retrieved documents.

        Args:
            query: User query.
            k: Number of documents to retrieve.
            factuality_threshold: The result is flagged as abstaining when the
                factuality score is below this value.

        Returns:
            A dict with the keys ``query``, ``summary``, ``retrieved_docs``,
            ``factuality_score``, ``freshness_score`` and ``abstain``.
        """
        # Retrieval: keep only the documents, the BM25 scores are not used further.
        hits = self.retriever.retrieve(query, k=k)
        docs = [doc for doc, _score in hits]

        # Summarizer input: "<title>. <content> " for every document, in rank order.
        combined = "".join(doc.title + ". " + doc.content + " " for doc in docs)
        summary = self.summarizer.summarize(combined)

        # Heuristic quality signals computed on the same evidence set.
        fact_score = simple_factuality_score(summary, docs)
        fresh_score = freshness_score(docs)

        return {
            "query": query,
            "summary": summary,
            "retrieved_docs": docs,
            "factuality_score": fact_score,
            "freshness_score": fresh_score,
            # Abstain when the summary is not sufficiently grounded.
            "abstain": fact_score < factuality_threshold,
        }


# -------------------------
# Demo / initial experiment
# -------------------------

def print_result(result: Dict):
    """Print a result dict from ``answer_query`` as a plain-text report.

    When ``result["abstain"]`` is true, the summary is replaced by an
    abstention notice; the scores and the document list are printed either way.
    """
    thin_rule = "-" * 80

    print("=" * 80)
    print("Query:")
    print(result["query"])
    print(thin_rule)

    if result["abstain"]:
        print("System abstains from summarizing due to low factuality score.")
        docs_heading = "\nTop retrieved documents:\n"
    else:
        print("Generated summary:\n")
        print(result["summary"])
        print(thin_rule)
        docs_heading = "\nEvidence documents:\n"

    # Scores and the document list share one layout in both modes.
    print(f"Factuality score: {result['factuality_score']:.3f}")
    print(f"Freshness score:  {result['freshness_score']:.3f}")
    print(docs_heading)
    for doc in result["retrieved_docs"]:
        print(f"- {doc.title} (published {doc.published})")


# Demo entry point: runs three example queries through the pipeline and
# prints a report for each one.
if __name__ == "__main__":
    # Instantiate baseline system (builds the BM25 index and loads the T5 model)
    system = GraspSBaseline(CORPUS)

    # A few example queries, one per topic covered by the toy corpus
    example_queries = [
        "How do search engines use large language models?",
        "What are hallucinations in text generation?",
        "Why is freshness important in search results?",
    ]

    # k=3 retrieves the whole three-document corpus for every query
    for q in example_queries:
        res = system.answer_query(q, k=3, factuality_threshold=0.3)
        print_result(res)


In [None]:
!pip install rank-bm25 transformers torch