In [None]:
from datasets import load_dataset
import pandas as pd
from collections import defaultdict

In [None]:
def _trivia_document(row):
    """Pick the context text for one TriviaQA row, or return None/empty when there is none.

    Looks at ``row["entity_pages"]`` first (key ``"wiki_context"``) and falls back to
    ``row["evidence"]`` (key ``"text"``) when that yields nothing. Either field may be
    a dict or a list whose first element is inspected. A list of strings is joined
    with single spaces.
    """
    text = None

    pages = row.get("entity_pages")
    if pages:
        if isinstance(pages, dict):
            # dict: read the field directly
            text = pages.get("wiki_context")
        elif isinstance(pages, list) and isinstance(pages[0], dict):
            # list: only the first element is used
            text = pages[0].get("wiki_context")

    # fall back to evidence when entity_pages produced nothing
    evidence = row.get("evidence")
    if not text and evidence:
        if isinstance(evidence, dict):
            text = evidence.get("text")
        elif isinstance(evidence, list):
            first = evidence[0]
            if isinstance(first, dict):
                text = first.get("text")
            elif isinstance(first, str):
                text = first

    # a list of passages is flattened into one string
    if isinstance(text, list):
        text = " ".join(part for part in text if isinstance(part, str))
    return text


trivia = load_dataset("trivia_qa", "rc", split="train")

# Group questions by their (long) document: only documents with >= 600 words are kept.
trivia_docs = defaultdict(list)
for row in trivia:
    doc = _trivia_document(row)
    if isinstance(doc, str) and len(doc.split()) >= 600:
        trivia_docs[doc].append(row.get("question"))

# Keep only documents that collected at least 10 questions.
trivia_docs = {d: qs for d, qs in trivia_docs.items() if len(qs) >= 10}

# One output row per (document, question) pair.
trivia_rows = [
    {"document": doc, "question": q}
    for doc, qs in trivia_docs.items()
    for q in qs
]
trivia_df = pd.DataFrame(trivia_rows)
trivia_df.to_csv("/content/drive/MyDrive/Colab Notebooks/NLP/project2/triviaqa_filtered.csv", index=False)
print("Saved triviaqa_filtered.csv", len(trivia_df), "rows")

Resolving data files:   0%|          | 0/26 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/26 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/24 [00:00<?, ?it/s]

Saved triviaqa_filtered.csv 13941 rows


In [None]:
# Preview the first 35 (document, question) rows of the filtered TriviaQA frame.
trivia_df[:35]

Unnamed: 0,document,question
0,Europe is a continent that comprises the weste...,Which country is Europe's largest silk producer?
1,Europe is a continent that comprises the weste...,Which was the last country in mainland Europe ...
2,Europe is a continent that comprises the weste...,"In terms of land area, which is the largest co..."
3,Europe is a continent that comprises the weste...,Which country produces more than 50 % of all E...
4,Europe is a continent that comprises the weste...,What species interbred with humans in Europe a...
5,Europe is a continent that comprises the weste...,In November 1943 which South American country ...
6,Europe is a continent that comprises the weste...,Which nation has the longest coastline of any ...
7,Europe is a continent that comprises the weste...,Europe’s first female President Vigdís Finnbog...
8,Europe is a continent that comprises the weste...,Which country is sometimes called the battlefi...
9,Europe is a continent that comprises the weste...,Which country is Europe's largest silk producer?


In [None]:
squad = load_dataset("squad_v2", split="train")

# Keep only (context, question) pairs whose context has at least 400
# whitespace-separated words.
rows = [
    {"document": item["context"], "question": item["question"]}
    for item in squad
    if len(item["context"].split()) >= 400
]

# Explicit columns keep the frame well-formed even if nothing passes the filter.
squad_df = pd.DataFrame(rows, columns=["document", "question"])

# Keep only documents that have at least 10 questions.
doc_counts = squad_df["document"].value_counts()
valid_docs = doc_counts[doc_counts >= 10].index

# reset_index: filtering leaves gaps in the index, and the pipeline cells below
# walk the frame by position 0..n-1. A contiguous index makes the in-memory
# frame match what is read back from the CSV.
squad_df = squad_df[squad_df["document"].isin(valid_docs)].reset_index(drop=True)
squad_df.to_csv("/content/drive/MyDrive/Colab Notebooks/NLP/project2/squad_filtered.csv", index=False)
print("Saved squad_filtered.csv", len(squad_df), "rows")

Saved squad_filtered.csv 91 rows


In [None]:
# Preview the first 35 (document, question) rows of the filtered SQuAD frame.
squad_df[:35]

Unnamed: 0,document,question
8,The biographical tradition asserts that Virgil...,Which Hellenistic poet was known for bucolic h...
9,The biographical tradition asserts that Virgil...,"Who tried to pay off veterans with land, some ..."
10,The biographical tradition asserts that Virgil...,Virgil's loss of his famiyl farm and the attem...
11,The biographical tradition asserts that Virgil...,To whom was Eclogue 4 addressed to?
12,The biographical tradition asserts that Virgil...,Which Eclogues discusses homosexual love?
13,The biographical tradition asserts that Virgil...,What language was Virgil's work written in?
14,The biographical tradition asserts that Virgil...,When did Theocritus publish his last work?
15,The biographical tradition asserts that Virgil...,What theme did Virgil write about the most often?
16,The biographical tradition asserts that Virgil...,Where did Octavian live?
17,The biographical tradition asserts that Virgil...,How many books of poetry did Cornelius Gallus ...


In [None]:
import pandas as pd

# Load the preprocessed CSVs written by the preparation cells above.
squad_df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/NLP/project2/squad_filtered.csv")
trivia_df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/NLP/project2/triviaqa_filtered.csv")

In [None]:
pip install faiss-cpu sentence-transformers openai

Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.12.0


**MOCK CACHE**

In [None]:
import os
import time
import faiss
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from openai import OpenAI

# When False, summarize() and answer() return mock strings instead of calling the OpenAI API.
USE_REAL_API = False  # set to True if API quota is still available

# --- Embedder ---
# Sentence-embedding model used by the pipeline functions to encode questions.
encoder = SentenceTransformer("all-MiniLM-L6-v2")

# --- Semantic Cache ---
class SemanticCache:
    """In-memory cache that maps question embeddings to summaries.

    Embeddings are stored in a FAISS inner-product index; a lookup returns the
    summary of the single best-scoring stored embedding when its score reaches
    the threshold. The inner product equals cosine similarity only when callers
    pass unit-norm embeddings.
    """

    def __init__(self, dim):
        """Create an empty cache for embeddings of length `dim`."""
        self.index = faiss.IndexFlatIP(dim)
        self.embs = []   # embeddings in insertion order
        self.store = {}  # insertion position -> cached summary

    def add(self, q_emb, summary):
        """Store `summary` under the embedding `q_emb` (a 1-D array)."""
        position = len(self.embs)
        self.embs.append(q_emb)
        as_row = q_emb.reshape(1, -1).astype("float32")
        self.index.add(as_row)
        self.store[position] = summary

    def lookup(self, q_emb, thresh=0.85):
        """Return ``(summary, score)`` for the nearest entry, or ``(None, score)`` on a miss.

        An empty cache yields ``(None, 0.0)``. A hit requires ``score >= thresh``.
        """
        if not self.embs:
            return None, 0.0
        as_row = q_emb.reshape(1, -1).astype("float32")
        scores, positions = self.index.search(as_row, 1)
        best_score = float(scores[0][0])
        if best_score < thresh:
            return None, best_score
        return self.store[int(positions[0][0])], best_score

# Module-level cache instance read and written by run_pipeline(); its entries
# persist across calls until this name is rebound to a new SemanticCache.
cache = SemanticCache(dim=encoder.get_sentence_embedding_dimension())

# --- Real summarizer using GPT ---
# Vietnamese prompt template with {doc} and {q} placeholders; it asks for a short
# (200-word) summary focused on details useful for answering the question.
SUMMARY_PROMPT = """Bạn là một hệ thống tóm tắt.
Hãy đọc đoạn văn bản dưới đây và câu hỏi.
Sinh một bản tóm tắt ngắn (200 từ) chỉ tập trung vào những chi tiết hữu ích để trả lời câu hỏi.

Đoạn văn bản:
{doc}

Câu hỏi:
{q}
"""
def summarize(document, question):
    """Produce a question-focused summary of `document`.

    With ``USE_REAL_API`` off, returns a mock string built from the first 50
    characters of `question`. Otherwise fills ``SUMMARY_PROMPT`` and sends it to
    the ``gpt-4o-mini`` chat model at temperature 0, returning the stripped reply.
    """
    if not USE_REAL_API:
        return f"[MOCK SUMMARY cho câu hỏi: {question[:50]}...]"

    prompt = SUMMARY_PROMPT.format(doc=document, q=question)
    # The API key is read from the OPENAI_API_KEY environment variable.
    client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
    messages = [{"role": "user", "content": prompt}]
    resp = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=messages,
        temperature=0
    )
    reply = resp.choices[0].message.content
    return reply.strip()

# --- Real answerer using GPT ---
# Vietnamese prompt template with {summary} and {q} placeholders; it asks for an
# accurate, concise answer based on the summarized context.
ANSWER_PROMPT = """Bạn là hệ thống trả lời.
Dựa vào tóm tắt ngữ cảnh dưới đây, hãy trả lời câu hỏi một cách chính xác và ngắn gọn.

Tóm tắt:
{summary}

Câu hỏi:
{q}
"""

def answer(summary, question):
    """Answer `question` from `summary`.

    With ``USE_REAL_API`` off, returns a mock string built from the first 50
    characters of `question`. Otherwise fills ``ANSWER_PROMPT`` and sends it to
    the ``gpt-4o-mini`` chat model at temperature 0, returning the stripped reply.
    """
    if not USE_REAL_API:
        return f"[MOCK ANSWER cho: {question[:50]}...]"

    prompt = ANSWER_PROMPT.format(summary=summary, q=question)
    # The API key is read from the OPENAI_API_KEY environment variable.
    client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
    messages = [{"role": "user", "content": prompt}]
    resp = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=messages,
        temperature=0
    )
    reply = resp.choices[0].message.content
    return reply.strip()

# --- Run pipeline ---
def run_pipeline(df, n_samples=10, thresh=0.85, reset_cache=False):
    """Run the cache -> summarize -> answer pipeline over the first rows of `df`.

    Args:
        df: DataFrame with "question" and "document" columns. Rows are taken by
            position, so any index (not only 0..n-1) is accepted.
        n_samples: maximum number of rows to process.
        thresh: minimum similarity score for a cache hit.
        reset_cache: when True, replace the module-level ``cache`` with an empty
            one first. The default (False) keeps whatever earlier calls stored,
            so hit counts then include entries from previous runs.

    Returns:
        ``(stats, answers)``. ``stats`` holds "hits", "miss" and the lists
        "latency_enc", "latency_cache" and "latency_llm" (seconds). ``answers``
        is the list of answers in row order.
    """
    global cache
    if reset_cache:
        cache = SemanticCache(dim=encoder.get_sentence_embedding_dimension())

    stats = {"hits": 0, "miss": 0, "latency_enc": [], "latency_cache": [], "latency_llm": []}
    answers = []
    for i in range(min(n_samples, len(df))):
        # Positional access: label-based df.loc[i] fails on a filtered frame
        # whose index does not start at 0.
        row = df.iloc[i]
        q = row["question"]
        doc = row["document"]

        # encode
        t0 = time.time()
        q_emb = encoder.encode([q], normalize_embeddings=True)[0]
        stats["latency_enc"].append(time.time() - t0)

        # lookup cache
        t0 = time.time()
        cached_summary, score = cache.lookup(q_emb, thresh=thresh)
        stats["latency_cache"].append(time.time() - t0)

        # A miss is signalled by None; an empty cached summary is still a hit.
        if cached_summary is not None:
            stats["hits"] += 1
            summary = cached_summary
        else:
            stats["miss"] += 1
            t1 = time.time()
            summary = summarize(doc, q)
            # NOTE: "latency_llm" receives both the summarization time (misses
            # only) and the answer time below, so it can be longer than the
            # number of processed rows.
            stats["latency_llm"].append(time.time() - t1)
            cache.add(q_emb, summary)

        t2 = time.time()
        ans = answer(summary, q)
        stats["latency_llm"].append(time.time() - t2)
        answers.append(ans)

    return stats, answers


In [None]:
# --- Demo 1a: SQuAD v2, threshold 0.85 ---
# Start from an empty cache so the hit/miss counts describe this run only,
# not entries left behind by earlier cell executions.
cache = SemanticCache(dim=encoder.get_sentence_embedding_dimension())
stats, answers = run_pipeline(squad_df, n_samples=30, thresh=0.85)
print("Cache hits:", stats["hits"], "Misses:", stats["miss"])
print("Sample Answers:")
for a in answers:
    print(a)

Cache hits: 30 Misses: 0
Sample Answers:
[MOCK ANSWER cho: Which Hellenistic poet was known for bucolic hexam...]
[MOCK ANSWER cho: Who tried to pay off veterans with land, some of w...]
[MOCK ANSWER cho: Virgil's loss of his famiyl farm and the attempt t...]
[MOCK ANSWER cho: To whom was Eclogue 4 addressed to?...]
[MOCK ANSWER cho: Which Eclogues discusses homosexual love?...]
[MOCK ANSWER cho: What language was Virgil's work written in?...]
[MOCK ANSWER cho: When did Theocritus publish his last work?...]
[MOCK ANSWER cho: What theme did Virgil write about the most often?...]
[MOCK ANSWER cho: Where did Octavian live?...]
[MOCK ANSWER cho: How many books of poetry did Cornelius Gallus writ...]
[MOCK ANSWER cho: How many copies did Whitesnake's 1987 self title a...]
[MOCK ANSWER cho: What was the title of Aerosmith's 1987 comeback al...]
[MOCK ANSWER cho: Def Leppard's album Hysteria churned out how many ...]
[MOCK ANSWER cho: What Guns N Roses album was the second best sellin...]
[MO

In [None]:
# --- Demo 1b: SQuAD v2, threshold 0.5 ---
# Start from an empty cache so this threshold is compared on equal footing
# with the previous run instead of reusing its cached entries.
cache = SemanticCache(dim=encoder.get_sentence_embedding_dimension())
stats, answers = run_pipeline(squad_df, n_samples=30, thresh=0.5)
print("Cache hits:", stats["hits"], "Misses:", stats["miss"])
print("Sample Answers:")
for a in answers:
    print(a)

Cache hits: 30 Misses: 0
Sample Answers:
[MOCK ANSWER cho: Which Hellenistic poet was known for bucolic hexam...]
[MOCK ANSWER cho: Who tried to pay off veterans with land, some of w...]
[MOCK ANSWER cho: Virgil's loss of his famiyl farm and the attempt t...]
[MOCK ANSWER cho: To whom was Eclogue 4 addressed to?...]
[MOCK ANSWER cho: Which Eclogues discusses homosexual love?...]
[MOCK ANSWER cho: What language was Virgil's work written in?...]
[MOCK ANSWER cho: When did Theocritus publish his last work?...]
[MOCK ANSWER cho: What theme did Virgil write about the most often?...]
[MOCK ANSWER cho: Where did Octavian live?...]
[MOCK ANSWER cho: How many books of poetry did Cornelius Gallus writ...]
[MOCK ANSWER cho: How many copies did Whitesnake's 1987 self title a...]
[MOCK ANSWER cho: What was the title of Aerosmith's 1987 comeback al...]
[MOCK ANSWER cho: Def Leppard's album Hysteria churned out how many ...]
[MOCK ANSWER cho: What Guns N Roses album was the second best sellin...]
[MO

In [None]:
# --- Demo 2a: TriviaQA, threshold 0.85 ---
# Start from an empty cache so the hit/miss counts describe this run only,
# not entries left behind by earlier cell executions.
cache = SemanticCache(dim=encoder.get_sentence_embedding_dimension())
stats, answers = run_pipeline(trivia_df, n_samples=30, thresh=0.85)
print("Cache hits:", stats["hits"], "Misses:", stats["miss"])
print("Sample Answers:")
for a in answers:
    print(a)

Cache hits: 30 Misses: 0
Sample Answers:
[MOCK ANSWER cho: Which country is Europe's largest silk producer?...]
[MOCK ANSWER cho: Which was the last country in mainland Europe to s...]
[MOCK ANSWER cho: In terms of land area, which is the largest countr...]
[MOCK ANSWER cho: Which country produces more than 50 % of all Europ...]
[MOCK ANSWER cho: What species interbred with humans in Europe and A...]
[MOCK ANSWER cho: In November 1943 which South American country sent...]
[MOCK ANSWER cho: Which nation has the longest coastline of any coun...]
[MOCK ANSWER cho: Europe’s first female President Vigdís Finnbogadót...]
[MOCK ANSWER cho: Which country is sometimes called the battlefield ...]
[MOCK ANSWER cho: Which country is Europe's largest silk producer?...]
[MOCK ANSWER cho: Which was the last country in mainland Europe to s...]
[MOCK ANSWER cho: In terms of land area, which is the largest countr...]
[MOCK ANSWER cho: Which country produces more than 50 % of all Europ...]
[MOCK ANSWER c

In [None]:
# --- Demo 2b: TriviaQA, threshold 0.5 ---
# Start from an empty cache so this threshold is compared on equal footing
# with the previous run instead of reusing its cached entries.
cache = SemanticCache(dim=encoder.get_sentence_embedding_dimension())
stats, answers = run_pipeline(trivia_df, n_samples=30, thresh=0.5)
print("Cache hits:", stats["hits"], "Misses:", stats["miss"])
print("Sample Answers:")
for a in answers:
    print(a)

Cache hits: 30 Misses: 0
Sample Answers:
[MOCK ANSWER cho: Which country is Europe's largest silk producer?...]
[MOCK ANSWER cho: Which was the last country in mainland Europe to s...]
[MOCK ANSWER cho: In terms of land area, which is the largest countr...]
[MOCK ANSWER cho: Which country produces more than 50 % of all Europ...]
[MOCK ANSWER cho: What species interbred with humans in Europe and A...]
[MOCK ANSWER cho: In November 1943 which South American country sent...]
[MOCK ANSWER cho: Which nation has the longest coastline of any coun...]
[MOCK ANSWER cho: Europe’s first female President Vigdís Finnbogadót...]
[MOCK ANSWER cho: Which country is sometimes called the battlefield ...]
[MOCK ANSWER cho: Which country is Europe's largest silk producer?...]
[MOCK ANSWER cho: Which was the last country in mainland Europe to s...]
[MOCK ANSWER cho: In terms of land area, which is the largest countr...]
[MOCK ANSWER cho: Which country produces more than 50 % of all Europ...]
[MOCK ANSWER c

**FAISS, utility (EM, F1, sim), token saving**

In [None]:
import time
import numpy as np
import re

# --- Simple token count ---
def count_tokens(text):
    """Approximate the token count of `text` as its number of whitespace-separated words."""
    words = text.split()
    return len(words)

# --- Helper utility ---
def normalize_text(s):
    """Normalize an answer string for EM/F1 comparison.

    Steps: fold accented letters to their base letter, lowercase, drop every
    character that is not ``a-z``, ``0-9`` or whitespace, then collapse runs of
    whitespace to single spaces.

    Args:
        s: text to normalize. ``None`` yields ``""``; other non-string values
            are converted with ``str()``.

    Returns:
        The normalized string with no leading, trailing or repeated whitespace.
    """
    import unicodedata

    if s is None:
        return ""
    # Decompose accented characters and drop the combining marks, so that
    # "Vigdís" becomes "vigdis" instead of losing the letter entirely.
    decomposed = unicodedata.normalize("NFKD", str(s))
    s = "".join(ch for ch in decomposed if not unicodedata.combining(ch)).lower()
    s = re.sub(r"[^a-z0-9\s]", "", s)
    # Removing punctuation can leave double spaces ("a - b" -> "a  b"), which
    # would break exact-match comparison; collapse all whitespace runs.
    return " ".join(s.split())

def f1_score_str(pred, truth):
    """Token-level F1 between a predicted and a reference answer.

    Both strings are passed through ``normalize_text`` and split on whitespace.
    Overlap is counted as a multiset intersection, so a token repeated in both
    strings counts once per matched occurrence.

    Returns:
        F1 in ``[0.0, 1.0]``; ``0.0`` when either side has no tokens or there
        is no overlap.
    """
    from collections import Counter

    pred_tokens = normalize_text(pred).split()
    truth_tokens = normalize_text(truth).split()
    if not pred_tokens or not truth_tokens:
        return 0.0
    # Counter & Counter keeps the minimum count per token. A plain set
    # intersection would undercount repeated tokens while the denominators
    # below still count every occurrence.
    overlap = sum((Counter(pred_tokens) & Counter(truth_tokens)).values())
    if overlap == 0:
        return 0.0
    prec = overlap / len(pred_tokens)
    rec = overlap / len(truth_tokens)
    return 2 * prec * rec / (prec + rec)

def exact_match_score(pred, truth):
    """Return 1 when `pred` and `truth` are equal after ``normalize_text``, otherwise 0."""
    same = normalize_text(pred) == normalize_text(truth)
    return 1 if same else 0

def semantic_similarity(pred, truth, encoder):
    """Dot product of the embeddings of `pred` and `truth`.

    Args:
        pred, truth: strings to compare.
        encoder: object with an ``encode(list, normalize_embeddings=True)``
            method, e.g. a sentence transformer such as all-MiniLM-L6-v2.

    Returns:
        The similarity as a float, or None if encoding or the dot product
        raised (the error is printed, not re-raised).
    """
    try:
        vectors = encoder.encode([pred, truth], normalize_embeddings=True)
        score = float(np.dot(vectors[0], vectors[1]))
    except Exception as err:
        # Best-effort metric: report the problem and let the caller skip it.
        print("semantic_similarity error:", err)
        return None
    return score


In [None]:
def run_pipeline_eval(df, n_samples=50, thresh=0.85, reset_cache=True):
    """Run the cached pipeline over the first rows of `df` and aggregate metrics.

    Args:
        df: DataFrame with "question" and "document" columns and, optionally,
            an "answer" column holding the reference answer. Rows are taken by
            position, so any index is accepted.
        n_samples: maximum number of rows to process.
        thresh: minimum similarity score for a cache hit.
        reset_cache: when True (default), replace the module-level ``cache``
            with an empty one before running.

    Returns:
        dict with "Cache hits", "Cache misses", "Cache hit rate",
        "# input tokens" / "# output tokens" (mean word counts of the summary
        and of the answer, truncated to int), "Utility" (mean of per-row
        EM/F1/semantic-similarity averages, or None when no row has a usable
        reference answer) and "Total latency" (mean seconds per row).
    """
    global cache
    if reset_cache:
        cache = SemanticCache(dim=encoder.get_sentence_embedding_dimension())

    has_answers = "answer" in df.columns
    hits = 0
    misses = 0
    latencies = []
    input_tokens = []
    output_tokens = []
    utilities = []

    for i in range(min(n_samples, len(df))):
        # Positional access: label-based df.loc[i] fails on a filtered frame
        # whose index does not start at 0.
        row = df.iloc[i]
        q = row["question"]
        doc = row["document"]
        truth = row["answer"] if has_answers else None

        t0 = time.time()

        # lookup
        q_emb = encoder.encode([q], normalize_embeddings=True)[0]
        cached_summary, score = cache.lookup(q_emb, thresh=thresh)

        # A miss is signalled by None; an empty cached summary is still a hit.
        if cached_summary is not None:
            hits += 1
            summary = cached_summary
        else:
            misses += 1
            summary = summarize(doc, q)
            cache.add(q_emb, summary)

        pred = answer(summary, q)

        # tokens
        input_tokens.append(count_tokens(summary))
        output_tokens.append(count_tokens(pred))

        # utility: average of EM, F1 and semantic similarity for this row.
        # Missing values read from CSV arrive as float NaN, which is truthy,
        # so test for it explicitly instead of relying on `if truth`.
        truth_missing = truth is None or (isinstance(truth, float) and np.isnan(truth))
        truth_text = "" if truth_missing else str(truth).strip()
        if truth_text:
            em = int(normalize_text(pred) == normalize_text(truth_text))
            f1 = f1_score_str(pred, truth_text)
            sim = semantic_similarity(pred, truth_text, encoder)
            vals = [v for v in [em, f1, sim] if v is not None]
            utilities.append(np.mean(vals) if vals else None)
        else:
            utilities.append(None)

        latencies.append(time.time() - t0)

    # Filter on `is not None`: a legitimate utility of 0.0 must still be averaged.
    scored = [u for u in utilities if u is not None]
    report = {
        "Cache hits": hits,
        "Cache misses": misses,
        "Cache hit rate": hits / max(1, (hits + misses)),
        # Guard the means so an empty run reports zeros instead of int(nan) failing.
        "# input tokens": int(np.mean(input_tokens)) if input_tokens else 0,
        "# output tokens": int(np.mean(output_tokens)) if output_tokens else 0,
        "Utility": float(np.mean(scored)) if scored else None,
        "Total latency": float(np.mean(latencies)) if latencies else 0.0
    }
    return report

In [None]:
# --- evaluate_dataset ---
def evaluate_dataset(df, dataset_name="dataset", n_samples=200,
                     thresholds=(0.6, 0.8), lengths=(100, 200, 400)):
    """Sweep cache thresholds and summary lengths and collect one report per combination.

    For each (threshold, length) pair the module-level ``summarize`` is
    temporarily replaced by a length-limited variant, ``run_pipeline_eval`` is
    run with a fresh cache, and its report is stored as one row. The original
    ``summarize`` is restored before returning, even if a run raises.

    Args:
        df: frame passed through to ``run_pipeline_eval``.
        dataset_name: label written to the "Dataset" column and progress lines.
        n_samples: maximum number of rows evaluated per combination.
        thresholds: cache-hit thresholds to try.
        lengths: summary length limits to try (words for the mock summary,
            ``max_tokens`` for the real API call).

    Returns:
        DataFrame with "Dataset", "Thresh", "Summary_len" plus every key of the
        ``run_pipeline_eval`` report, one row per combination.
    """
    global summarize
    original_summarize = summarize
    results = []

    try:
        for thresh in thresholds:
            for L in lengths:
                print(f"==> Running {dataset_name} | Thresh={thresh}, Len={L}")

                # Patch summarize to cap the summary length. `max_len=L` binds
                # the current length as a default so each closure keeps its own.
                def summarize_with_len(doc, q, max_len=L):
                    if USE_REAL_API:
                        # NOTE(review): SUMMARY_PROMPT itself still asks for a
                        # fixed 200-word summary; only max_tokens follows max_len.
                        prompt = SUMMARY_PROMPT.format(doc=doc, q=q)
                        client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
                        resp = client.chat.completions.create(
                            model="gpt-4o-mini",
                            messages=[{"role": "user", "content": prompt}],
                            temperature=0,
                            max_tokens=max_len
                        )
                        return resp.choices[0].message.content.strip()
                    else:
                        return "[MOCK SUMMARY] " + " ".join(doc.split()[:max_len])

                summarize = summarize_with_len

                report = run_pipeline_eval(
                    df,
                    n_samples=n_samples,
                    thresh=thresh,
                    reset_cache=True
                )

                row = {"Dataset": dataset_name,
                       "Thresh": thresh,
                       "Summary_len": L}
                row.update(report)
                results.append(row)
    finally:
        # Undo the monkey-patch so later cells do not silently keep using the
        # last length-limited summarizer.
        summarize = original_summarize

    return pd.DataFrame(results)

In [None]:
# Sweep thresholds and summary lengths over every row of the SQuAD frame, then display the table.
squad_results = evaluate_dataset(squad_df, dataset_name="Squad_v2", n_samples=len(squad_df))
squad_results

==> Running Squad_v2 | Thresh=0.6, Len=100
==> Running Squad_v2 | Thresh=0.6, Len=200
==> Running Squad_v2 | Thresh=0.6, Len=400
==> Running Squad_v2 | Thresh=0.8, Len=100
==> Running Squad_v2 | Thresh=0.8, Len=200
==> Running Squad_v2 | Thresh=0.8, Len=400


Unnamed: 0,Dataset,Thresh,Summary_len,Cache hits,Cache misses,Cache hit rate,# input tokens,# output tokens,Utility,Total latency
0,Squad_v2,0.6,100,38,53,0.417582,102,11,,0.029592
1,Squad_v2,0.6,200,38,53,0.417582,202,11,,0.025108
2,Squad_v2,0.6,400,38,53,0.417582,402,11,,0.020469
3,Squad_v2,0.8,100,15,76,0.164835,102,11,,0.022026
4,Squad_v2,0.8,200,15,76,0.164835,202,11,,0.021339
5,Squad_v2,0.8,400,15,76,0.164835,402,11,,0.019743


In [None]:
# Sweep thresholds and summary lengths over every row of the TriviaQA frame, then display the table.
trivia_results = evaluate_dataset(trivia_df, dataset_name="TriviaQA", n_samples=len(trivia_df))
trivia_results

==> Running TriviaQA | Thresh=0.6, Len=100
==> Running TriviaQA | Thresh=0.6, Len=200
==> Running TriviaQA | Thresh=0.6, Len=400
==> Running TriviaQA | Thresh=0.8, Len=100
==> Running TriviaQA | Thresh=0.8, Len=200
==> Running TriviaQA | Thresh=0.8, Len=400


Unnamed: 0,Dataset,Thresh,Summary_len,Cache hits,Cache misses,Cache hit rate,# input tokens,# output tokens,Utility,Total latency
0,TriviaQA,0.6,100,10670,3271,0.765368,102,12,,0.024987
1,TriviaQA,0.6,200,10670,3271,0.765368,202,12,,0.025003
2,TriviaQA,0.6,400,10670,3271,0.765368,402,12,,0.025508
3,TriviaQA,0.8,100,8415,5526,0.603615,102,12,,0.025089
4,TriviaQA,0.8,200,8415,5526,0.603615,202,12,,0.025103
5,TriviaQA,0.8,400,8415,5526,0.603615,402,12,,0.02713
