# Data Preprocessing HotpotQA

In [8]:
import pandas as pd

df = pd.read_csv("old/outputs/processed_hotpot_qa_on_full_dataset.csv")
df.columns

Index(['id', 'question', 'answer', 'type', 'level', 'supporting_facts',
       'context', 'supporting_docs', 'supporting_sentences',
       'supporting_docs_dedup', 'clean_supporting_sentences'],
      dtype='object')

In [4]:
df.iloc[0]["supporting_docs_dedup"]

'[array(["Arthur\'s Magazine (1844–1846) was an American literary periodical published in Philadelphia in the 19th century.",\n       \' Edited by T.S. Arthur, it featured work by Edgar A. Poe, J.H. Ingraham, Sarah Josepha Hale, Thomas G. Spear, and others.\',\n       \' In May 1846 it was merged into "Godey\\\'s Lady\\\'s Book".\'],\n      dtype=object), array(["First for Women is a woman\'s magazine published by Bauer Media Group in the USA.",\n       \' The magazine was started in 1989.\',\n       \' It is based in Englewood Cliffs, New Jersey.\',\n       \' In 2011 the circulation of the magazine was 1,310,696 copies.\'],\n      dtype=object)]'

In [5]:
df.iloc[0]["clean_supporting_sentences"]

'["arthur\'s magazine (1844–1846) was an american literary periodical published in philadelphia in the 19th century.", "first for women is a woman\'s magazine published by bauer media group in the usa."]'

In [10]:
df = df.rename(columns={"clean_supporting_sentences": "groundtruth_docs", "supporting_docs_dedup" : "passage"})
df.drop(columns=['type', 'level', 'supporting_facts',
       'context', 'supporting_docs', 'supporting_sentences'], inplace=True)
df.columns

Index(['id', 'question', 'answer', 'passage', 'groundtruth_docs'], dtype='object')

In [11]:
df.head()

Unnamed: 0,id,question,answer,passage,groundtruth_docs
0,5a7a06935542990198eaf050,Which magazine was started first Arthur's Maga...,Arthur's Magazine,"[array([""Arthur's Magazine (1844–1846) was an ...","[""arthur's magazine (1844–1846) was an america..."
1,5a879ab05542996e4f30887e,The Oberoi family is part of a hotel company t...,Delhi,[array(['The Oberoi family is an Indian family...,['the oberoi family is an indian family that i...
2,5a8d7341554299441c6b9fe5,Musician and satirist Allie Goertz wrote a son...,President Richard Nixon,"[array(['Allison Beth ""Allie"" Goertz (born Mar...","['allison beth ""allie"" goertz (born march 2, 1..."
3,5a82171f5542990a1d231f4a,What nationality was James Henry Miller's wife?,American,"[array(['Margaret ""Peggy"" Seeger (born June 17...","['margaret ""peggy"" seeger (born june 17, 1935)..."
4,5a84dd955542997b5ce3ff79,Cadmium Chloride is slightly soluble in this c...,alcohol,[array(['Cadmium chloride is a white crystalli...,['it is a hygroscopic solid that is highly sol...


In [15]:
import re
import ast
import pandas as pd

def to_python_list(s: str):
    # Handle NaN/None
    if pd.isna(s):
        return []
    s = str(s)

    # 1) Convert NumPy-style repr to pure-Python list syntax
    #    '[array(["a","b"], dtype=object), array(["c"])]' -> '[["a","b"], ["c"]]'
    s = s.replace("\n", " ")
    s = re.sub(r"array\(", "[", s)
    s = re.sub(r",\s*dtype=object\)", "]", s)

    # 2) Safely parse into Python list
    try:
        obj = ast.literal_eval(s)
    except Exception:
        return []

    # 3) Flatten any nesting
    out = []
    def _flatten(x):
        if isinstance(x, (list, tuple)):
            for y in x:
                _flatten(y)
        elif x is not None and str(x).strip().lower() != "nan":
            out.append(str(x))
    _flatten(obj)
    return out

def clean_items(items):
    cleaned = []
    for t in items:
        # strip surrounding quotes/spaces, remove trailing punctuation spaces
        tt = t.strip().strip('"').strip("'").strip()
        # optional: remove trailing periods inside quotes duplicates
        tt = re.sub(r'\s+', ' ', tt)  # collapse extra spaces
        # lowercase
        tt = tt.lower()
        cleaned.append(tt)
    return cleaned

# Apply to your DataFrame
# df['passage'] is the column containing the raw strings like the example
df["passage"] = (
    df["passage"]
    .apply(to_python_list)   # parse + flatten
    .apply(clean_items)      # normalize each sentence and lowercase
)

#test

In [33]:
# df_clean = pd.read_csv("hotpotqa_fulldataset_cleaned.csv")
df_clean = df.copy()
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90447 entries, 0 to 90446
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Unnamed: 0        90447 non-null  int64 
 1   id                90447 non-null  object
 2   question          90447 non-null  object
 3   answer            90447 non-null  object
 4   passage           90447 non-null  object
 5   groundtruth_docs  90447 non-null  object
dtypes: int64(1), object(5)
memory usage: 4.1+ MB


In [34]:
import re
import ast
import json
import pandas as pd

def parse_passage_cell(x, to_lower=True, strip_inner_quotes=True):
    # Already a list → clean each item
    if isinstance(x, list):
        items = x
    else:
        s = "" if pd.isna(x) else str(x).strip()
        if not s:
            return []
        # If it's double-quoted JSON string, unwrap once
        if (s.startswith('"[') and s.endswith(']"')) or (s.startswith("'[") and s.endswith("]'")):
            try:
                s = json.loads(s)
            except Exception:
                pass
        # Try JSON first
        items = None
        if isinstance(s, str):
            try:
                obj = json.loads(s)
                if isinstance(obj, list):
                    items = obj
            except Exception:
                pass
            # If not JSON, try Python literal (handles single quotes)
            if items is None:
                try:
                    obj = ast.literal_eval(s)
                    if isinstance(obj, list):
                        items = obj
                except Exception:
                    pass
        if items is None:
            # Fallback: treat as one item
            items = [s]

    cleaned = []
    for t in items:
        t = "" if t is None else str(t)

        # collapse whitespace
        t = " ".join(t.split())

        # remove surrounding quotes
        t = t.strip().strip('"').strip("'").strip()

        # optional: remove inner double-quotes that wrap a phrase (your example)
        # e.g., in may 1846 it was merged into "godey's lady's book". -> ... into godey's lady's book.
        if strip_inner_quotes:
            t = t.replace('\\"', '"')  # unescape
            t = re.sub(r'"([^"]+)"', r"\1", t)

        # optional: lowercase
        if to_lower:
            t = t.lower()

        cleaned.append(t)
    return cleaned

# Apply to your DataFrame column "passage"
# Example: df['passage'] contains stringified lists
df_clean["passage"] = df_clean["passage"].apply(parse_passage_cell)
df_clean["groundtruth_docs"] = df_clean["groundtruth_docs"].apply(parse_passage_cell)

# Verify
print(df_clean["passage"].iloc[0])
print(df_clean["groundtruth_docs"].iloc[0]) 

["arthur's magazine (1844–1846) was an american literary periodical published in philadelphia in the 19th century.", 'edited by t.s. arthur, it featured work by edgar a. poe, j.h. ingraham, sarah josepha hale, thomas g. spear, and others.', "in may 1846 it was merged into godey's lady's book.", "first for women is a woman's magazine published by bauer media group in the usa.", 'the magazine was started in 1989.', 'it is based in englewood cliffs, new jersey.', 'in 2011 the circulation of the magazine was 1,310,696 copies.']
["arthur's magazine (1844–1846) was an american literary periodical published in philadelphia in the 19th century.", "first for women is a woman's magazine published by bauer media group in the usa."]


In [39]:
df_clean.to_csv("hotpotqa_fulldataset_cleaned.csv", index=False)

# Sparse Retriever models

In [1]:
import os
import re
import ast
import json
import math
import numpy as np
import pandas as pd
from tqdm import tqdm
import subprocess

import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM
from scipy.sparse import csr_matrix, save_npz, load_npz


# -----------------------
# Utilities and helpers
# -----------------------

def ensure_dir(p):
    os.makedirs(p, exist_ok=True)

def parse_list_like_strict(x):
    """
    Return a real Python list[str] from:
    - a Python list
    - a JSON list string
    - a Python-literal list string
    - a double-encoded JSON list string
    Never returns a nested string like '["..."]' as a single item.
    """
    if isinstance(x, list):
        return [str(s).strip() for s in x if isinstance(s, (str, int, float)) and str(s).strip()]

    s = "" if pd.isna(x) else str(x).strip()
    if not s:
        return []

    if (s.startswith('"[') and s.endswith(']"')) or (s.startswith("'[") and s.endswith("]'")):
        try:
            s = json.loads(s)
        except Exception:
            pass

    if isinstance(s, str):
        # Try JSON
        try:
            obj = json.loads(s)
            if isinstance(obj, list):
                return [str(z).strip() for z in obj if str(z).strip()]
        except Exception:
            pass
        # Try Python literal
        try:
            obj = ast.literal_eval(s)
            if isinstance(obj, list):
                return [str(z).strip() for z in obj if str(z).strip()]
        except Exception:
            pass

    return [s]

def normalize_sentence(s: str) -> str:
    s = s.replace("\u00a0", " ")
    s = " ".join(s.split()).strip()
    s = s.lower()
    s = re.sub(r'^["“”]+|["“”]+$', "", s)
    return s

def build_sentence_corpus_with_row_ids(df, passage_col="passage"):
    """
    Build a sentence-level corpus where each sentence is one document.
    Doc IDs are 'rowIdx_sentIdx' (e.g., '1_0').
    Global dedup is applied: identical normalized sentences map to a single doc_id chosen by first occurrence.
    Returns:
      - corpus_texts: list[str] aligned with corpus_doc_ids
      - corpus_doc_ids: list[str] like 'row_sent'
      - sentence_to_docid: dict[str normalized sentence -> doc_id]
    """
    corpus_texts = []
    corpus_doc_ids = []
    sentence_to_docid = {}  # normalized sentence -> canonical doc_id

    for i in range(len(df)):
        raw_list = parse_list_like_strict(df.loc[i, passage_col])
        sents = [normalize_sentence(s) for s in raw_list if s and str(s).strip()]
        for j, sent in enumerate(sents):
            did = f"{i}_{j}"
            if sent not in sentence_to_docid:
                sentence_to_docid[sent] = did
                corpus_texts.append(sent)
                corpus_doc_ids.append(did)

    return corpus_texts, corpus_doc_ids, sentence_to_docid

def build_groundtruth_rel_sets(df, groundtruth_col, sentence_to_docid):
    """
    For each query row, parse groundtruth_docs into a set of doc_ids using the canonical mapping.
    Returns rel_sets: dict[qid -> set(doc_id)]
    """
    rel_sets = {}
    missing = 0
    total = 0
    for i in range(len(df)):
        gt_list = parse_list_like_strict(df.loc[i, groundtruth_col])
        gt_norm = [normalize_sentence(x) for x in gt_list if x and str(x).strip()]
        rel = set()
        for g in gt_norm:
            total += 1
            did = sentence_to_docid.get(g)
            if did is not None:
                rel.add(did)
            else:
                missing += 1
        rel_sets[str(i)] = rel
    if total > 0 and missing > 0:
        print(f"[Info] Ground-truth sentences not found in corpus after normalization/dedup: {missing}/{total}")
    return rel_sets

def ap_at_k_multi(ret_ids, rel_ids_set, k):
    score = 0.0
    hit = 0
    for rank, did in enumerate(ret_ids[:k], 1):
        if did in rel_ids_set:
            hit += 1
            score += hit / rank
    denom = min(len(rel_ids_set), k)
    return score / denom if denom > 0 else 0.0

def ndcg_at_k_multi(ret_ids, rel_ids_set, k):
    dcg = 0.0
    for rank, did in enumerate(ret_ids[:k], 1):
        if did in rel_ids_set:
            dcg += 1.0 / math.log2(rank + 1)
    ideal = min(len(rel_ids_set), k)
    idcg = sum(1.0 / math.log2(r + 1) for r in range(1, ideal + 1))
    return (dcg / idcg) if idcg > 0 else 0.0


# -----------------------
# BM25 (Pyserini/Lucene)
# -----------------------

class BM25Retriever:
    def __init__(self, index_dir="./indices/bm25", threads=8, java_mem="8g"):
        self.index_dir = index_dir
        self.threads = threads
        self.java_mem = java_mem

    def build_index(self, corpus_texts, corpus_doc_ids, work_dir="./work/bm25"):
        ensure_dir(self.index_dir); ensure_dir(work_dir)
        corpus_dir = os.path.join(work_dir, "json_corpus"); ensure_dir(corpus_dir)
        docs_path = os.path.join(corpus_dir, "docs.jsonl")
        with open(docs_path, "w", encoding="utf-8") as f:
            for text, did in zip(corpus_texts, corpus_doc_ids):
                f.write(json.dumps({"id": did, "contents": text}, ensure_ascii=False) + "\n")
        cmd = [
            "python","-m","pyserini.index.lucene",
            "--collection","JsonCollection",
            "--input", corpus_dir,
            "--index", self.index_dir,
            "--generator","DefaultLuceneDocumentGenerator",
            "--threads", str(self.threads),
            "--storePositions","--storeDocvectors","--storeRaw"
        ]
        env = os.environ.copy()
        env["JAVA_TOOL_OPTIONS"] = f"-Xms{self.java_mem} -Xmx{self.java_mem}"
        res = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, env=env)
        if res.returncode != 0:
            print(res.stdout); print(res.stderr)
            raise RuntimeError("BM25 indexing failed (check Java 11+ and pyserini).")

    def retrieve(self, queries, topk=10):
        from pyserini.search.lucene import LuceneSearcher
        searcher = LuceneSearcher(self.index_dir)
        results = {}
        for qid, q in tqdm(queries.items(), desc="BM25 search"):
            hits = searcher.search(q, k=topk)
            results[qid] = [(h.docid, float(h.score)) for h in hits]
        return results


# -----------------------
# SPLADE (Transformers + CSR)
# -----------------------

class SPLADERetriever:
    def __init__(self, index_dir="./indices/splade", model_name="naver/splade-cocondenser-ensembledistil",
                 batch_size=8, max_length=256, min_weight=0.01):
        self.index_dir = index_dir
        self.model_name = model_name
        self.batch_size = batch_size
        self.max_length = max_length
        self.min_weight = min_weight
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.tok = None
        self.model = None

    def _load(self):
        if self.tok is None:
            self.tok = AutoTokenizer.from_pretrained(self.model_name)
        if self.model is None:
            self.model = AutoModelForMaskedLM.from_pretrained(self.model_name).to(self.device).eval()

    @torch.no_grad()
    def _encode_texts(self, texts):
        self._load()
        V = self.model.config.vocab_size
        data, indices, indptr = [], [], [0]
        for i in tqdm(range(0, len(texts), self.batch_size), desc="SPLADE encode"):
            batch = texts[i:i+self.batch_size]
            toks = self.tok(batch, return_tensors="pt", padding=True, truncation=True,
                            max_length=self.max_length).to(self.device)
            logits = self.model(**toks).logits  # [B,L,V]
            activ = torch.log1p(torch.relu(logits))
            weights = activ.max(dim=1).values.cpu().numpy()  # [B,V]
            for row in weights:
                nz = np.where(row >= self.min_weight)[0]
                indices.extend(nz.tolist())
                data.extend(row[nz].astype(np.float32).tolist())
                indptr.append(len(indices))
        return csr_matrix((np.array(data, np.float32),
                           np.array(indices, np.int32),
                           np.array(indptr, np.int32)),
                          shape=(len(texts), V), dtype=np.float32)

    def build_index(self, corpus_texts, corpus_doc_ids):
        ensure_dir(self.index_dir)
        texts = ["" if pd.isna(t) else str(t) for t in corpus_texts]
        mat = self._encode_texts(texts)
        save_npz(os.path.join(self.index_dir, "docs.npz"), mat)
        with open(os.path.join(self.index_dir, "doc_ids.json"), "w") as f:
            json.dump(list(corpus_doc_ids), f)

    @torch.no_grad()
    def _encode_query(self, q):
        self._load()
        toks = self.tok([q], return_tensors="pt", padding=True, truncation=True,
                        max_length=self.max_length).to(self.device)
        logits = self.model(**toks).logits
        activ = torch.log1p(torch.relu(logits))
        w = activ.max(dim=1).values.squeeze(0).cpu().numpy()
        nz = np.where(w >= self.min_weight)[0]
        return csr_matrix((w[nz].astype(np.float32), nz.astype(np.int32), np.array([0,len(nz)], np.int32)),
                          shape=(1, self.model.config.vocab_size), dtype=np.float32)

    def retrieve(self, queries, topk=10):
        docs = load_npz(os.path.join(self.index_dir, "docs.npz"))
        with open(os.path.join(self.index_dir, "doc_ids.json"), "r") as f:
            doc_ids = json.load(f)
        out = {}
        for qid, q in tqdm(queries.items(), desc="SPLADE search"):
            qv = self._encode_query(q)  # [1,V]
            scores = (docs @ qv.T).toarray().ravel()
            if topk >= len(doc_ids):
                idx = np.argsort(-scores)
            else:
                idx = np.argpartition(scores, -topk)[-topk:]
                idx = idx[np.argsort(-scores[idx])]
            out[qid] = [(doc_ids[i], float(scores[i])) for i in idx[:topk]]
        return out


# -----------------------
# Pipeline
# -----------------------

def run_pipeline(df,
                 output_dir="./outputs",
                 bm25_index_dir="./indices/bm25",
                 splade_index_dir="./indices/splade",
                 work_dir="./work",
                 topk=10,
                 groundtruth_col="groundtruth_docs"):
    # Required columns
    for col in ["question", "passage", groundtruth_col]:
        if col not in df.columns:
            raise ValueError(f"Missing required column '{col}'.")
    if "answer" not in df.columns:
        df["answer"] = ""
    df = df.reset_index(drop=True)

    # Build sentence-level corpus with per-row doc_ids like "i_j", dedup globally
    corpus_texts, corpus_doc_ids, sentence_to_docid = build_sentence_corpus_with_row_ids(df, "passage")
    corpus_lookup = {did: txt for did, txt in zip(corpus_doc_ids, corpus_texts)}

    # Build relevance sets from groundtruth_docs (NOT from passage)
    rel_sets = build_groundtruth_rel_sets(df, groundtruth_col, sentence_to_docid)

    # Queries
    queries = {str(i): str(df.loc[i, "question"]) for i in range(len(df))}

    ensure_dir(output_dir); ensure_dir(os.path.dirname(bm25_index_dir)); ensure_dir(os.path.dirname(splade_index_dir)); ensure_dir(work_dir)

    # BM25
    bm25 = BM25Retriever(index_dir=bm25_index_dir)
    bm25.build_index(corpus_texts, corpus_doc_ids, work_dir=os.path.join(work_dir, "bm25"))
    bm25_res = bm25.retrieve(queries, topk=topk)
    save_results("bm25", df, "passage", groundtruth_col, bm25_res, rel_sets, corpus_lookup, os.path.join(output_dir, "bm25_results.csv"))

    # SPLADE
    splade = SPLADERetriever(index_dir=splade_index_dir)
    splade.build_index(corpus_texts, corpus_doc_ids)
    splade_res = splade.retrieve(queries, topk=topk)
    save_results("splade", df, "passage", groundtruth_col, splade_res, rel_sets, corpus_lookup, os.path.join(output_dir, "splade_results.csv"))

def save_results(name, df, passage_col, groundtruth_col, ret, rel_sets, corpus_lookup, out_csv):
    rows = []
    ks = [3, 5, 10]
    for i in range(len(df)):
        qid = str(i)
        retrieved = ret.get(qid, [])
        ret_ids = [d for d, _ in retrieved]
        rel_set = rel_sets.get(qid, set())

        metrics = {}
        for k in ks:
            metrics[f"MAP@{k}"] = ap_at_k_multi(ret_ids, rel_set, k)
            metrics[f"NDCG@{k}"] = ndcg_at_k_multi(ret_ids, rel_set, k)

        # NEW: store the complete document text, plus a short preview_snippet for convenience
        packed = []
        for d, s in retrieved:
            full = corpus_lookup.get(d, "")
            # Keep the document exactly as in the corpus (already normalized at sentence level)
            preview = full[:200].replace("\n", " ")
            packed.append({
                "doc_id": d,
                "score": round(float(s), 4),
                "full_text": full,
                "preview_snippet": preview
            })

        # Preserve normalized passage and GT (as you had)
        passage_list = [normalize_sentence(x) for x in parse_list_like_strict(df.loc[i, passage_col])]
        gt_list = [normalize_sentence(x) for x in parse_list_like_strict(df.loc[i, groundtruth_col])]

        rows.append({
            "question": str(df.loc[i, "question"]),
            "answer": "" if "answer" not in df.columns or pd.isna(df.loc[i, "answer"]) else str(df.loc[i, "answer"]),
            "passage": json.dumps(passage_list, ensure_ascii=False),
            "groundtruth_docs": json.dumps(gt_list, ensure_ascii=False),
            f"{name}_ret_docs": json.dumps(packed, ensure_ascii=False),
            **metrics
        })

    pd.DataFrame(rows).to_csv(out_csv, index=False)
    print(f"{name} results saved to {out_csv}")


# -----------------------
# Example usage
# -----------------------
if __name__ == "__main__":
    # df must have: question, answer, passage (list), groundtruth_docs (list)
    # For example:
    # df = pd.read_parquet("your_dataset.parquet")
    # Or:
    df = pd.read_csv("hotpotqa_fulldataset_cleaned.csv")
    # Optional: subset for quick test
    # df = df.head(300)


    run_pipeline(
        df,
        topk=10,
        groundtruth_col="groundtruth_docs"
    )

  from .autonotebook import tqdm as notebook_tqdm


[Info] Ground-truth sentences not found in corpus after normalization/dedup: 1/215661


Oct 16, 2025 11:38:45 AM org.apache.lucene.store.MemorySegmentIndexInputProvider <init>
INFO: Using MemorySegmentIndexInput with Java 21; to disable start with -Dorg.apache.lucene.store.MMapDirectory.enableMemorySegments=false
BM25 search: 100%|██████████| 90447/90447 [02:26<00:00, 619.26it/s]


bm25 results saved to ./outputs/bm25_results.csv


BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.
SPLADE encode: 100%|██████████| 42323/42323 [09:14<00:00, 76.39it/s]
SPLADE search: 100%|██████████| 90447/90447 [2:58:26<00:00,  8.45it/s]  


splade results saved to ./outputs/splade_results.csv


In [2]:
import pandas as pd
for name, path in [("BM25","./outputs/bm25_results.csv"), ("SPLADE","./outputs/splade_results.csv")]:
    df = pd.read_csv(path)
    print(name, "Results:")
    for k in (3,5,10):
        print(f"  MAP@{k}: {pd.to_numeric(df[f'MAP@{k}'], errors='coerce').mean():.4f}, "
              f"NDCG@{k}: {pd.to_numeric(df[f'NDCG@{k}'], errors='coerce').mean():.4f}")
    print()

BM25 Results:
  MAP@3: 0.4529, NDCG@3: 0.5334
  MAP@5: 0.4685, NDCG@5: 0.5571
  MAP@10: 0.4829, NDCG@10: 0.5819

SPLADE Results:
  MAP@3: 0.4913, NDCG@3: 0.5690
  MAP@5: 0.5070, NDCG@5: 0.5922
  MAP@10: 0.5211, NDCG@10: 0.6156



# Dense Retriever models

In [22]:
"""

Uses sentence-level documents from the passage column (each item is a doc), with global dedup.
Evaluates MAP/NDCG strictly against groundtruth_docs (not passage).
Uses the same normalization on corpus, queries, and groundtruth to avoid string-mismatch pitfalls.
Uses FAISS IndexFlatIP with L2-normalized embeddings (cosine-like) for indexing/search..
Uses two proper dense encoders:
sentence-transformers/all-mpnet-base-v2 via AutoModel + mean pooling + L2 norm
facebook/contriever via AutoModel + mean pooling + L2 norm
Retrieves top-k per question, and computes MAP/NDCG@3/5/10 against groundtruth_docs (multiple relevant docs per question).
Stores retrieved docs, normalized passage, normalized groundtruth_docs, and metrics in CSV.

"""

'\nBuilds a sentence-level corpus from passage (each sentence = one doc, deduped across the dataset).\nSupports two dense models:\nsentence-transformers/all-mpnet-base-v2\nfacebook/contriever\nUses FAISS for indexing/search.\nRetrieves top-k per question, saves results to CSV, and computes MAP/NDCG@3/5/10 against groundtruth_docs (multiple relevant sentences per question).\nReuses the same helpers and evaluation style you already have.\n'

In [1]:
#New

import os
import re
import ast
import json
import math
import numpy as np
import pandas as pd
from tqdm import tqdm

import torch
from transformers import AutoTokenizer, AutoModel

import faiss

import os
os.makedirs("./outputs_dense", exist_ok=True)

# -----------------------
# Normalization & parsing
# -----------------------

def ensure_dir(p):
    os.makedirs(p, exist_ok=True)

def parse_list_like_strict(x):
    """
    Robust parsing of lists that may be:
    - Python list
    - JSON list string
    - Python literal list string
    - Double-encoded JSON list string
    Always returns a flat list[str] without nested stringified lists.
    """
    if isinstance(x, list):
        return [str(s).strip() for s in x if isinstance(s, (str, int, float)) and str(s).strip()]

    s = "" if pd.isna(x) else str(x).strip()
    if not s:
        return []

    # Unwrap double-encoded JSON arrays like "\"[...]""
    if (s.startswith('"[') and s.endswith(']"')) or (s.startswith("'[") and s.endswith("]'")):
        try:
            s = json.loads(s)
        except Exception:
            pass

    if isinstance(s, str):
        # Try JSON
        try:
            obj = json.loads(s)
            if isinstance(obj, list):
                return [str(z).strip() for z in obj if str(z).strip()]
        except Exception:
            pass
        # Try Python literal
        try:
            obj = ast.literal_eval(s)
            if isinstance(obj, list):
                return [str(z).strip() for z in obj if str(z).strip()]
        except Exception:
            pass

    return [s]

def normalize_text(s: str) -> str:
    if s is None or (isinstance(s, float) and np.isnan(s)):
        return ""
    s = str(s).replace("\u00a0", " ")
    s = " ".join(s.split()).strip()
    s = s.lower()
    # strip wrapping quotes
    s = re.sub(r'^["“”]+|["“”]+$', "", s)
    return s


# -----------------------
# Corpus & relevance mapping (sentence-level)
# -----------------------

def build_sentence_corpus_and_mappings(df, passage_col="passage"):
    """
    Build a global deduplicated corpus of sentences (each sentence is a doc).
    - corpus_texts: list[str] of unique normalized sentences
    - corpus_doc_ids: list[str] of doc IDs "rowIdx_sentIdx" for FIRST occurrence of that sentence
    - sentence_to_docid: dict normalized sentence -> canonical doc_id
    - row_sentence_docsets: dict qid -> set(doc_ids) for that row's sentences (useful for checks)
    """
    corpus_texts = []
    corpus_doc_ids = []
    sentence_to_docid = {}
    row_sentence_docsets = {}

    for i in range(len(df)):
        sents_raw = parse_list_like_strict(df.loc[i, passage_col])
        sents = [normalize_text(s) for s in sents_raw if s and str(s).strip()]
        docset = set()
        for j, sent in enumerate(sents):
            did = f"{i}_{j}"
            if sent not in sentence_to_docid:
                sentence_to_docid[sent] = did
                corpus_texts.append(sent)
                corpus_doc_ids.append(did)
            else:
                did = sentence_to_docid[sent]
            docset.add(did)
        row_sentence_docsets[str(i)] = docset

    return corpus_texts, corpus_doc_ids, sentence_to_docid, row_sentence_docsets

def build_groundtruth_rel_sets(df, groundtruth_col, sentence_to_docid):
    """
    For each row, map groundtruth_docs sentences -> canonical doc_ids using sentence_to_docid.
    Returns rel_sets: dict[qid -> set(doc_id)]
    """
    rel_sets = {}
    missing = 0
    total = 0
    for i in range(len(df)):
        gt_list = parse_list_like_strict(df.loc[i, groundtruth_col])
        gt_norm = [normalize_text(x) for x in gt_list if x and str(x).strip()]
        rel = set()
        for g in gt_norm:
            total += 1
            did = sentence_to_docid.get(g)
            if did is not None:
                rel.add(did)
            else:
                missing += 1
        rel_sets[str(i)] = rel
    if total > 0 and missing > 0:
        print(f"[Info] Ground-truth sentences not found in corpus after normalization/dedup: {missing}/{total}")
    return rel_sets


# -----------------------
# Metrics
# -----------------------

def ap_at_k_multi(ret_ids, rel_ids_set, k):
    score = 0.0
    hit = 0
    for rank, did in enumerate(ret_ids[:k], 1):
        if did in rel_ids_set:
            hit += 1
            score += hit / rank
    denom = min(len(rel_ids_set), k)
    return score / denom if denom > 0 else 0.0

def ndcg_at_k_multi(ret_ids, rel_ids_set, k):
    dcg = 0.0
    for rank, did in enumerate(ret_ids[:k], 1):
        if did in rel_ids_set:
            dcg += 1.0 / math.log2(rank + 1)
    ideal = min(len(rel_ids_set), k)
    idcg = sum(1.0 / math.log2(r + 1) for r in range(1, ideal + 1))
    return (dcg / idcg) if idcg > 0 else 0.0


# -----------------------
# Dense encoders (AutoModel + mean pooling + L2 norm)
# -----------------------

def mean_pool(last_hidden_state, attention_mask):
    # last_hidden_state: [B, L, H], attention_mask: [B, L]
    mask = attention_mask.unsqueeze(-1).type_as(last_hidden_state)  # [B,L,1]
    summed = (last_hidden_state * mask).sum(dim=1)                  # [B,H]
    denom = mask.sum(dim=1).clamp(min=1e-9)                         # [B,1]
    return summed / denom

class DenseEncoder:
    """
    Proper dense encoding with AutoModel:
      - sentence-transformers/all-mpnet-base-v2 (AutoModel + mean pooling)
      - facebook/contriever (AutoModel + mean pooling)
    """
    def __init__(self, model_name: str, device: str = None, max_length: int = 256, batch_size: int = 64):
        self.model_name = model_name
        self.max_length = max_length
        self.batch_size = batch_size
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
        self.tok = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name).to(self.device).eval()

    @torch.no_grad()
    def encode(self, texts):
        vecs = []
        for i in tqdm(range(0, len(texts), self.batch_size), desc=f"Encode {self.model_name}"):
            batch = texts[i:i+self.batch_size]
            toks = self.tok(
                batch,
                padding=True,
                truncation=True,
                max_length=self.max_length,
                return_tensors="pt"
            ).to(self.device)
            out = self.model(**toks)
            emb = mean_pool(out.last_hidden_state, toks["attention_mask"])
            emb = torch.nn.functional.normalize(emb, p=2, dim=1)
            vecs.append(emb.cpu().numpy().astype(np.float32))
        return np.vstack(vecs) if len(vecs) else np.zeros((0, self.model.config.hidden_size), dtype=np.float32)


# -----------------------
# FAISS helpers
# -----------------------

def build_faiss_index(embs: np.ndarray, use_gpu=False):
    d = embs.shape[1]
    index = faiss.IndexFlatIP(d)  # cosine-like when vectors are L2-normalized
    if use_gpu and faiss.get_num_gpus() > 0:
        res = faiss.StandardGpuResources()
        index = faiss.index_cpu_to_gpu(res, 0, index)
    index.add(embs)
    return index


# -----------------------
# Dense Pipeline (HotpotQA) with groundtruth_docs evaluation
# -----------------------

def run_dense_pipeline_hotpotqa(
    df: pd.DataFrame,
    model_name: str,
    output_csv: str,
    faiss_dir: str,
    topk: int = 10,
    max_length: int = 256,
    batch_size: int = 64,
    groundtruth_col: str = "groundtruth_docs",
    passage_col: str = "passage",
):
    # Validate columns
    for col in ["question", passage_col, groundtruth_col]:
        if col not in df.columns:
            raise ValueError(f"Missing required column: {col}")
    if "answer" not in df.columns:
        df["answer"] = ""
    df = df.reset_index(drop=True)

    # Build sentence-level corpus and canonical mapping
    corpus_texts, corpus_doc_ids, sentence_to_docid, _ = build_sentence_corpus_and_mappings(df, passage_col=passage_col)
    corpus_lookup = {did: txt for did, txt in zip(corpus_doc_ids, corpus_texts)}

    # Build relevance sets from groundtruth_docs only
    rel_sets = build_groundtruth_rel_sets(df, groundtruth_col, sentence_to_docid)

    # Prepare normalized queries
    queries = [normalize_text(df.loc[i, "question"]) for i in range(len(df))]

    # Encode corpus
    ensure_dir(faiss_dir)
    encoder = DenseEncoder(model_name=model_name, max_length=max_length, batch_size=batch_size)
    corpus_vecs = encoder.encode(corpus_texts)  # [N, D]

    # Build FAISS
    index = build_faiss_index(corpus_vecs, use_gpu=False)

    # Encode queries and search
    query_vecs = encoder.encode(queries)  # [Q, D]
    scores, idxs = index.search(query_vecs, topk)  # [Q, topk]
    doc_ids_array = np.array(corpus_doc_ids)

    # Collect rows and compute metrics
    rows = []
    ks = [3, 5, 10]
    for i in range(len(df)):
        qid = str(i)
        retrieved = []
        for r in range(topk):
            j = int(idxs[i, r])
            if j < 0 or j >= len(doc_ids_array):
                continue
            did = doc_ids_array[j]
            sc = float(scores[i, r])
            retrieved.append((did, sc))
        ret_ids = [d for d, _ in retrieved]
        rel_set = rel_sets.get(qid, set())
        metrics = {f"MAP@{k}": ap_at_k_multi(ret_ids, rel_set, k) for k in ks}
        metrics.update({f"NDCG@{k}": ndcg_at_k_multi(ret_ids, rel_set, k) for k in ks})

        # Store complete doc text + short preview (keeps CSV readable while enabling exact text eval later)
        pack = []
        for d, s in retrieved:
            full = corpus_lookup.get(d, "")
            preview = full[:200].replace("\n", " ")
            pack.append({
                "doc_id": d,
                "score": round(float(s), 6),
                "full_text": full,
                "preview_snippet": preview
            })

        passage_list = [normalize_text(x) for x in parse_list_like_strict(df.loc[i, passage_col])]
        gt_list = [normalize_text(x) for x in parse_list_like_strict(df.loc[i, groundtruth_col])]

        rows.append({
            "question": normalize_text(df.loc[i, "question"]),
            "answer": normalize_text(df.loc[i, "answer"]),
            "passage": json.dumps(passage_list, ensure_ascii=False),
            "groundtruth_docs": json.dumps(gt_list, ensure_ascii=False),
            f"{model_name.split('/')[-1]}_ret_docs": json.dumps(pack, ensure_ascii=False),
            **metrics
        })

    pd.DataFrame(rows).to_csv(output_csv, index=False)
    print(f"Dense results saved to {output_csv}")


# -----------------------
# Run both models if called directly
# -----------------------

if __name__ == "__main__":
    # Example:
    df = pd.read_csv("hotpotqa_fulldataset_cleaned.csv")
    # Ensure df has columns: question, answer, passage (list or stringified list), groundtruth_docs (list or stringified list)

    # Uncomment and set your path:
    # df = pd.read_csv("hotpotqa_fulldataset_cleaned.csv")

    # Example runs:
    run_dense_pipeline_hotpotqa(
        df,
        model_name="sentence-transformers/multi-qa-mpnet-base-dot-v1",
        output_csv="./outputs_dense/hotpotqa_multiqa.csv",
        faiss_dir="./indices_dense/faiss_multiqa",
        topk=10,
        max_length=256,
        batch_size=64,
        groundtruth_col="groundtruth_docs",
        passage_col="passage",
    )

    # run_dense_pipeline_hotpotqa(
    #     df,
    #     model_name="facebook/contriever",
    #     output_csv="./outputs_dense/hotpotqa_contriever.csv",
    #     faiss_dir="./indices_dense/faiss_contriever",
    #     topk=10,
    #     max_length=256,
    #     batch_size=64,
    #     groundtruth_col="groundtruth_docs",
    #     passage_col="passage",
    # )
    # pass

  from .autonotebook import tqdm as notebook_tqdm


[Info] Ground-truth sentences not found in corpus after normalization/dedup: 1/215661


Encode sentence-transformers/multi-qa-mpnet-base-dot-v1: 100%|██████████| 5291/5291 [06:30<00:00, 13.56it/s]
Encode sentence-transformers/multi-qa-mpnet-base-dot-v1: 100%|██████████| 1414/1414 [01:37<00:00, 14.48it/s]


Dense results saved to ./outputs_dense/hotpotqa_multiqa.csv


In [2]:
# results summary
import pandas as pd
for name, path in [("MPNet","./outputs_dense/hotpotqa_mpnet.csv"), ("Contriever","./outputs_dense/hotpotqa_contriever.csv")]:
    df = pd.read_csv(path)
    print(name, "Results:")
    for k in (3,5,10):
        print(f"  MAP@{k}: {pd.to_numeric(df[f'MAP@{k}'], errors='coerce').mean():.4f}, "
              f"NDCG@{k}: {pd.to_numeric(df[f'NDCG@{k}'], errors='coerce').mean():.4f}")
    print()

MPNet Results:
  MAP@3: 0.3314, NDCG@3: 0.4034
  MAP@5: 0.3461, NDCG@5: 0.4272
  MAP@10: 0.3606, NDCG@10: 0.4539

Contriever Results:
  MAP@3: 0.3212, NDCG@3: 0.3966
  MAP@5: 0.3396, NDCG@5: 0.4256
  MAP@10: 0.3555, NDCG@10: 0.4545

