In [None]:
import re, json, ujson, numpy as np
from pathlib import Path
from tqdm import tqdm
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk import download
from rank_bm25 import BM25Okapi
from sklearn.feature_extraction.text import CountVectorizer

# Fetch the NLTK data packages before the stop-word list is read.
# NOTE(review): no punkt-based tokenizer is called in the visible cells
# (tokenisation below uses a regex) — confirm the "punkt" download is still needed.
download("punkt")
download("stopwords")

# English stop words as a set for fast membership tests in the preprocessor.
STOP = set(stopwords.words("english"))
# Bound Porter stemmer method, called as STEM(token).
STEM = PorterStemmer().stem

In [None]:
# Dump the stop-word set as a quick sanity check that the NLTK resource loaded.
print(STOP)

Read evidence

In [None]:
# Load the evidence collection from disk. The JSON object is keyed by evidence
# id; its values are used below as the raw texts to index.
ev_path = Path("./data/evidence.json")
with ev_path.open("r", encoding="utf-8") as f:
    evid_dict = ujson.load(f)

# Two parallel lists: evid_ids[i] is the id of raw_texts[i].
evid_ids = list(evid_dict)
raw_texts = list(evid_dict.values())

In [None]:
import unicodedata

def clean_text(text: str) -> str:
    """Return ``text`` NFC-normalised with hyphen/dash characters deleted.

    The characters removed are the soft hyphen (U+00AD), the dash range
    U+2010..U+2014 and the ASCII hyphen. They are deleted outright rather
    than replaced by a space, so the text on either side is joined.
    """
    normalized = unicodedata.normalize("NFC", text)
    return re.sub(r"[\u00AD\u2010-\u2014\-]", "", normalized)

def nltk_stem_preprocessor(text: str) -> str:
    """Turn raw text into a single space-separated string of stems.

    Steps: clean the text with ``clean_text``, lowercase it, keep only runs of
    ASCII letters, drop words found in ``STOP`` and stem the rest with ``STEM``.
    """
    lowered = clean_text(text).lower()
    stems = []
    for word in re.findall(r"[A-Za-z]+", lowered):
        if word in STOP:
            continue
        stems.append(STEM(word))
    return " ".join(stems)

# The vectorizer is configured only to obtain its analyzer; it is not fitted in
# the visible cells. The custom preprocessor returns a space-joined string of
# stems, so the tokenizer just splits on whitespace.
cv = CountVectorizer(
        ngram_range=(1, 2),             # emit bigrams in addition to single stems
        preprocessor=nltk_stem_preprocessor,
        tokenizer=lambda text: text.split(),
        token_pattern=None,             # the regex pattern is not used once a tokenizer is supplied
        stop_words=None,                # stop words are already removed inside the preprocessor
    )
# Callable that maps a raw string to its list of unigram + bigram tokens.
analyzer = cv.build_analyzer()

# Tokenize every evidence text once; token_corpus[i] belongs to evid_ids[i].
token_corpus = [analyzer(doc) for doc in tqdm(raw_texts, desc="Tokenize")]

# BM25 index over the tokenized evidence, queried later via bm25.get_scores().
bm25 = BM25Okapi(token_corpus, k1=1.2, b=0.75)

Tokenize:  63%|██████▎   | 760758/1208827 [02:40<01:20, 5546.32it/s]

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF vectorizer using the same stemming preprocessor and whitespace
# tokenizer as the BM25 analyzer, with unigram + bigram features.
vectorizer = TfidfVectorizer(
    preprocessor=nltk_stem_preprocessor,
    tokenizer=lambda s: s.split(),
    token_pattern=None,
    ngram_range=(1, 2)
)
# Document-feature sparse matrix, shape (num_docs, num_features);
# row i corresponds to raw_texts[i] / evid_ids[i].
tfidf_corpus = vectorizer.fit_transform(raw_texts)

# ———— Retrieval function ————
def retrieve_topk_tfidf(claim_text: str, topk: int = 100):
    """
    Rank evidence by TF-IDF cosine similarity and return the best ``topk`` hits.

    Args:
        claim_text: Raw claim string; it is vectorised with the notebook-level
            ``vectorizer`` so it goes through the same preprocessing as the corpus.
        topk: Maximum number of hits to return. Values <= 0 yield an empty list.

    Returns:
        ``[(evid_id, score), ...]`` sorted by descending score, at most ``topk`` long.
    """
    # Guard first: ``arr[-0:]`` selects the whole array, so a plain
    # ``argsort(scores)[-topk:]`` returns every document when topk == 0.
    if topk <= 0:
        return []

    # Project the query into the same feature space as the corpus matrix.
    tfidf_query = vectorizer.transform([claim_text])
    # TfidfVectorizer L2-normalises its rows by default, so the plain dot
    # product between corpus rows and the query is already the cosine similarity.
    scores = (tfidf_corpus @ tfidf_query.T).toarray().ravel()

    # Descending order first, then cut: identical to the previous result for
    # topk > 0 and well-defined when topk exceeds the corpus size.
    idx_sorted = np.argsort(scores)[::-1][:topk]
    return [(evid_ids[i], float(scores[i])) for i in idx_sorted]

# ———— Batch-process a claims file ————
def process_claim_file_tfidf(claim_json: str, out_json: str, topk: int = 100):
    """
    Run TF-IDF retrieval for every claim in a JSON file and save the ranked ids.

    Args:
        claim_json: Path to a JSON object of the form
            ``{claim_id: {"claim_text": ..., ...}}``.
        out_json: Path of the JSON file to write; it receives
            ``{claim_id: {"evidences": [evid_id, ...]}}``.
        topk: Number of evidence ids to keep per claim (defaults to 100, the
            value that used to be hard-coded here).
    """
    with open(claim_json, "r", encoding="utf-8") as f:
        claims = json.load(f)

    results = {}
    for cid, obj in tqdm(claims.items(), desc="Retrieve"):
        hits = retrieve_topk_tfidf(obj["claim_text"], topk=topk)
        # Only the ids are persisted; the similarity scores are dropped.
        results[cid] = {"evidences": [evid_id for evid_id, _score in hits]}

    with open(out_json, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)

In [5]:
# Retrieve TF-IDF candidates for the dev claims and write them to disk.
process_claim_file_tfidf("./data/dev-claims.json", "./data/dev-claims-top100-tf-idf.json")

Retrieve: 100%|██████████| 154/154 [00:50<00:00,  3.05it/s]


In [None]:
def retrieve_topk(claim_text: str, topk: int = 200):
    """
    Rank evidence with the notebook-level BM25 index and return the best ``topk`` hits.

    Args:
        claim_text: Raw claim string; tokenised with the same ``analyzer`` that
            produced the indexed corpus.
        topk: Maximum number of hits to return. Values <= 0 yield an empty list.

    Returns:
        ``[(evid_id, score), ...]`` sorted by descending BM25 score, at most
        ``topk`` long.
    """
    # Guard first: ``arr[-0:]`` selects the whole array, so a plain
    # ``argsort(scores)[-topk:]`` returns every document when topk == 0.
    if topk <= 0:
        return []

    query_tokens = analyzer(claim_text)
    scores = bm25.get_scores(query_tokens)
    # Descending order first, then cut: identical to the previous result for
    # topk > 0 and well-defined when topk exceeds the corpus size.
    idx_sorted = np.argsort(scores)[::-1][:topk]
    return [(evid_ids[i], float(scores[i])) for i in idx_sorted]

# batch process
def process_claim_file(claim_json: str, out_json: str, topk: int = 200):
    """
    Run BM25 retrieval for every claim in a JSON file and save the ranked ids.

    Args:
        claim_json: Path to a JSON object of the form
            ``{claim_id: {"claim_text": ..., ...}}``.
        out_json: Path of the JSON file to write; it receives
            ``{claim_id: {"evidences": [evid_id, ...]}}``.
        topk: Number of evidence ids to keep per claim. The default of 200
            matches what ``retrieve_topk`` used when no value was passed.
    """
    with open(claim_json, "r", encoding="utf-8") as f:
        claims = json.load(f)

    results = {}
    for cid, obj in tqdm(claims.items(), desc="Retrieve"):
        hits = retrieve_topk(obj["claim_text"], topk=topk)
        # Only the ids are persisted; the BM25 scores are dropped.
        results[cid] = {"evidences": [evid_id for evid_id, _score in hits]}

    with open(out_json, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)

In [None]:
# Retrieve BM25 candidates for the dev claims and write them to disk.
process_claim_file("./data/dev-claims.json", "./data/dev-claims-top200-bigram-noise.json")

In [None]:
import json
from pathlib import Path

def truncate_top100(in_path: str, out_path: str, topk: int = 100):
    """
    Copy a retrieval-result file, keeping only the first ``topk`` evidence ids per claim.

    Args:
        in_path: JSON file shaped like ``{claim_id: {"evidences": [...]}}``.
        out_path: Destination JSON file, written in the same shape.
        topk: Number of leading evidence ids to keep for each claim.

    A claim entry without an ``"evidences"`` key is written with an empty list.
    """
    with open(in_path, "r", encoding="utf-8") as src:
        ranked = json.load(src)

    shortened = {
        claim_id: {"evidences": entry.get("evidences", [])[:topk]}
        for claim_id, entry in ranked.items()
    }

    with open(out_path, "w", encoding="utf-8") as dst:
        json.dump(shortened, dst, ensure_ascii=False, indent=2)

# Example call: derive the top-100 file from the BM25 top-200 results.
truncate_top100(
    "./data/dev-claims-top200-bigram-noise.json",
    "./data/dev-claims-top100-bigram-noise.json",
    topk=100
)

In [None]:
import json, ujson, numpy as np
from pathlib import Path
from tqdm import tqdm
from sentence_transformers import SentenceTransformer

# Directory holding the evidence, claim and candidate JSON files.
DATA_DIR   = Path("data")
# Per-split candidate files to re-rank, looked up inside DATA_DIR.
# NOTE(review): the retrieval cells in this notebook write
# "dev-claims-top100-tf-idf.json" and "dev-claims-top100-bigram-noise.json";
# nothing visible here writes the un-suffixed names below — confirm where they
# come from, since the re-rank loop skips any split whose file does not exist.
TOP100_FNS = {
    "train": "train-claims-top100.json",
    "dev"  : "dev-claims-top100.json",
    "test" : "test-claims-top100.json"
}
# Number of evidence ids kept per claim after dense re-ranking.
TOP_M = 20

# ---------- 0. evidence ----------
# Read as UTF-8 explicitly: without an encoding argument the platform default
# is used, which is not UTF-8 everywhere and would differ from the other reads
# of this same file in the notebook.
with (DATA_DIR / "evidence.json").open("r", encoding="utf-8") as f:
    evid_dict = ujson.load(f)
evid_ids = list(evid_dict.keys())
# Evidence id -> row index, used to look up rows of the embedding matrix.
id2row   = {eid: i for i, eid in enumerate(evid_ids)}

# ---------- 1. encode evidence ----------
print("Encoding evidence vectors ...")
bi_model   = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
# Outer chunk size: number of evidence texts handed to encode() per
# progress-bar step (encode() is additionally given batch_size=32).
BATCH = 1024
chunks = []
for i in tqdm(range(0, len(evid_ids), BATCH)):
    txts = [evid_dict[eid] for eid in evid_ids[i:i+BATCH]]
    chunks.append(
        # Normalised embeddings are requested so the dot product used at
        # re-rank time can serve as cosine similarity; cast to float32 before stacking.
        bi_model.encode(txts, batch_size=32, normalize_embeddings=True).astype("float32")
    )
# Chunks are appended in evid_ids order, so row r holds the embedding of evid_ids[r].
evid_matrix = np.vstack(chunks)

# ---------- 2. process each split ----------
# For every split with a candidate file: embed each claim, score its candidate
# evidence by dot product against the pre-computed evidence embeddings, keep
# the TOP_M best, and write two JSON files (ids only, and ids with text).
for split, fn in TOP100_FNS.items():
    top100_path = DATA_DIR / fn
    if not top100_path.exists():
        # Report the skip so a missing candidate file is not mistaken for a
        # successfully processed split.
        print(f"[skip] {split}: {top100_path} not found")
        continue

    # Explicit UTF-8 keeps these reads independent of the platform default encoding.
    with top100_path.open("r", encoding="utf-8") as f:
        top100 = ujson.load(f)

    cfile = DATA_DIR / f"{split}-claims.json"
    with cfile.open("r", encoding="utf-8") as f:
        raw = ujson.load(f)
    # A claim entry may be an object carrying "claim_text" or a bare string.
    claim_texts = {
        cid: raw[cid]["claim_text"] if isinstance(raw[cid], dict) else raw[cid] for cid in raw
    }

    dense_out, text_out = {}, {}
    for cid, entry in tqdm(top100.items(), desc=f"{split} rerank"):
        # Candidates are either {"evidences": [...]} or a plain list of ids.
        id_list = entry["evidences"] if isinstance(entry, dict) else entry
        # A claim id absent from the claims file falls back to the empty string.
        claim_text = claim_texts.get(cid, "")
        claim_emb = bi_model.encode(claim_text, normalize_embeddings=True)
        vecs = evid_matrix[[id2row[eid] for eid in id_list]]
        scores = vecs @ claim_emb
        # Positions of the TOP_M highest scores, best first.
        top_idx = scores.argsort()[-TOP_M:][::-1]
        top_ids = [id_list[i] for i in top_idx]

        # dense: ranked evidence ids only
        dense_out[cid] = top_ids

        # text: claim text plus the ranked evidence ids with their texts
        text_out[cid] = {
            "claim_text": claim_text,
            "ranked_evidences": [
                {"id": eid, "text": evid_dict[eid]} for eid in top_ids
            ]
        }

    # output
    (DATA_DIR / f"{split}-claims-top{TOP_M}-dense.json").write_text(
        json.dumps(dense_out, ensure_ascii=False, indent=2), encoding="utf-8"
    )
    (DATA_DIR / f"{split}-claims-top{TOP_M}-text.json").write_text(
        json.dumps(text_out, ensure_ascii=False, indent=2), encoding="utf-8"
    )

print("All splits processed ✅")