In [1]:
from pathlib import Path
import json
import numpy as np
from tqdm.auto import tqdm

import faiss
from sentence_transformers import SentenceTransformer




In [3]:
# Project root.
# NOTE(review): the original note says this notebook lives in the Multi-turn-RAG
# root directory, yet the *parent* of the working directory is used here —
# confirm which of the two is intended.
PROJECT_ROOT = Path.cwd().parent

# Corpus to index. Start with clapnq only; cloud / fiqa / govt can be swapped in later.
DATASET = "clapnq"
CORPUS_PATH = PROJECT_ROOT.joinpath("dataset", DATASET, "corpus.jsonl")

# Embedding model, plus the short tag used to name the output directory.
MODEL_NAME = "BAAI/bge-base-en-v1.5"  # alternative: "BAAI/bge-en-icl"
MODEL_TAG = "bge-base"                # alternative: "bge-icl"

# Output directory for the dense index; created right away if it does not exist.
OUT_DIR = PROJECT_ROOT.joinpath("indexes", f"{DATASET}-{MODEL_TAG}-faiss")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# -------- Key control parameters --------
BATCH_SIZE = 64

# Limits the run to a subset of the corpus, which is handy for local debugging:
# - None means the full corpus (180k+ documents per the original note)
# - an integer such as 20000 means only the first 20000 entries
# Kept small for now because a full run is too heavy for a local machine.
MAX_DOCS = 20_000

# Whether each slice of embeddings should be saved as a chunk, so that a run
# that dies midway can be resumed; one chunk per CHUNK_SIZE documents.
SAVE_CHUNKS = True
CHUNK_SIZE = 5_000


In [3]:
# Read the JSONL corpus into two parallel lists: doc_ids[i] is the "_id" of the
# document whose indexable text ("title" + " " + "text") is doc_texts[i].
doc_ids = []
doc_texts = []

with CORPUS_PATH.open("r", encoding="utf-8") as f:
    for i, line in enumerate(f):
        # MAX_DOCS caps the number of corpus *lines* scanned, not the number of
        # documents kept: skipped lines still count toward the limit.
        if MAX_DOCS is not None and i >= MAX_DOCS:
            break
        # Tolerate blank lines (e.g. a trailing newline at the end of the
        # file); json.loads would raise on them.
        if not line.strip():
            continue
        obj = json.loads(line)
        did = obj["_id"]
        # A missing or null title/text is treated as an empty string.
        title = obj.get("title") or ""
        text  = obj.get("text") or ""
        content = (title + " " + text).strip()
        # Documents with no title and no text have nothing to embed.
        if not content:
            continue
        doc_ids.append(did)
        doc_texts.append(content)

# Displayed as the cell output: both lengths are expected to be identical.
len(doc_ids), len(doc_texts)


(20000, 20000)

In [5]:
# Load the embedding model on the CPU: per the original note, the local setup
# (Mac + mtrag environment) essentially only has a CPU available.
model = SentenceTransformer(model_name_or_path=MODEL_NAME, device="cpu")
model


SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': True, 'architecture': 'BertModel'})
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

In [None]:
# Encode every document into an L2-normalized embedding, BATCH_SIZE texts per
# model call, and stack the per-batch results into a single array whose row i
# corresponds to doc_texts[i].
#
# BATCH_SIZE is taken from the configuration cell rather than re-assigned
# here, so that tuning it in one place actually takes effect. tqdm and numpy
# come from the import cell at the top of the notebook.
#
# NOTE(review): SAVE_CHUNKS / CHUNK_SIZE from the configuration cell are not
# consumed by this cell; embeddings are held in memory only, so an interrupted
# run has to start over.
if not doc_texts:
    # np.vstack([]) would fail with an opaque "need at least one array" error.
    raise ValueError("doc_texts is empty; nothing to encode")

all_embs = []

for start in tqdm(range(0, len(doc_texts), BATCH_SIZE), desc="encoding"):
    batch = doc_texts[start:start + BATCH_SIZE]
    embs = model.encode(
        batch,
        # Each call receives exactly one batch, so it is encoded in one pass.
        batch_size=len(batch),
        convert_to_numpy=True,
        normalize_embeddings=True,
        # The outer tqdm already reports progress.
        show_progress_bar=False,
    )
    all_embs.append(embs)

embeddings = np.vstack(all_embs)
embeddings.shape


  0%|          | 0/313 [00:00<?, ?it/s]

In [None]:
# Build an exact (flat) inner-product FAISS index over the embeddings and
# persist it together with the document ids. Row i of the index corresponds to
# doc_ids[i]. The embeddings were L2-normalized at encoding time
# (normalize_embeddings=True), so inner product ranks by cosine similarity.
# faiss and numpy come from the import cell at the top of the notebook.

# Guard against stale kernel state (e.g. the corpus was reloaded with another
# MAX_DOCS without re-encoding): a mismatch would silently corrupt the
# row -> document-id mapping written below.
if embeddings.shape[0] != len(doc_ids):
    raise ValueError(
        f"embeddings/doc_ids mismatch: {embeddings.shape[0]} vectors vs "
        f"{len(doc_ids)} ids; re-run the loading and encoding cells"
    )

dim = embeddings.shape[1]
index = faiss.IndexFlatIP(dim)
# FAISS works on C-contiguous float32 matrices; this is a no-op when the
# array already has that layout.
index.add(np.ascontiguousarray(embeddings, dtype=np.float32))

index_file = OUT_DIR / "index.faiss"
ids_file   = OUT_DIR / "doc_ids.npy"

# faiss.write_index is given a str path rather than a Path object.
faiss.write_index(index, str(index_file))
np.save(ids_file, np.array(doc_ids))

index_file, ids_file
