In [2]:
# Mount Google Drive into the Colab runtime so the project files are reachable.
from google.colab import drive
drive.mount('/content/drive')

# Switch the working directory to the project folder on Drive; later cells
# resolve paths relative to it. Then show where we are and what is there.
%cd /content/drive/MyDrive/SemEval_26_Task8_MTRAG
!pwd
!ls


Mounted at /content/drive
/content/drive/MyDrive/SemEval_26_Task8_MTRAG
/content/drive/MyDrive/SemEval_26_Task8_MTRAG
 beir   dataset   indexes  'queries data'   README.md   src


In [3]:
!pip install -U --no-cache-dir faiss-gpu-cu11
!pip install -q sentence-transformers
!pip install -q tqdm numpy

Collecting faiss-gpu-cu11
  Downloading faiss_gpu_cu11-1.13.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting nvidia-cuda-runtime-cu11>=11.8.89 (from faiss-gpu-cu11)
  Downloading nvidia_cuda_runtime_cu11-11.8.89-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cublas-cu11>=11.11.3.6 (from faiss-gpu-cu11)
  Downloading nvidia_cublas_cu11-11.11.3.6-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Downloading faiss_gpu_cu11-1.13.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (48.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.3/48.3 MB[0m [31m164.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading nvidia_cublas_cu11-11.11.3.6-py3-none-manylinux2014_x86_64.whl (417.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m417.9/417.9 MB[0m [31m327.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading nvidia_cuda_runtime_cu11-11.8.89-py3-none-manylinux2014_x86_64.whl (875 kB)
[2K   

In [4]:
import faiss
import torch

# Report the installed FAISS build and how many GPUs FAISS itself can see.
print(f"FAISS version: {faiss.__version__}")
print("GPUs detected:", faiss.get_num_gpus())

# Report PyTorch's view of CUDA. get_device_name(0) raises when no CUDA device
# is present, so only query it when CUDA is actually available; this keeps the
# diagnostic cell usable on a CPU-only runtime.
cuda_ok = torch.cuda.is_available()
print("CUDA available:", cuda_ok)
print("GPU name:", torch.cuda.get_device_name(0) if cuda_ok else "n/a (no CUDA device)")


FAISS version: 1.13.0
GPUs detected: 1
CUDA available: True
GPU name: Tesla T4


In [5]:
# List the clapnq dataset folder (relative to the project directory entered by
# the earlier %cd) to confirm the corpus, queries and qrels are in place.
!ls dataset/clapnq

clapnq.jsonl.zip  corpus.jsonl	qrels  queries.jsonl


In [6]:
from pathlib import Path
import json
import numpy as np
from tqdm.auto import tqdm
import faiss
from sentence_transformers import SentenceTransformer

# Build a dense FAISS index for one corpus:
#   1. read the passages from corpus.jsonl,
#   2. embed them with a SentenceTransformer model (L2-normalized vectors),
#   3. add them to an exact inner-product index,
#   4. persist the index, the raw embeddings and the row -> doc-id mapping.

# --- Project root (the current working directory) ---
PROJECT_ROOT = Path.cwd()

# --- Dataset ---
DATASET = "clapnq"   # can later be switched to cloud / fiqa / govt
CORPUS_PATH = PROJECT_ROOT / "dataset" / DATASET / "corpus.jsonl"

# --- Output directory ---
MODEL_NAME = "BAAI/bge-base-en-v1.5"
MODEL_TAG = "bge-base"
OUT_DIR = PROJECT_ROOT / "indexes" / f"{DATASET}-{MODEL_TAG}-faiss"
OUT_DIR.mkdir(parents=True, exist_ok=True)

# --- Load the embedding model ---
# SentenceTransformer selects CUDA on its own when a GPU is available, so no
# explicit .to("cuda") is needed; a hard-coded move would crash on CPU-only runtimes.
print("Loading embedding model...")
model = SentenceTransformer(MODEL_NAME)

# --- Load corpus.jsonl ---
docs = []
doc_ids = []
# Explicit UTF-8: the platform default encoding is not guaranteed to be UTF-8.
with open(CORPUS_PATH, "r", encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            continue  # tolerate blank lines (e.g. a trailing newline at EOF)
        j = json.loads(line)
        docs.append(j["text"])
        # NOTE(review): the id key is assumed to be "_id" (BEIR style) or "id";
        # confirm against the corpus. Falls back to the row number otherwise.
        doc_ids.append(str(j.get("_id", j.get("id", len(doc_ids)))))

print(f"Loaded {len(docs)} docs from {DATASET}")
if not docs:
    # Fail with a clear message instead of an opaque np.vstack error below.
    raise ValueError(f"No documents found in {CORPUS_PATH}")

# --- Dense Embedding ---
batch_size = 64
all_embs = []

for i in tqdm(range(0, len(docs), batch_size)):
    batch = docs[i : i + batch_size]
    emb = model.encode(
        batch,
        batch_size=len(batch),
        convert_to_numpy=True,
        show_progress_bar=False,
        normalize_embeddings=True,  # unit vectors: inner product == cosine
    )
    all_embs.append(emb)

# FAISS expects a C-contiguous float32 matrix.
embeddings = np.ascontiguousarray(np.vstack(all_embs), dtype=np.float32)
print("Embeddings shape:", embeddings.shape)

# --- Build FAISS index ---
dim = embeddings.shape[1]
index = faiss.IndexFlatIP(dim)  # cosine similarity (after normalization)
index.add(embeddings)
assert index.ntotal == len(docs), "index size does not match corpus size"

# --- Save index, embeddings and the row -> doc-id mapping ---
# Row k of the index corresponds to doc_ids[k]; without this file, search hits
# can only be mapped back by re-reading the corpus in the same order.
faiss.write_index(index, str(OUT_DIR / "index.faiss"))
np.save(str(OUT_DIR / "emb.npy"), embeddings)
with open(OUT_DIR / "doc_ids.json", "w", encoding="utf-8") as f:
    json.dump(doc_ids, f)

print(f"Index saved to: {OUT_DIR}")


Loading embedding model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/777 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Loaded 183408 docs from clapnq


  0%|          | 0/2866 [00:00<?, ?it/s]

Embeddings shape: (183408, 768)
Index saved to: /content/drive/MyDrive/SemEval_26_Task8_MTRAG/indexes/clapnq-bge-base-faiss
