In [1]:
!pip install faiss-cpu datasets scikit-learn
from google.colab import files
import pickle
import faiss
import numpy as np
from datasets import load_dataset
from sklearn.metrics import average_precision_score

Collecting faiss-cpu
  Downloading faiss_cpu-1.8.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.7 kB)
Collecting datasets
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading faiss_cpu-1.8.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.0/27.0 MB[0m [31m70.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading datasets-2.21.0-py3-none-any.whl (527 kB)
[2K   [90m━━━━━━

In [2]:
# Colab-only: opens a browser file picker and saves the selected files into the
# working directory. The evidence pickle opened below must be among them.
uploaded = files.upload()
input_file = "scifact_evidence_embeddings.pkl"
# SECURITY: pickle.load can execute arbitrary code while deserialising.
# Only load pickle files that you generated yourself.
with open(input_file, "rb") as f:
    embeddings = pickle.load(f)

# `embeddings` is iterated as a mapping whose keys unpack into
# (doc_id, abstract) and whose values are the embedding vectors.
embedding_list = []
doc_id_list = []
abstract_list = []

for (doc_id, abstract), embedding in embeddings.items():
    # The three lists stay position-aligned: row i of the FAISS index
    # corresponds to doc_id_list[i] / abstract_list[i].
    doc_id_list.append(doc_id)
    abstract_list.append(abstract)
    embedding_list.append(embedding)


embedding_matrix = np.array(embedding_list).astype('float32')
# Fail with a readable message (instead of an IndexError on shape[1]) when the
# pickle was empty or the vectors do not stack into a 2-D matrix.
assert embedding_matrix.ndim == 2, f"embedding_matrix has incorrect shape: {embedding_matrix.shape}"
embedding_dim = embedding_matrix.shape[1]
# Flat (exhaustive) L2-distance index over all evidence embeddings.
index = faiss.IndexFlatL2(embedding_dim)

index.add(embedding_matrix)

Saving scifact_claim_embeddings.pkl to scifact_claim_embeddings.pkl
Saving scifact_evidence_embeddings.pkl to scifact_evidence_embeddings.pkl


In [None]:
claim_file = "scifact_claim_embeddings.pkl"
# SECURITY: pickle.load can execute arbitrary code while deserialising.
# Only load pickle files that you generated yourself.
with open(claim_file, "rb") as f:
    claims = pickle.load(f)


# Keys of `claims` are unpacked further down as (claim_id, claim_text);
# values are the claim embedding vectors. Both lists share the same order.
claim_id_list = list(claims.keys())
claim_embedding_list = list(claims.values())


# The scifact dataset is loaded through a dataset script. Without
# trust_remote_code=True, load_dataset stops at an interactive "[y/N]" prompt,
# which blocks an unattended Restart & Run All. Passing the flag means the
# dataset repository's loading code is executed without asking.
scifact_corpus = load_dataset("scifact", "corpus", trust_remote_code=True)
scifact_claims = load_dataset("scifact", "claims", trust_remote_code=True)


# Gold labels: claim id -> list of cited document ids.
# Only the 'train' split is read here, so claims whose id does not appear in
# that split are dropped by the alignment loop below.
claim_id_to_gold = {
    entry['id']: entry['cited_doc_ids'] for entry in scifact_claims['train']
}


# Keep only claims that have a gold entry, preserving the pairing between each
# claim's embedding and its list of relevant document ids.
aligned_gold_relevant_ids = []
aligned_claim_embeddings = []

for (claim_id, _claim_text), claim_embedding in zip(claim_id_list, claim_embedding_list):
    if claim_id in claim_id_to_gold:
        aligned_gold_relevant_ids.append(claim_id_to_gold[claim_id])
        aligned_claim_embeddings.append(claim_embedding)

In [3]:
# Guard for the evaluation: when no claim embedding could be matched to a gold
# entry there is nothing to score. This branch only prints a message; it does
# not stop the notebook.
if len(aligned_claim_embeddings) == 0:
    print("No aligned claim embeddings found. Exiting.")
else:
    # Stack the aligned claim embeddings into one array and cast to float32
    # before it is passed to index.search further down.
    claim_matrix = np.array(aligned_claim_embeddings).astype('float32')


    # One row per claim is required; anything other than a 2-D array means the
    # embeddings did not stack into a regular matrix.
    assert claim_matrix.ndim == 2, f"claim_matrix has incorrect shape: {claim_matrix.shape}"

    def map_faiss_indices_to_doc_ids(retrieved_indices, doc_id_list):
        """
        Map the FAISS retrieved indices to actual document IDs.

        Args:
            retrieved_indices: iterable of rows, one per query; each row is an
                iterable of integer positions into ``doc_id_list`` (e.g. the
                label matrix returned by ``index.search``).
            doc_id_list: sequence of document IDs, in the same order in which
                the vectors were added to the index.

        Returns:
            A list with one list of document IDs per query, in ranked order.

        Negative positions are skipped: FAISS fills a result row with -1 when
        fewer than k neighbours are available, and indexing a Python list with
        -1 would silently return the *last* document instead.
        """
        return [
            [doc_id_list[idx] for idx in indices if idx >= 0]
            for indices in retrieved_indices
        ]

    def calculate_map(retrieved_indices, relevant_indices):
        """
        Calculate the Mean Average Precision (MAP).

        Args:
            retrieved_indices: one ranked list of retrieved document IDs per
                query (best match first).
            relevant_indices: one collection of relevant (gold) document IDs
                per query, aligned by position with ``retrieved_indices``.

        Returns:
            The sum of per-query average precision divided by
            ``len(relevant_indices)``, or 0.0 when ``relevant_indices`` is
            empty.

        Queries with no gold documents, and queries with no relevant document
        among the retrieved items, add nothing to the sum but still count in
        the denominator (i.e. they contribute an AP of 0).

        NOTE(review): ``y_true`` only covers the retrieved items, so relevant
        documents that were not retrieved at all do not lower a query's AP.
        Confirm this is the intended definition of the metric.
        """
        if len(relevant_indices) == 0:
            # Nothing to average over; avoid a ZeroDivisionError below.
            return 0.0

        ap_sum = 0
        for idx, retrieved in enumerate(retrieved_indices):
            relevant = relevant_indices[idx]
            if len(relevant) == 0:
                continue

            # Binary relevance label for each retrieved item, in rank order.
            y_true = [1 if doc_id in relevant else 0 for doc_id in retrieved]
            if sum(y_true) == 0:
                continue

            # Strictly decreasing pseudo-scores (1, 1/2, 1/3, ...) so that the
            # scorer sees the items in their retrieved rank order.
            y_score = [1 / (rank + 1) for rank in range(len(retrieved))]
            ap_sum += average_precision_score(y_true, y_score)
        return ap_sum / len(relevant_indices)


    # Number of nearest evidence documents retrieved per claim.
    k = 20
    # D holds the distances and I the positions (rows of the index) of the k
    # nearest evidence embeddings for every claim; only I is used below.
    D, I = index.search(claim_matrix, k)
    mapped_retrieved_ids = map_faiss_indices_to_doc_ids(I, doc_id_list)


    # Sanity check: one retrieved list per gold list before scoring.
    if len(mapped_retrieved_ids) != len(aligned_gold_relevant_ids):
        print(f"Mismatch between retrieved and relevant indices for k={k}")
    else:
        map_score = calculate_map(mapped_retrieved_ids, aligned_gold_relevant_ids)
        # Label derived from k so the printed cutoff cannot drift from the
        # value actually used in the search.
        print(f"MAP@{k}: {map_score}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script:   0%|          | 0.00/7.92k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/3.33k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/8.06k [00:00<?, ?B/s]

The repository for scifact contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/scifact.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading data:   0%|          | 0.00/3.12M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5183 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/1261 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/300 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/450 [00:00<?, ? examples/s]

MAP@20: 0.677441460288456
