In [1]:
import pandas as pd
from pathlib import Path
# Folder that holds the SciFact JSONL files, relative to the notebook.
DATA_DIR = Path('./scifact_data/data')

# Every file is newline-delimited JSON (one record per line), hence lines=True.
# The order of the file names below matches the order of the targets on the left.
df_train, df_dev, df_test, df_corpus = (
    pd.read_json(DATA_DIR / file_name, lines=True)
    for file_name in (
        "claims_train.jsonl",
        "claims_dev.jsonl",
        "claims_test.jsonl",
        "corpus.jsonl",
    )
)

# Quick look at the first ten training claims.
print(df_train.head(10))


   id                                              claim  \
0   0  0-dimensional biomaterials lack inductive prop...   
1   2  1 in 5 million in UK have abnormal PrP positiv...   
2   4  1-1% of colorectal cancer patients are diagnos...   
3   6  10% of sudden infant death syndrome (SIDS) dea...   
4   9  32% of liver transplantation programs required...   
5  10  4-PBA treatment decreases endoplasmic reticulu...   
6  11  4-PBA treatment raises endoplasmic reticulum s...   
7  12  40mg/day dosage of folic acid and 2mg/day dosa...   
8  14                   5'-nucleotidase metabolizes 6MP.   
9  15  50% of patients exposed to radiation have acti...   

                                            evidence cited_doc_ids  
0                                                 {}    [31715818]  
1  {'13734012': [{'sentences': [4], 'label': 'CON...    [13734012]  
2                                                 {}    [22942787]  
3                                                 {}     [26137

In [33]:
import pickle
import pandas as pd
import numpy as np
import faiss
from pathlib import Path

# Load the pickled embeddings
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only point these paths at files produced by a trusted pipeline.
print("Loading embeddings...")
with open("scifact_evidence_embeddings.pkl", "rb") as evidence_file:
    doc_embeddings = pickle.load(evidence_file)

with open("scifact_claim_embeddings.pkl", "rb") as claim_file:
    claim_embeddings = pickle.load(claim_file)

DATA_DIR = Path('./scifact_data/data')
df_train = pd.read_json(DATA_DIR / "claims_train.jsonl", lines=True)
# we only have embeddings for train
#df_dev = pd.read_json(DATA_DIR / "claims_dev.jsonl", lines=True)
#df_test = pd.read_json(DATA_DIR / "claims_test.jsonl", lines=True)

# doc_embeddings is keyed by (doc_id, abstract) pairs; position i in `doc_ids`
# corresponds to row i of the matrix handed to the index below.
doc_ids = [str(doc_id) for doc_id, _abstract in doc_embeddings.keys()]

# Reverse lookup from stringified doc id to its row position
# (if an id repeats, the last position wins).
doc_id_to_idx = {doc_id: position for position, doc_id in enumerate(doc_ids)}

doc_embeddings_matrix = np.array(list(doc_embeddings.values())).astype('float32')

# Flat L2 index over the document vectors; dimensionality is read from the matrix.
index = faiss.IndexFlatL2(doc_embeddings_matrix.shape[1])
index.add(doc_embeddings_matrix)

# claim_embeddings is keyed by (claim_id, claim_text); only the id is kept as key.
claim_id_to_embedding = {
    claim_id: vector
    for (claim_id, _claim_text), vector in claim_embeddings.items()
}

def average_precision_at_k(retrieved_ids, relevant_ids_set, k):
    """Compute average precision over the first ``k`` retrieved ids.

    Precision is sampled at every rank (1-based) where a relevant id appears
    within the top ``k``; the sum of those precisions is divided by
    ``min(len(relevant_ids_set), k)``.

    Args:
        retrieved_ids: Ranked sequence of retrieved ids, best first.
        relevant_ids_set: Collection of ids considered relevant (membership
            is tested with ``in``).
        k: Rank cutoff.

    Returns:
        The average precision as a float. Returns 0.0 when there are no
        relevant ids or when ``k`` is not positive.
    """
    # Without relevant ids, or with a non-positive cutoff, the metric is
    # undefined: k == 0 would divide by zero and k < 0 would slice from the
    # wrong end and yield a negative denominator.
    if not relevant_ids_set or k <= 0:
        return 0.0

    hits = 0
    precision_sum = 0.0
    for rank, doc_id in enumerate(retrieved_ids[:k], start=1):
        if doc_id in relevant_ids_set:
            hits += 1
            precision_sum += hits / rank

    return precision_sum / min(len(relevant_ids_set), k)


def _relevant_doc_ids(row):
    """Return ``(relevant_ids, used_fallback)`` for one claim row.

    The keys of the row's ``evidence`` dict are used when it is non-empty;
    otherwise ``cited_doc_ids`` is used and ``used_fallback`` is True. Ids are
    stringified and restricted to those present in ``doc_id_to_idx``.
    """
    evidence = row.get('evidence', {}) or {}
    if isinstance(evidence, dict) and len(evidence) > 0:
        candidate_ids = {str(doc_id) for doc_id in evidence.keys()}
        used_fallback = False
    else:
        cited = row.get('cited_doc_ids', []) or []
        candidate_ids = {str(doc_id) for doc_id in cited}
        used_fallback = True

    # Keep only relevant docs that exist in our index
    in_index = {doc_id for doc_id in candidate_ids if doc_id in doc_id_to_idx}
    return in_index, used_fallback


def evaluate_retrieval(df_eval, split_name="eval", k_values=(1, 10, 50)):
    """Evaluate nearest-neighbour retrieval with MRR@k and MAP@k.

    For every claim in ``df_eval`` that has an embedding and at least one
    relevant document in the index, the FAISS ``index`` is searched with the
    claim embedding and the ranked doc ids are scored.

    Args:
        df_eval: DataFrame with an ``id`` column and ``evidence`` /
            ``cited_doc_ids`` columns describing the relevant documents.
        split_name: Label used only in the progress print-out.
        k_values: Rank cutoffs to evaluate; each must be a positive integer.

    Returns:
        Dict mapping ``"MRR@{k}"`` and ``"MAP@{k}"`` to float scores for each
        cutoff in ``k_values``.

    Raises:
        ValueError: If any cutoff in ``k_values`` is not positive.
    """
    k_values = list(k_values)
    if any(k <= 0 for k in k_values):
        raise ValueError("k_values must contain only positive integers")

    print(f"\nEvaluating on {split_name} set...")
    results = {}

    notinembedding = 0
    notrelevant = 0
    empty_evidence_fallbacks = 0
    queries = []

    # Filtering does not depend on k, so do it once. This also keeps the
    # skip counters from being multiplied by the number of cutoffs.
    for _, row in df_eval.iterrows():
        claim_id = row['id']

        # skip if no embedding for this claim
        if claim_id not in claim_id_to_embedding:
            notinembedding += 1
            continue

        # Some have evidence, but some don't so use cited_doc_ids as fallback
        relevant_ids, used_fallback = _relevant_doc_ids(row)
        if used_fallback:
            empty_evidence_fallbacks += 1
        if not relevant_ids:
            notrelevant += 1
            continue

        # FAISS expects a 2-D float32 array, one row per query.
        claim_embedding = np.asarray([claim_id_to_embedding[claim_id]], dtype='float32')
        queries.append((claim_embedding, relevant_ids))

    valid_queries = len(queries)

    for k in k_values:
        reciprocal_ranks = []
        avg_precisions = []

        for claim_embedding, relevant_ids in queries:
            # search
            _, indices = index.search(claim_embedding, k)
            # FAISS pads the result with -1 when fewer than k neighbours exist;
            # drop those so they are not read as doc_ids[-1] (the last document).
            retrieved_doc_ids = [doc_ids[idx] for idx in indices[0] if idx >= 0]

            # rr@k: reciprocal of the first rank holding a relevant doc, else 0
            rr = next(
                (
                    1.0 / rank
                    for rank, did in enumerate(retrieved_doc_ids, start=1)
                    if did in relevant_ids
                ),
                0.0,
            )
            reciprocal_ranks.append(rr)

            # AP@k
            avg_precisions.append(
                average_precision_at_k(retrieved_doc_ids, relevant_ids, k)
            )

        mrr = float(np.mean(reciprocal_ranks)) if reciprocal_ranks else 0.0
        map_k = float(np.mean(avg_precisions)) if avg_precisions else 0.0
        results[f"MRR@{k}"] = mrr
        results[f"MAP@{k}"] = map_k
        print(f"  MRR@{k}: {mrr:.4f} | MAP@{k}: {map_k:.4f}")

        # Denominator is the evaluated frame, not the global training frame.
        print(f"  Valid queries: {valid_queries}/{len(df_eval)}")
        print(f"  Used cited_doc_ids fallback for {empty_evidence_fallbacks} claims")
    print(notinembedding, "claims missing from embeddings")
    print(notrelevant, "claims with no relevant documents")
    return results

# Score retrieval on the training claims, then echo every metric in one summary.
train_results = evaluate_retrieval(df_train, split_name="train")


print("\nTrain Set:")
for metric_name in train_results:
    print(f"  {metric_name}: {train_results[metric_name]:.4f}")


Loading embeddings...

Evaluating on train set...
  MRR@1: 0.5884 | MAP@1: 0.5884
  Valid queries: 809/809
  Used cited_doc_ids fallback for 304 claims
  MRR@10: 0.6808 | MAP@10: 0.6771
  Valid queries: 809/809
  Used cited_doc_ids fallback for 304 claims
  MRR@50: 0.6851 | MAP@50: 0.6821
  Valid queries: 809/809
  Used cited_doc_ids fallback for 304 claims
0 claims missing from embeddings
0 claims with no relevant documents

Train Set:
  MRR@1: 0.5884
  MAP@1: 0.5884
  MRR@10: 0.6808
  MAP@10: 0.6771
  MRR@50: 0.6851
  MAP@50: 0.6821
