# Calculate best possible score from retrieved documents

In [1]:
# Standard Library
import json           
import time           
import logging        
import string        
import statistics
import re             
from pathlib import Path 
from typing import List, Dict
import numpy as np

# Bioinformatics Libraries
from Bio import Entrez, Medline   # For accessing and parsing PubMed/NCBI data

# Text Search / Ranking
from rank_bm25 import BM25Okapi   # For BM25 ranking algorithm. Extension of the TF-IDF (Term Frequency-Inverse Document Frequency) model, taking into account term frequency saturation and document length to improve ranking accuracy. 


# NLP and Tokenization Tools
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import nltk

from typing import List
import string

# Make sure the NLTK resources this notebook relies on are available locally:
# the 'punkt_tab' tokenizer data and the 'stopwords' corpus. When a package is
# already installed, nltk only reports that it is up to date.
nltk.download('punkt_tab')
nltk.download('stopwords')

# English stop-word set and a Porter stemmer, built once at import time.
# NOTE(review): neither name is referenced in the visible part of this
# notebook - confirm they are still needed.
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

# Progress Visualization
from tqdm import tqdm, trange    

# Machine Learning Libraries
import torch
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    AutoModelForQuestionAnswering, pipeline
)

import scipy

from sentence_transformers import SentenceTransformer

import requests

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/julian/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/julian/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
import os

# NCBI E-utilities credentials. Create your own key at
# https://account.ncbi.nlm.nih.gov/settings/ and export it as NCBI_EMAIL /
# NCBI_API_KEY; the literals below are only fallbacks so existing runs keep
# working.
# SECURITY(review): credentials are hard-coded in this notebook. Rotate them
# and drop the fallbacks before sharing or committing this file.
EMAIL   = os.environ.get("NCBI_EMAIL", "julian.fluer@outlook.com")
API_KEY = os.environ.get("NCBI_API_KEY", "258c639ed1b7790cc1b355f7cdca25e4f109")

# API key and endpoint for the synonym lookup (overridable via BIOPORTAL_API_KEY)
MESH_API_KEY = os.environ.get("BIOPORTAL_API_KEY", "2bb9ddd9-1828-419d-81df-bd89c4a97130")
BASE_URL = "https://data.bioontology.org/search"

# Hyper-parameters: you can play around with these to see if you get different
# results (only play with the number of candidates).
MAX_DOCS = 10  # list size required by BioASQ Phase-A
RETMX = 10000  # retrieve more, then truncate
SLEEP = 0.11  # pause after each request: 10 requests / sec with API key

# Input, output and cache locations, all relative to the data directory.
ROOT = Path("data")
TRAIN = ROOT / "training13b.json"
OUT = ROOT / "bm25_phaseA_run.json"
CACHE_DIR = ROOT / "medline_cache"
# parents=True: also create data/ itself, so neither this call nor the log
# file below fails when the directory does not exist yet.
CACHE_DIR.mkdir(parents=True, exist_ok=True)

# Warnings and errors from the retrieval loop go to a log file next to the data.
logging.basicConfig(
    filename=str(ROOT / "phaseA_esearch.log"),
    level=logging.WARNING,
    format="%(asctime)s %(levelname)s %(message)s"
)

# Biopython attaches these to every E-utilities request.
Entrez.email   = EMAIL
Entrez.api_key = API_KEY

In [20]:
def dcg(relevances, k):
    """Compute Discounted Cumulative Gain up to rank k.

    Uses the exponential gain ``2**rel - 1`` and a ``log2(rank + 1)`` discount.

    Args:
        relevances: Relevance grades in ranked order (best-ranked first).
        k (int): Truncation level; only the first ``k`` grades are used.

    Returns:
        float: The DCG value; ``0.0`` for an empty input.
    """
    # Work in floating point: with an integer array, 2**rel silently wraps
    # around int64 as soon as a grade reaches 63, which corrupts the score.
    gains = np.asarray(relevances, dtype=float)[:k]
    # Rank r (1-based) is discounted by log2(r + 1).
    discounts = np.log2(np.arange(2, gains.size + 2))
    return np.sum((2.0 ** gains - 1.0) / discounts)

def ndcg_at_k(predicted_list, ground_truth_list, k = 10):
    """
    Compute NDCG@k where both predicted and ground truth are ranked lists of doc IDs.

    Relevance is implicit: the first ground-truth document gets grade
    ``len(ground_truth_list)``, the next one grade minus one, and so on.
    Predicted documents absent from the ground truth get grade 0.

    Args:
        predicted_list (list): List of predicted doc IDs in ranked order.
        ground_truth_list (list): List of ground truth doc IDs in ranked order.
        k (int): Truncation level.

    Returns:
        float: NDCG@k score, or 0.0 when the ideal DCG is zero
        (e.g. empty ground truth).
    """
    # Assign implicit relevance scores: higher rank = higher score
    max_relevance = len(ground_truth_list)
    ground_truth_relevance = {
        doc_id: max_relevance - rank for rank, doc_id in enumerate(ground_truth_list)
    }

    # Map predicted documents to their relevance based on ground truth rank.
    # A document is credited only the first time it appears: counting repeats
    # would reward duplicated predictions and could push the score above 1.
    seen = set()
    predicted_relevances = []
    for doc_id in predicted_list:
        if doc_id in seen:
            predicted_relevances.append(0)
        else:
            seen.add(doc_id)
            predicted_relevances.append(ground_truth_relevance.get(doc_id, 0))

    # Ideal DCG from the perfect ranking (i.e., ground truth itself)
    ideal_relevances = sorted(ground_truth_relevance.values(), reverse=True)

    dcg_score = dcg(predicted_relevances, k)
    idcg_score = dcg(ideal_relevances, k)

    # No relevant documents at all: define the score as 0 instead of dividing by 0.
    if idcg_score == 0:
        return 0.0

    return dcg_score / idcg_score

def mean_ndcg_at_k(predictions, questions, k = 10):
    """
    Average NDCG@k over a run.

    Predictions and questions are paired by position, so both sequences must
    be in the same order. The sum is divided by ``len(predictions)``.

    Args:
        predictions: Sequence of dicts with a ranked ``"documents"`` list.
        questions: Sequence of ground-truth dicts with a ``"documents"`` list.
        k (int): Truncation level passed to ``ndcg_at_k``.

    Returns:
        float: Mean NDCG@k, or 0.0 when there are no predictions.
    """
    # Nothing to average: avoid dividing by zero.
    if not predictions:
        return 0.0

    sum_ndcg = sum(
        ndcg_at_k(prediction["documents"], question["documents"], k)
        for prediction, question in zip(predictions, questions)
    )

    return sum_ndcg / len(predictions)

In [21]:
def get_mesh_synonyms(query, limit=5):
    """
    Look up labels and synonyms for ``query`` in the MESH ontology.

    Sends a search request to ``BASE_URL`` and collects the ``prefLabel`` and
    ``synonym`` values of the returned results. The lookup is best-effort:
    any failure yields an empty list instead of an exception.

    Args:
        query (str): Text to search for.
        limit (int): Page size, i.e. the maximum number of results requested.

    Returns:
        list: Sorted, lower-cased labels and synonyms, excluding the query
        itself. Empty on a network error, a non-200 response or an
        unparsable body.
    """
    params = {
        'q': query,
        'ontologies': 'MESH',
        'apikey': MESH_API_KEY,
        'include': 'synonym,prefLabel',
        'pagesize': limit
    }

    try:
        # requests waits forever by default; bound the wait so one stalled
        # connection cannot hang the whole run.
        response = requests.get(BASE_URL, params=params, timeout=30)
    except requests.RequestException as exc:
        logging.warning("MeSH synonym lookup failed for %r: %r", query, exc)
        return []

    if response.status_code != 200:
        return []

    try:
        data = response.json()
    except ValueError as exc:
        logging.warning("MeSH synonym response for %r is not JSON: %r", query, exc)
        return []

    synonyms = set()

    for result in data.get('collection', []):
        labels = [result.get('prefLabel')]
        extra = result.get('synonym') or []
        # Tolerate a single string where a list of synonyms is expected.
        labels.extend([extra] if isinstance(extra, str) else extra)
        for label in labels:
            # Skip missing/empty labels so no empty search term is produced.
            if label:
                synonyms.add(label.lower())

    synonyms.discard(query.lower())
    # Sorted so the same response always yields the same expanded query.
    return sorted(synonyms)

In [22]:
def expand_query_with_mesh(query):
    """
    Build an OR-joined search term from ``query`` and its MeSH synonyms.

    The original query is searched as a quoted phrase in Title/Abstract; each
    synonym returned by ``get_mesh_synonyms`` is searched as a quoted phrase in
    both Title/Abstract and MeSH Terms.

    Args:
        query (str): The text to expand.

    Returns:
        str: All clauses joined with `` OR ``.
    """
    synonyms = get_mesh_synonyms(query)
    title_abstract_clauses = [f'"{syn}"[Title/Abstract]' for syn in synonyms]
    mesh_term_clauses = [f'"{syn}"[MeSH Terms]' for syn in synonyms]
    # Order matters for the resulting string: query first, then all
    # Title/Abstract synonym clauses, then all MeSH Terms clauses.
    return ' OR '.join(
        [f'"{query}"[Title/Abstract]', *title_abstract_clauses, *mesh_term_clauses]
    )

In [23]:
from typing import Tuple


def clean(query: str) -> Tuple[str, List[str]]:
    """
    Normalize a question into a search phrase and its tokens.

    The text is lower-cased, every character other than ``a-z``, ``0-9`` and
    whitespace is replaced by a space, and the result is tokenized with
    ``word_tokenize``. Tokens shorter than three characters are dropped.

    Args:
        query: Raw question text.

    Returns:
        A ``(phrase, tokens)`` pair, where ``phrase`` is the kept tokens
        joined by single spaces. Both are empty when no token survives.
    """
    s = re.sub(r'[^a-z0-9\s]', ' ', query.lower())
    tokens = [t for t in word_tokenize(s) if len(t) >= 3]
    phrase = " ".join(tokens)
    return phrase, tokens

def esearch_pmids(query: str, k: int = RETMX) -> List[str]:
    """
    Search PubMed for ``query`` and return the matching PMIDs.

    The search term ORs together the cleaned question as a quoted phrase and
    every individual token, each in both Title/Abstract and MeSH Terms, and is
    restricted to records that have an abstract. Results are requested sorted
    by relevance. The request is tried up to three times with exponential
    back-off between attempts.

    Args:
        query: Raw question text.
        k: Maximum number of IDs to request (``retmax``).

    Returns:
        The ``IdList`` of the response, or an empty list when the question
        yields no usable token or every attempt fails.
    """
    phrase, tokens = clean(query)
    if not tokens:
        return []
    # full-phrase lookups
    parts = [
        f'"{phrase}"[Title/Abstract]',
        f'"{phrase}"[MeSH Terms]',
    ]
    # individual-token lookups
    parts += [f'{t}[Title/Abstract]' for t in tokens]
    parts += [f'{t}[MeSH Terms]' for t in tokens]
    term = f"({' OR '.join(parts)}) AND hasabstract[text]"

    max_attempts = 3
    for attempt in range(1, max_attempts + 1):
        try:
            handle = Entrez.esearch(
                db="pubmed",
                term=term,
                retmax=k,
                sort="relevance",
                retmode="xml"
            )
            try:
                ids = Entrez.read(handle).get("IdList", [])
            finally:
                # Always release the HTTP response, even if parsing fails.
                handle.close()
        except Exception as e:
            # Deliberately broad: any network or parse failure triggers a retry.
            logging.warning(f"eSearch retry {attempt}: {e!r}")
            # Back off only when another attempt follows.
            if attempt < max_attempts:
                time.sleep(2 ** attempt)
        else:
            # Throttle successful calls to respect the request rate limit.
            time.sleep(SLEEP)
            return ids
    logging.error(f"All eSearch retries failed for {query!r}")
    return []

def pmid_to_url(pmid: str) -> str:
    """Return the PubMed document URL for the given PMID."""
    prefix = "http://www.ncbi.nlm.nih.gov/pubmed/"
    return f"{prefix}{pmid}"

In [25]:

# Load the training questions. The encoding is explicit so the file is decoded
# the same way on every platform instead of using the locale default.
qs = json.loads(TRAIN.read_text(encoding="utf-8"))["questions"]
print(f"Loaded {len(qs)} questions from {TRAIN.name}")

# Run one PubMed search per question and collect the results in run-file
# format. All retrieved IDs are kept; nothing is truncated here.
predictions = []
for q in tqdm(qs, unit="Q"):
    pmids = esearch_pmids(q["body"])
    predictions.append({
        "id": q["id"],
        "documents": [pmid_to_url(p) for p in pmids],
        "snippets": []                      # keep the field, even if empty, because the test expects it
    })

# Results are written only after the full loop has finished.
OUT.write_text(json.dumps({"questions": predictions}, indent=2), encoding="utf-8")
print(f"Wrote Phase-A run file → {OUT.resolve()}")
print("Check phaseA_esearch.log for warnings or API errors.")

Loaded 5389 questions from training13b.json


100%|██████████| 5389/5389 [3:43:58<00:00,  2.49s/Q]  


Wrote Phase-A run file → /Users/julian/Documents/TU/Advanced_Information_Retrieval/project/data/bm25_phaseA_run.json


In [5]:
# Reload the gold questions and the run file written by the retrieval cell, so
# evaluation can start from disk without repeating the search.
# The encoding is explicit to avoid depending on the platform default.
ground_truth = json.loads(TRAIN.read_text(encoding="utf-8"))["questions"]
corpus = json.loads(OUT.read_text(encoding="utf-8"))

In [None]:
# Dump the entire run file that was loaded into `corpus`.
# NOTE(review): with several thousand questions and up to RETMX URLs each this
# output can be very large - consider inspecting a small slice instead.
print(corpus)

: 

: 