<a id="tabel"></a>
<b>Table of contents:</b>


* [1. Import Libraries & Dataset](#import)
* [2. Document Retrieval](#retrive)
    - [2.1. Raw Baseline(Querying from National Library of Medicine)](#1000-document)
    - [2.2. Enhanced Querying by adding Mesh Index](#enhanced)
* [3. BM 25](#bm-25)
* [4. Out-of-Box Neural Re-ranking](#Baseline-NN)
* [5. Fine Tune Neural Re-ranking on Training Dataset](#fine-tune)
* [6. Phase B(Question Specific Agents)](#phase_b)

[🏠 Table of Contents](#tabel)




<style>
  body {
    background: #111;          /* match your dark page */
    color: #fff;
    font-family: 'Poppins', Roboto, sans-serif;
  }

  .timeline {
    display: flex;
    align-items: stretch;      /* make children all same height */
    justify-content: center;
    gap: 40px;
    padding: 40px;
  }

  /* Left & right columns stack vertically */
  .phase-column {
    display: flex;
    flex-direction: column;
    gap: 30px;
  }

  /* dashed separator */
  .separator {
    border-left: 2px dashed #888;
    width: 0;
    /* stretch separator to the tallest column */
    align-self: stretch;
    margin: 0 20px;
  }

  .column-title {
    font-size: 1.2em;
    font-weight: 600;
    text-align: center;
    margin-bottom: 10px;
    text-transform: uppercase;
    letter-spacing: 1px;
  }

  .phase {
    width: 260px;
    height: 70px;
    line-height: 70px;
    color: white;
    font-weight: 600;
    text-align: center;
    border-radius: 6px;
    font-size: 1.05em;
    clip-path: polygon(0% 0%, 92% 0%, 100% 50%, 92% 100%, 0% 100%);
    box-shadow: 2px 4px 10px rgba(0,0,0,0.3);
    opacity: 0.9;
    transition: transform 0.2s, box-shadow 0.2s, opacity 0.2s;
  }
  .phase:hover {
    transform: scale(1.03);
    box-shadow: 4px 6px 15px rgba(0,0,0,0.4);
    opacity: 1;
  }
  .phase1 { background: #69B8F7; }
  .phase2 { background: #54A6F3; }
  .phase3 { background: #2B7BC1; }
  .phase4 { background: #1F557F; }
  .phase5 { background: #5C9D99; }
  .phase6 { background: #4E9B97; }

  @media (max-width: 800px) {
    .timeline {
      flex-direction: column;
      align-items: center;
    }
    .separator {
      display: none;
    }
  }
</style>

<div class="timeline">
  <!-- Phase A column -->
  <div class="phase-column">
    <div class="column-title">Phase A</div>
    <div class="phase phase1">1. 1000 Document Pool from API</div>
    <div class="phase phase2">2. Add BM25 (“Traditional” IR)</div>
    <div class="phase phase3">3. Representation Learning (BERT)</div>
    <div class="phase phase4">4. Baseline Neural Re-Ranking</div>
    <div class="phase phase5">5. Transfer Learning on the Neural</div>
  </div>

  <!-- Dashed line -->
  <div class="separator"></div>

  <!-- Phase B column -->
  <div class="phase-column">
    <div class="column-title">Phase B</div>
    <div class="phase phase6">6. Create Model for Phase B</div>
  </div>
</div>


## Library Imports


<a id="import"></a>

In [None]:
! pip install requirements.txt

In [None]:
# ----------------------------- #
#        Standard Library       #
# ----------------------------- #
import csv
import json
import requests
import logging
import math
import re
import os
import statistics
import pandas as pd
import pathlib
import random
import string,argparse,sys
import time
import faiss
from collections import defaultdict
from pathlib import Path
from typing import Dict, List, Tuple

# ----------------------------- #
#      Third-Party Libraries    #
# ----------------------------- #

# Scientific Computing
import numpy as np
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    AutoModelForQuestionAnswering,
    pipeline,
)
from sentence_transformers import SentenceTransformer


# Bioinformatics
from Bio import Entrez, Medline  # Accessing and parsing PubMed/NCBI data


# NLP & Text Processing
import nltk                      # Tokenization, stopwords, etc.
from rank_bm25 import BM25Okapi # BM25 ranking algorithm for information retrieval
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from rank_bm25 import BM25Okapi

# Sentence Transformers
from sentence_transformers import CrossEncoder, InputExample
from torch.utils.data import DataLoader

# Progress Bar / Visualization
from tqdm import tqdm, trange

<a id="retrive"></a>

## Retrieving a Pool of Documents through Querying

>https://www.nlm.nih.gov

### Baseline(Raw Querying from National Library of Medicine)

<a id="1000-document"></a>

In [None]:
# NCBI credentials are read from the environment so that no secret is stored in
# the notebook. Set NCBI_EMAIL (NCBI asks for a contact address) and, optionally,
# NCBI_API_KEY (raises the allowed request rate) before starting the kernel.
# NOTE(review): a key that was previously committed in this file should be
# treated as leaked and regenerated in the NCBI account settings.
EMAIL = os.environ.get("NCBI_EMAIL") or None
API_KEY = os.environ.get("NCBI_API_KEY") or None

Entrez.email = EMAIL
Entrez.api_key = API_KEY

RETMX = 1000  # Number of documents to fetch per question
TRAIN_DATA_URL = (
    "https://participants-area.bioasq.org/"
    "Tasks/13b/trainingDataset/training13b.json"
)
DATA_FILE = "training13b.json"
BATCH = 10_000_000    # PubMed efetch batch size (large enough for one request per question)

# The stopword corpus is a separate NLTK download; fetch it on first use instead
# of failing with LookupError on a fresh environment.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()  # Suffix removal (generalization of the query)



##############################---Utility Functions---###################################
# Download the training set once, then load it into memory.
# A zero-byte file (e.g. left behind by an interrupted download) counts as missing.
if not os.path.exists(DATA_FILE) or os.path.getsize(DATA_FILE) == 0:
    print(f"Downloading training data from:\n  {TRAIN_DATA_URL}")
    # A timeout keeps the cell from hanging forever on a stalled connection.
    resp = requests.get(TRAIN_DATA_URL, timeout=30)
    resp.raise_for_status()
    # The participants area answers with an HTML page when not logged in;
    # refuse to save that as if it were the JSON dataset.
    if b"<html" in resp.content[:100].lower():
        raise RuntimeError("Downloaded content is HTML, not JSON - check URL/login.")
    with open(DATA_FILE, "wb") as f:
        f.write(resp.content)
    print(f"Saved to {DATA_FILE}")

# Load the training file
with open(DATA_FILE, "r", encoding="utf-8") as f:
    data = json.load(f)

# Keep only the numeric PMID from each document URL
def pmids_from_urls(urls: List[str]) -> List[str]:
    """Return the numeric PMID found in each PubMed URL.

    URLs that do not contain a ``/pubmed/<digits>`` segment are skipped, so the
    result may be shorter than *urls*. Input order is preserved.
    """
    hits = (re.search(r'/pubmed/(\d+)', url) for url in urls)
    return [hit.group(1) for hit in hits if hit]

#####################################################################################


# Main Fetching Logic
def fetch_documents(pmids: List[str], batch_size: int = BATCH) -> List[Tuple[str,str,str]]:
    """Fetch (pmid, title, abstract) for each PMID via PubMed efetch.

    Args:
        pmids: PubMed identifiers to download.
        batch_size: Maximum number of PMIDs sent in a single efetch request.

    Returns:
        One ``(pmid, title, abstract)`` tuple per ``PubmedArticle`` record in
        the responses. Records without an abstract get an empty string.
    """
    docs: List[Tuple[str,str,str]] = []
    for start in range(0, len(pmids), batch_size):
        batch = pmids[start:start + batch_size]
        handle = Entrez.efetch(db="pubmed", id=",".join(batch), retmode="xml")
        try:
            records = Entrez.read(handle)
        finally:
            # Always release the HTTP response, even if parsing fails.
            handle.close()
        time.sleep(0.34)  # NCBI rate limiting
        for art in records.get('PubmedArticle', []):
            citation = art['MedlineCitation']
            article = citation['Article']
            title = article.get('ArticleTitle', '')
            # AbstractText is a list of sections; join them into one string.
            abst = " ".join(article.get('Abstract', {}).get('AbstractText', []))
            # str() turns the parser's string-like elements into plain strings,
            # matching the declared return type.
            docs.append((str(citation['PMID']), str(title), abst))
    return docs

# Assemble the results under a top-level "questions" key, mirroring the layout
# the competition's test files use.
questions_out = []
for q in data.get("questions", []):
    body = q.get("body", "").strip()
    pmids = pmids_from_urls(q.get("documents", []))
    docs = fetch_documents(pmids)

    questions_out.append({
        "id":        q.get("id", ""),
        "type":      q.get("type"),
        "body":      body,
        "documents": [
            {"pmid": pmid, "title": title, "abstract": abst}
            for pmid, title, abst in docs
        ],
    })

# Serialise the wrapped structure to disk.
out_dir = "api_retrival"
os.makedirs(out_dir, exist_ok=True)
out_path = Path(out_dir, "bio_api_docs_title+abstract_baseline.json")

with open(out_path, "w", encoding="utf-8") as f:
    f.write(json.dumps({"questions": questions_out}, indent=2, ensure_ascii=False))

total_docs = sum(len(entry['documents']) for entry in questions_out)
print(f"Processed {len(questions_out)} questions; total docs = "
      f"{total_docs}")

<a id="enhanced"></a>

## Enhanced Querying by adding Mesh Index

In [8]:
# NCBI credentials come from the environment (NCBI_EMAIL / NCBI_API_KEY) so no
# secret is stored in the notebook, and they are applied to Entrez here so this
# cell does not depend on an earlier cell having done so.
# NOTE(review): a key that was previously committed in this file should be
# treated as leaked and regenerated.
EMAIL = os.environ.get("NCBI_EMAIL") or None
API_KEY = os.environ.get("NCBI_API_KEY") or None
Entrez.email = EMAIL
Entrez.api_key = API_KEY

RETMX = 1000  # Number of documents to fetch
TRAIN_DATA_URL  = "https://participants-area.bioasq.org/Tasks/13b/trainingDataset/training13b.json"
DATA_FILE = "training13b.json"
BATCH = 10_000_000    # PubMed efetch batch size

# Load the training file
with open(DATA_FILE, "r", encoding="utf-8") as f:
    data = json.load(f)
questions = data.get("questions", [])




##############################---Utility Functions---###################################

def ensure_training_file() -> None:
    """Download the training set to ``DATA_FILE`` unless a non-empty copy exists.

    Raises:
        RuntimeError: if the first 100 bytes of the response contain ``<html``,
            i.e. a web page was served instead of the JSON dataset.
    """
    already_present = os.path.exists(DATA_FILE) and os.path.getsize(DATA_FILE) > 0
    if not already_present:
        print("Downloading training data …")
        response = requests.get(TRAIN_DATA_URL, timeout=30)
        response.raise_for_status()
        payload = response.content
        head = payload[:100].lower()
        if b"<html" in head:
            raise RuntimeError("Downloaded content is HTML, not JSON - check URL/login.")
        with open(DATA_FILE, "wb") as sink:
            sink.write(payload)
        print(f"Saved → {DATA_FILE}")

def esearch_pmids(query: str, retmax: int = RETMX) -> list:
    """
    Search PubMed for a question using Title/Abstract and MeSH term fields.

    The query is lower-cased and searched both as one quoted phrase and as
    individual whitespace-separated tokens, each against ``[Title/Abstract]``
    and ``[MeSH Terms]``. All parts are OR-ed together and the result is
    restricted to records that have an abstract.

    Args:
        query: Free-text question.
        retmax: Maximum number of PMIDs to request.

    Returns:
        A list of PMID strings (requested with ``sort="relevance"``); an empty
        list if the query is blank or all three attempts fail.
    """
    # Embedded double quotes would close the quoted phrase early and corrupt the
    # search term, so replace them with spaces and collapse whitespace.
    phrase = " ".join(query.lower().replace('"', " ").split())
    if not phrase:
        return []  # nothing to search for; skip the API call

    # dict.fromkeys removes repeated tokens while keeping their order; repeating
    # an OR clause adds nothing to the query.
    tokens = list(dict.fromkeys(phrase.split()))

    # MeSH terms are PubMed's controlled vocabulary; searching them in addition
    # to the title/abstract text is what distinguishes this from the baseline.
    parts = [f'"{phrase}"[Title/Abstract]', f'"{phrase}"[MeSH Terms]']
    for t in tokens:
        parts.append(f'{t}[Title/Abstract]')
        parts.append(f'{t}[MeSH Terms]')
    term = f"({' OR '.join(parts)}) AND hasabstract[text]"

    # Retry up to 3 times with exponential back-off. The broad except is a
    # deliberate best-effort: any failure is reported and retried.
    for attempt in range(3):
        try:
            handle = Entrez.esearch(
                db="pubmed",
                term=term,
                retmax=retmax,
                sort="relevance",
                retmode="xml"
            )
            try:
                result = Entrez.read(handle)
            finally:
                handle.close()  # release the HTTP response even if parsing fails
            time.sleep(0.1)
            return [str(pmid) for pmid in result.get('IdList', [])]
        except Exception as e:
            print(f"Esearch attempt {attempt+1} failed: {e}")
            time.sleep(2 ** attempt)
    return []

#####################################################################################

def fetch_documents(pmids: List[str], batch_size: int = BATCH) -> List[Tuple[str,str,str]]:
    """Fetch (pmid, title, abstract) for each PMID via PubMed efetch, with retries.

    Each batch is attempted up to three times with exponential back-off. A batch
    that fails all three attempts is skipped, so the result may contain fewer
    documents than requested.

    Args:
        pmids: PubMed identifiers to download.
        batch_size: Maximum number of PMIDs sent in a single efetch request.

    Returns:
        One ``(pmid, title, abstract)`` tuple per ``PubmedArticle`` record.
    """
    docs: List[Tuple[str,str,str]] = []
    for start in range(0, len(pmids), batch_size):
        batch = pmids[start:start + batch_size]
        for attempt in range(3):
            try:
                h = Entrez.efetch(db="pubmed", id=",".join(batch), retmode="xml")
                try:
                    recs = Entrez.read(h)
                finally:
                    h.close()  # release the HTTP response even if parsing fails
                time.sleep(0.34)
                break
            except Exception as e:
                print(f"    [efetch] attempt {attempt+1}/3 failed → {e}")
                time.sleep(2 ** attempt)
        else:                            # all retries exhausted: skip this batch
            continue

        for art in recs.get("PubmedArticle", []):
            citation = art["MedlineCitation"]
            art_el = citation["Article"]
            title = art_el.get("ArticleTitle", "")
            # AbstractText is a list of sections; join them into one string.
            abst = " ".join(art_el.get("Abstract", {})
                            .get("AbstractText", []))
            # str() turns the parser's string-like elements into plain strings.
            docs.append((str(citation["PMID"]), str(title), abst))
    return docs


def load_processed_ids(out_path: Path) -> set:
    """Return the set of question IDs already present in a JSONL file.

    Args:
        out_path: Path of the JSON Lines output file; it may not exist yet.

    Returns:
        The ``"id"`` value of every line that parses as a JSON object with an
        ``"id"`` key. Blank, truncated or otherwise malformed lines (e.g. a
        half-written last line after an interrupted run) are ignored. An empty
        set is returned when the file is missing or empty.
    """
    processed = set()
    if not (out_path.exists() and out_path.stat().st_size):
        return processed

    with out_path.open("r", encoding="utf-8") as fh:
        for line in fh:
            if not line.strip():
                continue
            try:
                processed.add(json.loads(line)["id"])
            except (json.JSONDecodeError, KeyError, TypeError):
                # Invalid JSON, an object without "id", or a non-object value:
                # treat the line as not processed.
                continue
    return processed

# Loop over each query in the training file or testing batches.
def main(limit: int | None, resume: bool) -> None:
    """Retrieve PubMed documents for the training questions and stream them to JSONL.

    For every question the body is searched with ``esearch_pmids``, the hits are
    downloaded with ``fetch_documents`` and one JSON record per question is
    appended to ``api_retrieval/bio_api_docs_mesh_streamed.jsonl``. Each record
    is flushed immediately so an interrupted run can be resumed.

    Args:
        limit: Process at most this many (remaining) questions; a falsy value
            means no limit.
        resume: If true, skip question IDs already in the output file and
            append; otherwise overwrite the file.
    """
    ensure_training_file()

    # Load input questions
    with open(DATA_FILE, "r", encoding="utf-8") as fh:
        questions = json.load(fh)["questions"]

    # Output file setup
    out_dir = Path("api_retrieval")
    out_dir.mkdir(exist_ok=True)
    out_path = out_dir / "bio_api_docs_mesh_streamed.jsonl"

    processed_ids = load_processed_ids(out_path) if resume else set()
    mode = "a" if resume else "w"

    # processed_ids is empty when not resuming, so this keeps every question.
    remaining = [q for q in questions if q["id"] not in processed_ids]
    if limit:
        remaining = remaining[:limit]

    # Progress bar if tqdm is available, plain iteration otherwise.
    try:
        from tqdm import tqdm
        iterator = tqdm(remaining, total=len(remaining), unit="q")
    except ModuleNotFoundError:
        iterator = remaining

    try:
        with out_path.open(mode, encoding="utf-8") as fout:
            for idx, q in enumerate(iterator, 1):
                qid, qtype = q["id"], q["type"]
                body = q["body"].strip()

                print(f"[{idx}] {qid} …")

                pmids = esearch_pmids(body)
                print(f"    pmids: {len(pmids)}")

                docs = fetch_documents(pmids)
                print(f"    docs : {len(docs)}")

                # NOTE(review): a question whose search or fetch failed is still
                # written (with fewer or zero documents) and is therefore
                # skipped on resume — confirm this is acceptable.
                record = {
                    "id": qid,
                    "type": qtype,
                    "body": body,
                    "documents": [
                        {"pmid": p, "title": t, "abstract": a}
                        for p, t, a in docs
                    ]
                }

                fout.write(json.dumps(record, ensure_ascii=False) + "\n")
                fout.flush()                   # ensure line is on disk
    except KeyboardInterrupt:
        print("\nInterrupted by user — progress saved. Resume with --resume.")
        return  # do not report completion for a partial run

    print("\n✅ Finished.")
    
    
# Inside a notebook sys.argv holds the kernel's own flags, so the options are
# passed to argparse explicitly instead of overwriting the global sys.argv.
# Example for a short test run: ["--resume", "--limit", "10"]
NOTEBOOK_ARGS = ["--resume"]

parser = argparse.ArgumentParser(
    prog="notebook",
    description="Stream PubMed docs for BioASQ questions."
)
parser.add_argument("--limit", type=int, default=None,
                    help="Process only first N questions.")
parser.add_argument("--resume", action="store_true",
                    help="Resume from existing output file.")
args = parser.parse_args(NOTEBOOK_ARGS)

main(limit=args.limit, resume=args.resume)

 96%|█████████▌| 943/987 [2:23:00<06:56,  9.46s/q]

    docs : 1000
[944] 661d4fabeac11fad3300001c …
    pmids: 1000


 96%|█████████▌| 944/987 [2:23:06<06:01,  8.41s/q]

    docs : 1000
[945] 65cfe0381930410b13000029 …
    pmids: 1000


 96%|█████████▌| 945/987 [2:23:16<06:10,  8.82s/q]

    docs : 1000
[946] 660999e7fdcbea915f00001c …
    pmids: 1000


 96%|█████████▌| 946/987 [2:23:30<07:09, 10.47s/q]

    docs : 1000
[947] 661c03c848a2c27714000004 …
    pmids: 1000


 96%|█████████▌| 947/987 [2:23:39<06:36,  9.92s/q]

    docs : 1000
[948] 660d1285fdcbea915f000036 …
    pmids: 1000


 96%|█████████▌| 948/987 [2:23:48<06:25,  9.88s/q]

    docs : 1000
[949] 661dc376fe9d0b3967000002 …
    pmids: 1000


 96%|█████████▌| 949/987 [2:23:58<06:18,  9.96s/q]

    docs : 1000
[950] 661d6252eac11fad33000025 …
    pmids: 1000


 96%|█████████▋| 950/987 [2:24:08<06:03,  9.83s/q]

    docs : 1000
[951] 65d1300e1930410b13000033 …
    pmids: 1000


 96%|█████████▋| 951/987 [2:24:14<05:17,  8.81s/q]

    docs : 994
[952] 65f77618c4010b4d7800002e …
    pmids: 1000


 96%|█████████▋| 952/987 [2:24:23<05:09,  8.85s/q]

    docs : 1000
[953] 661bdc9148a2c27714000001 …
    pmids: 1000


 97%|█████████▋| 953/987 [2:24:30<04:39,  8.22s/q]

    docs : 1000
[954] 6614fb7bfdcbea915f000044 …
    pmids: 1000


 97%|█████████▋| 954/987 [2:24:40<04:50,  8.79s/q]

    docs : 1000
[955] 661c424588784bab31000001 …
    pmids: 1000


 97%|█████████▋| 955/987 [2:24:53<05:23, 10.09s/q]

    docs : 1000
[956] 661d6473eac11fad33000027 …
    pmids: 1000


 97%|█████████▋| 956/987 [2:25:00<04:38,  9.00s/q]

    docs : 1000
[957] 65cf98431930410b1300000b …
    pmids: 1000


 97%|█████████▋| 957/987 [2:25:09<04:31,  9.03s/q]

    docs : 912
[958] 65f77465c4010b4d78000028 …
    pmids: 1000


 97%|█████████▋| 958/987 [2:25:18<04:26,  9.18s/q]

    docs : 1000
[959] 65ef8502dffffb9b6b000003 …
    pmids: 1000


 97%|█████████▋| 959/987 [2:25:26<04:00,  8.60s/q]

    docs : 1000
[960] 66165f2afdcbea915f000051 …
    pmids: 1000


 97%|█████████▋| 960/987 [2:25:35<04:01,  8.93s/q]

    docs : 1000
[961] 661c413f48a2c27714000006 …
    pmids: 1000


 97%|█████████▋| 961/987 [2:25:46<04:03,  9.37s/q]

    docs : 1000
[962] 661d51fdeac11fad3300001d …
    pmids: 1000


 97%|█████████▋| 962/987 [2:25:53<03:41,  8.87s/q]

    docs : 1000
[963] 65cfa70f1930410b1300000f …
    pmids: 1000


 98%|█████████▊| 963/987 [2:26:01<03:20,  8.35s/q]

    docs : 992
[964] 65f774a4c4010b4d78000029 …
    pmids: 1000


 98%|█████████▊| 964/987 [2:26:16<04:01, 10.49s/q]

    docs : 1000
[965] 65f37aeac4010b4d7800000c …
    pmids: 1000


 98%|█████████▊| 965/987 [2:26:23<03:27,  9.42s/q]

    docs : 1000
[966] 66168d7bfdcbea915f000056 …
    pmids: 1000


 98%|█████████▊| 966/987 [2:26:40<04:05, 11.68s/q]

    docs : 1000
[967] 661d2428eac11fad33000017 …
    pmids: 1000


 98%|█████████▊| 967/987 [2:26:47<03:22, 10.15s/q]

    docs : 1000
[968] 65cf96ce1930410b1300000a …
    pmids: 1000


 98%|█████████▊| 968/987 [2:26:56<03:10, 10.05s/q]

    docs : 1000
[969] 65f7754ec4010b4d7800002b …
    pmids: 1000


 98%|█████████▊| 969/987 [2:27:09<03:16, 10.92s/q]

    docs : 1000
[970] 660982eefdcbea915f000012 …
    pmids: 1000


 98%|█████████▊| 970/987 [2:27:16<02:42,  9.57s/q]

    docs : 1000
[971] 65f868eac4010b4d78000056 …
    pmids: 1000


 98%|█████████▊| 971/987 [2:27:29<02:52, 10.76s/q]

    docs : 1000
[972] 661d56c0eac11fad3300001f …
    pmids: 1000


 98%|█████████▊| 972/987 [2:27:35<02:18,  9.22s/q]

    docs : 1000
[973] 65cfa8f81930410b13000010 …
    pmids: 1000


 99%|█████████▊| 973/987 [2:27:43<02:03,  8.80s/q]

    docs : 1000
[974] 65f774dec4010b4d7800002a …
    pmids: 1000


 99%|█████████▊| 974/987 [2:27:54<02:03,  9.51s/q]

    docs : 1000
[975] 65f03163dffffb9b6b000005 …
    pmids: 1000


 99%|█████████▉| 975/987 [2:28:03<01:52,  9.35s/q]

    docs : 1000
[976] 65f846bfc4010b4d78000045 …
    pmids: 1000


 99%|█████████▉| 976/987 [2:28:18<02:01, 11.00s/q]

    docs : 1000
[977] 661d5c97eac11fad33000021 …
    pmids: 1000


 99%|█████████▉| 977/987 [2:28:23<01:33,  9.30s/q]

    docs : 1000
[978] 65cfe1d21930410b1300002a …
    pmids: 1000


 99%|█████████▉| 978/987 [2:28:31<01:19,  8.88s/q]

    docs : 1000
[979] 65f777d7c4010b4d78000032 …
    pmids: 1000


 99%|█████████▉| 979/987 [2:28:45<01:23, 10.46s/q]

    docs : 1000
[980] 660812b4fdcbea915f000006 …
    pmids: 1000


 99%|█████████▉| 980/987 [2:29:03<01:29, 12.72s/q]

    docs : 1000
[981] 6614f672fdcbea915f000042 …
    pmids: 1000


 99%|█████████▉| 981/987 [2:29:10<01:05, 10.91s/q]

    docs : 1000
[982] 661d4aa3eac11fad33000019 …
    pmids: 1000


 99%|█████████▉| 982/987 [2:29:26<01:02, 12.47s/q]

    docs : 1000
[983] 65d134bd1930410b13000037 …
    pmids: 1000


100%|█████████▉| 983/987 [2:29:35<00:46, 11.59s/q]

    docs : 1000
[984] 65f7789dc4010b4d78000034 …
    pmids: 1000


100%|█████████▉| 984/987 [2:29:55<00:41, 13.90s/q]

    docs : 1000
[985] 65f59e46c4010b4d78000015 …
    pmids: 1000


100%|█████████▉| 985/987 [2:30:02<00:23, 11.90s/q]

    docs : 1000
[986] 6616576afdcbea915f000050 …
    pmids: 1000


100%|█████████▉| 986/987 [2:30:10<00:10, 10.65s/q]

    docs : 1000
[987] 661d4c2eeac11fad3300001a …
    pmids: 1000


100%|██████████| 987/987 [2:30:16<00:00,  9.13s/q]

    docs : 1000

✅ Finished.





In [9]:
# Sanity check: how many questions does the training file contain?
data = json.loads(Path("training13b.json").read_text(encoding="utf-8"))
print("Total questions:", len(data["questions"]))

Total questions: 5389


In [10]:
# Compare the IDs already streamed to disk with the IDs in the training file.
# The JSONL file is written as UTF-8, so read it as UTF-8 rather than with the
# platform's default encoding; blank lines are skipped.
with open("api_retrieval/bio_api_docs_mesh_streamed.jsonl", encoding="utf-8") as f:
    written_ids = {json.loads(line)["id"] for line in f if line.strip()}

with open("training13b.json", "r", encoding="utf-8") as f:
    all_ids = {q["id"] for q in json.load(f)["questions"]}

missing = all_ids - written_ids
print("Remaining questions to process:", len(missing))

Remaining questions to process: 0


In [11]:
# Count the records (one per line) written so far. The file is UTF-8, so decode
# it explicitly instead of relying on the platform's default encoding.
with open("api_retrieval/bio_api_docs_mesh_streamed.jsonl", encoding="utf-8") as f:
    line_count = sum(1 for _ in f)
print("Written questions:", line_count)


## BM 25(Traditional IR Model)

<a id="bm-25"></a>

In [None]:
# BM25 stage settings
RETMX = 1000             # size of the candidate pool requested per question
TOP_K = 10               # number of documents kept per question
OUT_DIR = "api_retrival"

# Input pool and output ranking locations
INPUT_FILE = Path(OUT_DIR, "bio_api_docs_title+abstract.json")
OUTPUT_FILE = Path("bm_25", "bm_25_output.json")

def main():
    """Rank each question's PubMed candidates with BM25 and save the top URLs.

    Questions are read from ``INPUT_FILE``. For each one, up to ``RETMX`` PMIDs
    are searched and fetched, the abstracts are scored against the question
    with BM25, and the ``TOP_K`` best documents are written to ``OUTPUT_FILE``
    as PubMed URLs.

    NOTE(review): this redefines the ``main(limit, resume)`` from the retrieval
    cell above. ``preprocess`` is not defined in the visible part of this
    notebook — presumably a tokenizer built on ``stop_words``/``stemmer``;
    confirm it is defined before this cell runs.
    """
    # read the 'questions' key
    data = json.loads(INPUT_FILE.read_text(encoding="utf-8"))['questions']
    output = []

    for step, item in enumerate(data, 1):
        print(f"Step {step}/{len(data)}: Fetching PMIDs…")
        qid, qtype, body = item['id'], item.get('type'), item['body']

        pmids = esearch_pmids(body, RETMX)
        docs = fetch_documents(pmids)
        print(f"  Fetched {len(docs)} documents")

        top_urls = []
        # Only the abstract (third tuple element) is indexed.
        corpus = [preprocess(abstract) for _, _, abstract in docs]
        # BM25Okapi divides by the corpus size and average document length, so
        # an empty pool (or one with only empty token lists) must be skipped.
        if any(corpus):
            bm25 = BM25Okapi(corpus)
            scores = bm25.get_scores(preprocess(body))
            top_idx = sorted(range(len(scores)),
                             key=lambda pos: scores[pos],
                             reverse=True)[:TOP_K]
            top_urls = [
                f"http://www.ncbi.nlm.nih.gov/pubmed/{docs[pos][0]}"
                for pos in top_idx
            ]

        output.append({
            'id':        qid,
            'type':      qtype,
            'body':      body,
            'documents': top_urls,
            'snippets':  []
        })

    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
    OUTPUT_FILE.write_text(json.dumps({'questions': output}, indent=2),
                           encoding="utf-8")
    print(f"Saved output to {OUTPUT_FILE}")


main()

# Representation Learning(NLP)

<a id="represent"></a>

In [None]:
# Settings for the dense (bi-encoder) ranking stage.
# NOTE(review): the retrieval cell above writes
# "bio_api_docs_mesh_streamed.jsonl" (JSON Lines), while this cell loads a
# single JSON document with a "questions" key — presumably converted
# elsewhere; confirm.
IN_PATH  = Path("api_retrieval/bio_api_docs_mesh.json")   # your file
OUT_PATH = Path("nlp/bio_dense_docs.json")
# Overrides the TOP_K = 10 set in the BM25 cell.
TOP_K    = 100                                            # keep best k

MODEL    = "pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb"  # biomedical sentence-transformers model
# Use the GPU when one is visible, otherwise fall back to the CPU.
device   = "cuda" if torch.cuda.is_available() else "cpu"
encoder  = SentenceTransformer(MODEL, device=device)
 

##############################---Utility Functions---###################################
def embed(texts, batch=32):
    """Encode *texts* with the module-level ``encoder``.

    Embeddings are requested as a NumPy array with L2-normalised rows, so an
    inner product between two rows equals their cosine similarity.
    """
    vectors = encoder.encode(
        texts,
        batch_size=batch,
        convert_to_numpy=True,
        normalize_embeddings=True,
    )
    return vectors
 
def dense_rank(q_vec, doc_vecs, top_k=TOP_K):
    """Return the indices and scores of the documents most similar to *q_vec*.

    Similarity is the inner product, which equals cosine similarity when the
    vectors are unit-normalised.

    Args:
        q_vec: 1-D query embedding.
        doc_vecs: 2-D array with one document embedding per row.
        top_k: Maximum number of results; clamped to the number of documents.

    Returns:
        ``(indices, similarities)`` as returned by the FAISS search, best match
        first. Both are empty when there are no documents.
    """
    # FAISS pads missing results with index -1 when asked for more neighbours
    # than the index holds; a caller indexing a list with -1 would silently
    # pick the last document. Never ask for more than exists.
    k = min(top_k, doc_vecs.shape[0])
    if k <= 0:
        return np.empty(0, dtype=np.int64), np.empty(0, dtype=np.float32)

    idx = faiss.IndexFlatIP(doc_vecs.shape[1])
    idx.add(doc_vecs)
    D, I = idx.search(q_vec[None, :], k)
    return I[0], D[0]         # indices, similarities

#####################################################################################
 
# Re-rank every question's document pool by embedding similarity and keep the
# TOP_K best documents.
with open(IN_PATH, encoding="utf-8") as f:
    data = json.load(f)

new_questions = []
for q in tqdm(data["questions"], desc="queries"):

    query_txt = q["body"].strip()
    docs = q["documents"]
    ranked_docs = []

    # A question whose retrieval returned nothing has no vectors to index;
    # keep it in the output with an empty document list instead of crashing.
    if docs:
        doc_texts = [f"{d['title']} {d['abstract']}" for d in docs]

        # embeddings
        q_vec = embed([query_txt])[0]
        doc_vecs = embed(doc_texts)

        # cosine similarity ranking (never ask for more hits than documents)
        top_idx, sims = dense_rank(q_vec, doc_vecs, top_k=min(TOP_K, len(docs)))

        # rebuild doc list in new order, attach scores
        for i, s in zip(top_idx, sims):
            if i < 0:
                # FAISS uses -1 for "no result"; docs[-1] would silently
                # duplicate the last document.
                continue
            d = docs[i].copy()
            d.update({"dense_rank": len(ranked_docs) + 1, "dense_score": float(s)})
            ranked_docs.append(d)

    new_questions.append({
        "id":   q["id"],
        "type": q["type"],
        "body": query_txt,
        "documents": ranked_docs          # now densified & trimmed
    })

# Write
OUT_PATH.parent.mkdir(parents=True, exist_ok=True)
with open(OUT_PATH, "w", encoding="utf-8") as f:
    json.dump({"questions": new_questions}, f, indent=2, ensure_ascii=False)

print(f"Done. Dense-ranked output → {OUT_PATH}")

## Out-of-Box Neural Re-ranking(Cross Encoder trained on MS Marco Passage Ranking)


<a id="Baseline-NN"></a>

In [None]:
# The cross-encoder below is pinned to device="cuda", so fail early with a clear
# error when no GPU is visible (an `assert` would be stripped under `python -O`).
if not torch.cuda.is_available():
    raise RuntimeError("CUDA is not available.")


with open(INPUT_FILE, 'r', encoding='utf-8') as f:
    data = json.load(f)

model = CrossEncoder(
    "cross-encoder/ms-marco-MiniLM-L-6-v2",
    device="cuda"
)


top_k = 10  # Limited by the competition
questions = data['questions']

# Build one big list of all (query, abstract) pairs and remember how many
# documents each question contributed so the scores can be split again.
all_pairs = []
doc_counts = []
for q in questions:
    docs = q['documents']
    doc_counts.append(len(docs))
    all_pairs.extend((q['body'], d['abstract']) for d in docs)

# Predict scores in large batches (skip the model call when there is nothing
# to score).
scores = model.predict(all_pairs, batch_size=128) if all_pairs else np.empty(0)

# Split scores back per-question and pick top_k
output = {"questions": []}
idx = 0
for q, count in zip(questions, doc_counts):
    # slice out this question's scores
    chunk_scores = scores[idx: idx + count]
    idx += count

    # highest score first, keep at most top_k
    ranked = np.argsort(chunk_scores)[::-1][:top_k]
    top_docs = [q['documents'][i] for i in ranked]

    output["questions"].append({
        "id":       q.get("id"),
        "body":     q['body'],
        "documents": top_docs,
        "snippets": []
    })

# Save results (JSON dump)

out_dir = "Neural_Reranking"
os.makedirs(out_dir, exist_ok=True)
neural_reranking_path = Path(out_dir) / "Baseline_Neural_Reranking.json"

with open(neural_reranking_path, "w", encoding="utf-8") as out:
    json.dump(output, out, indent=2, ensure_ascii=False)

# Report the file that was actually written.
print(f"Done – ran all questions in one big batch and wrote {neural_reranking_path}")


# Fine Tune Neural Re-ranking on Training Dataset

<a id="fine-tune"></a>


In [None]:
# Only a subset of the training set (about 500 of roughly 5000 items, per the
# original note) was used for fine-tuning because of compute limits. This cell
# builds that filtered dataset: the ground-truth questions whose IDs appear in
# the previously written retrieval file (`out_path`).
out_dir = "Neural_Reranking/Fine_Tuning"
os.makedirs(out_dir, exist_ok=True)  # the output file is written here below

filtered_training = Path(out_dir) / "Filtered_Training.json"

# Load the master (retrieval) file and collect its question IDs
with out_path.open('r', encoding='utf-8') as f:
    master = json.load(f)
master_ids = {q['id'] for q in master.get('questions', []) if 'id' in q}

# Load the ground-truth file. DATA_FILE is a plain string, which has no
# .open() method, so wrap it in Path first.
with Path(DATA_FILE).open('r', encoding='utf-8') as f:
    meta = json.load(f)

# Keep only the ground-truth questions whose ID is present in the master file
filtered_meta = [
    q for q in meta.get('questions', [])
    if q.get('id') in master_ids
]

# Write out the filtered questions
with filtered_training.open('w', encoding='utf-8') as f:
    json.dump({"questions": filtered_meta}, f, indent=2, ensure_ascii=False)

print(
    f"Filtered meta: kept {len(filtered_meta)} entries "
    f"out of {len(meta.get('questions', []))}."
)

In [None]:
# ---------------- fine-tuning data settings -----------------------
TRAIN_CSV = Path(out_dir, "train.csv")   # training pairs
DEV_CSV = Path(out_dir, "dev.csv")       # held-out pairs
DEV_SPLIT = 0.10                         # fraction of rows that go to dev
BATCH_SIZE = 16
# ------------------------------------------------------------------

def is_positive(abs_txt, snippets, exacts):
    """Decide whether an abstract counts as a relevant (label 1) document.

    An abstract is positive when it contains the text of one of the
    expert-provided snippets verbatim, or when it contains one of the exact
    answers as a whole word (case-insensitive). Everything else is label 0.
    The labels come from the retrieved pool so the model also sees the
    irrelevant documents the querying step tends to return.

    Args:
        abs_txt: Abstract text of the candidate document.
        snippets: Snippet dicts, each with a ``'text'`` key.
        exacts: Flat list of exact-answer strings.

    Returns:
        True if the abstract matches a snippet or an exact answer.
    """
    for s in snippets:
        snippet_text = s['text']
        # An empty string is a substring of every abstract; ignore blank
        # snippets so they cannot mark every document as relevant.
        if snippet_text.strip() and snippet_text in abs_txt:
            return True

    lowered = abs_txt.lower()  # lower-case once instead of once per token
    for token in exacts:
        needle = token.strip().lower()
        # A blank token would reduce the pattern to two word boundaries, which
        # match almost any text.
        if needle and re.search(r'\b{}\b'.format(re.escape(needle)), lowered):
            return True
    return False

def flatten_exact(exact_answer):
    """Flatten an ``exact_answer`` field into a flat list of answers.

    The field may be a list whose items are strings or lists of strings
    (flattened one level), or a single string, which is returned as a
    one-element list. ``None`` yields an empty list.
    """
    if exact_answer is None:
        return []
    if isinstance(exact_answer, str):
        # Iterating a string would yield its characters, turning "yes" into
        # the tokens "y", "e", "s".
        return [exact_answer]

    flat = []
    for item in exact_answer:
        if isinstance(item, list):
            flat.extend(item)
        else:
            flat.append(item)
    return flat

# Build (label, question, abstract) triples.
# Both JSON files are written as UTF-8, so read them as UTF-8 rather than with
# the platform's default encoding.
with filtered_training.open(encoding='utf-8') as f:
    qs = json.load(f)['questions']

with out_path.open(encoding='utf-8') as f:
    pools = {q['id']: q['documents'] for q in json.load(f)['questions']}

rows = []
for q in qs:
    qid = q['id']
    qtext = q['body']
    pool = pools.get(qid, [])

    snippets = q.get('snippets', [])
    exacts = flatten_exact(q.get('exact_answer', []))
    # PMIDs of the documents the expert snippets were taken from
    pos_ids = {s['document'].split('/')[-1] for s in snippets}

    for d in pool:
        abstract = d.get('abstract', '')
        matched = (str(d['pmid']) in pos_ids) or is_positive(abstract, snippets, exacts)
        rows.append((int(matched), qtext, abstract))

print(f"Collected {len(rows):,} pairs")

# Shuffle & split.
# NOTE(review): the split is per row, so pairs from the same question can land
# in both train and dev — confirm this is intended.
random.shuffle(rows)
cut = int(len(rows) * DEV_SPLIT)

train_rows = rows[cut:]
dev_rows = rows[:cut]

# Write CSV with full quoting. The loop variable is not called `data`, so the
# module-level `data` loaded earlier is not overwritten.
TRAIN_CSV.parent.mkdir(parents=True, exist_ok=True)
for path, split_rows in [(TRAIN_CSV, train_rows), (DEV_CSV, dev_rows)]:
    with path.open('w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f, quoting=csv.QUOTE_ALL)
        writer.writerow(['label', 'question', 'abstract'])  # header
        writer.writerows(split_rows)
    print(f"Wrote {len(split_rows):,} rows → {path}")

# Helper to read the CSV files written above
def read_csv(path: pathlib.Path):
    """Yield one ``InputExample`` per data row of a label/question/abstract CSV.

    Args:
        path: CSV file with a header row containing ``label``, ``question``
            and ``abstract`` columns.

    Yields:
        ``InputExample`` with ``texts=[question, abstract]`` and the label
        converted to ``float``.
    """
    # newline='' lets the csv module handle line endings itself; this matters
    # for quoted fields that contain embedded newlines.
    with path.open(encoding='utf-8', newline='') as f:
        rdr = csv.DictReader(f, quoting=csv.QUOTE_ALL)
        for row in rdr:
            yield InputExample(
                texts=[row['question'], row['abstract']],
                label=float(row['label'])
            )

# Materialise both splits and wrap the training split in a shuffling loader.
train_samples, dev_samples = (list(read_csv(csv_path)) for csv_path in (TRAIN_CSV, DEV_CSV))
train_loader = DataLoader(train_samples, batch_size=BATCH_SIZE, shuffle=True)

n_train, n_dev = len(train_samples), len(dev_samples)
print("Data ready:", n_train, "train pairs |", n_dev, "dev pairs | batch =", BATCH_SIZE)


In [None]:
# Read the CSVs from the same relative directory the previous cell wrote them
# to ("Neural_Reranking/Fine_Tuning"); a leading "/" would point at the
# filesystem root instead of the working directory.
train_df = pd.read_csv('Neural_Reranking/Fine_Tuning/train.csv')

test_df = pd.read_csv('Neural_Reranking/Fine_Tuning/dev.csv')

In [None]:
# Per-column count of missing values in each split
train_missing = train_df.isna().sum()
test_missing = test_df.isna().sum()

print(f"Missing Data in the Training Dataset: \n{train_missing}")

print(f"Missing Data in the Test Dataset: \n{test_missing}")

In [None]:
# Cleaning: drop every row with a missing value so both splits are complete
train_df, test_df = (frame.dropna() for frame in (train_df, test_df))

In [None]:
# Persist the cleaned splits next to the originals (relative path, matching
# where the CSVs were first written). index=False avoids adding an extra
# unnamed index column on every save; QUOTE_ALL keeps the same quoting the
# files were created with.
train_df.to_csv('Neural_Reranking/Fine_Tuning/train.csv', index=False, quoting=csv.QUOTE_ALL)

test_df.to_csv('Neural_Reranking/Fine_Tuning/dev.csv', index=False, quoting=csv.QUOTE_ALL)

In [None]:
# Initializing the Cross Encoder: PubMedBERT with a single output (num_labels=1),
# inputs capped at 512 tokens, placed on the GPU.
model = CrossEncoder(
    "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract",
    num_labels=1,
    max_length=512,
    device="cuda"
)

# Records a hidden-layer dropout probability of 0.1 in the model config. This
# line does not set the learning rate; that is passed to model.fit via
# optimizer_params.
# NOTE(review): changing the config after construction may not affect the
# already-built dropout layers — confirm.
model.config.hidden_dropout_prob = 0.1

In [None]:
# Preparing text-pair training data

BATCH     = 16


def load_csv(path):
    """Yield one ``InputExample`` per data row of a labelled-pair CSV file.

    Args:
        path: Filesystem path of a UTF-8 CSV file whose header contains the
            columns ``label``, ``question`` and ``abstract``. This must be a
            path that ``open()`` accepts, not a DataFrame.

    Yields:
        ``InputExample`` with ``texts=[question, abstract]`` and the ``label``
        column converted to ``float``.

    Raises:
        ValueError: If a ``label`` cell cannot be converted to ``float``.
    """
    # newline="" hands newline handling to the csv module, so line breaks
    # embedded inside quoted fields are read back exactly as written.
    with open(path, newline="", encoding="utf-8") as f:
        reader = csv.DictReader(f, quoting=csv.QUOTE_ALL)
        for row in reader:
            yield InputExample(
                texts=[row["question"], row["abstract"]],
                label=float(row["label"])
            )

# load_csv() opens its argument as a file, so it has to receive the paths of
# the cleaned CSVs saved to disk above rather than the in-memory DataFrames
# (passing a DataFrame to open() raises TypeError).
train_samples = list(load_csv('/Neural_Reranking/Fine_Tuning/train.csv'))
dev_samples   = list(load_csv('/Neural_Reranking/Fine_Tuning/dev.csv'))

# Only the training pairs are batched and shuffled; dev pairs stay a plain list.
train_loader  = DataLoader(train_samples, shuffle=True, batch_size=BATCH)

print(f"{len(train_samples):,} training pairs | "
      f"{len(dev_samples):,} dev pairs | batch={BATCH}")

In [None]:
# Choose the compute device once: "cuda" when a GPU is visible, else "cpu".
device = "cuda" if torch.cuda.is_available() else "cpu"

if device == "cuda":
    # Report the name of the first CUDA device (index 0).
    device_name = torch.cuda.get_device_name(0)
    print(f"✅ GPU found → {device_name}")
else:
    print("No CUDA GPU detected.")

In [None]:
# Fresh cross-encoder for fine-tuning: single-output head (num_labels=1) and
# inputs capped at 512 tokens.
model = CrossEncoder(
    "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract",
    num_labels=1,
    max_length=512,
    # Prefer the GPU (a T4 on Lightning.ai) but fall back to CPU instead of
    # failing when no CUDA device is available.
    device="cuda" if torch.cuda.is_available() else "cpu",
)

In [None]:
# Fine-tune for a single epoch over train_loader. Warm-up spans the first 10%
# of the batches, and the learning rate is kept small (2e-5).
# NOTE(review): confirm that output_path alone makes fit() write the model;
# the notebook also calls model.save() explicitly in the next cell.
model.fit(
    train_dataloader=train_loader,
    epochs=1,
    warmup_steps=int(0.1 * len(train_loader)),
    use_amp=True,
    output_path="/Neural_Reranking/Fine_Tuning/Model/biomed_cross_encoder",      
    show_progress_bar=True,        
    optimizer_params={'lr': 2e-5}
)

print("✅  Fine‑tuning complete. Model saved to biomed_cross_encoder/")

In [None]:
# Explicitly persist the fine-tuned model to the same directory that was
# passed to fit() as output_path.
model.save("/Neural_Reranking/Fine_Tuning/Model/biomed_cross_encoder")

In [None]:
# Reload the fine-tuned weights from disk. Use the GPU when there is one and
# fall back to CPU otherwise instead of hard-coding "cuda".
model_path = "/Neural_Reranking/Fine_Tuning/Model/biomed_cross_encoder"
model_dev  = CrossEncoder(
    model_path,
    device="cuda" if torch.cuda.is_available() else "cpu",
)

# Split the dev examples into model inputs and reference labels.
pairs   = [ex.texts for ex in dev_samples]   # [[question, abstract], ...]
labels  = [ex.label for ex in dev_samples]   # float labels read from the CSV


# Score every dev pair, then compare predictions with the labels:
# mean squared error and Pearson correlation.
scores  = model_dev.predict(pairs, batch_size=128)
mse     = np.mean((np.array(scores) - np.array(labels)) ** 2)
pearson = np.corrcoef(scores, labels)[0, 1]

print(f"\nDev set MSE       : {mse:.4f}")
print(f"Dev set Pearson r : {pearson:.4f}")

## Testing

In [None]:
out_dir = "Neural_Reranking"
INPUT_PATH = Path(out_dir) / "Baseline_Neural_Reranking.json"

# Both files are opened with Path.open() further down, so they must be Path
# objects. The original assignments were left empty, which is a SyntaxError.
# NOTE(review): PRED_FILE is assumed to be the re-ranking output defined just
# above — confirm.
PRED_FILE = INPUT_PATH
# TODO: point this at the real BioASQ golden file; this name is a placeholder.
GOLD_FILE = Path(out_dir) / "golden.json"
K         = 10                     # evaluate at top-K


def pmid_from_url(url: str) -> str:
    """Extract the PubMed ID from a document URL.

    Args:
        url: A document URL or a bare identifier.

    Returns:
        The digits following ``/pubmed/`` when that pattern is present;
        otherwise the last path segment of ``url``. Trailing slashes are
        ignored in the fallback so ``.../12345/`` yields ``"12345"`` rather
        than an empty string.
    """
    m = re.search(r'/pubmed/(\d+)', url)
    if m:
        return m.group(1)
    # Fallback for URLs without a /pubmed/<digits> part: take the last segment.
    return url.rstrip('/').rsplit('/', 1)[-1]

# Gold standard: question id -> set of relevant PMIDs (order is irrelevant).
gold_data = json.loads(GOLD_FILE.read_text(encoding="utf-8"))
gold_docs = {
    q["id"]: set(map(pmid_from_url, q["documents"]))
    for q in gold_data["questions"]
}

# Predictions: question id -> PMIDs in ranked order (order matters for the
# rank-based metrics computed below).
pred_data = json.loads(PRED_FILE.read_text(encoding="utf-8"))
pred_docs = {
    q["id"]: list(map(pmid_from_url, q["documents"]))
    for q in pred_data["questions"]
}

# ---------- compute metrics ----------
def metrics_for_query(gold_set, pred_list, k):
    """Return ``(precision@k, recall@k, AP, RR)`` for one query.

    Args:
        gold_set: Collection of relevant PMIDs for the query.
        pred_list: Predicted PMIDs in ranked order (best first).
        k: Cut-off; only the first ``k`` predictions are scored.

    Returns:
        A 4-tuple:
          * precision@k - hits divided by ``k``;
          * recall@k - hits divided by ``len(gold_set)`` (0 if it is empty);
          * AP - sum of the precision at each hit divided by
            ``len(gold_set)`` (0 if it is empty);
          * RR - reciprocal rank of the first hit (0 if there is none).

    Raises:
        ValueError: If ``k`` is not positive.
    """
    if k <= 0:
        raise ValueError("k must be a positive integer")

    hits = 0
    precision_sum = 0.0
    rr = 0
    credited = set()  # gold PMIDs already counted as a hit
    for rank, pmid in enumerate(pred_list[:k], start=1):
        # A PMID repeated in the ranking is credited once only; otherwise
        # recall and AP could exceed 1.
        if pmid in gold_set and pmid not in credited:
            credited.add(pmid)
            hits += 1
            precision_sum += hits / rank        # precision at this hit
            if rr == 0:
                rr = 1 / rank                   # first relevant --> RR
    prec_k = hits / k
    rec_k = hits / len(gold_set) if gold_set else 0
    ap = precision_sum / len(gold_set) if gold_set else 0
    return prec_k, rec_k, ap, rr

# Running totals of the per-question metrics, averaged at the end.
P_sum = R_sum = AP_sum = RR_sum = 0
count = 0
missing = []   # gold question ids that have no prediction entry

for qid, gold_set in gold_docs.items():
    pred_list = pred_docs.get(qid)
    if pred_list is None:
        # Questions without predictions are reported, not scored as zero.
        missing.append(qid)
        continue
    p, r, ap, rr = metrics_for_query(gold_set, pred_list, K)
    P_sum  += p
    R_sum  += r
    AP_sum += ap
    RR_sum += rr
    count  += 1

print(f"Evaluated {count} questions   (missing preds for {len(missing)})")
if count:
    print(f"Precision@{K}: {P_sum / count:.3f}")
    print(f"Recall@{K}   : {R_sum / count:.3f}")
    print(f"MAP          : {AP_sum / count:.3f}")
    print(f"MRR          : {RR_sum / count:.3f}")
else:
    # Avoid ZeroDivisionError when no gold question has a prediction.
    print("No questions could be evaluated; metrics are undefined.")


# Phase B (Question-Specific Agents)

<a id="phase_b"></a>


In [None]:
# Load the BioM-ELECTRA-Large SQuAD2 question-answering checkpoint.
# NOTE(review): neither `tokenizer` nor this `model` is referenced by the
# visible pipeline code that follows, and the assignment rebinds `model`
# (previously the cross-encoder) — confirm this cell is still needed.
tokenizer = AutoTokenizer.from_pretrained("sultan/BioM-ELECTRA-Large-SQuAD2")
model = AutoModelForQuestionAnswering.from_pretrained("sultan/BioM-ELECTRA-Large-SQuAD2")

In [None]:
# Phase B: Answer Generation Pipeline
# ------------------------------------
# This script takes the *reranked snippets* produced in Phase A and generates
#    exact answers for yes/no, factoid, and list questions
#    ideal (summary) answers for every question type
# It is fully self-contained: given an input JSON that follows BioASQ's
# `"questions" -> [ {id, type, body, snippets:[{text:..},...]} ]` format, it
# returns a submission-ready JSON with `exact_answer` and `ideal_answer` fields.
#
# Key design choices:
#   1. Keep three lightweight HuggingFace pipelines resident on GPU at once.
#   2. Heuristically size the summary so we do not overrun evaluation limits.
#   3. Treat "uncertain" yes/no outputs as *no* - BioASQ scores that higher.
#
# -------------------------------------------------------------

# Configuration
# Device index handed to every pipeline below: 0 when CUDA is available, -1 otherwise.
DEVICE = 0 if torch.cuda.is_available() else -1  # We would not like to see -1 here :/
# Maximum number of characters of joined snippet text kept per question (applied as a slice in main()).
CTX_CHAR_LIMIT = 3500                            # truncate long contexts
SUM_MIN, SUM_MAX_CAP = 30, 120                   # Min/Max token budget for summaries.  Dynamically rescaled further down.
out_dir = "Neural_Reranking"
# JSON file read by main().
INPUT_PATH = Path(out_dir) / "Baseline_Neural_Reranking.json"
# main() calls OUT_PATH.write_text(), so this must be a Path-like object.
# NOTE(review): DATA_FILE is defined outside this section — confirm it is the
# intended submission output path and not an input file that would be overwritten.
OUT_PATH = DATA_FILE


# 1) Yes/no classifier (checkpoint name: BioBERT fine-tuned on PubMedQA).
YN_CHECKPOINT = "learnifyhealth/biobert-v1-1-pubmedqa-finetuned"
yn_tok = AutoTokenizer.from_pretrained(YN_CHECKPOINT)
yn_model = AutoModelForSequenceClassification.from_pretrained(YN_CHECKPOINT)
yn_model.eval()  # inference only: switch the model to eval mode
yn_pipe = pipeline(
    "text-classification",
    tokenizer=yn_tok,
    model=yn_model,
    truncation=True,
    device=DEVICE,
)

# 2) Extractive span QA, used for factoid and list questions.
QA_CHECKPOINT = "deepset/roberta-base-squad2"
qa_tok = AutoTokenizer.from_pretrained(QA_CHECKPOINT)
qa_model = AutoModelForQuestionAnswering.from_pretrained(QA_CHECKPOINT).eval()
if DEVICE >= 0:
    # A non-negative DEVICE means CUDA was detected; move the weights there.
    qa_model.to("cuda")
qa_pipe = pipeline(
    "question-answering",
    tokenizer=qa_tok,
    model=qa_model,
    device=DEVICE,
)

# 3) Abstractive summariser that produces the "ideal" answers.
SUM_CHECKPOINT = "sshleifer/distilbart-cnn-12-6"
sum_pipe = pipeline(
    "summarization",
    tokenizer=SUM_CHECKPOINT,
    model=SUM_CHECKPOINT,
    device=DEVICE,
)

##############################---Utility Functions---###################################

def _map_yes_no(label: str) -> str:
    """Force model label → literal "yes"/"no"."""
    lab = label.lower()
    if lab.startswith(("yes", "true", "1")):
        return "yes"
    if lab.startswith(("no", "false", "0")):
        return "no"
    # treat uncertain / maybe as negative in BioASQ evaluation
    return "no"


def yes_no_answer(question: str, context: str) -> str:
    """Answer a yes/no question with the text-classification pipeline.

    The question and context are joined into one string around the
    tokenizer's separator token, and the top-ranked label is mapped to
    ``"yes"`` or ``"no"`` by ``_map_yes_no``.
    """
    joined = "{} {} {}".format(question, yn_tok.sep_token, context)
    best = yn_pipe(joined, top_k=1)[0]
    label = best["label"]
    return _map_yes_no(label)


def _dedup(tokens: List[str]) -> List[str]:
    """Deduplicate while preserving original order."""
    return list(dict.fromkeys(tokens))


def span_answers(question: str, context: str, k: int = 8) -> List[str]:
    """Extract up to ``k`` candidate answer spans for ``question``.

    Args:
        question: The question text.
        context: Text to extract answers from.
        k: Number of candidates requested from the QA pipeline.

    Returns:
        Cleaned answer strings in the pipeline's ranking order, with
        duplicates and empty strings removed. An empty list is returned when
        ``context`` is blank, since there is nothing to extract from.
    """
    # The question-answering pipeline rejects an empty context, which happens
    # when a question has no snippets; return no candidates instead of crashing.
    if not context.strip():
        return []

    res = qa_pipe(question=question, context=context, top_k=k)
    # Normalise to a list: a single result may be returned as a bare dict.
    res = res if isinstance(res, list) else [res]
    # Clean punctuation and whitespace artefacts: keep word characters,
    # hyphens and spaces only.
    raw = [r["answer"] for r in res]
    clean = [re.sub(r"[^\w\- ]", "", a).strip() for a in raw]
    return _dedup([c for c in clean if c])


def wrap_synonyms(strings: List[str]) -> List[List[str]]:
    """Wrap every string in its own single-element list, preserving order.

    This produces the list-of-lists shape used for ``exact_answer``.
    """
    wrapped = []
    for item in strings:
        wrapped.append([item])
    return wrapped


def summarise(context: str) -> str:
    """Generate an abstractive summary constrained by dynamic length bounds.

    Args:
        context: Source text to summarise.

    Returns:
        ``context`` stripped of surrounding whitespace when it is shorter
        than 20 tokens; otherwise the summariser's output, stripped.
    """
    n_toks = len(sum_pipe.tokenizer.tokenize(context))

    # Very short contexts are copied verbatim; avoids hallucinating.
    if n_toks < 20:
        return context.strip()

    # Upper bound: 80 % of the source length, never above SUM_MAX_CAP.
    max_len = min(int(0.8 * n_toks), SUM_MAX_CAP)
    # Lower bound: a third of the source (at least SUM_MIN), but always kept
    # 5 tokens below max_len, so min_len < max_len holds by construction and
    # no extra "min >= max" correction is needed.
    min_len = min(max_len - 5, max(SUM_MIN, int(0.33 * n_toks)))

    outputs = sum_pipe(
        context,
        min_length=min_len,
        max_length=max_len,
        do_sample=False,  # deterministic decoding: no sampling
        truncation=True,  # clip inputs longer than the model's input limit
    )
    return outputs[0]["summary_text"].strip()

#####################################################################################

# MAIN

def main() -> None:
    """Generate exact and ideal answers for every question in ``INPUT_PATH``.

    Reads the input JSON, builds one context string per question from its
    snippets, produces an ``exact_answer`` according to the question type and
    an ``ideal_answer`` summary, then writes ``{"questions": [...]}`` to
    ``OUT_PATH``.
    """
    # Explicit UTF-8 so reading does not depend on the platform's default
    # encoding (the rest of the notebook opens its JSON files the same way).
    data = json.loads(INPUT_PATH.read_text(encoding="utf-8"))
    results = []

    for q in data.get("questions", []):
        qid, qtype, body = q["id"], q["type"], q["body"]

        # Build a single context string of *all* snippets (truncated to
        # CTX_CHAR_LIMIT characters).
        ctx = " ".join(s.get("text", "") for s in q.get("snippets", []))[:CTX_CHAR_LIMIT]

        if qtype == "yesno":
            exact = yes_no_answer(body, ctx)

        elif qtype == "factoid":
            spans = span_answers(body, ctx, k=8)
            exact = wrap_synonyms(spans)[:5]  # keep at most 5 candidates

        elif qtype == "list":
            spans = span_answers(body, ctx, k=15)
            exact = wrap_synonyms(spans)

        else:  # summary-only questions have no exact answer
            exact = []

        # Every question type gets a summary as its ideal answer.
        ideal = summarise(ctx)

        results.append(
            {
                "id": qid,
                "type": qtype,
                "exact_answer": exact,
                "ideal_answer": ideal,
            }
        )

    OUT_PATH.write_text(
        json.dumps({"questions": results}, indent=2),
        encoding="utf-8",
    )
    print(f"✅ submission ready → {OUT_PATH}")


if __name__ == "__main__":
    main()