<a id="tabel"></a>
<b>Table of contents:</b>


* [1. Import Libraries & Dataset](#import)

[🏠 Table of Contents](#tabel)


In [None]:
!pip install Bio

<style>
.phase-container {
    display: flex;
    flex-wrap: wrap;
    justify-content: center;
    gap: 15px;
    padding: 20px;
    font-family: 'Poppins', Roboto, sans-serif;
}

.phase {
    width: 260px;
    height: 70px;
    line-height: 70px;
    color: white;
    font-weight: 600;
    text-align: center;
    border-radius: 6px;
    position: relative;
    font-size: 1.05em;
    clip-path: polygon(0% 0%, 92% 0%, 100% 50%, 92% 100%, 0% 100%);
    box-shadow: 2px 4px 10px rgba(0, 0, 0, 0.2);
    opacity: 0.9;
    transition: all 0.3s ease;
    flex-shrink: 0;
}

.phase:hover {
    transform: scale(1.05);
    box-shadow: 4px 6px 15px rgba(0, 0, 0, 0.25);
    opacity: 1;
}

.phase1 { background: #69B8F7; }
.phase2 { background: #54A6F3; }
.phase3 { background: #2B7BC1; }
.phase4 { background: #1F557F; }
.phase5 { background: #5C9D99; }
.phase6 { background: #4E9B97; }

@media (max-width: 768px) {
    .phase {
        width: 90%;
    }
}
</style>

<div class="phase-container">
    <div class="phase phase1">1. Create Baseline</div>
    <div class="phase phase2">2. Add BM25 (Retrieval Algorithm)</div>
    <div class="phase phase3">3. Add Classification of Questions Model</div>
    <div class="phase phase4">4. Add Model for Yes/No Questions</div>
    <div class="phase phase5">5. Add Model for Factoid, List Questions</div>
    <div class="phase phase6">6. Add Model for Summary Questions</div>
</div>


## Library Imports


<a id="import"></a>

In [31]:
# Standard Library
import json           
import time           
import logging        
import string        
import statistics
import re             
from pathlib import Path 
from typing import List, Dict

# Bioinformatics Libraries
from Bio import Entrez, Medline   # For accessing and parsing PubMed/NCBI data

# Text Search / Ranking
from rank_bm25 import BM25Okapi   # For BM25 ranking algorithm. Extension of the TF-IDF (Term Frequency-Inverse Document Frequency) model, taking into account term frequency saturation and document length to improve ranking accuracy. 


# NLP and Tokenization Tools
import nltk                       # Natural Language Toolkit (tokenization, stopwords, etc.)

# Progress Visualization
from tqdm import tqdm, trange    

# Machine Learning Libraries
import torch
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    AutoModelForQuestionAnswering, pipeline
)


In [None]:
# Insanly Slow. Just know that till I state differently what we are doing here is nothing else but just a automatic search of the database
# Without any aditional thing added. This is so to say like a random predictor and everything for phase A will be built on top of this.
"""

logging.basicConfig(
    filename="phaseA_errors.log",
    level=logging.WARNING,
    format="%(asctime)s %(levelname)s %(message)s"
)


TRAIN = Path("/Users/greinaldpappa/Downloads/BioASQ-training13b/training13b.json")
OUT   = Path("./api_phaseA_run.json")


data = json.loads(TRAIN.read_text())
qs   = data["questions"]
print(f"🗒️  Loaded {len(qs)} questions.")


sess = start_session()
print("🗝️  API session:", sess)


predictions = []
for q in tqdm.tqdm(qs, unit="Q"):
    qid, query = q["id"], q["body"]
    pmids = []
    for attempt in range(1, 4):       # up to 3 tries per question
        try:
            pmids = find_pubmed_citations(sess, query, k=1000)
            break                       # success → exit retry loop
        except Exception as exc:
            wait = 2 ** attempt         # exponential back-off: 2,4,8s
            logging.warning(
                f"QID={qid} attempt {attempt} failed: {exc!r}; retrying in {wait}s"
            )
            time.sleep(wait)
    else:
        # all retries failed → log and move on with empty list
        logging.error(f"QID={qid} all retries failed; returning empty list")
    predictions.append({
        "query_id": qid,
        "documents": pmids
    })
    time.sleep(0.1)                


OUT.write_text(json.dumps(predictions, indent=2))
print(f"✅  Wrote {len(predictions)} entries to {OUT}")
print("⚠️  If you saw errors, check phaseA_errors.log.")"""

# Baseline Information Retrieval

In [20]:
TRAIN = Path("/Users/greinaldpappa/Downloads/BioASQ-training13b/training13b.json")
OUT   = Path("/Users/greinaldpappa/Downloads/BioASQ-training13b/esearch_phaseA_run.json")

# Get your API and put it HERE https://account.ncbi.nlm.nih.gov/settings/
EMAIL   = ""        
API_KEY = ""       

MAX_DOCS = 10        # list size required by BioASQ Phase-A
RETMX    = 60        # retrieve more, then truncate
SLEEP    = 0.11      # 10 requests / sec with API key



logging.basicConfig(
    filename="/Users/greinaldpappa/Downloads/BioASQ-training13b/phaseA_esearch.log",
    level=logging.WARNING,
    format="%(asctime)s %(levelname)s %(message)s"
)



Entrez.email   = EMAIL
Entrez.api_key = API_KEY



def esearch_pmids(query: str, k: int = RETMX) -> List[str]:
    """Return up to *k* PMIDs ranked by PubMed’s relevance score."""
    for attempt in range(1, 4):
        try:
            handle = Entrez.esearch(
                db="pubmed",
                term=query,
                retmax=k,
                sort="relevance",
                retmode="xml"
            )
            ids = Entrez.read(handle).get("IdList", [])
            time.sleep(SLEEP)             # respect 10 req/s limit when connected to API
            return ids
        except Exception as exc:
            logging.warning(f"E-search retry {attempt}: {exc!r}")
            time.sleep(2 ** attempt)      # exponential back-off
    logging.error(f"All retries failed for query: {query[:60]!r}")
    return []

def pmid_to_url(pmid: str) -> str:
    return f"http://www.ncbi.nlm.nih.gov/pubmed/{pmid}"

# MAIN WORK 

qs = json.loads(TRAIN.read_text())["questions"]
print(f"🗒️  Loaded {len(qs)} questions from {TRAIN.name}")

predictions = []
for q in tqdm(qs, unit="Q"):
    pmids = esearch_pmids(q["body"])
    predictions.append({
        "id": q["id"],
        "documents": [pmid_to_url(p) for p in pmids][:MAX_DOCS],
        "snippets": []                      # keep the field, even if empty becuase the test expects it
    })

OUT.write_text(json.dumps({"questions": predictions}, indent=2))
print(f"✅  Wrote Phase-A run file → {OUT.resolve()}")
print("⚠️  Check phaseA_esearch.log for warnings or API errors.")

🗒️  Loaded 5389 questions from training13b.json


100%|██████████| 5389/5389 [1:05:46<00:00,  1.37Q/s]

✅  Wrote Phase-A run file → /Users/greinaldpappa/Documents/GitHub/SS25AIR_Group14/esearch_phaseA_run.json





Test Score:
| Metric Index | Metric Name                          | Value                     |
|--------------|--------------------------------------|---------------------------|
| 1            | MAP (Mean Average Precision)         | 0.09842234475283342       |
| 2            | GMAP (Geometric Mean Average Prec.)  | 0.1201860059504669        |



## BM 25 Ranking

1. We are basically pooling for teh API and then fetching Title+Abstract and ranking using the BM 25 Ranking Algorithm(see below)

$$
\text{score}(q,d)= \sum_{t\in q} \underbrace{\text{IDF}(t)}_{\text{rarer words get more weight}} \cdot
\frac{f_{t,d}\,(k_1+1)}{f_{t,d}+k_1\bigl(1-b+b\cdot\frac{|d|}{\mathrm{avgdl}}\bigr)}
$$

* $f_{t,d}$ = how many times term *t* appears in *d*  
* $|d| / avgdl$  = length-normalisation so long abstracts aren’t unfairly boosted  
* **IDF(t)**   = inverse-document-frequency: words that appear in *many* abstracts (“the”, “cell”) get near-zero weight, rare biomedical terms (“trastuzumab”) dominate  
* $k_1,\,b$ = tuning constants (rank_bm25 defaults: 1.5, 0.75)

In [32]:

ROOT        = Path("/Users/greinaldpappa/Downloads/BioASQ-training13b")
TRAIN_FILE  = ROOT / "training13b.json"
RUN_FILE    = ROOT / "bm25_phaseA_run.json"  
CACHE_DIR   = ROOT / "medline_cache"
CACHE_DIR.mkdir(exist_ok=True)

# Get your API and put it HERE https://account.ncbi.nlm.nih.gov/settings/
EMAIL   = "pgreinald@gmail.com"
API_KEY = "9666f51fccbd68a29320334f1d78ad257608"

# hyper-params you can play around with these to see if you get different results(Only play with candidates)
CANDIDATES = 120    # eSearch pool
MAX_DOCS   = 10     # BM25 output size. Limited according to https://www.bioasq.org/
SLEEP      = 0.11   


logging.basicConfig(filename=ROOT / "bm25_phaseA.log",
                    level=logging.WARNING,
                    format="%(asctime)s %(levelname)s %(message)s")

Entrez.email, Entrez.api_key = EMAIL, API_KEY

# Nothing changed from before just a normal search like in google.
def fetch_candidates(question: str, k: int = CANDIDATES) -> List[str]:
    for attempt in range(1, 4):
        try:
            h = Entrez.esearch(db="pubmed", term=question,
                               sort="relevance", retmax=k, retmode="xml")
            ids = Entrez.read(h).get("IdList", [])
            time.sleep(SLEEP)
            return ids
        except Exception as exc:
            logging.warning(f"eSearch retry {attempt}: {exc!r}")
            time.sleep(2 ** attempt)
    logging.error(f"eSearch failed for {question[:60]!r}")
    return []

# Here the caching happens. The cache is a local file system cache where we save title and Abstract
def _cache(pmid: str) -> Path:
    return CACHE_DIR / f"{pmid}.txt"

def get_abstracts(pmids: List[str]) -> Dict[str, str]:
    texts, miss = {}, []
    for p in pmids:
        fp = _cache(p)
        if fp.exists():
            texts[p] = fp.read_text()
        else:
            miss.append(p)

    for start in trange(0, len(miss), 200, leave=False, desc="efetch"):
        batch = miss[start:start+200]
        if not batch: break
        try:
            h = Entrez.efetch(db="pubmed", id=",".join(batch),
                              rettype="medline", retmode="text")
            for rec in Medline.parse(h):
                txt = f"{rec.get('TI','')} {rec.get('AB','')}".strip()
                _cache(rec["PMID"]).write_text(txt)
                texts[rec["PMID"]] = txt
        except Exception as exc:
            logging.warning(f"EFetch chunk failed: {exc!r}")
        time.sleep(SLEEP)

    for p in pmids:
        texts.setdefault(p, "")
    return texts

#  BM25 rerank 
def bm25_top10(question: str, pmids: List[str], texts: Dict[str, str]) -> List[str]:
    usable = [p for p in pmids if texts[p].strip()]
    if not usable:
        return pmids[:MAX_DOCS]
    corpus = [texts[p].lower().split() for p in usable]
    bm25   = BM25Okapi(corpus)
    q_tok  = question.lower().split()
    scores = bm25.get_scores(q_tok)
    ranked = [p for p, _ in sorted(zip(usable, scores), key=lambda x: -x[1])]
    return ranked[:MAX_DOCS]

# Picks the sentence in the abstract with the most word-overlap with the question and returns it as a BioASQ-formatted snippet
_punct = str.maketrans(string.punctuation, " " * len(string.punctuation))
def best_snippet(question: str, pmid: str, text: str):
    if not text: return None
    q_terms = set(question.lower().translate(_punct).split())
    sentences = nltk.sent_tokenize(text)
    best = max(sentences,
               key=lambda s: len(set(s.lower().split()) & q_terms),
               default="")
    if not best: return None
    return {
        "document": pmid_to_url(pmid),
        "text": best,
        "offsetInBeginSection": 0,
        "offsetInEndSection": len(best),
        "beginSection": "abstract",
        "endSection": "abstract"
    }

def pmid_to_url(pid: str) -> str:
    return f"http://www.ncbi.nlm.nih.gov/pubmed/{pid}"

# Main function to run the pipeline
def main():
    qs = json.loads(TRAIN_FILE.read_text())["questions"]
    print(f"🗒️  Loaded {len(qs)} questions")

    out_qs = []
    for q in tqdm(qs, unit="Q", desc="BM25 pipeline"):
        pmids      = fetch_candidates(q["body"])
        abstracts  = get_abstracts(pmids)
        top_pmids  = bm25_top10(q["body"], pmids, abstracts)
        docs       = [pmid_to_url(p) for p in top_pmids]

        snips = []
        for p in top_pmids:
            s = best_snippet(q["body"], p, abstracts[p])
            if s: snips.append(s)
            if len(snips) == 10: break

        out_qs.append({"id": q["id"], "documents": docs, "snippets": snips})

    RUN_FILE.write_text(json.dumps({"questions": out_qs}, indent=2))
    print(f"✅  BM25 submission written → {RUN_FILE}")


if __name__ == "__main__":
    main()
    print("ℹ️  Baseline file remains at esearch_phaseA_run.json")
    print("⚠️  Check bm25_phaseA.log for warnings or API errors.")

🗒️  Loaded 5389 questions


BM25 pipeline: 100%|██████████| 5389/5389 [1:58:50<00:00,  1.32s/Q]  


✅  BM25 submission written → /Users/greinaldpappa/Downloads/BioASQ-training13b/bm25_phaseA_run.json
ℹ️  Baseline file remains at esearch_phaseA_run.json


Test Score:

| Metric Index | Metric Name                          | Value                     |
|--------------|--------------------------------------|---------------------------|
| 1            | MAP (Mean Average Precision)         | 0.10938912894285033       |
| 2            | GMAP (Geometric Mean Average Prec.)  | 0.12504363569986954       |


In [34]:
1693 / 5389

0.31415847095936167

In [33]:
# This is a quick check to see if the documents are being fetched correctly.
# Quick check. As I assumed the documents are not being fetched correctly in the first place where we have more than 1 hit only 0.31% of the documents
# but I dont understand why becuase we are looking at enough documents to expect good rankings but seeing from baseline retrival we are most of the time
# retriving irelevant information.

GOLD = json.load(open(TRAIN_FILE))["questions"]
RUN  = json.load(open(RUN_FILE))["questions"]

hit_positions = []
for g, r in zip(GOLD, RUN):
    gold_set = set(g["documents"])
    ranked   = r["documents"]
    pos      = next((i+1 for i, d in enumerate(ranked) if d in gold_set), None)
    if pos: hit_positions.append(pos)

print("Questions with ≥1 hit:", len(hit_positions), "/", len(GOLD))
print("Median hit position  :", statistics.median(hit_positions))

Questions with ≥1 hit: 1693 / 5389
Median hit position  : 2


# Phase A Plus which requires Answers and not just documents anymore
## See Steps below


## 🔄 Phase-A+ QA Pipeline: Step-by-Step

### **Step 1: Classify the Question Type**
- **Input**: Raw question text (e.g., `"Is Hirschsprung disease a mendelian or a multifactorial disorder?"`)
- **Model**: BioBERT-based sequence classifier  
  (`myctgh/biobert-question-type-bioasq13b`)
- **Tokenizer**: `dmis-lab/biobert-base-cased-v1.1`
- **Output**: One of the four types:
  - `"yesno"`
  - `"factoid"`
  - `"list"`
  - `"summary"`

---

### **Step 2: Select the Answering Strategy Based on Type**

#### 🔹 If `yesno`
- **Model**: BioBERT binary classifier  
  (`myctgh/biobert-yesno-bioasq13b`)
- **Tokenizer**: Reuse BioBERT tokenizer
- **Task**: Predict "yes" or "no" based on question and context

---

#### 🔹 If `factoid` or `list`
- **Model**: RoBERTa QA model  
  (`deepset/roberta-base-squad2`)
- **Tokenizer**: RoBERTa tokenizer
- **Task**: Extract spans from the context as answers

---

#### 🔹 If `summary`
- **Model**: T5-based abstractive summarizer  
  (`google/bioctrl-t5-base`)
- **Tokenizer**: T5 tokenizer
- **Task**: Generate a short natural-language summary from context

---

### **Step 3: Save Answers**
- Store both:
  - `"exact_answer"`: from yes/no, span, or list models
  - `"ideal_answer"`: generated summary
- Save to: `answers_phaseA+.json`




In [None]:
# phaseA_plus.
# pip install transformers sentencepiece accelerate torch>=2.0

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Q-type classifier (4-way)
# This here tokenizes the questions
qtype_tok   = AutoTokenizer.from_pretrained("dmis-lab/biobert-base-cased-v1.1")
# This here is a model which is pretrained on the BioASQ dataset and fine-tuned to give the type of question
qtype_model = AutoModelForSequenceClassification.from_pretrained(
                 "myctgh/biobert-question-type-bioasq13b")  # fine-tuned model
qtype_model.to(DEVICE).eval()  # Move model to device (GPU if available) and set to evaluation mode

# Yes/No classifier (binary)
# Reuses the same tokenizer as above (BioBERT-based)
yn_tok   = qtype_tok
# Loads a binary classifier (yes/no) fine-tuned on BioASQ yes/no questions
yn_model = AutoModelForSequenceClassification.from_pretrained(
              "myctgh/biobert-yesno-bioasq13b").to(DEVICE).eval()

# Extractive QA for factoid / list
# Tokenizer for extractive QA using RoBERTa model fine-tuned on SQuAD2.0
qa_tok   = AutoTokenizer.from_pretrained("deepset/roberta-base-squad2")
# Model for extractive QA (predicts answer spans in the context)
qa_model = AutoModelForQuestionAnswering.from_pretrained(
              "deepset/roberta-base-squad2").to(DEVICE).eval()

# HuggingFace pipeline to simplify question answering over a context
# Only used if the question type is either factoid or list
qa_pipe  = pipeline("question-answering",
                    model=qa_model, tokenizer=qa_tok,
                    device=0 if DEVICE=="cuda" else -1)


# Abstractive summariser for “ideal” answer
# Tokenizer for T5 model fine-tuned on BioASQ summarization task
sum_pipe = pipeline("summarization",
                    model="google/bioctrl-t5-base",
                    tokenizer="google/bioctrl-t5-base",
                    device=0 if DEVICE=="cuda" else -1,
                    max_length=150)


# Storing the question types in a list
Q_TYPES = ["yesno", "factoid", "list", "summary"]   


# This function takes a question string and returns its predicted type ("yesno", "factoid", "list", or "summary")
def classify_question(q: str) -> str:
    with torch.no_grad():  # Disable gradient tracking (inference mode, faster and uses less memory)
        # Tokenize the input question and move tensors to the correct device (CPU or GPU) I dont know in the Jupiter Notebook it says that
        # we are using the notebook is using GPU but when I go into it I just see mulitple CPUS(We can apply multithreding if no GPU since we have 40 CPUS).
        inputs = qtype_tok(q, return_tensors="pt").to(DEVICE)
        
        # Get the raw output logits from the classifier model
        logits = qtype_model(**inputs).logits
        
        # Get the index of the highest logit (i.e., most probable class), convert to corresponding label. Basically we are
        # getting out the index of the highest value in the logits and then we are using that index to get the type of question(Think of it as a vector where there are
        # probabilities of each question type and we want the most probable one)
        return Q_TYPES[int(logits.argmax())]


# This function answers yes/no questions using the yes-no classifier model
def yes_no_answer(q: str, ctx: str) -> str:
    # Format input as "question [SEP] context" and tokenize it
    inp = yn_tok(f"{q} [SEP] {ctx}", truncation=True,
                 return_tensors="pt").to(DEVICE)
    
    # Get logits and convert them to probabilities using softmax
    # Output: [no_prob, yes_prob]
    prob = yn_model(**inp).logits.softmax(-1).tolist()[0]
    
    # Return "yes" if yes_prob > no_prob, otherwise "no"
    return "yes" if prob[1] > prob[0] else "no"

# This function returns the top-k predicted spans from the context as answers to the question
# Requires pulling exact words/phrases from the text, not generating them.
def span_answers(q: str, ctx: str, top_k=5) -> list[str]:
    # Use the extractive QA pipeline to get answers from the context
    res = qa_pipe(question=q, context=ctx, top_k=top_k)
    
    # If multiple answers returned (list), extract each "answer" string
    # Otherwise, return a single answer in a list
    return [r["answer"] for r in res] if isinstance(res, list) else [res["answer"]]


# This function processes a raw list of string answers (from extractive QA),
# cleaning and formatting them for list-type questions.

def post_process_list(raw: list[str]) -> list[list[str]]:
    # Step 1: Clean each answer string
    # - Remove all non-word characters except dash (-) and space
    # - Strip leading/trailing whitespace
    clean = [re.sub(r"[^\w\- ]", "", a).strip() for a in raw]
    # Step 2: Deduplicate answers while preserving order (using dict.fromkeys)
    # Step 3: Format each cleaned answer as a list of one item (as required by BioASQ format)
    return [[a] for a in dict.fromkeys(clean) if a]


# File Paths.
DATA_DIR = Path("/Users/greinaldpappa/Downloads/BioASQ-training13b")
PHASEA   = DATA_DIR / "bm25_phaseA_run.json"
OUTFILE  = DATA_DIR / "answers_phaseA+.json"

# Load Phase-A BM25 results (predicted relevant snippets) from file
phaseA = json.loads(PHASEA.read_text())["questions"]

answers_json = []  # This will store the final answers for all questions

# Iterate over each question object in the Phase-A results
for qobj in phaseA:
    qid = qobj["id"]  # Unique question ID
    
    # Retrieve the full question text ("body") by matching ID in the training file
    query = next(q["body"] for q in
                 json.loads((DATA_DIR / "training13b.json").read_text())["questions"]
                 if q["id"] == qid)

    # Predict the type of the question: yesno, factoid, list, or summary
    qtype = classify_question(query)

    # Concatenate all retrieved snippets into a single context string (truncate to 3500 characters)
    ctx = " ".join(s["text"] for s in qobj["snippets"])[:3500]

    # Choose the appropriate answering strategy based on question type
    if qtype == "yesno":
        # Use binary classifier to return "yes" or "no"
        exact = yes_no_answer(query, ctx)
    elif qtype in {"factoid", "list"}:
        # Use extractive QA model to get answer spans
        raw = span_answers(query, ctx, top_k=8)
        # Clean and format answers (e.g., [["fever"], ["cough"]])
        exact = post_process_list(raw)
    else:
        # For summary questions, leave exact answer empty
        exact = []

    # Generate an ideal natural-language summary using the summarization pipeline
    ideal = sum_pipe(ctx, min_length=30, max_length=120,
                     do_sample=False)[0]["summary_text"]

    # Append the answer object to the final output list
    answers_json.append({
        "id": qid,
        "type": qtype,
        "exact_answer": exact,
        "ideal_answer": ideal.strip()
    })
    
# dump submission
OUTFILE.write_text(json.dumps({"questions": answers_json}, indent=2))
print("✅  Phase-A* answers written →", OUTFILE)
