# BioMed: Information Retrieval - BioMedical Information Retrieval System

---

**Group:**
- Reyes Castro, Didier Yamil (didier.reyes.castro@alumnos.upm.es)
- Rodriguez Fernández, Cristina ()

**Course:** BioMedical Informatics - 2025/26

**Institution:** Polytechnic University of Madrid (UPM)

**Date:** November 2025

---

## Goal

To develop an Information Retrieval system — specifically, a **binary text classifier** — to identify scientific articles in the PubMed database that are related to a given set of abstracts within a defined research topic. In this case, the focus is on a collection of 1,308 manuscripts containing information on the polyphenol composition of various foods.

## Setup and Installation

In [None]:
# !pip install scikit-learn pandas requests transformers torch datasets numpy

In [1]:
import requests
import time
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, EarlyStoppingCallback
from datasets import Dataset
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix, classification_report

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# NCBI E-utilities endpoints: ESearch resolves queries to PMIDs, EFetch
# downloads the records themselves.
BASE_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
ESEARCH_URL = f"{BASE_URL}esearch.fcgi"
FETCH_URL = f"{BASE_URL}efetch.fcgi"

# Elsevier Scopus search endpoint, used as a fallback source of abstracts.
ELSEVIER_SEARCH_URL = "https://api.elsevier.com/content/search/scopus"

# CSV checkpoints written and re-read by the cells below.
DS_WITH_PMID = 'publications_pmid.csv'
PMID_ABSTRACTS = 'publications_pmid_abstract.csv'
ELSEVIER_ABSTRACTS = 'publications_abstract_pubmed_elsevier.csv'
IRRELEVANT_PUBLICATIONS = 'irrelevant_publications.csv'


## **Task 1:** 

Retrieve from PubMed the abstracts associated with each publication in publications.xlsx

(Takes about 21 minutes with an API key.)

In [None]:
# Load the publication list; later cells read its 'title' column.
# NOTE(review): the task text above names publications.xlsx, while this cell
# reads a CSV file — confirm which one is the intended input.
dataset = pd.read_csv('publications.csv')
dataset  # notebook display of the loaded table

In [None]:
# Step 1: Search for the PMID of the articles by title
def search_pmid(article):
    """Look up the PubMed ID (PMID) of an article by its title.

    Args:
        article: Mapping-like row (e.g. a DataFrame row) exposing the
            article title under the ``'title'`` key.

    Returns:
        The first PMID returned by ESearch for a title-field query, or
        ``None`` when the title is missing, nothing matches, or the request
        fails.
    """
    title = article['title']

    # A missing title arrives as None/NaN; formatting it into the query would
    # search PubMed for the literal text "None"/"nan" and return a bogus PMID.
    if not isinstance(title, str) or not title.strip():
        print("> Missing title, skipping PMID search.")
        return None

    params = {
        "db": "pubmed",
        "term": title,
        "retmode": "json",
        "field": "title"
    }

    try:
        # The timeout keeps one stalled connection from blocking the whole
        # run (same 10 s value that search_scopus uses).
        response = requests.get(ESEARCH_URL, params=params, timeout=10)
        response.raise_for_status()
        # .get() chain: error payloads may lack 'esearchresult' / 'idlist'.
        id_list = response.json().get("esearchresult", {}).get("idlist", [])
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a non-JSON body on older `requests` versions.
        print(f"> ERROR during request for article: {e}")
        return None

    if id_list:
        pmid = id_list[0]
        print(f"> Found PMID for article: {pmid}")
        return pmid

    print("> No PMID found for article.")
    return None
    
# Resolve a PMID for every publication and checkpoint the result to disk.
ds_pmid = dataset.copy()
# Create the column up front so every row has a value, including rows whose
# lookup fails.
ds_pmid['pmid'] = None

total_articles = len(ds_pmid)
# enumerate() drives the progress counter, so it stays correct even when the
# DataFrame index is not 0..n-1.
for position, (idx, article) in enumerate(ds_pmid.iterrows(), start=1):
    print(f"[{position}/{total_articles}] Searching PMID for: {article['title']}")
    ds_pmid.at[idx, 'pmid'] = search_pmid(article)
    # NCBI E-utilities allows roughly 3 requests/second without an API key;
    # going faster yields HTTP 429 errors that would be recorded as "no PMID".
    time.sleep(0.34)

ds_pmid.to_csv(DS_WITH_PMID, index=False)

In [None]:
# Count the rows whose 'pmid' cell is populated and report the total.
pmid_found_count = ds_pmid['pmid'].notnull().sum()
print("Number of articles with PMID found:", pmid_found_count)

In [3]:
# Step 2: Fetch article abstract by PMID
def fetch_abstract_by_pmid(pmid):
    """Download the plain-text abstract report of a PubMed record.

    Args:
        pmid: PubMed identifier as a string or number. ``None``, NaN and
            blank values are treated as "no PMID".

    Returns:
        The EFetch response body as text, or ``None`` when no PMID was given
        or the request fails.

    NOTE(review): with ``rettype=abstract`` / ``retmode=text`` the body is the
    full text report. The rows stored by this notebook start with the journal,
    date and DOI line, so callers receive more than the abstract body.
    """
    # NaN is the only value that compares unequal to itself.
    if pmid is None or pmid != pmid:
        print("> No PMID given, skipping abstract fetch.")
        return None

    # A PMID column read back from CSV with gaps becomes float (12345.0);
    # EFetch needs the bare integer form.
    if isinstance(pmid, float) and pmid.is_integer():
        pmid = int(pmid)

    pmid = str(pmid).strip()
    if not pmid:
        print("> No PMID given, skipping abstract fetch.")
        return None

    params = {
        "db": "pubmed",
        "id": pmid,
        "retmode": "text",
        "rettype": "abstract",
    }

    try:
        # The timeout keeps one stalled connection from blocking the whole
        # run (same 10 s value that search_scopus uses).
        response = requests.get(FETCH_URL, params=params, timeout=10)
        response.raise_for_status()
        print("> Fetched abstract!!")
        return response.text
    except requests.exceptions.RequestException as e:
        print(f"> ERROR fetching abstract for PMID '{pmid}': {e}")
        return None
    
# Fetch the abstract of every article whose PMID was found. This loop must run
# (it was previously commented out) because the following cell saves and
# summarises `ds_pmid_abstract`, which is defined nowhere else.
ds_pmid_abstract = ds_pmid.copy()
# Create the column up front so rows without a PMID hold an explicit None.
ds_pmid_abstract['abstract'] = None
for idx, article in ds_pmid_abstract.iterrows():
    pmid = article['pmid']
    if pd.notnull(pmid):
        print(f"[{idx + 1}/{len(ds_pmid_abstract)}] Fetching abstract for PMID: {pmid}")
        abstract = fetch_abstract_by_pmid(pmid)
        ds_pmid_abstract.at[idx, 'abstract'] = abstract
        # Stay under the NCBI E-utilities rate limit (about 3 requests/second
        # without an API key).
        time.sleep(0.34)
    else:
        print(f"[{idx + 1}/{len(ds_pmid_abstract)}] No PMID available, skipping abstract fetch.")
        ds_pmid_abstract.at[idx, 'abstract'] = None

In [None]:
# Checkpoint the fetched abstracts, then report how many rows have one.
ds_pmid_abstract.to_csv(PMID_ABSTRACTS, index=False)

has_abstract = ds_pmid_abstract['abstract'].notnull()
print("Number of articles with abstract fetched:", has_abstract.sum())
print("Number of articles without abstract fetched:", (~has_abstract).sum())

In [None]:
def search_scopus(article_row, api_key):
    """Find the Scopus record URL of an article by its title.

    Args:
        article_row: Mapping-like row (e.g. a DataFrame row) that may expose
            the article title under the ``'title'`` key.
        api_key: Elsevier API key, sent as the ``apiKey`` query parameter.

    Returns:
        The ``prism:url`` of the first search hit, or ``None`` when the title
        is missing, nothing usable is returned, or the request fails.
    """
    title = article_row.get('title', '')
    # A missing title in a DataFrame row is NaN (a float), which has no
    # .replace(); treat anything that is not a real string as "no title".
    if not isinstance(title, str):
        return None

    title = title.replace('"', '')  # Remove quotes for query
    if not title.strip():
        return None

    headers = {"Accept": "application/json"}
    params = {
        "query": f"TITLE-ABS-KEY(\"{title}\")",
        "apiKey": api_key
    }

    try:
        response = requests.get(ELSEVIER_SEARCH_URL,
                                headers=headers, params=params, timeout=10)
        response.raise_for_status()
        data = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a non-JSON body on older `requests` versions.
        print(f"> Scopus ERROR: {e}")
        return None

    # 'entry' may be absent or empty; the first entry may lack 'prism:url'.
    entries = data.get('search-results', {}).get('entry') or []
    if not entries:
        return None
    return entries[0].get('prism:url')

In [None]:
import os

# SECURITY: the key below was committed in plain text. Prefer exporting
# ELSEVIER_API_KEY in the environment; the literal fallback only keeps the
# notebook runnable and should be rotated and removed.
ELSEVIER_API_KEY = os.environ.get("ELSEVIER_API_KEY",
                                  "3f5ff36eb8d1d409e3befea2ed2aa2cc")

# Fill in abstracts that PubMed did not provide by querying Elsevier/Scopus.
ds_elsevier = pd.read_csv(PMID_ABSTRACTS)

counters = {
    'total_missing': ds_elsevier['abstract'].isnull().sum(),
    'elsevier_found': 0,
    'failed': 0
}

for i, row in ds_elsevier.iterrows():

    # Only rows still lacking an abstract need the fallback.
    if not pd.isnull(row['abstract']):
        continue

    print(f"[{i + 1}/{len(ds_elsevier)}] Searching ELSEVIER for abstract of article: {row['title']}")
    abstract_url = search_scopus(row, ELSEVIER_API_KEY)

    abstract_text = None
    if abstract_url:
        try:
            response = requests.get(abstract_url,
                                    headers={"Accept": "application/json",
                                             "X-ELS-APIKey": ELSEVIER_API_KEY},
                                    timeout=10)
            response.raise_for_status()
            data = response.json()
            abstract_text = data['abstracts-retrieval-response']['coredata']['dc:description']
        except (requests.exceptions.RequestException, ValueError) as e:
            # A network/HTTP failure must not abort the loop and lose the
            # abstracts collected so far.
            print(f"> ELSEVIER ERROR: {e}")
        except (KeyError, TypeError):
            print("> Abstract not found in ELSEVIER response.")
    else:
        print("> Nope :(")

    # Every attempted row ends up in exactly one counter, so
    # found + failed == total_missing.
    if abstract_text:
        ds_elsevier.at[i, 'abstract'] = abstract_text
        print("> Found abstract via ELSEVIER!")
        counters['elsevier_found'] += 1
    else:
        counters['failed'] += 1

    time.sleep(1) # Polite 1-second delay

ds_elsevier.to_csv(ELSEVIER_ABSTRACTS, index=False)
print("Summary of ELSEVIER abstract search:")
print(f"Total missing abstracts at start: {counters['total_missing']}")
print(f"Abstracts found via ELSEVIER: {counters['elsevier_found']}")
print(f"Failed attempts: {counters['failed']}")

## **Task 2:**

Use the E-utilities tool to search for articles whose content is not relevant to this task. The size of this dataset should be the same as that of the relevant documents.

In [4]:
def get_articles_pmids_for_title(title, count):
    """Return up to ``count`` PMIDs whose title or abstract matches ``title``.

    Args:
        title: Search term; it is queried with the ``[tiab]`` field tag.
        count: Maximum number of PMIDs to request (sent as ``retmax``).

    Returns:
        A list of PMID strings; empty when ``count`` is not positive, nothing
        matches, or the request fails.
    """
    # Nothing requested: skip the network round trip.
    if count <= 0:
        return []

    params = {
        "db": "pubmed",
        "term": f"{title}[tiab]",
        "retmode": "json",
        "retmax": count
    }

    try:
        # The timeout keeps one stalled connection from blocking the run.
        response = requests.get(ESEARCH_URL, params=params, timeout=10)
        response.raise_for_status()
        # .get(): an error payload may lack 'esearchresult' altogether, which
        # previously raised KeyError in the "nothing found" branch.
        result = response.json().get('esearchresult', {})
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a non-JSON body on older `requests` versions.
        print(f"Error during request for irrelevant articles: {e}")
        return []

    total_found = result.get('count', '0')
    if total_found != '0':
        print(f"Found {total_found} articles.")
        return result.get('idlist', [])

    print(f"Found {total_found} irrelevant articles.")
    return []


In [6]:
# Build the "irrelevant" class: as many PubMed abstracts about an unrelated
# topic as there are relevant abstracts.
final_df = pd.read_csv(ELSEVIER_ABSTRACTS)
num_with_abstracts = int(final_df['abstract'].notnull().sum())

# PMIDs already in the relevant set must not be re-drawn as "irrelevant",
# otherwise the same abstract would carry both labels. The column is coerced
# to numeric first because it may have been read back from CSV as float.
relevant_pmids = set()
if 'pmid' in final_df.columns:
    relevant_pmids = set(
        pd.to_numeric(final_df['pmid'], errors='coerce')
        .dropna()
        .astype('int64')
        .astype(str)
    )

# Over-request by the size of the exclusion set so that filtering cannot leave
# fewer than the target number when enough results exist.
candidate_pmids = get_articles_pmids_for_title(
    "cancer", num_with_abstracts + len(relevant_pmids)
)
irrelevant_pmids_list = [
    pmid for pmid in candidate_pmids if pmid not in relevant_pmids
][:num_with_abstracts]

irrelevant_abstracts = []
for i, pmid in enumerate(irrelevant_pmids_list):

    print(f"[{i + 1}/{len(irrelevant_pmids_list)}] Fetching abstract for PMID: {pmid}")
    irrelevant_abstracts.append({
        'pmid': pmid,
        'abstract': fetch_abstract_by_pmid(pmid),
    })
    # Stay under the NCBI E-utilities rate limit (about 3 requests/second
    # without an API key).
    time.sleep(0.34)

# Save irrelevant abstracts to a new dataset. Explicit columns keep the schema
# stable even when no article was retrieved.
irrelevant_df = pd.DataFrame(irrelevant_abstracts, columns=['pmid', 'abstract'])
irrelevant_df.to_csv(IRRELEVANT_PUBLICATIONS, index=False)

Found 2529778 articles.
[1/1203] Fetching abstract for PMID: 41204822
> Fetched abstract!!
[2/1203] Fetching abstract for PMID: 41204804
> Fetched abstract!!
[3/1203] Fetching abstract for PMID: 41204791
> Fetched abstract!!
[4/1203] Fetching abstract for PMID: 41204774
> Fetched abstract!!
[5/1203] Fetching abstract for PMID: 41204772
> Fetched abstract!!
[6/1203] Fetching abstract for PMID: 41204766
> Fetched abstract!!
[7/1203] Fetching abstract for PMID: 41204749
> Fetched abstract!!
[8/1203] Fetching abstract for PMID: 41204725
> Fetched abstract!!
[9/1203] Fetching abstract for PMID: 41204722
> Fetched abstract!!
[10/1203] Fetching abstract for PMID: 41204717
> Fetched abstract!!
[11/1203] Fetching abstract for PMID: 41204712
> Fetched abstract!!
[12/1203] Fetching abstract for PMID: 41204710
> Fetched abstract!!
[13/1203] Fetching abstract for PMID: 41204697
> Fetched abstract!!
[14/1203] Fetching abstract for PMID: 41204684
> Fetched abstract!!
[15/1203] Fetching abstract for P

In [7]:
# Notebook display of the irrelevant-abstracts DataFrame.
irrelevant_df

Unnamed: 0,pmid,abstract
0,41204822,1. Cell Cycle. 2025 Nov 8:1-23. doi: 10.1080/1...
1,41204804,1. Int J Environ Health Res. 2025 Nov 8:1-12. ...
2,41204791,1. Cancer Med. 2025 Nov;14(21):e71372. doi: 10...
3,41204774,1. IUBMB Life. 2025 Nov;77(11):e70075. doi: 10...
4,41204772,1. Am J Med Genet A. 2025 Nov 8:e64305. doi: 1...
...,...,...
1198,41193040,1. J Am Coll Radiol. 2025 Nov;22(11S):S492-S50...
1199,41193032,1. Am J Clin Nutr. 2025 Nov;122(5):1195-1203. ...
1200,41193016,1. West J Emerg Med. 2025 Sep 25;26(5):1211-12...
1201,41192978,1. J Gene Med. 2025 Nov;27(11):e70047. doi: 10...


## **Task 4:**

Implement the chosen retrieval system using the programming language of their choice. If the information retrieval system is based on machine learning techniques, the student must split the existing datasets (relevant and non-relevant documents) into three distinct groups (training, validation, and testing) to carry out the model training.

**CHOSEN RETRIEVAL SYSTEM:** BioBERT-based Binary Text Classifier

In [None]:
# Adding target variable 'relevance' 
relevant_df['relevance'] = 1
irrelevant_df['relevance'] = 0

# Combining relevant and irrelevant datasets and maintaining only abstract and relevance columns
features = ['abstract', 'relevance']
combined_df = pd.concat([relevant_df[features], irrelevant_df[features]], ignore_index=True)

# Remove any rows where the abstract is missing (e.g., API fetch failed)
combined_df.dropna(subset=['abstract'], inplace=True)
combined_df.reset_index(drop=True, inplace=True)

# Saving
combined_df.to_csv('combined_publications.csv', index=False)

print("Class distribution:")
print(combined_df['relevance'].value_counts())

combined_df

This section follows the Hugging Face guide on fine-tuning BERT for text classification: https://huggingface.co/docs/transformers/en/tasks/sequence_classification

- Train-Test-Validation Split: 80%-10%-10%

In [None]:
RANDOM_STATE = 42

# First cut: 80% for training, 20% held out (stratified on the label).
train_df, holdout_df = train_test_split(
    combined_df,
    test_size=0.2,
    stratify=combined_df["relevance"],
    random_state=RANDOM_STATE,
)

# Second cut: halve the held-out part into validation and test (10% each).
val_df, test_df = train_test_split(
    holdout_df,
    test_size=0.5,
    stratify=holdout_df["relevance"],
    random_state=RANDOM_STATE,
)

for split_name, split_df in (("Training", train_df),
                             ("Validation", val_df),
                             ("Test", test_df)):
    print(f"{split_name} size: {len(split_df)}")

- Convert Pandas DataFrame to HuggingFace Dataset

In [None]:
# After train_test_split the DataFrames keep their shuffled original index;
# preserve_index=False stops from_pandas from storing it as an extra
# '__index_level_0__' column in each dataset.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
val_dataset = Dataset.from_pandas(val_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)

- Tokenization of abstracts using BioBERT tokenizer

In [None]:
BERT_MODEL_NAME = "dmis-lab/biobert-v1.1"
tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL_NAME)


def tokenize(examples):
    """Tokenize a batch of abstracts, padded/truncated to 512 tokens."""
    encoding_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 512,  # Maximum length for BERT models
    }
    return tokenizer(examples["abstract"], **encoding_options)


def _tokenize_and_label(split):
    """Tokenize one split and rename its target column to 'labels'."""
    tokenized = split.map(tokenize, batched=True)
    # 'labels' is the column name expected by the HuggingFace Trainer.
    return tokenized.rename_column("relevance", "labels")


train_dataset = _tokenize_and_label(train_dataset)
val_dataset = _tokenize_and_label(val_dataset)
test_dataset = _tokenize_and_label(test_dataset)

- Loading BioBERT model for binary text classification (relevant vs irrelevant)

In [None]:
# Class-id <-> class-name mappings stored in the model configuration.
id2label = {0: "irrelevant", 1: "relevant"}
label2id = {name: idx for idx, name in id2label.items()}

# BioBERT encoder with a two-class sequence-classification head.
model = AutoModelForSequenceClassification.from_pretrained(
    BERT_MODEL_NAME,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

- Defining evaluation metrics

In [None]:
def compute_metrics(eval_pred):
    """Compute accuracy, precision, recall and F1 from logits and labels.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class is the
            argmax of the logits over the last axis.

    Returns:
        Dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``
        (the last three computed with ``average="binary"``).
    """
    logits, labels = eval_pred
    predicted_classes = np.argmax(logits, axis=-1)

    scores = {"accuracy": accuracy_score(labels, predicted_classes)}

    prf_support = precision_recall_fscore_support(
        labels,
        predicted_classes,
        average="binary",
        zero_division=0,
    )
    # zip() stops after the three names, discarding the trailing support value.
    scores.update(zip(("precision", "recall", "f1"), prf_support))
    return scores

- Setting the training arguments

In [None]:
import inspect

# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers
# releases and the old name is rejected there; pick whichever keyword the
# installed version accepts so this cell runs on both.
_eval_strategy_kwarg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./biobert_pubmed_classifier",

    # Training hyperparameters
    num_train_epochs=3,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,

    # Optimiser settings
    weight_decay=0.01,

    # Evaluation settings (evaluate and checkpoint once per epoch; the two
    # strategies must match for load_best_model_at_end to work)
    **{_eval_strategy_kwarg: "epoch"},
    save_strategy="epoch",
    logging_steps=100,

    # Model selection
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,

    # Performance
    fp16=torch.cuda.is_available(),
    dataloader_num_workers=4,

    seed=RANDOM_STATE,
    push_to_hub=False,
    report_to="none"
)

- Actual training using Trainer API

In [None]:
import inspect

# Newer transformers releases take the tokenizer through `processing_class`
# and deprecate the `tokenizer` keyword; use whichever name the installed
# Trainer accepts.
_tokenizer_kwarg = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    # Stop once validation F1 has not improved for 2 consecutive evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
    **{_tokenizer_kwarg: tokenizer},
)

trainer.train()

- Evaluating on the test set

In [None]:
# Run the fine-tuned model on the held-out test split.
predictions_output = trainer.predict(test_dataset)
true_labels = predictions_output.label_ids
predictions = np.argmax(predictions_output.predictions, axis=-1)

# Calculate all metrics
test_metrics = compute_metrics((predictions_output.predictions, true_labels))

print("\nTest Set Results:")
for prefix, metric_key in (("Accuracy:  ", "accuracy"),
                           ("Precision: ", "precision"),
                           ("Recall:    ", "recall"),
                           ("F1-Score:  ", "f1")):
    print(f"{prefix}{test_metrics[metric_key]:.4f}")

# Detailed classification report
print("\nClassification Report:")
report = classification_report(
    true_labels,
    predictions,
    target_names=['Irrelevant', 'Relevant'],
    digits=4,
)
print(report)

# Confusion matrix (rows = true class, columns = predicted class)
print("\nConfusion Matrix:")
cm = confusion_matrix(true_labels, predictions)
print(cm)
for prefix, (row, col), note in (
    ("\nTrue Negatives:  ", (0, 0), "correctly identified irrelevant"),
    ("False Positives: ", (0, 1), "incorrectly marked relevant"),
    ("False Negatives: ", (1, 0), "missed relevant papers"),
    ("True Positives:  ", (1, 1), "correctly identified relevant"),
):
    print(f"{prefix}{cm[row][col]} ({note})")

- Saving the trained model

In [None]:
# Persist the fine-tuned model and its tokenizer side by side in one directory.
model_save_path = './final_biobert_classifier'
for save_to in (trainer.save_model, tokenizer.save_pretrained):
    save_to(model_save_path)