# BioMed: Information Retrieval - BioMedical Information Retrieval System

---

**Group:**
- Reyes Castro, Didier Yamil (didier.reyes.castro@alumnos.upm.es)
- Rodriguez Fernández, Cristina ()

**Course:** BioMedical Informatics - 2025/26

**Institution:** Polytechnic University of Madrid (UPM)

**Date:** November 2025

---

## Goal

To develop an Information Retrieval system — specifically, a **binary text classifier** — to identify scientific articles in the PubMed database that are related to a given set of abstracts within a defined research topic. In this case, the focus is on a collection of 1,308 manuscripts containing information on the polyphenol composition of various foods.

## Setup and Installation

In [None]:
# !pip install scikit-learn pandas requests transformers torch datasets numpy

In [1]:
import requests
import time
import re

import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, EarlyStoppingCallback
from datasets import Dataset
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix, classification_report

  from .autonotebook import tqdm as notebook_tqdm


## **Task 1:** 

Retrieve from PubMed the abstracts associated with each publication in publications.xlsx

(21 minutes with API KEY)

In [3]:
# Load the publication metadata (one row per article) and display it.
dataset = pd.read_csv('publications.csv')
dataset

Unnamed: 0,id,authors,year_of_publication,title,abbreviation,journal_name,journal_volume,journal_issue,pages,created_at,updated_at
0,1216,"Aaby K., Wrolstad R.E., Ekeberg D., Skrede G.",2007,Polyphenol composition and antioxidant activit...,AABY 2007,Journal of Agricultural and Food Chemistry,55,13,5156-5166,2012-12-01 22:21:08 UTC,2015-04-14 04:25:30 UTC
1,1052,"Abd El Mohsen M.M., Kuhnle G., Rechner A.R., S...",2002,Uptake and metabolism of epicatechin and its a...,ABD EL MOHSEN 2002,Free Radic Biol Med,33,12,1693-702,2015-04-13 21:45:29 UTC,2015-04-14 04:25:30 UTC
2,356,"Abdel-Aal E.-S.M., Hucl P.",2003,Composition and stability of anthocyanins in b...,ABDEL-AAL 2003,Journal of Agricultural and Food Chemistry,51,,2174-2180,2015-04-13 21:45:25 UTC,2015-04-14 04:25:30 UTC
3,458,"Abdel-Aal E.-S. M., Young C., Rabalski I.",2006,"Anthocyanin composition in black, blue, pink, ...",ABDEL-AAL 2006,Journal of Agricultural and Food Chemistry,54,,4696-4704,2006-04-09 12:07:36 UTC,2015-04-14 04:25:31 UTC
4,332,"Abril M., Negueruela A.I., Perez C., Juan T., ...",2005,Preliminary study of resveratrol content in Ar...,Apr-05,Food Chemistry,92,4,729-736,2015-04-13 21:45:25 UTC,2015-04-13 21:45:25 UTC
...,...,...,...,...,...,...,...,...,...,...,...
1303,816,"Zielinski H., Kozlowska H., Lewczuk B.",2001,Bioactive compounds in the cereal grains befor...,ZIELINSKI 2001,Innovative Food Science and Emerging Technologies,2,,159-169,2015-04-13 21:45:28 UTC,2015-04-13 21:45:28 UTC
1304,497,"Zielinski H., Michalska A., Piskula M.K., Kozl...",2006,Antioxidants in thermally treated buckwheat gr...,ZIELINSKI 2006,Molecular Nutrition and Food Research,50,,824-832,2015-04-13 21:45:26 UTC,2015-04-14 13:51:47 UTC
1305,743,"Zimmermann R., Bauermann U., Morales F.",2006,Effects of growing site and nitrogen fertiliza...,ZIMMERMANN 2006,Journal of the Science of Food and Agriculture,86,,415-419,2015-04-13 21:45:27 UTC,2015-04-13 21:45:27 UTC
1306,203,"Zuo Y., Wang C., Zhan J.",2002,"Separation, characterization and quantitation ...",ZUO 2002,Journal of Agricultural and Food Chemistry,50,13,3789-3794,2015-04-13 21:45:24 UTC,2015-04-14 13:51:48 UTC


In [11]:
# NCBI E-utilities endpoints: ESearch resolves queries to PMIDs, EFetch
# downloads the records for known PMIDs.
BASE_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
ESEARCH_URL = f"{BASE_URL}esearch.fcgi"
FETCH_URL = f"{BASE_URL}efetch.fcgi"

# CSV file that receives the publication table extended with a PMID column.
DS_WITH_PMID = 'publications_pmid.csv'

# Step 1: Search for the PMID of the articles by title
def search_pmid(article, api_key=None, timeout=10):
    """Look up the PubMed ID (PMID) of an article from its title.

    Two ESearch queries are attempted in order: the full title, then only
    its first eight words tagged with ``[Title]``. The first ID returned by
    the first query that yields any hit is used.

    Args:
        article: Mapping or DataFrame row exposing the title under ``'title'``.
        api_key: Optional NCBI API key; left out of the request when ``None``.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        The first PMID reported by ESearch, or ``None`` when the title is
        missing, no query matches, or a request fails.
    """
    title = article['title']

    # A missing title in a pandas row is NaN (a float) and has no .split().
    if not isinstance(title, str) or not title.strip():
        print("> No PMID found for article.")
        return None

    short_title = ' '.join(title.split()[:8])  # Use first 8 words
    search_terms = (title, f"{short_title}[Title]")

    try:
        for attempt, term in enumerate(search_terms):
            if attempt:
                print(f"> Retrying with shorter title: {short_title}")

            params = {
                "db": "pubmed",
                "term": term,
                "retmode": "json",
                "field": "title",
                # requests drops parameters whose value is None.
                "api_key": api_key,
            }

            # Without a timeout a stalled connection would block the whole run.
            response = requests.get(ESEARCH_URL, params=params, timeout=timeout)
            response.raise_for_status()
            id_list = response.json().get("esearchresult", {}).get("idlist", [])

            if id_list:
                pmid = id_list[0]
                print(f"> Found PMID for article: {pmid}")
                return pmid

    # ValueError covers a response body that is not valid JSON.
    except (requests.exceptions.RequestException, ValueError) as e:
        print(f"> ERROR during request for article: {e}")
        return None

    print("> No PMID found for article.")
    return None
    
# NCBI limits clients without an API key to 3 requests per second. Requests
# over the limit fail, and search_pmid reports any failed request as None,
# so an unthrottled loop silently loses PMIDs. search_pmid may issue two
# requests per article, hence the pause sized for two.
PMID_SEARCH_DELAY_SECONDS = 0.67

ds_pmid = dataset.copy()
total_articles = len(ds_pmid)
for idx, article in ds_pmid.iterrows():
    print(f"[{idx + 1}/{total_articles}] Searching PMID for: {article['title']}")
    pmid = search_pmid(article)
    ds_pmid.at[idx, 'pmid'] = pmid
    time.sleep(PMID_SEARCH_DELAY_SECONDS)

# Persist the table so the lookup does not have to be repeated.
ds_pmid.to_csv(DS_WITH_PMID, index=False)

[1/1308] Searching PMID for: Polyphenol composition and antioxidant activity in strawberry purees  impact of achene level and storage
> Found PMID for article: 17550269
[2/1308] Searching PMID for: Uptake and metabolism of epicatechin and its access to the brain after oral ingestion
> Found PMID for article: 12488137
[3/1308] Searching PMID for: Composition and stability of anthocyanins in blue-grained wheat
> Found PMID for article: 12670152
[4/1308] Searching PMID for: Anthocyanin composition in black, blue, pink, purple, and red cereal grains
> Found PMID for article: 16787017
[5/1308] Searching PMID for: Preliminary study of resveratrol content in Aragon red and rose wines
> Retrying with shorter title: Preliminary study of resveratrol content in Aragon red
> No PMID found for article.
[6/1308] Searching PMID for: HPLC method for the quantification of procyanidins in cocoa and chocolate samples and correlation to total antioxidant capacity
> Found PMID for article: 10552788
[7/1308

In [9]:
# Count the rows whose PMID lookup produced a value.
print("Number of articles with PMID found:", ds_pmid['pmid'].notna().sum())

Number of articles with PMID found: 901


In [12]:
# Step 2: Fetch article abstract by PMID
def fetch_abstract_by_pmid(pmid, api_key=None, timeout=10):
    """Download the abstract-format text of a PubMed record via EFetch.

    Args:
        pmid: PubMed ID of the record (string or number).
        api_key: Optional NCBI API key; left out of the request when ``None``.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The response body as text, or ``None`` when the request fails.
    """
    # A PMID column containing missing values is read back from CSV as
    # floats; strip the trailing ".0" so the ID sent to NCBI stays valid.
    if isinstance(pmid, float) and pmid.is_integer():
        pmid = int(pmid)

    params = {
        "db": "pubmed",
        "id": f"{pmid}",
        "retmode": "text",
        "rettype": "abstract",
        # requests drops parameters whose value is None.
        "api_key": api_key,
    }

    try:
        # Without a timeout a stalled connection would block the whole run.
        response = requests.get(FETCH_URL, params=params, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"> ERROR fetching abstract for PMID '{pmid}': {e}")
        return None

    print("> Fetched abstract!!")
    return response.text
    
# NCBI limits clients without an API key to 3 requests per second. Requests
# over the limit fail, and fetch_abstract_by_pmid returns None on failure,
# so an unthrottled loop silently loses abstracts.
ABSTRACT_FETCH_DELAY_SECONDS = 0.34

ds_pmid_abstract = ds_pmid.copy()
total_articles = len(ds_pmid_abstract)
for idx, article in ds_pmid_abstract.iterrows():
    pmid = article['pmid']

    if pd.isnull(pmid):
        print(f"[{idx + 1}/{total_articles}] No PMID available, skipping abstract fetch.")
        ds_pmid_abstract.at[idx, 'abstract'] = None
        continue

    print(f"[{idx + 1}/{total_articles}] Fetching abstract for PMID: {pmid}")
    abstract = fetch_abstract_by_pmid(pmid)
    ds_pmid_abstract.at[idx, 'abstract'] = abstract
    time.sleep(ABSTRACT_FETCH_DELAY_SECONDS)

[1/1308] Fetching abstract for PMID: 17550269
> Fetched abstract!!
[2/1308] Fetching abstract for PMID: 12488137
> Fetched abstract!!
[3/1308] Fetching abstract for PMID: 12670152
> Fetched abstract!!
[4/1308] Fetching abstract for PMID: 16787017
> Fetched abstract!!
[5/1308] No PMID available, skipping abstract fetch.
[6/1308] Fetching abstract for PMID: 10552788
> Fetched abstract!!
[7/1308] No PMID available, skipping abstract fetch.
[8/1308] Fetching abstract for PMID: 1659780
> Fetched abstract!!
[9/1308] Fetching abstract for PMID: 39811928
> Fetched abstract!!
[10/1308] Fetching abstract for PMID: 15769171
> Fetched abstract!!
[11/1308] No PMID available, skipping abstract fetch.
[12/1308] Fetching abstract for PMID: 20735138
> Fetched abstract!!
[13/1308] No PMID available, skipping abstract fetch.
[14/1308] No PMID available, skipping abstract fetch.
[15/1308] Fetching abstract for PMID: 16159191
> Fetched abstract!!
[16/1308] No PMID available, skipping abstract fetch.
[17/13

In [18]:
# Persist the table, then report how many rows did / did not get an abstract.
ds_pmid_abstract.to_csv('publications_pmid_abstract.csv', index=False)
has_abstract = ds_pmid_abstract['abstract'].notnull()
print("Number of articles with abstract fetched:", has_abstract.sum())
print("Number of articles without abstract fetched:", (~has_abstract).sum())

Number of articles with abstract fetched: 742
Number of articles without abstract fetched: 566


In [2]:
def clean_crossref_abstract(abstract_text):
    """Strip markup tags (e.g. ``<jats:p>``) from a Crossref abstract.

    Returns the tag-free text with surrounding whitespace trimmed, or
    ``None`` when ``abstract_text`` is empty or ``None``.
    """
    if not abstract_text:
        return None

    without_tags = re.sub(r'<[^>]+>', '', abstract_text)
    return without_tags.strip()

def search_crossref(article):
    """Query Crossref by title and return the cleaned abstract of the top hit.

    Only the first search result is inspected. Returns ``None`` when the
    response status is not 200, when there are no results, when the top
    result carries no abstract, or when the request itself fails.
    """
    try:
        query = {
            "query.title": article['title'],
            "rows": 1,
            "mailto": "didier.reyes.castro@alumnos.upm.es"
        }
        response = requests.get("https://api.crossref.org/works", params=query, timeout=10)

        if response.status_code != 200:
            return None

        hits = response.json()['message']['items']
        if not hits:
            return None

        top_hit = hits[0]
        if 'abstract' not in top_hit:
            return None

        return clean_crossref_abstract(top_hit['abstract'])
    except requests.exceptions.RequestException as e:
        print(f"- CROSSREF ERROR: {e}")
        return None

In [3]:
# Second pass: for every article still lacking an abstract, try Crossref.
ds_crossref = pd.read_csv('publications_pmid_abstract.csv')

counters = dict(
    total_missing=ds_crossref['abstract'].isnull().sum(),
    crossref_found=0,
    failed=0,
)

n_rows = len(ds_crossref)
for i, row in ds_crossref.iterrows():

    # Rows that already have an abstract are left alone.
    if pd.notnull(row['abstract']):
        continue

    print(f"[{i + 1}/{n_rows}] Searching CROSSREF for abstract of article: {row['title']}")
    found_abstract = search_crossref(row)
    time.sleep(1)  # Polite 1-second delay between Crossref requests

    if not found_abstract:
        print("> Nope :(")
        counters['failed'] += 1
        continue

    print("> Found abstract!!")
    ds_crossref.at[i, 'abstract'] = found_abstract
    counters['crossref_found'] += 1

ds_crossref.to_csv('publications_abstract_pubmed_crossref.csv', index=False)

[5/1308] Searching CROSSREF for abstract of article: Preliminary study of resveratrol content in Aragon red and rose wines
> Nope :(
[7/1308] Searching CROSSREF for abstract of article: Enhancement of total phenolics and antioxidant properties of some tropical green leafy vegetables by steam cooking
> Nope :(
[11/1308] Searching CROSSREF for abstract of article: Correlation of tocopherol, tocotrienol, gamma-oryzanol and total polyphenol content in rice bran with different antioxidant capacity assays
> Nope :(
[13/1308] Searching CROSSREF for abstract of article: High-Performance Liquid-Chromatography of Selected Phenolic-Compounds in Olive Oils
> Found abstract!!
[14/1308] Searching CROSSREF for abstract of article: Functional attributes of soybean seeds and products, with reference to isoflavone content and antioxidant activity
> Nope :(
[16/1308] Searching CROSSREF for abstract of article: Compositional and functional characteristics of dates, syrups, and their by-products
> Nope :(


In [4]:
# Summarise the outcome of the Crossref fallback pass.
for caption, counter_key in (
    ("Total missing abstracts initially:", 'total_missing'),
    ("Abstracts found via Crossref:", 'crossref_found'),
    ("Failed attempts:", 'failed'),
):
    print(caption, counters[counter_key])

Total missing abstracts initially: 566
Abstracts found via Crossref: 119
Failed attempts: 447


## **Task 2:**

Use the EUtilities tool to search for articles whose content is not relevant to this task. The size of this dataset should match the number of relevant documents.

In [None]:
def get_articles_pmids_for_title(title, count, api_key=None, timeout=10):
    """Return up to ``count`` PMIDs of PubMed articles whose title matches ``title``.

    Args:
        title: Term searched in the PubMed ``[Title]`` field.
        count: Maximum number of IDs requested (sent as ``retmax``).
            NOTE(review): ESearch caps ``retmax`` server-side; confirm the
            limit before requesting very large counts.
        api_key: Optional NCBI API key; left out of the request when ``None``.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A list of PMIDs; empty when nothing matches, the response has an
        unexpected shape, or the request fails.
    """
    params = {
        "db": "pubmed",
        "term": f"{title}[Title]",
        "retmode": "json",
        "retmax": count,
        # requests drops parameters whose value is None.
        "api_key": api_key
    }

    try:
        response = requests.get(ESEARCH_URL, params=params, timeout=timeout)
        response.raise_for_status()
        data = response.json()
    # ValueError covers a response body that is not valid JSON.
    except (requests.exceptions.RequestException, ValueError) as e:
        print(f"Error during request for irrelevant articles: {e}")
        return []

    # Read the payload defensively: the key may be absent from the response,
    # and indexing it directly would raise an uncaught KeyError.
    result = data.get('esearchresult', {})
    hit_count = result.get('count', '0')

    if hit_count == '0':
        print(f"Found {hit_count} irrelevant articles.")
        return []

    return result.get('idlist', [])


In [None]:
import os

IRRELEVANT_PUBLICATIONS = 'irrelevant_publications.csv'

# SECURITY: credentials must not live in the notebook. The key is read from
# the NCBI_API_KEY environment variable; any key that was previously embedded
# in this file should be treated as leaked and regenerated.
NCBI_API_KEY = os.environ.get("NCBI_API_KEY")

# NCBI allows 10 requests/second with an API key and 3 requests/second without.
REQUEST_DELAY_SECONDS = 0.1 if NCBI_API_KEY else 0.34

irrelevant_pmids_list = get_articles_pmids_for_title("cancer", len(relevant_df), api_key=NCBI_API_KEY)

irrelevant_abstracts = []
for pmid in irrelevant_pmids_list:

    # NOTE(review): this call passes api_key by keyword; confirm that
    # fetch_abstract_by_pmid's signature accepts it.
    article_info = {
        'pmid': pmid,
        'abstract': fetch_abstract_by_pmid(pmid, api_key=NCBI_API_KEY)
    }
    irrelevant_abstracts.append(article_info)

    print(f"Sleeping for {REQUEST_DELAY_SECONDS}...")
    time.sleep(REQUEST_DELAY_SECONDS)  # Stay within the NCBI rate limit

# Save irrelevant abstracts to a new dataset
irrelevant_df = pd.DataFrame(irrelevant_abstracts)
irrelevant_df.to_csv(IRRELEVANT_PUBLICATIONS, index=False)

In [None]:
# Display the collected irrelevant articles for a quick visual check.
irrelevant_df

## **Task 4:**

Implement the chosen retrieval system using the programming language of their choice. If the information retrieval system is based on machine learning techniques, the student must split the existing datasets (relevant and non-relevant documents) into three distinct groups (training, validation, and testing) to carry out the model training.

**CHOSEN RETRIEVAL SYSTEM:** BioBERT-based Binary Text Classifier

In [None]:
# Adding target variable 'relevance' 
relevant_df['relevance'] = 1
irrelevant_df['relevance'] = 0

# Combining relevant and irrelevant datasets and maintaining only abstract and relevance columns
features = ['abstract', 'relevance']
combined_df = pd.concat([relevant_df[features], irrelevant_df[features]], ignore_index=True)

# Remove any rows where the abstract is missing (e.g., API fetch failed)
combined_df.dropna(subset=['abstract'], inplace=True)
combined_df.reset_index(drop=True, inplace=True)

# Saving
combined_df.to_csv('combined_publications.csv', index=False)

print("Class distribution:")
print(combined_df['relevance'].value_counts())

combined_df

The fine-tuning procedure follows the Hugging Face guide on text classification with BERT models: https://huggingface.co/docs/transformers/en/tasks/sequence_classification

- Train-Validation-Test split: 80%-10%-10%

In [None]:
RANDOM_STATE = 42

# Step 1: set aside 20% of the data as a hold-out pool, stratified by label.
train_df, holdout_df = train_test_split(
    combined_df,
    test_size=0.2,
    stratify=combined_df["relevance"],
    random_state=RANDOM_STATE,
)

# Step 2: cut the hold-out pool in half -> validation and test splits.
val_df, test_df = train_test_split(
    holdout_df,
    test_size=0.5,
    stratify=holdout_df["relevance"],
    random_state=RANDOM_STATE,
)

for split_name, split_df in (("Training", train_df), ("Validation", val_df), ("Test", test_df)):
    print(f"{split_name} size: {len(split_df)}")

- Convert Pandas DataFrame to HuggingFace Dataset

In [None]:
# The split DataFrames keep the shuffled row labels of combined_df as their
# index. With the default setting, from_pandas stores a non-range index as an
# extra '__index_level_0__' column; preserve_index=False leaves it out.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
val_dataset = Dataset.from_pandas(val_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)

- Tokenization of abstracts using BioBERT tokenizer

In [None]:
BERT_MODEL_NAME = "dmis-lab/biobert-v1.1"
tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL_NAME)


def tokenize(examples):
    """Encode a batch of abstracts, padded/truncated to exactly 512 tokens."""
    return tokenizer(
        examples["abstract"],
        truncation=True,
        padding="max_length",
        max_length=512,  # Maximum length for BERT models
    )


def _tokenize_and_label(split):
    """Tokenize one split and rename its target column to 'labels'.

    'labels' is the column name the HuggingFace Trainer expects.
    """
    encoded = split.map(tokenize, batched=True)
    return encoded.rename_column("relevance", "labels")


train_dataset = _tokenize_and_label(train_dataset)
val_dataset = _tokenize_and_label(val_dataset)
test_dataset = _tokenize_and_label(test_dataset)

- Loading BioBERT model for binary text classification (relevant vs irrelevant)

In [None]:
# Class index <-> human-readable name; the reverse map is derived from the
# forward one so the two cannot drift apart.
id2label = {0: "irrelevant", 1: "relevant"}
label2id = {name: idx for idx, name in id2label.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    BERT_MODEL_NAME,
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
)

- Defining evaluation metrics

In [None]:
def compute_metrics(eval_pred):
    """Compute accuracy, precision, recall and F1 from logits and labels.

    ``eval_pred`` unpacks into ``(logits, labels)``. The predicted class is
    the arg-max over the last logits axis. Precision, recall and F1 use
    ``average="binary"`` and are reported as 0 when undefined
    (``zero_division=0``).
    """
    raw_scores, gold = eval_pred
    predicted = np.argmax(raw_scores, axis=-1)

    prf_scores = precision_recall_fscore_support(
        gold,
        predicted,
        average="binary",
        zero_division=0,
    )

    metrics = {"accuracy": accuracy_score(gold, predicted)}
    # The fourth element (support) is not reported.
    metrics.update(zip(("precision", "recall", "f1"), prf_scores[:3]))
    return metrics

- Setting the training arguments

In [None]:
import inspect

# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`
# and later dropped the old name, which then raises a TypeError. Pick whichever
# keyword the installed version accepts so the cell runs on both.
_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./biobert_pubmed_classifier",

    # Training hyperparameters
    num_train_epochs=3,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,

    # Optimiser settings
    weight_decay=0.01,

    # Evaluation settings: evaluate and checkpoint once per epoch
    **{_eval_strategy_key: "epoch"},
    save_strategy="epoch",
    logging_steps=100,

    # Model selection: reload the checkpoint with the best validation F1
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,

    # Performance: mixed precision only when a CUDA device is available
    fp16=torch.cuda.is_available(),
    dataloader_num_workers=4,

    seed=RANDOM_STATE,
    push_to_hub=False,
    report_to="none"
)

- Actual training using Trainer API

In [None]:
import inspect

# Newer transformers releases deprecate Trainer's `tokenizer` argument in
# favour of `processing_class`. Use the new keyword when the installed version
# has it and fall back to the old one otherwise.
_tokenizer_key = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    # Stop when the monitored metric fails to improve for 2 evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
    **{_tokenizer_key: tokenizer},
)

trainer.train()

- Evaluating on the test set

In [None]:
# Run inference on the held-out test split.
predictions_output = trainer.predict(test_dataset)
true_labels = predictions_output.label_ids
predictions = np.argmax(predictions_output.predictions, axis=-1)

# Headline metrics, computed with the same function used during training.
test_metrics = compute_metrics((predictions_output.predictions, true_labels))

print("\nTest Set Results:")
for line_template, metric_key in (
    ("Accuracy:  {:.4f}", "accuracy"),
    ("Precision: {:.4f}", "precision"),
    ("Recall:    {:.4f}", "recall"),
    ("F1-Score:  {:.4f}", "f1"),
):
    print(line_template.format(test_metrics[metric_key]))

# Per-class breakdown.
print("\nClassification Report:")
report = classification_report(
    true_labels,
    predictions,
    target_names=['Irrelevant', 'Relevant'],
    digits=4,
)
print(report)

# Confusion matrix: rows are true classes, columns are predicted classes.
print("\nConfusion Matrix:")
cm = confusion_matrix(true_labels, predictions)
print(cm)
print(f"\nTrue Negatives:  {cm[0, 0]} (correctly identified irrelevant)")
print(f"False Positives: {cm[0, 1]} (incorrectly marked relevant)")
print(f"False Negatives: {cm[1, 0]} (missed relevant papers)")
print(f"True Positives:  {cm[1, 1]} (correctly identified relevant)")

- Saving the trained model

In [None]:
# Write the fine-tuned model and its tokenizer into the same directory.
model_save_path = './final_biobert_classifier'
for persist in (trainer.save_model, tokenizer.save_pretrained):
    persist(model_save_path)