# BioMed: Information Retrieval - BioMedical Information Retrieval System

---

**Group:**
- Reyes Castro, Didier Yamil (didier.reyes.castro@alumnos.upm.es)
- Rodriguez Fernández, Cristina ()

**Course:** BioMedical Informatics - 2025/26

**Institution:** Polytechnic University of Madrid (UPM)

**Date:** November 2025

---

## Goal

To develop an Information Retrieval system — specifically, a **binary text classifier** — to identify scientific articles in the PubMed database that are related to a given set of abstracts within a defined research topic. In this case, the focus is on a collection of 1,308 manuscripts containing information on the polyphenol composition of various foods.

## Setup and Installation

In [None]:
# !pip install pandas requests transformers torch datasets accelerate

In [None]:
import requests
import time

import pandas as pd
from transformers import AutoTokenizer, BertForSequenceClassification
from datasets import Dataset

## **Task 1:** 

Retrieve from PubMed the abstracts associated with each publication in publications.xlsx

In [None]:
# Load the list of publications and preview its first rows
publications_path = 'publications.csv'
dataset = pd.read_csv(publications_path)
dataset.head()

In [None]:
# NCBI E-utilities endpoints: ESearch (query -> PMIDs) and EFetch (PMID -> record)
BASE_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
ESEARCH_URL = f"{BASE_URL}esearch.fcgi"
FETCH_URL = f"{BASE_URL}efetch.fcgi"

# Step 1: Search for the PMID of the article by title
def search_pmid_by_title(title, api_key=None, timeout=30):
    """Return the first PMID that PubMed ESearch reports for ``title``.

    Args:
        title: Article title; it is searched in the ``[Title]`` field.
        api_key: Optional NCBI API key, sent as the ``api_key`` parameter.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The first entry of the result's ``idlist``, or ``None`` when no
        article is found, the payload is malformed, or the request fails.
    """
    params = {
        "db": "pubmed",
        "term": f"{title}[Title]",
        "retmode": "json",
        "api_key": api_key
    }

    try:
        # Without a timeout a single stalled connection would block the whole loop
        response = requests.get(ESEARCH_URL, params=params, timeout=timeout)
        response.raise_for_status()
        data = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers JSON decoding failures that are not RequestExceptions
        print(f"Error during request for title '{title}': {e}")
        return None

    # Read the payload defensively: an error response may lack
    # 'esearchresult', 'count' or 'idlist' altogether.
    result = data.get('esearchresult') if isinstance(data, dict) else None
    if not isinstance(result, dict):
        result = {}
    count = result.get('count', '0')
    id_list = result.get('idlist') or []

    if count != '0' and id_list:
        return id_list[0]

    print(f"Found {count} PMIDs for title: {title}. Skipping...")
    return None

# Step 2: Fetch article abstract by PMID
def fetch_abstract_by_pmid(pmid, api_key=None, timeout=30):
    """Fetch the EFetch ``abstract`` text rendering of a PubMed record.

    Args:
        pmid: PubMed identifier of the article.
        api_key: Optional NCBI API key, sent as the ``api_key`` parameter.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The response body as text, or ``None`` when the request fails.

    NOTE(review): with ``retmode=text`` the body appears to be the whole
    formatted record (numbered citation header, authors, ...) rather than
    only the abstract paragraph -- confirm, and consider parsing the XML
    output if only the abstract text is wanted for training.
    """
    params = {
        "db": "pubmed",
        "id": pmid,
        "retmode": "text",
        "rettype": "abstract",
        "api_key": api_key
    }

    try:
        # Without a timeout a single stalled connection would block the whole loop
        response = requests.get(FETCH_URL, params=params, timeout=timeout)
        response.raise_for_status()
        return response.text
    except requests.exceptions.RequestException as e:
        print(f"Error fetching abstract for PMID '{pmid}': {e}")
        return None

# Build one record per publication: look up its PMID by title, then pull the abstract
relevant_abstracts = []
for _, row in dataset.iterrows():

    record = {
        'id': row['id'],
        'pmid': None,
        'title': row['title'],
        'abstract': None,
    }

    # 'pmid' and 'abstract' stay None when the title search finds nothing
    found_pmid = search_pmid_by_title(row['title'])
    if found_pmid:
        record.update(pmid=found_pmid, abstract=fetch_abstract_by_pmid(found_pmid))

    relevant_abstracts.append(record)

    # CHANGE ME TO 0.1 IF YOU HAVE AN API KEY
    print("Sleeping for 1...")
    time.sleep(1)  # 1s pause per article to respect NCBI rate limits (3 requests per second)

# Turn the collected records into a DataFrame and persist it
relevant_df = pd.DataFrame(relevant_abstracts)
relevant_df.to_csv('publications_v2.csv', index=False)

**Note:** Many PMIDs have no abstract available. TODO: ask the professor how these cases should be handled.

## **Task 2:**

Use the E-utilities tool to search for articles whose content is not relevant to this task. The size of this dataset should be the same as that of the relevant documents.

In [None]:
def search_irrelevant_articles(term, count, api_key=None, timeout=30):
    """Return up to ``count`` PMIDs that PubMed ESearch reports for ``term``.

    Args:
        term: ESearch query string (e.g. ``"cancer[Title]"``).
        count: Maximum number of PMIDs to request (sent as ``retmax``).
        api_key: Optional NCBI API key, sent as the ``api_key`` parameter.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The result's ``idlist``, or an empty list when nothing is found,
        the payload is malformed, or the request fails.
    """
    print(f"Fetching {count} irrelevant articles...")

    params = {
        "db": "pubmed",
        "term": term,
        "retmode": "json",
        "retmax": count,
        "api_key": api_key
    }

    try:
        # Without a timeout a stalled connection would block the notebook
        response = requests.get(ESEARCH_URL, params=params, timeout=timeout)
        response.raise_for_status()
        data = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers JSON decoding failures that are not RequestExceptions
        print(f"Error during request for irrelevant articles: {e}")
        return []

    # Read the payload defensively: an error response may lack
    # 'esearchresult', 'count' or 'idlist' altogether.
    result = data.get('esearchresult') if isinstance(data, dict) else None
    if not isinstance(result, dict):
        result = {}
    found = result.get('count', '0')

    if found != '0':
        return result.get('idlist') or []

    print(f"Found {found} irrelevant articles.")
    return []

# Request as many off-topic PMIDs as there are rows in the publications dataset
irrelevant_pmids_list = search_irrelevant_articles("cancer[Title]", len(dataset))

# Fetch the abstract text for every off-topic PMID, one request at a time
irrelevant_abstracts = []
for pmid in irrelevant_pmids_list:

    irrelevant_abstracts.append({
        'pmid': pmid,
        'abstract': fetch_abstract_by_pmid(pmid),
    })

    # CHANGE ME TO 0.1 IF YOU HAVE AN API KEY
    print("Sleeping for 1...")
    time.sleep(1)  # 1s pause per article to respect NCBI rate limits (3 requests per second)

# Persist the off-topic records as their own dataset
irrelevant_df = pd.DataFrame(irrelevant_abstracts)
irrelevant_df.to_csv('irrelevant_publications.csv', index=False)

In [None]:
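# TODO: CHECK THIS OUT!! SEEMS NOT TO BE WORKING
# There are strange abstracts in the irrelevant dataset, like "1."; erase them and search again...
# NOTE: str.match() takes a regular expression, so '1.' matches any abstract that starts with "1"
# followed by any character; escape the dot (r'1\.') or compare with == for a literal match.
# NOTE(review): `irrelevant_dataset` is not defined in the cells above (the DataFrame is `irrelevant_df`) - confirm.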
# irrelevant_dataset_cleaned = irrelevant_dataset[~irrelevant_dataset['abstract'].str.match('1.')]
#
# irrelevant_dataset_cleaned

In [None]:
# Fetching other irrelevant abstracts
# NOTE: never hard-code the NCBI API key in the notebook; load it from the environment instead,
# e.g. NCBI_API_KEY = os.environ["NCBI_API_KEY"]. The key previously pasted here should be revoked.
# new_irrelevant_pmids_list = search_irrelevant_articles("pneumonia[Title]", len(dataset) - len(irrelevant_dataset), NCBI_API_KEY)
# new_irrelevant_abstracts = []
# for pmid in new_irrelevant_pmids_list:
#     abstract = fetch_abstract_by_pmid(pmid, NCBI_API_KEY)
#     new_irrelevant_abstracts.append(abstract)
#     print("Sleeping for 0.1...")
#     time.sleep(0.1)  # Delaying 0.1s to respect NCBI rate limits (10 requests per second)
# 
# # Adding abstracts to the irrelevant dataset
# new_irrelevant_dataset = pd.DataFrame({'pmid': new_irrelevant_pmids_list, 'abstract': new_irrelevant_abstracts})
# irrelevant_dataset = pd.concat([irrelevant_dataset, new_irrelevant_dataset], ignore_index=True)
# irrelevant_dataset.to_csv('irrelevant_publications_v2.csv', index=False)

## **Task 4:**

Implement the chosen retrieval system using the programming language of their choice. If the information retrieval system is based on machine learning techniques, the student must split the existing datasets (relevant and non-relevant documents) into three distinct groups (training, validation, and testing) to carry out the model training.

**CHOSEN RETRIEVAL SYSTEM:** BioBERT-based Binary Text Classifier

In [None]:
# Tag each source with the target variable: 1 = relevant, 0 = irrelevant
for frame, label in ((relevant_df, 1), (irrelevant_df, 0)):
    frame['relevance'] = label

# Stack both sources keeping only pmid, abstract and the target; rows whose
# abstract is missing (e.g. the API fetch failed) are dropped and the index
# is renumbered afterwards.
features = ['pmid', 'abstract', 'relevance']
combined_df = (
    pd.concat([relevant_df[features], irrelevant_df[features]], ignore_index=True)
    .dropna(subset=['abstract'])
    .reset_index(drop=True)
)

# Persist the merged dataset
combined_df.to_csv('combined_publications.csv', index=False)

# Report how many examples of each class survived the cleaning
print("Class distribution:")
print(combined_df['relevance'].value_counts())

The fine-tuning of BioBERT for text classification below follows the Hugging Face training guide: https://huggingface.co/docs/transformers/training

In [None]:
# Converting Pandas DF to HuggingFace Dataset
combined_hf = Dataset.from_pandas(combined_df)

# Tokenizing abstracts using BioBERT tokenizer
tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-base-cased-v1.1")

# BERT-base position embeddings cover 512 tokens. The limit is passed
# explicitly because padding="max_length" / truncation=True fall back to the
# tokenizer's own model_max_length, which may be unset for this checkpoint
# (NOTE(review): confirm) - in that case nothing would be padded or truncated.
MAX_SEQ_LENGTH = 512

def tokenize(examples):
    """Tokenize a batch of abstracts into fixed-length BioBERT inputs.

    Args:
        examples: Batch from ``Dataset.map(..., batched=True)``; its
            ``'abstract'`` entry is passed to the tokenizer.

    Returns:
        The tokenizer output, padded and truncated to ``MAX_SEQ_LENGTH``.
    """
    return tokenizer(
        examples['abstract'],
        padding="max_length",
        truncation=True,
        max_length=MAX_SEQ_LENGTH,
    )

tokenized_dataset = combined_hf.map(tokenize, batched=True)

In [None]:
# Renaming target column to 'labels' as expected by HuggingFace
tokenized_dataset = tokenized_dataset.rename_column("relevance", "labels")

# stratify_by_column only accepts ClassLabel columns, while 'labels' comes
# from pandas as a plain integer column; encode it first. The class names are
# the sorted string values, so 0 stays 0 and 1 stays 1.
tokenized_dataset = tokenized_dataset.class_encode_column("labels")

# Fixed seed so the train/validation/test partition is reproducible
SPLIT_SEED = 42

# Dividing dataset into training, evaluation and testing sets (80/10/10)
train_test_split = tokenized_dataset.train_test_split(
    test_size=0.2, stratify_by_column="labels", seed=SPLIT_SEED
)
test_validation_split = train_test_split['test'].train_test_split(
    test_size=0.5, stratify_by_column="labels", seed=SPLIT_SEED
)

# Final datasets
final_datasets = {
    'train': train_test_split['train'],
    'validation': test_validation_split['train'],
    'test': test_validation_split['test']
}