## Search and Pull PubMed articles

### Set Up
1. Load environment file and install required libraries listed in requirements.txt (lxml, bs4, Bio)
2. Import libraries and set up API tokens & secrets

In [None]:
'''
pip install Bio
pip install lxml
pip install bs4
pip install huggingface_hub[cli]
pip install huggingface_hub
'''

In [14]:
import time
import json
import glob
import pandas as pd

import os
from dotenv import load_dotenv

from Bio import Entrez
from Bio import Medline

from pathlib import Path
from bs4 import BeautifulSoup


In [9]:
''' Ensure that API tokens, secrets and other credentials are stored in local .env file. See '.env.example' for example '''

load_dotenv() # Function call to look for .env and load it

True

### Search PubMed & PMC Open Access using eSearch/eFetch
eSearch: Searches and retrieves primary IDs from PubMed (for use in EFetch, ELink, and ESummary) and term translations and optionally retains results for future use in the user’s environment.

eFetch: extracts journal information from the PMC Open Access Database based on ID list

For information, go to https://biopython.org/docs/1.75/api/Bio.Entrez.html or https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.EFetch or https://pubmed.ncbi.nlm.nih.gov/help/#search-field-descriptions-and-tags


The **PMC Open Access Database** contains journal articles that are made available under license terms that allow reuse. https://pmc.ncbi.nlm.nih.gov/tools/openftlist/ 

*More information on how to perform advance search can be found here: https://pmc.ncbi.nlm.nih.gov/about/userguide/#searchfields*

In [11]:
''' Searching PubMed (an existing account with PubMed is required - Registration is free) '''

# Connect to PubMed with email
Entrez.email = os.getenv('ENTREZ_EMAIL')

# Define search terms for PubMed
search_term = 'pancreatic cancer[ti] AND pancreatic cancer[ab] AND ' \
'(symptom[tiab] OR symptoms[tiab] "clinical presentation"[tiab] OR signs[tiab] OR manifestations[tiab] OR "signs and symptoms"[mh]) AND free full text[sb]'

handle = Entrez.esearch(db="pubmed", term=search_term, retmax=2000) 
record = Entrez.read(handle)
handle.close()

# Retrieve and store PubMed IDs from search results
pubmed_id_list = record["IdList"]

print(f" {len(pubmed_id_list)} PubMed articles found")

 672 PubMed articles found


In [12]:
''' Search PMC Open Access Database '''

# Define search terms
search_term_2 = (
    'pancreatic cancer[ab] AND pancreatic cancer[ti] AND '
    '(symptom[ab] OR symptoms[ab] OR "clinical presentation"[AB] OR signs[AB] OR manifestations[AB]) '
    'AND open access[filter]')

handle = Entrez.esearch(db="pmc", term=search_term_2, retmax=2000) 
record = Entrez.read(handle)
handle.close()

# Get list of PMC Open Access article IDs from search results
pmc_id_list = record["IdList"]

print(f" {len(pmc_id_list)} PMC Open Access articles found")

 442 PMC Open Access articles found


In [None]:
''' Match PMC ID to PubMed ID to get detailed article metadata and abstract (Medline format) into csv file '''

pmids = []

# PMC LinkOut: get related PMIDs for each PMCID
for pmcid in pmc_id_list:  # Limit to first 5 for demo
    handle = Entrez.elink(dbfrom="pmc", db="pubmed", id=pmcid)
    linkset = Entrez.read(handle)
    handle.close()

    for link in linkset[0]["LinkSetDb"]:
        if link["LinkName"] == "pmc_pubmed":
            for link_item in link["Link"]:
                pmids.append(link_item["Id"])

print(f"Mapped PMCIDs to PMIDs: {pmids[:5]}")

In [None]:
''' Get PubMed metadata and abstracts(Medline) '''

# Initialize list to store parsed records
records = []

# Batch size for PubMed efetch (max 10,000 per NCBI)
batch_size = 100

# Loop through id_list in chunks
for start in range(0, len(pmids), batch_size):
    end = min(start + batch_size, len(pmids))
    batch_ids = pmids[start:end]

    try:
        # Fetch records from PubMed using Medline format
        handle = Entrez.efetch(
            db="pubmed",
            id=batch_ids,
            rettype="medline",
            retmode="text"
        )

        # Parse the plain-text MEDLINE into structured dicts
        batch_data = Medline.parse(handle)
        
        # Append each parsed record to the records list
        records.extend(batch_data)

        handle.close()

        print(f" Fetched records {start + 1} to {end}")

        # NCBI rate limit: 3 requests/sec without API key
        time.sleep(0.4)

    except Exception as e:
        print(f" Error fetching records {start + 1} to {end}: {e}")
        continue

✅ Fetched records 1 to 100
✅ Fetched records 101 to 200
✅ Fetched records 201 to 300
✅ Fetched records 301 to 400
✅ Fetched records 401 to 405


In [None]:
# Flatten and normalize the records

from pathlib import Path
working_dir = Path.cwd()

df = pd.json_normalize(records)

data_path = working_dir/data/"raw_pubmed_records.csv"

# Save to CSV
df.to_csv(data_path, index=False, encoding='utf-8')


In [None]:
import csv

# Define the CSV file name
output_csv = "/data/pubmed_medline_records.csv"

# Define fields you want to extract and map them to readable names
csv_fields = [
    ("PMID", "PMID"),
    ("TI", "Title"),
    ("AB", "Abstract"),
    ("MH", "MeSH headings"),
    ("AU", "Authors"),
    ("JT", "Journal"),
    ("DP", "Year"),
    ("DP", "Month"),
    ("SO", "Source"),
    ("PL", "Country")
]

# Open CSV for writing
with open(output_csv, mode="w", encoding="utf-8", newline="") as file:
    writer = csv.DictWriter(file, fieldnames=[field[1] for field in csv_fields])
    writer.writeheader()

    for record in records:
        row = {}
        for medline_key, csv_key in csv_fields:
            value = record.get(medline_key, "")

            # Join lists into a semicolon-separated string
            if isinstance(value, list):
                value = "; ".join(value)

            if medline_key == "DP":
                parts = value.split(" ")
                if csv_key == "Year":
                    value = parts[0] if len(parts) > 0 else ""
                elif csv_key == "Month":
                    value = parts[1] if len(parts) > 1 else ""

            row[csv_key] = value

        writer.writerow(row)

print(f" Saved {len(records)} records to {output_csv}")


In [None]:
records

[{'PMID': '40471466',
  'OWN': 'NLM',
  'STAT': 'MEDLINE',
  'DCOM': '20250605',
  'LR': '20250608',
  'IS': '1941-6636 (Electronic) 1941-6628 (Print)',
  'VI': '56',
  'IP': '1',
  'DP': '2025 Jun 5',
  'TI': 'Pancreatic Cancer Risk Assessment Tools in Primary Care: A Mixed Methods Systematic Review.',
  'PG': '128',
  'LID': '10.1007/s12029-025-01229-5 [doi] 128',
  'AB': "BACKGROUND: Pancreatic cancer is the twelfth most common cancer worldwide, but high mortality rates make it the sixth leading cause of cancer deaths. Diagnosis is frequently too late for curative intervention. Risk assessment tools incorporating diagnostic prediction models may assist early pancreatic cancer detection by primary care clinicians. AIM AND METHODS: This mixed methods systematic review aims to identify risk assessment tools which can be used for the detection of pancreatic cancer and have been investigated in primary care. It also seeks to synthesise the qualitative and quantitative evidence relating t

### Extract PMC Open Access articles and full-text(XML format)

In [None]:
#Extrating full texts
import time
import os

# Create output folder
os.makedirs("/data/pmc_xml", exist_ok=True)

batch_size = 10  # smaller batch because files are big

for start in range(0, len(pmc_id_list), batch_size):
    end = min(start + batch_size, len(pmc_id_list))
    batch_ids = pmc_id_list[start:end]

    try:
        handle = Entrez.efetch(
            db="pmc",
            id=batch_ids,
            rettype="full",   # or "xml"
            retmode="xml"
        )

        xml_data = handle.read()
        xml_data_str = xml_data.decode('utf-8')
        handle.close()
    

        # Save each batch or each ID
        with open(f"/data/pmc_xml/pmc_{start+1}_{end}.xml", "w", encoding="utf-8") as f:
            f.write(xml_data_str)

        print(f"Downloaded PMC full-text {start+1}–{end}")

        time.sleep(0.5)

    except Exception as e:
        print(f"Error fetching PMCIDs {start+1}–{end}: {e}")


✅ Downloaded PMC full-text 1–10
✅ Downloaded PMC full-text 11–20
✅ Downloaded PMC full-text 21–30
✅ Downloaded PMC full-text 31–40
✅ Downloaded PMC full-text 41–50
✅ Downloaded PMC full-text 51–60
✅ Downloaded PMC full-text 61–70
✅ Downloaded PMC full-text 71–80
✅ Downloaded PMC full-text 81–90
✅ Downloaded PMC full-text 91–100
✅ Downloaded PMC full-text 101–110
✅ Downloaded PMC full-text 111–120
✅ Downloaded PMC full-text 121–130
✅ Downloaded PMC full-text 131–140
✅ Downloaded PMC full-text 141–150
✅ Downloaded PMC full-text 151–160
✅ Downloaded PMC full-text 161–170
✅ Downloaded PMC full-text 171–180
✅ Downloaded PMC full-text 181–190
✅ Downloaded PMC full-text 191–200
✅ Downloaded PMC full-text 201–210
✅ Downloaded PMC full-text 211–220
✅ Downloaded PMC full-text 221–230
✅ Downloaded PMC full-text 231–240
✅ Downloaded PMC full-text 241–250
✅ Downloaded PMC full-text 251–260
✅ Downloaded PMC full-text 261–270
✅ Downloaded PMC full-text 271–280
✅ Downloaded PMC full-text 281–290
✅ Dow

In [None]:
#Split the batches to save as individual xml_files

import time
import glob
from bs4 import BeautifulSoup

input_dir = '/data/pmc_xml'
output_dir = '/data/pmc_xml_files'
os.makedirs(output_dir, exist_ok=True)

batch_files = glob.glob(os.path.join(input_dir, "*.xml"))

for file in batch_files:
    with open(file, 'r', encoding='utf-8') as f:
        content = f.read()

    soup = BeautifulSoup(content, 'lxml-xml')

    # Make sure we are inside the right root
    articleset = soup.find('pmc-articleset')
    if not articleset:
        print(f"No <pmc-articleset> found in {file}")
        continue

    articles = articleset.find_all('article')
    print(f"{file}: Found {len(articles)} articles")

    for i, article in enumerate(articles):
        pmcid_tag = article.find('article-id', {'pub-id-type': 'pmcid'})
        pmcid = pmcid_tag.text.strip() if pmcid_tag else f"{os.path.basename(file)}_article{i+1}"

        output_path = os.path.join(output_dir, f"{pmcid}.xml")
        with open(output_path, 'w', encoding='utf-8') as out_f:
            out_f.write(str(article))

print(f"\n Splitting complete. All articles saved to {output_dir}")


In [None]:
#Count number of xml files
xml_files = [f for f in os.listdir(output_dir) if f.endswith('.xml')]

print(f"Number of JSON files: {len(xml_files)}")

Number of JSON files: 407


### Extract article body

In [None]:
import os
import glob
import json
from bs4 import BeautifulSoup

input_dir = "/data/pmc_xml_files/"
output_dir = "/data/pmc_extracted_text"
os.makedirs(output_dir, exist_ok=True)


for xml_file in glob.glob(os.path.join(input_dir, "*.xml")):
    with open(xml_file, encoding="utf-8") as f:
        xml_content = f.read()

    soup = BeautifulSoup(xml_content, "xml")

    # Get PMC ID from XML <article-id pub-id-type="pmcid"> or filename
    pmcid_tag = soup.find("article-id", {"pub-id-type": "pmcid"})
    if pmcid_tag:
        pmcid = pmcid_tag.get_text(strip=True)
    else:
        pmcid = os.path.splitext(os.path.basename(xml_file))[0]  # fallback to filename

    # Remove section titles and other non-body tags
    for tag in soup.find_all(["title", "fig", "table-wrap", "ref-list", "supplementary-material"]):
        tag.decompose()

    # Extract all paragraph text
    paragraphs = soup.find_all("p")
    full_text = "\n\n".join(p.get_text(strip=True) for p in paragraphs)

    # Save to JSON
    output_path = os.path.join(output_dir, f"{pmcid}.json")
    with open(output_path, "w", encoding="utf-8") as out_f:
        json.dump({"pmcid": pmcid, "text": full_text}, out_f, ensure_ascii=False, indent=2)


In [None]:
#read extracted json texts (example: PMC1751121.json)
import json
with open('/data/pmc_extracted_text/PMC1751121.json', "r", encoding="utf-8") as f:
    data = json.load(f)

# Print preview
print("PMC ID:", data["pmcid"])
print(data["text"][:500])

PMC ID: PMC1751121
Pancreatic cancer is a deadly disease. Discovery of the mutated genes that cause the inherited form(s) of the disease may shed light on the mechanism(s) of oncogenesis. Previously we isolated a susceptibility locus for familial pancreatic cancer to chromosome location 4q32–34. In this study, our goal was to discover the identity of the familial pancreatic cancer gene on 4q32 and determine the function of that gene.

A customized microarray of the candidate chromosomal region affecting pancreatic


### Extract conclusion and abstract

In [None]:
# Directory containing your saved XML files
xml_dir = Path("/data/pmc_xml")
output_file = Path("/data/pmc_abstract_conclusion.json")

# Ensure output directory exists
output_file.parent.mkdir(parents=True, exist_ok=True)

# Keywords for matching conclusion sections
conclusion_headings = {"conclusion", "conclusions", "5. conclusions"}

# Extract Abstract and Conclusions from each XML file
articles_data = []

for xml_file in xml_dir.glob("*.xml"):
    with open(xml_file, encoding="utf-8") as f:
        soup = BeautifulSoup(f, "lxml")

    for article in soup.find_all("article"):
        article_data = {"pmcid": None, "abstract": "", "conclusion": ""}

        # Extract PMCID
        pmcid_tag = article.find("article-id", {"pub-id-type": "pmc"})
        if pmcid_tag:
            article_data["pmcid"] = pmcid_tag.get_text(strip=True)

        # Extract Abstract
        abstract = article.find("abstract")
        if abstract:
            article_data["abstract"] = abstract.get_text(separator=" ", strip=True)

        # Extract Conclusion-like sections
        conclusion_text = ""
        for sec in article.find_all("sec"):
            title_tag = sec.find("title")
            if title_tag:
                section_title = title_tag.get_text(strip=True).lower()
                if section_title in conclusion_headings:
                    conclusion_text += " " + sec.get_text(separator=" ", strip=True)

        article_data["conclusion"] = conclusion_text.strip()

        if article_data["abstract"] or article_data["conclusion"]:
            articles_data.append(article_data)

# Save to JSON
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(articles_data, f, ensure_ascii=False, indent=2)

print(f"✅ Saved {len(articles_data)} records to {output_file}")


In [None]:
input_dir = "/data/pmc_xml_files/"
output_dir = "/data/pmc_abstract_conclusion_text"
os.makedirs(output_dir, exist_ok=True)

# Define acceptable headings for conclusions
conclusion_headings = {"conclusion", "conclusions", "5. conclusions"}

for xml_file in glob.glob(os.path.join(input_dir, "*.xml")):
    with open(xml_file, encoding="utf-8") as f:
        soup = BeautifulSoup(f.read(), "xml")

    # Get PMCID
    pmcid_tag = soup.find("article-id", {"pub-id-type": "pmcid"})
    pmcid = pmcid_tag.get_text(strip=True) if pmcid_tag else os.path.splitext(os.path.basename(xml_file))[0]

    #Extrac Abstracts 
    abstract_texts = []
    abstract_tags = soup.find_all("abstract", recursive=True)
    for abstract_tag in abstract_tags:
        # Some abstracts have nested <sec><title> and <p> inside
        for p in abstract_tag.find_all("p"):
            abstract_texts.append(p.get_text(strip=True))

    #Extract conclusions
    conclusion_texts = []
    for sec in soup.find_all("sec"):
        title_tag = sec.find("title")
        if title_tag and title_tag.get_text(strip=True).lower() in conclusion_headings:
            for p in sec.find_all("p"):
                conclusion_texts.append(p.get_text(strip=True))

    # Skip if either abstract or conclusion is missing
    if not abstract_texts or not conclusion_texts:
        continue

    combined_text = "\n\n".join(abstract_texts + conclusion_texts)

    # Save to JSON
    output_path = os.path.join(output_dir, f"{pmcid}.json")
    with open(output_path, "w", encoding="utf-8") as out_f:
        json.dump({"pmcid": pmcid, "text": combined_text}, out_f, ensure_ascii=False, indent=2)

print(" Finished extracting abstracts and conclusions.")


✅ Finished extracting abstracts and conclusions.


In [None]:
''' Push extracted files to Huggingface Hub '''

from huggingface_hub import login, notebook_login, HfApi

login(token= os.getenv('HUGGINGFACE_TOKEN'))
api = HfApi()

# Upload all the content from the local folder to hf remote Space.
api.upload_folder(
    folder_path="/data/pmc_abstract_conclusion_text",
    repo_id=os.getenv(f"HF_REPO_ID" + "pc_pmc_abstract_conclusion"),
    repo_type="dataset",
)

api.upload_file(
    path_or_fileobj="/data/pubmed_medline_records.csv",
    path_in_repo="article-metadata-medline.csv",
    repo_id=os.getenv(f"HF_REPO_ID" + "pc_pmc_abstract_conclusion"),
    repo_type="dataset",
)

In [None]:
''' [Optional] Upload all extracted article texts 

api = HfApi()

# Upload all extracted article texts
api.upload_folder(
    folder_path="/data/pmc_extracted_text",
    repo_id=os.getenv("HF_REPO_ID"),
    repo_type="dataset",
)

#Upload article metadata to hf
api.upload_file(
    path_or_fileobj="/data/pubmed_medline_records.csv",
    path_in_repo="article-metadata-medline.csv",
    repo_id=os.getenv("HF_REPO_ID"),
    repo_type="dataset",
)

'''