# Download articles from PubMed Central

https://www.ncbi.nlm.nih.gov/pmc/tools/oai/

In [1]:
import logging
import zipfile
import tqdm

import lxml.etree
import pandas
import sickle

In [2]:
# OAI-PMH harvester pointed at the PubMed Central OAI endpoint
sickler = sickle.Sickle("https://www.ncbi.nlm.nih.gov/pmc/oai/oai.cgi")

# Prefix -> namespace URI mapping handed to lxml when querying PMC-OAI XML
namespaces = dict(
    oai="http://www.openarchives.org/OAI/2.0/",
    jats="https://jats.nlm.nih.gov/ns/archiving/1.2/",
    dtd="https://dtd.nlm.nih.gov/ns/archiving/2.3/",
)

## Specify PMC-OAI sets

In [3]:
# Pick two random articles per journal from articles.tsv.xz and look up
# which PMC-OAI sets each of them belongs to
pubmed_df = (
    pandas.read_table('data/pubmed/articles.tsv.xz')
    [['pmcid', 'journal']]
    .dropna()
)
journal_to_n_articles = pubmed_df['journal'].value_counts().to_dict()
journal_df = (
    pubmed_df
    # Shuffle reproducibly so that head(2) yields a random pair per journal
    .sample(frac=1, random_state=0)
    .groupby('journal')
    .head(2)
    .sort_values(['journal', 'pmcid'])
)
setspecs = []
for pmcid in journal_df['pmcid']:
    # pmcid[3:] drops the first three characters (the "PMC" prefix in the
    # values shown below), leaving the numeric part used in the OAI identifier
    oai_identifier = f"oai:pubmedcentral.nih.gov:{pmcid[3:]}"
    record = sickler.GetRecord(identifier=oai_identifier, metadataPrefix='pmc_fm')
    setspecs.append(', '.join(record.header.setSpecs))
journal_df["oai_set"] = setspecs
journal_df

Unnamed: 0,pmcid,journal,oai_set
5678,PMC2092437,BMC Bioinformatics,"bmcbioi, pmc-open"
12870,PMC3248887,BMC Bioinformatics,"bmcbioi, pmc-open"
8406,PMC2687970,Bioinformatics,"bioinfo, pmc-open"
24181,PMC5860615,Bioinformatics,bioinfo
9666,PMC2816690,PLoS Comput Biol,"ploscomp, pmc-open"
13793,PMC3355066,PLoS Comput Biol,"ploscomp, pmc-open"


In [4]:
# Journals whose article frontmatter gets downloaded: journal name (as it
# appears in articles.tsv.xz) paired with the PMC-OAI set to harvest
journal_to_oai_set = dict([
    ('BMC Bioinformatics', 'bmcbioi'),
    ('Bioinformatics', 'bioinfo'),
    ('PLoS Comput Biol', 'ploscomp'),
])

## Download sets

In [5]:
def download_frontmatter_set(oai_set, path, tqdm=None, n_records=None):
    """
    Harvest the front matter of every record in a PMC-OAI set into a zip archive.

    Records are listed with the ``pmc_fm`` metadata prefix (deleted records are
    ignored). The ``<article>`` element of each record is serialized and stored
    in the archive as ``{pmcid}.xml``. Records lacking an ``<article>`` element
    or a pmcid are reported via ``logging.warning`` and skipped.

    Parameters
    ----------
    oai_set : str
        setSpec of the PMC-OAI set to harvest, e.g. ``'bioinfo'``.
    path : str or path-like
        Destination of the LZMA-compressed zip archive. Opened in ``'w'`` mode,
        so an existing file at this location is replaced.
    tqdm : callable, optional
        Progress-bar wrapper invoked as ``tqdm(records, total=n_records,
        desc=oai_set)``. No progress is shown when omitted.
    n_records : int, optional
        Expected number of records; only used as the progress-bar total.

    Returns
    -------
    None
    """
    # The context manager closes (and thereby finalizes) the archive even when
    # harvesting raises partway through, e.g. on a network error.
    with zipfile.ZipFile(path, mode='w', compression=zipfile.ZIP_LZMA) as zip_file:
        records = sickler.ListRecords(metadataPrefix="pmc_fm", set=oai_set, ignore_deleted=True)
        if tqdm is not None:
            records = tqdm(records, total=n_records, desc=oai_set)
        for record in records:
            article = record.xml.find("oai:metadata/{*}article", namespaces=namespaces)
            if article is None:
                logging.warning(f'failure to extract <article> from\n{record.raw}')
                # Nothing to serialize; skip rather than dereference None below.
                continue
            pmcid = article.findtext("{*}front/{*}article-meta/{*}article-id[@pub-id-type='pmcid']")
            if not pmcid:
                # Without an id the entry would be stored under the name 'None.xml'.
                logging.warning(f'failure to extract pmcid from\n{record.raw}')
                continue
            xml_str = lxml.etree.tostring(article, encoding='unicode')
            zip_file.writestr(f'{pmcid}.xml', data=xml_str)

In [6]:
# Harvest one zip archive of front matter per configured journal
for journal in journal_to_oai_set:
    oai_set = journal_to_oai_set[journal]
    path = f'data/pmc/oai/pmc_fm/{oai_set}.zip'
    download_frontmatter_set(
        oai_set=oai_set,
        path=path,
        tqdm=tqdm.tqdm_notebook,
        # Article count from articles.tsv.xz serves as the progress-bar total
        n_records=journal_to_n_articles[journal],
    )

HBox(children=(IntProgress(value=0, description='bmcbioi', max=9099, style=ProgressStyle(description_width='in…




HBox(children=(IntProgress(value=0, description='bioinfo', max=4797, style=ProgressStyle(description_width='in…




HBox(children=(IntProgress(value=0, description='ploscomp', max=6297, style=ProgressStyle(description_width='i…




In [7]:
# List the contents of the download directory together with human-readable sizes
! ls --size --human-readable data/pmc/oai/pmc_fm

total 54M
11M bioinfo.zip  23M bmcbioi.zip  21M ploscomp.zip


## Example retrieval of a single `<article>`

In [8]:
# Fetch the front matter of a single record by its OAI identifier and
# extract the <article> element from the response
record = sickler.GetRecord(
    metadataPrefix="pmc_fm",
    identifier="oai:pubmedcentral.nih.gov:3355066",
)
article = record.xml.find("oai:metadata/{*}article", namespaces=namespaces)