# Download XML efetch records for PubMed articles indexed under the "computational biology" MeSH term

Uses functions from `pubmedpy` rather than the `eutils` Python package, which [does not support](https://github.com/biocommons/eutils/issues/124/) pagination.

In [1]:
import pathlib
import lzma
import tqdm

from pubmedpy.eutilities import esearch_query, download_pubmed_ids

In [2]:
# NLM Title Abbreviations from https://www.ncbi.nlm.nih.gov/nlmcatalog/journals
# journals = [
#     "Bioinformatics",
#     "BMC Bioinformatics",
#     "PLoS Comput Biol",
# ]

In [4]:
%%time
# Search PubMed for journal articles dated 1993-2019 that carry the
# "computational biology" MeSH term, then save their efetch records.
#
# Disabled alternative: loop over `journals` and search one journal at a time:
#     for journal in journals:
#         'term': f'"journal article"[pt] AND 1993:2019[dp] AND "{journal}"[Journal]'
payload = {
    'db': 'pubmed',
    # Plain string literal: the query has no placeholders, so no f-prefix.
    'term': '"journal article"[pt] AND 1993:2019[dp] AND "computational biology"[MeSH Terms]',
}
pubmed_ids = esearch_query(payload, tqdm=tqdm.tqdm_notebook)
# Convert to int so the IDs sort numerically rather than lexically.
pubmed_ids = sorted(map(int, pubmed_ids))
print(f'{len(pubmed_ids):,} articles for computational biology')

path = pathlib.Path('data/pubmed/efetch').joinpath('compbio.xml.xz')
path.parent.mkdir(parents=True, exist_ok=True)
# Pin the text encoding: without it lzma.open() falls back to the platform
# locale, which can fail on (or mis-encode) non-ASCII characters in the
# records and would not match XML's default UTF-8 encoding.
with lzma.open(path, 'wt', encoding='utf-8') as write_file:
    # NOTE(review): retmax/retmin/sleep/error_sleep are passed straight through
    # to pubmedpy; presumably batch-size bounds and delays in seconds - confirm
    # against the pubmedpy documentation and NCBI's request-rate guidance.
    download_pubmed_ids(
        pubmed_ids, write_file, endpoint='efetch',
        retmax=200, retmin=50, sleep=0, error_sleep=1,
        tqdm=tqdm.tqdm_notebook,
    )

HBox(children=(FloatProgress(value=0.0, max=180761.0), HTML(value='')))


180,761 articles for computational biology


HBox(children=(FloatProgress(value=0.0, max=180761.0), HTML(value='')))


CPU times: user 22min 3s, sys: 23.8 s, total: 22min 27s
Wall time: 1h 4min 54s
