# Download XML efetch records for PubMed IDs in the specified journals

This notebook uses functions from `pubmedpy` rather than the `eutils` Python package, which [does not support pagination](https://github.com/biocommons/eutils/issues/124/).

In [1]:
import pathlib
import lzma
import tqdm

from pubmedpy.eutilities import esearch_query, download_pubmed_ids

In [2]:
# Journals to download, given as NLM Title Abbreviations
# (see https://www.ncbi.nlm.nih.gov/nlmcatalog/journals).
# Each entry is interpolated into the search term's [Journal] clause and
# also used as the stem of the output file name.
journals = ["Bioinformatics", "BMC Bioinformatics", "PLoS Comput Biol"]

In [3]:
%%time
# For each journal: search PubMed for matching records, then write the
# results of the efetch download to data/pubmed/efetch/<journal>.xml.xz.
for journal in journals:
    # Search term: journal articles dated 1997-2019 within this journal.
    payload = {
        'db': 'pubmed',
        'term': f'"journal article"[pt] AND 1997:2019[dp] AND "{journal}"[Journal]',
    }
    pubmed_ids = esearch_query(payload, tqdm=tqdm.tqdm_notebook)
    # Convert to int first so the sort is numeric rather than lexicographic.
    pubmed_ids = sorted(map(int, pubmed_ids))
    print(f'{len(pubmed_ids):,} articles for {journal}')

    path = pathlib.Path('data/pubmed/efetch').joinpath(f'{journal}.xml.xz')
    path.parent.mkdir(parents=True, exist_ok=True)
    # Pin UTF-8 explicitly: without it, text mode falls back to the locale's
    # preferred encoding, which makes the output platform-dependent and can
    # raise UnicodeEncodeError on non-UTF-8 locales.
    with lzma.open(path, 'wt', encoding='utf-8') as write_file:
        # NOTE(review): retmax/retmin/sleep/error_sleep are passed straight
        # through to pubmedpy; presumably batch-size bounds and delay
        # settings -- confirm against the pubmedpy documentation.
        download_pubmed_ids(
            pubmed_ids, write_file, endpoint='efetch',
            retmax=200, retmin=50, sleep=0, error_sleep=1,
            tqdm=tqdm.tqdm_notebook,
        )

HBox(children=(IntProgress(value=0, max=13837), HTML(value='')))


13,837 articles for Bioinformatics


HBox(children=(IntProgress(value=0, max=13837), HTML(value='')))




HBox(children=(IntProgress(value=0, max=9409), HTML(value='')))


9,409 articles for BMC Bioinformatics


HBox(children=(IntProgress(value=0, max=9409), HTML(value='')))




HBox(children=(IntProgress(value=0, max=6509), HTML(value='')))


6,509 articles for PLoS Comput Biol


HBox(children=(IntProgress(value=0, max=6509), HTML(value='')))


CPU times: user 2min 16s, sys: 3.06 s, total: 2min 19s
Wall time: 7min 16s
