# Download XML esummary and efetch records for PubMed IDs matching a computational biology query

Uses functions from `pubmedpy` rather than the `eutils` Python package, which [does not support](https://github.com/biocommons/eutils/issues/124/) pagination.

In [1]:
import pathlib
import lzma
import tqdm

from pubmedpy.eutilities import esearch_query, download_pubmed_ids

In [2]:
%%time
# Build the esearch request. The term restricts results to the
# "journal article" publication type ([pt]), the 1993:2019 range on the [dp]
# field, the "computational biology" MeSH term, and English-language records.
payload = dict(
    db='pubmed',
    term='"journal article"[pt] AND 1993:2019[dp] AND "computational biology"[MeSH Terms] AND English[Language]',
)
# Cast each value returned by the search to int before sorting, so the IDs are
# ordered numerically rather than lexicographically.
pubmed_ids = sorted(
    int(pubmed_id)
    for pubmed_id in esearch_query(payload, tqdm=tqdm.tqdm_notebook)
)
print(f'{len(pubmed_ids):,} articles for English computational biology')

HBox(children=(FloatProgress(value=0.0, max=179656.0), HTML(value='')))


179,656 articles for English computational biology
CPU times: user 1.07 s, sys: 55.3 ms, total: 1.13 s
Wall time: 14.9 s


In [3]:
%%time
# Download the esummary records for every ID in pubmed_ids into a single
# xz-compressed XML file, creating the output directory if needed.
path = pathlib.Path('data/pubmed/esummary/compbio-english.xml.xz')
path.parent.mkdir(parents=True, exist_ok=True)
# Pin the text encoding explicitly: without it, lzma.open in text mode falls
# back to the locale's preferred encoding, so the bytes written would depend on
# the machine and non-ASCII characters could fail to encode.
with lzma.open(path, 'wt', encoding='utf-8') as write_file:
    # NOTE(review): retmax/retmin/sleep/error_sleep are passed straight through
    # to pubmedpy; presumably batch-size bounds and delays in seconds — confirm
    # against the pubmedpy documentation.
    download_pubmed_ids(
        pubmed_ids, write_file, endpoint='esummary',
        retmax=200, retmin=50, sleep=0, error_sleep=1,
        tqdm=tqdm.tqdm_notebook,
    )

HBox(children=(FloatProgress(value=0.0, max=179656.0), HTML(value='')))


CPU times: user 2min 45s, sys: 1.85 s, total: 2min 47s
Wall time: 9min 25s


In [4]:
%%time
# Download the efetch records for every ID in pubmed_ids into a single
# xz-compressed XML file, creating the output directory if needed.
path = pathlib.Path('data/pubmed/efetch/compbio-english.xml.xz')
path.parent.mkdir(parents=True, exist_ok=True)
# Pin the text encoding explicitly: without it, lzma.open in text mode falls
# back to the locale's preferred encoding, so the bytes written would depend on
# the machine and non-ASCII characters could fail to encode.
with lzma.open(path, 'wt', encoding='utf-8') as write_file:
    # NOTE(review): retmax/retmin/sleep/error_sleep are passed straight through
    # to pubmedpy; presumably batch-size bounds and delays in seconds — confirm
    # against the pubmedpy documentation.
    download_pubmed_ids(
        pubmed_ids, write_file, endpoint='efetch',
        retmax=200, retmin=50, sleep=0, error_sleep=1,
        tqdm=tqdm.tqdm_notebook,
    )

HBox(children=(FloatProgress(value=0.0, max=179656.0), HTML(value='')))


CPU times: user 12min 30s, sys: 6.53 s, total: 12min 37s
Wall time: 42min 15s
