# Massive PubMed record summary querier

In [1]:
import lzma
import os

import pandas
import tqdm

from pydelays.eutilities import esearch_query, pubmed_esummary

In [3]:
# Bounds of the publication-date range, interpolated into the esearch term
# and into the output file names below.
start_year, end_year = 1960, 2017

In [4]:
# Output files, both under `download/` and named after the queried year range:
# the PubMed ID table (.tsv.xz) and the esummary dump (.xml.xz).
path_ids, path_summaries = (
    os.path.join('download', filename)
    for filename in (
        f'esearch_journal-articles_{start_year}-{end_year}.tsv.xz',
        f'esummary_journal-articles_{start_year}-{end_year}.xml.xz',
    )
)

# `esearch`: retrieve all PubMed article IDs

In [6]:
# Run esearch queries
# Make sure the output directory exists before querying, so a missing
# directory is caught up front rather than when the ID table is written.
os.makedirs(os.path.dirname(path_ids), exist_ok=True)

# Search term: journal articles whose publication date falls in the
# start_year:end_year range.
payload = {
    'db': 'pubmed',
    'term': f'journal article[Publication Type] AND {start_year}:{end_year}[Date - Publication]'
}
pubmed_ids = esearch_query(payload, tqdm=tqdm.tqdm_notebook)

# Convert the returned IDs to integers and store them in ascending order.
pubmed_ids = sorted(map(int, pubmed_ids))

# Persist as a single-column, xz-compressed TSV for the esummary step to read.
id_df = pandas.DataFrame({'pubmed_id': pubmed_ids})
id_df.to_csv(path_ids, compression='xz', sep='\t', index=False)




# `esummary`: retrieve article summaries

In [7]:
# Load the ID column back from the tab-separated table written by the esearch step
pubmed_ids = list(pandas.read_csv(path_ids, sep='\t')['pubmed_id'])
# Cell output: how many IDs were loaded, with thousands separators
f'{len(pubmed_ids):,} pubmed IDs'

'23,711,961 pubmed IDs'

In [9]:
%%time
# Run esummary queries
with lzma.open(path_summaries, 'wt') as write_file:
    pubmed_esummary(pubmed_ids, write_file, retmax=500, retmin=50, sleep=0, error_sleep=1, tqdm=tqdm.tqdm_notebook)

9727500/|/ 41%|| 9727500/23711961 [5:03:24<8:05:42, 479.87articles/s]
CPU times: user 4h 30min 53s, sys: 1min 48s, total: 4h 32min 41s
Wall time: 13h 30min 56s
