# Massive PubMed record summary querier

In [1]:
import collections
import datetime
import logging
import lzma
import os
import time

import lxml.etree
import pandas
import requests
import tqdm

In [2]:
# Root logger (no name given); used by pubmed_esummary to report failed requests
logger = logging.getLogger()

In [3]:
# Publication dates to query
# Both years are interpolated into the ESearch date-range term and into the output file names.
start_year = 1960
end_year = 2017

In [4]:
# Output locations inside the download/ directory:
# path_ids receives the PubMed ID table (TSV, xz-compressed) written after esearch;
# path_summaries receives the concatenated esummary XML (xz-compressed).
path_ids = os.path.join('download', f'esearch_journal-articles_{start_year}-{end_year}.tsv.xz')
path_summaries = os.path.join('download', f'esummary_journal-articles_{start_year}-{end_year}.xml.xz')

# `esearch`: retrieve all PubMed article IDs

In [5]:
def esearch_query(payload, retmax = 10000, sleep=0.34):
    """
    Return identifiers using the ESearch E-utility.

    Pages through the result set `retmax` IDs at a time until the paging
    offset reaches the `Count` reported in the response.

    Parameters
    ----------
    payload : dict
        Query parameters (for example `db` and `term`). The dict is copied,
        so the caller's object is left unmodified.
    retmax : int
        Number of IDs requested per page.
    sleep : float
        Seconds to pause after each request.

    Returns
    -------
    list of str
        Text of every `IdList/Id` element, in the order received.

    Raises
    ------
    requests.HTTPError
        If a response carries an HTTP error status.
    """
    url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
    # Work on a copy so the paging keys do not leak into the caller's dict
    params = dict(payload)
    params['rettype'] = 'xml'
    params['retmax'] = retmax
    params['retstart'] = 0
    ids = list()
    count = 1  # placeholder so that the first request is always issued
    progress_bar = None
    try:
        while params['retstart'] < count:
            response = requests.get(url, params=params)
            # Fail loudly on an HTTP error instead of parsing an error page
            response.raise_for_status()
            tree = lxml.etree.fromstring(response.content)
            count = int(tree.findtext('Count'))
            # The total is only known after the first response
            if progress_bar is None:
                progress_bar = tqdm.tqdm_notebook(total=count, unit='ids')
            add_ids = [id_.text for id_ in tree.findall('IdList/Id')]
            ids += add_ids
            params['retstart'] += retmax
            progress_bar.update(len(add_ids))
            time.sleep(sleep)
    finally:
        # Close the bar even when a request or parse fails part-way through
        if progress_bar is not None:
            progress_bar.close()
    return ids

In [6]:
# Fetch every matching PubMed ID through ESearch, then store them
# sorted numerically as a single-column, xz-compressed TSV.
payload = dict(
    db='pubmed',
    term=f'journal article[Publication Type] AND {start_year}:{end_year}[Date - Publication]',
)
pubmed_ids = esearch_query(payload)
pubmed_ids = sorted(int(pubmed_id) for pubmed_id in pubmed_ids)
id_df = pandas.DataFrame({'pubmed_id': pubmed_ids})
id_df.to_csv(path_ids, sep='\t', index=False, compression='xz')




# `esummary`: retrieve article summaries

In [7]:
# Load the PubMed IDs back from the TSV written by the esearch step
pubmed_ids = list(pandas.read_table(path_ids)['pubmed_id'])
f'{len(pubmed_ids):,} pubmed IDs'

'23,711,961 pubmed IDs'

In [8]:
def pubmed_esummary(ids, write_file, retmax=100, retmin=20, sleep=0.34, error_sleep=10, timeout=60):
    """
    Submit ESummary queries for PubMed records and write results as xml to write_file.

    The IDs are split into batches of `retmax` and queried one batch at a
    time. Each child element of a parsed response is serialized and written
    between an opening and a closing `eSummaryResult` tag.

    When a request or its XML parsing fails, the batch is halved (if it holds
    at least `2 * retmin` IDs) or re-queued unchanged, and the function sleeps
    for `error_sleep` seconds multiplied by the number of successive failures.
    There is no retry limit: a batch that always fails is retried indefinitely.

    Parameters
    ----------
    ids : sequence
        PubMed IDs; must support `len()` and slicing.
    write_file : text file object
        Destination opened for writing text.
    retmax : int
        Maximum number of IDs per request.
    retmin : int
        Failed batches with fewer than `2 * retmin` IDs are not split further.
    sleep : float
        Seconds to pause before every request.
    error_sleep : float
        Base number of seconds to back off after a failure.
    timeout : float or None
        Timeout in seconds passed to `requests.get`, so a stalled connection
        becomes a retried error instead of blocking forever. `None` disables it.
    """

    # Base URL for PubMed's esummary eutility
    url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi'

    # Set up progress stats
    n_total = len(ids)
    successive_errors = 0
    progress_bar = tqdm.tqdm_notebook(total=n_total, unit='articles')

    # Write first line of XML
    write_file.write('<eSummaryResult>\n')

    # Set up queue of ID batches
    idq = collections.deque(ids[i:i + retmax] for i in range(0, n_total, retmax))

    try:
        # Query until the queue is empty
        while idq:
            time.sleep(sleep)
            id_subset = idq.popleft()
            id_subset_len = len(id_subset)

            # Perform eutilities API request
            id_string = ','.join(map(str, id_subset))
            payload = {'db': 'pubmed', 'id': id_string, 'rettype': 'xml'}
            try:
                response = requests.get(url, params=payload, timeout=timeout)
                # Treat HTTP error statuses as failures rather than parsing an error page
                response.raise_for_status()
                tree = lxml.etree.fromstring(response.content)
            except Exception as e:
                # Deliberately broad: any failure of this batch is retried, never fatal
                successive_errors += 1
                logger.warning(
                    '%s successive error: %s IDs [%s … %s] threw %s',
                    successive_errors, id_subset_len, id_subset[0], id_subset[-1], e,
                )
                if id_subset_len >= retmin * 2:
                    mid = id_subset_len // 2
                    # Push the second half first so the first half is retried
                    # first and the output keeps the input ID order
                    idq.appendleft(id_subset[mid:])
                    idq.appendleft(id_subset[:mid])
                else:
                    idq.appendleft(id_subset)
                time.sleep(error_sleep * successive_errors)
                continue
            successive_errors = 0

            # Write XML to file (iterating an element yields its children)
            for docsum in tree:
                xml_str = lxml.etree.tostring(docsum, encoding='unicode')
                write_file.write(xml_str)

            # Report progress
            progress_bar.update(id_subset_len)
    finally:
        # Close the bar even if the loop is interrupted
        progress_bar.close()

    # Write final line of XML
    write_file.write('</eSummaryResult>\n')

In [9]:
%%time
# Run esummary queries
# Stream all summaries into one xz-compressed XML file. The text encoding is
# pinned to UTF-8 so the output does not depend on the platform's locale.
# NOTE(review): sleep=0 removes the pause between requests; confirm this
# stays within NCBI's E-utilities rate limits.
with lzma.open(path_summaries, 'wt', encoding='utf-8') as write_file:
    pubmed_esummary(pubmed_ids, write_file, retmax=500, retmin=50, sleep=0, error_sleep=1)

9727500/|/ 41%|| 9727500/23711961 [5:03:24<8:05:42, 479.87articles/s]
CPU times: user 4h 30min 53s, sys: 1min 48s, total: 4h 32min 41s
Wall time: 13h 30min 56s
