# Massive PubMed record summary querier

In [None]:
import bz2
import gzip
import os
import time
import xml.etree.ElementTree

import pandas
import requests

# `esearch`: retrieve all PubMed article IDs

In [None]:
def esearch_query(payload, retmax=10000, sleep=0.34):
    """
    Return identifiers using the ESearch E-utility.

    The result set is paged through `retmax` identifiers at a time until
    `retstart` reaches the total `Count` reported in the response.

    Parameters
    ----------
    payload : dict
        ESearch query parameters (for example 'db' and 'term'). The dict is
        copied, so the caller's object is left unmodified.
    retmax : int
        Number of identifiers requested per call. Must be positive.
    sleep : float
        Seconds to pause after each request.

    Returns
    -------
    list of str
        Text of every `IdList/Id` element, in the order received.

    Raises
    ------
    ValueError
        If `retmax` is smaller than 1 (paging would never advance).
    requests.HTTPError
        If the server responds with an error status code.
    """
    if retmax < 1:
        raise ValueError('retmax must be a positive integer')
    url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
    # Work on a copy so the paging keys do not leak into the caller's dict.
    params = dict(payload)
    params['rettype'] = 'xml'
    params['retmax'] = retmax
    params['retstart'] = 0
    ids = []
    # Placeholder total so that the first request is always issued; it is
    # replaced by the real Count from the first response.
    count = 1
    while params['retstart'] < count:
        response = requests.get(url, params=params)
        # Fail loudly on HTTP errors instead of trying to parse an error page.
        response.raise_for_status()
        tree = xml.etree.ElementTree.fromstring(response.text)
        count = int(tree.findtext('Count'))
        ids.extend(id_.text for id_ in tree.findall('IdList/Id'))
        params['retstart'] += retmax
        # Cap at 100% on the final (partial) page and avoid dividing by zero
        # when the query matched nothing.
        fraction = min(params['retstart'], count) / count if count else 1.0
        print('esearch {:.3%} complete'.format(fraction), end='\r')
        time.sleep(sleep)
    return ids

In [None]:
%%time
# Run esearch queries
payload = {'db': 'pubmed', 'term': 'journal article[pt] AND 1960:2015[pdat]', 'rettype': 'xml'}
pubmed_ids = esearch_query(payload)
pubmed_ids = sorted(map(int, pubmed_ids))
len(pubmed_ids)

In [None]:
# Save pubmed IDs to a TSV
id_df = pandas.DataFrame({'pubmed_id': pubmed_ids})
# Create the output directory first; gzip.open does not create missing
# parent directories and would otherwise raise FileNotFoundError.
os.makedirs('download', exist_ok=True)
path = os.path.join('download', 'esearch_journal-articles_1960-2015.tsv.gz')
# 'wt' opens the gzip stream in text mode so to_csv can write str data.
with gzip.open(path, 'wt') as write_file:
    id_df.to_csv(write_file, sep='\t', index=False)

# `esummary`: retrieve article summaries

In [None]:
# Load the previously saved PubMed IDs back from the gzipped TSV and
# keep the 'pubmed_id' column as a plain list.
path = os.path.join(
    'download',
    'esearch_journal-articles_1960-2015.tsv.gz',
)
pubmed_ids = list(pandas.read_table(path)['pubmed_id'])

In [None]:
def pubmed_esummary(ids, write_file, retmax=100, sleep=0.34):
    """Submit an ESummary query for PubMed records and write results as xml to write_file.

    The identifiers are split into consecutive batches of at most `retmax`
    and one request is issued per batch. Every child element of each
    response root is serialized and written between a single opening and
    closing `eSummaryResult` tag, so the output is one combined document.

    Parameters
    ----------
    ids : sequence
        PubMed identifiers; each is converted with `str` before being joined.
    write_file : text file-like object
        Destination exposing `write(str)`.
    retmax : int
        Maximum number of identifiers per request.
    sleep : float
        Seconds to pause after each request.

    Raises
    ------
    requests.HTTPError
        If the server responds with an error status code.
    """
    url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi'
    id_subsets = [ids[i:i + retmax] for i in range(0, len(ids), retmax)]
    write_file.write('<eSummaryResult>\n')
    # Count batches from 1 so the progress display reaches 100% on the last one.
    for i, id_subset in enumerate(id_subsets, start=1):
        id_string = ','.join(map(str, id_subset))
        payload = {'db': 'pubmed', 'id': id_string, 'rettype': 'xml'}
        response = requests.get(url, params=payload)
        # Fail loudly on HTTP errors instead of trying to parse an error page.
        response.raise_for_status()
        tree = xml.etree.ElementTree.fromstring(response.text)
        # Iterate the root element directly: Element.getchildren() was
        # removed in Python 3.9.
        for docsum in tree:
            xml_str = xml.etree.ElementTree.tostring(docsum, encoding='unicode')
            write_file.write(xml_str)
        print('esummary {:.4%} complete'.format(i / len(id_subsets)), end='\r')
        time.sleep(sleep)
    write_file.write('</eSummaryResult>\n')

In [None]:
%%time
# Run esummary queries
# Stream the ESummary records for every ID in pubmed_ids into a single
# bz2-compressed XML file; 'wt' opens the compressed stream in text mode
# because pubmed_esummary writes str data.
path = os.path.join('download', 'esummary_journal-articles_1960-2015.xml.bz2')
with bz2.open(path, 'wt') as write_file:
    pubmed_esummary(pubmed_ids, write_file)