In [None]:
import requests
import urlparse
import urllib
import os
import time
import xml.etree.ElementTree as ET

In [None]:
# Load the search terms, one per line of the input file.
with open('mesh_diseases.txt', 'r') as f:
    # Filter on the stripped text rather than the raw length: this drops
    # whitespace-only lines (which would otherwise become empty terms) and
    # keeps a short final line that has no trailing newline.
    diseases = [line.strip() for line in f if line.strip()]

In [None]:
def _entrez(scheme, netloc, path, **params):
    """
    Issue a GET request against an Entrez endpoint and return the body.

    Parameters
    ----------
    scheme : str
        URL scheme, e.g. 'http'.
    netloc : str
        Host name of the service.
    path : str
        Path of the endpoint on that host.
    **params
        Query-string parameters. ``tool`` and ``email`` are filled in with
        the notebook's defaults unless the caller supplies its own values.

    Returns
    -------
    unicode
        The decoded response body.

    Raises
    ------
    requests.exceptions.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.exceptions.Timeout
        If the server does not respond in time.
    """
    # setdefault (rather than update) so that an explicit tool/email passed
    # by the caller is respected instead of being silently overwritten.
    params.setdefault('tool', 'modelorgspeirson')
    params.setdefault('email', 'erick.peirson@asu.edu')

    query = urllib.urlencode(params)
    target = urlparse.urlunsplit((scheme, netloc, path, query, ''))

    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(target, timeout=60)
    # Fail loudly on HTTP errors instead of handing an error page to the
    # XML parsing code downstream.
    response.raise_for_status()
    return response.text


def esearch(
        scheme='http',
        netloc='eutils.ncbi.nlm.nih.gov',
        path='entrez/eutils/esearch.fcgi',
        **params):
    """
    Send a query to the ESearch endpoint.

    Any extra keyword arguments are forwarded as query-string parameters.
    Returns whatever ``_entrez`` returns for the request.
    """
    response_body = _entrez(scheme, netloc, path, **params)
    return response_body
    
    
def efetch(scheme='http', netloc='eutils.ncbi.nlm.nih.gov',
           path='entrez/eutils/efetch.fcgi', **params):
    """
    Send a request to the EFetch endpoint.

    The default ``path`` points at ``efetch.fcgi``; it previously pointed at
    ``esearch.fcgi``, which made this function a duplicate of ``esearch``.

    Any extra keyword arguments are forwarded as query-string parameters.
    Returns whatever ``_entrez`` returns for the request.
    """
    return _entrez(scheme, netloc, path, **params)

In [None]:
def pubmed_for_mesh(term, year, retmax=10000):
    """
    Run an ESearch query against PubMed for one term and one year.

    Parameters
    ----------
    term : str
        Search term; the search is restricted to the 'Mesh' field.
    year : int
        Used as both the lower and upper bound of the date filter.
    retmax : int
        Maximum number of results requested. Results beyond this limit are
        not retrieved by this function.

    Returns
    -------
    The raw response as returned by ``esearch``.
    """
    return esearch(
        db='pubmed',
        term=term,
        field='Mesh',
        retmax=retmax,       # Number of results.
        datetype='pdat',     # Publication date.
        mindate=year,
        maxdate=year,        # Ranges are inclusive in NCBI.
    )

In [None]:
def extract_pmids(result_raw):
    """
    Pull the identifiers out of a raw ESearch XML response.

    Parameters
    ----------
    result_raw : str
        XML document whose root has an ``IdList`` child.

    Returns
    -------
    list
        Text content of every child of the first ``IdList`` element, in
        document order. Empty if ``IdList`` has no children.

    Raises
    ------
    ValueError
        If the document contains no ``IdList`` element.
    """
    root = ET.fromstring(result_raw)
    id_list = root.find('IdList')
    if id_list is None:
        # Surface a readable error instead of an AttributeError on None.
        # NOTE(review): assumes error responses carry an <ERROR> child of
        # the root; when absent, findtext simply yields None.
        detail = root.findtext('ERROR')
        raise ValueError(
            'No IdList element in response (root <%s>, error: %r)'
            % (root.tag, detail)
        )
    # Iterate the element directly; Element.getchildren() is deprecated and
    # no longer exists in recent Python versions.
    return [id_elem.text for id_elem in id_list]

In [None]:
def build_path(term, year, datafile, base='data'):
    """
    Return the location of ``datafile`` under ``<base>/<term>/<year>/``.

    The directory (including any missing parents) is created if nothing
    exists at that location yet. The data file itself is not created.

    Parameters
    ----------
    term : str
        Used verbatim as a directory name.
    year : int
        Converted with ``str`` and used as a sub-directory name.
    datafile : str
        File name, e.g. 'pmids.txt'.
    base : str
        Root directory for all output. Defaults to ./data.

    Returns
    -------
    str
        Path to the output file.
    """
    target_dir = os.path.join(base, term, str(year))
    already_there = os.path.exists(target_dir)
    if not already_there:
        os.makedirs(target_dir)
    return os.path.join(target_dir, datafile)

In [None]:
for term in diseases:
    for year in xrange(1975, 2016):
        # NCBI permits no more than 3 requests per second.
        time.sleep(0.5)
        print '\rterm:', term, 'year:', year,
        
        pmids = extract_pmids(pubmed_for_mesh(term, year))
        outpath = build_path(term, year, 'pmids.txt')
        with open(outpath, 'w') as f:
            f.write('\n'.join(pmids))