In [49]:
import requests
import urlparse
import urllib
import os
import time
import copy
import xml.etree.ElementTree as ET

In [3]:
# Load the MeSH disease terms, one per line.
with open('mesh_diseases.txt', 'r') as f:
    # Strip first and filter afterwards, so that whitespace-only lines (for
    # example a bare '\r\n') cannot slip through as empty search terms.
    diseases = [line for line in (raw.strip() for raw in f) if line]

In [42]:
def _entrez(scheme, netloc, path, **params):
    """
    Send a GET request to an E-utilities endpoint and return the body text.

    Parameters
    ----------
    scheme : str
        URL scheme, e.g. 'http'.
    netloc : str
        Network location (host) of the server.
    path : str
        Path of the E-utility script on that host.
    **params
        Query-string parameters. ``tool`` and ``email`` are always set to
        this project's values, overwriting anything the caller passed.

    Returns
    -------
    Response body as text.

    Raises
    ------
    requests.exceptions.HTTPError
        If the server answers with a 4xx or 5xx status code.
    requests.exceptions.Timeout
        If the server does not respond within the timeout.
    """
    params.update({
        'tool': 'modelorgspeirson',
        'email': 'erick.peirson@asu.edu',
    })
    query = urllib.urlencode(params)
    target = urlparse.urlunsplit((scheme, netloc, path, query, ''))

    # Without a timeout a stalled connection would block the kernel forever.
    response = requests.get(target, timeout=60)
    # Fail loudly on HTTP errors instead of handing an error page to the
    # XML parsers downstream.
    response.raise_for_status()
    return response.text


def esearch(scheme='http', netloc='eutils.ncbi.nlm.nih.gov',
            path='entrez/eutils/esearch.fcgi',
            handler=None, **params):
    """
    Perform an ESearch request.

    See http://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESearch.

    Parameters
    ----------
    scheme : str
    netloc : str
    path : str
        Components of the ESearch URL.
    handler : callable, optional
        Called with the raw response text; its return value is returned.
        Defaults to ``extract_pmids``.
    **params
        Query parameters forwarded to the request.

    Returns
    -------
    Whatever ``handler`` returns.

    NOTE(review): the default scheme is plain 'http'; confirm that the
    server still accepts non-HTTPS requests.
    """
    if handler is None:
        # Looked up at call time rather than in the signature: a default of
        # ``handler=extract_pmids`` is evaluated when this ``def`` runs and
        # raises NameError if extract_pmids has not been defined yet.
        handler = extract_pmids
    return handler(_entrez(scheme, netloc, path, **params))
    
    
def efetch(scheme='http', netloc='eutils.ncbi.nlm.nih.gov',
           path='entrez/eutils/efetch.fcgi',
           handler=lambda d: ET.fromstring(d.encode('utf-8')), **params):
    """
    Perform an EFetch request.

    Parameters
    ----------
    scheme : str
    netloc : str
    path : str
        Components of the EFetch URL.
    handler : callable
        Called with the raw response text; its return value is returned.
        The default encodes the text as UTF-8 and parses it with
        ``ET.fromstring``.
    **params
        Query parameters forwarded to the request.

    Returns
    -------
    Whatever ``handler`` returns (the parsed root element by default).
    """
    raw_response = _entrez(scheme, netloc, path, **params)
    parsed = handler(raw_response)
    return parsed

In [5]:
def pubmed_for_mesh(term, year, retmax=10000):
    """
    Run an ESearch against PubMed for one term restricted to a single year.

    Parameters
    ----------
    term : str
        Search term; the search is limited to the 'Mesh' field.
    year : int
        Used as both the minimum and the maximum publication date.
    retmax : int
        Maximum number of results to request.

    Returns
    -------
    The value returned by ``esearch`` for these parameters.
    """
    return esearch(
        db='pubmed',
        retmax=retmax,       # Number of results.
        term=term,
        field='Mesh',
        mindate=year,
        maxdate=year,        # Ranges are inclusive in NCBI.
        datetype='pdat',     # Publication date.
    )

In [6]:
def extract_pmids(result_raw):
    """
    Extract the IDs listed in a raw ESearch XML response.

    Parameters
    ----------
    result_raw : str
        XML text of the response.

    Returns
    -------
    list
        Text of every child element of ``IdList``, in document order.
        Empty when ``IdList`` has no children.

    Raises
    ------
    ValueError
        If the response has no ``IdList`` element.
    """
    if not isinstance(result_raw, bytes):
        # Hand the parser encoded bytes, as efetch's default handler does.
        result_raw = result_raw.encode('utf-8')
    root = ET.fromstring(result_raw)

    id_list = root.find('IdList')
    # Compare against None explicitly: an element without children is falsy.
    if id_list is None:
        detail = root.findtext('ERROR')
        message = 'ESearch response contains no IdList element'
        if detail:
            message = '%s: %s' % (message, detail)
        raise ValueError(message)

    # Iterate the element directly; Element.getchildren() is deprecated and
    # no longer exists in current Python versions.
    return [id_elem.text for id_elem in id_list]

In [7]:
def build_path(term, year, datafile, base='data', make=False):
    """
    Build the path ``<base>/<term>/<year>/<datafile>``.

    Parameters
    ----------
    term : str
    year : int
    datafile : str
        E.g. 'pmids.txt'
    base : str
        Base directory for data. Defaults to ./data.
    make : bool
        If True, the directory ``<base>/<term>/<year>`` is created
        (including missing parents) when it does not exist yet. If False
        (the default), nothing is created on disk.

    Returns
    -------
    str
        Path to output file.
    """
    target_dir = os.path.join(base, term, str(year))
    must_create = make and not os.path.exists(target_dir)
    if must_create:
        os.makedirs(target_dir)
    return os.path.join(target_dir, datafile)

In [8]:
for term in diseases:
    for year in xrange(1975, 2016):
        # NCBI permits no more than 3 requests per second.
        time.sleep(0.5)
        print '\rterm:', term, 'year:', year,
        
        pmids = extract_pmids(pubmed_for_mesh(term, year))
        outpath = build_path(term, year, 'pmids.txt', make=True)
        with open(outpath, 'w') as f:
            f.write('\n'.join(pmids))

term: Bacterial Infections and Mycoses year: 1981

KeyboardInterrupt: 

In [43]:
# Fetch the PubMed records for two sample PMIDs in a single request.
root = efetch(
    id=','.join(['6185069', '6794856']),
    db='pubmed',
    rettype='xml',
)

In [61]:
# Save every fetched article as its own XML file, named after its PMID.
# The files go to data/articles, which is created on first use.
article_dir = os.path.join('data', 'articles')
if not os.path.exists(article_dir):
    os.makedirs(article_dir)

for article in root.findall('PubmedArticle'):
    pmid = article.find('MedlineCitation/PMID').text
    newTree = ET.ElementTree(element=copy.deepcopy(article))
    # Writing to the empty path '' cannot succeed, so each article gets a
    # real file name derived from its PMID instead.
    newTree.write(os.path.join(article_dir, pmid + '.xml'))

In [70]:
# PMID of `article`, the loop variable left bound by the `for article in ...`
# loop in the preceding cell (i.e. the last article that loop visited).
article.find('MedlineCitation/PMID').text

'6794856'

In [52]:
# NOTE(review): `t` is not defined in any visible cell, and write() is called
# without a target; this looks like a leftover scratch cell -- confirm and remove.
t.write()

<Element 'PubmedArticle' at 0x106f90d90>

In [12]:
import numpy as np
import pandas as pd

In [73]:
# Show the first 50 values of the PMID column as a list.
# NOTE(review): `df` is only created in a cell further down the notebook, so
# this cell fails on a fresh top-to-bottom run -- confirm the intended order.
list(df.PMID[0:50])

['7346770',
 '7342204',
 '7342199',
 '7312017',
 '7312012',
 '7050888',
 '7050885',
 '7050884',
 '7043671',
 '7039058',
 '7329588',
 '7329699',
 '7330652',
 '7329801',
 '7322240',
 '7036374',
 '7036372',
 '7342471',
 '7327091',
 '7327089',
 '7043900',
 '7035787',
 '6458758',
 '7312001',
 '7311996',
 '6950290',
 '6916320',
 '6460338',
 '6460234',
 '6801837',
 '7336547',
 '7336546',
 '7330737',
 '7322916',
 '7322915',
 '7313655',
 '7313654',
 '7036371',
 '7033931',
 '6895794',
 '6797590',
 '6458888',
 '6171698',
 '6171697',
 '6171696',
 '6118786',
 '6118781',
 '6118780',
 '6118769',
 '6118761']

In [16]:
# One-column frame of PMIDs. `pmids` is whatever the harvest loop assigned
# last -- presumably the PMIDs of a single (term, year) pair; TODO confirm.
df = pd.DataFrame(data=pmids, columns=['PMID'])

In [25]:
# Draw 1000 random rows; the fixed seed makes the draw repeatable on re-run.
df.sample(1000, random_state=0)

Unnamed: 0,PMID
4011,6185069
2804,6794856
6865,6787342
9901,7009863
8967,6782863
8522,7237871
9708,7206103
6895,6455136
1890,7301149
8624,7214121
