In [6]:
from Bio import Entrez
from Bio import Medline

In [7]:
def get_count(term):
    """Return how many PubMed records match ``term`` in the "DP" field.

    Sends one ESearch request with ``rettype="count"`` and converts the
    "Count" entry of the parsed response to an integer.

    Parameters
    ----------
    term : str
        Search term matched against the "DP" (date of publication) field,
        e.g. a year such as ``"2017"``.

    Returns
    -------
    int
        The "Count" value reported by ESearch.

    Notes
    -----
    Sets ``Entrez.email`` as a side effect on every call.
    """
    Entrez.email = "carl.reynolds@imperial.ac.uk"
    count_handle = Entrez.esearch(db="pubmed",
                                  sort="relevance",
                                  retmode="xml",
                                  rettype="count",
                                  field="DP",
                                  term=term)
    try:
        count_results = Entrez.read(count_handle)
    finally:
        # Release the HTTP response even when parsing raises.
        count_handle.close()

    return int(count_results["Count"])

In [8]:
def chunked_pmids(term, chunksize=1000):
    """Collect the PubMed IDs matching ``term`` as a list of per-request chunks.

    The total hit count is obtained from ``get_count`` and the result set is
    then paged through with one ESearch request per ``chunksize`` IDs.

    Parameters
    ----------
    term : str
        Search term matched against the "DP" field.
    chunksize : int, optional
        Number of IDs requested per ESearch call (passed as ``retmax``).

    Returns
    -------
    list
        One entry per request, each being the "IdList" of that response.
        The chunks are NOT flattened.

    Raises
    ------
    ValueError
        If ``chunksize`` is not a positive number.

    Notes
    -----
    NOTE(review): NCBI's E-utilities documentation states that ESearch on
    PubMed only serves the first 10,000 matches; requests with a larger
    ``retstart`` may fail -- confirm against the current API before relying
    on this for large result sets.
    """
    if chunksize < 1:
        # range() would raise an opaque error for 0 and silently yield no
        # blocks for negative steps; fail early and clearly instead.
        raise ValueError("chunksize must be a positive integer")

    count = get_count(term)

    # Offsets (retstart values) at which each block begins.
    retstarts = list(range(0, count, chunksize))

    print("{} blocks to process".format(len(retstarts)))

    pmids = []

    for i, retstart in enumerate(retstarts):

        print("Processing block {}".format(i))

        # NOTE(review): usehistory='y' is sent, but only "IdList" is read from
        # the response here; the history server result is not used.
        pmid_handle = Entrez.esearch(db="pubmed",
                                     sort="relevance",
                                     retmode="xml",
                                     usehistory='y',
                                     retstart=retstart,
                                     retmax=chunksize,
                                     field="DP",
                                     term=term)
        try:
            pmids.append(Entrez.read(pmid_handle)["IdList"])
        finally:
            # Do not leak one open HTTP response per block.
            pmid_handle.close()

    return pmids

In [9]:
def fetch_medline(pmids):
    """Fetch MEDLINE-format records for the given PubMed IDs.

    The EFetch request is issued immediately; the response is parsed lazily
    as the returned iterator is consumed, and the underlying handle is closed
    once the iterator is exhausted or discarded.

    Parameters
    ----------
    pmids : sequence
        PubMed IDs passed to EFetch as ``id``. An empty sequence yields no
        records and sends no request.

    Returns
    -------
    iterator
        Iterator over the records produced by ``Medline.parse``.

    Notes
    -----
    Sets ``Entrez.email`` as a side effect on every call.
    """
    Entrez.email = "carl.reynolds@imperial.ac.uk"

    if not pmids:
        # Nothing to look up; avoid sending a request without any IDs.
        return iter(())

    handle = Entrez.efetch(db='pubmed',
                           id=pmids,
                           rettype='medline',
                           retmode='text')

    def _iter_records():
        # Parsing is lazy, so the handle must stay open until the caller has
        # finished iterating; the finally clause then releases it.
        try:
            for record in Medline.parse(handle):
                yield record
        finally:
            handle.close()

    return _iter_records()

In [10]:
def make_results_dict(pmid_chunks):
    """Fetch the MEDLINE records for every chunk of PubMed IDs, keyed by PMID.

    ``fetch_medline(chunk)`` returns an iterator of MEDLINE records; each
    record is stored in the result dict as soon as it is read, under the
    value of ``record.get('PMID')``.

    Parameters
    ----------
    pmid_chunks : iterable
        Iterable of PubMed ID chunks; each chunk is passed unchanged to
        ``fetch_medline``.

    Returns
    -------
    dict
        Maps ``record.get('PMID')`` to the record. If several records share
        a key, the one read last wins.
    """
    results = {}

    for chunk in tqdm(pmid_chunks):
        # Index records as they stream in instead of buffering them in a
        # separate list first.
        for record in fetch_medline(chunk):
            results[record.get('PMID')] = record

    return results


In [11]:
def fetch_a_year_of_medline(year):
    """Download every MEDLINE record matching ``year`` and return them by PMID.

    Prints the number of matching records, collects the matching PubMed IDs
    in chunks of 1000 via ``chunked_pmids``, and hands those chunks to
    ``make_results_dict``.

    Parameters
    ----------
    year : str
        Search term forwarded to ``get_count`` and ``chunked_pmids``.

    Returns
    -------
    dict
        Whatever ``make_results_dict`` returns for the collected chunks.
    """
    total = get_count(year)
    print("{} records to fetch".format(total))

    return make_results_dict(chunked_pmids(year, 1000))

In [None]:
# Fetch all records matching '2017' into a dict keyed by PMID. This is
# long-running: one ESearch request per block of 1000 IDs, then one EFetch
# request per block.
results = fetch_a_year_of_medline('2017')

571781 records to fetch
572 blocks to process
Processing block 0
Processing block 1
Processing block 2
Processing block 3
Processing block 4
Processing block 5
Processing block 6
Processing block 7
Processing block 8
Processing block 9
Processing block 10
Processing block 11
Processing block 12
Processing block 13
Processing block 14
Processing block 15
Processing block 16
Processing block 17
Processing block 18
Processing block 19
Processing block 20
Processing block 21
Processing block 22
Processing block 23
