In [1]:
from Bio import Entrez
from Bio import Medline
from tqdm import tqdm

import sqlite3

In [2]:
def get_count(term):
    """
    Use pubmed api to get a count of number of results for a search term

    A single count-only esearch request (rettype="count") is sent to the
    "pubmed" database with the search restricted to the "DP" field, and the
    "Count" entry of the parsed response is returned.

    Parameters:
        term: the search term, passed unchanged to Entrez.esearch.

    Returns:
        int: the "Count" value of the response converted with int().

    Raises:
        Whatever Entrez.esearch / Entrez.read raise; the response handle is
        closed before the exception propagates.
    """

    Entrez.email = "carl.reynolds@imperial.ac.uk"
    count_handle = Entrez.esearch(db="pubmed",
                                  sort="relevance",
                                  retmode="xml",
                                  rettype="count",
                                  field="DP",
                                  term=term)
    try:
        count_results = Entrez.read(count_handle)
    finally:
        # Release the response handle even when parsing fails.
        count_handle.close()

    return int(count_results["Count"])

In [3]:
def chunked_pmids(term, chunksize=1000):
    """
    Use pubmed api to fetch blocks of pmids for a search term

    The total number of hits is obtained from get_count(term); one esearch
    request is then issued per block, starting at offsets
    0, chunksize, 2 * chunksize, ... below that total.

    Parameters:
        term: the search term, passed unchanged to Entrez.esearch
            (searched in the "DP" field).
        chunksize: number of ids requested per block (sent as retmax).

    Returns:
        list: one entry per block, each being the "IdList" of that block's
        parsed response. Empty when get_count(term) is 0.

    Notes:
        Entrez.email is not set here; the get_count call on the first line
        sets it before any esearch request is made.
        usehistory='y' is sent with every request, but nothing from the
        history server is read back by this function.
        NOTE(review): NCBI documents a limit on how deep esearch can page
        into PubMed results via retstart -- confirm that blocks beyond that
        limit really come back populated.
    """

    count = get_count(term)

    # Offsets (retstart values) at which each block begins.
    retstart_requests = list(range(0, count, chunksize))

    pmids = []

    print("{} blocks of to process".format(len(retstart_requests)))

    for i, retstart in enumerate(retstart_requests):

        print("Processing block {}".format(i))

        pmid_handle = Entrez.esearch(db="pubmed",
                                     sort="relevance",
                                     retmode="xml",
                                     usehistory='y',
                                     retstart=retstart,
                                     retmax=chunksize,
                                     field="DP",
                                     term=term)
        try:
            pmids.append(Entrez.read(pmid_handle)["IdList"])
        finally:
            # One handle is opened per block; release each one promptly.
            pmid_handle.close()

    return pmids

In [4]:
def fetch_medline(pmids):
    """
    Use pubmed api to fetch medline record for pmids

    The efetch request (rettype='medline', retmode='text') is issued
    immediately; parsing with Medline.parse happens lazily as the returned
    generator is iterated.

    Parameters:
        pmids: the ids to fetch, passed unchanged as the ``id`` argument of
            Entrez.efetch.

    Returns:
        generator: yields the records produced by Medline.parse. The
        response handle is closed once the generator is exhausted or
        closed after iteration has started.
    """

    Entrez.email = "carl.reynolds@imperial.ac.uk"
    handle = Entrez.efetch(db='pubmed',
                           id=pmids,
                           rettype='medline',
                           retmode='text')

    def _records():
        # Keep parsing lazy, but release the response handle once the
        # caller has consumed the records (or iteration stops early).
        try:
            for record in Medline.parse(handle):
                yield record
        finally:
            handle.close()

    return _records()

In [5]:
def getpapers(pmid_chunks):
    """
    Fetch medline records for every chunk of pmids and flatten them to rows.

    fetch_medline(chunk) is called once per chunk and its records are
    iterated. Each record becomes one 8-item tuple:

        (PMID, AU joined with ', ', DP, TI, JT,
         MH joined with ', ', PT joined with ', ', AB)

    A record that lacks any of these keys (or whose AU/MH/PT value cannot
    be joined) is skipped, so the result may hold fewer rows than pmids.

    Parameters:
        pmid_chunks: iterable of pmid chunks, each handed to fetch_medline.

    Returns:
        list: the tuples described above, in the order they were fetched.
    """

    papers = []

    print("fetching medline records:")

    for chunk in tqdm(pmid_chunks):
        for record in fetch_medline(chunk):
            try:
                papers.append((record['PMID'],
                               ', '.join(record['AU']),
                               record['DP'],
                               record['TI'],
                               record['JT'],
                               ', '.join(record['MH']),
                               ', '.join(record['PT']),
                               record['AB']))
            except (KeyError, TypeError):
                # Best effort: drop incomplete or malformed records, but let
                # KeyboardInterrupt / SystemExit stop a long-running fetch.
                continue

    return papers


In [16]:
def save_papers(papers, db_path='papers.db'):
    """
    Write paper rows to a fresh ``papers`` table in a SQLite database.

    Any existing ``papers`` table is dropped and recreated with the untyped
    columns (pmid, author, date, title, journal, key_words, pub_type,
    abstract), so each call replaces whatever was stored before.

    Parameters:
        papers: iterable of 8-item sequences, one per row, in the column
            order listed above.
        db_path: path of the SQLite database file; defaults to 'papers.db'.

    Raises:
        sqlite3.Error: if any statement fails (for example a row with the
        wrong number of items). The connection is closed either way.
    """
    conn = sqlite3.connect(db_path)
    try:
        c = conn.cursor()

        # Drop table if already exists
        c.execute("DROP TABLE IF EXISTS papers")

        # Create table
        c.execute('''CREATE TABLE papers
             (pmid, author, date, title, journal, key_words, pub_type, abstract)''')

        # Insert the rows of data
        c.executemany('INSERT INTO papers VALUES (?,?,?,?,?,?,?,?)', papers)

        # Save (commit) the changes
        conn.commit()
    finally:
        # Always release the database file; inserts that were not committed
        # are discarded when the connection closes.
        conn.close()
    

In [7]:
def fetch_papers(year, chunksize=500):
    """
    Fetch the paper rows for everything matching ``year``.

    Prints the hit count reported by get_count(year), collects the pmids in
    blocks with chunked_pmids, then turns them into rows with getpapers.

    Parameters:
        year: the search term handed to get_count and chunked_pmids.
        chunksize: number of pmids per block passed to chunked_pmids;
            defaults to 500.

    Returns:
        The value returned by getpapers for the collected pmid blocks.
    """

    print("{} records to fetch".format(get_count(year)))
    pmid_chunks = chunked_pmids(year, chunksize)
    papers = getpapers(pmid_chunks)

    return papers

In [8]:
# Download every PubMed record matching the term 2017 (the helpers search the
# "DP" field). This is slow: the run recorded below took about 58 minutes for
# 1159 blocks, so avoid re-executing this cell casually.
papers = fetch_papers(2017)

579352 records to fetch
1159 blocks of to process
Processing block 0
Processing block 1
Processing block 2
Processing block 3
Processing block 4
Processing block 5
Processing block 6
Processing block 7
Processing block 8
Processing block 9
Processing block 10
Processing block 11
Processing block 12
Processing block 13
Processing block 14
Processing block 15
Processing block 16
Processing block 17
Processing block 18
Processing block 19
Processing block 20
Processing block 21
Processing block 22
Processing block 23
Processing block 24
Processing block 25
Processing block 26
Processing block 27
Processing block 28
Processing block 29
Processing block 30
Processing block 31
Processing block 32
Processing block 33
Processing block 34
Processing block 35
Processing block 36
Processing block 37
Processing block 38
Processing block 39
Processing block 40
Processing block 41
Processing block 42
Processing block 43
Processing block 44
Processing block 45
Processing block 46
Processing block 47


  0%|          | 0/1159 [00:00<?, ?it/s]

fetching medline records:


100%|██████████| 1159/1159 [57:58<00:00,  2.93s/it] 


In [17]:
# Persist the fetched rows to papers.db. save_papers drops and recreates the
# `papers` table, so this replaces anything stored by an earlier run.
save_papers(papers)

In [18]:
# Sanity check: reopen the database and show the first five rows by date.
# The connection and cursor are left open under the same names as before.
conn = sqlite3.connect('papers.db')
c = conn.cursor()

c.execute('SELECT * FROM papers ORDER BY date LIMIT 5')
for row in c.fetchall():
    print(row)


('28423065', 'Gil E, Na SJ, Ryu JA, Lee DS, Chung CR, Cho YH, Jeon K, Sung K, Suh GY, Yang JH', '2017', 'Association of body mass index with clinical outcomes for in-hospital cardiac arrest adult patients following extracorporeal cardiopulmonary resuscitation.', 'PloS one', 'Adolescent, Adult, Aged, Aged, 80 and over, Body Mass Index, Cardiopulmonary Resuscitation/*methods, *Extracorporeal Membrane Oxygenation, Female, Heart Arrest/complications/*diagnosis/mortality/surgery, Hospital Mortality, Humans, Male, Middle Aged, Obesity/complications/*diagnosis/mortality/surgery, Patient Discharge/statistics & numerical data, *Registries, Republic of Korea, Retrospective Studies, Survival Rate, Time Factors, Treatment Outcome', 'Journal Article, Observational Study', 'BACKGROUND: Obesity might be associated with disturbance of cannulation in situation of extracorporeal cardiopulmonary resuscitation (ECPR). However, limited data are available on obesity in the setting of ECPR. Therefore, we inv