In [1]:
from Bio import Entrez
from Bio import Medline
from tqdm import tqdm

import sqlite3

In [2]:
def get_count(term):
    """
    Use the PubMed API to get the number of results for a search term.

    Parameters
    ----------
    term : str
        Query string, passed to ESearch unchanged.

    Returns
    -------
    int
        The "Count" field of the ESearch response, converted to int.
    """

    # Contact address attached to the Entrez requests made by this notebook.
    Entrez.email = "carl.reynolds@imperial.ac.uk"
    count_handle = Entrez.esearch(db="pubmed",
                                  sort="relevance",
                                  retmode="xml",
                                  rettype="count",
                                  term=term)
    try:
        count_results = Entrez.read(count_handle)
    finally:
        # Release the response even when parsing fails.
        count_handle.close()

    return int(count_results["Count"])

In [3]:
def chunked_pmids(term, chunksize=1000):
    """
    Use the PubMed API to fetch the PMIDs matching a search term, page by page.

    Parameters
    ----------
    term : str
        Query string, passed to ESearch unchanged.
    chunksize : int, optional
        Number of ids requested per ESearch call (sent as ``retmax``).

    Returns
    -------
    list of list
        One inner list per request, each holding the "IdList" of that page,
        in request order. Empty when the term has no matches.

    Notes
    -----
    ``Entrez.email`` is not set here; it is set as a side effect of the
    ``get_count`` call at the top of the function.

    NOTE(review): NCBI documents that PubMed ESearch only serves the first
    10,000 matches of a query -- confirm before reusing this with broader
    search terms.
    """

    count = get_count(term)

    # Offset of the first record of every page: 0, chunksize, 2*chunksize, ...
    retstarts = list(range(0, count, chunksize))

    pmids = []

    print("{} blocks of to process".format(len(retstarts)))

    for i, retstart in enumerate(retstarts):

        print("Processing block {}".format(i))

        pmid_handle = Entrez.esearch(db="pubmed",
                                     sort="relevance",
                                     retmode="xml",
                                     usehistory='y',
                                     retstart=retstart,
                                     retmax=chunksize,
                                     term=term)
        try:
            id_block = Entrez.read(pmid_handle)["IdList"]
        finally:
            # Release the response even when parsing fails.
            pmid_handle.close()

        pmids.append(id_block)

    return pmids

In [4]:
def fetch_medline(pmids):
    """
    Request the MEDLINE-format text for the given PubMed ids and hand it to
    the Medline parser.

    The EFetch request is issued immediately; the value returned is whatever
    ``Medline.parse`` produces for the open response handle. The handle is
    not closed here because the parser result is returned unconsumed.
    """

    Entrez.email = "carl.reynolds@imperial.ac.uk"

    efetch_options = dict(db='pubmed',
                          id=pmids,
                          rettype='medline',
                          retmode='text')

    return Medline.parse(Entrez.efetch(**efetch_options))

In [5]:
def getpapers(pmid_chunks):
    """
    Fetch the MEDLINE records for every chunk of PubMed ids and flatten them
    into rows.

    Parameters
    ----------
    pmid_chunks : iterable
        Chunks of PubMed ids; each chunk is passed to ``fetch_medline`` as is.

    Returns
    -------
    list of tuple
        One 8-tuple per complete record, built from the MEDLINE fields
        ``(PMID, AU, DP, TI, JT, MH, PT, AB)``. The multi-valued fields
        AU, MH and PT are joined with ", ".

    Notes
    -----
    A record missing any one of these fields is skipped. Every other error
    propagates to the caller.
    """

    papers = []

    print("fetching medline records:")

    for chunk in tqdm(pmid_chunks):
        for record in fetch_medline(chunk):
            try:
                row = (record['PMID'], ', '.join(record['AU']), record['DP'],
                       record['TI'], record['JT'], ', '.join(record['MH']),
                       ', '.join(record['PT']), record['AB'])
            except KeyError:
                # Incomplete record (e.g. no abstract): leave it out of the
                # result. Only KeyError is caught so that interrupts and
                # genuine failures are not silently discarded.
                continue
            papers.append(row)

    return papers


In [6]:
def save_papers(papers, db_path='papers.db'):
    """
    Replace the ``papers`` table of a SQLite database with the given rows.

    Parameters
    ----------
    papers : iterable of sequence
        Rows of eight values each, in the column order
        (pmid, author, date, title, journal, key_words, pub_type, abstract).
    db_path : str, optional
        SQLite file to write to. Defaults to 'papers.db' in the working
        directory.

    Raises
    ------
    sqlite3.Error
        If any statement fails, e.g. a row does not have eight values. The
        drop, create and insert run in a single transaction, so on failure
        the table that existed before the call is left untouched.
    """
    # isolation_level=None stops the sqlite3 module from opening transactions
    # implicitly, so the explicit BEGIN below also covers the DDL statements.
    conn = sqlite3.connect(db_path, isolation_level=None)
    try:
        c = conn.cursor()
        c.execute("BEGIN")
        try:
            # Drop table if it already exists
            c.execute("DROP TABLE IF EXISTS papers")

            # Create table
            c.execute('''CREATE TABLE papers
                     (pmid, author, date, title, journal, key_words, pub_type, abstract)''')

            # Insert the rows of data
            c.executemany('INSERT INTO papers VALUES (?,?,?,?,?,?,?,?)', papers)

            # Save (commit) the changes
            conn.commit()
        except BaseException:
            # Undo the drop/create so the previous table survives a failed load.
            conn.rollback()
            raise
    finally:
        # Always release the file handle, whether or not the load succeeded.
        conn.close()
    

In [7]:
def fetch_papers(year):
    """
    Run the whole download pipeline for one search term.

    Prints the number of matching records, collects the matching PubMed ids
    in chunks of 500, and returns the rows that ``getpapers`` builds from
    those chunks.

    Despite its name, ``year`` is forwarded unchanged as the search term to
    ``get_count`` and ``chunked_pmids``.
    """
    total = get_count(year)
    print("{} records to fetch".format(total))

    return getpapers(chunked_pmids(year, 500))

In [8]:
# Run the full pipeline for the search term. This issues live PubMed requests
# (a count, the id pages, then the MEDLINE records).
papers = fetch_papers("Idiopathic Pulmonary Fibrosis")

7150 records to fetch
15 blocks of to process
Processing block 0
Processing block 1
Processing block 2
Processing block 3
Processing block 4
Processing block 5
Processing block 6
Processing block 7
Processing block 8
Processing block 9
Processing block 10
Processing block 11
Processing block 12
Processing block 13
Processing block 14


  0%|          | 0/15 [00:00<?, ?it/s]

fetching medline records:


100%|██████████| 15/15 [00:44<00:00,  2.26s/it]


In [9]:
# Persist the fetched rows to the local SQLite file; any existing `papers`
# table is dropped and recreated.
save_papers(papers)

In [10]:
# Sanity check: print the first five stored rows when ordered by `date`.
# The column holds the MEDLINE "DP" strings (e.g. '1965 Apr 10'), so this
# ordering is textual rather than strictly chronological.
conn = sqlite3.connect('papers.db')
try:
    c = conn.cursor()

    for row in c.execute('SELECT * FROM papers ORDER BY date LIMIT 5'):
        print(row)
finally:
    # Release the database file once the preview has been printed.
    conn.close()


('14272497', 'KOCH B', '1965 Apr 10', 'FAMILIAL FIBROCYSTIC PULMONARY DYSPLASIA: OBSERVATIONS IN ONE FAMILY.', 'Canadian Medical Association journal', '*Carcinoma, *Carcinoma, Bronchogenic, *Carcinoma, Squamous Cell, *Cyanosis, *Diagnosis, *Dyspnea, *Genetics, Medical, Humans, *Hypertension, *Hypertension, Pulmonary, *Idiopathic Pulmonary Fibrosis, *Joint Diseases, *Lung, *Lung Diseases, *Lung Neoplasms, *Osteoarthropathy, Secondary Hypertrophic, *Pathology, *Polycythemia, *Pulmonary Fibrosis, *Sweating', 'Journal Article', 'At least 31 cases of familial fibrocystic pulmonary dysplasia, within 10 families, have been described in the world literature. The mode of genetic transmission of this disease, however, has been uncertain until now. The author observed three unequivocal and five probable cases of familial fibrocystic pulmonary dysplasia among 56 members of one family. Diagnostic criteria included progressive dyspnea and cyanosis, digital clubbing, pulmonary hypertension, negative 