In [1]:
from Bio import Entrez
from Bio import Medline
from tqdm import tqdm

import sqlite3

In [2]:
def get_count(term):
    """
    Use pubmed api to get a count of number of results for a search term.

    The search is sent to ESearch with ``field="DP"``, so ``term`` is matched
    against that single PubMed field rather than all fields.

    Parameters
    ----------
    term : str or int
        Search term forwarded unchanged to ``Entrez.esearch``.

    Returns
    -------
    int
        The ``Count`` value reported by ESearch.
    """

    # NCBI asks every E-utilities client to identify itself with an email.
    Entrez.email = "carl.reynolds@imperial.ac.uk"
    count_handle = Entrez.esearch(db="pubmed",
                                  sort="relevance",
                                  retmode="xml",
                                  rettype="count",
                                  field="DP",
                                  term=term)
    try:
        count_results = Entrez.read(count_handle)
    finally:
        # Release the underlying HTTP response even if parsing fails.
        count_handle.close()

    return int(count_results["Count"])

In [3]:
def chunked_pmids(term, chunksize=1000):
    """
    Use pubmed api to fetch blocks of pmids for a search term.

    The total hit count is obtained from ``get_count(term)`` (which also sets
    ``Entrez.email``), then one ESearch request is issued per page of
    ``chunksize`` ids.

    Parameters
    ----------
    term : str or int
        Search term, matched against the ``DP`` field only.
    chunksize : int, optional
        Number of ids requested per ESearch call (default 1000).

    Returns
    -------
    list
        One entry per page, each entry being the ``IdList`` of that page
        (i.e. a list of id lists, not a flat list).

    Notes
    -----
    NOTE(review): NCBI documents an upper bound on how many PubMed ids
    ESearch will page through with ``retstart``; confirm that large result
    sets are actually retrievable this way.
    """

    count = get_count(term)

    # Offset (ESearch ``retstart``) of the first record of every page.
    retstart_requests = list(range(0, count, chunksize))

    pmids = []

    print("{} blocks of to process".format(len(retstart_requests)))

    for i, retstart in enumerate(retstart_requests):

        print("Processing block {}".format(i))

        pmid_handle = Entrez.esearch(db="pubmed",
                                     sort="relevance",
                                     retmode="xml",
                                     usehistory='y',
                                     retstart=retstart,
                                     retmax=chunksize,
                                     field="DP",
                                     term=term)
        try:
            pmids.append(Entrez.read(pmid_handle)["IdList"])
        finally:
            # One handle is opened per page; close each so responses
            # do not accumulate over a long-running download.
            pmid_handle.close()

    return pmids

In [4]:
def fetch_medline(pmids):
    """
    Use pubmed api to fetch medline record for pmids.

    This is a generator: the EFetch request is sent when iteration starts and
    the response handle is closed once the records are exhausted (or the
    generator is closed/garbage-collected early).

    Parameters
    ----------
    pmids : list
        PubMed ids forwarded as the ``id`` argument of ``Entrez.efetch``.
        An empty list yields nothing and no request is made.

    Yields
    ------
    record
        Each record produced by ``Medline.parse`` for the response.
    """

    # Nothing to ask for; avoid sending an EFetch request with no ids.
    if not pmids:
        return

    Entrez.email = "carl.reynolds@imperial.ac.uk"
    handle = Entrez.efetch(db='pubmed',
                           id=pmids,
                           rettype='medline',
                           retmode='text')
    try:
        # Medline.parse reads lazily, so the handle must stay open
        # until the last record has been yielded.
        yield from Medline.parse(handle)
    finally:
        handle.close()

In [11]:
def getpapers(pmid_chunks):
    """
    Fetch the medline records for every chunk of pmids and flatten them into
    a list of tuples.

    fetch_medline(chunk) returns an iterator of medline records; we iterate
    through it and keep, for each record, the tuple
    (PMID, first author, DP, TI, JT, AB).

    Records lacking any of those fields (including records with an empty
    author list) are skipped, so the result may be shorter than the number
    of pmids requested.

    Parameters
    ----------
    pmid_chunks : iterable
        Iterable of pmid lists, one list per fetch_medline call.

    Returns
    -------
    list of tuple
        One 6-tuple per complete record, in fetch order.
    """

    papers = []

    print("fetching medline records:")

    for chunk in tqdm(pmid_chunks):
        records = fetch_medline(chunk)
        for record in records:
            try:
                papers.append((record['PMID'], record['AU'][0], record['DP'], record['TI'], record['JT'], record['AB']))
            except (KeyError, IndexError):
                # Incomplete record: a field is absent (KeyError) or the
                # author list is empty (IndexError). Skip it, but let any
                # other error (and Ctrl-C) propagate.
                continue

    return papers


In [None]:
def save_papers(papers, db_path='papers.db'):
    """
    Store paper tuples in the ``papers`` table of a SQLite database.

    The table is created on first use. ``pmid`` is the primary key and rows
    are written with INSERT OR REPLACE, so running this again with the same
    papers updates the existing rows instead of failing or duplicating them.
    (If ``papers`` already exists from an earlier run without a primary key,
    it is left as-is and rows are simply appended.)

    Parameters
    ----------
    papers : iterable of tuple
        6-tuples ``(pmid, author, date, title, journal, abstract)``.
    db_path : str, optional
        Database file to write to (default ``'papers.db'``).

    Raises
    ------
    sqlite3.Error
        If the database cannot be opened or a row does not have six values;
        nothing from the failed call is committed.
    """
    conn = sqlite3.connect(db_path)
    try:
        # `with conn` commits when the block succeeds and rolls back on error.
        with conn:
            # Create table (no-op when it is already there)
            conn.execute('''CREATE TABLE IF NOT EXISTS papers
                         (pmid PRIMARY KEY, author, date, title, journal, abstract)''')

            # Insert the rows of data
            conn.executemany('INSERT OR REPLACE INTO papers VALUES (?,?,?,?,?,?)', papers)
    finally:
        # Always release the file handle, including on failure.
        conn.close()
    

In [None]:
def fetch_papers(year):
    """
    Report how many records match ``year``, then download them.

    The pmids are requested in pages of 500 via chunked_pmids and the
    corresponding medline records are collected by getpapers, whose result
    is returned unchanged.
    """
    total = get_count(year)
    print("{} records to fetch".format(total))

    return getpapers(chunked_pmids(year, 500))

In [None]:
# Download the records for 2017. Long-running: fetch_papers pages through the
# pmids 500 at a time and then issues one medline fetch per page.
papers = fetch_papers(2017)

571779 records to fetch
1144 blocks of to process
Processing block 0
Processing block 1
Processing block 2
Processing block 3
Processing block 4
Processing block 5
Processing block 6
Processing block 7
Processing block 8
Processing block 9
Processing block 10
Processing block 11
Processing block 12
Processing block 13
Processing block 14
Processing block 15
Processing block 16
Processing block 17
Processing block 18
Processing block 19
Processing block 20
Processing block 21
Processing block 22
Processing block 23
Processing block 24
Processing block 25
Processing block 26
Processing block 27
Processing block 28
Processing block 29
Processing block 30
Processing block 31
Processing block 32
Processing block 33
Processing block 34
Processing block 35
Processing block 36
Processing block 37
Processing block 38
Processing block 39
Processing block 40
Processing block 41
Processing block 42
Processing block 43
Processing block 44
Processing block 45
Processing block 46
Processing block 47


In [15]:
# Persist the fetched (pmid, author, date, title, journal, abstract) tuples
# to the papers table in papers.db.
save_papers(papers)

In [16]:
# The cursor used by save_papers is local to that function and its connection
# is closed there, so open a fresh connection for reading.
conn = sqlite3.connect('papers.db')
try:
    # `date` holds the raw DP strings (e.g. '1900 Dec 15'), so this ordering
    # is textual rather than chronological within a year.
    for row in conn.execute('SELECT * FROM papers ORDER BY date'):
        print(row)
finally:
    conn.close()


('19866945', 'Reed W', '1900 Dec 15', 'A COMPARATIVE STUDY OF THE BIOLOGICAL CHARACTERS AND PATHOGENESIS OF BACILLUS X (STERNBERG), BACILLUS ICTEROIDES (SANARELLI), AND THE HOG-CHOLERA BACILLUS (SALMON AND SMITH).', 'The Journal of experimental medicine', '1. Bacillus X (Sternberg) belongs to the colon group. 2. Bacillus icteroides (Sanarelli) is a member of the hog-cholera group. 3. The various channels of infection, the duration of the disease and the gross and microscopical lesions in mice, guinea-pigs and rabbits are the same for Bacillus icteroides and the hog-cholera bacillus. 4. The clinical symptoms and the lesions observed in dogs inoculated intravenously with Bacillus icteroides, are reproduced in these animals by infection with the hog-cholera bacillus. 5. Bacillus icteroides when fed to the domestic pig causes fatal infection, accompanied by diphtheritic, necrotic and ulcerative lesions in the digestive tract, such as are seen in hogs when infected with the hog-cholera baci