In [45]:
from Bio import Entrez
from Bio import Medline
import datetime 

# Contact address that Bio.Entrez attaches to the E-utilities requests below,
# so NCBI can identify who is issuing the queries.
Entrez.email = "bgreshake@googlemail.com"

# PubMed query: title/abstract mentions "gender" or "sex", combined with
# title/abstract mentions of "fMRI" or "EEG".
search_term = "((gender[Title/Abstract]) OR sex[Title/Abstract]) AND (fMRI[Title/Abstract] OR EEG[Title/Abstract])"

In [46]:
# timestamp of this crawl; used further down to name the output files
crawltime = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")

# get number of articles we need to fetch
handle = Entrez.esearch(db="pubmed", term=search_term)
try:
    record = Entrez.read(handle)
finally:
    # release the connection even if parsing the response fails
    handle.close()

# total hit count as reported by PubMed; passed on as retmax in the next query
article_number = record["Count"]

In [47]:
# get the IDs of all matching articles from pubmed
handle = Entrez.esearch(db="pubmed", term=search_term, retmax=article_number)
try:
    record = Entrez.read(handle)
finally:
    # release the connection even if parsing the response fails
    handle.close()
id_list = record["IdList"]

# NOTE(review): NCBI may cap how many IDs a single esearch call returns, so
# make a truncated result visible instead of silently crawling a subset.
if len(id_list) != int(article_number):
    print("WARNING: expected %s PubMed IDs but received %d"
          % (article_number, len(id_list)))

In [48]:
# split input list in chunks of CHUNK_SIZE so pubmed will honor our requests
CHUNK_SIZE = 150

id_list_chunks = [id_list[start:start + CHUNK_SIZE]
                  for start in range(0, len(id_list), CHUNK_SIZE)]

# sanity check: size of the last (possibly partial) chunk, 0 when the search
# returned no IDs at all; print(...) with one argument runs on Python 2 and 3
print(len(id_list_chunks[-1]) if id_list_chunks else 0)

3


In [49]:
# fetch all records from pubmed, parse as medline format

all_records = []
for id_chunk in id_list_chunks:
    handle = Entrez.efetch(db="pubmed", id=id_chunk, rettype="medline",
                           retmode="text")
    try:
        # Medline.parse yields records while reading from the handle, so it
        # has to be consumed completely before the handle is closed
        all_records.extend(Medline.parse(handle))
    finally:
        # close the connection even if fetching/parsing this chunk fails
        handle.close()

In [62]:
# go through all records, see whether a DOI is associated and save pubmed ID + DOI
# "../data/<crawltime>-pubmed-crawl.csv" contains all pubmed IDs, title, date published plus their DOI
# "../data/<crawltime>-pubmed-crawl-no-doi.csv" contains all pubmed IDs for which no DOI was found, w/ title & DP
# both files have the full search term used for the search in the first "#"-prefixed comment line
# followed by a header; columns are tab-separated despite the .csv extension

outfile = "../data/%s-pubmed-crawl.csv" % crawltime
no_doi_outfile = "../data/%s-pubmed-crawl-no-doi.csv" % crawltime

# "with" guarantees both files are flushed and closed even if a record fails
with open(outfile, "w") as output, open(no_doi_outfile, "w") as no_doi_output:
    output.write("#search term: %s \n" % search_term)
    output.write("pubmed_ID\ttitle\tdate_published\tDOI\n")

    no_doi_output.write("#search term: %s \n" % search_term)
    no_doi_output.write("pubmed_ID\ttitle\tdate_published\n")

    for record in all_records:
        # AID entries carry a type tag, e.g. "10.1000/xyz [doi]"; drop the tag
        # and the whitespace in front of it so the DOI column holds a clean
        # value. If several DOI entries exist, the last one wins.
        doi = None
        for aid in record.get("AID", []):
            if "[doi]" in aid:
                doi = aid.replace("[doi]", "").strip()

        # missing fields are written as "NA"
        date_published = record.get("DP", "NA")
        title = record.get("TI", "NA")

        if doi:
            output.write("%s\t%s\t%s\t%s\n" % (record["PMID"], title, date_published, doi))
        else:
            no_doi_output.write("%s\t%s\t%s\n" % (record["PMID"], title, date_published))

In [57]:
# quick look at the first fetched record: publication date, then title
first_record = all_records[0]
for field in ("DP", "TI"):
    print(first_record[field])

2017 Aug 31
Wake up to sleep: The effects of lacosamide on daytime sleepiness in adults with epilepsy.
