-
Notifications
You must be signed in to change notification settings - Fork 0
/
Get_PubMed.py
60 lines (51 loc) · 2 KB
/
Get_PubMed.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
from Bio import Entrez
import json
import pymongo
def number_of_entries(query):
    """Return the number of PubMed records matching *query*.

    Runs an Entrez EGQuery for ``query`` and returns the count reported
    in the row whose ``DbName`` is ``pubmed``.

    Args:
        query: Search term passed to ``Entrez.egquery``.

    Returns:
        The ``Count`` value of the ``pubmed`` row, converted to ``int``.

    Raises:
        ValueError: If the result has no ``pubmed`` row, or if its count
            cannot be converted to an integer.
    """
    # NOTE(review): EGQuery may no longer be offered by NCBI / Biopython —
    # confirm availability against the installed Biopython version.
    Entrez.email = 'mikael@koutero.name'
    handle = Entrez.egquery(term=query)
    try:
        record = Entrez.read(handle)
    finally:
        # Release the underlying connection even if parsing fails.
        handle.close()

    for row in record["eGQueryResult"]:
        if row["DbName"] == "pubmed":
            return int(row["Count"])

    # Previously this case fell through to an unbound local variable.
    raise ValueError(
        "EGQuery result for %r contains no 'pubmed' entry" % (query,)
    )
# https://marcobonzanini.com/2015/01/12/searching-pubmed-with-python/
def search(query, retstart):
    """Run a PubMed ESearch for *query* and return the parsed result.

    Args:
        query: Search term passed to ``Entrez.esearch``.
        retstart: Offset of the first record to return, used for paging.

    Returns:
        The object produced by ``Entrez.read`` for the ESearch response;
        the caller in this file reads its ``'IdList'`` entry.
    """
    Entrez.email = 'mikael@koutero.name'
    # NOTE(review): up to 100000 ids are requested per call; NCBI may cap
    # retmax lower for PubMed — confirm against the E-utilities docs.
    handle = Entrez.esearch(db='pubmed',
                            sort='relevance',
                            retmode='xml',
                            retstart=retstart,
                            retmax=100000,
                            term=query)
    try:
        results = Entrez.read(handle)
    finally:
        # Release the underlying connection even if parsing fails.
        handle.close()
    return results
def fetch_details(id_list, retstart):
    """Fetch PubMed records for *id_list* and return the parsed result.

    All ids are joined into one comma-separated string and sent in a
    single EFetch request; ``retstart`` together with the fixed
    ``retmax`` of 10000 selects which slice of that id list is returned.

    Args:
        id_list: Iterable of PubMed id strings.
        retstart: Offset into ``id_list`` of the first record to return.

    Returns:
        The object produced by ``Entrez.read`` for the EFetch response.
    """
    ids = ','.join(id_list)
    Entrez.email = 'mikael@koutero.name'
    handle = Entrez.efetch(db='pubmed',
                           retmode='xml',
                           retstart=retstart,
                           retmax=10000,
                           id=ids)
    try:
        results = Entrez.read(handle)
    finally:
        # Release the underlying connection even if parsing fails.
        handle.close()
    return results
if __name__ == '__main__':
    # Download every PubMed record matching the query and store each one
    # as a document in the local MongoDB 'publishorperish.papers_info'
    # collection.

    # The same term drives the count and every page of the search.
    query = 'bacteria'
    entries_num = number_of_entries(query)

    host_string = "mongodb://localhost"
    port = 27017
    mongo_client = pymongo.MongoClient(host_string, port)
    try:
        # get a reference to mongodb 'publishorperish'
        mongo_db = mongo_client['publishorperish']
        # get a reference to papers collection
        papers_info = mongo_db['papers_info']

        # number of entries :1929202
        # Outer loop pages through the search results 100000 ids at a
        # time (the retmax used by search()).
        for x in range(0, entries_num, 100000):
            results = search(query=query, retstart=x)
            id_list = results['IdList']
            # Inner loop pages through that id list 10000 records at a
            # time (the retmax used by fetch_details()).
            for y in range(0, len(id_list), 10000):
                papers = fetch_details(id_list=id_list, retstart=y)
                # Depending on the Biopython version, the parsed EFetch
                # result is either a list of articles or a dict holding
                # them under 'PubmedArticle'; accept both shapes.
                if isinstance(papers, dict):
                    articles = papers.get('PubmedArticle', [])
                else:
                    articles = papers
                for paper in articles:
                    papers_info.insert_one(paper)
    finally:
        # Close the MongoDB connection even if a download step fails.
        mongo_client.close()
# mongodb fiddling, just in case any duplicate
# (mongo shell command, run manually)
# db.papers_info.createIndex( {"MedlineCitation.PMID": 1}, {unique: true, dropDups: true} )