-
Notifications
You must be signed in to change notification settings - Fork 1
/
pubmed_literature_miner.py
36 lines (34 loc) · 1.65 KB
/
pubmed_literature_miner.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
#!/usr/bin/env python3
# Author Gaurav
# Universitat Potsdam
from urllib.parse import urlencode
from urllib.request import urlopen

from bs4 import BeautifulSoup
def candidaLiteratureMiner(term=None, fetch_count=None):
    """Search PubMed for a term and return the abstracts of the hits.

    The PubMed search page is queried for ``term`` (sorted by date), the
    PubMed ids are read from the permalink found in every result's "share"
    block, and each article page is then downloaded to collect its abstract
    text.

    Keyword arguments:
    term -- search term to look up in PubMed, e.g. "candida"; it is
            URL-encoded here, so pass it unencoded
    fetch_count -- number of results requested from the search page (sent
                   as the ``size`` query parameter); when None the
                   parameter is omitted and the site's default applies
    Return:
    a list of ``(pubmed_id, abstract_text)`` tuples in search-result order;
    ``abstract_text`` is "" when the article page has no
    "abstract-content selected" block
    Raises:
    ValueError -- if ``term`` is None or blank
    """
    if term is None or not str(term).strip():
        raise ValueError("term must be a non-empty search string")

    # Build the query with urlencode so spaces and reserved characters in
    # the term cannot produce a malformed URL.
    query = {"term": term, "sort": "date"}
    if fetch_count is not None:
        query["size"] = fetch_count
    search_url = f"https://pubmed.ncbi.nlm.nih.gov/?{urlencode(query)}"

    # BeautifulSoup reads the whole response in its constructor, so the
    # connection can be closed as soon as the tree is built.
    with urlopen(search_url) as search_page:
        share_blocks = BeautifulSoup(search_page, "html.parser").find_all(
            "div", class_="share"
        )

    # The markup of each share block is scanned token by token: a
    # whitespace-separated token mentioning both "permalink" and "ncbi" is
    # expected to hold a URL ending in ".../<id>/", which makes the id the
    # second-to-last "/"-separated piece.
    ids = []
    for block in share_blocks:
        for token in str(block).split():
            if "permalink" not in token or "ncbi" not in token:
                continue
            pieces = token.split("/")
            # Skip tokens that do not carry a usable id instead of crashing
            # or requesting an id-less article URL.
            if len(pieces) >= 2 and pieces[-2]:
                ids.append(pieces[-2])

    ncbi_derive_information = {}
    # dict.fromkeys drops repeated ids while keeping first-seen order, so
    # every article page is downloaded only once.
    for pubmed_id in dict.fromkeys(ids):
        article_url = f"https://pubmed.ncbi.nlm.nih.gov/{pubmed_id}/"
        with urlopen(article_url) as article_page:
            article = BeautifulSoup(article_page, "html.parser")
        ncbi_derive_information[pubmed_id] = "".join(
            node.get_text().strip()
            for node in article.find_all(
                "div", class_="abstract-content selected"
            )
        )

    # Every collected id is non-empty, so all entries are returned, including
    # those whose abstract text is "".
    return list(ncbi_derive_information.items())