In [32]:
#Links
#LINK1: https://www.ncbi.nlm.nih.gov/home/develop/api/
#LINK2: https://colab.research.google.com/drive/1VOuvANFR08twLBROqYwO_TV34pazgF0_
#LINK3: https://www.nlm.nih.gov/pubs/techbull/mj12/mj12_pm_author_ranking.html
#LINK4: https://pubmed.ncbi.nlm.nih.gov/?term=Kumar+V&cauthor_id=32489811
#LINK5: https://www.ncbi.nlm.nih.gov/pmc/pmctopmid/#converter
#
#Notes:
# Relevant:
#   The current preconditons include having the Name and Organization previously, with the additional constraint
# of a required computed author id that is generated from PubMed.
#   The date has the months in 3/4 letter abbreviations for some ungodly reason. Will have to convert later.
#   As of now, I return the DOI link, rather than the study it actually leads to. Ran out of time, but I think
# this is easily fixable.
#   The topics generated are the keywords from each article pulled. This is not a good way to do this, but should
# be fine for a start. There exists MeSH headings on some studies that will work much better for directly pulling,
# but ideally we just do something else entirely (Some early NLP? Maybe we just don't use tags here)
#
# Other:
#   There exists an 'affliation' tag on each study published that lists the author's university/location
# and potentially an email, which could be useful.

from bs4 import BeautifulSoup
import requests
import urllib.request
import json
import time

In [28]:
#Prior information
name = 'First Last'
org = 'Organization'
email = 'Email'

#Fill in prior information
base = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
#Database searching
db = "pubmed"
#Output format
ret = "json"
#Max number of results
retnum = '5'
#Search Query - reorganize name into the format 'Last+F', though I think including the entire first name also works
name = "Kumar+V"
#Computed author id
cauthor = '32489811'
#Assemble elink
url = base + "esearch.fcgi?db=" + db + "&retmode=" + ret + "&retmax=" + retnum + "&term=" + name + "&cauthor_id=" + cauthor
#Test the esearch URL
print(url)

https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&retmode=json&retmax=5&term=Kumar+V&cauthor_id=32489811


In [29]:
#Add search results to a list
webpage = urllib.request.urlopen(url).read()
dict_page =json.loads(webpage)
idlist = dict_page["esearchresult"]["idlist"]

#Test the search results
print(idlist)

['33618251', '33617872', '33615940', '33615185', '33613829']


In [73]:
def generate_study_and_tags(soup):
    
    ### Study Section ###
    study = {'title': 'None', 'publication date': 'None', 'pdf link': 'None', 'description': 'None'}
    
    # Article Name #
    name = soup.find('articletitle')
    if name:
        study['title'] = name.text
    
    # Publication Date #
    pubdate = soup.find('pubdate')
    date = 'None'
    if pubdate.year:
        date = pubdate.year.text
        if pubdate.month:
            date += '/' + pubdate.month.text
            if pubdate.day:
                date += '/' + pubdate.day.text
    study['publication date'] = date
    
    # Pdf Link #
    doi = soup.find('articleid', idtype = "doi")
    if doi:
        study['pdf link'] = 'https://doi.org/' + doi.text
    
    # Abstract #
    desc = soup.find('abstracttext')
    if desc:
        study['description'] = desc.text
    
    ### Tags Section ###
    tags = []
    keywords = soup.find_all('keyword')
    for keyword in keywords:
        tags.append(keyword.text) 
    #Could add MeSH tags if exist, would probably be better than these
    
    return study, tags

In [74]:
#Setup dictionary
researcher = {'name': name, 'organization': org, 'topics': 'None', 'studies': 'None'}
studies = []
topics = []

for link in idlist:
    url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&retmode=xml&id=idlist"
    url = url.replace('idlist', link)
    
    r = requests.get(url)
    soup = BeautifulSoup(r.content, "html.parser")
    
    study, tags = generate_study_and_tags(soup)
    
    studies.append(study)
    topics = topics + tags
    
    #Need to sleep otherwise we get 
    time.sleep(1)

if studies:
    researcher['studies'] = studies
if topics:
    researcher['topics'] = topics
print(researcher)

{'name': 'Kumar+V', 'organization': 'asd', 'topics': ['1H-1,2,3-Triazole', '4-Aminoquinoline', 'Anti-plasmodial activity', 'Benzoxaborole', 'Cytotoxicity', 'Suzuki-miyaura reaction', 'Ethanol', 'fasting', 'immunohistochemistry', 'lipid droplet', 'liver', 'mass spectrometry', 'mitochondria', 'perilipins', 'proteomics', 'steatosis', ' Caenorhabditis elegans ', ' Pseudomonas aeruginosa ', 'Anti-quorum sensing', 'biofilm', 'vanillin', 'FEA of screws', 'mandible fracture', 'open reduction and internal fixation', 'self-drilling screws', 'self-tapping screws'], 'studies': [{'title': 'Synthesis and antiplasmodial evaluation of 1H-1,2,3-triazole grafted 4-aminoquinoline-benzoxaborole hybrids and benzoxaborole analogues.', 'publication date': '2021/Feb/16', 'pdf link': 'https://doi.org/10.1016/j.bioorg.2021.104733', 'description': 'A library of 1H-1,2,3-triazole-tethered 4-aminoquinoline-benzoxaborole hybrids as well as aryl substituted benzoxaborole analogues was synthesized and screened for th