In [1]:
#Links
#LINK1: https://www.ncbi.nlm.nih.gov/home/develop/api/
#LINK2: https://colab.research.google.com/drive/1VOuvANFR08twLBROqYwO_TV34pazgF0_
#LINK3: https://www.nlm.nih.gov/pubs/techbull/mj12/mj12_pm_author_ranking.html
#LINK4: https://pubmed.ncbi.nlm.nih.gov/?term=Kumar+V&cauthor_id=32489811
#LINK5: https://www.ncbi.nlm.nih.gov/pmc/pmctopmid/#converter
#
#Notes:
# Relevant:
#   The current preconditons include having the Name and Organization previously, with the additional constraint
# of a required computed author id that is generated from PubMed.
#   The date has the months in 3/4 letter abbreviations for some ungodly reason. Will have to convert later.
#   As of now, I return the DOI link, rather than the study it actually leads to. Ran out of time, but I think
# this is easily fixable.
#   The topics generated are the keywords from each article pulled. This is not a good way to do this, but should
# be fine for a start. There exists MeSH headings on some studies that will work much better for directly pulling,
# but ideally we just do something else entirely (Some early NLP? Maybe we just don't use tags here)
#
# Other:
#   There exists an 'affliation' tag on each study published that lists the author's university/location
# and potentially an email, which could be useful.

from bs4 import BeautifulSoup
import requests
import urllib.request
import json
import time

In [2]:
def generate_study_and_tags(soup):
    
    ### Study Section ###
    study = {'title': 'None', 'publication date': 'None', 'pdf link': 'None', 'description': 'None'}
    
    # Article Name #
    name = soup.find('articletitle')
    if name:
        study['title'] = name.text
    
    # Publication Date #
    pubdate = soup.find('pubdate')
    date = 'None'
    if pubdate:
        date = pubdate.year.text
        if pubdate.month:
            month = pubdate.month.text
            if month == 'Jan':
                month = '1'
            elif month == 'Feb':
                month = '2'
            elif month == 'Mar':
                month = '3'
            elif month == 'Apr':
                month = '4'
            elif month == 'May':
                month = '5'
            elif month == 'Jun':
                month = '6'
            elif month == 'Jul':
                month = '7'
            elif month == 'Aug':
                month = '8'
            elif month == 'Sept':
                month = '9'
            elif month == 'Oct':
                month = '10'
            elif month == 'Nov':
                month = '11'
            elif month == 'Dec':
                month = '12'
            date += '/' + pubdate.month.text
            if pubdate.day:
                date += '/' + pubdate.day.text
    study['publication date'] = date
    
    # Pdf Link #
    doi = soup.find('articleid', idtype = "doi")
    if doi:
        study['pdf link'] = 'https://doi.org/' + doi.text
    
    # Abstract #
    desc = soup.find('abstracttext')
    if desc:
        study['description'] = desc.text
    
    ### Tags Section ###
    tags = []
    keywords = soup.find_all('keyword')
    for keyword in keywords:
        tags.append(keyword.text) 
    #Could add MeSH tags if exist, would probably be better than these
    
    return study, tags

In [3]:
def generate_researcher(InputName):
    
    ### Prior information ###
    #Name needs to be in the format "First Last"
    name = InputName
    org = 'Organization'
    email = 'Email'
    
    ### Search Link Creation ###
    #Fill in prior information
    base = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
    #Database searching
    db = "pubmed"
    #Output format
    ret = "json"
    #Max number of results
    retnum = '5'
    #Search Query - reorganize name into the format 'Last+F', though I think including the entire first name also works
    query = InputName.split()[1] + "+" + InputName.split()[0]
    #Computed author id
    #cauthor = '32489811'
    #Assemble elink
    url = base + "esearch.fcgi?db=" + db + "&retmode=" + ret + "&retmax=" + retnum + "&term=" + query# + "&cauthor_id=" + cauthor
    #Test esearch URL
    #print(url)
    
    ### Create List of Studies ###
    #Add search results to a list
    webpage = urllib.request.urlopen(url).read()
    dict_page =json.loads(webpage)
    idlist = dict_page["esearchresult"]["idlist"]
    #Test for search results
    #print(idlist)
    
    ### Create Researcher Dictionary ###
    #Setup dictionary
    researcher = {'name': name, 'organization': org, 'topics': 'None', 'studies': 'None'}
    studies = []
    topics = []
    
    for link in idlist:
        url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&retmode=xml&id=idlist"
        url = url.replace('idlist', link)
        
        r = requests.get(url)
        soup = BeautifulSoup(r.content, "html.parser")
        
        study, tags = generate_study_and_tags(soup)
        
        #Occationally I ran into blank studies? I think this was a result of other issues with my code
        ADD = False
        for entries in study.values():
            if entries != 'None':
                ADD = True
        if ADD:
            studies.append(study)
            topics = topics + tags
        
        #Need to sleep otherwise we get request error
        time.sleep(0.1)
    
    if studies:
        #Sort the studies by date
        studies = sorted(studies, key = lambda i: i['publication date'], reverse=True)
        researcher['studies'] = studies
    if topics:
        researcher['topics'] = topics
    #Test scraping results
    #print(researcher)
    
    return researcher

In [4]:
print(generate_researcher('Andrew Garcia'))

{'name': 'Andrew Garcia', 'organization': 'Organization', 'topics': ['Breast surgery', 'Neuromodulation', 'Postmastectomy pain syndrome', 'Postsurgical pain', 'Alternative treatments', 'Chronic pain', 'Non-opioid', 'Pain management', 'Perioperative', 'NO', 'fibrosis', 'hypertrophy'], 'studies': [{'title': 'A Comprehensive Review of the Diagnosis, Treatment, and Management of Postmastectomy Pain Syndrome.', 'publication date': '2020/Jun/11', 'pdf link': 'https://doi.org/10.1007/s11916-020-00876-6', 'description': 'Postmastectomy pain syndrome (PMPS) remains poorly defined, although it is applied to chronic neuropathic pain following surgical procedures of the breast, including mastectomy and lumpectomy in breast-conserving surgery. It is characterized by persistent pain affecting the anterior thorax, axilla, and/or medial upper arm following mastectomy or lumpectomy. Though the onset of pain is most likely to occur after surgery, there may also be a new onset of symptoms following adjuv