In [1]:
#Links
#LINK1: https://www.ncbi.nlm.nih.gov/home/develop/api/
#LINK2: https://colab.research.google.com/drive/1VOuvANFR08twLBROqYwO_TV34pazgF0_
#LINK3: https://www.nlm.nih.gov/pubs/techbull/mj12/mj12_pm_author_ranking.html
#LINK4: https://pubmed.ncbi.nlm.nih.gov/?term=Kumar+V&cauthor_id=32489811
#LINK5: https://www.ncbi.nlm.nih.gov/pmc/pmctopmid/#converter
#
#Notes:
# Relevant:
#   The current preconditons include having the Name and Organization previously, with the additional constraint
# of a required computed author id that is generated from PubMed.
#   The date has the months in 3/4 letter abbreviations for some ungodly reason. Will have to convert later.
#   As of now, I return the DOI link, rather than the study it actually leads to. Ran out of time, but I think
# this is easily fixable.
#   The topics generated are the keywords from each article pulled. This is not a good way to do this, but should
# be fine for a start. There exists MeSH headings on some studies that will work much better for directly pulling,
# but ideally we just do something else entirely (Some early NLP? Maybe we just don't use tags here)
#
# Other:
#   There exists an 'affliation' tag on each study published that lists the author's university/location
# and potentially an email, which could be useful.
#use doi to check if a study exists on pubmed
#add pmid and check for it

from bs4 import BeautifulSoup
import requests
import urllib.request
import json
import time

In [11]:
# Converts any possible PubMed spelling of the month to a numerical we want
# If the month is already a number, leave it alone
# Add more entries if encountered
def monthToNum(month):
    if month == 'Jan' or month == 'January':
        month = '01'
    elif month == 'Feb' or month == 'February':
        month = '02'
    elif month == 'Mar' or month == 'March':
        month = '03'
    elif month == 'Apr' or month == 'April':
        month = '04'
    elif month == 'May':
        month = '05'
    elif month == 'Jun' or month == 'June':
        month = '06'
    elif month == 'Jul' or month == 'July':
        month = '07'
    elif month == 'Aug' or month == 'August':
        month = '08'
    elif month == 'Sep' or month == 'Sept' or month == 'September':
        month = '09'
    elif month == 'Oct' or month == 'October':
        month = '10'
    elif month == 'Nov' or month == 'November':
        month = '11'
    elif month == 'Dec' or month == 'December':
        month = '12'
    return month

'https://stackoverflow.com/'

In [2]:
def generate_article_and_tags(soup):
    
    #######################
    ### Article Section ###
    #######################
    
    # Create a base article dictionary and populate it with the info found below
    # If the majority of the fields are not replaced, something is probably going wrong
    article = {'title': 'None', 'publication date': 'None', 'pdf link': 'None', 'description': 'None'}
    
    ### Article Title ###
    # Search for the title of the article
    title = soup.find('articletitle')
    if title:
        article['title'] = title.text
    
    ### Publication Date ###
    # Search for the date of publication
    # Sometimes only part of the date has been provided
    pubdate = soup.find('pubdate')
    date = 'None'
    if pubdate:
        date = pubdate.year.text
        if pubdate.month:
            month = monthToNum(pubdate.month.text)       
            date += '/' + pubdate.month.text
            if pubdate.day:
                date += '/' + pubdate.day.text
    article['publication date'] = date
    
    ### Pdf Link ###
    #Search for DOI name (All published articles should have a DOI name)
    doi = soup.find('articleid', idtype = "doi")
    if doi:
        #The pdf link should be the final redirect of accessing DOI name
        pdflink = urllib.request.urlopen('https://doi.org/' + doi.text)
        article['pdf link'] = pdflink.geturl()
    
    ### Abstract ###
    #Search for article abstract (PubMed does not have any other form of description)
    abst = soup.find('abstracttext')
    if desc:
        article['description'] = abst.text
    
    ####################
    ### Tags Section ###
    ####################
    
    #Temporary implementation of tag collection
    tags = []
    keywords = soup.find_all('keyword')
    for keyword in keywords:
        tags.append(keyword.text) 
    #Could add MeSH tags if exist, would probably be better than these
    
    return study, tags

In [None]:
def checkPmid(pmid):
    url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&retmode=xml&id=['" + pmid + "']"
    #wait actually we dont have to do this, but i guess double checking would be helpful
    

In [7]:
def generate_researcher(name, org='None', email='None', studyNum = 5, doi = 'None', pmid = 'None'):
    
    #######################
    ### Link Generation ###
    #######################
    
    ### PMID takes priority over DOI ###
    #check doi
    doi_to_pmi = "https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/"
    
    ### Search Link Creation ###
    base = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&retmode=json"
    #Search 
    #Name = Name of researcher
    #StudyNum = Max number of result
    
    #Search Query - reorganize name into the format 'Last+F', though I think including the entire first name also works
    query = name.split()[1] + "+" + name.split()[0]
    #Computed author id
    #cauthor = '32489811'
    #Assemble elink
    url = base + "&retmax=" + studyNum + "&term=" + query# + "&cauthor_id=" + cauthor
    #Test esearch URL
    #print(url)
    
    ### Create List of Studies ###
    #Add search results to a list
    webpage = urllib.request.urlopen(url).read()
    dict_page =json.loads(webpage)
    idlist = dict_page["esearchresult"]["idlist"]
    #Test for search results
    print(idlist)
    
    #############################
    ### Researcher Generation ###
    #############################
    
    #Setup dictionary
    researcher = {'name': name, 'organization': org, 'topics': 'None', 'studies': 'None'}
    studies = []
    topics = []
    
    for link in idlist:
        url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&retmode=xml&id=idlist"
        url = url.replace('idlist', link)
        
        r = requests.get(url)
        soup = BeautifulSoup(r.content, "html.parser")
        print(soup)
        
        #study, tags = generate_study_and_tags(soup)
        
        #Occationally I ran into blank studies? I think this was a result of other issues with my code
        ADD = False
        for entries in study.values():
            if entries != 'None':
                ADD = True
        if ADD:
            studies.append(study)
            topics = topics + tags
        
        #Need to sleep otherwise we get request error
        time.sleep(0.1)
    
    if studies:
        #Sort the studies by date
        studies = sorted(studies, key = lambda i: i['publication date'], reverse=True)
        researcher['studies'] = studies
    if topics:
        researcher['topics'] = topics
    #Test scraping results
    #print(researcher)
    
    return researcher

In [8]:
print(generate_researcher('Andrew Garcia'))

['33653945', '33057063', '32529416', '32378039', '31933147']
<?xml version="1.0" ?>
<!DOCTYPE PubmedArticleSet PUBLIC "-//NLM//DTD PubMedArticle, 1st January 2019//EN" "https://dtd.nlm.nih.gov/ncbi/pubmed/out/pubmed_190101.dtd">

<pubmedarticleset>
<pubmedarticle>
<medlinecitation owner="NLM" status="In-Data-Review">
<pmid version="1">33653945</pmid>
<daterevised>
<year>2021</year>
<month>03</month>
<day>03</day>
</daterevised>
<article pubmodel="Print">
<journal>
<issn issntype="Electronic">1538-8514</issn>
<journalissue citedmedium="Internet">
<volume>20</volume>
<issue>3</issue>
<pubdate>
<year>2021</year>
<month>Mar</month>
</pubdate>
</journalissue>
<title>Molecular cancer therapeutics</title>
<isoabbreviation>Mol Cancer Ther</isoabbreviation>
</journal>
<articletitle>Resistance to Pyrrolobenzodiazepine Dimers Is Associated with SLFN11 Downregulation and Can Be Reversed through Inhibition of ATR.</articletitle>
<pagination>
<medlinepgn>541-552</medlinepgn>
</pagination>
<elocation

NameError: name 'study' is not defined