### Includes functions that are used to search and create a dataframe for PubMed articles based on their grant number
#### Main program that references these functions is included in mainpgrm notebook

In [3]:
import requests
from bs4 import BeautifulSoup 
import lxml
import json
import pandas as pd


#### Get PubMed IDs

In [4]:
#Take in the grant number as a string, and return the PubMed article ids associated with that grant number
#If no articles come up during the search, return the string "Nothing found"

def return_ids(grant_num):
    """Return the PubMed IDs of articles associated with a grant number.

    Sends an ESearch request to NCBI for PubMed records whose
    [Grant Number] field matches ``grant_num``, using the 2020-2023 date
    range and a cap of 70 results.

    Args:
        grant_num: Grant number to search for, as a string.

    Returns:
        The ID list from the ESearch response, or the string
        "Nothing found" when that list is empty.

    Raises:
        requests.HTTPError: If NCBI answers with an error status code.
        requests.Timeout: If NCBI does not respond within the timeout.
    """
    # Let requests build the query string so characters such as '&' or '#'
    # inside the grant number are percent-encoded instead of corrupting the URL.
    query = {
        'db': 'pubmed',
        'retmode': 'json',
        'mindate': '2020',
        'maxdate': '2023',
        'retmax': '70',
        'term': f'{grant_num}[Grant Number]',
    }
    rsp = requests.get(
        'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi',
        params=query,
        timeout=30,  # without a timeout a stalled connection blocks forever
    )
    # Fail with a clear HTTP error rather than a KeyError on an error payload.
    rsp.raise_for_status()
    ids = rsp.json()['esearchresult']['idlist']
    if ids:
        return ids
    return "Nothing found"  # sentinel kept for existing callers

#### Get Summary Information

In [5]:
#Function that takes in a list of PubMed IDs and returns the summary information for the articles associated with those IDs; returns a dictionary
#Can use this function in conjunction with the return_ids() function to go from grant number to article summaries
#The summary information includes a variety of information such as titles for all articles, authors, identifiers and more

def get_summary(idList):
    """Fetch the ESummary record set for a list of PubMed IDs.

    Args:
        idList: List of PubMed ID strings. A single ID given as a bare
            string is treated as a one-element list.

    Returns:
        The decoded JSON body of the ESummary response (a dictionary).

    Raises:
        requests.HTTPError: If NCBI answers with an error status code.
        requests.Timeout: If NCBI does not respond within the timeout.
    """
    # ','.join on a bare string would split it into single characters
    # ("12345" -> "1,2,3,4,5"), so wrap a lone ID before joining.
    if isinstance(idList, str):
        idList = [idList]
    rsp = requests.get(
        'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi',
        params={
            'db': 'pubmed',
            'retmode': 'json',
            'id': ','.join(idList),  # one comma-separated string of IDs
        },
        timeout=30,  # without a timeout a stalled connection blocks forever
    )
    rsp.raise_for_status()
    return rsp.json()

#### Get Titles

In [6]:
#Function that takes in the summary of all articles given by the ids and returns a dictionary with the title of each article

def get_titles(summary):
    """Map every PubMed ID listed in a summary to its article title.

    Args:
        summary: Dictionary whose 'result' entry holds a 'uids' list plus
            one per-ID record containing a 'title' value.

    Returns:
        Dictionary of {PubMed ID: title}.
    """
    records = summary['result']  # per-article records live under 'result'
    return {pmid: records[pmid]['title'] for pmid in records['uids']}
    

#### Get Authors

In [7]:
#Function that takes in the summary of all the articles and returns a dictionary containing the authors of each article

def get_authors(summary):
    """Map every PubMed ID listed in a summary to its author names.

    Args:
        summary: Dictionary whose 'result' entry holds a 'uids' list plus
            one per-ID record containing an 'authors' list of dictionaries,
            each with a 'name' value.

    Returns:
        Dictionary of {PubMed ID: [author name, ...]}.
    """
    records = summary['result']
    return {
        pmid: [person['name'] for person in records[pmid]['authors']]
        for pmid in records['uids']
    }
    

#### Get Identifier (DOI, PMCID)

In [8]:
#Function that takes in the summary of all the articles and returns a dictionary containing the chosen identifier of each article
#Valid identifier args: 'doi' , 'pmc'

def get_identifier(summary, identifier):
    """Collect one kind of identifier for every article in a summary.

    Args:
        summary: Dictionary whose 'result' entry holds a 'uids' list plus
            one per-ID record containing an 'articleids' list of
            dictionaries with 'idtype' and 'value' entries.
        identifier: The 'idtype' to pick out (e.g. 'doi' or 'pmc').

    Returns:
        Dictionary of {PubMed ID: identifier value}. DOI values are
        prefixed with 'doi:'. Articles with no entry of the requested type
        are left out; if an article has several, the last one is kept.
    """
    records = summary['result']
    wants_doi = identifier == 'doi'
    found = {}
    for pmid in records['uids']:
        for entry in records[pmid]['articleids']:
            if entry['idtype'] != identifier:
                continue  # not the identifier type that was asked for
            value = entry['value']
            found[pmid] = 'doi:' + value if wants_doi else value
    return found


#### Get Parsable Soup Object

In [9]:
#Function that takes in a list of PubMed IDs and creates a beautiful soup object from the articles returned by EFETCH. EFETCH returns information related to articles given by the search; this function
#is meant to give a parsable bs4 object that can be used to extract more specific information related to the articles found (keywords, mesh ids, etc). The function returns a beautiful soup object with the articles
#returned from the search.
#This avoids sending too many requests by calling the function multiple times in a loop; create one soup object and reuse it

def get_article_soup(id_list):
    """Fetch the EFetch XML for a list of PubMed IDs as a BeautifulSoup object.

    Args:
        id_list: List of PubMed ID strings. A single ID given as a bare
            string is treated as a one-element list.

    Returns:
        A BeautifulSoup object built from the response body with the
        'xml' parser.

    Raises:
        requests.HTTPError: If NCBI answers with an error status code.
        requests.Timeout: If NCBI does not respond within the timeout.
    """
    # ','.join on a bare string would split it into single characters,
    # so wrap a lone ID before joining.
    if isinstance(id_list, str):
        id_list = [id_list]
    rsp = requests.get(
        'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi',
        params={'db': 'pubmed', 'retmode': 'xml', 'id': ','.join(id_list)},
        timeout=30,  # without a timeout a stalled connection blocks forever
    )
    # Stop here on an HTTP error instead of parsing an error page into a
    # soup that silently contains no articles.
    rsp.raise_for_status()
    return BeautifulSoup(rsp.content, features='xml')

#### Get Keywords, Mesh IDs or Mesh Terms

In [10]:
#Function that takes in a beautiful soup object for parsing and the type of information you want to extract for each article
#The soup object is expected to hold the article information grabbed with the efetch API (see get_article_soup); this function parses through
#each article in it to get the specified information and returns a dictionary keyed by PubMed ID
#Arguments for info_type: 'keywords' , 'mesh_ids', 'mesh_terms'

def get_article_info(soup_obj, info_type):
    """Extract one kind of per-article information from a parsed article set.

    Args:
        soup_obj: Parsed object exposing ``find_all``; every 'PubmedArticle'
            tag in it is treated as one article.
        info_type: 'keywords' (text of each 'Keyword' tag), 'mesh_ids'
            ('UI' attribute of the 'DescriptorName' in each 'MeshHeading')
            or 'mesh_terms' (text of that 'DescriptorName').

    Returns:
        Dictionary of {PMID text: list of extracted values}. Any other
        ``info_type`` yields an empty dictionary.
    """
    # Choose the extractor once, up front, rather than re-testing info_type
    # for every article.
    if info_type == 'keywords':
        def extract(node):
            return [kw.text for kw in node.find_all('Keyword')]
    elif info_type == 'mesh_ids':
        def extract(node):
            return [
                heading.find('DescriptorName')['UI']
                for heading in node.find_all('MeshHeading')
            ]
    elif info_type == 'mesh_terms':
        def extract(node):
            return [
                heading.find('DescriptorName').text
                for heading in node.find_all('MeshHeading')
            ]
    else:
        extract = None

    collected = {}
    for node in soup_obj.find_all('PubmedArticle'):
        pmid = node.find('PMID').text  # first 'PMID' tag inside the article
        if extract is not None:
            collected[pmid] = extract(node)
    return collected





#### Get Journal Name

In [11]:
#Function that takes in the search summary for all articles related to a grant number and grabs their journal name and returns them in a dictionary

def get_journal(summary):
    """Map every PubMed ID listed in a summary to its journal name.

    Args:
        summary: Dictionary whose 'result' entry holds a 'uids' list plus
            one per-ID record containing a 'fulljournalname' value.

    Returns:
        Dictionary of {PubMed ID: full journal name}.
    """
    records = summary['result']
    return {pmid: records[pmid]['fulljournalname'] for pmid in records['uids']}

#### Get Publication Year

In [12]:
#Function that takes in the search summary for all articles related to a grant number and grabs the publication year for each article
#Returns a dictionary with all articles and their respective publication year

def get_publication_yr(summary):
    """Map every PubMed ID listed in a summary to its publication year.

    Args:
        summary: Dictionary whose 'result' entry holds a 'uids' list plus
            one per-ID record containing a 'pubdate' value.

    Returns:
        Dictionary of {PubMed ID: first four characters of 'pubdate'}.
    """
    records = summary['result']
    # NOTE(review): assumes 'pubdate' starts with a four-digit year - confirm.
    return {pmid: records[pmid]['pubdate'][:4] for pmid in records['uids']}