### Includes functions that are used to search and create a dataframe for PubMed articles based on their grant number
#### Main program that references these functions is included in mainpgrm notebook

In [1]:
import requests
from bs4 import BeautifulSoup 
import lxml
import xmltodict
import json
import pandas as pd


#### Get PubMed IDs

In [13]:
#Take in the grant number as a string, and return the PubMed article ids associated with that grant number
#If no articles come up during the search, return the string "Nothing found"

def return_ids(grant_num):
    """Return the PubMed article IDs associated with a grant number.

    Sends an esearch request to NCBI E-utilities with mindate=2020,
    maxdate=2023 and retmax=70, searching the [Grant Number] field.

    Args:
        grant_num: Grant number as a string, e.g. 'HL119145'.

    Returns:
        The 'idlist' from the esearch response, or the string
        "Nothing found" when that list is empty.

    Raises:
        requests.HTTPError: if the server replies with an error status.
        requests.Timeout: if no response arrives within 30 seconds.
    """
    #Let requests build the query string so that spaces or special characters in the grant number are escaped
    rsp = requests.get(
        'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi',
        params={
            'db': 'pubmed',
            'retmode': 'json',
            'mindate': '2020',
            'maxdate': '2023',
            'retmax': '70',
            'term': f'{grant_num}[Grant Number]',
        },
        timeout=30, #without a timeout a stalled connection would hang forever
    )
    rsp.raise_for_status() #fail with a clear HTTP error instead of a confusing KeyError/JSON error below
    ids = rsp.json()['esearchresult']['idlist'] #grab the id list from the json information
    return ids if ids else "Nothing found" #callers expect this string when no papers are found

#### Get Summary Information

In [3]:
#Function that takes in a list of PubMed Ids and returns the summary information for the articles associated with those IDs; returns a dictionary
#Can use this function in conjunction with the return_ids() function to go from grant number to article summaries
#The summary information includes a variety of information such as titles for all articles, authors, identifiers and more

def get_summary(idList):
    """Fetch the esummary record for each PubMed ID and return the parsed JSON.

    Args:
        idList: List of PubMed ID strings. A single ID given as a plain
            string is treated as a one-element list.

    Returns:
        The decoded JSON response from the esummary endpoint (a dictionary).

    Raises:
        requests.HTTPError: if the server replies with an error status.
        requests.Timeout: if no response arrives within 30 seconds.
    """
    if isinstance(idList, str):
        idList = [idList] #joining a bare string would split it into single characters
    rsp = requests.get(
        'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi',
        params={
            'db': 'pubmed',
            'retmode': 'json',
            'id': ','.join(idList), #the API takes the ids as one comma separated string
        },
        timeout=30, #without a timeout a stalled connection would hang forever
    )
    rsp.raise_for_status() #surface HTTP errors before trying to decode the body
    return rsp.json()

#### Get Titles

In [4]:
#Function that takes in the summary of all articles given by the ids and returns a dictionary with the title of each article

def get_titles(summary):
    """Map each article id listed in the summary to that article's title.

    Args:
        summary: Dictionary shaped like an esummary response: summary['result']
            holds a 'uids' list plus one entry per uid with a 'title' key.

    Returns:
        Dictionary of uid -> title, in the order the uids are listed.
    """
    records = summary['result'] #per-article info lives under 'result', keyed by uid
    return {pmid: records[pmid]['title'] for pmid in records['uids']}
    

#### Get Authors

In [5]:
#Function that takes in the summary of all the articles and returns a dictionary containing the authors of each article

def get_authors(summary):
    """Map each article id listed in the summary to a list of its author names.

    Args:
        summary: Dictionary shaped like an esummary response: summary['result']
            holds a 'uids' list plus one entry per uid whose 'authors' value
            is a list of dictionaries with a 'name' key.

    Returns:
        Dictionary of uid -> list of author names (empty list if no authors).
    """
    records = summary['result']
    names_by_pmid = {}
    for pmid in records['uids']:
        #keep only the 'name' field of every author entry for this article
        names_by_pmid[pmid] = [person['name'] for person in records[pmid]['authors']]
    return names_by_pmid
    

#### Get DOIs

In [6]:
#Function that takes in the summary of all the articles and returns a dictionary containing the doi of each article

def get_doi(summary):
    """Map each article id listed in the summary to its 'elocationid' value.

    Args:
        summary: Dictionary shaped like an esummary response: summary['result']
            holds a 'uids' list plus one entry per uid with an 'elocationid' key.

    Returns:
        Dictionary of uid -> elocationid value, returned unmodified.
    """
    #NOTE(review): the value is passed through as-is; confirm 'elocationid' always holds a DOI for these articles
    records = summary['result']
    return {pmid: records[pmid]['elocationid'] for pmid in records['uids']}
    

#### Get Keywords

In [7]:
#Function that takes in a list of PubMed IDs, gets a batch of information from PubMed that includes information about the articles for those IDs
#efetch API is used to grab article information, converts the information into a beautiful soup object, and then it parses through
#the article information to get the keywords for each article

def get_keywords(idList):
    """Fetch the articles for the given PubMed IDs and collect their keywords.

    Uses the efetch endpoint in XML mode and reads the text of every
    'Keyword' tag inside each 'PubmedArticle' tag.

    Args:
        idList: List of PubMed ID strings. A single ID given as a plain
            string is treated as a one-element list.

    Returns:
        Dictionary keyed by the text of each article's first 'PMID' tag.
        The value is a list of keyword strings, or the string 'No keywords'
        when the article has no 'Keyword' tags.

    Raises:
        requests.HTTPError: if the server replies with an error status.
        requests.Timeout: if no response arrives within 30 seconds.
    """
    if isinstance(idList, str):
        idList = [idList] #joining a bare string would split it into single characters
    rsp = requests.get(
        'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi',
        params={
            'db': 'pubmed',
            'retmode': 'xml',
            'id': ','.join(idList), #the API takes the ids as one comma separated string
        },
        timeout=30, #without a timeout a stalled connection would hang forever
    )
    rsp.raise_for_status() #do not try to parse an error page as article XML
    articles_bs = BeautifulSoup(rsp.content, features='xml') #holds all information for the articles
    keyword_dict = {}
    for article in articles_bs.find_all('PubmedArticle'): #one 'PubmedArticle' tag per article
        pmid = article.find('PMID').text #first PMID tag inside the current article
        keywords = [tag.text for tag in article.find_all('Keyword')] #keyword tags as plain strings
        keyword_dict[pmid] = keywords if keywords else 'No keywords' #accounts for articles with no keywords
    return keyword_dict




#### Get Journal Name

In [8]:
#Function that takes in the search summary for all articles related to a grant number and grabs their journal name and returns them in a dictionary

def get_journal(summary):
    """Map each article id listed in the summary to its full journal name.

    Args:
        summary: Dictionary shaped like an esummary response: summary['result']
            holds a 'uids' list plus one entry per uid with a 'fulljournalname' key.

    Returns:
        Dictionary of uid -> 'fulljournalname' value.
    """
    records = summary['result'] #per-article info lives under 'result', keyed by uid
    return {pmid: records[pmid]['fulljournalname'] for pmid in records['uids']}

#### Get Publication Year

In [9]:
#Function that takes in the search summary for all articles related to a grant number and grabs the publication year for each article
#Returns a dictionary with all articles and their respective publication year

def get_publication_yr(summary):
    """Map each article id listed in the summary to its publication year.

    The year is taken as the first four characters of the 'pubdate' string.

    Args:
        summary: Dictionary shaped like an esummary response: summary['result']
            holds a 'uids' list plus one entry per uid with a 'pubdate' string.

    Returns:
        Dictionary of uid -> first four characters of 'pubdate'.
    """
    records = summary['result']
    years = {}
    for pmid in records['uids']:
        published = records[pmid]['pubdate']
        years[pmid] = published[:4] #leading four characters are kept as the year
    return years

In [14]:
#Quick check: look up the articles for grant HL119145 and display their publication years
#NOTE(review): return_ids gives back the string "Nothing found" rather than a list when a search is empty; this cell assumes the grant has hits
test_sum = get_summary(return_ids('HL119145'))
get_publication_yr(test_sum) #last expression in the cell, so the notebook displays the resulting dictionary

{'37428010': '2023',
 '37226730': '2023',
 '37039628': '2023',
 '37028977': '2023',
 '36949045': '2023',
 '36794589': '2023',
 '36565787': '2023',
 '36275710': '2022',
 '36043417': '2022',
 '36040468': '2022',
 '35962845': '2022',
 '35924558': '2022',
 '35843734': '2022',
 '35599981': '2022',
 '35466998': '2022',
 '35443763': '2022',
 '35379853': '2022',
 '35312466': '2022',
 '35312360': '2022',
 '35211681': '2022',
 '35142398': '2022',
 '34789851': '2021',
 '34775044': '2021',
 '34746796': '2021',
 '34657041': '2022',
 '34636650': '2021',
 '34579115': '2021',
 '34545358': '2021',
 '34514084': '2021',
 '34476401': '2021',
 '34387240': '2022',
 '34365067': '2021',
 '34270923': '2021',
 '34038701': '2021',
 '33988876': '2021',
 '33906951': '2021',
 '33890812': '2021',
 '33861689': '2021',
 '33856918': '2021',
 '33833034': '2021',
 '37117448': '2021',
 '33558758': '2021',
 '33558530': '2021',
 '33555777': '2021',
 '33523764': '2021',
 '33243385': '2020',
 '33165538': '2021',
 '33050894': 