# Main Program
## Search PubMed with a given list of grant numbers
#### All functions used here are defined in the `search_mthds` notebook

In [26]:
%run search_mthds.ipynb

import pandas as pd
import time
from tqdm import tqdm

# Empty DataFrame that fixes the output columns; article info gathered for each grant is stored here.
template_df = pd.DataFrame(
    columns=[
        'title',
        'pmc_id',
        'authors',
        'doi',
        'keywords',
        'mesh_ids',
        'mesh_terms',
        'abstract',
        'project_serial_num',
        'journal_name',
        'publication_year',
    ]
)

# Grant numbers to search PubMed for.
# This list is subject to change and can be made more general, but grant numbers
# written in this form give the best search results.
grant_list = [
    'AA029328', 'AA029331', 'AA029345', 'AA029324', 'AA029316', 'AA029348',
    'DC019579', 'DC019578', 'DC019573', 'DC016112',
    'TR003780', 'TR003807', 'TR003793',
    'HL119145', 'NR020105', 'DE031114', 'MD016526', 'DK130067', 'HL150852',
    'DE030841', 'DE030842', 'HL152410', 'HL152401',
    'DE030829', 'DE030852', 'DE030832',
    'HD105618', 'HD105593', 'HD105594', 'HD105591', 'HD105619', 'HD105590',
    'HD105613', 'HD105610',
    'TR003775', 'TR003795', 'TR003787', 'TR003812',
    'DA053976', 'DA053949', 'DA053941', 'DA053903', 'DA053893',
    'LM013129', 'DA053899', 'ES103366', 'LM013755',
]


### Main loop creating data table

In [27]:
# Build one DataFrame per grant number and combine them into template_df.
#
# The per-grant frames are collected in a list and concatenated ONCE after the
# loop. Calling pd.concat inside the loop re-copies every previously gathered
# row on each iteration, which becomes quadratic as the grant list grows.
#
# NOTE: template_df is extended, not reset, so re-running this cell without
# re-running the setup cell appends the same articles a second time.
grant_frames = []  # one DataFrame per grant that returned at least one article
for grant in tqdm(grant_list): #iterate over the grant numbers and gather article information for each one
    my_ids = return_ids(grant) #grab article PubMed ID list
    if my_ids == 'Nothing Found': #move onto the next grant number if no articles were found
        time.sleep(1) #still pause so the request rate limit is respected
        continue

    # Fetch the raw material once per grant; every extractor below reuses it.
    art_summary = get_summary(my_ids) #gather all article information via esummary
    tmp_soup_obj = get_article_soup(my_ids) #parsable beautiful soup object consumed by get_article_info()

    # Fields taken from the esummary result.
    titles = get_titles(art_summary) #titles of all articles; return dictionary
    authors = get_authors(art_summary) #authors of all articles
    dois = get_identifier(art_summary, 'doi') #dois and additional article identifier if available
    pmc_ids = get_identifier(art_summary, 'pmc') #pmc ids for all articles associated with the current grant number
    journals = get_journal(art_summary)
    publication_yrs = get_publication_yr(art_summary)

    # Fields taken from the article soup object.
    keywords = get_article_info(tmp_soup_obj, 'keywords') #keywords for each article
    mesh_ids = get_article_info(tmp_soup_obj, 'mesh_ids') #mesh id numbers for each article
    mesh_terms = get_article_info(tmp_soup_obj, 'mesh_terms') #mesh terms for each article
    abstracts = get_article_info(tmp_soup_obj, 'abstracts') #abstract for each article

    # The titles dict defines the rows: its keys become the row index and its values the title column.
    mydf = pd.DataFrame.from_dict(titles, orient='index', columns = ['title'])
    # pd.Series keeps the dict keys as its index, so each assignment below is
    # aligned to mydf's index (ex. index = pm_id, value = pmc_id).
    mydf['pmc_id'] = pd.Series(pmc_ids)
    mydf['authors'] = pd.Series(authors)
    mydf['doi'] = pd.Series(dois)
    mydf['keywords'] = pd.Series(keywords)
    mydf['mesh_ids'] = pd.Series(mesh_ids)
    mydf['mesh_terms'] = pd.Series(mesh_terms)
    mydf['abstract'] = pd.Series(abstracts)
    mydf['project_serial_num'] = grant #same grant number on every row of this iteration's articles
    mydf['journal_name'] = pd.Series(journals)
    mydf['publication_year'] = pd.Series(publication_yrs)

    grant_frames.append(mydf) #defer concatenation until all grants are processed
    time.sleep(1) #need to wait to avoid hitting the limit of 3 requests per second

# Single concatenation; template_df goes first so its column order is kept,
# and ignore_index=False preserves the per-article row index of each frame.
# If the loop is interrupted, the frames gathered so far remain in grant_frames.
template_df = pd.concat([template_df, *grant_frames], ignore_index = False)



100%|██████████| 47/47 [01:35<00:00,  2.03s/it]


### Write the dataframe to a csv file in your current working directory

In [28]:
#UNCOMMENT THE LINE BELOW TO WRITE THE DATAFRAME TO CSV

#template_df.to_csv('article_data_current.csv')