## CREID coverage in NDE - initial check
This notebook checks the NIAID Data Ecosystem (NDE) for datasets funded by the CREID Research Centers, matching records by grant ID.

Here are the 10 grant numbers for the CREID Research Centers:
1. CREID-ESP: Washington University in St. Louis - David Wang:              U01AI151810-02
2. WARN-ID: Scripps - Kristian Andersen:  1U01AI151812-01
3. UWARN: University of Washington - Wesley Van Voorhis:    1U01AI151698-01
4. CREID-ECA: Washington State University - Kariuki Njenga:   1U01AI151799-01
5. CREATE-NEO: University of TX Medical Branch, Galveston - Nikos Vasilakis:  1U01AI151807-01
6. WAC-EID: University of TX Medical Branch, Galveston - Scott Weaver:  1U01AI151801-01
7. EID-SEARCH: EcoHealth Alliance - Peter Daszak:  1U01AI151797-01
8. EEIDI: UC Davis - Christine Johnson:   1U01AI151814-01
9. PICREID: Institut Pasteur - Anavaj Sakuntabhai:   1U01AI151758-01
10. A2CARES: UC Berkeley - Eva Harris:   1U01AI151788-01


In [23]:
import requests
import pandas as pd
import json
import os
import time

In [5]:
## Due to inconsistent use of Grant IDs, the bare minimum portion should be used for greatest coverage

def strip_grant_id(grant_id):
    """Reduce a grant number to the bare portion used for searching.

    Whitespace anywhere in the value is removed, everything from the first
    hyphen onward is dropped, and the ``1U01`` / ``U01`` prefix is stripped,
    e.g. ``"1U01AI151812-01"`` and ``"1U01 AI151812-01"`` both become
    ``"AI151812"``.

    Args:
        grant_id: Grant number as a string.

    Returns:
        The grant number without whitespace, ``-NN`` suffix, or ``1U01`` /
        ``U01`` prefix. A value that has none of these is returned unchanged.
    """
    # Remove all whitespace first: a value written as "1U01 AI151801-01"
    # would otherwise yield " AI151801", which never matches in a search.
    compact_id = "".join(grant_id.split())
    main_part = compact_id.split('-')[0]
    # Check the longer prefix first so the leading "1" is not left behind.
    if "1U01" in main_part:
        return main_part.replace("1U01", "")
    return main_part.replace("U01", "")

In [3]:
# Resolve the input file relative to the current working directory: ./data/CREID_grants.txt
script_path = os.getcwd()
data_path = os.path.join(script_path, 'data')
grantidfile = os.path.join(data_path, 'CREID_grants.txt')

# The tab-separated file is read without a header row, so the positional
# columns (0..3) are given names immediately after loading.
grant_columns = ['Center', 'Instition', 'PI', 'GrantID']
creid_grants = pd.read_csv(grantidfile, sep='\t', header=None).rename(
    columns=dict(enumerate(grant_columns))
)
print(creid_grants.head(n=2))

      Center                             Instition                 PI  \
0  CREID-ESP   Washington University in St. Louis          David Wang   
1    WARN-ID                              Scripps   Kristian Andersen   

           GrantID  
0   U01AI151810-02  
1  1U01AI151812-01  


In [4]:
# Show the grant identifiers as loaded, before any stripping is applied.
print(creid_grants['GrantID'].to_list())

['U01AI151810-02', '1U01AI151812-01', '1U01AI151698-01', '1U01AI151799-01', '1U01AI151807-01', '1U01AI151801-01', '1U01AI151797-01', '1U01AI151814-01', '1U01AI151758-01', '1U01AI151788-01']


In [7]:
# Derive the bare grant identifier for each row by passing its GrantID value
# through strip_grant_id, and store it in a new 'stripped_id' column.
creid_grants['stripped_id'] = creid_grants['GrantID'].map(strip_grant_id)
print(creid_grants.head(n=2))

      Center                             Instition                 PI  \
0  CREID-ESP   Washington University in St. Louis          David Wang   
1    WARN-ID                              Scripps   Kristian Andersen   

           GrantID stripped_id  
0   U01AI151810-02    AI151810  
1  1U01AI151812-01    AI151812  


In [8]:
# Distinct stripped grant identifiers, in order of first appearance; this list
# drives every search further down.
grant_list = creid_grants['stripped_id'].drop_duplicates().tolist()
print(grant_list)

['AI151810', 'AI151812', 'AI151698', 'AI151799', 'AI151807', 'AI151801', 'AI151797', 'AI151814', 'AI151758', 'AI151788']


In [24]:
## Search the NDE for the grant ids. I used the staging API since there are a few more sources in the staging API
nde_query_url = "https://api-staging.data.niaid.nih.gov/v1/query"
# Request a large page explicitly instead of relying on the API's default page
# size, which would silently truncate grants with many datasets.
# NOTE(review): 1000 is assumed to be within the API's allowed `size` — confirm.
nde_page_size = 1000

creid_in_nde = []
no_data_found = []
for eachgrantid in grant_list:
    query_params = {
        'q': f"funding.identifier:*{eachgrantid}*",
        'fields': '_id',   # only the record id is used below
        'size': nde_page_size,
    }
    # `params` lets requests build and encode the query string; the timeout
    # keeps a stalled connection from hanging the notebook.
    r = requests.get(nde_query_url, params=query_params, timeout=60)
    # Fail loudly on an HTTP error instead of trying to parse an error page.
    r.raise_for_status()
    json_result = r.json()
    resultlist = json_result['hits']

    # Warn when the response reports more matches than were returned.
    total_hits = json_result.get('total')
    if isinstance(total_hits, int) and total_hits > len(resultlist):
        print(f"WARNING: {eachgrantid}: retrieved {len(resultlist)} of {total_hits} hits")

    if len(resultlist) > 0:
        for eachhit in resultlist:
            creid_in_nde.append({'stripped_id':eachgrantid, 'nde_id':eachhit['_id']})
    else:
        # Keep a row for grants with no hits so they can be exported separately.
        no_data_found.append({'stripped_id':eachgrantid,'nde_id':'None'})

creid_in_nde_df = pd.DataFrame(creid_in_nde)
no_data_found_df = pd.DataFrame(no_data_found)

print(len(creid_in_nde_df))
print(creid_in_nde_df.head(n=2))
print(len(no_data_found_df))
print(no_data_found_df)

11
  stripped_id       nde_id
0    AI151810  PRJNA657062
1    AI151810    GSE156219
6
  stripped_id nde_id
0    AI151799   None
1    AI151807   None
2    AI151797   None
3    AI151814   None
4    AI151758   None
5    AI151788   None


In [21]:
# Show every collected NDE hit: one row per (stripped grant id, NDE record id) pair.
print(creid_in_nde_df)

   stripped_id       nde_id
0     AI151810  PRJNA657062
1     AI151810    GSE156219
2     AI151810    GSE193990
3     AI151812    GSE189787
4     AI151698    GSE162736
5     AI151698    GSE179722
6     AI151698    GSE223236
7     AI151698    GSE176386
8     AI151698    GSE157175
9     AI151698  PRJNA682812
10    AI151801    GSE209750


In [42]:
## Export the results
# Each frame is written as a tab-separated file (with header row) into the data folder.
for result_frame, tsv_name in (
    (creid_in_nde_df, 'found_in_nde.tsv'),
    (no_data_found_df, 'not_found_in_nde.tsv'),
):
    result_frame.to_csv(os.path.join(data_path, tsv_name), sep='\t', header=True)

## Publications associated with grants

Pull the PMIDs associated with each grant ID, and also check the other NCBI databases for records that mention the grant IDs.


In [25]:
from Bio import Entrez
from Bio import Medline

In [26]:
# Contact address attached to the Entrez requests made below.
# The value here is a placeholder — replace it with a real email address before running.
Entrez.email = "your email here"

In [43]:
## Find anything associated with the grant id across various databases
# NOTE(review): Entrez.egquery may be deprecated/removed in newer Biopython
# releases — confirm it is available in the environment used to run this.

datalist = []    # long format: one row per (grant, database) with a positive count
datalist2 = []   # wide format: one row per grant, one column per database
for eachgrantid in grant_list:
    handle = Entrez.egquery(term=eachgrantid)
    try:
        record = Entrez.read(handle)
    finally:
        # Always release the connection, even if parsing fails.
        handle.close()

    tmpdict2 = {"stripped grant id":eachgrantid}
    for row in record["eGQueryResult"]:
        # The wide table keeps the raw value, including the literal "Error".
        tmpdict2[row["DbName"]]=row["Count"]
        # "Error" is not a number, so it must be ruled out before int().
        if row["Count"]!="Error" and int(row["Count"])>0:
            datalist.append({"stripped grant id":eachgrantid,"db":row["DbName"], "count":row["Count"]})
    datalist2.append(tmpdict2)
    # Pause between grants to avoid sending requests back-to-back.
    time.sleep(1)

df1 = pd.DataFrame(datalist)
df2 = pd.DataFrame(datalist2)
df1.to_csv(os.path.join(data_path,'grant_associated_assets.tsv'),sep='\t',header=True)
df2.to_csv(os.path.join(data_path,'grant_all_db_counts.tsv'),sep='\t',header=True)

In [44]:
# Preview both summaries: df1 holds one row per (grant, database) with a
# positive count, df2 one row per grant with a column per database.
# (The name `df` is never defined in this notebook; the long table is `df1`.)
print(df1.head(n=2))
print(df2.head(n=2))

  stripped grant id      db count
0          AI151810  pubmed    70
1          AI151810     pmc    84
  stripped grant id pubmed pmc mesh books pubmedhealth omim ncbisearch  \
0          AI151810     70  84    0     0        Error    0          0   
1          AI151812     63  51    0     0        Error    0          0   

  nuccore nucgss  ... pccompound pcsubstance pcassay nlmcatalog probe gap  \
0       1      0  ...          0           0       0          0     0   0   
1       1      0  ...          0           0       0          0     0   0   

  proteinclusters bioproject biosample biocollections  
0               0          0         0              0  
1               0          3         0              0  

[2 rows x 37 columns]


In [None]:
# Fetch the MEDLINE-format record for one PubMed ID and pull out its "MH" field.
# NOTE(review): `test_pmid` is not defined in the visible cells — assign a
# PubMed ID to it before running this cell.
handle = Entrez.efetch(db="pubmed", id=test_pmid, rettype="medline", retmode="text")
try:
    # Iterate over the parsed records before the handle is closed.
    records = Medline.parse(handle)
    for record in records:
        # Value of the record's "MH" entry, or the string "?" when it is absent.
        # With more than one record, only the last one's value is kept.
        MESHSet = record.get("MH","?")
finally:
    # Release the connection whether or not parsing succeeded.
    handle.close()
