# Program Collections generator

This script parses an .xlsx file containing metadata about Program Collections and searches the NIAID Data Ecosystem Discovery Portal for records that should be included as part of each program. It then generates the corrections files needed to add the program collection to the Discovery Portal.

In [1]:
import os
import requests
import json
import pandas as pd

In [2]:
# Resolve every location relative to the current working directory.
script_path = os.getcwd()
# Folder holding the input spreadsheet and the code tables read below.
data_path = os.path.join(script_path,'data')
file_path = os.path.join(data_path,'Program Collections.xlsx')
# Go two directory levels up from the working directory.
p1_path = os.path.abspath(os.path.join(script_path, os.pardir))
parent_path = os.path.abspath(os.path.join(p1_path, os.pardir))
# Output folder where the generated correction files are written.
correction_path = os.path.join(parent_path,'nde-metadata-corrections','collections_corrections')
print(correction_path)

C:\Users\gtsueng\Anaconda3\envs\nde\nde-metadata-corrections\collections_corrections


In [3]:
def parse_array_text(arraytext):
    """Split a delimited spreadsheet cell into a list of clean strings.

    Asterisks are removed, then the text is split on commas if any are
    present, otherwise on pipes, otherwise kept as a single value.  Every
    entry is stripped of surrounding whitespace and empty entries are dropped,
    so a trailing delimiter never produces an empty item.

    Args:
        arraytext: The raw cell value.  ``None`` and NaN (how pandas
            represents an empty cell) are treated as "no values".

    Returns:
        A list of non-empty, stripped strings (possibly empty).
    """
    # NaN is the only float that is not equal to itself.
    if arraytext is None or (isinstance(arraytext, float) and arraytext != arraytext):
        return []
    cleaned = str(arraytext).replace('*', '')
    if ',' in cleaned:
        pieces = cleaned.split(',')
    elif '|' in cleaned:
        pieces = cleaned.split('|')
    else:
        pieces = [cleaned]
    stripped = (piece.strip() for piece in pieces)
    return [piece for piece in stripped if piece]

def generate_meta_file(correction_path, row):
    """Write the ``<fileName>_correction.json`` metadata file for one program.

    Args:
        correction_path: Directory the JSON file is written into.
        row: Mapping-like record providing ``description``, ``niaidURL``,
            ``alternateName``, ``parentOrganization``, ``name``, ``url`` and
            ``fileName``.
    """
    description = f"{row['description']} For more information, visit the NIAID program page: {row['niaidURL']}"
    alternate_names = parse_array_text(row['alternateName'])
    parent_orgs = parse_array_text(row['parentOrganization'])
    program = {
        "@type": "ResearchProject",
        "name": row["name"],
        "description": description,
        "alternateName": alternate_names,
        "url": row["url"],
        "parentOrganization": parent_orgs,
    }
    target = os.path.join(correction_path, f'{row["fileName"]}_correction.json')
    # The program is wrapped in a one-element sourceOrganization list.
    wrapper = {"sourceOrganization": [program]}
    with open(target, 'w') as handle:
        handle.write(json.dumps(wrapper, indent=4))

In [4]:
# Read the 'metadata' sheet of the Program Collections workbook.
base_meta = pd.read_excel(file_path, 'metadata', engine='openpyxl')
# Keep only rows that have both a funding ID list and a NIAID URL.
test_meta = base_meta.loc[~base_meta['fundingIDList'].isna() & ~base_meta['niaidURL'].isna()]
print(test_meta.head(n=5))

  fileName                   name  \
0   SysBio  NIAID Systems Biology   
2  ReVAMPP  NIAID ReVAMPP Network   
3    CEIRR    NIAID CEIRR Network   
4     ACTG     NIAID ACTG Network   
5     ARLG     NIAID ARLG Program   

                                         description  \
0  The NIAID/Division of Microbiology and Infecti...   
2  ReVAMPP is a pandemic preparedness research ne...   
3  NIAID established the Centers of Excellence fo...   
4  The ACTG established and supports the largest ...   
5  In 2013, NIAID launched the Antibacterial Resi...   

                                       alternateName parentOrganization  \
0                                       NIAID SysBio              NIAID   
2  ReVAMPP, Research and Development of Vaccines ...              NIAID   
3  CEIRR, Centers of Excellence for Influenza Res...              NIAID   
4  ACTG, Advancing Clinical Therapeutics Globally...              NIAID   
5    ARLG, Antibacterial Resistance Leadership Group             

In [5]:
# Try the metadata writer on a single row (the second row by position).
test_row = test_meta.iloc[1]
generate_meta_file(correction_path, test_row)

In [6]:
# Preview the code table.
# NOTE(review): despite the variable name, this reads NIH_IC_codes.tsv (the IC code table).
activity_codes_df = pd.read_csv(os.path.join(data_path,'NIH_IC_codes.tsv'),delimiter='\t',header=0)
print(activity_codes_df.head(n=2))

  Acronym                                          Full Name  Code
0      FIC               John E. Fogarty International Center   TW
1    NCATS  National Center for Advancing Translational Sc...   TR


In [7]:
def load_codes(data_path):
    """Load the NIH activity codes and IC codes from ``data_path``.

    Reads the ``Activity Code`` column of ``NIH_activity_codes.csv`` and the
    ``Code`` column of ``NIH_IC_codes.tsv``.

    Returns:
        A tuple ``(act_codes, ic_codes)`` of whitespace-stripped code lists.
    """
    def _unique_stripped(file_name, separator, column):
        # Unique values are taken before stripping, as stored in the file.
        table = pd.read_csv(os.path.join(data_path, file_name), delimiter=separator, header=0)
        return [value.strip() for value in table[column].unique().tolist()]

    act_codes = _unique_stripped('NIH_activity_codes.csv', ',', 'Activity Code')
    ic_codes = _unique_stripped('NIH_IC_codes.tsv', '\t', 'Code')
    return act_codes, ic_codes

def check_grantID_start(grantID, act_codes, ic_codes):
    """Classify what kind of token a grant ID begins with.

    Args:
        grantID: The grant identifier text.
        act_codes: Known activity codes (e.g. ``"R01"``).
        ic_codes: Known two-letter IC codes (e.g. ``"AI"``).

    Returns:
        One of ``"contract"`` (first two characters are digits),
        ``"application_type"`` (only the first character is a digit),
        ``"ic_code"``, ``"activity_code"`` or ``"unknown"``.
    """
    if not grantID:
        return "unknown"
    # Test the characters themselves rather than calling int(): int("1 ")
    # succeeds, which made "1 R01..." look like a contract number.
    if grantID[0:2].isdecimal():
        return "contract"
    if grantID[0].isdecimal():
        return "application_type"
    two_char = grantID[0:2]
    head = grantID[0:3]
    # An activity code may be written with a separator inside it ("R-01"),
    # so widen the window by one and drop the separator.
    if '-' in head:
        three_char = grantID[0:4].replace('-', '')
    elif ' ' in head:
        three_char = grantID[0:4].replace(' ', '')
    else:
        three_char = head
    # The IC code check takes precedence over the activity code check.
    if two_char in ic_codes:
        return "ic_code"
    if three_char in act_codes:
        return "activity_code"
    return "unknown"

def check_grantID_end(grantID):
    """Report how a grant ID ends.

    Returns ``"FY"`` when a dash occurs within the last five characters,
    otherwise ``"project_code"``.
    """
    tail = grantID[-5:]
    return "FY" if '-' in tail else "project_code"

def parse_apptype_start(grantID):
    """Split the leading application type character off a grant ID.

    Handles the forms "1-R01", "1R01" and "1 R01": a dash or space directly
    after the first character is dropped along with it.

    Returns:
        A tuple ``(applTypeCode, remaining_grantID)``.
    """
    applTypeCode = str(grantID[0])
    head = grantID[0:2]
    # Skip two characters when a separator follows, otherwise only one.
    skip = 2 if ('-' in head or ' ' in head) else 1
    return applTypeCode, grantID[skip:]

def parse_actcode_start(grantID):
    """Split the leading activity code off a grant ID.

    Accepts the code written plainly ("U01") or with a dash or space inside
    it ("U-01"), optionally followed by one separator ("U01-", "U-01-",
    "U01 ").

    Returns:
        A tuple ``(activityCode, remaining_grantID)``: the three-character
        code with any internal separator removed, and the text after the code
        and its optional trailing separator.
    """
    head = grantID[0:3]
    # A separator inside the code makes it occupy four characters.
    width = 4 if ('-' in head or ' ' in head) else 3
    activityCode = grantID[0:width].replace('-', '').replace(' ', '')
    remaining_grantID = grantID[width:]
    # Drop a separator only when it immediately follows the code; a dash
    # further along (e.g. "AI-073685") belongs to the next field.
    if remaining_grantID[:1] in ('-', ' '):
        remaining_grantID = remaining_grantID[1:]
    return activityCode, remaining_grantID

def parse_iccode_start(grantID):
    """Split the leading two-character IC code off a grant ID.

    Handles "AI089992-11", "AI-089992-11" and "AI 089992-11".

    Returns:
        A tuple ``(icCode, remaining_grantID)``.
    """
    icCode = grantID[0:2]
    remaining_grantID = grantID[2:]
    # Only a separator directly after the code is dropped; a dash later in
    # the text must not cost the serial number its first digit.
    if remaining_grantID[:1] in ('-', ' '):
        remaining_grantID = remaining_grantID[1:]
    return icCode, remaining_grantID

def parse_serial_start(grantID):
    """Split the leading serial number (up to six characters) off the text.

    Returns:
        A tuple ``(serialNum, remaining_grantID)``.  A dash or space directly
        after the serial number is dropped; when there is no separator
        (e.g. "07368501") nothing is skipped.
    """
    serialNum = grantID[0:6]
    remaining_grantID = grantID[6:]
    if remaining_grantID[:1] in ('-', ' '):
        remaining_grantID = remaining_grantID[1:]
    return serialNum, remaining_grantID

def parse_grantID(grantID, act_codes, ic_codes):
    """Break a grant ID into its component fields.

    Args:
        grantID: The grant identifier text.
        act_codes: Known activity codes, passed to ``check_grantID_start``.
        ic_codes: Known IC codes, passed to ``check_grantID_start``.

    Returns:
        A dict with keys ``grantID``, ``applTypeCode``, ``activityCode``,
        ``icCode``, ``serialNum`` and ``supportYear``.  Any field that cannot
        be determined is the string ``"not found"``.  IDs classified as
        "contract" or "unknown" get ``"not found"`` for every parsed field.
    """
    idstart = check_grantID_start(grantID, act_codes, ic_codes)
    idend = check_grantID_end(grantID)

    supportYear = "not found"
    if idend == "FY":
        # Read the two characters right after the last dash rather than the
        # last two characters of the ID, so a trailing suffix such as "A1" in
        # "-01A1" is not mistaken for the support year.
        year = grantID.rsplit('-', 1)[-1][:2]
        if year:
            supportYear = year

    applTypeCode = "not found"
    activityCode = "not found"
    icCode = "not found"
    serialNum = "not found"

    # Each stage consumes its field from the front and hands the rest on;
    # the starting stage depends on how the ID begins.
    remainder = grantID
    if idstart == "application_type":
        applTypeCode, remainder = parse_apptype_start(remainder)
    if idstart in ("application_type", "activity_code"):
        activityCode, remainder = parse_actcode_start(remainder)
    if idstart in ("application_type", "activity_code", "ic_code"):
        icCode, remainder = parse_iccode_start(remainder)
        serialNum, _ = parse_serial_start(remainder)

    return {"grantID": grantID,
            "applTypeCode": applTypeCode,
            "activityCode": activityCode,
            "icCode": icCode,
            "serialNum": serialNum,
            "supportYear": supportYear}

# Split the test row's funding ID cell into individual IDs and show the first one.
grantlist = parse_array_text(test_row['fundingIDList'])
print(grantlist[0])

AI181960       


In [8]:
# Run the parser over differently formatted grant IDs and print each result.
grantIDList = ["1-R01-AI073685-01","1R01-AI073685-01","1-R01AI073685-01","1-R-01AI073685-01","R01AI073685-01","R01AI073685-01","R01-AI073685-01","AI073685-01","1-R-01-AI073685-01", "AI073685"]
# act_codes and ic_codes are also read as globals by parse_program_funding.
act_codes, ic_codes = load_codes(data_path)
for eachgrant in grantIDList:
    grantObject = parse_grantID(eachgrant, act_codes, ic_codes)
    print(grantObject)
    

{'grantID': '1-R01-AI073685-01', 'applTypeCode': '1', 'activityCode': 'R01', 'icCode': 'AI', 'serialNum': '073685', 'supportYear': '01'}
{'grantID': '1R01-AI073685-01', 'applTypeCode': '1', 'activityCode': 'R01', 'icCode': 'AI', 'serialNum': '073685', 'supportYear': '01'}
{'grantID': '1-R01AI073685-01', 'applTypeCode': '1', 'activityCode': 'R01', 'icCode': 'AI', 'serialNum': '073685', 'supportYear': '01'}
{'grantID': '1-R-01AI073685-01', 'applTypeCode': '1', 'activityCode': 'R01', 'icCode': 'AI', 'serialNum': '073685', 'supportYear': '01'}
{'grantID': 'R01AI073685-01', 'applTypeCode': 'not found', 'activityCode': 'R01', 'icCode': 'AI', 'serialNum': '073685', 'supportYear': '01'}
{'grantID': 'R01AI073685-01', 'applTypeCode': 'not found', 'activityCode': 'R01', 'icCode': 'AI', 'serialNum': '073685', 'supportYear': '01'}
{'grantID': 'R01-AI073685-01', 'applTypeCode': 'not found', 'activityCode': 'R01', 'icCode': 'AI', 'serialNum': '073685', 'supportYear': '01'}
{'grantID': 'AI073685-01', 

### Searching for records related to grant IDs:

To do:
- Pull the datasets associated with the differently formatted grantIDs
- Compare the results to identify the best approach for pulling records based on grantIDs
  - It looks like the combined icCode+project number may yield the most results

In [9]:
def _response_hits(payload):
    """Return the ``hits`` list of a decoded API response, or ``[]`` if absent."""
    hits = payload.get('hits') if isinstance(payload, dict) else None
    return hits if isinstance(hits, list) else []


def search_for_records(grantlist):
    """Query the staging API for records funded by each grant ID.

    Every grant ID is searched twice against ``funding.identifier``: once as
    a wildcard (``*id*``) and once verbatim.  Hits from both queries are
    combined, and grant IDs with no hits at all are printed at the end.

    Args:
        grantlist: Iterable of grant ID strings to search for.

    Returns:
        A de-duplicated DataFrame with columns ``query`` (the grant ID
        searched), ``_id`` (the record id) and ``fundID`` (the matching
        funding identifier).  The columns are present even when nothing
        was found.
    """
    query_url = ("https://api-staging.data.niaid.nih.gov/v1/query"
                 "?&q=funding.identifier:{term}&fields=_id,funding.identifier&size=500")
    resultlist = []
    faillist = []
    for eachgrant in grantlist:
        wildcard = requests.get(query_url.format(term=f"*{eachgrant}*"))
        exact = requests.get(query_url.format(term=eachgrant))
        wildcard_hits = _response_hits(json.loads(wildcard.text))
        # Count the exact query's own hits, not the wildcard query's again.
        exact_hits = _response_hits(json.loads(exact.text))
        all_hits = wildcard_hits + exact_hits
        if not all_hits:
            faillist.append(eachgrant)
            continue
        for eachhit in all_hits:
            tmpid = eachhit['_id']
            funding = eachhit.get('funding')
            if isinstance(funding, list):
                # A record can list several funders; keep only the entries
                # whose identifier contains the searched grant ID.
                for eachfunding in funding:
                    identifier = eachfunding.get('identifier')
                    if identifier and eachgrant in identifier:
                        resultlist.append({"query": eachgrant, "_id": tmpid, "fundID": identifier})
            elif isinstance(funding, dict) and 'identifier' in funding:
                resultlist.append({"query": eachgrant, "_id": tmpid, "fundID": funding['identifier']})
    resultdf = pd.DataFrame(resultlist, columns=["query", "_id", "fundID"])
    # The two queries overlap, so the same record usually appears twice.
    clean_result = resultdf.drop_duplicates(keep="first")
    print(faillist)
    return clean_result

In [10]:
# Search with the same grant written in many formats; search_for_records prints the IDs that returned no hits.
grantIDList = ["1-R01-AI073685-01","1R01-AI073685-01","1-R01AI073685-01","1-R-01AI073685-01","R01AI073685","R01AI073685-01","R01 AI073685-01", "R01 AI07368501","R01-AI073685-01","AI073685-01","1-R-01-AI073685-01", "AI073685"]
clean_result = search_for_records(grantIDList)

['1-R01-AI073685-01', '1R01-AI073685-01', '1-R01AI073685-01', '1-R-01AI073685-01', 'R01 AI073685-01', 'R01 AI07368501', 'R01-AI073685-01', '1-R-01-AI073685-01']


In [11]:
# Show which query strings matched which records and funding identifiers.
print(clean_result)

             query            _id           fundID
0      R01AI073685  ds_e639f19429  1R01AI073685-01
2      R01AI073685  ds_61fc1af3a0  1R01AI073685-01
4      R01AI073685  ds_69b30c3f3f  1R01AI073685-01
5      R01AI073685  ds_5dc6e26d66  1R01AI073685-01
7      R01AI073685  ds_7fb8ffda8b  1R01AI073685-01
9      R01AI073685  ds_e3b3b46a00  1R01AI073685-01
10  R01AI073685-01  ds_e639f19429  1R01AI073685-01
12  R01AI073685-01  ds_61fc1af3a0  1R01AI073685-01
14  R01AI073685-01  ds_69b30c3f3f  1R01AI073685-01
15  R01AI073685-01  ds_5dc6e26d66  1R01AI073685-01
17  R01AI073685-01  ds_7fb8ffda8b  1R01AI073685-01
19  R01AI073685-01  ds_e3b3b46a00  1R01AI073685-01
20     AI073685-01  ds_e639f19429  1R01AI073685-01
22     AI073685-01  ds_61fc1af3a0  1R01AI073685-01
24     AI073685-01  ds_69b30c3f3f  1R01AI073685-01
25     AI073685-01  ds_5dc6e26d66  1R01AI073685-01
27     AI073685-01  ds_7fb8ffda8b  1R01AI073685-01
29     AI073685-01  ds_e3b3b46a00  1R01AI073685-01
30        AI073685  ds_e639f194

In [12]:
## parse the funder ids from the spreadsheet and format into icCode+project number
def parse_program_funding(fundingInfo):
    """Turn a spreadsheet funding cell into a list of search strings.

    Each grant ID in the cell is reduced to IC code + serial number when both
    can be parsed; otherwise the cleaned ID is kept unchanged.

    Relies on the module-level ``act_codes`` and ``ic_codes`` lists.

    Args:
        fundingInfo: Delimited text listing the program's grant IDs.

    Returns:
        A list of strings, one per grant ID, in input order.
    """
    grantlist = []
    for rawgrant in parse_array_text(fundingInfo):
        eachgrant = rawgrant.strip().replace("*", "")
        try:
            grantObject = parse_grantID(eachgrant, act_codes, ic_codes)
        except Exception:
            # Best effort: an ID the parser cannot handle is searched as-is.
            # (Not a bare except, so Ctrl-C still interrupts a long run.)
            grantlist.append(eachgrant)
            continue
        if grantObject['icCode'] != 'not found' and grantObject['serialNum'] != 'not found':
            grantlist.append(grantObject['icCode'] + grantObject['serialNum'])
        else:
            grantlist.append(eachgrant)
    return grantlist

In [13]:
def generate_inclusion_list(row):
    """Write ``<fileName>_records.txt`` listing the records found for a program.

    The row's ``fundingIDList`` is parsed and searched, the result table is
    printed, and one resource URL per unique record id is written to the file
    in the module-level ``correction_path``.  The file is created empty when
    nothing was found.
    """
    out_name = f"{row['fileName']}_records.txt"
    search_terms = parse_program_funding(row['fundingIDList'])
    records = search_for_records(search_terms)
    print(records)
    with open(os.path.join(correction_path, out_name), 'w') as handle:
        # Opening in 'w' mode already leaves an empty file when there are no records.
        if len(records) > 0:
            handle.writelines(
                f'https://data.niaid.nih.gov/resources?id={record_id}\n'
                for record_id in records['_id'].unique().tolist()
            )

In [14]:
# Batch run: write a metadata correction file for every row of test_meta.
# The commented-out lines are earlier spot checks and the (currently disabled)
# record-list generation step.
#print(len(test_meta))
#print(test_meta.iloc[2]['fundingIDList'])
#print(test_meta.head(n=3))
#generate_inclusion_list(test_meta.iloc[2])
#print(test_meta.head(n=2))
test_meta.apply(lambda row: generate_meta_file(correction_path, row), axis=1)
#test_meta.apply(lambda row: generate_inclusion_list(row), axis=1)

0     None
2     None
3     None
4     None
5     None
6     None
7     None
8     None
9     None
10    None
11    None
12    None
13    None
14    None
15    None
16    None
17    None
18    None
19    None
20    None
21    None
22    None
23    None
25    None
27    None
28    None
29    None
30    None
32    None
34    None
35    None
36    None
38    None
39    None
41    None
42    None
43    None
45    None
dtype: object

In [None]:
# Scratch cell: run the wildcard query for the last grant ID in grantIDList and print the raw decoded response.
eachgrant=grantIDList[-1]
r = requests.get(f"https://api-staging.data.niaid.nih.gov/v1/query?&q=funding.identifier:*{eachgrant}*&fields=_id,funding.identifier&size=500")
temp = json.loads(r.text)
print(temp)