## Funder mapping

1. Fetch all funders with a name but no id
2. Query the ROR API with each funder name to get a candidate ROR id and label (a potential mapping to review)
3. Identify ones with format `XXXX NIH HHS` for near-automatic confirmation

In [1]:
import json
import os
import re

import pandas as pd
import requests

In [2]:
# Resolve the working directory and the data folder the review files are written to.
script_path = os.getcwd()
data_path = os.path.join(script_path,'data')
# Create the data folder up front so the later to_csv calls cannot fail on a
# missing directory after the long ROR lookup has already run.
os.makedirs(data_path, exist_ok=True)
print(script_path)

C:\Users\gtsueng\Anaconda3\envs\nde\nde_misc\funder_mapping


In [2]:
# Facet query against the NDE staging API: records that have a funder name but
# no funder identifier, returning only the facet counts of funder names
# (size=0, up to 1000 facet terms).
# The query is passed via `params` so requests encodes spaces and other special
# characters, and a timeout keeps the cell from hanging indefinitely.
r = requests.get(
    'https://api-staging.data.niaid.nih.gov/v1/query',
    params={
        'q': '_exists_:funding.funder.name AND -_exists_:funding.funder.identifier',
        'extra_filter': '_exists_:funding.funder.name.raw',
        'facets': 'funding.funder.name.raw',
        'filters': '',
        'size': 0,
        'facet_size': 1000,
    },
    timeout=60,
)
# Fail loudly on an HTTP error instead of parsing an error payload as results.
r.raise_for_status()

results = json.loads(r.text)
print(results.keys())

dict_keys(['took', 'total', 'max_score', 'facets'])


In [54]:
# Turn the funder-name facet terms into a table with one row per funder name
# ('term') and the number of records using it ('count').
name_facet = results['facets']['funding.funder.name.raw']
funder_freq_dict = name_facet['terms']
no_id_funder_df = pd.DataFrame(data=funder_freq_dict)
print(no_id_funder_df.head(2))

   count           term
0  47287  NIGMS NIH HHS
1  32199    NCI NIH HHS


In [25]:
# Probe the ROR organization search with a single funder name (row 998) to see
# the shape of the response. Passing the name via `params` lets requests
# URL-encode it; the timeout avoids an indefinite hang.
ror = requests.get(
    'https://api.ror.org/v2/organizations',
    params={'query': no_id_funder_df.iloc[998]["term"]},
    timeout=30,
)
ror_res = json.loads(ror.text)
print(ror_res.keys())

dict_keys(['number_of_results', 'time_taken', 'items', 'meta'])


In [57]:
# Look at the first ROR hit: its identifier and its list of name entries.
first_res = ror_res['items'][0]
for field in ('id', 'names'):
    print(first_res[field])

def get_label(ror_names, funder_term):
    """Choose the name to report for a ROR record matched against ``funder_term``.

    Selection order (independent of the position of entries in ``ror_names``):

    1. a name whose value is exactly ``funder_term`` (any type);
    2. the first 'label' entry that is English or carries no 'lang' key;
    3. the 'ror_display' name, so non-English records still get a label;
    4. the first 'label' entry of any language.

    Parameters
    ----------
    ror_names : list of dict
        Name entries of one ROR record. Each entry is expected to have a
        'value', a list of 'types' and usually a 'lang' code.
    funder_term : str
        The funder name that was searched for.

    Returns
    -------
    str or None
        The chosen name, or None when ``ror_names`` is empty or has no usable entry.
    """
    if not ror_names:
        return None

    # Pass 1: an exact match must win even if a label is listed before it.
    for name_entry in ror_names:
        if 'value' in name_entry and name_entry['value'] == funder_term:
            return name_entry['value']

    # Pass 2: preferred label, remembering fallbacks along the way.
    display_name = None
    first_label = None
    for name_entry in ror_names:
        types = name_entry.get('types') or []
        value = name_entry.get('value')
        if 'ror_display' in types and display_name is None:
            display_name = value
        if 'label' not in types:
            continue
        if first_label is None:
            first_label = value
        if 'lang' not in name_entry or name_entry['lang'] == 'en':
            return value

    # No English label: fall back rather than returning nothing.
    return display_name if display_name is not None else first_label

def get_ror_match(funder_term):
    """Look up a funder name in the ROR organization search and return the top hit.

    Parameters
    ----------
    funder_term : str
        Funder name to search for.

    Returns
    -------
    tuple
        ``(ror_id, ror_label)`` for the first search result. On failure
        ``ror_id`` is ``"Error"`` and ``ror_label`` names the stage that failed:
        ``"request failed"``, ``"request parse issue"``, ``"No items"`` or
        ``"No id or name"``. ``ror_label`` may be None when the record has no
        usable name.

    Only expected failure types are caught, so a KeyboardInterrupt during a
    long batch run still stops the run.
    """
    try:
        # `params` makes requests URL-encode the term (spaces, '&', '#', ...);
        # the timeout keeps one stalled request from blocking the whole batch.
        ror = requests.get(
            'https://api.ror.org/v2/organizations',
            params={'query': funder_term},
            timeout=30,
        )
    except requests.exceptions.RequestException:
        return "Error", "request failed"

    try:
        ror_res = json.loads(ror.text)
    except ValueError:
        # json.JSONDecodeError is a ValueError subclass
        return "Error", "request parse issue"

    try:
        first_res = ror_res['items'][0]
    except (KeyError, IndexError, TypeError):
        return "Error", "No items"

    try:
        ror_id = first_res['id']
        ror_label = get_label(first_res['names'], funder_term)
    except (KeyError, TypeError, AttributeError):
        return "Error", "No id or name"

    return ror_id, ror_label

https://ror.org/05svhj534
[{'lang': None, 'types': ['acronym'], 'value': 'DFF'}, {'lang': 'en', 'types': ['alias'], 'value': 'Danish Council for Independent Research'}, {'lang': 'da', 'types': ['ror_display', 'label'], 'value': 'Danmarks Frie Forskningsfond'}, {'lang': 'en', 'types': ['label'], 'value': 'Independent Research Fund Denmark'}]


In [32]:
# Show the two least frequent funder names in the table.
print(no_id_funder_df.tail(2))

     count                                     term
998      4  Danish Council for Independent Research
999      4               Department of Conservation


In [39]:
# Spot-check the matcher on the funder name in row 998.
ror_id, ror_label = get_ror_match(no_id_funder_df['term'].iloc[998])
print(ror_id, ror_label)

https://ror.org/05svhj534 Danish Council for Independent Research


In [48]:
%%time

testdf = no_id_funder_df.head(n=10).copy()

testdf[['ror_id','ror_label']] = testdf.apply(lambda x: get_ror_match(x['term']),result_type='expand', axis=1)
print(testdf.head(n=2))

   count                      term                     ror_id  \
0  32280             NIGMS NIH HHS  https://ror.org/04q48ey07   
1  28118  The Leon Levy Foundation  https://ror.org/033hnyq61   

                                        ror_label  
0  National Institute of General Medical Sciences  
1                            Leon Levy Foundation  
CPU times: total: 672 ms
Wall time: 9.62 s


In [58]:
%%time
no_id_funder_df[['ror_id','ror_label']] = no_id_funder_df.apply(lambda x: get_ror_match(x['term']),result_type='expand', axis=1)
print(no_id_funder_df.head(n=2))

   count           term                     ror_id  \
0  47287  NIGMS NIH HHS  https://ror.org/04q48ey07   
1  32199    NCI NIH HHS  https://ror.org/02z7csx58   

                                        ror_label  
0  National Institute of General Medical Sciences  
1                     Hawaiian Historical Society  
CPU times: total: 1min 3s
Wall time: 15min 36s


In [62]:
# Save the candidate matches as a tab-separated file for manual review.
raw_review_path = os.path.join(data_path, 'to_review.tsv')
no_id_funder_df.to_csv(raw_review_path, sep='\t', header=True)

In [9]:
## Precompute likely correct matches
# Reload the saved candidate matches; the first column of the file is the row index.
saved_review_path = os.path.join(data_path, 'to_review.tsv')
guess_id_funder_df = pd.read_csv(saved_review_path, sep='\t', header=0, index_col=0)
print(guess_id_funder_df.head(2))

   count           term                     ror_id  \
0  47287  NIGMS NIH HHS  https://ror.org/04q48ey07   
1  32199    NCI NIH HHS  https://ror.org/02z7csx58   

                                        ror_label  
0  National Institute of General Medical Sciences  
1                     Hawaiian Historical Society  


In [5]:
def prioritize_funder(row):
    """Rank a funder row for review based on its name.

    Parameters
    ----------
    row : mapping
        A row (or dict) with a 'term' entry holding the funder name.

    Returns
    -------
    int
        1 if the name contains 'NIH' as a whole word, 2 if it contains 'HHS'
        as a whole word, otherwise 3.

    Whole-word matching is used so that names such as 'NIHR' are not ranked
    as NIH funders. Matching is case-sensitive.
    """
    term = str(row['term'])
    if re.search(r'\bNIH\b', term):
        return 1
    if re.search(r'\bHHS\b', term):
        return 2
    return 3


def check_exact_match(row):
    """Tell whether the funder name and the ROR label of a row are identical.

    Parameters
    ----------
    row : mapping
        A row (or dict) with 'term' and 'ror_label' entries.

    Returns
    -------
    bool
        True when ``row['term'] == row['ror_label']``, otherwise False.
    """
    return True if row['term'] == row['ror_label'] else False


In [10]:
# Add a review priority for every row (see prioritize_funder).
guess_id_funder_df['priority'] = guess_id_funder_df.apply(prioritize_funder, axis=1)
print(guess_id_funder_df.head(2))

   count           term                     ror_id  \
0  47287  NIGMS NIH HHS  https://ror.org/04q48ey07   
1  32199    NCI NIH HHS  https://ror.org/02z7csx58   

                                        ror_label  priority  
0  National Institute of General Medical Sciences         1  
1                     Hawaiian Historical Society         1  


In [11]:
# Flag rows whose funder name is identical to the ROR label.
guess_id_funder_df['exact_match?'] = guess_id_funder_df.apply(check_exact_match, axis=1)
print(guess_id_funder_df.head(2))

   count           term                     ror_id  \
0  47287  NIGMS NIH HHS  https://ror.org/04q48ey07   
1  32199    NCI NIH HHS  https://ror.org/02z7csx58   

                                        ror_label  priority  exact_match?  
0  National Institute of General Medical Sciences         1         False  
1                     Hawaiian Historical Society         1         False  


In [12]:
# Write the annotated table back to the same review file (overwrites it).
annotated_review_path = os.path.join(data_path, 'to_review.tsv')
guess_id_funder_df.to_csv(annotated_review_path, sep='\t', header=True)