In [1]:
import pandas as pd
from ast import literal_eval
from utils import utils, UMLSAPI

# Load the export. Each 'interests' cell is parsed with literal_eval and then
# exploded, so it is expected to hold a Python list literal such as "['a', 'b']".
data = pd.read_csv('data/fnfr_scholarly_api.csv')

# Missing cells are replaced by the empty-list literal '[]' *before* parsing so
# that every value is a string and a single code path suffices. A bare `except:`
# retry is deliberately avoided: it would also swallow unrelated errors
# (malformed literals on the first pass, KeyboardInterrupt, ...).
# The raw text is lower-cased before parsing, so the resulting labels are lower-case.
data['interests'] = (
    data['interests']
    .fillna('[]')
    .map(lambda raw: literal_eval(raw.lower()))
)

# One row per list element; empty lists explode to NaN and are dropped afterwards.
data = data.explode('interests').reset_index(drop=True)
data = data.dropna(subset=['interests'])

# Per project: number of occurrences of each interest within each 'projet'.
# reset_index(name='n') names the count column directly; renaming a 'count'
# column afterwards only works on pandas versions whose value_counts() result
# is called 'count'.
interests_projects = (
    data.groupby('projet')['interests']
    .value_counts()
    .reset_index(name='n')
)
# Cell output: most frequent (project, interest) pairs first.
interests_projects.sort_values(by='n', ascending=False)

Unnamed: 0,projet,interests,n
126,BIOSCAN: Tracing the Patterns of Life on a Cha...,dna barcoding,8
125,BIOSCAN: Tracing the Patterns of Life on a Cha...,biodiversity,8
838,TRIDENT: TRanslational Initiative to DE-risk N...,neuroscience,7
516,Mend the Gap: A Transformative Biomaterials Pl...,neuroscience,5
44,Abundant Intelligences: Expanding Artificial I...,artificial intelligence,4
...,...,...,...
417,Enabling novel cardiac therapies with pluripot...,vascular biology,1
418,Enabling novel cardiac therapies with pluripot...,vascularization,1
427,Inclusive Design for Employment Access (IDEA),work,1
428,Inclusive Design for Employment Access (IDEA),workplace mistreatment,1


In [2]:
# All projects together: number of occurrences of each interest.
# rename_axis + reset_index(name='n') pins the output columns to
# ['interests', 'n'] regardless of how the installed pandas version names the
# value_counts() result (otherwise 'interests' could end up holding the counts).
interests_all = (
    data['interests']
    .value_counts()
    .rename_axis('interests')
    .reset_index(name='n')
)
# Cell output: most frequent interests first.
interests_all.sort_values(by='n', ascending=False)

Unnamed: 0,interests,n
0,neuroscience,15
1,biodiversity,10
2,dna barcoding,8
3,biomaterials,8
4,machine learning,7
...,...,...
463,psychosomatics,1
464,alexithymia,1
465,gene expression,1
466,pathways,1


In [4]:
# UMLS mapping: pass every distinct interest label to the project's
# exact-search helper.
# NOTE(review): the label / vocabulary / CUI lines shown under this cell look
# like they are printed by the helper itself — confirm what it returns.
labels = interests_all['interests'].drop_duplicates().tolist()
umls_mapping = UMLSAPI.queryUMLSAPI_exactSearch(labels)

# Show the helper's return value as the cell result.
umls_mapping

neuroscience MTH C0027910
biodiversity MSH C0282469
dna barcoding MSH C2936547
biomaterials MTH C0005479
machine learning MSH C0376284
tissue engineering MSH C0596171
chemistry MTH C0079107
chemistry MTH C0007996
chemistry MTH C1547978
chemistry MTH C2183231
chemistry MTH C0201682
electrochemistry MTH C0013803
cognition MTH C0009240
artificial intelligence MSH C0003916
genomics MTH C0887950
mental health MTH C0025353
ecology MSH C0013546
energy MTH C1442080
energy MTH C0542479
energy MTH C1547025
energy MTH C0424589
evolution MSH C0015219
public health MTH C2239238
public health MTH C3244304
public health MTH C0699943
public health MTH C0034019
molecular imaging MSH C1537028
genetics MTH C0017398
genetics MTH C0017399
genetics MTH C0314603
genetics MTH C1948182
development MTH C1527148
development MTH C0678723
development MTH C0018271
development MTH C0243107
co2 capture NOT FOUND
bioinformatics MSH C1140694
radiochemistry MSH C0034559
climate change MSH C2718051
bioethics MTH C0005489