In [1]:
import pandas as pd
import requests
from tempfile import mkdtemp
from zipfile import ZipFile
import os
import json
import shutil

In [2]:
SCANR_DUMP_URL = 'https://scanr-data.s3.gra.io.cloud.ovh.net/production/organizations.jsonl.gz'

In [3]:
def normalize(x):
    return x.lower().replace('é', 'e').replace('è', 'e').strip()

In [4]:
def get_last_ror_dump_url():
    try:
        ROR_URL = 'https://zenodo.org/api/records/?communities=ror-data&sort=mostrecent'
        response = requests.get(url=ROR_URL).json()
        ror_dump_url = response['hits']['hits'][0]['files'][-1]['links']['self']
        print(f'Last ROR dump url found: {ror_dump_url}')
    except:
        ror_dump_url = 'https://zenodo.org/api/files/25d4f93f-6854-4dd4-9954-173197e7fad7/v1.1-2022-06-16-ror-data.zip'
        print(f'ROR dump url detection failed, using {ror_dump_url} instead')
    return ror_dump_url
ROR_DUMP_URL = get_last_ror_dump_url()
CHUNK_SIZE = 128
def download_ror_data() -> list:
    print('download ROR')
    ror_downloaded_file = 'ror_data_dump.zip'
    ror_unzipped_folder = mkdtemp()
    response = requests.get(url=ROR_DUMP_URL, stream=True)
    with open(file=ror_downloaded_file, mode='wb') as file:
        for chunk in response.iter_content(chunk_size=CHUNK_SIZE):
            file.write(chunk)
    with ZipFile(file=ror_downloaded_file, mode='r') as file:
        file.extractall(ror_unzipped_folder)
    for data_file in os.listdir(ror_unzipped_folder):
        if data_file.endswith('.json'):
            with open(f'{ror_unzipped_folder}/{data_file}', 'r') as file:
                data = json.load(file)
    os.remove(path=ror_downloaded_file)
    shutil.rmtree(path=ror_unzipped_folder)
    return data

ror_data = download_ror_data()

grid_to_ror = {}
for r in ror_data:
    grid = (r['external_ids'].get('GRID', {}).get('all'))
    if grid is None:
        continue
    if len(grid) in [11, 12, 13]:
        grid_to_ror[grid] = r['id'].split('/')[-1]
    else:
        print(grid)


Last ROR dump url found: https://zenodo.org/api/files/d72c6d64-be22-4040-ac02-6ce5451aaca1/v1.23-2023-04-12-ror-data.zip
download ROR


In [5]:
df_paysage = pd.read_json('paysage.jsonl', lines=True, dtype = {'siren': str, 'siret':str, 'idref':str, 'ed':str})
df_paysage['name_normalized'] = df_paysage['name'].apply(lambda x:normalize(x))

In [6]:
df_paysage[df_paysage.paysage=='U8a0v']

Unnamed: 0,name,city,country,grid,ror,wikidata,paysage,siret,siren,idref,rnsr,ed,name_normalized
2090,Université de Lille,Lille,France,,02kzqn938,,U8a0v,13002975400012,130029754,,,,universite de lille


In [7]:
for i, row in df_paysage.iterrows():
    if isinstance(row.grid, str) and not(isinstance(row.ror, str)):
        print(row.grid, row.ror, grid_to_ror.get(row.grid))

In [8]:
df_scanr = pd.read_json(SCANR_DUMP_URL, lines=True)

scanr_data = df_scanr.to_dict(orient='records')
df_scanr['scanr_id'] = df_scanr['id']
df_scanr = df_scanr.set_index('id')

rnsr_actif = []
for e in scanr_data:
    to_keep = False
    if e['status'] != 'active':
        continue
    current_rnsr = None
    for idf in e.get('externalIds', []):
        if idf.get('type') == 'rnsr':
            to_keep = True
            current_rnsr = idf['id']
            e['rnsr'] = current_rnsr
    if to_keep:
        rnsr_actif.append(e)
len(rnsr_actif)

4610

In [9]:
def get_main(x):
    for e in x:
        if e.get('main') == True:
            return e
    return None

def get_tutelle_info_scanr(idf):
    has_rnsr = False
    all_data = df_scanr[df_scanr.index==idf].to_dict(orient='records')
    if len(all_data) == 0:
        return None
    assert(len(all_data) == 1)
    data = all_data[0]
    elt = {'scanr_id': data['scanr_id'], 'scanr_label': data['label']['default'], 
           'scanr_city': get_main(data['address'])['city'], 'scanr_country': get_main(data['address'])['country']}
    for ext in data.get('externalIds', []):
        if ext.get('type') == 'rnsr':
            has_rnsr = True
        for f in ['ror', 'grid', 'siren', 'rnsr', 'sirene', 'idref']:
            if ext.get('type') in f:
                if f == 'sirene':
                    elt['siren'] = ext['id'][0:9]
                elif f == 'idref':
                    elt[f] = ext['id'].replace('idref', '')
                else:
                    elt[f] = ext['id']
    if has_rnsr:
        return None
    if data['status'] != 'active':
        return None
    if elt.get('grid') and not elt.get('ror'):
        grid = elt['grid']
        if grid in grid_to_ror:
            elt['ror'] = grid_to_ror[grid]
        
    return elt

In [10]:
tutelles_dict = {}
for e in rnsr_actif:
    for i in e['institutions']:
        if not i['structure']:
            continue
        if i['structure'] not in tutelles_dict:
            tutelles_dict[i['structure']] = get_tutelle_info_scanr(i['structure'])
            if tutelles_dict[i['structure']] is None:
                to_skip = True
            else:
                tutelles_dict[i['structure']]['linked_rnsr'] = []
        if (tutelles_dict[i['structure']]) and e['rnsr'] not in tutelles_dict[i['structure']]['linked_rnsr']:
            tutelles_dict[i['structure']]['linked_rnsr'].append(e['rnsr'])

In [11]:
tutelles_dict['130018351']

{'scanr_id': '130018351',
 'scanr_label': 'Université de Bordeaux',
 'scanr_city': 'Bordeaux',
 'scanr_country': 'France',
 'siren': '130018351',
 'grid': 'grid.10839.33',
 'idref': '175206562',
 'linked_rnsr': ['200710776M',
  '200310864A',
  '200711913Y',
  '200711920F',
  '200711923J',
  '199512085M',
  '199512079F',
  '199512080G',
  '199512081H',
  '199511980Y',
  '199911781P',
  '199911792B',
  '199911799J',
  '200311838J',
  '200411870P',
  '200711901K',
  '200511663K',
  '201119610X',
  '201019619L',
  '200715383V',
  '200717393E',
  '200317684N',
  '201119416L',
  '201119422T',
  '201119489R',
  '200715430W',
  '196817561G',
  '201119452A',
  '201119462L',
  '200716483R',
  '201119465P',
  '201119532M',
  '201119540W',
  '201119471W',
  '201622032H',
  '199017451S',
  '201722366S',
  '199611800Y',
  '201420655V',
  '201019175D',
  '201121939D',
  '201622180U',
  '201119424V',
  '201622182W',
  '201119479E',
  '197511801R',
  '200319908F',
  '200819977J',
  '200711887V',
  '201

In [12]:
df_ew = pd.read_csv('missing_tutelles.csv')
corresp_missing = {}
for row in df_ew.itertuples():
    if not isinstance(row.Paysage, str):
        print(row)
    corresp_missing[row.scanr_id] = row.Paysage
corresp_missing['130018351'] = '90I54'
corresp_missing['130026222'] = 'MihlE'
corresp_missing['130029952'] = 'TWBzp'

Pandas(Index=27, scanr_id='783753643', scanr_label='KWS MOMONT RECHERCHE', scanr_city='Mons-en-Pévèle', scanr_country='France', siren=783753643.0, grid=nan, idref=nan, ror=nan, Paysage=nan, _10='N\'a rien a faire dans le RNSR. Il s\'agit d\'une entreprise qui pointe sur un labo qui correspond à cette entreprise qui est la filiale recherche d\'une entreprise du même nom (sans "recherche")')


In [13]:
def enrich_with_paysage(elt):
    enrich = None
    
    current_id = elt['scanr_id']
    if current_id in corresp_missing:
        paysage_id = corresp_missing[current_id]
        df_current = df_paysage[df_paysage['paysage'] == paysage_id]
        if(len(df_current) != 1):
            print(current_id, paysage_id)
            print(df_current)
            #assert(1==0)
        else:
            enrich = df_current.to_dict(orient='records')[0]
    
    elt['name_normalized'] = normalize(elt['scanr_label'])
    
    if enrich is None:
        for f in ['grid', 'ror', 'siret', 'siren', 'idref', 'name_normalized']:
            if elt.get(f):
                current_id = elt[f]
                df_current = df_paysage[df_paysage[f] == current_id]
                if len(df_current) == 1:
                    #print(f)
                    enrich = df_current.to_dict(orient='records')[0]
                    break
                if len(df_current) > 1:
                    df_current2 = df_current[df_current.city == elt['scanr_city']]
                    if len(df_current2) == 1:
                        enrich = df_current2.to_dict(orient='records')[0]
                        break
        
    
    if enrich:
        elt['paysage_label'] = enrich['name']
        elt['paysage_city'] = enrich['city']
        elt['paysage_country'] = enrich['country']
        for f in ['rnsr', 'paysage', 'siret', 'siren', 'wikidata', 'idref', 'ror', 'grid', 'ed']:
            if enrich.get(f) and enrich[f] != 'None':
                if f in elt and elt[f] != enrich[f]:
                    #print(f, elt[f], enrich[f])
                    pass
                elt[f] = enrich[f]
    else:
        #print('----')
        #print('not found in paysage')
        #print("'"+elt['scanr_id']+"',")
        #print('-----')
        pass
    for f in elt.copy():
        if elt[f] == None or elt[f] == 'None':
            del elt[f]
    return elt

In [14]:
df_tutelles = pd.DataFrame([enrich_with_paysage(k) for k in tutelles_dict.values() if k])
df_tutelles

783753643 nan
Empty DataFrame
Columns: [name, city, country, grid, ror, wikidata, paysage, siret, siren, idref, rnsr, ed, name_normalized]
Index: []


Unnamed: 0,scanr_id,scanr_label,scanr_city,scanr_country,siren,grid,idref,ror,linked_rnsr,name_normalized,paysage_label,paysage_city,paysage_country,paysage,siret,wikidata,ed,rnsr
0,180089013,Centre national de la recherche scientifique F...,Paris,France,180089013,grid.4444.0,02636817X,02feahw73,"[199921733G, 200617701U, 200412238P, 200610674...",centre national de la recherche scientifique f...,Centre national de la recherche scientifique,Paris 16e,France,n2X5f,18008901303720,Q280413,,
1,193101524,Institut National des Sciences Appliquées Toul...,Toulouse,France,193101524,grid.461574.5,026388766,01h8pf755,"[201123658X, 200711888W, 199113242M, 199914342...",institut national des sciences appliquees toul...,Institut national des sciences appliquées de T...,Toulouse,France,dj88d,19310152400018,Q858979,,
2,110044013,Ministère de l'enseignement supérieur et de la...,Paris,France,110044013,grid.425729.f,232648964,03sjk9a61,[201722583C],ministere de l'enseignement superieur et de la...,Ministère de l'Enseignement supérieur et de la...,Paris 5e,France,991Uk,11004401300040,Q2726949,,
3,197517170,Pantheon-Sorbonne University Université Paris ...,Paris,France,197517170,grid.10988.38,027361802,002t25c44,"[200617701U, 199912442H, 200212721Y, 200212725...",pantheon-sorbonne university universite paris ...,Université Paris 1 - Panthéon Sorbonne,Paris 5e,France,6G2TU,19751717000019,Q999763,,
4,197534597,Ecole normale supérieure Paris,Paris,France,197534597,grid.5607.4,031738419,05a0dhs15,"[200617701U, 200510681T, 199912465H, 200710695...",ecole normale superieure paris,École normale supérieure PSL,Paris 5e,France,8618D,19753459700012,Q83259,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
380,892684945,INSTITUT NATURA E TEORIA PIRENEUS,Surba,France,892684945,,,,[201924216U],institut natura e teoria pireneus,Institut Natura e Teoria ※ Pirenèus,Surba,France,hfoxY,89268494500017,,,
381,784280745,Institut supérieur d'électronique de Paris,Paris,France,784280745,grid.466360.1,249214202,00yw34h52,[200524219H],institut superieur d'electronique de paris,Institut supérieur d'électronique de Paris,Paris 6e,France,VAP7f,78428074500026,Q3152556,,
382,0141432F,Ecole supérieure d'arts & médias,Caen,France,200028132,grid.466407.7,174791534,00c62es15,[202224239F],ecole superieure d'arts & medias,École supérieure d'arts et médias de Caen - Ch...,Caen,France,HqgFy,20002813200013,Q3578397,,
383,200027407,ECOLE SUPERIEURE D ART ET DE DESIGN D ORLEANS,Orléans,France,200027407,,,02kbwgp44,[201624288K],ecole superieure d art et de design d orleans,École supérieure d’art et de design d’Orléans,Orléans,France,tqTbz,20002740700010,Q3578394,,


In [15]:
df_tutelles[df_tutelles.scanr_id == '326118502']

Unnamed: 0,scanr_id,scanr_label,scanr_city,scanr_country,siren,grid,idref,ror,linked_rnsr,name_normalized,paysage_label,paysage_city,paysage_country,paysage,siret,wikidata,ed,rnsr
283,326118502,PIERRE FABRE MEDICAMENT,Lavaur,France,662006170,,,04hdhz511,"[201420847D, 200920004J, 199922774N]",pierre fabre medicament,Pierre Fabre,Castres,France,AQwHL,66200617000291,Q626597,,


In [16]:
df_tutelles['linked_rnsr'] = df_tutelles['linked_rnsr'].apply(lambda x:'|'.join(x))

In [17]:
df_tutelles[['paysage', 'paysage_label', 'paysage_city', 'ror', 'idref', 'grid', 'siren', 'siret', 'wikidata','scanr_id', 'linked_rnsr']].to_csv('tutelles.csv', index=False)

In [None]:
miss = []
for m in missing_paysage:
    elt = get_tutelle_info_scanr(m)
    miss.append(elt)

In [None]:
pd.DataFrame(miss).to_csv('missing_paysage.csv', index=False)

In [None]:
pd.DataFrame(miss)

In [None]:
enrich_with_paysage(tutelles_dict['130018351'])

In [None]:
df_paysage[df_paysage.siren=='130029952']

In [None]:
tutelles_dict['352636922']