# Traitement des données de financements 

In [59]:
#importer les packages
import pandas as pd
import numpy as np
from project.server.main.utils import replace_all,get_scanR_structure,get_id
from project.server.main.cached_data_handler import get_structure, get_person
from project.server.main.features_into_dictionnary import persons, projects, address
from project.server.main.id_from_orcid import orcid_to_idref
from tqdm import tqdm
import pprint as pp
tqdm.pandas()
from project.server.main.my_pickle import load_cache,write_cache
import os
from dotenv import load_dotenv
import requests

load_dotenv()

Authorization = os.getenv('Authorization_access_185.XX')
Authorization_ORCID = os.getenv('Authorization_cluster_BSO_ORCID')
url_cluster = os.getenv('url_cluster')

In [60]:
sources=pd.read_json('sources.json')

In [61]:
source=list(sources.keys())[1]
source

'ANSES'

In [62]:
#cache structures, personnes et orcid avec differentes sources de donnees
cached_data = {}
try:
    cached_data = load_cache(cached_data,f"./DATA/{source}/caches/cached_{source.lower()}_data.pkl")
except:
    write_cache(cached_data,f"./DATA/{source}/caches/cached_{source.lower()}_data.pkl")
    
cached_data_persons = {}
try:
    cached_data_persons = load_cache(cached_data_persons,f"./DATA/{source}/caches/cached_{source.lower()}_data_persons.pkl")
except:
    write_cache(cached_data_persons,f"./DATA/{source}/caches/cached_{source.lower()}_data_persons.pkl")
    
cached_data_orcid = {}
try:
    cached_data_orcid = load_cache(cached_data_orcid,f"./DATA/{source}/caches/cached_{source.lower()}_data_orcid.pkl")
except:
    write_cache(cached_data_orcid,f"./DATA/{source}/caches/cached_{source.lower()}_data_orcid.pkl")

975 data in cache
0 data in cache
0 data in cache


# Données partenaires

In [None]:
nbr_page=int(requests.get('http://185.161.45.213/projects/participations?where={"project_type":"ANSES"}&projection={"modified_at":1}&max_results=500&page=1', headers={"Authorization":Authorization}).json()['hrefs']['last']['href'].split('page=')[1])
list_ids=[]
for i in range(1,nbr_page+1):
    page=requests.get('http://185.161.45.213/projects/participations?where={"project_type":"ANSES"}&projection={"modified_at":1}&max_results=500'+f"&page={i}", headers={"Authorization":Authorization}).json()
    for k in range(len(page['data'])):
        list_ids.append(page['data'][k]['modified_at'])
        
response = requests.get(
    "http://www.data.gouv.fr/api/1/datasets/?organization=5ea7ceda37efc6da5b0b7d37&format=json&q=dos",
    headers={"Accept":"/"},
)
datas = response.json()
date_projects=[data for data in datas['data'][0]['resources'] if data['format']=='json' and 'anr-dos-depuis-2010-projets' in str(data['title']) !=-1 and 'projets.json' in str(data['title']) !=-1 ][0]['last_modified']
date_partners=[data for data in datas['data'][0]['resources'] if data['format']=='json' and 'anr-dos-depuis-2010-projets' in str(data['title']) !=-1 and 'partenaires.json' in str(data['title']) !=-1][0]['last_modified']

In [67]:
response = requests.get(
    f"http://www.data.gouv.fr/api/1/datasets/?organization={sources[source]['id']}",
    headers={"Accept":"/"},
)
datas = response.json()

In [68]:
[data for data in datas['data'][0]['resources'] if sources[source]['keyword_projects1'] in str(data['title']) !=-1 and sources[source]['keyword_projects2'] in str(data['title']) !=-1 ]

[{'checksum': {'type': 'sha1',
   'value': '946ee7c719b9c35e687067f745dd5e1cf099c362'},
  'created_at': '2025-04-17T15:45:14.495000+00:00',
  'description': 'Projets de recherche soutenus par le PNR EST 2006-2024\\\nCe fichier décrit les informations concernant les projets :\n\n* code de convention,\n* acronyme,\n* titre complet du projet en français en anglais,\n* date de début du projet,\n* montant de subvention\n* résumés en français et anglais\n',
  'extras': {'analysis:checksum': '946ee7c719b9c35e687067f745dd5e1cf099c362',
   'analysis:content-length': 1230692,
   'analysis:last-modified-at': '2025-04-17T15:45:14+00:00',
   'analysis:last-modified-detection': 'last-modified-header',
   'analysis:mime-type': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
   'analysis:parsing:finished_at': '2025-04-17T15:45:50.487854+00:00',
   'analysis:parsing:parquet_size': 1700795,
   'analysis:parsing:parquet_url': 'https://object.files.data.gouv.fr/hydra-parquet/hydra-par

In [5]:
# amener les partenaires depuis le site 
if source=='ANR':
    page_partenaires_10 = requests.get(sources[source]['url_partners']).json()
    colonnes_partenaires_10 = page_partenaires_10['columns']
    donnees_partenaires_10 = page_partenaires_10['data']
    df_partenaires=pd.DataFrame(data=donnees_partenaires_10,columns=colonnes_partenaires_10)
elif source=='ANSES':
    df_from_anses=pd.read_excel(sources[source]['url_partners'])
    df=df_from_anses.iloc[1:,:]
    df.columns=list(df_from_anses.iloc[0,:])
    dict_equipe={list(df_from_anses.columns)[k].replace('Équipe 10 ','Équipe 10').replace('Équipe13','Équipe 13'):k for k in range (len(list(df_from_anses.columns))) if list(df_from_anses.columns)[k].find('Équipe')>=0}
    list_df=[]
    number=3
    for n in range(1,len(dict_equipe)+1):
        equipe_n=pd.concat([df.iloc[:,0:3],df.iloc[:,number:number+6]], axis=1)
        list_df.append(equipe_n)
        number+=6
    df_partenaires=pd.concat([list_df[k].dropna(subset=[sources[source]['nom'], sources[source]['prenom'],sources[source]['nom_structure'], sources[source]['nom'], 'Pays'], how='all') for k in range(len(list_df))])
    
elif source=='IRESP':
    df_partenaires1=pd.read_csv(sources[source]['url_partners1'] ,sep=";", encoding='UTF-8')
    df_partenaires2=pd.read_csv(sources[source]['url_partners2'] ,sep=";", encoding='UTF-8')
    df_partenaires=pd.concat([df_partenaires1,df_partenaires2])
elif source=='ADEME':
    df_partenaires=pd.read_csv(sources[source]['url_partners'] ,sep=",", encoding='ISO-8859-1', on_bad_lines='skip')
else:    
    df_partenaires=pd.read_csv(sources[source]['url_partners'] ,sep=";", encoding='ISO-8859-1')

df_partenaires=df_partenaires.reset_index()
del df_partenaires['index']

# Matcher établissement

In [6]:
id_struct=df_partenaires
id_struct[f"{sources[source]['nom_structure']}2"]=id_struct.loc[:,sources[source]['nom_structure']].apply(lambda x: replace_all(str(x).lower().replace(" d e"," d'e").replace(" d a"," d'a").replace(" d i"," d'i").replace(" d o"," d'o").replace(" d u"," d'u").replace(" d y"," d'y").replace(" d h"," d'h").replace(" l e"," l'e").replace(" l a"," l'a").replace(" l i"," l'i").replace(" l o"," l'o").replace(" l u"," l'u").replace(" l y"," l'y").replace(" l h"," l'h")))
id_struct=id_struct.drop_duplicates(subset=[f"{sources[source]['nom_structure']}2"])

In [None]:
import requests
from project.server.main.Pydref import Pydref
from retry import retry
import os
from dotenv import load_dotenv

load_dotenv()

#urls
url='https://affiliation-matcher.staging.dataesr.ovh/match'
url_cluster = os.getenv('url_cluster')

#get the structure id from the structure name
@retry(delay=200, tries=30000)
def get_structure(row,source,cached_data,nom_structure,pays,ville,code_projet,annee_input):
    if row[nom_structure] in list(cached_data.keys()):
        pass
    else:
        url='https://affiliation-matcher.staging.dataesr.ovh/match'
        f= ' '.join([str(row[y]) for y in [x for x in [nom_structure,ville,pays] if x in list(row.keys())]])
        print(f)
        if source=='REG_IDF':
            rnsr=requests.post(url, json= {"type":"rnsr",
                                           "year":str(row[code_projet].split('-')[3]),
                                           "query":f,"verbose":False})
        elif source not in ['REG_IDF','ADEME','SIRANO']:
            rnsr=requests.post(url, json= {"type":"rnsr",
                                           "year":"20"+str(row[code_projet].split('-')[1])[-2:],
                                           "query":f,"verbose":False})
        else:
            print('ok')
            if source=='ADEME':
                annee=row['Date de dÃ©but du projet'][:4]
            elif source=='SIRANO':
                annee=row['annee_de_selection']
            else:
                annee=row[annee_input]
            rnsr=requests.post(url, json= {"type":"rnsr","year":str(annee),"query":f,"verbose":False})
        ror=requests.post(url, json= { "query" : f , "type":"ror"})
        grid=requests.post(url, json= { "query" : f , "type":"grid"})
        result_rnsr=rnsr.json()['results']
        result_ror=ror.json()['results']
        result_grid=grid.json()['results'] 
        print(result_rnsr,result_ror,result_grid)
        if result_rnsr != []:
            cached_data[row[nom_structure]]=result_rnsr
        elif result_rnsr != [] and result_grid != []:
            cached_data[row[nom_structure]]=result_grid
        elif result_rnsr != [] and result_grid == [] and result_ror != []:
            cached_data[row[nom_structure]]=result_ror
        else:
            cached_data[row[nom_structure]]=None

In [None]:
id_struct.progress_apply(lambda row: get_structure(row,source,cached_data,sources[source]['nom_structure'],sources[source]['ville'],sources[source]['pays'],sources[source]['code_projet'],sources[source]['annee']), axis=1) 
write_cache(cached_data,f"./DATA/{source}/caches/cached_{source}_data.pkl")

  1%|          | 1/112 [00:00<00:00, 999.60it/s]


TypeError: too many positional arguments

In [None]:
len(cached_data)

In [None]:
id_struct['id_structure_matcher']=id_struct.loc[:,sources[source]['nom_structure']].apply(lambda x: cached_data[x])

In [None]:
id_struct

In [None]:
id_struct=id_struct.reset_index()
del id_struct['index']
id_struct.to_json(f"./DATA/{source}/df_partenaires.json")

In [None]:
id_struct=pd.read_json(f"./DATA/{source}/df_partenaires.json")
id_struct=id_struct[[sources[source]['nom_structure'],'id_structure_matcher']]
id_struct[f"{sources[source]['nom_structure']}2"]=id_struct.loc[:,sources[source]['nom_structure']].apply(lambda x: replace_all(str(x).lower().replace(" d e"," d'e").replace(" d a"," d'a").replace(" d i"," d'i").replace(" d o"," d'o").replace(" d u"," d'u").replace(" d y"," d'y").replace(" d h"," d'h").replace(" l e"," l'e").replace(" l a"," l'a").replace(" l i"," l'i").replace(" l o"," l'o").replace(" l u"," l'u").replace(" l y"," l'y").replace(" l h"," l'h")))

df_partenaires[f"{sources[source]['nom_structure']}2"]=df_partenaires.loc[:,sources[source]['nom_structure']].apply(lambda x: replace_all(str(x).lower().replace(" d e"," d'e").replace(" d a"," d'a").replace(" d i"," d'i").replace(" d o"," d'o").replace(" d u"," d'u").replace(" d y"," d'y").replace(" d h"," d'h").replace(" l e"," l'e").replace(" l a"," l'a").replace(" l i"," l'i").replace(" l o"," l'o").replace(" l u"," l'u").replace(" l y"," l'y").replace(" l h"," l'h")))
df_partenaires_struct=pd.merge(df_partenaires,id_struct[[f"{sources[source]['nom_structure']}2",'id_structure_matcher']], on=f"{sources[source]['nom_structure']}2", how='left')
df_partenaires_struct

In [None]:
#compléter les données avec scanR
url_scanr='https://storage.gra.cloud.ovh.net/v1/AUTH_32c5d10cb0fe4519b957064a111717e3/scanR/projects.json'
requete_scanR = requests.get(url_scanr)
page_scanR= requete_scanR.json()
df_scanR=pd.DataFrame(page_scanR)
scanR=df_scanR.explode('participants').loc[:,['id','participants']]
scanR=scanR.rename(columns={'id':'id_anr'})
scanR['index']=[x for x in range(len(scanR))]
scanR=scanR.set_index('index')
scanR['id_structure_scanr']=scanR['participants'].apply(lambda x: x.get(str('structure')) if isinstance(x, dict) else None )
scanR['nom_struct']=scanR['participants'].apply(lambda x: get_scanR_structure(x))
del scanR['participants']
scanR_nettoye=scanR.drop_duplicates(subset='nom_struct')
scanR_nettoye[f"{sources[source]['nom_structure']}2"]=scanR_nettoye.loc[:,'nom_struct'].apply(lambda x: replace_all(str(x).lower()))
scanR_nettoye=scanR_nettoye[['id_structure_scanr',f"{sources[source]['nom_structure']}2"]]
scanR_nettoye=scanR_nettoye.drop_duplicates(subset=f"{sources[source]['nom_structure']}2")

In [None]:
df_partenaires_struct=pd.merge(df_partenaires_struct,scanR_nettoye, on=f"{sources[source]['nom_structure']}2", how='left')
df_partenaires_struct

In [None]:
#######fichier avec les identifiants structures rettrouvés à la main par Emmanuel ==> 'code'
scanr_structures=pd.read_excel('scanr_partenaires_non_identifies.xlsx')
scanr_structures[f"{sources[source]['nom_structure']}2"]=scanr_structures.loc[:,'Nom'].apply(lambda x: replace_all(str(x).lower().replace(" d e"," d'e").replace(" d a"," d'a").replace(" d i"," d'i").replace(" d o"," d'o").replace(" d u"," d'u").replace(" d y"," d'y").replace(" d h"," d'h").replace(" l e"," l'e").replace(" l a"," l'a").replace(" l i"," l'i").replace(" l o"," l'o").replace(" l u"," l'u").replace(" l y"," l'y").replace(" l h"," l'h")))
scanr_structures=scanr_structures[[f"{sources[source]['nom_structure']}2",'code']]
scanr_structures=scanr_structures.dropna().drop_duplicates(subset=f"{sources[source]['nom_structure']}2")
df_partenaires_complet=pd.merge(df_partenaires_struct,scanr_structures, on=f"{sources[source]['nom_structure']}2", how='left')
df_partenaires_complet

In [None]:
if 'finess' in list(df_partenaires.columns):
    finess_siret=pd.read_json("finess_siret-siege.json")
    df_partenaires_complet=pd.merge(df_partenaires_complet,finess_siret,how='left', on='finess')

In [None]:
df_partenaires_complet[sources[source]['identifiants_preferes_structure']]

In [None]:
df_partenaires_complet.columns

In [None]:
df_partenaires_complet['id_structure']=df_partenaires_complet.apply(lambda row: get_id(row,sources[source]['identifiants_preferes_structure']), axis=1)
df_partenaires_complet

In [None]:
#df_partenaires_complet[pd.isna(df_partenaires_complet.id_structure)]
df_partenaires_complet.loc[(pd.isna(df_partenaires_complet['id_structure']))|(str(df_partenaires_complet['id_structure'])=='None')|(str(df_partenaires_complet['id_structure'])=='nan')]

In [None]:
#df_partenaires_complet.to_excel(f"./DATA/{source}/df_partenaires_id_structures.xlsx")
df_partenaires_complet.to_json(f"./DATA/{source}/df_partners_id_structures.json")

In [None]:
df_partenaires_complet=pd.read_json(f"./DATA/{source}/df_partners_id_structures.json")

In [None]:
########récupération des structures sans identifiants pour les donner à Emmanuel
identifiants_a_remplir=df_partenaires_complet.loc[(pd.isna(df_partenaires_complet['id_structure']))|(str(df_partenaires_complet['id_structure'])=='None')|(str(df_partenaires_complet['id_structure'])=='nan')]
identifiants_a_remplir
identifiants_a_remplir=identifiants_a_remplir.drop_duplicates(subset=f"{sources[source]['nom_structure']}2")
identifiants_a_remplir=identifiants_a_remplir.reset_index()
del identifiants_a_remplir['index']

In [None]:
identifiants_a_remplir

In [None]:
if sources[source]['ville'] in list(identifiants_a_remplir.columns) and sources[source]['pays'] in list(identifiants_a_remplir.columns) and sources[source]['adresse'] not in list(identifiants_a_remplir.columns):
    identifiants_a_remplir=identifiants_a_remplir[[sources[source]['nom_structure'],sources[source]['ville'],sources[source]['pays']]]
elif sources[source]['ville'] in list(identifiants_a_remplir.columns) and sources[source]['pays'] in list(identifiants_a_remplir.columns) and sources[source]['adresse'] in list(identifiants_a_remplir.columns):
    identifiants_a_remplir=identifiants_a_remplir[[sources[source]['nom_structure'],sources[source]['adresse'],sources[source]['ville'],sources[source]['pays']]]
elif sources[source]['region'] in list(identifiants_a_remplir.columns):
    identifiants_a_remplir=identifiants_a_remplir[[sources[source]['nom_structure'],sources[source]['region']]]
elif sources[source]['ville'] in list(identifiants_a_remplir.columns) and sources[source]['pays'] not in list(identifiants_a_remplir.columns):
    identifiants_a_remplir=identifiants_a_remplir[[sources[source]['nom_structure'],sources[source]['ville']]]
identifiants_a_remplir

In [None]:
identifiants_a_remplir.to_excel(f"./structures_manquantes/partenaires_non_identifies_{source}.xlsx", index=False)

# Matcher des chercheurs

In [None]:
df_partenaires=pd.read_json(f"./DATA/{source}/df_partners_id_structures.json")

In [None]:
if len([x for x in ['nom', 'prenom'] if x in list(sources[source].keys())])==2:
    df_partenaires['id_personne']=df_partenaires.progress_apply(lambda row: get_person(row, cached_data_persons,sources[source]['nom'],sources[source]['prenom']), axis=1)
    #df_partenaires.to_excel(f"./DATA/{source}/df_partners_id_personne.xlsx")
    df_partenaires.to_json(f"./DATA/{source}/df_partners_id_person.json")

In [None]:
len(cached_data_persons)

In [None]:
write_cache(cached_data_persons,f"./DATA/{source}/caches/cached_{source.lower()}_data_persons.pkl")

In [None]:
df_partenaires['id_personne']=df_partenaires.progress_apply(lambda row: get_person(row, cached_data_persons,sources[source]['nom'],sources[source]['prenom']), axis=1)

In [None]:
df_partenaires.to_json(f"./DATA/{source}/df_partners_id_person.json")

In [None]:
if sources[source]['id_ORCID'] in list(df_partenaires.columns):
    df_partenaires=pd.read_json(f"./DATA/{source}/df_partners_id_person.json")
    df_partenaires['idref_ORCID']=df_partenaires.progress_apply(lambda row: orcid_to_idref(row,cached_data_orcid,sources[source]['id_ORCID'],Authorization_ORCID), axis=1)
    #write_cache(cached_data_orcid,f"./DATA/{source}/caches/cached_{source.lower()}_data_orcid.pkl")
    #df_partenaires.to_excel(f"./DATA/{source}/df_partners_id_person_ORCID.xlsx")
    df_partenaires.to_json(f"./DATA/{source}/df_partners_id_person_ORCID.json")

In [None]:
len(cached_data_orcid)

# ENVOI DES PROJETS SUR SCANR

In [None]:
if len(sources[source]['identifiants_preferes_personne'])>=2:
    df_partenaires=pd.read_json(f"./DATA/{source}/df_partners_id_person_ORCID.json")
elif len(sources[source]['identifiants_preferes_personne'])==1:
    df_partenaires=pd.read_json(f"./DATA/{source}/df_partners_id_person.json")
else:
    df_partenaires=pd.read_json(f"./DATA/{source}/df_partners_id_structures.json")

In [None]:
df_partenaires.loc[df_partenaires.id_structure.apply(lambda x :isinstance(x,list)),'id_structure']=df_partenaires.loc[df_partenaires.id_structure.apply(lambda x :isinstance(x,list)),'id_structure'].apply(lambda y: y[0])

In [None]:
df_partenaires

In [None]:
if len([x for x in ['nom', 'prenom'] if x in list(sources[source].keys())])==2:
    df_partenaires['id_person']=df_partenaires.apply(lambda row: get_id(row,sources[source]['identifiants_preferes_personne']), axis=1)
    df_partenaires['persons']=df_partenaires.progress_apply(lambda row: persons(row,sources[source]['prenom'],sources[source]['nom']) ,axis=1)
else:
    df_partenaires['persons']=np.nan

In [None]:
if source != 'SIRANO':
    df_partenaires=df_partenaires.groupby([sources[source]['code_projet']]).agg({'persons': lambda x: [ y for y in x.tolist() if pd.isna(y)==False]}, dropna=False).reset_index()
else:
    df_projets=df_partenaires.groupby([sources[source]['code_projet'], sources[source]['annee'], sources[source]['acronyme'],sources[source]['titre'],sources[source]['budget']], dropna=False).agg({'persons': lambda x: [ y for y in x.tolist() if pd.isna(y)==False]}, dropna=False)

In [None]:
# amener les projets depuis le site 
if source=='ANR':
    page_projets_10 = requests.get(sources[source]['url_projects']).json()
    colonnes_projets_10 = page_projets_10['columns']
    donnees_projets_10 = page_projets_10['data']
    df_projets=pd.DataFrame(data=donnees_projets_10,columns=colonnes_projets_10)
elif source=='IRESP':
    df_projets1=pd.read_csv(sources[source]['url_projects1'] ,sep=";", encoding='UTF-8')
    df_projets2=pd.read_csv(sources[source]['url_projects2'] ,sep=";", encoding='UTF-8')
    df_projets=pd.concat([df_projets1,df_projets2])
    df_projets.loc[pd.isna(df_projets['Titre_du_projet_FR']),'Titre_du_projet_FR']=df_projets.loc[pd.isna(df_projets['Titre_du_projet_FR']),'Titre_du_projet']
elif source!='SIRANO':
    df_projets=pd.read_csv(sources[source]['url_projects'] ,sep=";", encoding='ISO-8859-1')

df_projets=df_projets.reset_index()
del df_projets['index']

In [None]:
df_projets

In [None]:
if source!='SIRANO':
    df_projets=pd.merge(df_projets,df_partenaires,on=sources[source]['code_projet'], how='left')
else :
    df_projets['id']=df_partenaires.apply(lambda row: f"{row[sources[source]['code_projet']]}-{row[sources[source]['annee']]}-{row[sources[source]['acronyme']]}" , axis=1)
    del df_projets['code_projet']
    sources[source]['code_projet']='id'

df_projets['type']=source
df_projets['name']=df_projets.progress_apply(lambda row: projects(row,sources[source]['titre_fr'],sources[source]['titre_en']) ,axis=1)
df_projets['description']=df_projets.progress_apply(lambda row: projects(row,sources[source]['resume_fr'],sources[source]['resume_en']) ,axis=1)
df_projets.loc[:,sources[source]['budget']]=df_projets.loc[:,sources[source]['budget']].apply(lambda x : float(str(x).replace('.0','').replace('.00','').replace(' ','').replace(',','.').replace('€','').replace('\x80','')))
df_projets=df_projets.rename(columns={sources[source]['annee']:'year',sources[source]['acronyme']:'acronym',
                                      sources[source]['budget']:'budget_financed',sources[source]['code_projet']:'id'})
df_projets=df_projets[['id','type','name','description','acronym','year','budget_financed','persons']]

In [None]:
dict_row=df_projets.iloc[0,:].to_dict()
dict_row2={k:v for k,v in list(dict_row.items()) if ((str(v)!='nan')&(str(v)!='NaN')&(str(v)!='None')&(str(v)!='x')&(str(v)!='[]'))}
dict_row2

In [None]:
#envoi
err=[]
for i,row in df_projets.iterrows():
    dict_row=row.to_dict()
    dict_row2={k:v for k,v in list(dict_row.items()) if ((str(v)!='nan')&(str(v)!='NaN')&(str(v)!='None')&(str(v)!='x')&(str(v)!='[]'))}
    try:
       r=requests.post('http://185.161.45.213/projects/projects', json = dict_row2, headers={"Authorization":Authorization})
       res= r.json()
       if res.get('status')=='ERR':
           err.append(res)
           if res.get('error').get('code')!=422:
               print(err)
               pp.pprint(err)
    except Exception as e:
        pp.pprint(e)

In [None]:
pd.Series([x.get('issues').get('id') for x in err]).drop_duplicates().tolist() 

pour mettre à jour

In [None]:
###AAATTTENTION CHANGER A CHAQUE FOIS LE TYPE
nbr_page=int(requests.get('http://185.161.45.213/projects/projects?where={"type":"IRESP"}&projection={"id":1}&max_results=500&page=1', headers={"Authorization":Authorization}).json()['hrefs']['last']['href'].split('page=')[1])

list_ids=[]
for i in range(1,nbr_page+1):
    print("page",i)
    page=requests.get('http://185.161.45.213/projects/projects?where={"type":"IRESP"}&projection={"id":1}&max_results=500'+f"&page={i}", headers={"Authorization":Authorization}).json()
    for k in range(len(page['data'])):
        print("k",k)
        list_ids.append(page['data'][k]['id'])
    
projets_a_ajouter=[x for x in list(df_projets['id']) if x not in list_ids]

projets_a_retirer=[x for x in list_ids if x not in list(df_projets['id'])]

df_projets = df_projets[df_projets['id'].apply(lambda x: x in projets_a_ajouter)]

In [None]:
len(projets_a_ajouter)

In [None]:
len(df_projets[df_projets['id'].apply(lambda x: x in projets_a_ajouter)])

In [None]:
len(df_projets[df_projets['id'].apply(lambda x: x in projets_a_retirer)])

In [None]:
dict_row=df_projets.iloc[0,:].to_dict()
dict_row2={k:v for k,v in list(dict_row.items()) if ((str(v)!='nan')&(str(v)!='NaN')&(str(v)!='None')&(str(v)!='x')&(str(v)!='[]'))}
dict_row2

In [None]:
err=[]
for i,row in df_projets.iterrows():
    dict_row=row.to_dict()
    dict_row2={k:v for k,v in list(dict_row.items()) if ((str(v)!='nan')&(str(v)!='NaN')&(str(v)!='None')&(str(v)!='x')&(str(v)!='[]'))}
    try:
       r=requests.post('http://185.161.45.213/projects/projects', json = dict_row2, headers={"Authorization":Authorization})
       res= r.json()
       if res.get('status')=='ERR':
           err.append(res)
           if res.get('error').get('code')!=422:
               print(err)
               pp.pprint(err)
    except Exception as e:
        pp.pprint(e)

In [None]:
len(projets_a_ajouter)

Modifications

In [None]:
nbr_page=int(requests.get('http://185.161.45.213/projects/projects?where={"type":"ANR"}&projection={"id":1,"year":1}&max_results=500'+f"&page={1}", headers={"Authorization":Authorization}).json()['hrefs']['last']['href'].split('page=')[1])
nbr_page

In [None]:
list_ids=[]
for i in range(1,nbr_page+1):
    print("page",i)
    page=requests.get('http://185.161.45.213/projects/projects?where={%22type%22:%22ANR%22}&projection={%22id%22:1,%22year%22:1}&max_results=500'+f"&page={i}", headers={"Authorization":Authorization}).json()
    for k in range(len(page['data'])):
        list_ids.append(page['data'][k]['year'])

In [None]:
max(list_ids)

In [None]:
df_projets = df_projets[df_projets['id'].apply(lambda x: x in list_ids)]

In [None]:
"""
en principe, il faut que je mette à jour tous les projets 
qui ne sont pas clos avec les titres résumés et persons, mais 
peut etre qu'on peut mettre à jour automatiquement les projets
des 3 dernières années ? 

"""

err=[]
for id in df_projets.iterrows() :
    url = f"http://185.161.45.213/projects/projects/{id}"
    project=requests.get(url, headers={'Authorization': Authorization}).json()
    head = {"Authorization": Authorization, "If-Match": project['etag'], "Content-Type": "application/json"}

    r = requests.patch(url, json = {"year": 2023}, headers=head)
    res= r.json()
    if res.get('status')=='ERR':
        err.append(res)
        if res.get('error').get('code')!=422:
            print(err)
            pp.pprint(err)


# ENVOI DES PARTENAIRES SUR SCANR

In [None]:
df_partenaires=pd.read_json(f"./DATA/{source}/df_partners_id_structures.json")

In [None]:
df_partenaires.columns

In [None]:
### ATTENTION, vérifier que les projets sirano sont dans des structures françaises
if source=='IRESP':
    df_partenaires[sources[source]['pays']]=df_partenaires.loc[:,sources[source]['ville']].apply(lambda x: x.split('(')[1].replace(')','') if x.find('(')>=0 else 'France')
    df_partenaires.loc[:,sources[source]['ville']]=df_partenaires.loc[:,sources[source]['ville']].apply(lambda x: x.split('(')[0] if x.find('(')>=0 else x)
  
df_partenaires['address']=df_partenaires.apply(lambda row: address(row,sources[source]['pays'],sources[source]['ville'],source), axis=1)
df_partenaires.loc[:,'id_structure']=df_partenaires.loc[:,'id_structure'].apply(lambda x: x[0] if isinstance(x,list) else x )

In [None]:
df_partenaires

In [None]:
if source in ['ANSES','SIRANO']:
    df_partenaires['id']=df_partenaires.apply(lambda row: f"{row[sources[source]['code_projet']]}-{row[str(sources[source]['nom_structure'])+'2']}-{row[sources[source]['nom']]}-{row[sources[source]['prenom']]}" , axis=1)
if source =='REG_IDF':
    df_partenaires['id']=df_partenaires.apply(lambda row: f"{row[sources[source]['code_projet']]}-{row[str(sources[source]['nom_structure'])+'2']}-{row['entite_role']}" , axis=1)
df_partenaires['address']=df_partenaires.apply(lambda row: address(row,sources[source]['pays'],sources[source]['ville'],source), axis=1)
df_partenaires=df_partenaires.rename(columns={sources[source]['nom_structure']: 'name', sources[source]['code_projet']: 'project_id', 'id_structure':'participant_id','Projet.Partenaire.Code_Decision_ANR':'id'})
df_partenaires=df_partenaires[['name','id','project_id','participant_id','address']]
df_partenaires['project_type']=source
df_partenaires['participant_id']=df_partenaires.loc[:,'participant_id'].apply(lambda x: str(x[0]).replace('.0','') if isinstance(x,list) else str(x).split(';')[0].replace('.0',''))
df_partenaires=df_partenaires[['id','project_id', 'project_type', 'participant_id', 'name','address']]
df_partenaires['name'] = df_partenaires['name'].astype(str)
df_partenaires

In [None]:
df_partenaires[df_partenaires.duplicated(subset=['id'])]

In [None]:
dict_row=df_partenaires.iloc[1,:].to_dict()
dict_row2={k:v for k,v in list(dict_row.items()) if ((str(v)!='nan')&(str(v)!='NaN')&(str(v)!='None')&(str(v)!='x'))}
dict_row2

In [None]:
len(df_partenaires)

In [None]:
err=[]
for i,row in df_partenaires.iterrows():
    dict_row=row.to_dict()
    dict_row2={k:v for k,v in list(dict_row.items()) if ((str(v)!='nan')&(str(v)!='NaN')&(str(v)!='None')&(str(v)!='x'))}
    try:
       r=requests.post('http://185.161.45.213/projects/participations', json = dict_row2, headers={"Authorization":Authorization})
       res= r.json()
       if res.get('status')=='ERR':
           print(i)
           err.append(res)
           if res.get('error').get('code')!=422:
               print(err)
               pp.pprint(err)
    except Exception as e:
        pp.pprint(e)

Mise à jour

In [None]:
nbr_page=int(requests.get('http://185.161.45.213/projects/participations?where={"project_type":"ANSES"}&projection={"id":1}&max_results=500&page=1', headers={"Authorization":Authorization}).json()['hrefs']['last']['href'].split('page=')[1])

list_ids=[]
for i in range(1,nbr_page+1):
    print("page",i)
    page=requests.get('http://185.161.45.213/projects/participations?where={"project_type":"ANSES"}&projection={"id":1}&max_results=500'+f"&page={i}", headers={"Authorization":Authorization}).json()
    for k in range(len(page['data'])):
        print("k",k)
        list_ids.append(page['data'][k]['id'])
    
projets_a_ajouter=[x for x in list(df_partenaires['id'].drop_duplicates()) if x not in list(pd.Series(list_ids).drop_duplicates())]

projets_a_retirer=[x for x in list_ids if x not in list(df_partenaires['id'])]

df_partenaires = df_partenaires[df_partenaires['id'].apply(lambda x: x in projets_a_ajouter)]


In [None]:
len(projets_a_ajouter)

In [None]:
print(df_partenaires)

In [None]:
len(df_partenaires)

In [None]:
err=[]
for i,row in df_partenaires.iterrows():
    dict_row=row.to_dict()
    dict_row2={k:v for k,v in list(dict_row.items()) if ((str(v)!='nan')&(str(v)!='NaN')&(str(v)!='None')&(str(v)!='x'))}
    try:
       r=requests.post('http://185.161.45.213/projects/participations', json = dict_row2, headers={"Authorization":Authorization})
       res= r.json()
       if res.get('status')=='ERR':
           print(i)
           err.append(res)
           if res.get('error').get('code')!=422:
               print(err)
               pp.pprint(err)
    except Exception as e:
        pp.pprint(e)

Modifications

In [None]:
nbr_page=int(requests.get('http://185.161.45.213/projects/participations?where={"project_type":"ANR","participant_id":{"$exists":false}}&projection={"id":1}&max_results=500'+f"&page={1}", headers={"Authorization":Authorization}).json()['hrefs']['last']['href'].split('page=')[1])

list_ids=[]
for i in range(1,nbr_page+1):
    print("page",i)
    page=requests.get('http://185.161.45.213/projects/participations?where={"project_type":"ANR","participant_id":{"$exists":false}}&projection={"id":1}&max_results=500'+f"&page={i}", headers={"Authorization":Authorization}).json()
    for k in range(len(page['data'])):
        print("k",k)
        list_ids.append(page['data'][k]['id'])

df_partenaires = df_partenaires[df_partenaires['id'].apply(lambda x: x in list_ids)]

In [None]:
len(list_ids)

In [None]:
df_partenaires=df_partenaires[(df_partenaires.participant_id!='x') & (df_partenaires.participant_id!='None')]

In [None]:
df_partenaires

In [None]:
err=[]
for i,row in df_partenaires.iterrows() :
    print(i)
    id=row['id']
    url = f"http://185.161.45.213/projects/participations/{id}"
    project=requests.get(url, headers={'Authorization': Authorization}).json()
    head = {"Authorization": Authorization, "If-Match": project['etag'], "Content-Type": "application/json"}

    r = requests.patch(url, json = {"participant_id": row['participant_id']}, headers=head)
    res= r.json()
    if res.get('status')=='ERR':
        err.append(res)
        if res.get('error').get('code')!=422:
            print(err)
            pp.pprint(err)

In [None]:
pd.Series([x.get('issues').get('id')[25:] for x in err]).drop_duplicates().tolist()

Faire un nouveau fichier avec toutes les structures manquantes

In [None]:
struct_manq=pd.read_excel('scanr_partenaires_non_identifies.xlsx')

In [None]:
nom1=struct_manq[struct_manq.New=='IRESP'].drop_duplicates(subset='Nom').Nom

In [None]:
iresp_actuel=pd.read_excel('./missing_ids_structures/partenaires_non_identifies_IRESP.xlsx')

In [None]:
nom2=iresp_actuel['Nom_equipe']

In [None]:
pd.Series(list(nom1)+list(nom2))

In [None]:
pd.Series(list(nom1)+list(nom2)).drop_duplicates()