# Traitement des données de financements 

In [1]:
#importer les packages
import pandas as pd
import numpy as np
from project.server.main.utils import replace_all, get_scanR_structure, get_id, strip_outer_quotes, clean_budget
from project.server.main.cached_data_handler import get_structure, get_person
from project.server.main.features_into_dictionnary import persons, projects, address
from project.server.main.id_from_orcid import orcid_to_idref
from tqdm import tqdm
import pprint as pp
tqdm.pandas()
from project.server.main.my_pickle import load_cache,write_cache
import os
from dotenv import load_dotenv
import requests

# Load the variables of a local .env file (python-dotenv) before reading them below.
load_dotenv()

# API credentials and cluster URL come from the environment; os.getenv returns None when a variable is unset.
Authorization = os.getenv('Authorization_access_185.XX')
Authorization_ORCID = os.getenv('Authorization_cluster_BSO_ORCID')
url_cluster = os.getenv('url_cluster')

In [2]:
# Read the per-source configuration; dict() over the DataFrame gives one pandas Series per column label.
sources=dict(pd.read_json('sources.json'))

In [3]:
# Drop the configuration entries whose value is a missing float (NaN), so that
# each source only keeps the fields that are actually defined for it.
for source in list(sources):
    sources[source] = {
        field: value
        for field, value in sources[source].items()
        if not isinstance(value, float) or not pd.isna(value)
    }

In [4]:
# Select the funding source to process by its position among the configured sources.
# NOTE(review): index 3 is positional and therefore depends on the column order of sources.json.
source=list(sources.keys())[3]
source

'SIRANO'

In [5]:
# Caches for structures, persons and ORCID look-ups of the selected source.
def _load_or_init_cache(cache_path):
    """Return the cache stored at `cache_path`; create an empty cache file if loading fails.

    Only `Exception` subclasses are caught: interrupting the kernel
    (KeyboardInterrupt) while loading no longer replaces an existing cache
    file with an empty one, as the previous bare `except:` did.
    """
    cache = {}
    try:
        cache = load_cache(cache, cache_path)
    except Exception:
        # Presumably the file does not exist yet: initialise it with an empty cache.
        write_cache(cache, cache_path)
    return cache


_cache_prefix = f"./DATA/{source}/caches/cached_{source.lower()}_data"
cached_data = _load_or_init_cache(f"{_cache_prefix}.pkl")
cached_data_persons = _load_or_init_cache(f"{_cache_prefix}_persons.pkl")
cached_data_orcid = _load_or_init_cache(f"{_cache_prefix}_orcid.pkl")

114 data in cache
2057 data in cache
0 data in cache


# Données partenaires

In [None]:
# Decide whether `source` needs refreshing: compare the last modification date
# published by the data provider with the newest `modified_at` held by the API.

def _latest_modified_at(collection, type_field):
    """Return the newest `modified_at` stored in the API for the current `source`.

    collection: API collection name ('participations' or 'projects').
    type_field: name of the field holding the source type in that collection.
    Returns None when the API holds no record for the source.
    """
    # Literal JSON braces are doubled so that the f-string only interpolates
    # `type_field` and `source`; un-doubled, they were parsed as replacement
    # fields and raised "Invalid format specifier".
    base_url = (
        f'http://185.161.45.213/projects/{collection}'
        f'?where={{"{type_field}":"{source}"}}'
        '&projection={"modified_at":1}&max_results=500'
    )
    headers = {"Authorization": Authorization}
    first_page = requests.get(f"{base_url}&page=1", headers=headers).json()
    nbr_page = int(first_page['hrefs']['last']['href'].split('page=')[1])
    modified_dates = []
    for i in range(1, nbr_page + 1):
        page = requests.get(f"{base_url}&page={i}", headers=headers).json()
        modified_dates.extend(item['modified_at'] for item in page['data'])
    return max(modified_dates, default=None)


# Publication dates of the provider files. They stay None for SIRANO, for which
# no catalogue is queried (the names used to be undefined in that case).
date_projects = None
date_partners = None
if source == 'REG_IDF':
    # NOTE(review): "/" is not a valid media range; "*/*" was presumably intended — confirm.
    response = requests.get(
        "https://data.iledefrance.fr/api/explore/v2.1/catalog/datasets/dim_map_projets_finances",
        headers={"Accept":"/"},
    )
    datas = response.json()
    date_projects=datas['metas']['default']['modified']

    response = requests.get(
        "https://data.iledefrance.fr/api/explore/v2.1/catalog/datasets/dim_map_projets_finances_entites_partenariat",
        headers={"Accept":"/"},
    )
    datas = response.json()
    date_partners=datas['metas']['default']['modified']
elif source != 'SIRANO':
    response = requests.get(
        f"http://www.data.gouv.fr/api/1/datasets/?organization={sources[source]['id']}&format=json&q=dos",
        headers={"Accept":"/"},
    )
    datas = response.json()
    resources = datas['data'][0]['resources']
    # The former trailing `!= -1` was a leftover from str.find(); it only
    # compared the title with -1 and never changed the result.
    date_projects=[data for data in resources if sources[source]['keyword_projects1'] in str(data['title']) and sources[source]['keyword_projects2'] in str(data['title'])][0]['last_modified']
    # NOTE(review): the original filter tested 'keywords_partners1' twice; a second keyword may have been intended.
    date_partners=[data for data in resources if sources[source]['keywords_partners1'] in str(data['title'])][0]['last_modified']

max_date_partners = _latest_modified_at('participations', 'project_type')
max_date_projects = _latest_modified_at('projects', 'type')

# True => an update is needed.
if date_projects is None or date_partners is None:
    print(f"No provider publication date available for {source}: cannot tell whether an update is needed")
else:
    # No record in the API yet means that everything has to be sent.
    print(
        max_date_partners is None
        or max_date_projects is None
        or max_date_partners<date_partners
        or max_date_projects<date_projects
    )

In [None]:
# Fetch the partner table from the provider; the format depends on the source.
if source=='ANR':
    # JSON payload exposing separate 'columns' and 'data' entries.
    page_partenaires_10 = requests.get(sources[source]['url_partners']).json()
    colonnes_partenaires_10 = page_partenaires_10['columns']
    donnees_partenaires_10 = page_partenaires_10['data']
    df_partners=pd.DataFrame(data=donnees_partenaires_10,columns=colonnes_partenaires_10)
elif source=='ANSES':
    df_from_anses=pd.read_excel(sources[source]['url_partners'])
    df_partners=df_from_anses.applymap(strip_outer_quotes)
    # Year = "20" + last two characters of the second '-'-separated part of the project code.
    df_partners['annee']=df_partners.apply(lambda row: "20"+str(row[sources[source]['code_projet']].split('-')[1])[-2:], axis=1)
elif source=='IRESP':
    # Two CSV files are concatenated for this source.
    df_partners1=pd.read_csv(sources[source]['url_partners1'] ,sep=";", encoding='UTF-8')
    df_partners2=pd.read_csv(sources[source]['url_partners2'] ,sep=";", encoding='UTF-8')
    df_partners=pd.concat([df_partners1,df_partners2])
else:    
    df_partners=pd.read_csv(sources[source]['url_partners'] ,sep=";", encoding='ISO-8859-1')
# Renumber the rows from 0 and discard the former index.
df_partners=df_partners.reset_index()
del df_partners['index']

# Matcher établissement

In [None]:
# NOTE(review): `id_struct` is an alias of `df_partners` here, so the column added on the next line is also added to `df_partners`.
id_struct=df_partners
# Normalised structure name (column suffixed with "2"): lower-cased, apostrophe restored after an isolated "d"/"l", then passed through replace_all.
id_struct[f"{sources[source]['nom_structure']}2"]=id_struct.loc[:,sources[source]['nom_structure']].apply(lambda x: replace_all(str(x).lower().replace(" d e"," d'e").replace(" d a"," d'a").replace(" d i"," d'i").replace(" d o"," d'o").replace(" d u"," d'u").replace(" d y"," d'y").replace(" d h"," d'h").replace(" l e"," l'e").replace(" l a"," l'a").replace(" l i"," l'i").replace(" l o"," l'o").replace(" l u"," l'u").replace(" l y"," l'y").replace(" l h"," l'h")))
# Keep one row per normalised name so that each structure is matched once.
id_struct=id_struct.drop_duplicates(subset=[f"{sources[source]['nom_structure']}2"])

In [None]:
# Look up a structure for every distinct partner; get_structure receives
# `cached_data` and presumably stores its results there (the cache is read back below).
id_struct.progress_apply(lambda row: get_structure(row,source,cached_data,sources[source]['nom_structure'],sources[source]['ville'],sources[source]['pays'],sources[source]['code_projet'],sources[source]['annee']), axis=1) 
# Persist under the lower-cased file name that the cache-loading cell reads;
# the former `cached_{source}_data.pkl` was written but never reloaded.
write_cache(cached_data,f"./DATA/{source}/caches/cached_{source.lower()}_data.pkl")

In [None]:
# Number of entries currently held in the structure cache.
len(cached_data)

In [None]:
# Attach to each row the value stored in the cache under its raw structure name.
# NOTE(review): indexing with cached_data[x] raises KeyError for any name absent from the cache — confirm that get_structure stores one entry per raw name.
id_struct['id_structure_matcher']=id_struct.loc[:,sources[source]['nom_structure']].apply(lambda x: cached_data[x])

In [None]:
# Display the matched structures.
id_struct

In [None]:
# Renumber the rows from 0 (the former index is discarded) and save the
# matched structures to disk.
id_struct = id_struct.reset_index().drop(columns='index')
id_struct.to_json(f"./DATA/{source}/df_partners.json")

In [None]:
def _normalize_structure_name(name):
    """Return the normalised form of a structure name used as join key.

    The name is lower-cased, the apostrophe is restored after an isolated
    "d" or "l" followed by a vowel, "y" or "h" (" d e" -> " d'e", ...), and
    the result is passed through `replace_all`. The replacements run in the
    same order as the original chained `.replace` calls.
    """
    text = str(name).lower()
    for article in ("d", "l"):
        for letter in "eaiouyh":
            text = text.replace(f" {article} {letter}", f" {article}'{letter}")
    return replace_all(text)


_name_col = sources[source]['nom_structure']
_key_col = f"{_name_col}2"

# Reload the matched structures; .copy() avoids assigning a column on a slice.
id_struct=pd.read_json(f"./DATA/{source}/df_partners.json")
id_struct=id_struct[[_name_col,'id_structure_matcher']].copy()
id_struct[_key_col]=id_struct[_name_col].apply(_normalize_structure_name)

# Same normalisation on the full partner table, then left-join the matched ids on it.
df_partners[_key_col]=df_partners[_name_col].apply(_normalize_structure_name)
df_partners_struct=pd.merge(df_partners,id_struct[[_key_col,'id_structure_matcher']], on=_key_col, how='left')
df_partners_struct

In [None]:
# Complete the data with scanR: build a table that maps a normalised
# structure name to the structure identifier found in the scanR projects dump.
url_scanr='https://storage.gra.cloud.ovh.net/v1/AUTH_32c5d10cb0fe4519b957064a111717e3/scanR/projects.json'
requete_scanR = requests.get(url_scanr)
page_scanR = requete_scanR.json()
df_scanR = pd.DataFrame(page_scanR)

# One row per (project, participant), numbered from 0 through an 'index' column.
scanR = df_scanR.explode('participants')[['id','participants']].rename(columns={'id':'id_anr'})
scanR['index'] = range(len(scanR))
scanR = scanR.set_index('index')

# Participants that are not dictionaries yield no structure identifier.
scanR['id_structure_scanr'] = scanR['participants'].apply(
    lambda participant: participant.get('structure') if isinstance(participant, dict) else None
)
scanR['nom_struct'] = scanR['participants'].apply(get_scanR_structure)
scanR = scanR.drop(columns='participants')

# Keep one row per structure name, then one row per normalised name.
normalized_col = f"{sources[source]['nom_structure']}2"
scanR_nettoye = scanR.drop_duplicates(subset='nom_struct').assign(
    **{normalized_col: lambda frame: frame['nom_struct'].apply(lambda nom: replace_all(str(nom).lower()))}
)
scanR_nettoye = scanR_nettoye[['id_structure_scanr', normalized_col]].drop_duplicates(subset=normalized_col)

In [None]:
# Left-join the scanR identifiers on the normalised structure name, then display the result.
df_partners_struct=pd.merge(df_partners_struct,scanR_nettoye, on=f"{sources[source]['nom_structure']}2", how='left')
df_partners_struct

In [None]:
# File of structure identifiers found by hand (by Emmanuel); the identifier is in column 'code'.
scanr_structures=pd.read_excel('scanr_partenaires_non_identifies.xlsx')
# Normalise column 'Nom' the same way as the partner names so that both tables share a join key.
scanr_structures[f"{sources[source]['nom_structure']}2"]=scanr_structures.loc[:,'Nom'].apply(lambda x: replace_all(str(x).lower().replace(" d e"," d'e").replace(" d a"," d'a").replace(" d i"," d'i").replace(" d o"," d'o").replace(" d u"," d'u").replace(" d y"," d'y").replace(" d h"," d'h").replace(" l e"," l'e").replace(" l a"," l'a").replace(" l i"," l'i").replace(" l o"," l'o").replace(" l u"," l'u").replace(" l y"," l'y").replace(" l h"," l'h")))
scanr_structures=scanr_structures[[f"{sources[source]['nom_structure']}2",'code']]
# Drop rows without a code and keep one row per normalised name before the left join.
scanr_structures=scanr_structures.dropna().drop_duplicates(subset=f"{sources[source]['nom_structure']}2")
df_partners_complet=pd.merge(df_partners_struct,scanr_structures, on=f"{sources[source]['nom_structure']}2", how='left')
df_partners_complet

In [None]:
# When the partners carry a FINESS number, add the SIREN (first nine
# characters of the SIRET) taken from the FINESS -> SIRET look-up file.
if 'finess' in df_partners.columns:
    finess_siret = pd.read_csv(f"./DATA/{source}/finess_siret-siege.csv", sep= ";")[['finess','siret']]
    finess_siret.loc[:,'siren'] = finess_siret.loc[:,'siret'].apply(lambda siret: None if pd.isna(siret) else str(siret)[:9])
    finess_siret.loc[:,'finess'] = finess_siret.loc[:,'finess'].apply(lambda finess: None if pd.isna(finess) else str(finess))
    # Incomplete rows are dropped and a single row is kept per FINESS number.
    finess_siret = finess_siret.dropna().drop_duplicates(subset=['finess'])
    # Both sides of the join hold FINESS numbers as strings (missing values as None).
    df_partners_complet.loc[:,'finess'] = df_partners_complet.loc[:,'finess'].apply(lambda finess: None if pd.isna(finess) else str(finess))
    df_partners_complet = df_partners_complet.merge(finess_siret[['finess','siren']], how='left', on='finess')

In [None]:
# Clean the SIRET column with clean_budget and store it as text; missing values become None.
if 'entite_SIRET' in df_partners.columns:
    df_partners_complet['entite_SIRET'] = df_partners_complet['entite_SIRET'].apply(
        lambda siret: None if pd.isna(siret) else str(clean_budget(siret))
    )

In [None]:
# Compute one structure identifier per row with get_id, from the source's list of preferred identifiers (presumably in preference order — confirm in get_id).
df_partners_complet['id_structure']=df_partners_complet.apply(lambda row: get_id(row,sources[source]['identifiants_preferes_structure']), axis=1)
df_partners_complet

In [None]:
# Rows still lacking a structure identifier: real missing values, or the
# literal strings 'None' / 'nan'.
# `str(series)` renders the whole Series as one string, so the former
# `str(...)=='None'` tests were always False; the test is now element-wise.
df_partners_complet.loc[
    df_partners_complet['id_structure'].isna()
    | df_partners_complet['id_structure'].astype(str).isin(['None', 'nan'])
]

In [None]:
# Save the partners with their structure identifiers (the Excel export below is disabled).
#df_partners_complet.to_excel(f"./DATA/{source}/df_partners_id_structures.xlsx")
df_partners_complet.to_json(f"./DATA/{source}/df_partners_id_structures.json")

In [None]:
# Reload the saved partners so that the following cells can start from the file.
df_partners_complet=pd.read_json(f"./DATA/{source}/df_partners_id_structures.json")

In [None]:
# Collect the structures without identifier, to hand them over for manual
# identification (by Emmanuel).
# `str(series)` renders the whole Series as one string, so the former
# `str(...)=='None'` / `=='nan'` tests never matched; the test is now element-wise.
_id_manquant = (
    df_partners_complet['id_structure'].isna()
    | df_partners_complet['id_structure'].astype(str).isin(['None', 'nan'])
)
# One row per normalised structure name, renumbered from 0.
identifiants_a_remplir = (
    df_partners_complet.loc[_id_manquant]
    .drop_duplicates(subset=f"{sources[source]['nom_structure']}2")
    .reset_index(drop=True)
)

In [None]:
# Keep only the columns useful to identify a structure by hand: its name plus
# whatever location information the source provides.
colonnes_disponibles = list(identifiants_a_remplir.columns)
config_source = sources[source]
if config_source['ville'] in colonnes_disponibles and config_source['pays'] in colonnes_disponibles:
    # City and country are available; add the street address when it exists too.
    if config_source['adresse'] in colonnes_disponibles:
        identifiants_a_remplir = identifiants_a_remplir[[
            config_source['nom_structure'], config_source['adresse'],
            config_source['ville'], config_source['pays'],
        ]]
    else:
        identifiants_a_remplir = identifiants_a_remplir[[
            config_source['nom_structure'], config_source['ville'], config_source['pays'],
        ]]
elif config_source['region'] in colonnes_disponibles:
    identifiants_a_remplir = identifiants_a_remplir[[config_source['nom_structure'], config_source['region']]]
elif config_source['ville'] in colonnes_disponibles:
    # Reached only when the country column is absent.
    identifiants_a_remplir = identifiants_a_remplir[[config_source['nom_structure'], config_source['ville']]]
identifiants_a_remplir

In [None]:
# Export the unidentified structures of this source to an Excel file, without the index column.
identifiants_a_remplir.to_excel(f"./missing_ids_structures/partenaires_non_identifies_{source}.xlsx", index=False)

# Matcher des chercheurs

In [None]:
# Start the person matching from the partners saved with their structure identifiers.
df_partners=pd.read_json(f"./DATA/{source}/df_partners_id_structures.json")

In [None]:
# List the available columns.
df_partners.columns

In [None]:
# Match researchers only when the source configuration names both a last-name
# and a first-name column; otherwise the person id column is left empty.
if all(cle in list(sources[source].keys()) for cle in ('nom', 'prenom')):
    def _match_person(row):
        """Look up one partner row with get_person, sharing the persons cache."""
        return get_person(row, cached_data_persons, sources[source]['nom'], sources[source]['prenom'])

    df_partners['id_personne'] = df_partners.progress_apply(_match_person, axis=1)
    #df_partners.to_excel(f"./DATA/{source}/df_partners_id_personne.xlsx")
    df_partners.to_json(f"./DATA/{source}/df_partners_id_person.json")
    len(cached_data_persons)
    write_cache(cached_data_persons,f"./DATA/{source}/caches/cached_{source.lower()}_data_persons.pkl")
    # Second pass over the same rows, run after the cache has been written.
    df_partners['id_personne'] = df_partners.progress_apply(_match_person, axis=1)
else:
    df_partners['id_personne'] = None

In [None]:
# Save the partners with their person identifier column.
df_partners.to_json(f"./DATA/{source}/df_partners_id_person.json")

In [None]:
# When the partner table has the source's ORCID column, resolve each row through orcid_to_idref and save the result.
# NOTE(review): sources[source]['id_ORCID'] raises KeyError when the key is absent from the source configuration.
if sources[source]['id_ORCID'] in list(df_partners.columns):
    df_partners=pd.read_json(f"./DATA/{source}/df_partners_id_person.json")
    df_partners['idref_ORCID']=df_partners.progress_apply(lambda row: orcid_to_idref(row,cached_data_orcid,sources[source]['id_ORCID'],Authorization_ORCID), axis=1)
    # Persist the ORCID cache passed to orcid_to_idref above.
    write_cache(cached_data_orcid,f"./DATA/{source}/caches/cached_{source.lower()}_data_orcid.pkl")
    #df_partners.to_excel(f"./DATA/{source}/df_partners_id_person_ORCID.xlsx")
    df_partners.to_json(f"./DATA/{source}/df_partners_id_person_ORCID.json")

In [None]:
# Number of entries currently held in the ORCID cache.
len(cached_data_orcid)

In [None]:
# Display the partner table.
df_partners

# ENVOI DES PROJETS SUR SCANR

In [None]:
# Reload the most complete partner file available: the number of preferred
# person identifiers tells which matching steps were run for this source.
nb_identifiants_personne = len(sources[source]['identifiants_preferes_personne'])
if nb_identifiants_personne >= 2:
    fichier_partenaires = "df_partners_id_person_ORCID.json"
elif nb_identifiants_personne == 1:
    fichier_partenaires = "df_partners_id_person.json"
else:
    fichier_partenaires = "df_partners_id_structures.json"
df_partners = pd.read_json(f"./DATA/{source}/{fichier_partenaires}")

In [None]:
# Where the structure identifier is a list, keep its first element only.
id_est_une_liste = df_partners['id_structure'].apply(lambda valeur: isinstance(valeur, list))
df_partners.loc[id_est_une_liste, 'id_structure'] = (
    df_partners.loc[id_est_une_liste, 'id_structure'].apply(lambda valeurs: valeurs[0])
)

In [None]:
# Build the `persons` entry of each row when the source configuration names
# both a last-name and a first-name column; otherwise the column is left empty.
if all(cle in list(sources[source].keys()) for cle in ('nom', 'prenom')):
    def _preferred_person_id(row):
        """Compute the person identifier of a row with get_id."""
        return get_id(row, sources[source]['identifiants_preferes_personne'])

    def _person_entry(row):
        """Build the person description of a row with persons()."""
        return persons(row, sources[source]['prenom'], sources[source]['nom'])

    df_partners['id_person'] = df_partners.apply(_preferred_person_id, axis=1)
    df_partners['persons'] = df_partners.progress_apply(_person_entry, axis=1)
else:
    df_partners['persons'] = np.nan

100%|██████████| 3588/3588 [00:00<00:00, 24068.07it/s]


In [None]:
# List the columns available before grouping by project.
df_partners.columns

Index(['appel_a_projets', 'annee_de_selection', 'region', 'nom_etablissement',
       'finess', 'type_etablissement', 'acronyme', 'titre',
       'discipline_principale', 'nom_porteur', 'prenom_porteur',
       'financement_total', 'numero_registre_essais', 'numero_tranche',
       'nom_etablissement2', 'id_structure_matcher', 'id_structure_scanr',
       'code', 'siren', 'id_structure', 'id_personne', 'id_person', 'persons'],
      dtype='object')

In [None]:
def _collect_persons(values):
    """Return the non-missing `persons` entries of one group as a list."""
    return [y for y in values.tolist() if pd.isna(y)==False]


# `dropna` is an option of groupby(), not of agg(): the stray `dropna=False`
# formerly passed to agg() is removed (depending on the pandas version it is
# either ignored or forwarded to the aggregation function).
if source != 'SIRANO':
    # One row per project code, carrying the list of its persons.
    # NOTE(review): rows with a missing project code are dropped by groupby here; add dropna=False to groupby() if they must be kept.
    df_partners=df_partners.groupby([sources[source]['code_projet']]).agg({'persons': _collect_persons}).reset_index()
else:
    # SIRANO has no separate project file: the projects are derived from the
    # partner rows, grouped on the project fields (missing keys are kept).
    df_projets=df_partners.groupby([sources[source]['code_projet'], sources[source]['annee'], sources[source]['acronyme'],sources[source]['titre_fr'],sources[source]['budget']], dropna=False).agg({'persons': _collect_persons})

In [20]:
# Fetch the project table from the provider; the format depends on the source.
# For SIRANO no file is read: `df_projets` must already exist (it is built from the partner rows in the grouping cell).
if source=='ANR':
    # JSON payload exposing separate 'columns' and 'data' entries.
    page_projets_10 = requests.get(sources[source]['url_projects']).json()
    colonnes_projets_10 = page_projets_10['columns']
    donnees_projets_10 = page_projets_10['data']
    df_projets=pd.DataFrame(data=donnees_projets_10,columns=colonnes_projets_10)
elif source=='IRESP':
    # Two CSV files are concatenated for this source.
    df_projets1=pd.read_csv(sources[source]['url_projects1'] ,sep=";", encoding='UTF-8')
    df_projets2=pd.read_csv(sources[source]['url_projects2'] ,sep=";", encoding='UTF-8')
    df_projets=pd.concat([df_projets1,df_projets2])
elif source=='ANSES':
    df_projets=pd.read_excel(sources[source]['url_projects'])
    df_projets=df_projets.applymap(strip_outer_quotes)
elif source!='SIRANO':
    df_projets=pd.read_csv(sources[source]['url_projects'] ,sep=";", encoding='ISO-8859-1')

# For SIRANO, reset_index turns the group keys back into columns; for the other
# sources it only adds an 'index' column, which is removed right after.
df_projets=df_projets.reset_index()
if source!='SIRANO':
    del df_projets['index']

In [None]:
if source!='SIRANO':
    # Attach the persons grouped by project code to the project table.
    df_projets=pd.merge(df_projets,df_partners,on=sources[source]['code_projet'], how='left')
else :
    # Build the project id (code-year-acronym) from the project's OWN row.
    # It used to be computed from `df_partners` and aligned by row position,
    # but `df_projets` comes from a groupby and is sorted by its keys, so the
    # ids ended up attached to the wrong projects.
    df_projets['id']=df_projets.apply(lambda row: f"{row[sources[source]['code_projet']]}-{row[sources[source]['annee']]}-{row[sources[source]['acronyme']]}" , axis=1)
    # The raw project code is replaced by the composite id above.
    del df_projets[sources[source]['code_projet']]

df_projets['type']=source
# Title and abstract entries are built by projects() from the French and English columns.
df_projets['name']=df_projets.progress_apply(lambda row: projects(row,sources[source]['titre_fr'],sources[source]['titre_en']) ,axis=1)
df_projets['description']=df_projets.progress_apply(lambda row: projects(row,sources[source]['resume_fr'],sources[source]['resume_en']) ,axis=1)

100%|██████████| 3588/3588 [00:00<00:00, 24279.03it/s]
100%|██████████| 3588/3588 [00:00<00:00, 50735.15it/s]


In [22]:
# Clean the budget values, then switch to the field names expected by the
# API and keep only those fields.
colonne_budget = sources[source]['budget']
df_projets.loc[:, colonne_budget] = df_projets.loc[:, colonne_budget].apply(clean_budget)

noms_api = {
    sources[source]['annee']: 'year',
    sources[source]['acronyme']: 'acronym',
    sources[source]['budget']: 'budget_financed',
    sources[source]['code_projet']: 'id',
}
df_projets = df_projets.rename(columns=noms_api)[
    ['id','type','name','description','acronym','year','budget_financed','persons']
]

In [23]:
# Projects whose id is duplicated (expected to be empty).
df_projets[df_projets.duplicated(subset=['id'])]

Unnamed: 0,id,type,name,description,acronym,year,budget_financed,persons


In [24]:
# Number of projects to send.
len(df_projets)

3588

In [None]:
# Preview the payload that will be sent for the first project: empty and
# placeholder values ('', 'nan', 'NaN', 'None', 'x', '[]') are left out.
dict_row=df_projets.iloc[0,:].to_dict()
dict_row2={k:v for k,v in dict_row.items() if str(v) not in ('','nan','NaN','None','x','[]')}
# The budget may have been filtered out above: guard the access, as the
# sending loop does, instead of raising KeyError.
if 'budget_financed' in dict_row2:
    dict_row2['budget_financed']=clean_budget(dict_row2['budget_financed'])
dict_row2

In [None]:
# Send every project to the API. Error responses are collected in `err`;
# they are printed only when their error code is not 422.
err = []
for i, row in df_projets.iterrows():
    dict_row = row.to_dict()
    # Leave out empty and placeholder values.
    dict_row2 = {
        k: v
        for k, v in dict_row.items()
        if str(v) not in ('', 'nan', 'NaN', 'None', 'x', '[]')
    }
    if 'budget_financed' in dict_row2:
        dict_row2['budget_financed'] = clean_budget(dict_row2['budget_financed'])
    try:
        r = requests.post('http://185.161.45.213/projects/projects', json = dict_row2, headers={"Authorization":Authorization})
        res = r.json()
        if res.get('status') != 'ERR':
            continue
        err.append(res)
        if res.get('error').get('code') == 422:
            continue
        print(err)
        pp.pprint(err)
    except Exception as e:
        # Any failure on one project is displayed and the loop goes on.
        pp.pprint(e)

In [None]:
# Distinct 'id' issues reported in the collected error responses.
pd.Series([x.get('issues').get('id') for x in err]).drop_duplicates().tolist() 

pour mettre à jour

In [None]:
# Incremental update: compare the local projects with those the API already
# holds for `source`, and keep only the missing ones for sending.
# The type is taken from `source` instead of a hard-coded value that had to be
# edited by hand before each run.
_projects_url = (
    'http://185.161.45.213/projects/projects'
    f'?where={{"type":"{source}"}}'
    '&projection={"id":1}&max_results=500'
)
nbr_page=int(requests.get(f"{_projects_url}&page=1", headers={"Authorization":Authorization}).json()['hrefs']['last']['href'].split('page=')[1])

list_ids=[]
for i in range(1,nbr_page+1):
    print("page",i)
    page=requests.get(f"{_projects_url}&page={i}", headers={"Authorization":Authorization}).json()
    list_ids.extend(item['id'] for item in page['data'])

# Sets give constant-time membership tests; the original rebuilt and scanned
# plain lists for every element.
_ids_api=set(list_ids)
_ids_locaux=set(df_projets['id'])

# Present locally but not in the API, and the reverse.
projets_a_ajouter=[x for x in list(df_projets['id']) if x not in _ids_api]
projets_a_retirer=[x for x in list_ids if x not in _ids_locaux]

df_projets = df_projets[df_projets['id'].isin(projets_a_ajouter)]

In [None]:
# Number of projects missing from the API.
len(projets_a_ajouter)

In [None]:
# Number of local rows whose id is among the projects to add.
len(df_projets[df_projets['id'].apply(lambda x: x in projets_a_ajouter)])

In [None]:
# Number of local rows whose id is among the projects to remove.
len(df_projets[df_projets['id'].apply(lambda x: x in projets_a_retirer)])

In [None]:
# Preview the payload that would be sent for the first remaining project.
# NOTE(review): unlike the sending loop, this filter does not drop empty strings.
dict_row=df_projets.iloc[0,:].to_dict()
dict_row2={k:v for k,v in list(dict_row.items()) if ((str(v)!='nan')&(str(v)!='NaN')&(str(v)!='None')&(str(v)!='x')&(str(v)!='[]'))}
dict_row2

In [None]:
# Send the remaining projects to the API. Error responses are collected in
# `err`; they are printed only when their error code is not 422.
err = []
for i, row in df_projets.iterrows():
    dict_row = row.to_dict()
    # Leave out empty and placeholder values.
    dict_row2 = {
        k: v
        for k, v in dict_row.items()
        if str(v) not in ('', 'nan', 'NaN', 'None', 'x', '[]')
    }
    try:
        r = requests.post('http://185.161.45.213/projects/projects', json = dict_row2, headers={"Authorization":Authorization})
        res = r.json()
        if res.get('status') != 'ERR':
            continue
        err.append(res)
        if res.get('error').get('code') == 422:
            continue
        print(err)
        pp.pprint(err)
    except Exception as e:
        # Any failure on one project is displayed and the loop goes on.
        pp.pprint(e)

In [None]:
# Number of projects that were missing from the API.
len(projets_a_ajouter)

Modifications

In [None]:
# Number of result pages for the projects of type "ANR", read from the 'last' link of the first page.
# NOTE(review): the type is hard-coded in the query and does not follow `source`.
nbr_page=int(requests.get('http://185.161.45.213/projects/projects?where={"type":"ANR"}&projection={"id":1,"year":1}&max_results=500'+f"&page={1}", headers={"Authorization":Authorization}).json()['hrefs']['last']['href'].split('page=')[1])
nbr_page

In [None]:
# Collect the `year` of every ANR project returned by the API, page by page.
# NOTE(review): despite its name, `list_ids` holds years here, not identifiers.
list_ids=[]
for i in range(1,nbr_page+1):
    print("page",i)
    page=requests.get('http://185.161.45.213/projects/projects?where={%22type%22:%22ANR%22}&projection={%22id%22:1,%22year%22:1}&max_results=500'+f"&page={i}", headers={"Authorization":Authorization}).json()
    for k in range(len(page['data'])):
        list_ids.append(page['data'][k]['year'])

In [None]:
# Largest value collected in `list_ids`.
max(list_ids)

In [None]:
# Keep the projects whose id appears in `list_ids`.
# NOTE(review): if `list_ids` still holds the years collected just before, no id can match — confirm the intent.
df_projets = df_projets[df_projets['id'].apply(lambda x: x in list_ids)]

In [None]:
"""
en principe, il faut que je mette à jour tous les projets 
qui ne sont pas clos avec les titres résumés et persons, mais 
peut etre qu'on peut mettre à jour automatiquement les projets
des 3 dernières années ? 

"""

# Patch every project of `df_projets` in the API. The record is fetched first
# because its etag has to be sent back in the "If-Match" header.
err=[]
# Iterate over the ids themselves: iterating over `iterrows()` with a single
# loop variable put an (index, row) tuple into the URL. The loop variable also
# no longer shadows the builtin `id`.
for project_id in df_projets['id']:
    url = f"http://185.161.45.213/projects/projects/{project_id}"
    project=requests.get(url, headers={'Authorization': Authorization}).json()
    head = {"Authorization": Authorization, "If-Match": project['etag'], "Content-Type": "application/json"}

    # NOTE(review): the patched value is hard-coded to year 2023 — adjust before running.
    r = requests.patch(url, json = {"year": 2023}, headers=head)
    res= r.json()
    if res.get('status')=='ERR':
        err.append(res)
        if res.get('error').get('code')!=422:
            print(err)
            pp.pprint(err)


# ENVOI DES PARTENAIRES SUR SCANR

In [None]:
# Start the partner upload from the partners saved with their structure identifiers.
df_partners=pd.read_json(f"./DATA/{source}/df_partners_id_structures.json")

In [None]:
# List the available columns.
df_partners.columns

Index(['appel_a_projets', 'annee_de_selection', 'region', 'nom_etablissement',
       'finess', 'type_etablissement', 'acronyme', 'titre',
       'discipline_principale', 'nom_porteur', 'prenom_porteur',
       'financement_total', 'numero_registre_essais', 'numero_tranche',
       'nom_etablissement2', 'id_structure_matcher', 'id_structure_scanr',
       'code', 'siren', 'id_structure'],
      dtype='object')

In [None]:
### WARNING: check that the SIRANO projects belong to French structures
if source=='IRESP':
    # The city value may carry the country in parentheses: text after "(" becomes the country ('France' when there is none) and the city keeps the text before "(".
    # NOTE(review): x.find / x.split assume every city value is a string — a missing value would raise here.
    df_partners[sources[source]['pays']]=df_partners.loc[:,sources[source]['ville']].apply(lambda x: x.split('(')[1].replace(')','') if x.find('(')>=0 else 'France')
    df_partners.loc[:,sources[source]['ville']]=df_partners.loc[:,sources[source]['ville']].apply(lambda x: x.split('(')[0] if x.find('(')>=0 else x)
  
# Build the address entry of each row with address().
df_partners['address']=df_partners.apply(lambda row: address(row,sources[source]['pays'],sources[source]['ville'],source), axis=1)
# Where the structure identifier is a list, keep its first element.
df_partners.loc[:,'id_structure']=df_partners.loc[:,'id_structure'].apply(lambda x: x[0] if isinstance(x,list) else x )

In [None]:
# Build the participation id for the sources handled here, by concatenating fields of the row.
if source in ['ANSES','SIRANO']:
    # id = project code - normalised structure name - last name - first name - acronym - year
    df_partners['id']=df_partners.apply(lambda row: f"{row[sources[source]['code_projet']]}-{row[str(sources[source]['nom_structure'])+'2']}-{row[sources[source]['nom']]}-{row[sources[source]['prenom']]}-{row[sources[source]['acronyme']]}-{row[sources[source]['annee']]}" , axis=1)
if source =='REG_IDF':
    # id = project code - normalised structure name - partner role
    df_partners['id']=df_partners.apply(lambda row: f"{row[sources[source]['code_projet']]}-{row[str(sources[source]['nom_structure'])+'2']}-{row['entite_role']}" , axis=1)
# NOTE(review): the address column is recomputed here although the previous cell already builds it.
df_partners['address']=df_partners.apply(lambda row: address(row,sources[source]['pays'],sources[source]['ville'],source), axis=1)
# Switch to the field names used in the payload sent to the API.
# NOTE(review): `project_id` receives the raw project code column — confirm it matches the ids under which the projects were sent.
df_partners=df_partners.rename(columns={sources[source]['nom_structure']: 'name', sources[source]['code_projet']: 'project_id', 'id_structure':'participant_id',sources[source]['id_partenaire']:'id'})
df_partners=df_partners[['name','id','project_id','participant_id','address']]
df_partners['project_type']=source
# Keep a single identifier as text: first element of a list, or the part before ';', with any '.0' removed.
df_partners['participant_id']=df_partners.loc[:,'participant_id'].apply(lambda x: str(x[0]).replace('.0','') if isinstance(x,list) else str(x).split(';')[0].replace('.0',''))
df_partners=df_partners[['id','project_id', 'project_type', 'participant_id', 'name','address']]
df_partners['name'] = df_partners['name'].astype(str)
df_partners

Unnamed: 0,id,project_id,project_type,participant_id,name,address
0,PHRIP-ghuparipyneurocience-MORVILLERS-Jean-Man...,PHRIP,SIRANO,200082105,GHU PARIS PSY ET NEUROSCIENCES,{'country': 'France'}
1,PHRIP-chutour-CHEYROUX-Pierre-DETERQVT-2013.0,PHRIP,SIRANO,263700189,CHU DE TOURS,{'country': 'France'}
2,PHRIP-chranger-LE ROY-Cyril-CHIC-2013.0,PHRIP,SIRANO,264900036,CHR ANGERS,{'country': 'France'}
3,PHRIP-aitancepubliquehopitauxpari-ROCH-Stéphan...,PHRIP,SIRANO,26750045200011,ASSISTANCE PUBLIQUE-HOPITAUX DE PARIS,{'country': 'France'}
4,PHRIP-chrunancy-LESNY-Martine-PREVENIR-2013.0,PHRIP,SIRANO,WVf8h,CHRU DE NANCY,{'country': 'France'}
...,...,...,...,...,...,...
3583,PHRCN-aitancepubliquehopitauxpari-VAN BELLE-Er...,PHRCN,SIRANO,26750045200011,ASSISTANCE PUBLIQUE-HOPITAUX DE PARIS,{'country': 'France'}
3584,PHRCN-hopitauxuniveritairrabourgbourg-SEBBAG-E...,PHRCN,SIRANO,26670057400012,HOPITAUX UNIVERSITAIRES DE STRASBOURG,{'country': 'France'}
3585,PHRCN-aitancepubliquehopitauxpari-BÉHIN-Anthon...,PHRCN,SIRANO,26750045200011,ASSISTANCE PUBLIQUE-HOPITAUX DE PARIS,{'country': 'France'}
3586,PHRCN-aitancepubliquehopitauxpari-RUNYO-Floren...,PHRCN,SIRANO,26750045200011,ASSISTANCE PUBLIQUE-HOPITAUX DE PARIS,{'country': 'France'}


In [None]:
# Participations whose id is duplicated (expected to be empty).
df_partners[df_partners.duplicated(subset=['id'])]

Unnamed: 0,id,project_id,project_type,participant_id,name,address


In [None]:
# Preview the payload that would be sent for the second participation (row 1), without placeholder values.
dict_row=df_partners.iloc[1,:].to_dict()
dict_row2={k:v for k,v in list(dict_row.items()) if ((str(v)!='nan')&(str(v)!='NaN')&(str(v)!='None')&(str(v)!='x'))}
dict_row2

{'id': 'PHRIP-chutour-CHEYROUX-Pierre-DETERQVT-2013.0',
 'project_id': 'PHRIP',
 'project_type': 'SIRANO',
 'participant_id': '263700189',
 'name': 'CHU DE TOURS',
 'address': {'country': 'France'}}

In [None]:
# Number of participations to send.
len(df_partners)

3588

In [None]:
# Send every participation to the API. For an error response the row index is
# printed and the response is collected in `err`; the collected errors are
# printed only when the error code is not 422.
err = []
for i, row in df_partners.iterrows():
    dict_row = row.to_dict()
    # Leave out empty and placeholder values.
    dict_row2 = {
        k: v
        for k, v in dict_row.items()
        if str(v) not in ('', 'nan', 'NaN', 'None', 'x')
    }
    try:
        r = requests.post('http://185.161.45.213/projects/participations', json = dict_row2, headers={"Authorization":Authorization})
        res = r.json()
        if res.get('status') != 'ERR':
            continue
        print(i)
        err.append(res)
        if res.get('error').get('code') == 422:
            continue
        print(err)
        pp.pprint(err)
    except Exception as e:
        # Any failure on one participation is displayed and the loop goes on.
        pp.pprint(e)

In [14]:
# Distinct 'id' issues reported in the collected error responses.
pd.Series([x.get('issues').get('id') for x in err]).drop_duplicates().tolist() 

[]

Mise à jour

In [None]:
# Incremental update: compare the local participations with those the API
# already holds for `source`, and keep only the missing ones for sending.
# The project type follows `source` instead of the hard-coded "ANR".
_participations_url = (
    'http://185.161.45.213/projects/participations'
    f'?where={{"project_type":"{source}"}}'
    '&projection={"id":1}&max_results=500'
)
nbr_page=int(requests.get(f"{_participations_url}&page=1", headers={"Authorization":Authorization}).json()['hrefs']['last']['href'].split('page=')[1])

list_ids=[]
for i in range(1,nbr_page+1):
    print("page",i)
    page=requests.get(f"{_participations_url}&page={i}", headers={"Authorization":Authorization}).json()
    list_ids.extend(item['id'] for item in page['data'])

# Sets give constant-time membership tests; the original rebuilt a
# de-duplicated list for every element it tested.
_ids_api=set(list_ids)
_ids_locaux=set(df_partners['id'])

# Present locally but not in the API (each id once), and the reverse.
projets_a_ajouter=[x for x in list(df_partners['id'].drop_duplicates()) if x not in _ids_api]
projets_a_retirer=[x for x in list_ids if x not in _ids_locaux]

df_partners = df_partners[df_partners['id'].isin(projets_a_ajouter)]


In [None]:
# Number of participations missing from the API.
len(projets_a_ajouter)

In [None]:
# Number of participations held by the API but absent from the local table.
len(projets_a_retirer)

In [None]:
# Number of participations left to send.
len(df_partners)

In [None]:
# Send the remaining participations to the API. For an error response the row
# index is printed and the response is collected in `err`; the collected
# errors are printed only when the error code is not 422.
err = []
for i, row in df_partners.iterrows():
    dict_row = row.to_dict()
    # Leave out empty and placeholder values.
    dict_row2 = {
        k: v
        for k, v in dict_row.items()
        if str(v) not in ('', 'nan', 'NaN', 'None', 'x')
    }
    try:
        r = requests.post('http://185.161.45.213/projects/participations', json = dict_row2, headers={"Authorization":Authorization})
        res = r.json()
        if res.get('status') != 'ERR':
            continue
        print(i)
        err.append(res)
        if res.get('error').get('code') == 422:
            continue
        print(err)
        pp.pprint(err)
    except Exception as e:
        # Any failure on one participation is displayed and the loop goes on.
        pp.pprint(e)

Modifications

In [None]:
# List the ANR participations stored in the API without a `participant_id`, page by page.
# NOTE(review): the project type "ANR" is hard-coded in both queries and does not follow `source`.
nbr_page=int(requests.get('http://185.161.45.213/projects/participations?where={"project_type":"ANR","participant_id":{"$exists":false}}&projection={"id":1}&max_results=500'+f"&page={1}", headers={"Authorization":Authorization}).json()['hrefs']['last']['href'].split('page=')[1])

list_ids=[]
for i in range(1,nbr_page+1):
    print("page",i)
    page=requests.get('http://185.161.45.213/projects/participations?where={"project_type":"ANR","participant_id":{"$exists":false}}&projection={"id":1}&max_results=500'+f"&page={i}", headers={"Authorization":Authorization}).json()
    for k in range(len(page['data'])):
        print("k",k)
        list_ids.append(page['data'][k]['id'])

# Keep the local participations whose id is in that list.
df_partners = df_partners[df_partners['id'].apply(lambda x: x in list_ids)]

In [None]:
# Number of participation ids returned by the API.
len(list_ids)

In [None]:
# Discard the rows whose participant id is one of the placeholder strings 'x' or 'None'.
df_partners=df_partners[(df_partners.participant_id!='x') & (df_partners.participant_id!='None')]

In [None]:
# Display the participations that will be patched.
df_partners

In [None]:
# Patch the `participant_id` of every listed participation. The record is
# fetched first because its etag has to be sent back in the "If-Match" header.
err=[]
for i,row in df_partners.iterrows() :
    print(i)
    # Named `participation_id` so that the builtin `id` is no longer shadowed.
    participation_id=row['id']
    url = f"http://185.161.45.213/projects/participations/{participation_id}"
    # Like the sending loops, a failure on one record (network error, missing
    # etag, ...) is displayed and the loop goes on instead of aborting.
    try:
        project=requests.get(url, headers={'Authorization': Authorization}).json()
        head = {"Authorization": Authorization, "If-Match": project['etag'], "Content-Type": "application/json"}

        r = requests.patch(url, json = {"participant_id": row['participant_id']}, headers=head)
        res= r.json()
        if res.get('status')=='ERR':
            err.append(res)
            if res.get('error').get('code')!=422:
                print(err)
                pp.pprint(err)
    except Exception as e:
        pp.pprint(e)

In [None]:
# Distinct 'id' issues reported in the collected errors, with their first 25 characters cut off.
pd.Series([x.get('issues').get('id')[25:] for x in err]).drop_duplicates().tolist()

Faire un nouveau fichier avec toutes les structures manquantes

In [None]:
# Load the workbook of structures that could not be identified.
struct_manq=pd.read_excel('scanr_partenaires_non_identifies.xlsx')

In [None]:
# Distinct names ('Nom') of the rows whose 'New' column equals 'IRESP'.
nom1=struct_manq[struct_manq.New=='IRESP'].drop_duplicates(subset='Nom').Nom

In [None]:
# Load the current list of unidentified IRESP partners.
iresp_actuel=pd.read_excel('./missing_ids_structures/partenaires_non_identifies_IRESP.xlsx')

In [None]:
# Names from the 'Nom_equipe' column of the current IRESP list.
nom2=iresp_actuel['Nom_equipe']

In [None]:
# Both name lists put end to end (duplicates included).
pd.Series(list(nom1)+list(nom2))

In [None]:
# Both name lists put end to end, without duplicates.
pd.Series(list(nom1)+list(nom2)).drop_duplicates()