In [None]:
import requests
import json
import pandas as pd


api_url = "https://www.agefiph.fr/jsonapi"


def _fetch_jsonapi_collection(url, timeout=30):
    """Download a JSON:API collection, following pagination links.

    Parameters:
        url: address of the first page of the collection.
        timeout: seconds to wait for each HTTP response.

    Returns:
        A ``(first_document, records)`` tuple: the decoded JSON document of
        the first page, and the concatenated ``"data"`` entries of every page.

    Raises:
        requests.HTTPError: if any page answers with an error status.
    """
    first_document = None
    records = []
    while url:
        response = requests.get(url, timeout=timeout)
        # Fail loudly instead of trying to parse an error page as JSON.
        response.raise_for_status()
        document = response.json()
        if first_document is None:
            first_document = document
        records.extend(document["data"])
        # A JSON:API server may split a collection across pages and advertise
        # the following page under links.next; the link is either a plain URL
        # string or an object carrying an "href".
        next_link = (document.get("links") or {}).get("next")
        url = next_link.get("href") if isinstance(next_link, dict) else next_link
    return first_document, records


# services_json / thematiques_json keep the first page's document, as before;
# services / thematiques now hold the records of all pages.
services_json, services = _fetch_jsonapi_collection(api_url + "/node/aide_service")

thematiques_json, thematiques = _fetch_jsonapi_collection(api_url + "/taxonomy_term/thematique")

In [None]:
import trafilatura
from typing import Optional

def html_to_markdown(s: Optional[str]) -> Optional[str]:
    """Run an HTML fragment through trafilatura's text extraction.

    ``None`` and the empty string are returned unchanged. Any other value is
    wrapped in an ``<html>`` element, parsed with ``trafilatura.load_html``
    and the result of ``trafilatura.extract`` is returned.
    """
    nothing_to_convert = s is None or s == ""
    if nothing_to_convert:
        return s

    # The fragment is wrapped in a root element before being parsed.
    wrapped_fragment = "<html>" + s + "</html>"
    parsed_tree = trafilatura.load_html(wrapped_fragment)
    return trafilatura.extract(parsed_tree)

# pd.json_normalize(services, sep=".") was tried first, but it turned the
# values of field_essentiel_* into NaN, so each record is flattened by hand.


def _presentation_html(attrs):
    """Assemble the raw (not yet converted) description of one service.

    The "personne handicapée" and "employeur" fragments are concatenated in
    that order, each only when its attribute is truthy. When
    ``field_texte_brut_long`` is truthy it is placed in front of them.
    """
    audience_html = ""

    ph_block = attrs['field_essentiel_ph']
    if ph_block:
        audience_html = f"""<p>Pour la personne handicapée :</p> {ph_block['processed']}"""

    employer_block = attrs['field_essentiel_employeur']
    if employer_block:
        audience_html += f"""<p>Pour l'employeur :</p> {employer_block['processed']}"""

    long_text = attrs["field_texte_brut_long"]
    if not long_text:
        return audience_html
    # The separator is a newline followed by nine spaces.
    return long_text + "\n" + " " * 9 + audience_html


services_normalized = []
for service in services:
    attrs = service["attributes"]

    record = {
        "id": service["id"],
        "date_creation": attrs["created"],
        "date_maj": attrs["changed"],
        "nom": attrs["title"],
        "presentation_resume": attrs["field_titre_card_employeur"],
        # Placeholder: keeps this key ahead of "contact_public" in the record.
        "presentation_detail": "",
        # Hard-coded field, identical for every service.
        "contact_public": True,
    }

    # Descriptions
    record["presentation_detail"] = html_to_markdown(_presentation_html(attrs))

    # Thematiques (ids only)
    thematique_ids = []
    record["id_thematiques"] = thematique_ids
    try:
        for reference in service["relationships"]["field_thematique"]["data"]:
            thematique_ids.append(reference["id"])
    except KeyError:
        print(f"{record['nom']} : pas de thématique")

    services_normalized.append(record)

In [None]:
# Starts out empty; nothing in the visible cells fills it.
thematiques_normalized = dict()

# Slug shared by several entries of the mapping below.
_RETOUR_ET_MAINTIEN_EMPLOI = "handicap--favoriser-le-retour-et-le-maintien-dans-lemploi"

# Thematique id (as found in a service's "id_thematiques") -> list of target
# thematique slugs. An empty list means the id has no target.
# Source https://grist.incubateur.net/o/datainclusion/uVsB8pabQGoe/Thmatiques/p/13
mapping_thematiques = {
    "4e08047f-b0ed-431a-9182-61e8e61b1486": [_RETOUR_ET_MAINTIEN_EMPLOI],
    "11618ce3-e59b-404f-8eb2-5763215464f2": [_RETOUR_ET_MAINTIEN_EMPLOI],
    "60c25ci7-61sc-89a9-ny54-126hslf808a2": [
        "handicap--connaissance-des-droits-des-travailleurs",
    ],
    "51be0003-13d8-4ffa-9923-248e7aa4a227": [],
    "ddf0fa87-2ee0-481c-a258-96985b7826c3": [],
    "cb2c9fec-c190-4e2f-aeee-6da818109bf8": [_RETOUR_ET_MAINTIEN_EMPLOI],
    "78b28acb-803e-4b06-ab77-58dabfbd8571": [
        "handicap--adaptation-au-poste-de-travail",
    ],
    "366eb399-1e6c-4609-8066-d1504fae2a8e": [],
    "907a8c33-5c56-49d3-bd64-a736a9ceac76": [],
    "5d8c88d8-db03-4f27-b517-d7016896b01a": [],
    "fb5e6180-290b-4216-ba68-624d25defa3a": [_RETOUR_ET_MAINTIEN_EMPLOI],
    "03228d62-2a59-49d8-8443-b25cb2e684b9": [
        "accompagnement-social-et-professionnel-personnalise--definition-du-projet-professionnel",
    ],
    "f9ab3e06-af51-463a-aaf7-7b04a28e047f": [
        "se-former--trouver-sa-formation",
    ],
    "aeab1d68-4e89-4e2a-a612-d8645e3999d8": [
        "creation-activite--definir-son-projet-de-creation-dentreprise",
    ],
    "f4551558-8315-4708-8357-5ecc89751bc6": [
        "handicap--faire-reconnaitre-un-handicap",
    ],
    "4b8b0473-52c2-4a21-956d-d7d68a7053b5": [],
}

In [None]:
import copy

# Attach to every normalized service the thematique slugs obtained by
# translating its "id_thematiques" through mapping_thematiques.
services_normalized_w_thematiques = []
for service in services_normalized:
    # A dict is used as an insertion-ordered set: duplicates collapse and the
    # resulting list has the same order on every run (a set of strings does
    # not guarantee that).
    slugs = {}
    for id_thematique in service["id_thematiques"]:
        if id_thematique not in mapping_thematiques:
            # An id absent from the mapping is reported and skipped rather
            # than aborting the whole cell with a KeyError.
            print(f"{service['nom']} : thématique inconnue ({id_thematique})")
            continue
        slugs.update(dict.fromkeys(mapping_thematiques[id_thematique]))
    # Build a new record instead of modifying the dicts of services_normalized,
    # so re-running this cell never depends on earlier runs.
    services_normalized_w_thematiques.append({**service, "thematiques": list(slugs)})

df = pd.DataFrame(services_normalized_w_thematiques)
print(df)

In [None]:
import requests

# Adding structures and contact data
# Source: https://grist.incubateur.net/o/datainclusion/bWqnEafQaLgc/Partage-de-donnes-AGEFIPH-Mars-Avril-2023/p/4
df_structures = pd.read_csv(
    "https://grist.incubateur.net/o/datainclusion/api/docs/bWqnEafQaLgcTvFv7rv6hF/download/csv?viewSection=7&tableId=Structures&activeSortSpec=%5B%5D&filters=%5B%5D",
    index_col=None,
    dtype=str,
)

# Adding zone diffusion
url_to_call = "https://geo.api.gouv.fr/communes?code="
df_structures["zone_diffusion_type"] = "region"

## Calling geo.api.gouv.fr for code INSEE => code région matching.
## Each distinct code is looked up once; rows without a code are skipped.
mapping_zone_diffusion = {}
for code_insee in df_structures["code_insee"].dropna().unique():
    response = requests.get(url_to_call + code_insee, timeout=30)
    # Surface HTTP errors instead of failing later on an unexpected payload.
    response.raise_for_status()
    communes = response.json()
    if not communes:
        # No commune returned for this code: report it and leave the region
        # code of the matching rows empty.
        print(f"{code_insee} : code INSEE inconnu de geo.api.gouv.fr")
        continue
    mapping_zone_diffusion[code_insee] = communes[0]["codeRegion"]

## Writing the result in the structures dataframe
df_structures["zone_diffusion_code"] = df_structures["code_insee"].map(mapping_zone_diffusion)

print(mapping_zone_diffusion)




In [None]:
# Cross/full join of structures and services data: every structure row is
# paired with every service row.
df_structures = df_structures.rename(columns={"id": "structure_id"})

structure_columns = [
    "structure_id",
    "courriel",
    "telephone",
    "adresse",
    "commune",
    "code_postal",
    "code_insee",
    "zone_diffusion_type",
    "zone_diffusion_code",
]
df_services_merged = df_structures[structure_columns].join(how="cross", other=df)

# Making the service id unique: after the cross join the same service id
# appears once per structure, so it is prefixed with the structure id.
# astype(str) formats each value the way an f-string would, and the whole
# column is computed in one vectorised expression instead of row by row.
df_services_merged["id"] = (
    df_services_merged["structure_id"].astype(str)
    + "_"
    + df_services_merged["id"].astype(str)
)
    

In [None]:
# Export the merged services without the index column, then show the frame
# as the cell's output.
df_services_merged.to_csv("services.csv", index=False)
df_services_merged