Le but de ce notebook est de générer des structures et des services pour l'offre de services de l'Agefiph.

Ce notebook prend 2 sources de données :

* une liste de structures issue d'un tableau Grist maintenu par data.inclusion,
* une liste de services issue de l'API de l'Agefiph.


In [None]:
import requests
import numpy as np
import pandas as pd
import trafilatura
import hashlib
from uuid import UUID


def html_to_markdown(s: str):
    """Run an HTML fragment through trafilatura's text extraction.

    ``None`` and the empty string are handed back untouched. Any other value
    is wrapped in ``<html>`` tags, parsed with ``trafilatura.load_html`` and
    passed to ``trafilatura.extract``, whose result is returned as is.
    """
    if s is not None and s != "":
        wrapped_fragment = "<html>" + s + "</html>"
        parsed_tree = trafilatura.load_html(wrapped_fragment)
        return trafilatura.extract(parsed_tree)
    return s


# Page of the Grist document that holds the structures table:
# https://grist.incubateur.net/o/datainclusion/bWqnEafQaLgc/Partage-de-donnes-AGEFIPH-Mars-Avril-2023/p/4
# CSV download endpoint for the "Structures" table of that Grist document.
STRUCTURES_TABLE_URL = "https://grist.incubateur.net/o/datainclusion/api/docs/bWqnEafQaLgcTvFv7rv6hF/download/csv?viewSection=7&tableId=Structures&activeSortSpec=%5B%5D&filters=%5B%5D"
# AGEFIPH endpoint listing the "aide_service" nodes; the notebook reads the
# "data" key of its JSON response.
SERVICES_API_URL = "https://www.agefiph.fr/jsonapi/node/aide_service"


In [None]:
# Load the structures table; NaN and empty strings are normalised to None.
raw_structures_df = pd.read_csv(STRUCTURES_TABLE_URL, dtype=str).replace(
    [np.nan, ""], None
)

# Bound the wait with a timeout and fail fast on an HTTP error, instead of
# failing later on a missing "data" key or an undecodable body.
# NOTE(review): only this single response is read; confirm the API does not
# paginate its results.
services_response = requests.get(SERVICES_API_URL, timeout=60)
services_response.raise_for_status()

# Flatten the nested records into dotted column names, then apply the same
# NaN / empty-string normalisation as for the structures.
raw_services_df = pd.json_normalize(services_response.json()["data"]).replace(
    [np.nan, ""], None
)


In [None]:
# Print the column names, dtypes and non-null counts of the structures table.
# (The previous no-op self-assignment of raw_structures_df was dropped.)
raw_structures_df.info()


In [None]:
# Columns of the flattened API payload that the rest of the notebook reads.
kept_service_columns = [
    "id",
    "attributes.created",
    "attributes.changed",
    "attributes.title",
    "attributes.field_titre_card_employeur",
    "attributes.field_essentiel_ph.processed",
    "attributes.field_essentiel_employeur.processed",
    "attributes.field_texte_brut_long",
    "relationships.field_thematique.data",
]

# Restrict the raw services to those columns and print a summary.
raw_services_df = raw_services_df[kept_service_columns]

raw_services_df.info()


In [None]:
# Exploration only: one row per (service, thematique) pair, summarised with
# info(). The result is not stored.
service_records = raw_services_df.rename(columns={"id": "service_id"}).to_dict(
    orient="records"
)
pd.json_normalize(
    service_records,
    record_path="relationships.field_thematique.data",
    meta="service_id",
).info()


In [None]:
# Lookup from an AGEFIPH thematique id (the "id" of each entry found under
# "relationships.field_thematique.data") to a data.inclusion thematique.
# A value of None means no data.inclusion thematique is assigned; map_service
# drops those entries.
DI_THEMATIQUES_BY_AGEFIPH_THEMATIQUE_ID = {
    # Source https://grist.incubateur.net/o/datainclusion/uVsB8pabQGoe/Thmatiques/p/13
    "4e08047f-b0ed-431a-9182-61e8e61b1486": "handicap--favoriser-le-retour-et-le-maintien-dans-lemploi",
    "11618ce3-e59b-404f-8eb2-5763215464f2": "handicap--favoriser-le-retour-et-le-maintien-dans-lemploi",
    # NOTE(review): unlike every other key, this one contains non-hexadecimal
    # characters, so it is not a well-formed UUID; verify against the source.
    "60c25ci7-61sc-89a9-ny54-126hslf808a2": "handicap--connaissance-des-droits-des-travailleurs",
    "51be0003-13d8-4ffa-9923-248e7aa4a227": None,
    "ddf0fa87-2ee0-481c-a258-96985b7826c3": None,
    "cb2c9fec-c190-4e2f-aeee-6da818109bf8": "handicap--favoriser-le-retour-et-le-maintien-dans-lemploi",
    "78b28acb-803e-4b06-ab77-58dabfbd8571": "handicap--adaptation-au-poste-de-travail",
    "366eb399-1e6c-4609-8066-d1504fae2a8e": None,
    "907a8c33-5c56-49d3-bd64-a736a9ceac76": None,
    "5d8c88d8-db03-4f27-b517-d7016896b01a": None,
    "fb5e6180-290b-4216-ba68-624d25defa3a": "handicap--favoriser-le-retour-et-le-maintien-dans-lemploi",
    "03228d62-2a59-49d8-8443-b25cb2e684b9": "accompagnement-social-et-professionnel-personnalise--definition-du-projet-professionnel",
    "f9ab3e06-af51-463a-aaf7-7b04a28e047f": "se-former--trouver-sa-formation",
    "aeab1d68-4e89-4e2a-a612-d8645e3999d8": "creation-activite--definir-son-projet-de-creation-dentreprise",
    "f4551558-8315-4708-8357-5ecc89751bc6": "handicap--faire-reconnaitre-un-handicap",
    "4b8b0473-52c2-4a21-956d-d7d68a7053b5": None,
}


def map_service(row) -> dict:
    """Build a service template from one raw AGEFIPH service record.

    Args:
        row: a mapping-like record (for instance one row of
            ``raw_services_df``) exposing the flattened ``id``,
            ``attributes.*`` and ``relationships.field_thematique.data`` keys.

    Returns:
        A dict with the keys ``id``, ``date_creation``, ``date_maj``, ``nom``,
        ``contact_public``, ``presentation_resume``, ``presentation_detail``
        and ``thematiques`` (a sorted list of unique thematiques).

    Raises:
        KeyError: if a thematique id of the record is absent from
            ``DI_THEMATIQUES_BY_AGEFIPH_THEMATIQUE_ID``.
    """
    service = {}
    service["id"] = row["id"]
    service["date_creation"] = row["attributes.created"]
    service["date_maj"] = row["attributes.changed"]
    service["nom"] = row["attributes.title"]
    service["contact_public"] = True
    service["presentation_resume"] = row["attributes.field_titre_card_employeur"]

    # Assemble the HTML body: free text first, then the section aimed at the
    # disabled person, then the section aimed at the employer.
    html_parts = []
    if row["attributes.field_texte_brut_long"] is not None:
        html_parts.append(row["attributes.field_texte_brut_long"])
    if row["attributes.field_essentiel_ph.processed"] is not None:
        html_parts.append(
            "<p>Pour la personne handicapée :</p>"
            + row["attributes.field_essentiel_ph.processed"]
        )
    if row["attributes.field_essentiel_employeur.processed"] is not None:
        html_parts.append(
            "<p>Pour l'employeur :</p>"
            + row["attributes.field_essentiel_employeur.processed"]
        )
    # An empty or missing conversion result is stored as None, not "".
    service["presentation_detail"] = html_to_markdown("".join(html_parts)) or None

    # A record without thematique data is treated as having none.
    agefiph_thematiques = row["relationships.field_thematique.data"]
    if agefiph_thematiques is None:
        agefiph_thematiques = []

    # Unknown ids deliberately raise KeyError so the mapping gets updated.
    di_thematiques = {
        DI_THEMATIQUES_BY_AGEFIPH_THEMATIQUE_ID[agefiph_thematique_data["id"]]
        for agefiph_thematique_data in agefiph_thematiques
    }
    # Ids mapped to None have no data.inclusion thematique.
    di_thematiques.discard(None)
    # Sorted so the generated output is stable from one run to the next.
    service["thematiques"] = sorted(di_thematiques)

    return service


In [None]:
# Apply map_service to every raw service row; each key of the returned dict
# is expanded into its own column.
template_services_df = raw_services_df.apply(
    func=map_service,
    axis="columns",
    result_type="expand",
)

template_services_df.info()


In [None]:
def _service_id_for_structure(row) -> str:
    """Derive a UUID-formatted id from the MD5 of structure id + template id."""
    seed = row["structure_id"] + row["id"]
    digest = hashlib.md5(seed.encode()).hexdigest()
    return str(UUID(hex=digest))


# Structure columns carried over onto every generated service.
structure_columns = [
    "id",
    "courriel",
    "telephone",
    "adresse",
    "commune",
    "code_postal",
    "code_insee",
]
structures_subset_df = raw_structures_df[structure_columns].rename(
    columns={"id": "structure_id"}
)

# Cartesian product: every structure is paired with every service template.
services_df = structures_subset_df.join(template_services_df, how="cross")

# Make the service id unique across all structures.
services_df = services_df.assign(
    id=services_df.apply(_service_id_for_structure, axis=1)
)

services_df.info()


In [None]:
# Write both tables as JSON arrays of records, keeping non-ASCII characters.
for _frame, _path in (
    (services_df, "services.json"),
    (raw_structures_df, "structures.json"),
):
    _frame.to_json(_path, orient="records", force_ascii=False)


In [None]:
# Bare expression: shows the generated services table as the cell output when
# run in a notebook.
services_df
