In [None]:
import requests
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


In [None]:
pd.options.display.max_columns = None
plt.rc("figure", figsize=[12, 4])


# Ancien fichier

## Analyse

👉 RÉSUMÉ EN BAS DE PAGE

Documentation de l'API : https://api.gouv.fr/les-api/api_etablissements_publics

### Importation des données brutes

Liste des structures à importer

In [None]:
# Pivot codes of the structures to import. Each code is appended to the API
# base URL in the download loop below, one request per code.
code_pivot = [
    "agefiph",
    "agence_insertion",
    "adil",
    "ars",
    "aav",
    "afpa",
    "apec",
    "aract",
    "apecita",
    "bav",
    "caf",
    "cicas",
    "cio",
    "cidf",
    "pmi",
    "dr_femmes",
    "ars_antenne",
    "direccte",
    "direccte_ut",
    "drihl",
    "drihl_ut",
    "dd_femmes",
    "msap",
    "greta",
    "cij",
    "epci",
    "mairie",
    "mairie_com",
    "maison_emploi",
    "mjd",
    "maison_handicapees",
    "mission_locale",
    "msa",
    "permanence_juridique",
    "plateforme_naturalisation",
    "pcb",
    "pif",
    "prefecture",
    "droit_travail",
]


Appel à l'API pour l'ensemble des structures de la liste

In [None]:
url = "https://etablissements-publics.api.gouv.fr/v3/organismes/"

# Fetch every pivot code, then concatenate once. The original loop read
# `raw_df`, which is not defined at this point of the notebook (NameError on a
# fresh kernel), and re-built the whole frame with one concat per iteration.
code_dfs = []
for code in code_pivot:
    response = requests.get(url + code)
    response.raise_for_status()  # fail loudly instead of parsing an error payload
    data = response.json()
    # One row per entry of "features", flattened into the `properties.*`
    # columns used by the cells below.
    # NOTE(review): the original passed record_path=["features", []]; the empty
    # inner path is not a documented form — confirm the result is unchanged.
    code_dfs.append(pd.json_normalize(data, record_path="features"))

df = pd.concat(code_dfs) if code_dfs else pd.DataFrame()


In [None]:
df.info()


## Adresses

type d'adresse: géopostale vs postale vs physique

cf https://routagedoc.sudeducation.org/IMG/pdf/adressage.pdf



In [None]:
# One row per address, keeping the id of the structure it belongs to.
# The source frame is `df`, built from the API in the cells above; the original
# read `raw_df`, which is only assigned much further down in the notebook.
adresses_df = pd.json_normalize(
    df.to_dict(orient="records"),
    record_path="properties.adresses",
    meta="properties.id",
)


In [None]:
adresses_df.type.value_counts()


In [None]:
adresses_df["properties.id"].nunique()


In [None]:
# Number of distinct structures having a "géopostale" or "physique" address.
adresses_df.loc[
    adresses_df.type.isin(["géopostale", "physique"]), "properties.id"
].nunique()


In [None]:
# Number of address rows whose type is "géopostale" or "physique".
len(adresses_df[adresses_df.type.isin(["géopostale", "physique"])])


conclusion : la quasi-totalité des structures ont soit une adresse géopostale, soit une adresse physique, de manière mutuellement exclusive

choix : ordre de prio géopostale/physique > postale

In [None]:
# Keep a single address per structure. Sorting by type puts "géopostale" and
# "physique" before "postale" (alphabetical order), so keep="first" applies the
# priority stated above.
adresses_df = adresses_df.sort_values("type").drop_duplicates(
    "properties.id", keep="first"
)


In [None]:
adresses_df.shape[0]


In [None]:
def compute_field_occupancy_rates(df):
    """Return the percentage of non-null values per column, highest first.

    Args:
        df: DataFrame whose columns are inspected.

    Returns:
        A Series indexed by column name holding the fill rate in percent,
        sorted in descending order.
    """
    missing_share = df.isnull().sum() / df.shape[0]
    filled_pct = (1 - missing_share) * 100
    return filled_pct.sort_values(ascending=False)


### Aperçu

In [None]:
df.sample(2)


### Taux de remplissage de la structure

In [None]:
# `compute_field_occupancy_rates` is already defined earlier in this notebook;
# the identical re-definition that used to sit in this cell was dropped.
compute_field_occupancy_rates(df).to_frame()


### Identifiant local ?

Le champ `properties.id`

In [None]:
df["properties.id"].nunique()


### Code INSEE ?

Le champ `properties.codeInsee` est présent et complet.

### Dates ?

Aucune information n'est disponible concernant les dates.

### SIRET

Aucune information sur les SIRET.

### Nettoyage

In [None]:
# Missing-data handling: empty strings and NaN are both replaced by None.
df = df.replace(to_replace=["", np.nan], value=None)


### Analyse des secteurs

In [None]:
# Switch the default figure size to 12x8 (applies from this cell onwards).
plt.rcParams["figure.figsize"] = [12, 8]

# One bar per pivot code, most frequent first; missing values are labelled "Inconnu".
sns.countplot(
    y="properties.pivotLocal",
    data=df.fillna("Inconnu"),
    order=df["properties.pivotLocal"].fillna("Inconnu").value_counts().index,
)


### Analyse des typologies

Analyse approfondie à prévoir pour matcher les autres structures du pivot avec notre schéma.

In [None]:
# NOTE(review): in the original cell the lambda had no body (SyntaxError), so
# the intended category rules are unknown. As a runnable placeholder each
# structure is flagged with its own lower-cased pivot code.
# TODO: replace the lambda with the real pivot -> category mapping.
categories_flags_places_df = (
    df["properties.pivotLocal"]
    .str.lower()
    # One dict of flags per structure; non-string values (missing pivot) get no flag.
    .apply(lambda s: {s: True} if isinstance(s, str) else {})
    # Expand the dicts into one boolean column per flag.
    .apply(pd.Series)
    .fillna(False)
    .astype(bool)
    # `na` marks the structures for which no flag is set. This is computed on
    # the whole frame: `~` on a per-row scalar is not a reliable logical "not".
    .assign(na=lambda flags: ~flags.any(axis="columns"))
)

categories_flags_places_df.sum().sort_values(ascending=False).plot(
    kind="bar", grid=True, rot=35, figsize=(20, 8)
)

## Résumé

<br>✅ Id disponible (id)
<br>✅ 100% des champs obligatoires
<br>✅ Champs non-obligatoires disponibles (code INSEE, téléphone,...)
<br>
<br>❌ Pas de dates
<br>❌ Pas de SIRET
<br>❌ Pas de rna

# Nouveau fichier (v4)

[Documentation](https://echanges.dila.gouv.fr/OPENDATA/Base_donn%c3%a9es_locales/Specifications-datagouv-base-de-donnees-locales-Service-public_V1.1.pdf)

In [None]:
import numpy as np
import pandas as pd
import requests
import tarfile
import zipfile


## Analyse du dataset entier

In [None]:
url = "https://www.data.gouv.fr/fr/datasets/r/73302880-e4df-4d4c-8676-1a61bb997f3d"

In [None]:
# Download the archive. Abort on an HTTP error so that an error page is not
# written to disk as if it were the archive.
response = requests.get(url)
response.raise_for_status()

In [None]:
with open(url.split("/")[-1], "wb") as output_file:
    output_file.write(response.content)


In [None]:
# Unpack the downloaded bz2 tarball into the working directory.
with tarfile.open(url.split("/")[-1], "r:bz2") as tar:
    # The archive comes from the network: use the "data" extraction filter when
    # this Python version provides it, so members cannot be written outside the
    # target directory. Older versions fall back to the original behaviour.
    if hasattr(tarfile, "data_filter"):
        tar.extractall(filter="data")
    else:
        tar.extractall()


In [None]:
import json
import pathlib

# Sort the matches so the choice is deterministic when several dumps are
# present, and fail with a clear message when none is found (the original
# raised a bare StopIteration).
# NOTE(review): the last name in sort order is used — presumably the most
# recent dump if file names are date-prefixed; confirm.
json_paths = sorted(pathlib.Path().glob("*.gouv_local.json"))
if not json_paths:
    raise FileNotFoundError(
        "No '*.gouv_local.json' file found in the working directory"
    )

# Decode explicitly as UTF-8 instead of relying on the platform default encoding.
with json_paths[-1].open(encoding="utf-8") as f:
    data = json.load(f)


In [None]:
# Flatten the "service" records into a frame, then turn NaN and empty strings
# into None so that both count as missing.
raw_df = pd.json_normalize(data["service"]).replace(
    to_replace=[np.nan, ""], value=None
)


In [None]:
raw_df.info()


In [None]:
raw_df.siret.notna().sum()


In [None]:
def compute_field_occupancy_rates(df):
    """Return the percentage of non-null values per column, highest first.

    Args:
        df: DataFrame whose columns are inspected.

    Returns:
        A Series indexed by column name holding the fill rate in percent,
        sorted in descending order.
    """
    missing_share = df.isnull().sum() / df.shape[0]
    filled_pct = (1 - missing_share) * 100
    return filled_pct.sort_values(ascending=False)

compute_field_occupancy_rates(raw_df).to_frame()

## Analyse circonscrite aux éléments pivot

Traitement des données sur la base du dataframe original :

In [None]:
raw_df.sample(3)

In [None]:
types_df = pd.json_normalize(
    raw_df.to_dict(orient="records"), record_path="pivot", meta="id"
)
types_df.info()

In [None]:
types_df.id.duplicated().sum()

In [None]:
types_df.type_service_local.value_counts().head(20).to_frame()

In [None]:
filtered_types_df = types_df[types_df.type_service_local.isin(code_pivot)]
filtered_types_df.sample(3)

On filtre les données sur la base des pivots qui nous concernent :

In [None]:
filtered_df = pd.merge(
    raw_df, filtered_types_df[["id", "type_service_local"]], how="inner", on="id"
)

In [None]:
filtered_df.siret.fillna(filtered_df.siren).notna().sum()

In [None]:
filtered_df.shape[0]

In [None]:
filtered_df.sample(1).to_dict(orient="records")

### taux de remplissage des champs

In [None]:
compute_field_occupancy_rates(filtered_df).to_frame()

In [None]:
adresses_df = pd.json_normalize(
    filtered_df.to_dict(orient="records"),
    record_path="adresse",
    meta="id"
)

In [None]:
adresses_df.type_adresse.value_counts()

In [None]:
adresses_df.id.duplicated().sum()

In [None]:
adresses_df.id.nunique()


conclusion : toutes ont au moins une adresse, quelques-unes ont 2 adresses

### Date de création

In [None]:
filtered_df.date_creation.value_counts()

In [None]:
# The previous cell already shows `date_creation`; this one was an exact copy.
# Inspect the modification dates instead, which the next cell and the
# conclusion below refer to.
filtered_df.date_modification.value_counts()

In [None]:
filtered_df.date_modification.isna().sum()

conclusion : date de maj complète mais, à priori, très mauvaise pour une bonne partie du fichier

### Identifiant local

Nouvel identifiant `id` complet.

In [None]:
filtered_df.id.nunique()

### Nom

In [None]:
filtered_df.nom.isna().sum()

In [None]:
filtered_df.nom.sample(5)

In [None]:
filtered_df[["nom", "code_insee_commune"]].value_counts()

In [None]:
pd.json_normalize(filtered_df[filtered_df.nom == "Caisse d'allocations familiales (Caf) de l'Essonne - accueil d'Évry"].to_dict(orient="records"), record_path="adresse")

conclusion : à priori, plutôt clean

### commune

In [None]:
adresses_df = pd.json_normalize(
    filtered_df.to_dict(orient="records"),
    record_path="adresse",
    meta="id",
)

In [None]:
adresses_df

In [None]:
adresses_df.id.duplicated(keep=False).sum()

In [None]:
# Keep one address per service: after sorting by `type_adresse`, keep="first"
# retains the row with the alphabetically smallest type for each id.
adresses_df = adresses_df.sort_values(by="type_adresse").drop_duplicates(
    subset=["id"], keep="first"
)

In [None]:
adresses_df.id.duplicated(keep=False).sum()

In [None]:
# Attach the de-duplicated address columns to each service.
# Address columns already present on `filtered_df` are dropped first, so the
# cell can be re-run: the original self-overwriting merge produced `_x`/`_y`
# suffixed duplicates on a second run and broke the cells below.
filtered_df = pd.merge(
    filtered_df.drop(columns=adresses_df.columns.difference(["id"]), errors="ignore"),
    adresses_df,
    how="left",
    on="id",
    validate="many_to_one",  # adresses_df must hold at most one row per id
)

In [None]:
filtered_df.nom_commune.isna().sum()

### code postal

In [None]:
filtered_df.code_postal.isna().sum()

### Code Insee

In [None]:
filtered_df.code_insee_commune.isna().sum()

### adresse

In [None]:
filtered_df.adresse.isna().sum()

In [None]:
filtered_df[["numero_voie", "complement1"]].sample(10)

### longitude, latitude

In [None]:
filtered_df.longitude.isna().sum(), filtered_df.latitude.isna().sum()

### telephone

In [None]:
(filtered_df.telephone.astype(str) == "[]").sum()

In [None]:
telephone_df = pd.json_normalize(filtered_df.to_dict(orient="records"), record_path="telephone", meta="id")

In [None]:
telephone_df.shape[0]

In [None]:
telephone_df.id.duplicated(keep=False).sum()

In [None]:
telephone_df[telephone_df.id.duplicated(keep=False)]

In [None]:
import re

# Phone values containing at least one ASCII letter.
# `search` looks anywhere in the string; the original `re.match` only tested
# the first character, so values such as "01 23 45 67 89 poste A" were missed.
# Non-string values are treated as "no letter" instead of raising.
letters_pattern = re.compile(r"[a-zA-Z]")

telephone_df[
    telephone_df.valeur.map(
        lambda s: isinstance(s, str) and letters_pattern.search(s) is not None
    )
]

### courriel

In [None]:
courriel_df = pd.json_normalize(filtered_df.to_dict(orient="records"), record_path="adresse_courriel", meta="id")

In [None]:
courriel_df = courriel_df.rename(columns={0: "valeur"})

In [None]:
courriel_df.valeur.value_counts()

In [None]:
courriel_df.id.duplicated(keep=False).sum()

In [None]:
courriel_df[courriel_df.id.duplicated(keep=False)]

### site web

In [None]:
site_internet_df = pd.json_normalize(filtered_df.to_dict(orient="records"), record_path="site_internet", meta="id")

In [None]:
site_internet_df.id.duplicated(keep=False).sum()

In [None]:
site_internet_df[site_internet_df.id.duplicated(keep=False)]

### description

In [None]:
(filtered_df.texte_reference.astype(str) != "[]").sum()

In [None]:
filtered_df.mission.notna().sum()

In [None]:
filtered_df[filtered_df.mission.notna()].mission.sample(10).to_frame()

### date maj

In [None]:
filtered_df.date_modification.sample(5)

In [None]:
from datetime import datetime

# Parse the modification dates in one vectorised call; missing values become
# NaT instead of raising as the per-element `strptime` did.
pd.to_datetime(filtered_df.date_modification, format="%d/%m/%Y %H:%M:%S")


### lien source

In [None]:
annuaire_df = pd.json_normalize(filtered_df.to_dict(orient="records"), record_path="annuaire", meta="id")

### accessibilité

In [None]:
adresses_df.accessibilite.value_counts()

### horaires d'ouverture

In [None]:
horaires_df = pd.json_normalize(filtered_df.to_dict(orient="records"), record_path="plage_ouverture", meta="id")

In [None]:
filtered_df.plage_ouverture.map(json.dumps)

### labels nationaux

In [None]:
filtered_df.partenaire.value_counts()

In [None]:
# Number of services whose name contains "afpa" (case-insensitive).
# Vectorised string methods replace the row-wise apply over the whole frame;
# rows without a name count as "no match" instead of raising.
filtered_df.nom.str.lower().str.contains("afpa", regex=False, na=False).sum()

In [None]:
types_df.type_service_local.value_counts().head(20).to_frame()

In [None]:
filtered_df.type_service_local.value_counts().head(50).to_frame()

### SIRET

In [None]:
filtered_df.siret.notna().sum()

In [None]:
filtered_df.siren.notna().sum()

In [None]:
filtered_df[filtered_df.siren.notna() | filtered_df.siret.notna()].shape[0]

Champs siret et siren, assez peu renseignés

### Analyse des secteurs

In [None]:
# Service type of the first pivot entry of each record.
# `raw_df` is the frame whose `pivot` lists are expanded earlier in this
# section; the original indexed `df`, the frame built from the v3 API at the
# top of the notebook.
# NOTE(review): use `filtered_df` instead if the count should be restricted to
# the pivots listed in `code_pivot` — confirm the intent.
raw_df["pivot"].str[0].str["type_service_local"].value_counts()

## Résumé nouveau fichier

<br>✅ Id disponible (id)
<br>✅ 100% des champs obligatoires
<br>✅ Champs non-obligatoires disponibles (code Insee, téléphone, dates...)
<br>
<br>⚠️ SIRET / SIREN présents mais assez peu renseignés
<br>❌ Pas de rna