In [None]:
from datetime import date
from dateutil.parser import parse as dateutil_parse
import os
from pathlib import Path
import importlib

import dotenv
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import soliguide

importlib.reload(soliguide)

In [None]:
# Notebook-wide display settings: never truncate the rows of a printed frame
# and use a 12x8 default size for every matplotlib figure.
pd.set_option("display.max_rows", None)
plt.rcParams["figure.figsize"] = [12, 8]

In [None]:
# Load the environment variables from the `.env` file located by
# `find_dotenv()`, then build the API client with the token it provides.
dotenv.load_dotenv(dotenv_path=dotenv.find_dotenv())

soliguide_api_client = soliguide.APIClient(
    user_agent="betaTest",
    token=os.environ["SOLIGUIDE_API_TOKEN"],
    base_url="https://api.soliguide.fr/",
)

In [None]:
# Query the API for every place in France and write the result to a JSON
# snapshot whose file name carries today's date (YYYYMMDD).
all_places_data = soliguide_api_client.search(
    location_geo_value="france", location_geo_type="pays"
)

all_places_df = pd.DataFrame.from_records(data=all_places_data)
snapshot_stamp = date.today().strftime("%Y%m%d")
all_places_df.to_json(f"./soliguide-places-{snapshot_stamp}.json", orient="records")


### Nettoyage

In [None]:
# Pick the snapshot whose file name sorts last; with the YYYYMMDD suffix used
# when saving, that is the most recent download.
data_file_path = sorted(Path(".").glob("soliguide-places-*.json"))[-1]

# Round-trip through plain records so that nested objects are flattened into
# dotted column names by `json_normalize`.
raw_place_records = pd.read_json(data_file_path).to_dict(orient="records")

all_places_df = (
    pd.json_normalize(raw_place_records)
    .set_index("lieu_id")
    .assign(
        # Parse the timestamp strings into datetime objects.
        createdAt=lambda places: places["createdAt"].apply(dateutil_parse),
        updatedAt=lambda places: places["updatedAt"].apply(dateutil_parse),
    )
    # Treat NaN and empty strings alike as "missing".
    # NOTE(review): presumably relies on a pandas version that honours an
    # explicit `value=None` in `replace` — confirm against the pinned version.
    .replace([np.nan, ""], None)
)
all_places_df.sample(5)

### Nombre de lieux uniques

In [None]:
# `lieu_id` is the index of the places frame, so count its distinct values directly.
all_places_df.index.nunique()

### Nombre de lieux en France

In [None]:
# Total number of rows (one row per downloaded place).
len(all_places_df)

### Taux de remplissage des champs de structures

In [None]:
def compute_field_occupancy_rates(df):
    """Return the percentage of non-null values for each column of `df`.

    The result is a Series indexed by column name, with values between 0 and
    100, sorted from the most filled column to the least filled one.
    """
    row_count = df.shape[0]
    missing_share = df.isnull().sum() / row_count
    filled_pct = (1 - missing_share) * 100
    return filled_pct.sort_values(ascending=False)

In [None]:
from IPython.display import display

# Show one table per top-level field: columns are grouped by the part of
# their name that precedes the first dot.
places_occupancy_rates = compute_field_occupancy_rates(all_places_df)
for _prefix, prefix_rates in places_occupancy_rates.groupby(
    lambda field: field.split(".")[0]
):
    display(prefix_rates.to_frame())

### Distribution de la date de 1er référencement des lieux

In [None]:
# Histogram of the creation timestamps, split into 100 bins.
all_places_df["createdAt"].hist(bins=100)

### Distribution de la date de dernière mise-à-jour

In [None]:
# Histogram of the last-update timestamps, split into 100 bins.
all_places_df["updatedAt"].hist(bins=100)

### Typologie

* Pas de champ typologie

In [None]:
# Places carry no typology field, so categories are guessed from keywords
# found in the lower-cased place name.
# Missing names become empty strings so they simply match no keyword instead
# of raising a TypeError on the substring test.
place_names = all_places_df["name"].str.lower().fillna("")


def _place_name_contains(keyword):
    """Return a boolean Series: True where the place name contains `keyword`."""
    # regex=False: keywords are plain substrings, not patterns.
    return place_names.str.contains(keyword, regex=False)


categories_flags_places_df = pd.DataFrame(
    {
        "restos_du_c": _place_name_contains("restos du c"),
        # NOTE(review): matched without accent, so names spelled "épicerie"
        # are not counted here — confirm this is intended.
        "epicerie": _place_name_contains("epicerie"),
        "caf": _place_name_contains("caf"),
        "ccas": _place_name_contains("ccas"),
        "pole_emploi": _place_name_contains("pôle emploi"),
        "secours_populaire": _place_name_contains("secours populaire"),
        "secours_catholique": _place_name_contains("secours catholique"),
        "mairie": _place_name_contains("mairie"),
        "commune": _place_name_contains("commune"),
        "association": _place_name_contains("association"),
        "mission_locale": _place_name_contains("mission locale"),
        "action_emploi": _place_name_contains("action emploi"),
        "caarud": _place_name_contains("caarud"),
        "croix_rouge": _place_name_contains("croix") & _place_name_contains("rouge"),
        "pmi": _place_name_contains("pmi"),
        "*thèque": (
            _place_name_contains("médiathèque") | _place_name_contains("bibliothèque")
        ),
        "mjd": _place_name_contains("mjd"),
        "france_services": _place_name_contains("espace france services"),
        "cidff": _place_name_contains("cidff"),
        "médiavipp": _place_name_contains("médiavipp"),
        "nouvelles_voies": _place_name_contains("nouvelles voies"),
        "adil": _place_name_contains("adil"),
        "maison_emploi": _place_name_contains("maison") & _place_name_contains("emploi"),
    }
).assign(
    # "na" marks the places whose name matched none of the keywords above.
    # Computed on the whole frame at once rather than row by row.
    na=lambda flags: ~flags.any(axis="columns")
)

# Number of places per guessed category, largest first.
categories_flags_places_df.sum().sort_values(ascending=False).plot(kind="bar", grid=True, rot=35, figsize=(20, 8))


In [None]:
# Same bar chart of per-category counts, leaving out the "na" column.
matched_category_counts = categories_flags_places_df.drop(columns=["na"]).sum()
matched_category_counts.sort_values(ascending=False).plot(
    kind="bar", grid=True, rot=35, figsize=(20, 8)
)


### Répartition géographique

In [None]:
# Number of places for each value of the "position.departement" column.
places_per_departement = all_places_df["position.departement"].value_counts()
places_per_departement.to_frame()

### Nombre de lieux par statut sur soliguide

In [None]:
# Number of places for each value of the "statut" column.
places_per_statut = all_places_df["statut"].value_counts()
places_per_statut.to_frame()

# Services

In [None]:
# Build one row per entry of each place's "services_all" list, keeping the
# owning place id alongside as the "lieu_id" column.
place_records = all_places_df.reset_index().to_dict(orient="records")
all_services_df = pd.json_normalize(
    place_records, record_path="services_all", meta=["lieu_id"]
).replace([np.nan, ""], None)  # NaN and empty strings both count as missing
all_services_df.sample(5)


### Nombre de services

In [None]:
# number of service rows
len(all_services_df)

In [None]:
# number of distinct service identifiers
all_services_df["serviceObjectId"].nunique()

### Nombre de services fermés

In [None]:
# Count the services whose "close.actif" value equals True.
all_services_df["close.actif"].eq(True).sum()

### Nombre de services par typologie

In [None]:
# by category: each service code is looked up in
# `soliguide.categories_by_subcategories` before counting
service_category_labels = all_services_df["categorie"].apply(
    lambda code: soliguide.categories_by_subcategories[code]
)
service_category_labels.value_counts().to_frame()

In [None]:
# by sub-category: each service code is looked up in `soliguide.categories`
# before counting
service_subcategory_labels = all_services_df["categorie"].apply(
    lambda code: soliguide.categories[code]
)
service_subcategory_labels.value_counts().to_frame()

### Taux de remplissage des champs services

In [None]:
from IPython.display import display

# Show one table per top-level field: columns are grouped by the part of
# their name that precedes the first dot.
services_occupancy_rates = compute_field_occupancy_rates(all_services_df)
for _prefix, prefix_rates in services_occupancy_rates.groupby(
    lambda field: field.split(".")[0]
):
    display(prefix_rates.to_frame())

### Typologies

In [None]:
# Restrict the analysis to the places located in the Essonne departement.
structures_df = all_places_df[all_places_df["position.departement"] == "Essonne"]

# Categories are guessed from keywords found in the lower-cased place name.
# Missing names become empty strings so they simply match no keyword instead
# of raising a TypeError on the substring test.
structure_names = structures_df["name"].str.lower().fillna("")


def _structure_name_contains(keyword):
    """Return a boolean Series: True where the structure name contains `keyword`."""
    # regex=False: keywords are plain substrings, not patterns.
    return structure_names.str.contains(keyword, regex=False)


# Building the frame column by column keeps every category column present
# even when the departement filter selects no row.
categories_flags_structures_df = pd.DataFrame(
    {
        "restos_du_c": _structure_name_contains("restos du c"),
        # NOTE(review): matched without accent, so names spelled "épicerie"
        # are not counted here — confirm this is intended.
        "epicerie": _structure_name_contains("epicerie"),
        "caf": _structure_name_contains("caf"),
        "ccas": _structure_name_contains("ccas"),
        "pole_emploi": _structure_name_contains("pôle emploi"),
        "secours_populaire": _structure_name_contains("secours populaire"),
        "secours_catholique": _structure_name_contains("secours catholique"),
        "mairie": _structure_name_contains("mairie"),
        "commune": _structure_name_contains("commune"),
        "association": _structure_name_contains("association"),
        "mission_locale": _structure_name_contains("mission locale"),
        "action_emploi": _structure_name_contains("action emploi"),
        "caarud": _structure_name_contains("caarud"),
        "croix_rouge": (
            _structure_name_contains("croix") & _structure_name_contains("rouge")
        ),
        "pmi": _structure_name_contains("pmi"),
        "*thèque": (
            _structure_name_contains("médiathèque")
            | _structure_name_contains("bibliothèque")
        ),
        "mjd": _structure_name_contains("mjd"),
        "france_services": _structure_name_contains("espace france services"),
        "cidff": _structure_name_contains("cidff"),
        "médiavipp": _structure_name_contains("médiavipp"),
        "nouvelles_voies": _structure_name_contains("nouvelles voies"),
        "adil": _structure_name_contains("adil"),
        "maison_emploi": (
            _structure_name_contains("maison") & _structure_name_contains("emploi")
        ),
    }
).assign(
    # "na" marks the structures whose name matched none of the keywords above.
    # Computed on the whole frame at once rather than row by row.
    na=lambda flags: ~flags.any(axis="columns")
)


In [None]:
# Bar chart of the number of structures per guessed category, largest first.
structure_category_counts = categories_flags_structures_df.sum()
structure_category_counts.sort_values(ascending=False).plot(
    kind="bar", grid=True, rot=45, figsize=(20, 8)
)
