In [None]:
from collections import defaultdict

import dotenv
import numpy as np
import pandas as pd


In [None]:
# Lift pandas' display limits so frames are shown with all rows and columns.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

# Load environment variables from the .env file located by find_dotenv().
dotenv.load_dotenv(dotenv.find_dotenv())


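Ce notebook permet de re-normaliser les données extraites : chaque export dénormalisé est découpé en tables (ressources, contacts, horaires, familles, catégories, sous-catégories), puis chaque table est écrite dans un fichier CSV.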

In [None]:
def normalize_dataframe(df: pd.DataFrame) -> dict[str, pd.DataFrame]:
    """Split one denormalized export into one de-duplicated frame per table.

    NaN values and empty strings are first replaced by ``None``. The export
    is then projected onto the columns of each table and duplicate rows are
    dropped.

    Args:
        df: The denormalized export. It must contain ``ID_RES``, the contact
            columns, ``JOUR_HOR`` and the code/label columns selected below.

    Returns:
        A dict keyed, in this order, by ``"ressources"``, ``"contacts"``,
        ``"horaires"``, ``"familles"``, ``"categories"`` and
        ``"sous_categories"``.

    Raises:
        KeyError: If a column selected by name is absent from ``df``.
    """
    # Treat NaN and empty strings alike as missing values.
    cleaned = df.replace([np.nan, ""], None)
    column_names = cleaned.columns

    # Boolean masks keep the selected columns in their original order.
    ressources_mask = (
        column_names.isin(
            [
                "ID_RES",
                "STRUCTURE",
                "LIBELLE_SERVICE",
                "DESCRIPTION_SERVICE",
                "DATE DERNIERE MAJ",
                "SERVICE_RSP",
            ]
        )
        | column_names.str.endswith("_ADR")
        | column_names.str.endswith("_PHY")
    )
    horaires_mask = column_names.isin(
        ["ID_RES", "COMMENTAIRES_HORAIRE_RSP"]
    ) | column_names.str.endswith("_HOR")

    contact_columns = [
        "ID_RES",
        "ID_CTC",
        "TEL_1_CTC",
        "TEL_2_CTC",
        "FAX_CTC",
        "SITE_INTERNET_CTC",
        "MAIL_CTC",
    ]

    tables = {
        # One row per resource: the first occurrence of each ID_RES is kept.
        "ressources": cleaned.loc[:, ressources_mask].drop_duplicates(
            subset="ID_RES"
        ),
        "contacts": cleaned[contact_columns].drop_duplicates(),
        # Rows without a day carry no opening-hours information.
        "horaires": (
            cleaned.loc[:, horaires_mask]
            .drop_duplicates()
            .dropna(subset=["JOUR_HOR"])
        ),
    }

    # The three remaining tables share the same (ID_RES, code, label) shape.
    for tbl_name, code_column, label_column in (
        ("familles", "CODE_FAM", "FamilleBesoin"),
        ("categories", "CODE_CAT", "Besoin"),
        ("sous_categories", "CODE_SSC", "Sous besoin"),
    ):
        tables[tbl_name] = cleaned[
            ["ID_RES", code_column, label_column]
        ].drop_duplicates()

    return tables


In [None]:
# Location of the denormalized exports (one Excel workbook per export).
EXPORT_BASE_URL = "https://data-inclusion-lake.s3.fr-par.scw.cloud/sources/odspep/2022-11-23/denormalized"

# Order matters: the date-conversion step below treats dfs[0] differently
# from dfs[1:], so the first export must stay first.
EXPORT_FILENAMES = [
    "exportDORA14092022.xlsx",
    "exportDora0311-1.xlsx",
    "exportDora0311-2.xlsx",
    "exportDora0311-3.xlsx",
    "exportDora0311-4.xlsx",
]

# Build the list in a single statement so that re-running this cell always
# yields exactly one frame per export (appending from separate cells would add
# a duplicate frame on every re-run). Every column is read as text.
dfs = [
    pd.read_excel(f"{EXPORT_BASE_URL}/{filename}", dtype=str)
    for filename in EXPORT_FILENAMES
]


In [None]:
# Every export except the first gets its "DATE DERNIERE MAJ" column derived
# from "MAX_DATE", read here as a number of days counted from 1899-12-30
# (presumably Excel serial dates -- confirm against the workbooks).
for later_export_df in dfs[1:]:
    last_update = pd.to_datetime(
        later_export_df["MAX_DATE"].astype(float), unit="D", origin="1899-12-30"
    )
    # Vectorized formatting as "YYYY-MM-DD HH:MM:SS". Unlike a per-row
    # isoformat() call, a missing date stays missing instead of becoming the
    # literal string "NaT".
    later_export_df["DATE DERNIERE MAJ"] = last_update.dt.strftime(
        "%Y-%m-%d %H:%M:%S"
    )


In [None]:
# Normalize every export, gathering the per-table pieces first so that each
# table is concatenated exactly once (instead of re-concatenating the growing
# frame for every export).
pieces_by_tbl_name = defaultdict(list)

for export_df in dfs:
    # Distinct loop names: the inner loop no longer rebinds the outer variable.
    for tbl_name, tbl_df in normalize_dataframe(export_df).items():
        pieces_by_tbl_name[tbl_name].append(tbl_df)

# Kept as a defaultdict(pd.DataFrame) so an unknown table name still yields an
# empty frame, as before. Row indexes are not reset by the concatenation.
df_by_tbl_name = defaultdict(pd.DataFrame)
for tbl_name, pieces in pieces_by_tbl_name.items():
    df_by_tbl_name[tbl_name] = pd.concat(pieces)


In [None]:
# For each table, print how many rows share their ID_RES with at least one
# other row (keep=False flags every member of a duplicated group).
for tbl_name, tbl_df in df_by_tbl_name.items():
    shares_id_res = tbl_df.duplicated(subset=["ID_RES"], keep=False)
    print(tbl_name, shares_id_res.sum())


In [None]:
# Write each table to "<table name>.csv" in the working directory, using "|"
# as the separator and leaving out the index.
for tbl_name, tbl_df in df_by_tbl_name.items():
    tbl_df.to_csv(f"{tbl_name}.csv", sep="|", index=False)
