In [None]:
import requests
from zipfile import ZipFile 
from io import BytesIO
import os
import pathlib
import pandas as pd

from common.utils.source_extractors import NotebookResult


def download() -> dict:
    """Download the BMO 2024 open data and convert it to CSV files.

    Fetches the zipped workbook from data.gouv.fr, extracts the first
    non-directory member of the archive into ``<cwd>/../data/emploi`` and
    writes two CSV files into that directory: one built from the
    "Description des variables" sheet (the data dictionary) and one built
    from the "BMO_2024_open_data" sheet (the data itself).

    Returns:
        dict: the ``model_dump()`` of a NotebookResult referencing the data
        CSV file and the dictionary CSV file.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.Timeout: if the server does not answer in time.
        FileNotFoundError: if the downloaded archive contains no file.
    """
    dictionary_sheet = "Description des variables"
    data_sheet = "BMO_2024_open_data"

    resp = requests.get(
        "https://www.data.gouv.fr/fr/datasets/r/4319f5e8-a8e6-476c-8da0-c0d3439ebb55",
        # without a timeout the call can block forever on a stalled server
        timeout=60,
    )
    resp.raise_for_status()

    # the response body is a zip archive; keep it in memory
    zipped_content = BytesIO(resp.content)

    print(f"Zipped content len: {len(zipped_content.getvalue())/1000} Kb")

    # unzip the content in the /data directory
    data_dir = pathlib.Path.cwd() / "../data/emploi"

    print(f"Unzipping content to '{data_dir}'")

    # make sure the target directory exists
    data_dir.mkdir(parents=True, exist_ok=True)

    with ZipFile(zipped_content) as zip_archive:
        # the archive wraps the workbook in a folder: skip directory entries
        # and keep the first real file
        workbook_member = next(
            (
                info
                for info in zip_archive.infolist()
                if not info.filename.endswith("/")
            ),
            None,
        )
        if workbook_member is None:
            # fail loudly instead of implicitly returning None
            raise FileNotFoundError(
                "The downloaded archive does not contain any file"
            )

        # extract() returns the sanitized path actually written on disk
        excel_path = zip_archive.extract(member=workbook_member, path=data_dir)

    # parse the workbook once for both sheets; giving pandas a path (rather
    # than an open file object) lets it close the file itself
    sheets = pd.read_excel(excel_path, sheet_name=[dictionary_sheet, data_sheet])

    # generate csv from the data dictionary
    dictionary_csv_path = os.path.join(data_dir, "bmo_2024_dictionnaire.csv")
    sheets[dictionary_sheet].to_csv(dictionary_csv_path, index=False)

    # generate csv from the data
    data_csv_path = os.path.join(data_dir, "bmo_2024_data.csv")
    sheets[data_sheet].to_csv(data_csv_path, index=False)

    # return a dict that will be jsonified in the stdout
    return NotebookResult(
        data_files=[data_csv_path],
        dictionnary_file=dictionary_csv_path,
        format="csv",
    ).model_dump()

# Run the extraction when this cell executes; as the cell's last expression,
# the returned dict is presumably what the notebook runner captures as output.
download()

Zipped content len: 3244.393 Kb
File Name                                             Modified             Size
Donnees_consolidees_2024/                      2024-04-25 15:30:16            0
Donnees_consolidees_2024/Base_open_data_BMO_2024.xlsx 2024-04-25 15:28:28      3491167


Analyzing ./Donnees_consolidees_2024/Base_open_data_BMO_2024.xlsx
       annee Code métier BMO Nom métier BMO Famille_met  \
0       2024           A0X40   Agriculteurs           Z   
1       2024           A0X40   Agriculteurs           Z   
2       2024           A0X40   Agriculteurs           Z   
3       2024           A0X40   Agriculteurs           Z   
4       2024           A0X40   Agriculteurs           Z   
...      ...             ...            ...         ...   
53438   2024           W1X80     Formateurs           S   
53439   2024           W1X80     Formateurs           S   
53440   2024           W1X80     Formateurs           S   
53441   2024           W1X80     Formateurs           S   
53442   2024           W1X80     Formateurs           S   

                                 Lbl_fam_met  REG                     NOM_REG  \
0                             Autres métiers    1                  Guadeloupe   
1                             Autres métiers    1              