In [1]:
# Manage all imports
import pandas as pd
import os
import datetime
import sys

# Dirty trick to be able to import common odis modules, if the notebook is not
# executed from 13_odis: walk up from the working directory until a path ending
# with "13_odis" is found, make it the working directory and add it to sys.path.
start_dir = os.getcwd()
current_dir = start_dir
levels_up = 0
while not current_dir.endswith("13_odis"):
    parent_dir = os.path.dirname(current_dir)
    if parent_dir == current_dir:
        # dirname() of the filesystem root is the root itself: stop here instead
        # of looping forever, and leave the working directory untouched.
        raise RuntimeError(
            f"Could not find a '13_odis' directory at or above '{start_dir}'; "
            "run this notebook from inside the 13_odis repository."
        )
    current_dir = parent_dir
    levels_up += 1

# Only move once the target folder is known to exist.
for _ in range(levels_up):
    print("changing to parent dir")
if levels_up:
    os.chdir(current_dir)

print(os.getcwd())
# Re-running the cell must not stack duplicate entries in sys.path.
if current_dir not in sys.path:
    sys.path.append(current_dir)

changing to parent dir
/Users/alex/dev/13_odis


In [2]:
# additional imports
from common.config import load_config
from common.data_source_model import DataSourceModel
from common.utils.file_handler import FileHandler
from common.utils.interfaces.data_handler import OperationType

In [None]:
# Parameters for papermill (the values below are the defaults of this notebook)
model_name = "emploi.bmo_2024"  # model looked up in datasources.yaml
filepath = "data/imports/emploi/emploi.bmo_2024_1.zip"  # zip archive to unpack

In [None]:
# Containers shared by the following cells:
# dataframes maps an artifact name to its DataFrame, artifacts collects the
# artifacts produced while unpacking the archive.
dataframes, artifacts = {}, []

# Resolve the data source model declared in datasources.yaml
config = load_config("datasources.yaml", response_model=DataSourceModel)
model = config.get_model(model_name=model_name)

# Reference timestamp, handed to the metadata dump as the start time
start_time = datetime.datetime.now()

# File handler used for file loads and dumps
handler = FileHandler()

In [5]:
from zipfile import ZipFile

# Unzip the archive and dump every member file through the file handler.
# The archive and each member stream are opened with context managers so they
# are closed even when a dump raises.
with ZipFile(filepath) as zip_archive:
    for member in zip_archive.infolist():
        # Directory entries carry no payload to dump.
        if member.is_dir():
            continue

        # Text after the last dot of the member name, used as the artifact format.
        # NOTE(review): a member name without any dot yields the whole name
        # here - confirm the archives never contain such files.
        member_format = member.filename.split(".")[-1]
        print(member_format)

        with zip_archive.open(member, "r") as member_stream:
            f_member = member_stream.read()

        # NOTE(review): every member is dumped under the same artifact name
        # "file"; members sharing a format presumably target the same output
        # file - verify against FileHandler before feeding multi-file archives.
        artifact = handler.artifact_dump(
            f_member,
            "file",
            model,
            format=member_format,
        )

        print(artifact.model_dump(mode="yaml"))

        artifacts.append(artifact)

xlsx
2025-04-11 12:17:58,585 - main - INFO :: file_handler.py :: emploi.bmo_2024 -> results saved to : 'data/imports/emploi/emploi.bmo_2024_file.xlsx'
{'name': 'file', 'storage_info': {'location': 'data/imports/emploi', 'format': 'xlsx', 'file_name': 'emploi.bmo_2024_file.xlsx', 'encoding': 'utf-8'}, 'load_to_bronze': True, 'success': True}


In [6]:
# Write the metadata of the preprocess operation for this model, listing the
# artifacts dumped above. The step is reported as complete with zero errors and
# no pages.
preprocess_metadata = handler.dump_metadata(
    model=model,
    operation=OperationType.PREPROCESS,
    start_time=start_time,
    complete=True,
    errors=0,
    artifacts=artifacts,
    pages=[],
)

2025-04-11 12:18:00,197 - main - INFO :: file_handler.py :: emploi.bmo_2024 -> results saved to : 'data/imports/emploi/emploi.bmo_2024_metadata_preprocess.json'
2025-04-11 12:18:00,199 - main - DEBUG :: file_handler.py :: Metadata written in: 'data/imports/emploi/emploi.bmo_2024_metadata_preprocess.json'


In [7]:
from pathlib import Path

# Only the first dumped artifact is loaded.
artifact = artifacts[0]
storage = artifact.storage_info

# Rebuild the path of the dumped workbook from the artifact's storage info.
xls_base_path = Path(storage.location)
xls_filepath = xls_base_path.joinpath(storage.file_name)

# Read the open-data sheet and register the frame under the artifact name.
df = pd.read_excel(xls_filepath, sheet_name="BMO_2024_open_data")
dataframes[artifact.name] = df

df.head()


Unnamed: 0,annee,Code métier BMO,Nom métier BMO,Famille_met,Lbl_fam_met,REG,NOM_REG,Dept,NomDept,BE24,NOMBE24,met,xmet,smet
0,2024,A0X40,Agriculteurs,Z,Autres métiers,1,Guadeloupe,971,Guadeloupe,101,BASSIN BASSE-TERRE,95,31,44
1,2024,A0X40,Agriculteurs,Z,Autres métiers,1,Guadeloupe,971,Guadeloupe,102,BASSIN GRANDE-TERRE,167,69,86
2,2024,A0X40,Agriculteurs,Z,Autres métiers,1,Guadeloupe,971,Guadeloupe,103,BASSIN MARIE-GALANTE,17,*,*
3,2024,A0X40,Agriculteurs,Z,Autres métiers,1,Guadeloupe,978,Guadeloupe,104,BASSIN ILES DU NORD,6,6,*
4,2024,A0X40,Agriculteurs,Z,Autres métiers,1,Guadeloupe,971,Guadeloupe,105,BASSIN CENTRE,45,37,30


In [8]:
from urllib.parse import quote

from dotenv import dotenv_values
import sqlalchemy

# prepare db client: connection settings are read through dotenv
vals = dotenv_values()

# Fail early with a readable message instead of building a URL that contains
# the literal text "None" for a key that is absent or declared without a value.
required_keys = ("PG_DB_USER", "PG_DB_PWD", "PG_DB_HOST", "PG_DB_PORT", "PG_DB_NAME")
missing_keys = [key for key in required_keys if vals.get(key) is None]
if missing_keys:
    raise KeyError(f"Missing database settings: {', '.join(missing_keys)}")

# User name and password are percent-encoded so that characters such as
# '@', ':', '/' or '%' cannot corrupt the URL structure.
# NOTE(review): this expects the raw (not already URL-encoded) credentials in
# the dotenv file - confirm no environment stores them pre-encoded.
conn_str = "postgresql://{}:{}@{}:{}/{}".format(
    quote(vals["PG_DB_USER"], safe=""),
    quote(vals["PG_DB_PWD"], safe=""),
    vals["PG_DB_HOST"],
    vals["PG_DB_PORT"],
    vals["PG_DB_NAME"],
)

dbengine = sqlalchemy.create_engine(conn_str)

In [9]:
# Insert every dataframe into the bronze schema, one table per artifact,
# replacing any table that already exists.
# The artifact name is lowercased to avoid issues in Postgres.
# NOTE(review): only the artifact-name suffix is lowercased, model.table_name
# is used as-is - confirm configured table names are always lowercase.
for name, dataframe in dataframes.items():
    table_name = f"{model.table_name}_{name.lower()}"
    dataframe.to_sql(
        name=table_name,
        con=dbengine,
        schema="bronze",
        index=True,
        if_exists="replace",
    )
