In [3]:
# Manage all imports
import pandas as pd
import os
import datetime
import sys

# Dirty trick to be able to import common odis modules, if the notebook is not executed from 13_odis
current_dir = os.getcwd()
parent_dir = os.path.dirname(os.getcwd())
while not current_dir.endswith("13_odis"):
    print("changing to parent dir")
    os.chdir(parent_dir)
    current_dir = parent_dir
    parent_dir = os.path.dirname(current_dir)

print(os.getcwd())
sys.path.append(current_dir)

changing to parent dir
/home/jbn/13_odis


In [4]:
# additional imports
from common.config import load_config
from common.data_source_model import DataSourceModel
from common.utils.file_handler import FileHandler
from common.utils.interfaces.data_handler import OperationType

In [5]:
# Define parameters for papermill
model_name = "emploi.bmo_2025"
filepath = 'data/imports/emploi/emploi.bmo_2025_1.zip'

In [6]:
# Initialize common variables
dataframes = {}
artifacts = []

config = load_config("datasources.yaml", response_model=DataSourceModel)
model = config.get_model( model_name = model_name )
start_time = datetime.datetime.now()

# Instantiate File Handler for file loads and dumps
handler = FileHandler()

In [7]:
from zipfile import ZipFile

# unzip and dump files into the data/imports folder
with open(filepath, 'rb') as f:
    zip_archive = ZipFile(f)

    zip_members = zip_archive.infolist()
    for member in zip_members:

        if not member.is_dir():

            member_filename = member.filename
            member_name = member_filename.split(".")[0]
            member_format = member_filename.split(".")[-1]
            print(member_format)
            
            f_member = zip_archive.open( member, 'r' ).read()
    
            artifact = handler.artifact_dump(
                f_member,
                "file",
                model,
                format = member_format
            )

            print(artifact.model_dump(mode="yaml"))

            artifacts.append(artifact)

xml
2025-08-27 17:03:50,272 - DEBUG :: file_handler.py :: dump (162) :: dumping: data/imports/emploi/emploi.bmo_2025_file.xml
2025-08-27 17:03:50,275 - DEBUG :: file_handler.py :: file_dump (305) :: emploi.bmo_2025 -> results saved to : 'data/imports/emploi/emploi.bmo_2025_file.xml'
{'name': 'file', 'storage_info': {'location': 'data/imports/emploi', 'format': 'xml', 'file_name': 'emploi.bmo_2025_file.xml', 'encoding': 'utf-8'}, 'load_to_bronze': True, 'success': True}
rels
2025-08-27 17:03:50,278 - DEBUG :: file_handler.py :: dump (162) :: dumping: data/imports/emploi/emploi.bmo_2025_file.rels
2025-08-27 17:03:50,279 - DEBUG :: file_handler.py :: file_dump (305) :: emploi.bmo_2025 -> results saved to : 'data/imports/emploi/emploi.bmo_2025_file.rels'
{'name': 'file', 'storage_info': {'location': 'data/imports/emploi', 'format': 'rels', 'file_name': 'emploi.bmo_2025_file.rels', 'encoding': 'utf-8'}, 'load_to_bronze': True, 'success': True}
xml
2025-08-27 17:03:50,282 - DEBUG :: file_han

In [8]:
preprocess_metadata = handler.dump_metadata(
    model = model,
    operation = OperationType.PREPROCESS,
    start_time = start_time,
    complete = True,
    errors = 0,
    artifacts = artifacts,
    pages = []
)

2025-08-27 17:03:50,469 - DEBUG :: file_handler.py :: dump (162) :: dumping: data/imports/emploi/emploi.bmo_2025_metadata_preprocess.json
2025-08-27 17:03:50,472 - DEBUG :: file_handler.py :: file_dump (305) :: emploi.bmo_2025 -> results saved to : 'data/imports/emploi/emploi.bmo_2025_metadata_preprocess.json'
2025-08-27 17:03:50,473 - DEBUG :: file_handler.py :: dump_metadata (473) :: Metadata written in: 'data/imports/emploi/emploi.bmo_2025_metadata_preprocess.json'


In [9]:
from pathlib import Path
artifact = artifacts[0]

xls_base_path = Path( artifact.storage_info.location )
xls_filepath = xls_base_path / artifact.storage_info.file_name

df = pd.read_excel(
    xls_filepath, 
    sheet_name='BMO_2025_open_data',
    ) 

dataframes[ artifact.name ] = df

df.head()


ValueError: Excel file format cannot be determined, you must specify an engine manually.

In [None]:
from dotenv import dotenv_values
import sqlalchemy
from sqlalchemy import text

# prepare db client
vals = dotenv_values()

conn_str = "postgresql://{}:{}@{}:{}/{}".format(
    vals["PG_DB_USER"],
    vals["PG_DB_PWD"],
    vals["PG_DB_HOST"],
    vals["PG_DB_PORT"],
    vals["PG_DB_NAME"]
)

dbengine = sqlalchemy.create_engine(conn_str)

In [None]:
# insert all to bronze
# make the final table name lowercase to avoid issues in Postgre

for name, dataframe in dataframes.items():

    subtable_name = f"{model.table_name}_{name.lower()}"
    query_str = f"DROP TABLE IF EXISTS bronze.{subtable_name} CASCADE"

    # dropping existing table with cascade
    with dbengine.connect() as con:
        print(f"Dropping if exists: {subtable_name}")
        result = con.execute(text(query_str))
        con.commit()

    print(f"Inserting DataFrame {subtable_name}")
    dataframe.to_sql(
        name = subtable_name,
        con = dbengine,
        schema = 'bronze',
        index = True,
        if_exists = 'replace'
    )