# Preprocessor Notebook : Excel

Ce notebook traite un fichier Excel de façon assez générique : extrait la première feuille et la charge en couche Bronze.

 ### Paramètres
 Ce Notebook prend des paramètres en entrée, définis dans la cellule de la section « Paramètres du Notebook » (plus bas).
 La cellule a le tag "parameters" ce qui permet de lui passer des valeurs via papermill.
 - filepath : le chemin vers le fichier Excel à traiter
 - model_name : le nom du modèle source

 ### Principe
 La première feuille est chargée dans un DataFrame puis sauvegardée en .xlsx, et chargée en base de données Bronze.

## Initialisation

Les cellules suivantes servent à importer les modules nécessaires et à préparer les variables communes utilisées dans les traitements.

In [None]:
# Baseline imports
import pandas as pd
import os
import sys
import datetime

# Dirty trick to be able to import common odis modules, if the notebook is not executed from 13_odis:
# walk up from the current working directory until a directory whose path ends with "13_odis" is reached.
start_dir = os.getcwd()
current_dir = start_dir
parent_dir = os.path.dirname(current_dir)
while not current_dir.endswith("13_odis"):
    if parent_dir == current_dir:
        # Filesystem root reached: dirname() no longer changes the path, so without
        # this guard the search would loop forever. Restore the cwd and fail loudly.
        os.chdir(start_dir)
        raise RuntimeError(
            f"Could not find a '13_odis' directory above '{start_dir}'; "
            "run this notebook from inside the 13_odis repository."
        )
    print("changing to parent dir")
    os.chdir(parent_dir)
    current_dir = parent_dir
    parent_dir = os.path.dirname(current_dir)

print(os.getcwd())
# Add the directory that was found to the import path (used by the `common.*` imports below).
sys.path.append(current_dir)

changing to parent dir
/Users/alex/dev/13_odis


In [13]:
# additional imports
from common.config import load_config
from common.data_source_model import DataSourceModel
from common.utils.file_handler import FileHandler, XlsxReader
from common.utils.interfaces.data_handler import OperationType
from common.utils.interfaces.loader import Column

## Paramètres du Notebook
Paramètres pouvant être passés en input par papermill.

Seuls des types built-in semblent marcher (str, int etc), les classes spécifiques ou les objets mutables (datetime...) semblent faire planter papermill.

Doc officielle de papermill : [parametrize](https://papermill.readthedocs.io/en/latest/usage-parameterize.html)

In [3]:
# Define parameters for papermill.
# The values below are the defaults used when the notebook is run as-is;
# papermill can pass other values for these names.
# filepath: path to the Excel file to process.
filepath = 'data/imports/geographical_references/geographical_references.intercommunalites_1.xlsx'
# model_name: name of the source model, looked up in the datasources configuration.
model_name = "geographical_references.intercommunalites"


In [4]:
# Shared state used by the cells below
dataframes = dict()
artifacts = list()

# Timezone-aware UTC timestamp marking the start of this run
start_time = datetime.datetime.now(datetime.timezone.utc)

# Read the datasources configuration and pick the model named by the notebook parameter
config = load_config("datasources.yaml", response_model=DataSourceModel)
model = config.get_model(model_name=model_name)

# File handler used for file loads and dumps
handler = FileHandler()

## Traitement des données
À partir de là, on charge le fichier Excel dans Pandas et on traite les feuilles à récupérer, une par une.

In [5]:
# Read the workbook at `filepath`; the result is a dict of pandas dataframes
xlsx_reader = XlsxReader(filepath)
dataframes = xlsx_reader.load(model=model)

2025-08-11 10:18:59,857 - DEBUG :: file_handler.py :: load (58) :: loading: data/imports/geographical_references/geographical_references.intercommunalites_1.xlsx


In [11]:
# Show the keys of the dataframes loaded from the workbook
loaded_names = dataframes.keys()
print(f'Dataframes names : {loaded_names}')

Dataframes names : dict_keys(['0'])


## Sauvegarde des métadonnées
On sauvegarde les métadonnées du processus localement, pour garder l'historique et pouvoir reprendre après erreur si besoin

In [7]:
# Print each artifact in its JSON-compatible form
for artifact in artifacts:
    print(artifact.model_dump(mode="json"))

# Arguments describing this preprocess run, passed to the file handler's metadata dump
metadata_args = {
    "model": model,
    "operation": OperationType.PREPROCESS,
    "start_time": start_time,
    "complete": True,
    "errors": 0,
    "artifacts": artifacts,
    "pages": [],
}
preprocess_metadata = handler.dump_metadata(**metadata_args)

2025-08-11 10:34:24,032 - DEBUG :: file_handler.py :: dump (162) :: dumping: data/imports/geographical_references/geographical_references.intercommunalites_metadata_preprocess.json
2025-08-11 10:34:24,041 - DEBUG :: file_handler.py :: file_dump (305) :: geographical_references.intercommunalites -> results saved to : 'data/imports/geographical_references/geographical_references.intercommunalites_metadata_preprocess.json'
2025-08-11 10:34:24,043 - DEBUG :: file_handler.py :: dump_metadata (473) :: Metadata written in: 'data/imports/geographical_references/geographical_references.intercommunalites_metadata_preprocess.json'


## Chargement en couche Bronze
On crée un engine SQLAlchemy pour charger tous les datasets en base

In [8]:
from dotenv import dotenv_values
import sqlalchemy
from sqlalchemy import text

# prepare db client
# Connection settings are read through dotenv_values()
vals = dotenv_values()

# Build the URL from its components rather than with string formatting, so that
# special characters in the credentials (e.g. '@', ':' or '/' in the password)
# are escaped instead of corrupting the connection URL.
pg_port = vals["PG_DB_PORT"]
db_url = sqlalchemy.engine.URL.create(
    drivername="postgresql",
    username=vals["PG_DB_USER"],
    password=vals["PG_DB_PWD"],
    host=vals["PG_DB_HOST"],
    # URL.create expects an integer port; dotenv values are strings (or None)
    port=int(pg_port) if pg_port else None,
    database=vals["PG_DB_NAME"],
)

# Kept as a plain string for backward compatibility (password included, escaped)
conn_str = db_url.render_as_string(hide_password=False)

dbengine = sqlalchemy.create_engine(db_url)

In [17]:
def sanitize_columns(df: pd.DataFrame, sanitizer=None) -> pd.DataFrame:
    """
    Return a copy of ``df`` with sanitized, deduplicated column names.

    Each column name is passed through ``sanitizer``. Columns are kept in
    their original order; a column is left out of the result when:

    - the sanitizer raises for its name (invalid column), or
    - its sanitized name was already produced by an earlier column
      (only the first occurrence is kept).

    The input dataframe is never modified.

    Args:
        df: dataframe whose columns must be sanitized.
        sanitizer: optional callable taking an original column name and
            returning the sanitized name. Defaults to the name computed by
            ``Column(name=...)``.

    Returns:
        A new dataframe holding only the kept columns, renamed to their
        sanitized names.
    """

    if sanitizer is None:
        def sanitizer(raw_name):
            return Column(name=raw_name).name

    seen = set()
    kept_positions = []
    kept_names = []
    deduped_columns_map = {}

    # Work on positions rather than labels, so that a column is dropped
    # individually even if the same original label appears more than once.
    for position, original_col in enumerate(df.columns):
        try:
            sanitized = sanitizer(original_col)
        except Exception as e:
            # Best effort: any failure to sanitize a name excludes that column
            # instead of aborting the whole load.
            print(f"Skipping invalid column '{original_col}': {e}")
            continue

        if sanitized in seen:
            print(f"Duplicate column after sanitation: '{sanitized}' (original: '{original_col}'). It will be dropped before loading.")
            continue

        seen.add(sanitized)
        kept_positions.append(position)
        kept_names.append(sanitized)
        deduped_columns_map[original_col] = sanitized

    print(f"Final deduplicated column map: {deduped_columns_map}")

    processed_df = df.iloc[:, kept_positions].copy()
    processed_df.columns = kept_names

    return processed_df

In [19]:
# insert all to bronze
# NOTE(review): only the sheet-name suffix is lowercased; model.table_name is used as configured.

for name, dataframe in dataframes.items():

    # sanitize the dataframe
    clean_dataframe = sanitize_columns(dataframe)

    print(f"DataFrame columns after deduplication: {list(clean_dataframe.columns)}")

    # The dataframe keyed "0" goes to the model's own table; any other one gets a suffixed table
    subtable_name = model.table_name if name == "0" else f"{model.table_name}_{name.lower()}"

    # Quote the identifier with the engine's dialect rules, so that the DROP targets
    # exactly the table that to_sql() (which also quotes names when needed) will create.
    quoted_table = dbengine.dialect.identifier_preparer.quote(subtable_name)
    query_str = f"DROP TABLE IF EXISTS bronze.{quoted_table} CASCADE"

    # dropping existing table with cascade
    # begin() commits when the block exits normally and rolls back on error
    with dbengine.begin() as con:
        print(f"Dropping if exists: {subtable_name}")
        con.execute(text(query_str))

    print(f"Inserting DataFrame {subtable_name}")
    clean_dataframe.to_sql(
        name=subtable_name,
        con=dbengine,
        schema='bronze',
        index=True,
        if_exists='replace',
    )


Final deduplicated column map: <built-in method items of dict object at 0x17a3d4740>
DataFrame columns after deduplication: ['departement', 'arrondissement_siege', 'commune_siege', 'n_siren', 'nom_du_groupement', 'nature_juridique', 'mode_de_financement', 'mode_de_repartition_des_sieges', 'autre_mode_de_repartition_des_sieges', 'date_de_creation', 'date_deffet', 'syndicat_a_la_carte', 'groupement_interdepartemental', 'zone_de_montagne', 'epage', 'eptb', 'numero_et_libelle_de_la_voie_du_siege', 'complement_dadresse_du_siege', 'distribution_speciale_du_siege', 'code_postal_du_siege', 'ville_du_siege', 'telephone_du_siege', 'courriel_du_siege', 'site_internet_du_siege', 'numero_et_libelle_de_la_voie_administrative', 'complement_dadresse_administrative', 'distribution_speciale_administrative', 'code_postal_administrative', 'ville_administrative', 'telephone_administrative', 'teom', 'reom', 'dotation_globale', 'dotation_de_compensation', 'dotation_dintercommunalite', 'dotation_des_groupemen