In [None]:

# ==========================================
# PARAMETER CELL (Fabric Notebook)
# ==========================================
# These 3 variables must be marked as "Parameter" in Fabric.
# The literals below are example values only; the pipeline overrides them.

LandingPath = "Files/landing/customers/2025-12-01/churn.csv"  # example value, overridden by the pipeline
ExecDate = "2025-12-01"                                    # example value, overridden by the pipeline; parsed later as yyyy-MM-dd
Entity = "CUSTOMER"                                         # example value, overridden by the pipeline


In [None]:
# ==========================================
# IMPORTS
# ==========================================
from pyspark.sql.functions import (
    col,
    lit,
    upper,
    trim,
    to_date,
    current_timestamp
)
from pyspark.sql import functions as F


# ==========================================
# READ THE SOURCE FILE
# ==========================================
# Load the CSV located at LandingPath. The path must be resolvable from the
# Lakehouse attached to this notebook
# (e.g. "Files/landing/customers/2025-12-01/churn.csv").

# Reader options: the first line holds the column names, and Spark is asked
# to infer column types from the data.
csv_read_options = {"header": "true", "inferSchema": "true"}

df_raw = spark.read.format("csv").options(**csv_read_options).load(LandingPath)

# Preview the first 10 rows as read from the file.
display(df_raw.limit(10))


# ==========================================
# COLUMN NORMALISATION / RENAMING
# ==========================================
# Expected churn.csv header -> snake_case name used downstream.
# The mapping keeps the same order as the columns listed for the file.
column_renames = {
    "RowNumber": "row_number",
    "CustomerId": "customer_id",
    "Surname": "surname",
    "CreditScore": "credit_score",
    "Geography": "geography",
    "Gender": "gender",
    "Age": "age",
    "Tenure": "tenure",
    "Balance": "balance",
    "NumOfProducts": "num_products",
    "HasCrCard": "has_credit_card",
    "IsActiveMember": "is_active_member",
    "EstimatedSalary": "estimated_salary",
    "Exited": "exited",
}

# Start from the raw frame and apply each rename in turn.
df_norm = df_raw
for source_name, target_name in column_renames.items():
    df_norm = df_norm.withColumnRenamed(source_name, target_name)

# Preview the first 10 rows with the normalised column names.
display(df_norm.limit(10))


# ==========================================
# EXPLICIT COLUMN TYPING
# ==========================================
# Target Spark type for every numeric column. Text columns (surname,
# geography, gender) are not listed and are left as they are.
column_types = {
    "row_number": "int",
    "customer_id": "long",
    "credit_score": "int",
    "age": "int",
    "tenure": "int",
    "balance": "double",
    "num_products": "int",
    "has_credit_card": "int",
    "is_active_member": "int",
    "estimated_salary": "double",
    "exited": "int",
}

# Replace each listed column by its cast version, one column at a time.
df_typed = df_norm
for column_name, spark_type in column_types.items():
    df_typed = df_typed.withColumn(column_name, col(column_name).cast(spark_type))

# Preview the first 10 rows after casting.
display(df_typed.limit(10))


# ==========================================
# STRING NORMALISATION
# ==========================================
# Text columns that are trimmed and then converted to UPPERCASE.
text_columns = ("surname", "geography", "gender")

df_clean = df_typed
for text_column in text_columns:
    # trim() is applied first, upper() on its result.
    df_clean = df_clean.withColumn(text_column, upper(trim(col(text_column))))


# ==========================================
# TECHNICAL COLUMNS (BRONZE)
# ==========================================
# - source_file    : path of the originating file (LandingPath), for traceability
# - ingestion_date : execution date (ExecDate), parsed as yyyy-MM-dd
# - ingestion_ts   : technical timestamp taken when the query runs
# - entity         : entity type (value of Entity, e.g. CUSTOMER)

# Build the four column expressions up front so each one is named.
source_file_col = lit(LandingPath)
ingestion_date_col = to_date(lit(ExecDate), "yyyy-MM-dd")
ingestion_ts_col = current_timestamp()
entity_col = lit(Entity)

# Attach them to the cleaned frame in a fixed order.
df_bronze = df_clean.withColumn("source_file", source_file_col)
df_bronze = df_bronze.withColumn("ingestion_date", ingestion_date_col)
df_bronze = df_bronze.withColumn("ingestion_ts", ingestion_ts_col)
df_bronze = df_bronze.withColumn("entity", entity_col)

# Preview the first 10 rows including the technical columns.
display(df_bronze.limit(10))


# ==========================================
# WRITE TO THE BRONZE TABLE
# ==========================================
# IMPORTANT:
# - The notebook must be attached to the Lakehouse `lh_wm_core`.
# - The table is created (if needed) as a Delta table in that Lakehouse.
# - The write uses "append" mode: rows are added to whatever the table
#   already contains.
#   NOTE(review): re-running the notebook for the same file therefore looks
#   like it would append the same rows again — confirm the pipeline guards
#   against duplicate runs.

target_table = "bronze_customers_raw"

(
    df_bronze
        .write
        .mode("append")
        .format("delta")
        .saveAsTable(target_table)
)

# Use the `Entity` notebook parameter here. Python names are case-sensitive
# and no lowercase `entity` variable exists, so referencing it would raise
# NameError after the data has already been written and make a successful
# ingestion look like a failed run.
print(f"✅ Ingestion bronze terminée pour {Entity} dans la table {target_table}")
