In [0]:
import pyspark.sql.functions as F

In [0]:
def normalize_column_value(
    df,
    column_name,
    old_value1,
    new_value1,
    old_value2,
    new_value2,
    default_value="n/a"
):
    """
    Standardize the values of a categorical column.

    The column is trimmed and upper-cased before it is compared against the
    two lookup values. String lookup values are normalized the same way
    (surrounding whitespace stripped, upper-cased), so ``"m"``, ``" M "`` and
    ``"M"`` all behave identically as ``old_value1`` / ``old_value2``.

    Args:
        df: DataFrame containing ``column_name``.
        column_name: Name of the column to overwrite with standardized values.
        old_value1: First raw value to look for (case-insensitive if a string).
        new_value1: Replacement written where ``old_value1`` matches.
        old_value2: Second raw value to look for (case-insensitive if a string).
        new_value2: Replacement written where ``old_value2`` matches.
        default_value: Written where neither lookup value matches.

    Returns:
        A new DataFrame in which ``column_name`` holds the standardized value.
    """

    def _comparison_key(value):
        # The column side is always trimmed and upper-cased, so a string
        # lookup value must be normalized too or it can never match.
        # Non-string values (numbers, Column expressions) are left untouched.
        if isinstance(value, str):
            return value.strip().upper()
        return value

    normalized_col = F.upper(F.trim(F.col(column_name)))

    return df.withColumn(
        column_name,
        F.when(normalized_col == _comparison_key(old_value1), new_value1)
        .when(normalized_col == _comparison_key(old_value2), new_value2)
        .otherwise(default_value),
    )


In [0]:
def configure_adls_access(storage_account="salesdwh", *, scope_name="salesdwhscope"):
    """
    Configure Spark to access Azure Data Lake Gen2 using OAuth.

    The client id, tenant id and client secret are read from a Databricks
    secret scope (keys ``clientid``, ``tenantid``, ``clientsecret``) and
    written into the Spark session configuration for the given storage
    account's ``dfs.core.windows.net`` endpoint.

    Relies on ``dbutils`` and ``spark`` being available in the calling
    scope; neither is defined in this function.

    Args:
        storage_account: Name of the storage account to configure access for.
        scope_name: Secret scope holding the three credential keys. Defaults
            to the scope that was previously hard-coded, so existing callers
            are unaffected.

    Returns:
        None. The function only sets Spark configuration values.
    """

    client_id = dbutils.secrets.get(scope=scope_name, key="clientid")
    tenant_id = dbutils.secrets.get(scope=scope_name, key="tenantid")
    client_secret = dbutils.secrets.get(scope=scope_name, key="clientsecret")

    account_url = f"{storage_account}.dfs.core.windows.net"

    # Every setting is scoped to the account by suffixing the key with its
    # DFS host name; keeping them in one table avoids repeating that suffix.
    oauth_settings = {
        "fs.azure.account.auth.type": "OAuth",
        "fs.azure.account.oauth.provider.type":
            "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
        "fs.azure.account.oauth2.client.id": client_id,
        "fs.azure.account.oauth2.client.secret": client_secret,
        "fs.azure.account.oauth2.client.endpoint":
            f"https://login.microsoftonline.com/{tenant_id}/oauth2/token",
    }

    for key_prefix, value in oauth_settings.items():
        spark.conf.set(f"{key_prefix}.{account_url}", value)


In [0]:
from delta.tables import DeltaTable


def incremental_load(df, catalog_name, schema_name, table_name, merge_condition):
    """
    Upsert ``df`` into a Delta table, creating the table on the first load.

    - Table missing: ``df`` is written as a new Delta table.
    - Table present: ``df`` (aliased ``src``) is merged into the table
      (aliased ``tgt``) on ``merge_condition``; matched rows are updated
      with all source columns and unmatched rows are inserted.

    Returns a status message naming the fully qualified table.
    """

    full_table_name = f"{catalog_name}.{schema_name}.{table_name}"

    # First load: there is nothing to merge into yet.
    if not spark.catalog.tableExists(full_table_name):
        df.write.format("delta").saveAsTable(full_table_name)
        return f"Table created successfully: {full_table_name}"

    target = DeltaTable.forName(spark, full_table_name).alias("tgt")
    upsert = (
        target.merge(df.alias("src"), merge_condition)
        .whenMatchedUpdateAll()
        .whenNotMatchedInsertAll()
    )
    upsert.execute()

    return f"Incremental load completed for table: {full_table_name}"


In [0]:
def clean_date_column(df, column_name):
    """
    Replace a ``yyyyMMdd`` integer/string column with its parsed date.

    Rules:
    - Value equal to 0, or whose string form is not 8 characters long → NULL
    - Anything else → parsed with ``to_date`` using the ``yyyyMMdd`` pattern
    """

    raw_value = F.col(column_name)
    as_text = raw_value.cast("string")

    # A row is rejected when it is the 0 placeholder or has the wrong width.
    is_invalid = (raw_value == 0) | (F.length(as_text) != 8)
    parsed_date = F.to_date(as_text, "yyyyMMdd")

    cleaned = F.when(is_invalid, None).otherwise(parsed_date)
    return df.withColumn(column_name, cleaned)