### Gold Layer - Incremental Load Script
- Purpose: Create business-ready data marts with applied business logic
- Layer: Gold (Business Data Marts)
- Load Type: Incremental

---
### Dependencies
---

In [1]:
import os
from datetime import datetime, timedelta
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window
import logging
from pyspark import StorageLevel

StatementMeta(, 875e6559-5b8c-407a-a96f-41cffaa6a4fd, 3, Finished, Available, Finished)

---
### Parameters
---

In [2]:
# Run parameters, each overridable through an environment variable of the same name.

# Run identifier; defaults to the current local time in ISO-8601 form.
execution_date = os.getenv("execution_date", datetime.now().isoformat())

# Size of the look-back window in days (7 when the variable is unset).
lookback_days = int(os.getenv("lookback_days", "7"))

# Only the text "true" (any letter case) switches the full refresh on.
force_full_refresh = os.getenv("force_full_refresh", "false").lower() == "true"

StatementMeta(, 875e6559-5b8c-407a-a96f-41cffaa6a4fd, 4, Finished, Available, Finished)

---
### Configuraciones de optimización
---

In [3]:
# Session-level flags, all switched to "true" in the order listed.
for _conf_key in (
    "spark.sql.adaptive.enabled",
    "spark.sql.adaptive.coalescePartitions.enabled",
    "spark.databricks.delta.optimizeWrite.enabled",
    "spark.databricks.delta.autoCompact.enabled",
    # The MERGE statements below feed extra audit columns into existing
    # tables; presumably this flag is what lets those columns be added — confirm.
    "spark.databricks.delta.schema.autoMerge.enabled",
    "spark.databricks.delta.merge.enableLowShuffle",
    "spark.sql.adaptive.skewJoin.enabled",
):
    spark.conf.set(_conf_key, "true")

StatementMeta(, 875e6559-5b8c-407a-a96f-41cffaa6a4fd, 5, Finished, Available, Finished)

---
### Fecha de corte
---

In [4]:
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Cut-off date for the fact loads: `lookback_days` days before the current
# local time, formatted as YYYY-MM-DD.
# NOTE(review): derived from datetime.now(), not from `execution_date`, so a
# re-run with an explicit execution_date still uses today's window — confirm.
_cutoff_moment = datetime.now() - timedelta(days=lookback_days)
watermark_date = _cutoff_moment.strftime('%Y-%m-%d')

logger.info("Incremental processing parameters:")
for _label, _value in (
    ("Execution Date", execution_date),
    ("Lookback Days", lookback_days),
    ("Watermark Date", watermark_date),
    ("Force Full Refresh", force_full_refresh),
):
    logger.info(f"  {_label}: {_value}")

StatementMeta(, 875e6559-5b8c-407a-a96f-41cffaa6a4fd, 6, Finished, Available, Finished)

INFO:__main__:Incremental processing parameters:
INFO:__main__:  Execution Date: 2025-09-19T04:39:32.525736
INFO:__main__:  Lookback Days: 7
INFO:__main__:  Watermark Date: 2025-09-12
INFO:__main__:  Force Full Refresh: False


---
### Reglas de negocio
---
14 Silver tables → 5 Gold semantic models + business views

In [5]:
class IncrementalUtils:
    """Stateless helpers for the incremental Gold load.

    Every method runs SQL through the notebook-level ``spark`` session and
    reports through the notebook-level ``logger``.
    """

    # Watermark returned when no earlier successful execution can be determined.
    DEFAULT_WATERMARK = '1900-01-01 00:00:00'

    @staticmethod
    def get_last_execution_watermark(table_name):
        """Return the timestamp of the newest successful logged run for a table.

        Args:
            table_name: Text that must appear inside ``pipeline_name`` of the
                ``gold_execution_log`` rows that are considered.

        Returns:
            The newest matching ``execution_timestamp`` formatted as
            ``'YYYY-MM-DD HH:MM:SS'``, or ``DEFAULT_WATERMARK`` when no row
            matches or the lookup fails.
        """
        # NOTE(review): the logging cell of this notebook writes pipeline_name
        # "gold_incremental_load_semantic" with status "completed" or
        # "completed_with_errors". Unless some other job logs rows with status
        # 'success' and the table name in pipeline_name, this filter matches
        # nothing and every run falls back to DEFAULT_WATERMARK — confirm.
        try:
            last_execution = spark.sql(f"""
                SELECT MAX(execution_timestamp) as last_execution
                FROM gold_execution_log 
                WHERE pipeline_name LIKE '%{table_name}%' 
                AND status = 'success'
            """).collect()[0]['last_execution']

            if last_execution:
                return last_execution.strftime('%Y-%m-%d %H:%M:%S')
            return IncrementalUtils.DEFAULT_WATERMARK
        except Exception as e:
            # Best effort: a missing log table (first run) must not stop the
            # load, but the reason is logged instead of silently discarded.
            # Catching Exception (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate.
            logger.warning(
                f"Could not read last execution watermark for {table_name}, "
                f"using {IncrementalUtils.DEFAULT_WATERMARK}: {e}"
            )
            return IncrementalUtils.DEFAULT_WATERMARK

    @staticmethod
    def table_exists(table_name):
        """Report whether ``DESCRIBE TABLE`` succeeds for ``table_name``.

        Returns:
            True when the statement runs, False when it raises any Exception.
        """
        try:
            spark.sql(f"DESCRIBE TABLE {table_name}")
            return True
        except Exception as e:
            # NOTE(review): any error counts as "missing", and callers react
            # with a full overwrite load — confirm that is acceptable for
            # transient catalog errors.
            logger.debug(f"DESCRIBE TABLE {table_name} failed, treating table as missing: {e}")
            return False

    @staticmethod
    def get_changed_keys(source_table, watermark_column, watermark_value, key_column):
        """Select the distinct keys whose watermark column is at or after a value.

        Args:
            source_table: Table to read.
            watermark_column: Column compared against ``watermark_value``.
            watermark_value: Lower bound (inclusive), embedded as a SQL string literal.
            key_column: Column whose distinct values are returned.

        Returns:
            A DataFrame with the single column ``key_column``, or None when
            building the query raised an Exception (a warning is logged).
        """
        try:
            changed_df = spark.sql(f"""
                SELECT DISTINCT {key_column}
                FROM {source_table}
                WHERE {watermark_column} >= '{watermark_value}'
            """)
            return changed_df
        except Exception as e:
            logger.warning(f"No se pudieron obtener las claves modificadas para {source_table}: {e}")
            return None

class GoldIncrementalProcessor:
    def __init__(self, spark_session):
        self.spark = spark_session
        self.utils = IncrementalUtils()
        
    def merge_gold_dim_customer_incremental(self):
        try:
            logger.info("Procesando gold_dim_customer incremental")
            if not self.utils.table_exists("gold_dim_customer") or force_full_refresh:
                logger.info("La tabla destino no existe o se forzó el refresh, ejecutando carga completa")
                return self._create_gold_dim_customer_full()
            
            last_execution = self.utils.get_last_execution_watermark("gold_dim_customer")
            changed_customers = self.utils.get_changed_keys(
                "silver_dim_customers", "silver_created_date", last_execution, "customer_key"
            )
            
            if changed_customers is None or changed_customers.count() == 0:
                logger.info("No se detectaron cambios en la dimensión de clientes")
                return 0
            
            customers_df = self.spark.table("silver_dim_customers")
            regions_df = self.spark.table("silver_dim_regions")
            employees_df = self.spark.table("silver_dim_employees")
            incremental_customer = customers_df.alias("c") \
                .join(changed_customers.alias("ch"), col("c.customer_key") == col("ch.customer_key"), "inner") \
                .join(regions_df.alias("r"), col("c.station") == col("r.station"), "left") \
                .join(employees_df.filter(col("role") == "Account Manager").alias("am"), 
                      col("c.account_manager") == col("am.employee_name"), "left") \
                .join(employees_df.filter(col("role") == "Key Account Manager").alias("kam"), 
                      col("c.key_account_manager") == col("kam.employee_name"), "left") \
                .select(
                    col("c.customer_key"),
                    col("c.customer_sold_to_name"),
                    col("c.account_name"),
                    col("c.key_account_name"),

                    # Atributos para segmentación
                    col("c.transaction_type"),
                    col("c.account_type"),

                    # Geografía
                    col("r.system").alias("customer_system"),
                    col("r.interplanetary_region"),
                    col("r.territory"),
                    col("r.station"),
                    col("r.tax_rate"),

                    # Equipo de ventas
                    col("c.account_manager"),
                    col("c.key_account_manager"),
                    col("am.employee_email").alias("account_manager_email"),
                    col("kam.employee_email").alias("key_account_manager_email"),
                    
                    current_timestamp().alias("gold_created_date"),
                    lit(execution_date).alias("gold_execution_id")
                )
            
            incremental_customer.createOrReplaceTempView("temp_incremental_customer")
            
            self.spark.sql(f"""
                MERGE INTO gold_dim_customer AS target
                USING temp_incremental_customer AS source
                ON target.customer_key = source.customer_key
                WHEN MATCHED THEN UPDATE SET *
                WHEN NOT MATCHED THEN INSERT *
            """)
            
            record_count = incremental_customer.count()
            logger.info(f"INCREMENTAL gold_dim_customer: {record_count:,} registros procesados")
            return record_count
            
        except Exception as e:
            logger.error(f"Error en merge incremental gold_dim_customer: {str(e)}")
            raise e
    
    def merge_gold_dim_product_incremental(self):
        try:
            logger.info("Procesando gold_dim_product incremental")
            
            if not self.utils.table_exists("gold_dim_product") or force_full_refresh:
                logger.info("La tabla destino no existe o se forzó el refresh, ejecutando carga completa")
                return self._create_gold_dim_product_full()
            
            last_execution = self.utils.get_last_execution_watermark("gold_dim_product")
            
            changed_products = self.utils.get_changed_keys(
                "silver_dim_products", "silver_created_date", last_execution, "product_key"
            )
            
            if changed_products is None or changed_products.count() == 0:
                logger.info("No se detectaron cambios en la dimensión de productos")
                return 0
            
            products_df = self.spark.table("silver_dim_products")
            brands_df = self.spark.table("silver_dim_brands")
            
            incremental_product = products_df.alias("p") \
                .join(changed_products.alias("ch"), col("p.product_key") == col("ch.product_key"), "inner") \
                .join(brands_df.alias("b"), col("p.sub_brand_name") == col("b.sub_brand"), "left") \
                .select(
                    # Llaves
                    col("p.product_key"),
                    col("p.product_name"),
                    col("p.type").alias("product_type"),
                    col("p.subtype").alias("product_subtype"),

                    # Atributos físicos
                    col("p.ship_class_for_part"),
                    col("p.weight_tonnes"),
                    col("p.color"),
                    col("p.material"),

                    # Jerarquía de marca
                    col("b.flagship").alias("brand_flagship"),
                    col("b.class").alias("brand_class"),
                    col("b.brand"),
                    col("p.sub_brand_name"),

                    # Responsables
                    col("p.product_business_line_leader"),
                    col("b.product_brand_vp"),
                    
                    current_timestamp().alias("gold_created_date"),
                    lit(execution_date).alias("gold_execution_id")
                )
            
            incremental_product.createOrReplaceTempView("temp_incremental_product")
            
            self.spark.sql(f"""
                MERGE INTO gold_dim_product AS target
                USING temp_incremental_product AS source
                ON target.product_key = source.product_key
                WHEN MATCHED THEN UPDATE SET *
                WHEN NOT MATCHED THEN INSERT *
            """)
            
            record_count = incremental_product.count()
            logger.info(f"INCREMENTAL gold_dim_product: {record_count:,} registros procesados")
            return record_count
            
        except Exception as e:
            logger.error(f"Error en merge incremental gold_dim_product: {str(e)}")
            raise e

    def merge_gold_fact_sales_incremental(self):
        try:
            logger.info("Procesando gold_fact_sales incremental")
            
            if not self.utils.table_exists("gold_fact_sales") or force_full_refresh:
                logger.info("La tabla destino no existe o se forzó el refresh, ejecutando carga completa")
                return self._create_gold_fact_sales_full()
            
            # Obtener datos de facturas recientes/modificadas
            invoices_df = self.spark.table("silver_fact_invoices")
            budget_rate_df = self.spark.table("silver_dim_budget_rate")
            doc_types_df = self.spark.table("silver_dim_invoice_doctype")
            
            # Filtrar solo facturas recientes
            recent_invoices = invoices_df.filter(
                col("silver_created_date") >= watermark_date
            )
            
            if recent_invoices.count() == 0:
                logger.info("No hay facturas recientes para procesar")
                return 0
            
            # Aplicar la misma lógica SIMPLIFICADA del full load
            incremental_sales = recent_invoices.alias("i") \
                .join(budget_rate_df.alias("br"), 
                      col("i.local_currency") == col("br.from_currency"), "left") \
                .join(doc_types_df.alias("dt"), 
                      col("i.billing_document_type_code") == col("dt.billing_document_type_code"), "left") \
                .select(
                    # Llaves existentes
                    col("i.customer_key"),
                    col("i.product_key"),
                    
                    # Fechas como DATE (simplificado)
                    to_date(col("i.billing_date")).alias("billing_date"),
                    to_date(col("i.ship_date")).alias("ship_date"),

                    # Identificadores
                    col("i.billing_document_number").alias("invoice_number"),
                    col("i.billing_document_line_item_number").alias("line_item"),

                    # Valores en EUR (conversión única, simplificado)
                    (col("i.net_invoice_value") * coalesce(col("br.rate"), lit(1.0))).alias("sales_eur"),
                    (col("i.net_invoice_cogs") * coalesce(col("br.rate"), lit(1.0))).alias("cogs_eur"),
                    (col("i.delivery_cost") * coalesce(col("br.rate"), lit(1.0))).alias("delivery_cost_eur"),
                    (col("i.freight") * coalesce(col("br.rate"), lit(1.0))).alias("freight_eur"),
                    (col("i.taxes_commercial_fees") * coalesce(col("br.rate"), lit(1.0))).alias("taxes_eur"),

                    # Cantidad
                    col("i.net_invoice_quantity").alias("quantity"),

                    # Indicador simple
                    col("i.otd_indicator").cast("boolean").alias("on_time_delivery"),
                    
                    # Categorización de documentos (simplificado)
                    when(col("dt.group_col") == "Invoice", lit("Sale"))
                    .when(col("dt.group_col") == "Adjustment", lit("Adjustment"))
                    .when(col("dt.group_col").isNull(), lit("Unclassified"))
                    .otherwise(col("dt.group_col")).alias("document_category"),
                    
                    # Metadatos para trazabilidad
                    col("i.billing_document_type_code").alias("source_doc_type_code"),
                    col("dt.text").alias("document_type_description"),
                    
                    current_timestamp().alias("gold_created_date"),
                    lit(execution_date).alias("gold_execution_id")
                )
            
            incremental_sales.createOrReplaceTempView("temp_incremental_sales")
            
            # MERGE
            self.spark.sql(f"""
                MERGE INTO gold_fact_sales AS target
                USING temp_incremental_sales AS source
                ON target.customer_key = source.customer_key 
                   AND target.product_key = source.product_key 
                   AND target.billing_date = source.billing_date
                   AND target.invoice_number = source.invoice_number
                   AND target.line_item = source.line_item
                WHEN MATCHED THEN UPDATE SET *
                WHEN NOT MATCHED THEN INSERT *
            """)
            
            record_count = incremental_sales.count()
            logger.info(f"INCREMENTAL gold_fact_sales: {record_count:,} registros procesados")
            return record_count
            
        except Exception as e:
            logger.error(f"Error en merge incremental gold_fact_sales: {str(e)}")
            raise e

    def merge_gold_fact_orders_incremental(self):
        try:
            logger.info("Procesando gold_fact_orders incremental")
            
            if not self.utils.table_exists("gold_fact_orders") or force_full_refresh:
                logger.info("La tabla destino no existe o se forzó el refresh, ejecutando carga completa")
                return self._create_gold_fact_orders_full()
            
            # Obtener datos de órdenes recientes/modificadas
            orders_df = self.spark.table("silver_fact_orders")
            budget_rate_df = self.spark.table("silver_dim_budget_rate")
            
            # Filtrar solo órdenes recientes
            recent_orders = orders_df.filter(
                col("silver_created_date") >= watermark_date
            )
            
            if recent_orders.count() == 0:
                logger.info("No hay órdenes recientes para procesar")
                return 0
            
            # Aplicar la misma lógica del full load
            incremental_orders = recent_orders.alias("o") \
                .join(budget_rate_df.alias("br"), col("o.local_currency") == col("br.from_currency"), "left") \
                .select(
                    # Llaves existentes
                    col("o.customer_key"),
                    col("o.product_key"),

                    # Fechas como DATE
                    to_date(col("o.order_date")).alias("order_date"),
                    to_date(col("o.ship_date")).alias("ship_date"),
                    to_date(col("o.request_goods_receipt_date")).alias("requested_date"),

                    # Identificadores
                    col("o.sales_order_document_number").alias("order_number"),
                    col("o.sales_order_document_line_item_number").alias("line_item"),

                    # Valores en EUR
                    (col("o.net_order_value") * coalesce(col("br.rate"), lit(1.0))).alias("order_value_eur"),
                    col("o.net_order_quantity").alias("quantity"),

                    # Status
                    col("o.sales_order_document_line_item_status").alias("order_status"),
                    
                    current_timestamp().alias("gold_created_date"),
                    lit(execution_date).alias("gold_execution_id")
                )
            
            incremental_orders.createOrReplaceTempView("temp_incremental_orders")
            
            # MERGE 2
            self.spark.sql(f"""
                MERGE INTO gold_fact_orders AS target
                USING temp_incremental_orders AS source
                ON target.customer_key = source.customer_key 
                   AND target.product_key = source.product_key 
                   AND target.order_date = source.order_date
                   AND target.order_number = source.order_number
                   AND target.line_item = source.line_item
                WHEN MATCHED THEN UPDATE SET *
                WHEN NOT MATCHED THEN INSERT *
            """)
            
            record_count = incremental_orders.count()
            logger.info(f"INCREMENTAL gold_fact_orders: {record_count:,} registros procesados")
            return record_count
            
        except Exception as e:
            logger.error(f"Error en merge incremental gold_fact_orders: {str(e)}")
            raise e

    def merge_gold_fact_budget_incremental(self):
        try:
            logger.info("Procesando gold_fact_budget incremental")
            
            if not self.utils.table_exists("gold_fact_budget") or force_full_refresh:
                logger.info("La tabla destino no existe o se forzó el refresh, ejecutando carga completa")
                return self._create_gold_fact_budget_full()
            
            # Obtener datos de presupuesto recientes/modificados
            budget_df = self.spark.table("silver_fact_budget")
            
            # Filtrar solo presupuestos recientes
            recent_budget = budget_df.filter(
                col("silver_created_date") >= watermark_date
            )
            
            if recent_budget.count() == 0:
                logger.info("No hay datos de presupuesto recientes para procesar")
                return 0
            
            # Aplicar la misma lógica del full load
            incremental_budget = recent_budget.select(
                # Llaves existentes
                col("customer_key"),
                col("product_key"),

                # Fecha como DATE
                to_date(col("month")).alias("budget_month"),

                # Valor en EUR (ya viene en EUR desde silver)
                col("total_budget").alias("budget_eur"),
                
                current_timestamp().alias("gold_created_date"),
                lit(execution_date).alias("gold_execution_id")
            ).filter(col("budget_month").isNotNull())
            
            incremental_budget.createOrReplaceTempView("temp_incremental_budget")
            
            # MERGE3
            self.spark.sql(f"""
                MERGE INTO gold_fact_budget AS target
                USING temp_incremental_budget AS source
                ON target.customer_key = source.customer_key 
                   AND target.product_key = source.product_key 
                   AND target.budget_month = source.budget_month
                WHEN MATCHED THEN UPDATE SET *
                WHEN NOT MATCHED THEN INSERT *
            """)
            
            record_count = incremental_budget.count()
            logger.info(f"INCREMENTAL gold_fact_budget: {record_count:,} registros procesados")
            return record_count
            
        except Exception as e:
            logger.error(f"Error en merge incremental gold_fact_budget: {str(e)}")
            raise e

    # Logs
        logger.info("Ejecutando carga completa para gold_dim_customer")
        
        customers_df = self.spark.table("silver_dim_customers")
        regions_df = self.spark.table("silver_dim_regions")
        employees_df = self.spark.table("silver_dim_employees")

        consolidated_customer = (
            customers_df.alias("c")
            .join(regions_df.alias("r"), col("c.station") == col("r.station"), "left")
            .join(
                employees_df.filter(col("role") == "Account Manager").alias("am"),
                col("c.account_manager") == col("am.employee_name"), "left"
            )
            .join(
                employees_df.filter(col("role") == "Key Account Manager").alias("kam"),
                col("c.key_account_manager") == col("kam.employee_name"), "left"
            )
            .select(
                # Llaves
                col("c.customer_key"),
                col("c.customer_sold_to_name"),
                col("c.account_name"),
                col("c.key_account_name"),

                # Atributos para segmentación
                col("c.transaction_type"),
                col("c.account_type"),

                # Geografía
                col("r.system").alias("customer_system"),
                col("r.interplanetary_region"),
                col("r.territory"),
                col("r.station"),
                col("r.tax_rate"),

                # Equipo de ventas
                col("c.account_manager"),
                col("c.key_account_manager"),
                col("am.employee_email").alias("account_manager_email"),
                col("kam.employee_email").alias("key_account_manager_email")
            )
        )

        record_count = consolidated_customer.count()
        consolidated_customer.write.mode("overwrite").format("delta").saveAsTable("gold_dim_customer")
        return record_count
    
    def _create_gold_dim_product_full(self):
        logger.info("Ejecutando carga completa para gold_dim_product")
        
        products_df = self.spark.table("silver_dim_products")
        brands_df = self.spark.table("silver_dim_brands")

        consolidated_product = (
            products_df.alias("p")
            .join(brands_df.alias("b"), col("p.sub_brand_name") == col("b.sub_brand"), "left")
            .select(
                # Llaves
                col("p.product_key"),
                col("p.product_name"),
                col("p.type").alias("product_type"),
                col("p.subtype").alias("product_subtype"),

                # Atributos físicos
                col("p.ship_class_for_part"),
                col("p.weight_tonnes"),
                col("p.color"),
                col("p.material"),

                # Jerarquía de marca
                col("b.flagship").alias("brand_flagship"),
                col("b.class").alias("brand_class"),
                col("b.brand"),
                col("p.sub_brand_name"),

                # Responsables
                col("p.product_business_line_leader"),
                col("b.product_brand_vp")
            )
        )

        record_count = consolidated_product.count()
        consolidated_product.write.mode("overwrite").format("delta").saveAsTable("gold_dim_product")
        return record_count
    
    def _create_gold_fact_sales_full(self):
        logger.info("Ejecutando carga completa para gold_fact_sales")
        
        invoices_df = self.spark.table("silver_fact_invoices")
        budget_rate_df = self.spark.table("silver_dim_budget_rate")
        doc_types_df = self.spark.table("silver_dim_invoice_doctype")

        gold_fact_sales = (
            invoices_df.alias("i")
            .join(budget_rate_df.alias("br"), col("i.local_currency") == col("br.from_currency"), "left")
            .join(doc_types_df.alias("dt"), col("i.billing_document_type_code") == col("dt.billing_document_type_code"), "left")
            .select(
                # Llaves existentes
                col("i.customer_key"),
                col("i.product_key"),
                
                # Fechas como DATE
                to_date(col("i.billing_date")).alias("billing_date"),
                to_date(col("i.ship_date")).alias("ship_date"),

                # Identificadores
                col("i.billing_document_number").alias("invoice_number"),
                col("i.billing_document_line_item_number").alias("line_item"),

                # Valores en EUR (conversión única)
                (col("i.net_invoice_value") * coalesce(col("br.rate"), lit(1.0))).alias("sales_eur"),
                (col("i.net_invoice_cogs") * coalesce(col("br.rate"), lit(1.0))).alias("cogs_eur"),
                (col("i.delivery_cost") * coalesce(col("br.rate"), lit(1.0))).alias("delivery_cost_eur"),
                (col("i.freight") * coalesce(col("br.rate"), lit(1.0))).alias("freight_eur"),
                (col("i.taxes_commercial_fees") * coalesce(col("br.rate"), lit(1.0))).alias("taxes_eur"),

                # Cantidad
                col("i.net_invoice_quantity").alias("quantity"),

                # Indicador simple
                col("i.otd_indicator").cast("boolean").alias("on_time_delivery"),
                
                # Categorización de documentos
                when(col("dt.group_col") == "Invoice", lit("Sale"))
                .when(col("dt.group_col") == "Adjustment", lit("Adjustment"))
                .when(col("dt.group_col").isNull(), lit("Unclassified"))
                .otherwise(col("dt.group_col")).alias("document_category"),
                
                # Metadatos para trazabilidad
                col("i.billing_document_type_code").alias("source_doc_type_code"),
                col("dt.text").alias("document_type_description")
            )
        )

        record_count = gold_fact_sales.count()
        gold_fact_sales.write.mode("overwrite").format("delta").saveAsTable("gold_fact_sales")
        return record_count
    
    def _create_gold_fact_orders_full(self):
        logger.info("Ejecutando carga completa para gold_fact_orders")
        
        orders_df = self.spark.table("silver_fact_orders")
        budget_rate_df = self.spark.table("silver_dim_budget_rate")

        gold_fact_orders = (
            orders_df.alias("o")
            .join(budget_rate_df.alias("br"), col("o.local_currency") == col("br.from_currency"), "left")
            .select(
                # Llaves existentes
                col("o.customer_key"),
                col("o.product_key"),

                # Fechas como DATE
                to_date(col("o.order_date")).alias("order_date"),
                to_date(col("o.ship_date")).alias("ship_date"),
                to_date(col("o.request_goods_receipt_date")).alias("requested_date"),

                # Identificadores
                col("o.sales_order_document_number").alias("order_number"),
                col("o.sales_order_document_line_item_number").alias("line_item"),

                # Valores en EUR
                (col("o.net_order_value") * coalesce(col("br.rate"), lit(1.0))).alias("order_value_eur"),
                col("o.net_order_quantity").alias("quantity"),

                # Status
                col("o.sales_order_document_line_item_status").alias("order_status")
            )
        )

        record_count = gold_fact_orders.count()
        gold_fact_orders.write.mode("overwrite").format("delta").saveAsTable("gold_fact_orders")
        return record_count
    
    def _create_gold_fact_budget_full(self):
        logger.info("Ejecutando carga completa para gold_fact_budget")
        
        budget_df = self.spark.table("silver_fact_budget")

        gold_fact_budget = (
            budget_df.select(
                col("customer_key"),
                col("product_key"),
                to_date(col("month")).alias("budget_month"),
                col("total_budget").alias("budget_eur")
            )
            .filter(col("budget_month").isNotNull())
        )

        record_count = gold_fact_budget.count()
        gold_fact_budget.write.mode("overwrite").format("delta").saveAsTable("gold_fact_budget")
        return record_count

StatementMeta(, 875e6559-5b8c-407a-a96f-41cffaa6a4fd, 7, Finished, Available, Finished)

In [6]:
# Single processor instance, bound to the notebook's Spark session, that the
# load cells below call into.
gold_processor = GoldIncrementalProcessor(spark)

StatementMeta(, 875e6559-5b8c-407a-a96f-41cffaa6a4fd, 8, Finished, Available, Finished)

---
### Creación de data de negocio
---

In [7]:
results = []
total_records = 0

# (target table, processor method) pairs: dimensions first, then facts.
# NOTE(review): merge_gold_dim_date_incremental,
# merge_gold_fact_performance_incremental and refresh_business_semantic_views
# are not defined on GoldIncrementalProcessor in this notebook, so on a fresh
# kernel those steps are recorded as failed. merge_gold_fact_orders_incremental
# and merge_gold_fact_budget_incremental are defined but never invoked —
# confirm which set of loads is intended.
_load_plan = (
    ('gold_dim_customer', 'merge_gold_dim_customer_incremental'),
    ('gold_dim_product', 'merge_gold_dim_product_incremental'),
    ('gold_dim_date', 'merge_gold_dim_date_incremental'),
    ('gold_fact_sales', 'merge_gold_fact_sales_incremental'),
    ('gold_fact_performance', 'merge_gold_fact_performance_incremental'),
)

for _table, _method in _load_plan:
    # Each load is isolated: a failure is recorded and the next table still runs.
    try:
        record_count = getattr(gold_processor, _method)()
        results.append({'table_name': _table, 'record_count': record_count, 'status': 'success'})
        total_records += record_count
    except Exception as e:
        results.append({'table_name': _table, 'record_count': 0, 'status': 'failed', 'error': str(e)})
        print(f"{_table}: FAILED - {str(e)}")

# Refresh the business views; a failure here is reported but not added to `results`.
try:
    gold_processor.refresh_business_semantic_views()
    print("Business semantic views refreshed")
except Exception as e:
    print(f"Business views refresh failed: {str(e)}")

print("-" * 60)

StatementMeta(, 875e6559-5b8c-407a-a96f-41cffaa6a4fd, 9, Finished, Available, Finished)

INFO:__main__:Procesando gold_dim_customer incremental
INFO:__main__:INCREMENTAL gold_dim_customer: 3,911 registros procesados
INFO:__main__:Procesando gold_dim_product incremental
INFO:__main__:INCREMENTAL gold_dim_product: 256,293 registros procesados
INFO:__main__:Procesando gold_dim_date incremental
INFO:__main__:No hay nuevas fechas para procesar
INFO:__main__:Procesando gold_fact_sales incremental
INFO:__main__:No hay facturas recientes para procesar
INFO:__main__:Procesando gold_fact_performance incremental


Business semantic views refreshed
------------------------------------------------------------


---
### Logs del proceso
---

In [8]:
# Outcome tallies for this run.
_succeeded = [r for r in results if r["status"] == "success"]
_failed = [r for r in results if r["status"] == "failed"]
if all(r["status"] == "success" for r in results):
    _run_status = "completed"
else:
    _run_status = "completed_with_errors"

# NOTE(review): IncrementalUtils.get_last_execution_watermark looks for rows
# with status 'success' whose pipeline_name contains the table name; the row
# written here uses neither, so it is never picked up by that query — confirm.
execution_log_data = [(
    execution_date,
    "gold_incremental_load_semantic",
    datetime.now(),
    _run_status,
    "gold",
    "incremental",
    total_records,
    len(_succeeded),
    len(_failed),
    f"Incremental processing: {lookback_days} days lookback, watermark: {watermark_date}"
)]

# One nullable field per tuple position above, in the same order.
execution_log_schema = StructType([
    StructField(_field_name, _field_type, True)
    for _field_name, _field_type in (
        ("execution_id", StringType()),
        ("pipeline_name", StringType()),
        ("execution_timestamp", TimestampType()),
        ("status", StringType()),
        ("layer", StringType()),
        ("load_type", StringType()),
        ("total_records", LongType()),
        ("successful_tables", IntegerType()),
        ("failed_tables", IntegerType()),
        ("details", StringType()),
    )
])

execution_log = spark.createDataFrame(execution_log_data, execution_log_schema)
(
    execution_log.write
    .format("delta")
    .mode("append")
    .option("mergeSchema", "true")
    .saveAsTable("gold_execution_log")
)

StatementMeta(, 875e6559-5b8c-407a-a96f-41cffaa6a4fd, 10, Finished, Available, Finished)

---
### Resumen del proceso
---

In [9]:
successful_loads = len([r for r in results if r["status"] == "success"])
failed_loads = len([r for r in results if r["status"] == "failed"])
# Denominator comes from the recorded results instead of a fixed 5, so the
# summary stays correct when loads are added to or removed from the run.
attempted_loads = len(results)

print("=" * 60)
print("GOLD INCREMENTAL LAYER SUMMARY:")
print(f"Successful incremental loads: {successful_loads}/{attempted_loads}")
print(f"Failed incremental loads: {failed_loads}/{attempted_loads}")
print(f"Total records processed: {total_records:,}")
print(f"Processing window: {lookback_days} days (desde {watermark_date})")
print(f"Execution Date: {execution_date}")
print("=" * 60)

if successful_loads > 0:
    print("\nSuccessfully processed Gold tables:")
    for result in results:
        if result["status"] == "success":
            print(f"  {result['table_name']}: {result['record_count']:,} records")

# Only tables that succeeded and received rows in this run are optimized;
# an OPTIMIZE failure is reported and does not stop the remaining tables.
print("\nOptimizing processed Gold tables...")
for result in results:
    if result["status"] == "success" and result['record_count'] > 0:
        try:
            spark.sql(f"OPTIMIZE {result['table_name']}")
            print(f"Optimized: {result['table_name']}")
        except Exception as e:
            print(f"Error optimizing {result['table_name']}: {e}")

StatementMeta(, 875e6559-5b8c-407a-a96f-41cffaa6a4fd, 11, Finished, Available, Finished)

GOLD INCREMENTAL LAYER SUMMARY:
Successful incremental loads: 5/5
Failed incremental loads: 0/5
Total records processed: 260,204
Processing window: 7 days (desde 2025-09-12)
Execution Date: 2025-09-19T04:39:32.525736

Successfully processed Gold tables:
  gold_dim_customer: 3,911 records
  gold_dim_product: 256,293 records
  gold_dim_date: 0 records
  gold_fact_sales: 0 records
  gold_fact_performance: 0 records

Optimizing processed Gold tables...
Optimized: gold_dim_customer
Optimized: gold_dim_product
