### Gold Layer - Full Load Script
- Purpose: Create business-ready data marts with applied business logic
- Layer: Gold (Business Data Marts)
- Load Type: Full

---
### Dependencies
---

In [1]:
import os
from datetime import datetime, timedelta
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window
import logging
from pyspark import StorageLevel

StatementMeta(, e87e3c5c-a33c-4f2f-bb9b-f4547d9be370, 3, Finished, Available, Finished)

---
### Parameters
---


In [2]:
# Run identifier: the "execution_date" environment variable when it is set,
# otherwise the current local time rendered as an ISO-8601 string.
_fallback_execution_date = datetime.now().isoformat()
execution_date = os.getenv("execution_date", _fallback_execution_date)

# Notebook-wide logger; INFO is requested on the root logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

StatementMeta(, e87e3c5c-a33c-4f2f-bb9b-f4547d9be370, 4, Finished, Available, Finished)

---
### Configuraciones de optimización
---

In [3]:
# Session-level settings applied before any Gold table is written:
# adaptive query execution (with partition coalescing) plus Delta write
# optimisation, auto-compaction and automatic schema merging.
# NOTE(review): the Delta keys live in the spark.databricks.* namespace while the
# cell output looks like a Fabric/Synapse runtime - confirm they are honoured here.
_optimization_settings = {
    "spark.sql.adaptive.enabled": "true",
    "spark.sql.adaptive.coalescePartitions.enabled": "true",
    "spark.databricks.delta.optimizeWrite.enabled": "true",
    "spark.databricks.delta.autoCompact.enabled": "true",
    "spark.databricks.delta.schema.autoMerge.enabled": "true",
}

for _conf_key, _conf_value in _optimization_settings.items():
    spark.conf.set(_conf_key, _conf_value)

StatementMeta(, e87e3c5c-a33c-4f2f-bb9b-f4547d9be370, 5, Finished, Available, Finished)

---
### Reglas de negocio
---
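14 Silver tables → 5 Gold semantic models (2 dimensions: `gold_dim_customer`, `gold_dim_product`; 3 fact tables: `gold_fact_sales`, `gold_fact_orders`, `gold_fact_budget`) + business views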

In [4]:
class GoldSimplifiedProcessor:
    """Builds the Gold-layer Delta tables from Silver tables.

    Each public ``create_gold_*`` method fully overwrites one Gold table and
    returns the number of rows that table holds after the write. Any exception
    is logged and re-raised so the caller decides how to react.
    """

    def __init__(self, spark_session, execution_date):
        """Keep the Spark session used for every read/write and the run's execution date."""
        self.spark = spark_session
        self.execution_date = execution_date

    # ------------------------------------------------------------------
    # Shared write path
    # ------------------------------------------------------------------
    def _build_and_save(self, table_name, build_df):
        """Build a DataFrame, overwrite ``table_name`` with it and return the row count.

        Args:
            table_name: Name of the Delta table to overwrite.
            build_df: Zero-argument callable returning the DataFrame to save.

        Returns:
            Number of rows in ``table_name`` after the write.

        Raises:
            Exception: whatever the build, write or count raised; it is logged first.
        """
        try:
            logger.info(f"Creando {table_name}")
            gold_df = build_df()
            gold_df.write.mode("overwrite").format("delta").saveAsTable(table_name)
            # Count the saved table instead of calling count() on the source plan:
            # counting the plan before writing would execute all the joins twice.
            record_count = self.spark.table(table_name).count()
            logger.info(f"{table_name}: {record_count:,} registros")
            return record_count
        except Exception as e:
            logger.error(f"Error creando {table_name}: {str(e)}")
            raise

    # ------------------------------------------------------------------
    # gold_dim_customer
    # ------------------------------------------------------------------
    def _build_dim_customer(self):
        """Return customers left-joined to their region and to both manager records.

        Regions are matched on ``station``; employees are matched by name, once
        restricted to the "Account Manager" role and once to "Key Account Manager".
        """
        customers_df = self.spark.table("silver_dim_customers")
        regions_df = self.spark.table("silver_dim_regions")
        employees_df = self.spark.table("silver_dim_employees")

        account_managers = employees_df.filter(col("role") == "Account Manager")
        key_account_managers = employees_df.filter(col("role") == "Key Account Manager")

        # NOTE(review): these left joins duplicate customer rows if `station` or
        # `employee_name` (within a role) is not unique in the lookup tables -
        # confirm uniqueness upstream.
        return (
            customers_df.alias("c")
            .join(regions_df.alias("r"), col("c.station") == col("r.station"), "left")
            .join(
                account_managers.alias("am"),
                col("c.account_manager") == col("am.employee_name"),
                "left",
            )
            .join(
                key_account_managers.alias("kam"),
                col("c.key_account_manager") == col("kam.employee_name"),
                "left",
            )
            .select(
                # Keys
                col("c.customer_key"),
                col("c.customer_sold_to_name"),
                col("c.account_name"),
                col("c.key_account_name"),

                # Segmentation attributes
                col("c.transaction_type"),
                col("c.account_type"),

                # Geography (taken from the region row, so null when no region matched)
                col("r.system").alias("customer_system"),
                col("r.interplanetary_region"),
                col("r.territory"),
                col("r.station"),
                col("r.tax_rate"),

                # Sales team
                col("c.account_manager"),
                col("c.key_account_manager"),
                col("am.employee_email").alias("account_manager_email"),
                col("kam.employee_email").alias("key_account_manager_email"),
            )
        )

    def create_gold_dim_customer(self):
        """Overwrite ``gold_dim_customer`` and return its row count."""
        return self._build_and_save("gold_dim_customer", self._build_dim_customer)

    # ------------------------------------------------------------------
    # gold_dim_product
    # ------------------------------------------------------------------
    def _build_dim_product(self):
        """Return products left-joined to their brand hierarchy on the sub-brand name."""
        products_df = self.spark.table("silver_dim_products")
        brands_df = self.spark.table("silver_dim_brands")

        return (
            products_df.alias("p")
            .join(brands_df.alias("b"), col("p.sub_brand_name") == col("b.sub_brand"), "left")
            .select(
                # Keys
                col("p.product_key"),
                col("p.product_name"),
                col("p.type").alias("product_type"),
                col("p.subtype").alias("product_subtype"),

                # Physical attributes
                col("p.ship_class_for_part"),
                col("p.weight_tonnes"),
                col("p.color"),
                col("p.material"),

                # Brand hierarchy
                col("b.flagship").alias("brand_flagship"),
                col("b.class").alias("brand_class"),
                col("b.brand"),
                col("p.sub_brand_name"),

                # Owners
                col("p.product_business_line_leader"),
                col("b.product_brand_vp"),
            )
        )

    def create_gold_dim_product(self):
        """Overwrite ``gold_dim_product`` and return its row count."""
        return self._build_and_save("gold_dim_product", self._build_dim_product)

    # ------------------------------------------------------------------
    # gold_fact_sales
    # ------------------------------------------------------------------
    def _build_fact_sales(self):
        """Return invoice lines with amounts multiplied by the budget rate and a document category."""
        invoices_df = self.spark.table("silver_fact_invoices")
        budget_rate_df = self.spark.table("silver_dim_budget_rate")
        doc_types_df = self.spark.table("silver_dim_invoice_doctype")

        # Conversion factor shared by every monetary column. A currency with no
        # matching rate row falls back to 1.0, i.e. the amount is left unconverted.
        # NOTE(review): confirm that fallback is intended for non-EUR currencies,
        # and that `from_currency` is unique in silver_dim_budget_rate (otherwise
        # the join multiplies invoice lines).
        rate = coalesce(col("br.rate"), lit(1.0))

        return (
            invoices_df.alias("i")
            .join(budget_rate_df.alias("br"), col("i.local_currency") == col("br.from_currency"), "left")
            .join(
                doc_types_df.alias("dt"),
                col("i.billing_document_type_code") == col("dt.billing_document_type_code"),
                "left",
            )
            .select(
                # Existing keys
                col("i.customer_key"),
                col("i.product_key"),

                # Dates as DATE
                to_date(col("i.billing_date")).alias("billing_date"),
                to_date(col("i.ship_date")).alias("ship_date"),

                # Identifiers
                col("i.billing_document_number").alias("invoice_number"),
                col("i.billing_document_line_item_number").alias("line_item"),

                # Amounts in EUR (single conversion)
                (col("i.net_invoice_value") * rate).alias("sales_eur"),
                (col("i.net_invoice_cogs") * rate).alias("cogs_eur"),
                (col("i.delivery_cost") * rate).alias("delivery_cost_eur"),
                (col("i.freight") * rate).alias("freight_eur"),
                (col("i.taxes_commercial_fees") * rate).alias("taxes_eur"),

                # Quantity
                col("i.net_invoice_quantity").alias("quantity"),

                # Simple indicator
                col("i.otd_indicator").cast("boolean").alias("on_time_delivery"),

                # Document categorisation: "Invoice" is relabelled "Sale", a missing
                # group becomes "Unclassified", any other group passes through as is.
                when(col("dt.group_col") == "Invoice", lit("Sale"))
                .when(col("dt.group_col") == "Adjustment", lit("Adjustment"))
                .when(col("dt.group_col").isNull(), lit("Unclassified"))
                .otherwise(col("dt.group_col")).alias("document_category"),

                # Traceability metadata
                col("i.billing_document_type_code").alias("source_doc_type_code"),
                col("dt.text").alias("document_type_description"),
            )
        )

    def create_gold_fact_sales(self):
        """Overwrite ``gold_fact_sales`` and return its row count."""
        return self._build_and_save("gold_fact_sales", self._build_fact_sales)

    # ------------------------------------------------------------------
    # gold_fact_orders
    # ------------------------------------------------------------------
    def _build_fact_orders(self):
        """Return order lines with the order value multiplied by the budget rate."""
        orders_df = self.spark.table("silver_fact_orders")
        budget_rate_df = self.spark.table("silver_dim_budget_rate")

        # Same fallback as for sales: no matching rate row means a factor of 1.0.
        # NOTE(review): confirm `from_currency` is unique in silver_dim_budget_rate.
        rate = coalesce(col("br.rate"), lit(1.0))

        return (
            orders_df.alias("o")
            .join(budget_rate_df.alias("br"), col("o.local_currency") == col("br.from_currency"), "left")
            .select(
                # Existing keys
                col("o.customer_key"),
                col("o.product_key"),

                # Dates as DATE
                to_date(col("o.order_date")).alias("order_date"),
                to_date(col("o.ship_date")).alias("ship_date"),
                to_date(col("o.request_goods_receipt_date")).alias("requested_date"),

                # Identifiers
                col("o.sales_order_document_number").alias("order_number"),
                col("o.sales_order_document_line_item_number").alias("line_item"),

                # Amounts in EUR
                (col("o.net_order_value") * rate).alias("order_value_eur"),
                col("o.net_order_quantity").alias("quantity"),

                # Status
                col("o.sales_order_document_line_item_status").alias("order_status"),
            )
        )

    def create_gold_fact_orders(self):
        """Overwrite ``gold_fact_orders`` and return its row count."""
        return self._build_and_save("gold_fact_orders", self._build_fact_orders)

    # ------------------------------------------------------------------
    # gold_fact_budget
    # ------------------------------------------------------------------
    def _build_fact_budget(self):
        """Return budget rows keyed by customer/product/month, dropping rows whose month is null."""
        budget_df = self.spark.table("silver_fact_budget")

        return (
            budget_df.select(
                # Existing keys
                col("customer_key"),
                col("product_key"),

                # Date as DATE
                to_date(col("month")).alias("budget_month"),

                # Amount in EUR (already in EUR in Silver, so no conversion here)
                col("total_budget").alias("budget_eur"),
            )
            # Rows whose `month` is null or not convertible to a date are discarded.
            .filter(col("budget_month").isNotNull())
        )

    def create_gold_fact_budget(self):
        """Overwrite ``gold_fact_budget`` and return its row count."""
        return self._build_and_save("gold_fact_budget", self._build_fact_budget)

StatementMeta(, e87e3c5c-a33c-4f2f-bb9b-f4547d9be370, 11, Finished, Available, Finished)

In [5]:
# Single processor instance shared by every table-creation step below.
gold_processor = GoldSimplifiedProcessor(
    spark_session=spark,
    execution_date=execution_date,
)

StatementMeta(, e87e3c5c-a33c-4f2f-bb9b-f4547d9be370, 12, Finished, Available, Finished)

---
### Creación de data de negocio
---

In [6]:
# Run every Gold builder in order. A failure is recorded and printed, and the
# remaining tables are still attempted (best-effort run).
results = []
total_records = 0

tables_to_create = [
    ('gold_dim_customer', gold_processor.create_gold_dim_customer),
    ('gold_dim_product', gold_processor.create_gold_dim_product),
    ('gold_fact_sales', gold_processor.create_gold_fact_sales),
    ('gold_fact_orders', gold_processor.create_gold_fact_orders),
    ('gold_fact_budget', gold_processor.create_gold_fact_budget)
]
# Denominator for the summary, derived from the list so it cannot drift.
expected_tables = len(tables_to_create)

for table_name, create_function in tables_to_create:
    try:
        record_count = create_function()
        # Update the total before recording success: if the count is unusable the
        # table ends up with a single 'failed' entry rather than success + failed.
        total_records += record_count
        results.append({'table': table_name, 'records': record_count, 'status': 'success'})
        print(f"✓ {table_name}: {record_count:,} registros")
    except Exception as e:
        results.append({'table': table_name, 'records': 0, 'status': 'failed', 'error': str(e)})
        print(f"✗ {table_name}: FAILED - {str(e)}")

successful = [r for r in results if r['status'] == 'success']
failed = [r for r in results if r['status'] == 'failed']

print("\n" + "=" * 50)
print("RESUMEN:")
print(f"✓ Exitosas: {len(successful)}/{expected_tables}")
print(f"✗ Fallidas: {len(failed)}/{expected_tables}")
print(f"Total registros: {total_records:,}")

print("\n" + "=" * 50)

StatementMeta(, e87e3c5c-a33c-4f2f-bb9b-f4547d9be370, 13, Finished, Available, Finished)

INFO:__main__:Creando gold_dim_customer


✓ gold_dim_customer: 3,911 registros


✓ gold_dim_product: 256,293 registros


✓ gold_fact_sales: 17,037,850 registros


✓ gold_fact_orders: 14,766,489 registros


✓ gold_fact_budget: 276,151 registros

RESUMEN:
✓ Exitosas: 5/5
✗ Fallidas: 0/5
Total registros: 32,340,694



---
### Logs del proceso
---

In [7]:
# Append one audit row describing this run to `gold_execution_log`.
# Counts use len([...]) on purpose: `from pyspark.sql.functions import *`
# shadows builtins such as `sum`, so those must not be used in this notebook.
successful_count = len([r for r in results if r["status"] == "success"])
failed_count = len([r for r in results if r["status"] == "failed"])
dimension_count = len([r for r in results if r["table"].startswith("gold_dim_")])
fact_count = len([r for r in results if r["table"].startswith("gold_fact_")])

execution_log_data = [(
    execution_date,
    # NOTE(review): the name still says "7tables" although this notebook builds
    # fewer; kept unchanged because earlier log rows may be filtered by it -
    # confirm before renaming.
    "gold_full_load_semantic_7tables",
    datetime.now(),
    "completed" if failed_count == 0 else "completed_with_errors",
    "gold",
    "full",
    total_records,
    successful_count,
    failed_count,
    # Derived from `results` so the description matches what was actually attempted.
    f"Semantic consolidation: Silver → {len(results)} Gold models "
    f"({dimension_count} dimensions + {fact_count} fact tables)"
)]

# Explicit schema so column types do not depend on inference from a single row.
execution_log_schema = StructType([
    StructField("execution_id", StringType(), True),
    StructField("pipeline_name", StringType(), True),
    StructField("execution_timestamp", TimestampType(), True),
    StructField("status", StringType(), True),
    StructField("layer", StringType(), True),
    StructField("load_type", StringType(), True),
    StructField("total_records", LongType(), True),
    StructField("successful_tables", IntegerType(), True),
    StructField("failed_tables", IntegerType(), True),
    StructField("details", StringType(), True)
])

execution_log = spark.createDataFrame(execution_log_data, execution_log_schema)
execution_log.write.format("delta").mode("append").option("mergeSchema", "true").saveAsTable("gold_execution_log")

StatementMeta(, e87e3c5c-a33c-4f2f-bb9b-f4547d9be370, 14, Finished, Available, Finished)

---
### Resumen del proceso
---

In [8]:
# Final report for the run, followed by compaction of every table that was written.
successful_loads = len([r for r in results if r["status"] == "success"])
failed_loads = len([r for r in results if r["status"] == "failed"])
# One entry is recorded per attempted table, so this is the number of Gold tables
# in scope for the run (previously hard-coded as 5).
attempted_tables = len(results)

print("=" * 60)
print("GOLD OPTIMIZED LAYER SUMMARY:")
print(f"Successful tables: {successful_loads}/{attempted_tables}")
print(f"Failed tables: {failed_loads}/{attempted_tables}")
print(f"Total records in Gold layer: {total_records:,}")
print(f"Optimization: 14+ Silver tables → {attempted_tables} Gold clean tables")
print(f"Execution Date: {execution_date}")
print("=" * 60)

if successful_loads > 0:
    print("\nSuccessfully created Gold tables:")
    for result in results:
        if result["status"] == "success":
            print(f"  {result['table']}: {result['records']:,} records")

# NOTE(review): the notebook still finishes normally when some tables failed;
# a scheduler has to read `gold_execution_log` (or this output) to notice.

print("\nOptimizing Gold tables...")
for result in results:
    if result["status"] == "success":
        try:
            # Table names come from the fixed list defined in this notebook, so
            # interpolating them into the statement is safe.
            spark.sql(f"OPTIMIZE {result['table']}")
            print(f"Optimized: {result['table']}")
        except Exception as e:
            # Compaction is best-effort: report the problem and continue.
            print(f"Error optimizing {result['table']}: {e}")

StatementMeta(, e87e3c5c-a33c-4f2f-bb9b-f4547d9be370, 18, Finished, Available, Finished)

GOLD OPTIMIZED LAYER SUMMARY:
Successful tables: 5/5
Failed tables: 0/5
Total records in Gold layer: 32,340,694
Optimization: 14+ Silver tables → 5 Gold clean tables
Execution Date: 2025-09-20T00:24:59.861540

Successfully created Gold tables:
  gold_dim_customer: 3,911 records
  gold_dim_product: 256,293 records
  gold_fact_sales: 17,037,850 records
  gold_fact_orders: 14,766,489 records
  gold_fact_budget: 276,151 records

Optimizing Gold tables...
Optimized: gold_dim_customer


Optimized: gold_dim_product
Optimized: gold_fact_sales
Optimized: gold_fact_orders
Optimized: gold_fact_budget
