## Bronze Layer - Full Load (Azure SQL Database)
- **Purpose**: Extract data from SpaceParts training database
- **Layer**: Bronze (Raw Data)
- **Load Type**: Full

---
### Parámetros
---

In [None]:
import os
from datetime import datetime

# Run identifier: taken from the `execution_date` environment variable when it
# is set, otherwise the current local time in ISO-8601 format.
execution_date = os.environ.get("execution_date", datetime.now().isoformat())

StatementMeta(, d2de978b-792d-4d10-88f6-15acbda097a9, 3, Finished, Available, Finished)

---
### Dependencias
---

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from datetime import datetime, timedelta
import logging
import pandas as pd
import re, unicodedata
from collections import defaultdict

StatementMeta(, d2de978b-792d-4d10-88f6-15acbda097a9, 4, Finished, Available, Finished)

---
### Configuraciones de optimización
---

In [None]:
# Session-level tuning: adaptive query execution, automatic coalescing of
# small partitions, and optimized Delta writes. All three are switched on.
for conf_key in (
    "spark.sql.adaptive.enabled",
    "spark.sql.adaptive.coalescePartitions.enabled",
    "spark.databricks.delta.optimizeWrite.enabled",
):
    spark.conf.set(conf_key, "true")

StatementMeta(, d2de978b-792d-4d10-88f6-15acbda097a9, 5, Finished, Available, Finished)

- `spark.sql.adaptive.enabled = true`:  
  Activa el *Adaptive Query Execution (AQE)*, que ajusta dinámicamente los planes de ejecución en función de los datos en tiempo de ejecución.

- `spark.sql.adaptive.coalescePartitions.enabled = true`:  
  Permite fusionar particiones pequeñas automáticamente para mejorar el rendimiento de las consultas.

- `spark.databricks.delta.optimizeWrite.enabled = true`:  
  Optimiza la escritura en tablas Delta Lake, reduciendo el número de archivos pequeños y mejorando la eficiencia.

---
### Configuraciones de los Logs
---

In [None]:
# Configure the root logger at INFO level and create the logger for this notebook.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

StatementMeta(, d2de978b-792d-4d10-88f6-15acbda097a9, 6, Finished, Available, Finished)

---
### Credenciales de conexión
---

In [None]:
class BronzeFullLoader:
    """Connection settings for the bronze full load from Azure SQL Database.

    Each setting is resolved in this order: explicit constructor argument,
    then environment variable, then (non-secret values only) the built-in
    default. The password deliberately has no built-in default so that the
    secret is never stored in the notebook source.

    Environment variables:
        BRONZE_SQL_SERVER, BRONZE_SQL_DATABASE, BRONZE_SQL_USERNAME,
        BRONZE_SQL_PASSWORD

    Args:
        spark_session: Spark session used by the read/write helpers.
        server: SQL Server host name (optional).
        database: Database name (optional).
        username: SQL login (optional).
        password: SQL password (optional when BRONZE_SQL_PASSWORD is set).

    Raises:
        ValueError: if no password is supplied by argument or environment.
    """

    # Non-secret defaults pointing at the training database.
    DEFAULT_SERVER = "te3-training-eu.database.windows.net"
    DEFAULT_DATABASE = "SpacePartsCoDW"
    DEFAULT_USERNAME = "dwreader@te3-training-eu"

    def __init__(self, spark_session, server=None, database=None, username=None, password=None):
        self.spark = spark_session
        self.server = server or os.environ.get("BRONZE_SQL_SERVER", self.DEFAULT_SERVER)
        self.database = database or os.environ.get("BRONZE_SQL_DATABASE", self.DEFAULT_DATABASE)
        self.username = username or os.environ.get("BRONZE_SQL_USERNAME", self.DEFAULT_USERNAME)
        self.password = password or os.environ.get("BRONZE_SQL_PASSWORD")
        if not self.password:
            # Fail early with an actionable message instead of a JDBC login error later.
            raise ValueError(
                "No SQL password configured: pass password=... or set the "
                "BRONZE_SQL_PASSWORD environment variable."
            )

    def get_jdbc_connection_properties(self):
        """Return the JDBC url, user, password and driver class as a dict."""
        jdbc_url = f"jdbc:sqlserver://{self.server}:1433;database={self.database};encrypt=true;trustServerCertificate=false;hostNameInCertificate=*.database.windows.net;loginTimeout=30;"

        return {
            "url": jdbc_url,
            "user": self.username,
            "password": self.password,
            "driver": "com.microsoft.sqlserver.jdbc.SQLServerDriver"
        }

StatementMeta(, d2de978b-792d-4d10-88f6-15acbda097a9, 7, Finished, Available, Finished)

In [None]:
# Loader instance built on the notebook's `spark` session object.
bronze_loader = BronzeFullLoader(spark)

StatementMeta(, d2de978b-792d-4d10-88f6-15acbda097a9, 8, Finished, Available, Finished)

---
### Test de la data extraída
---
Se listan únicamente 5 de las tablas disponibles para validar la conexión.

In [None]:
# Smoke test: read five base-table names from the source catalog to confirm
# that the JDBC connection works. Any failure is printed and re-raised.
try:
    connection_props = bronze_loader.get_jdbc_connection_properties()
    test_query = "(SELECT TOP 5 TABLE_SCHEMA, TABLE_NAME FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_TYPE='BASE TABLE') as test_query"

    # Build the reader step by step from the connection properties.
    test_reader = spark.read.format("jdbc")
    for option_name in ("url", "user", "password", "driver"):
        test_reader = test_reader.option(option_name, connection_props[option_name])
    test_df = test_reader.option("dbtable", test_query).load()

    print("Conección establecida:")
    test_df.show(truncate=False)

except Exception as e:
    print(f"Conección fallida: {str(e)}")
    raise

StatementMeta(, d2de978b-792d-4d10-88f6-15acbda097a9, 9, Finished, Available, Finished)

Conección establecida:


+------------+-------------+
|TABLE_SCHEMA|TABLE_NAME   |
+------------+-------------+
|dim         |Employees    |
|dim         |Brands       |
|dim         |Budget-Rate  |
|dim         |Customers    |
|dim         |Exchange-Rate|
+------------+-------------+



---
### Mapeo de las tablas origen
---

In [None]:

# Discover every base table in the `dim` and `fact` schemas of the source
# database. If the metadata query fails, fall back to a static list so the
# remaining cells still have a `tables_list` to iterate over.
try:
    tables_query = """(
        SELECT TABLE_SCHEMA, TABLE_NAME
        FROM INFORMATION_SCHEMA.TABLES
        WHERE TABLE_TYPE='BASE TABLE'
        AND TABLE_SCHEMA IN ('dim', 'fact')
    ) as tables_query"""

    connection_props = bronze_loader.get_jdbc_connection_properties()

    tables_df = spark.read \
        .format("jdbc") \
        .option("url", connection_props["url"]) \
        .option("dbtable", tables_query) \
        .option("user", connection_props["user"]) \
        .option("password", connection_props["password"]) \
        .option("driver", connection_props["driver"]) \
        .load()

    tables_df = tables_df.orderBy("TABLE_SCHEMA", "TABLE_NAME")
    tables_list = tables_df.collect()

    print(f"Se encontraron {len(tables_list)} tablas que se pueden extraer:")
    for row in tables_list:
        print(f"  - {row.TABLE_SCHEMA}.{row.TABLE_NAME}")

except Exception as e:
    print(f"Failed to get table list: {str(e)}")
    # Static fallback, kept in the same (schema, name) order the query returns.
    # It lists all 14 source tables; the previous fallback omitted the three
    # DocType/Status dimensions, which would then have been skipped silently.
    fallback_tables = [
        ("dim", "Brands"),
        ("dim", "Budget-Rate"),
        ("dim", "Customers"),
        ("dim", "Employees"),
        ("dim", "Exchange-Rate"),
        ("dim", "Invoice-DocType"),
        ("dim", "Order-DocType"),
        ("dim", "Order-Status"),
        ("dim", "Products"),
        ("dim", "Regions"),
        ("fact", "Budget"),
        ("fact", "Forecast"),
        ("fact", "Invoices"),
        ("fact", "Orders"),
    ]
    # Lightweight stand-ins exposing the same two attributes as the collected rows.
    tables_list = [
        type('obj', (object,), {'TABLE_SCHEMA': fb_schema, 'TABLE_NAME': fb_table})
        for fb_schema, fb_table in fallback_tables
    ]
    print(f"Using the static fallback list of {len(tables_list)} tables.")


StatementMeta(, d2de978b-792d-4d10-88f6-15acbda097a9, 10, Finished, Available, Finished)

Se encontraron 14 tablas que se pueden extraer:
  - dim.Brands
  - dim.Budget-Rate
  - dim.Customers
  - dim.Employees
  - dim.Exchange-Rate
  - dim.Invoice-DocType
  - dim.Order-DocType
  - dim.Order-Status
  - dim.Products
  - dim.Regions
  - fact.Budget
  - fact.Forecast
  - fact.Invoices
  - fact.Orders


In [None]:
# Accumulators for the extraction run: one result dict per table and the
# running record total. Both are reset again right before the load loop.
extraction_results = []
total_records = 0

StatementMeta(, d2de978b-792d-4d10-88f6-15acbda097a9, 11, Finished, Available, Finished)

---
### Estudio de la información a extraer
---

In [None]:
def split_table_name(table_name: str):
    """Split ``schema.table`` at the first dot into ``(schema, table)``.

    A name without a dot is returned with the schema ``"dbo"``.
    """
    schema, dot, table = table_name.partition(".")
    if not dot:
        return "dbo", table_name
    return schema, table

def bracket_ident(ident: str) -> str:
    """Wrap an identifier in square brackets, doubling any ``]`` inside it."""
    escaped = "]]".join(ident.split("]"))
    return f"[{escaped}]"

def sql_full_name(schema: str, table: str) -> str:
    """Return the bracket-quoted two-part name ``[schema].[table]``."""
    return ".".join(bracket_ident(part) for part in (schema, table))

def sql_object_id_literal(schema: str, table: str) -> str:
    """Return ``[schema].[table]`` prepared for use inside a quoted SQL string.

    Callers embed the result between single quotes (``OBJECT_ID(N'...')``),
    so any single quote in the name is doubled; otherwise such a name would
    terminate the string literal early.
    """
    bracketed = f"{bracket_ident(schema)}.{bracket_ident(table)}"
    return bracketed.replace("'", "''")

def _sanitize_sql(sql: str) -> str:
    s = sql.strip()
    if s.endswith(";"):
        s = s[:-1]
    s = re.sub(r"--.*?$", "", s, flags=re.MULTILINE)
    s = re.sub(r"[ \t]+", " ", s)
    if not s.endswith("\n"):
        s += "\n"
    return s

def _q(loader: BronzeFullLoader, sql: str) -> pd.DataFrame:
    """Run a SQL statement over JDBC and return the result as a pandas DataFrame.

    The statement is passed through ``_sanitize_sql`` first; connection
    settings come from ``loader.get_jdbc_connection_properties()``.
    """
    cleaned_sql = _sanitize_sql(sql)
    conn = loader.get_jdbc_connection_properties()

    reader = loader.spark.read.format("jdbc")
    jdbc_options = (
        ("url", conn["url"]),
        ("query", cleaned_sql),
        ("user", conn["user"]),
        ("password", conn["password"]),
        ("driver", conn["driver"]),
    )
    for option_name, option_value in jdbc_options:
        reader = reader.option(option_name, option_value)

    return reader.load().toPandas()

def profile_table(loader: BronzeFullLoader, table_name: str, sample_n: int = 5):
    """Display a quick profile of one source table.

    Shows, via ``display``: a one-row summary (table name, row count read
    from ``sys.partitions``, column count), the column metadata ordered by
    ``column_id``, and - when it could be read and is not empty - a sample of
    ``sample_n`` rows. A failing query is printed and replaced by a fallback
    frame, so one failure does not stop the profile.
    """
    schema, table = split_table_name(table_name)
    obj_literal = sql_object_id_literal(schema, table)
    full_name = sql_full_name(schema, table)

    def _try_query(sql, failure_label, fallback, postprocess=None):
        # Run one query; on any error report it and hand back the fallback frame.
        try:
            frame = _q(loader, sql)
            return frame if postprocess is None else postprocess(frame)
        except Exception as exc:
            print(f"{failure_label} para {full_name}: {exc}")
            return fallback

    def _by_column_id(frame):
        # Order column metadata by its position in the table when available.
        if not frame.empty and "column_id" in frame.columns:
            return frame.sort_values("column_id").reset_index(drop=True)
        return frame

    metrics_sql = f"""
        SELECT SUM(p.rows) AS row_count
        FROM sys.objects o
        JOIN sys.indexes i    ON i.object_id = o.object_id
        JOIN sys.partitions p ON p.object_id = i.object_id AND p.index_id = i.index_id
        WHERE o.object_id = OBJECT_ID(N'{obj_literal}')
          AND o.type = 'U'
          AND i.index_id IN (0,1)
    """

    columns_sql = f"""
        SELECT c.column_id,
               c.name AS column_name,
               t.name AS sql_type,
               c.max_length,
               c.precision,
               c.scale,
               c.is_nullable
        FROM sys.columns c
        JOIN sys.types t ON t.user_type_id = c.user_type_id
        WHERE c.object_id = OBJECT_ID(N'{obj_literal}')
    """

    sample_sql = f"SELECT TOP {int(sample_n)} * FROM {full_name}"

    metrics = _try_query(
        metrics_sql, "Error obteniendo métricas", pd.DataFrame([{"row_count": 0}])
    )
    columns = _try_query(
        columns_sql, "Error obteniendo columnas", pd.DataFrame(), postprocess=_by_column_id
    )
    sample = _try_query(sample_sql, "Muestra no disponible", pd.DataFrame())

    # A missing or NULL metric counts as zero rows.
    row_count = 0
    if not metrics.empty:
        raw_count = metrics.iloc[0]["row_count"]
        if pd.notna(raw_count):
            row_count = int(raw_count)

    summary = pd.DataFrame([{
        "table_name": f"{schema}.{table}",
        "row_count": row_count,
        "column_count": len(columns)
    }])
    display(summary)
    display(columns)
    if not sample.empty:
        display(sample)

def profile_many(loader: BronzeFullLoader, tables: list[str], sample_n: int = 5):
    """Profile each table in ``tables`` in turn, printing a header before each one."""
    for table_name in tables:
        header = f"\n--- {table_name} ---"
        print(header)
        profile_table(loader, table_name, sample_n=sample_n)


StatementMeta(, d2de978b-792d-4d10-88f6-15acbda097a9, 12, Finished, Available, Finished)

In [None]:
# Tables to profile. Despite its name, `dim_tables` holds both the dimension
# and the fact tables in this cell.
dim_tables = [
    *(f"dim.{name}" for name in (
        "Brands", "Budget-Rate", "Customers", "Employees", "Exchange-Rate",
        "Invoice-DocType", "Order-DocType", "Order-Status", "Products", "Regions",
    )),
    *(f"fact.{name}" for name in ("Budget", "Forecast", "Invoices", "Orders")),
]

# Separate loader instance used by the profiling helpers.
loader = BronzeFullLoader(spark)
profile_many(loader, dim_tables, sample_n=5)

StatementMeta(, d2de978b-792d-4d10-88f6-15acbda097a9, 13, Finished, Available, Finished)


--- dim.Brands ---


SynapseWidget(Synapse.DataFrame, 4d620510-a932-47f6-b7c5-1d03c29a221a)

SynapseWidget(Synapse.DataFrame, 9e1ab2a3-56a8-49b0-a3a3-30e72a8b1e30)

SynapseWidget(Synapse.DataFrame, e7bbf6ec-cf34-45e7-9942-033698bce1fb)


--- dim.Budget-Rate ---


SynapseWidget(Synapse.DataFrame, 6ebf3865-75fd-41af-a8ff-db08f91ac088)

SynapseWidget(Synapse.DataFrame, 2f73f8c3-497c-40fa-bc23-63dc48c36f73)

SynapseWidget(Synapse.DataFrame, 8c8f7ff8-cca7-4962-b594-f72de56789bf)


--- dim.Customers ---


SynapseWidget(Synapse.DataFrame, feaff6e6-bbb0-4d64-aeb7-4c38dae5dae1)

SynapseWidget(Synapse.DataFrame, a3adc76e-3b92-403d-8a74-799fb86b6ef8)

SynapseWidget(Synapse.DataFrame, 99165491-e232-4976-b577-427980855182)


--- dim.Employees ---


SynapseWidget(Synapse.DataFrame, c790fd6e-974f-47a8-8029-24d198147896)

SynapseWidget(Synapse.DataFrame, bac9cc3d-ad5e-4957-af14-a3e06f3c1888)

SynapseWidget(Synapse.DataFrame, b1fd10a2-bbb4-4b7a-a84e-234e9eaada5e)


--- dim.Exchange-Rate ---


SynapseWidget(Synapse.DataFrame, 4fe6aab2-9875-4856-938c-6e36ea31097a)

SynapseWidget(Synapse.DataFrame, 9202f56b-ed06-473e-8034-f945221ec86d)

SynapseWidget(Synapse.DataFrame, 8309a7ca-53bd-4379-8a60-4bfd834e0f75)


--- dim.Invoice-DocType ---


SynapseWidget(Synapse.DataFrame, 40b7d1f3-9014-4fbe-bce8-d013404411d8)

SynapseWidget(Synapse.DataFrame, a7efebac-b791-4a43-8f81-5bc052e2a96b)

SynapseWidget(Synapse.DataFrame, 2d63d914-19ba-4827-9fe4-2eec3077ae64)


--- dim.Order-DocType ---


SynapseWidget(Synapse.DataFrame, dba73741-b183-4a49-9a01-c41d9ff2fffe)

SynapseWidget(Synapse.DataFrame, f02d0c9f-eb6d-46d1-8355-62309308ab14)

SynapseWidget(Synapse.DataFrame, 8d20b581-2d77-4aea-973e-e1991fa3e928)


--- dim.Order-Status ---


SynapseWidget(Synapse.DataFrame, 4cc05dc7-ab69-482c-b101-9369e059f0be)

SynapseWidget(Synapse.DataFrame, 105dddf6-1a51-41de-b0d7-f2c70a4da751)

SynapseWidget(Synapse.DataFrame, 8a972e27-291f-4763-9c0e-275f8b4d6c45)


--- dim.Products ---


SynapseWidget(Synapse.DataFrame, 66a868e4-3133-46c4-830c-30c208a4f2b0)

SynapseWidget(Synapse.DataFrame, 89449639-b9e7-4f96-8ed7-38094ebf495f)

SynapseWidget(Synapse.DataFrame, f88f15a3-24f3-4563-a703-990b70d9aab0)


--- dim.Regions ---


SynapseWidget(Synapse.DataFrame, d43d2963-ecb5-4fc3-9256-621ca12787de)

SynapseWidget(Synapse.DataFrame, 84ff22f2-8f18-416d-b0e5-a27846dee7ba)

SynapseWidget(Synapse.DataFrame, fd78786f-6872-4af1-be21-014a41d283b2)


--- fact.Budget ---


SynapseWidget(Synapse.DataFrame, 6974bf50-d65e-4e37-9c8d-f0ee7eaab823)

SynapseWidget(Synapse.DataFrame, aef1885d-e361-4894-943b-fd4a8a5e7480)

SynapseWidget(Synapse.DataFrame, 7e13156c-ca72-4bd1-ba72-1f6c8dced759)


--- fact.Forecast ---


SynapseWidget(Synapse.DataFrame, cdfaf98c-4aa7-4a52-bf0f-e52e06e72b6d)

SynapseWidget(Synapse.DataFrame, 3757adbf-5515-49fd-888d-7e4f5f65956d)

SynapseWidget(Synapse.DataFrame, 920074cd-0820-4e65-9351-e2d4f37ed612)


--- fact.Invoices ---


SynapseWidget(Synapse.DataFrame, ad1514fb-ed2f-4438-8839-9527d405f298)

SynapseWidget(Synapse.DataFrame, ed47d639-f7b3-46e5-834b-188dba7229c8)

SynapseWidget(Synapse.DataFrame, 42fb92cc-f3c8-4a7c-a8ca-a33222971822)


--- fact.Orders ---


SynapseWidget(Synapse.DataFrame, 6f3f75ad-d9d1-4ce9-8a1f-d6b4fbfe3e0d)

SynapseWidget(Synapse.DataFrame, 193a65ab-3bd3-4ce9-8e5c-253acb7d967f)

SynapseWidget(Synapse.DataFrame, 1bca0f73-135f-4769-b749-14b9c117c73b)

---
### Nombre de las columnas con la tabla origen
---

In [None]:
def columns_panel(loader: BronzeFullLoader, tables: list[str]):
    """Collect the column names of the given source tables into one pandas DataFrame.

    Args:
        loader: provides the JDBC connection used by ``_q``.
        tables: table names in ``schema.table`` form.

    Returns:
        A DataFrame with the columns ``table_name`` and ``column_name``, one
        row per source column, ordered by table (input order) and by the
        column's position in the table. The two columns exist even when no
        column could be read.

    A table whose metadata query fails is reported with ``print`` and skipped.
    """
    results = []
    for t in tables:
        schema, table = split_table_name(t)
        obj_literal = sql_object_id_literal(schema, table)
        full_name = sql_full_name(schema, table)

        # N'...' keeps non-ASCII object names intact (same form profile_table uses).
        columns_sql = f"""
            SELECT c.column_id,
                   c.name AS column_name
            FROM sys.columns c
            WHERE c.object_id = OBJECT_ID(N'{obj_literal}')
        """

        try:
            cols = _q(loader, columns_sql)
            # Sort client-side: Spark runs the `query` option as a subquery,
            # where SQL Server does not accept a plain ORDER BY.
            ordered_names = cols.sort_values("column_id")["column_name"].tolist()
        except Exception as e:
            print(f"Error obteniendo columnas para {full_name}: {e}")
            continue

        results.extend(
            {"table_name": f"{schema}.{table}", "column_name": col}
            for col in ordered_names
        )

    # Explicit columns give downstream code a stable schema even with no rows.
    df = pd.DataFrame(results, columns=["table_name", "column_name"])
    display(df)
    return df

StatementMeta(, d2de978b-792d-4d10-88f6-15acbda097a9, 14, Finished, Available, Finished)

In [None]:
# Source tables grouped by schema; the names are combined into one list
# whose column names are then collected.
dim_tables = [
    f"dim.{name}"
    for name in (
        "Brands", "Budget-Rate", "Customers", "Employees", "Exchange-Rate",
        "Invoice-DocType", "Order-DocType", "Order-Status", "Products", "Regions",
    )
]

fact_tables = [f"fact.{name}" for name in ("Budget", "Forecast", "Invoices", "Orders")]

all_tables = [*dim_tables, *fact_tables]

columns_df = columns_panel(loader, all_tables)

StatementMeta(, d2de978b-792d-4d10-88f6-15acbda097a9, 15, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 1580f050-a62b-49b0-9ee3-ecdda42439ce)

---
### Normalización de nombres para cargue
---

In [None]:
# Regex matching runs of characters that clean_identifier replaces with "_":
# space, comma, semicolon, braces, parentheses, newline, tab and "=".
FORBIDDEN_CHARS = r"[ ,;{}\(\)\n\t=]+"
# Lower-case SQL keywords; clean_identifier appends "_col" to a cleaned name
# that is exactly one of these.
RESERVED = {
    "select","from","where","group","order","by","having","limit","offset",
    "and","or","not","as","on","join","inner","left","right","full","cross",
    "desc","asc","table","column","index","view","database","schema","create",
    "drop","alter","insert","update","delete","merge","into","values","set",
    "case","when","then","else","end","union","all","distinct","true","false",
    "null"
}

def strip_accents(text: str) -> str:
    """Return ``str(text)`` in NFKD form with all combining marks removed."""
    decomposed = unicodedata.normalize("NFKD", str(text))
    return "".join(filter(lambda ch: not unicodedata.combining(ch), decomposed))

def clean_identifier(name: str) -> str:
    """Turn an arbitrary source name into a lower-case ``[0-9a-z_]`` identifier.

    ``None`` and names that clean down to nothing become ``"col"``. A leading
    digit gets a ``c_`` prefix, a name equal to a reserved keyword gets a
    ``_col`` suffix, and the result is cut to 128 characters.
    """
    if name is None:
        return "col"

    ident = strip_accents(str(name).strip())
    ident = re.sub(FORBIDDEN_CHARS, "_", ident)
    # Remaining separators also become underscores.
    ident = ident.translate(str.maketrans({".": "_", "-": "_", "/": "_", "\\": "_"}))
    ident = re.sub(r"[^0-9a-zA-Z_]", "", ident)
    ident = re.sub(r"_+", "_", ident).strip("_").lower()

    # Only ASCII letters, digits and "_" are left at this point.
    if ident[:1].isdigit():
        ident = "c_" + ident
    if ident in RESERVED:
        ident = ident + "_col"
    if not ident:
        ident = "col"
    return ident[:128]

def split_schema_table(table_name: str):
    """Split ``schema.table`` at the first dot, trimming whitespace on both parts.

    The input is converted with ``str()`` first; a name without a dot gets
    the schema ``"dbo"``.
    """
    cleaned = str(table_name).strip()
    schema_part, dot, table_part = cleaned.partition(".")
    if dot:
        return schema_part.strip(), table_part.strip()
    return "dbo", cleaned

def clean_bronze_table_name(schema: str, table: str) -> str:
    """Build the bronze table name ``bronze_<schema>_<table>`` from cleaned parts."""
    cleaned_parts = [clean_identifier(part) for part in (schema, table)]
    return "bronze_" + "_".join(cleaned_parts)

def build_column_mapping_from_df(columns_df):
    """Build ``{"schema.table": {source_column: cleaned_column}}`` from a column listing.

    Args:
        columns_df: pandas DataFrame with one row per source column. It needs
            a table-name column and a column-name column; header matching
            ignores case and surrounding whitespace and accepts the aliases
            listed below.

    Returns:
        A ``defaultdict(dict)`` keyed by ``schema.table``. Cleaned names are
        unique within a table: a clash gets the suffix ``_2``, ``_3``, ...

    Raises:
        ValueError: if the table-name or column-name column cannot be found.
    """
    # rename() expects {current_name: new_name}. The earlier dict was built
    # the other way round, so the headers were never actually normalized.
    normalized_headers = {c: str(c).lower().strip() for c in columns_df.columns}
    df = columns_df.rename(columns=normalized_headers)

    table_col  = next((c for c in df.columns if c in ["table","table_name","tabla","table_name_full","tabla_origen"]), None)
    column_col = next((c for c in df.columns if c in ["column","column_name","columna","nombre_columna"]), None)
    if table_col is None or column_col is None:
        raise ValueError(f"Se requieren columnas 'table_name' y 'column_name' (o equivalentes). Encontradas: {list(df.columns)}")

    per_table_seen = defaultdict(set)
    mapping = defaultdict(dict)
    for raw_table, raw_column in zip(df[table_col], df[column_col]):
        schema, table = split_schema_table(raw_table)
        full = f"{schema}.{table}"
        old  = str(raw_column).strip()
        if old in mapping[full]:
            # Same source column listed twice: keep the first cleaned name
            # instead of replacing it with a suffixed one.
            continue
        new  = clean_identifier(old)

        # Make the cleaned name unique within its table.
        base = new; k = 1
        while new in per_table_seen[full]:
            k += 1
            new = f"{base}_{k}"
        per_table_seen[full].add(new)

        mapping[full][old] = new
    return mapping

def apply_column_mapping(df_spark, schema: str, table: str, mapping_by_table: dict):
    """Rename the columns of ``df_spark`` using the mapping for ``schema.table``.

    Args:
        df_spark: DataFrame whose columns should be renamed.
        schema, table: identify the entry ``"schema.table"`` in the mapping.
        mapping_by_table: ``{"schema.table": {old_name: new_name}}``.

    Returns:
        ``df_spark`` itself when there is no mapping for the table or nothing
        changes; otherwise a DataFrame with the renamed columns.

    All renames are applied in one step from the original column names.
    Renaming one column at a time could rename a column twice when a new
    name equals the name of another source column.
    """
    renames = mapping_by_table.get(f"{schema}.{table}")
    if not renames:
        return df_spark

    current_names = list(df_spark.columns)
    target_names = [renames.get(col, col) for col in current_names]
    if target_names == current_names:
        return df_spark
    return df_spark.toDF(*target_names)


StatementMeta(, d2de978b-792d-4d10-88f6-15acbda097a9, 16, Finished, Available, Finished)

---
### Extracción completa de la información
---

In [None]:
def _extract_table_full(self, schema: str, table: str):
    """Read every row of ``[schema].[table]`` from the source over JDBC.

    The table name goes through ``sql_full_name`` so that a ``]`` inside an
    identifier is escaped, matching the profiling helpers.

    Returns:
        A Spark DataFrame over ``SELECT *`` from the table, read with a JDBC
        fetch size of 10000.
    """
    props = self.get_jdbc_connection_properties()
    query = f"SELECT * FROM {sql_full_name(schema, table)}"

    return (
        self.spark.read.format("jdbc")
        .option("url", props["url"])
        .option("user", props["user"])
        .option("password", props["password"])
        .option("driver", props["driver"])
        .option("query", query)
        .option("fetchsize", "10000")
        .load()
    )

# Attached unconditionally: with a hasattr() guard, re-running this cell after
# editing the function would silently keep the previously attached version.
BronzeFullLoader.extract_table_full = _extract_table_full

def _save_bronze_table(self, df_spark, bronze_table_name: str) -> int:
    """Overwrite the Delta table ``bronze_table_name`` with ``df_spark``.

    Returns:
        The number of rows in the table after the write. The count is taken
        from the written table; counting ``df_spark`` again would re-run the
        JDBC read against the source.
    """
    df_spark.write.mode("overwrite").format("delta").saveAsTable(bronze_table_name)
    return self.spark.table(bronze_table_name).count()

# Attached unconditionally: with a hasattr() guard, re-running this cell after
# editing the function would silently keep the previously attached version.
BronzeFullLoader.save_bronze_table = _save_bronze_table

StatementMeta(, d2de978b-792d-4d10-88f6-15acbda097a9, 22, Finished, Available, Finished)

In [None]:
# Full load: extract each discovered table, rename its columns with the
# cleaned names and overwrite the matching bronze table. A failing table is
# recorded and reported, and the loop continues with the next one.
column_map_by_table = build_column_mapping_from_df(columns_df)

extraction_results = []
total_records = 0

for table_row in tables_list:
    schema, table = table_row.TABLE_SCHEMA, table_row.TABLE_NAME
    bronze_table_name = clean_bronze_table_name(schema, table)

    # Fields shared by the success and the failure record.
    result = {
        "source_table": f"{schema}.{table}",
        "bronze_table": bronze_table_name,
    }

    try:
        df_extracted = bronze_loader.extract_table_full(schema, table)
        df_clean = apply_column_mapping(df_extracted, schema, table, column_map_by_table)
        record_count = bronze_loader.save_bronze_table(df_clean, bronze_table_name)
    except Exception as e:
        result.update(record_count=0, status="failed", error=str(e))
        extraction_results.append(result)
        print(f"✗ {bronze_table_name}: FAILED - {str(e)}")
    else:
        result.update(record_count=record_count, status="success")
        extraction_results.append(result)
        total_records += record_count
        print(f" {bronze_table_name}: {record_count:,} records")

    print("-" * 40)

print("-" * 60)

StatementMeta(, d2de978b-792d-4d10-88f6-15acbda097a9, 23, Finished, Available, Finished)

 bronze_dim_brands: 20 records
----------------------------------------


 bronze_dim_budget_rate: 15 records
----------------------------------------


 bronze_dim_customers: 3,911 records
----------------------------------------


 bronze_dim_employees: 893 records
----------------------------------------


 bronze_dim_exchange_rate: 57,900 records
----------------------------------------


 bronze_dim_invoice_doctype: 5 records
----------------------------------------


 bronze_dim_order_doctype: 4 records
----------------------------------------


 bronze_dim_order_status: 6 records
----------------------------------------


 bronze_dim_products: 256,293 records
----------------------------------------


 bronze_dim_regions: 181 records
----------------------------------------


 bronze_fact_budget: 2,947,811 records
----------------------------------------


 bronze_fact_forecast: 5,197 records
----------------------------------------


 bronze_fact_invoices: 18,459,441 records
----------------------------------------


 bronze_fact_orders: 16,910,069 records
----------------------------------------
------------------------------------------------------------


---
### Logs del proceso
---

In [None]:
# Execution-log record for this run and the schema it must match.
# The tuple carries one value per schema field, in schema order. It previously
# had 10 values for the 12 fields (`initial_loads` and `incremental_loads`
# were missing), so it could not be turned into a DataFrame with this schema.
# NOTE(review): this cell only builds the record; nothing here writes it.
log_success_count = len([r for r in extraction_results if r["status"] == "success"])
log_failure_count = len([r for r in extraction_results if r["status"] == "failed"])

execution_log_data = [(
    execution_date,                      # execution_id
    "bronze_full_load_azure",            # pipeline_name
    datetime.now(),                      # execution_timestamp
    "completed" if all(r["status"] == "success" for r in extraction_results) else "completed_with_errors",
    "bronze",                            # layer
    "full",                              # load_type
    total_records,                       # total_records
    log_success_count,                   # successful_tables
    log_failure_count,                   # failed_tables
    # Assumption: in a full load every successfully loaded table counts as an
    # initial load and none as incremental - TODO confirm with the log consumer.
    log_success_count,                   # initial_loads
    0,                                   # incremental_loads
    str(extraction_results)[:1000]       # details, capped at 1000 characters
)]

execution_log_schema = StructType([
    StructField("execution_id", StringType(), True),
    StructField("pipeline_name", StringType(), True),
    StructField("execution_timestamp", TimestampType(), True),
    StructField("status", StringType(), True),
    StructField("layer", StringType(), True),
    StructField("load_type", StringType(), True),
    StructField("total_records", LongType(), True),
    StructField("successful_tables", IntegerType(), True),
    StructField("failed_tables", IntegerType(), True),
    StructField("initial_loads", IntegerType(), True),
    StructField("incremental_loads", IntegerType(), True),
    StructField("details", StringType(), True)
])


StatementMeta(, d2de978b-792d-4d10-88f6-15acbda097a9, 24, Finished, Available, Finished)

---
### Resumen del proceso
---

In [None]:
# Summarize the run, print the summary and append one row to the
# `notebook_execution_summary` Delta table.
load_statuses = [r["status"] for r in extraction_results]
successful_loads = load_statuses.count("success")
failed_loads     = load_statuses.count("failed")

if failed_loads == 0:
    run_status = "completed"
else:
    run_status = "completed_with_errors"

execution_summary = {
    "status": run_status,
    "successful_tables": successful_loads,
    "failed_tables": failed_loads,
    "total_records": total_records,
    "execution_date": execution_date,
    "data_source": "azure_sql_database"
}

for key, value in execution_summary.items():
    print(f"{key}: {value}")

# One row, in the same field order as summary_schema below.
summary_data = [(
    execution_date,
    "bronze_full_load",
    execution_summary["status"],
    execution_summary["successful_tables"],
    execution_summary["failed_tables"],
    execution_summary["total_records"],
    datetime.now()
)]

summary_fields = (
    ("execution_date", StringType()),
    ("notebook_name", StringType()),
    ("status", StringType()),
    ("successful_tables", IntegerType()),
    ("failed_tables", IntegerType()),
    ("total_records", LongType()),
    ("completed_timestamp", TimestampType()),
)
summary_schema = StructType([
    StructField(field_name, field_type, True) for field_name, field_type in summary_fields
])

summary_df = spark.createDataFrame(summary_data, summary_schema)

summary_df.write.format("delta").mode("append").saveAsTable("notebook_execution_summary")

StatementMeta(, d2de978b-792d-4d10-88f6-15acbda097a9, 28, Finished, Available, Finished)

status: completed
successful_tables: 14
failed_tables: 0
total_records: 38641746
execution_date: 2025-09-18T20:17:22.913548
data_source: azure_sql_database


---
### Resumen detallado
---

In [None]:
# Plain-text report: overall counters, then one line per bronze table that
# was loaded successfully.
print("EXTRACTION SUMMARY:")
print(f"Successful extractions: {successful_loads}")
print(f"Failed extractions: {failed_loads}")
print(f"Total records extracted: {total_records:,}")

if successful_loads > 0:
    print("\nSuccessfully created bronze tables:")
    for result in extraction_results:
        if result["status"] != "success":
            continue
        print(f"  - {result['bronze_table']}: {result['record_count']:,} records")


StatementMeta(, d2de978b-792d-4d10-88f6-15acbda097a9, 29, Finished, Available, Finished)

EXTRACTION SUMMARY:
Successful extractions: 14
Failed extractions: 0
Total records extracted: 38,641,746

Successfully created bronze tables:
  - bronze_dim_brands: 20 records
  - bronze_dim_budget_rate: 15 records
  - bronze_dim_customers: 3,911 records
  - bronze_dim_employees: 893 records
  - bronze_dim_exchange_rate: 57,900 records
  - bronze_dim_invoice_doctype: 5 records
  - bronze_dim_order_doctype: 4 records
  - bronze_dim_order_status: 6 records
  - bronze_dim_products: 256,293 records
  - bronze_dim_regions: 181 records
  - bronze_fact_budget: 2,947,811 records
  - bronze_fact_forecast: 5,197 records
  - bronze_fact_invoices: 18,459,441 records
  - bronze_fact_orders: 16,910,069 records


---
### Columnas renombradas
---

In [None]:
# For every successfully loaded table, print the bronze table, its source
# table and each cleaned column name next to the source column it came from.
if successful_loads > 0:
    for result in extraction_results:
        if result["status"] != "success":
            continue

        full_table = result["source_table"]
        bronze_table = result["bronze_table"]
        print(f"\n Bronze table: {bronze_table}")
        print(f"   Source table: {full_table}")

        # Tables without a mapping entry only get the two header lines.
        if full_table not in column_map_by_table:
            continue

        print("   Columns (bronze → source):")
        mapping = column_map_by_table[full_table]
        for src, tgt in mapping.items():
            print(f"       {tgt:<30} ← {src}")


StatementMeta(, d2de978b-792d-4d10-88f6-15acbda097a9, 30, Finished, Available, Finished)


 Bronze table: bronze_dim_brands
   Source table: dim.Brands
   Columns (bronze → source):
       flagship                       ← Flagship
       class                          ← Class
       type                           ← Type
       brand                          ← Brand
       sub_brand                      ← Sub Brand
       product_brand_vp               ← Product Brand VP
       dwcreateddate                  ← DWCreatedDate

 Bronze table: bronze_dim_budget_rate
   Source table: dim.Budget-Rate
   Columns (bronze → source):
       rate                           ← Rate
       from_currency                  ← From Currency
       to_currency                    ← To Currency
       currency_system                ← Currency System
       dwcreateddate                  ← DWCreatedDate

 Bronze table: bronze_dim_customers
   Source table: dim.Customers
   Columns (bronze → source):
       customer_key                   ← Customer Key
       customer_sold_to_name          ← Custome