In [0]:
# Remove every widget currently defined on this notebook before they are
# recreated in the next cell.
# NOTE(review): kept in its own cell because Databricks documents that widgets
# cannot be created in the same cell that calls removeAll() — confirm against
# the runtime in use.
dbutils.widgets.removeAll()

In [0]:
# Widgets are cleared by the removeAll() call in the previous cell. Databricks
# does not support creating widgets in the same cell that calls removeAll(),
# so the reset is intentionally NOT repeated here, where the widgets are created.
from pyspark.sql.functions import *
from pyspark.sql.types import *

# =========================
# (1) PARAMETERS AND WIDGETS
# =========================
# Text widgets with their default values. The "container" widget is where the
# environment is chosen ('dev' or 'prod'); it previously defaulted to 'raw'.
for _widget_name, _widget_default in (
    ("storage_account", "adlssmartdata0303ev"),
    ("container", "dev"),
    ("catalogo", "catalog_footballdata"),
    ("esquema", "bronze"),
):
    dbutils.widgets.text(_widget_name, _widget_default)

# Read the current widget values back into notebook variables.
storage_account, container, catalogo, esquema = (
    dbutils.widgets.get(_key)
    for _key in ("storage_account", "container", "catalogo", "esquema")
)

# Path to the competitions (editions) master file. It is built from the widget
# values and points at the raw/ folder inside the selected container.
ruta = f"abfss://{container}@{storage_account}.dfs.core.windows.net/raw/datasets/competitions/edition_master.csv"

# =========================
# (2) INITIAL READ (VALIDATION)
# =========================
# Exploratory read of the CSV: first row is the header, column types are inferred.
# NOTE(review): df_editions is not referenced by any later cell visible here, and
# inferSchema costs an extra pass over the file — confirm this read is still needed.
df_editions = spark.read.csv(ruta, header=True, inferSchema=True)

# =========================
# (3) MANUAL SCHEMA DEFINITION (ALL COLUMNS)
# =========================
# Based on the header of edition_master.csv. Only edition_id is declared
# non-nullable; every other column is declared nullable.
editions_schema = (
    StructType()
    .add("edition_id", IntegerType(), nullable=False)
    .add("competition_id", IntegerType(), nullable=True)
    .add("competition_name", StringType(), nullable=True)
    .add("area_name", StringType(), nullable=True)
    .add("season_id", IntegerType(), nullable=True)
    .add("season_name", StringType(), nullable=True)
    .add("edition_name", StringType(), nullable=True)
)

# =========================
# (4) FINAL READ WITH EXPLICIT SCHEMA
# =========================
# Read the same file again, this time applying editions_schema instead of
# letting Spark infer the column types.
df_editions_final = spark.read.csv(ruta, schema=editions_schema, header=True)

# =========================
# (5) COLUMN SELECTION
# =========================
# Project the seven columns explicitly, by name.
editions_selected_df = df_editions_final.select(
    "edition_id",
    "competition_id",
    "competition_name",
    "area_name",
    "season_id",
    "season_name",
    "edition_name",
)

# =========================
# (6) RENOMBRADO DE COLUMNAS (MANTENIENDO SNAKE_CASE)
# =========================
# No renaming step is applied: the columns already arrive in snake_case in the
# original file, so editions_selected_df is used as-is in the next section.

# =========================
# (7) AUDIT COLUMNS AND LOAD INTO UNITY CATALOG
# =========================
# Append the audit columns: the ingestion timestamp and the source file path.
# The path comes from _metadata.file_path instead of input_file_name().
editions_final_df = editions_selected_df.select(
    "*",
    current_timestamp().alias("ingestion_date"),
    col("_metadata.file_path").alias("source_file"),
)

# Write to Unity Catalog, overwriting both the data and the table schema.
(
    editions_final_df.write
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable(f"{catalogo}.{esquema}.competitions")
)

print(f"Tabla {catalogo}.{esquema}.competitions cargada con éxito y esquema actualizado.")

In [0]:
%sql
-- Quick check of the loaded table: return three rows.
-- The catalog and schema placeholders are filled in from the notebook widgets
-- of the same names.
SELECT * FROM ${catalogo}.${esquema}.competitions
LIMIT 3