In [0]:
# Remove every widget currently defined on the notebook; the next cell creates them again.
dbutils.widgets.removeAll()

In [0]:
# Widgets are cleared by the preceding cell. Databricks does not support creating
# widgets in the same cell that calls dbutils.widgets.removeAll(), so the call
# is not repeated in this cell, which registers the widgets below.
from pyspark.sql.functions import *
from pyspark.sql.types import *

# =========================
# (1) PARAMETERS AND WIDGETS
# =========================
# One text widget per job parameter: widget name -> default value.
# The root container is the environment name (dev/prod).
widget_defaults = {
    "storage_account": "adlssmartdata0303ev",
    "container": "dev",
    "catalogo": "catalog_footballdata",
    "esquema": "bronze",
}
for widget_name, widget_default in widget_defaults.items():
    dbutils.widgets.text(widget_name, widget_default)

# Read the current widget values back into plain variables.
widget_values = {name: dbutils.widgets.get(name) for name in widget_defaults}
storage_account = widget_values["storage_account"]
container = widget_values["container"]
catalogo = widget_values["catalogo"]
esquema = widget_values["esquema"]

# Path to the master teams file, located under /raw/datasets/ in the container.
ruta = (
    f"abfss://{container}@{storage_account}.dfs.core.windows.net"
    "/raw/datasets/teams/team_master.csv"
)

# =========================
# (2) INITIAL READ (VALIDATION)
# =========================
# Exploratory load of the CSV with a header row and Spark-inferred column types.
# Schema inference requires an extra pass over the file.
# NOTE(review): df_teams is not referenced again in the visible cells — confirm
# whether this read is still needed.
df_teams = spark.read.csv(ruta, header=True, inferSchema=True)

# =========================
# (3) MANUAL SCHEMA DEFINITION (ALL COLUMNS)
# =========================
# Each entry is (column name, Spark type, nullable).
team_columns = [
    ("id", IntegerType(), False),
    ("name", StringType(), True),
    ("short_name", StringType(), True),
    ("acronym", StringType(), True),
    ("coach_name", StringType(), True),
    ("stadium_id", IntegerType(), True),
    ("stadium_name", StringType(), True),
    ("stadium_city", StringType(), True),
    ("stadium_capacity", DoubleType(), True),
    ("area_name", StringType(), True),
]

# Explicit schema applied when the file is read for ingestion.
teams_schema = StructType(
    [StructField(name, dtype, nullable) for name, dtype, nullable in team_columns]
)

# =========================
# (4) FINAL READ WITH SCHEMA
# =========================
# Read the same CSV again, this time applying the explicit schema instead of
# letting Spark infer the column types.
df_teams_final = spark.read.csv(ruta, schema=teams_schema, header=True)

# =========================
# (5) COLUMN SELECTION
# =========================
# Columns carried forward to the bronze table, in output order.
selected_columns = [
    "id",
    "name",
    "short_name",
    "acronym",
    "coach_name",
    "stadium_id",
    "stadium_name",
    "stadium_city",
    "stadium_capacity",
    "area_name",
]
teams_selected_df = df_teams_final.select(*selected_columns)

# =========================
# (6) COLUMN RENAMING (SNAKE_CASE)
# =========================
# Only `id` and `name` need new names. The remaining columns already carry
# their final names and pass through unchanged, so no rename is issued for
# them (renaming a column to itself only adds a projection to the plan).
teams_renamed_df = (
    teams_selected_df
    .withColumnRenamed("id", "team_id")
    .withColumnRenamed("name", "team_name")
)

# =========================
# (7) AUDIT COLUMNS AND LOAD INTO UNITY CATALOG
# =========================
# Fully qualified name of the destination table.
tabla_destino = f"{catalogo}.{esquema}.teams"

# Stamp every row with the load timestamp and the path of its source file.
# NOTE(review): source_file relies on the hidden _metadata column still being
# resolvable after the earlier select/rename steps — confirm on the target runtime.
teams_with_ingestion_date = teams_renamed_df.withColumn("ingestion_date", current_timestamp())
teams_final_df = teams_with_ingestion_date.withColumn("source_file", col("_metadata.file_path"))

# Direct write with no explicit path: replaces both the data and the schema of
# the table. (Per the original note, Unity Catalog places it in the managed
# location of the bronze schema.)
(
    teams_final_df.write
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable(tabla_destino)
)

print(f"Esquema actualizado e ingesta completada en {catalogo}.{esquema}.teams")
print(f"Ingesta de 'team_master' completada exitosamente con todas sus columnas en {catalogo}.{esquema}.teams")

In [0]:
%sql
-- Preview three rows of the teams table written by the previous cell.
-- NOTE(review): the dollar-brace widget substitution used below is legacy syntax
-- on recent Databricks runtimes; confirm whether it should move to named
-- parameter markers with IDENTIFIER().
SELECT * FROM ${catalogo}.${esquema}.teams
LIMIT 3