In [0]:
# Clear every widget left over from a previous run before the next cell
# re-creates them.
# NOTE(review): Databricks documents that widgets cannot be created in the same
# cell that removes them, which is presumably why this call has its own cell.
dbutils.widgets.removeAll()

In [0]:
# Widgets are reset by the dedicated `dbutils.widgets.removeAll()` cell that
# precedes this one. The call is intentionally NOT repeated here: Databricks
# documents that widgets cannot be created in the same cell that removes them,
# so removing and re-creating in one cell can leave the widgets undefined.
from pyspark.sql.functions import *
from pyspark.sql.types import *

# =========================
# (1) PARAMETERS AND WIDGETS
# =========================
# Widget name -> default value. The container default is 'dev'.
widget_defaults = {
    "storage_account": "adlssmartdata0303ev",
    "container": "dev",
    "catalogo": "catalog_footballdata",
    "esquema": "bronze",
}
for widget_name, widget_default in widget_defaults.items():
    dbutils.widgets.text(widget_name, widget_default)

# Resolve the effective values (defaults, or whatever the caller supplied).
storage_account = dbutils.widgets.get("storage_account")
container = dbutils.widgets.get("container")
catalogo = dbutils.widgets.get("catalogo")
esquema = dbutils.widgets.get("esquema")

# Physical source path: every CSV under raw/datasets/matches in the container.
ruta_partidos = f"abfss://{container}@{storage_account}.dfs.core.windows.net/raw/datasets/matches/*.csv"

# =========================
# (2) PROBE READ TO CAPTURE THE REAL SCHEMA
# =========================
# Only the schema of this DataFrame is consumed below; the single-row limit
# keeps the probe DataFrame itself tiny.
# NOTE(review): schema inference is performed by the reader at load time, so
# the limit presumably does not reduce the cost of the inference pass - confirm.
temp_df = spark.read.csv(ruta_partidos, header=True, inferSchema=True).limit(1)

# =========================
# (3) SCHEMA DEFINITION
# =========================
# The original author's note states the source has 295 columns.
matches_schema = temp_df.schema

# =========================
# (4) FINAL READ WITH THE FULL, EXPLICIT SCHEMA
# =========================
# Re-read the same files, this time applying the captured schema instead of
# inferring it again.
df_matches_final = spark.read.csv(
    ruta_partidos,
    schema=matches_schema,
    header=True,
)

# =========================
# (5) SELECT EVERYTHING (SELECT *)
# =========================
# Keeps every column of the source as-is.
matches_selected_df = df_matches_final.select("*")

# =========================
# (6) TECHNICAL RENAME (optional)
# =========================
# withColumnRenamed leaves the DataFrame unchanged when the source column is
# absent, so this step is safe even if 'team_shortname' is not in the files.
# NOTE(review): assumes no column named 'team_name' already exists - confirm.
matches_renamed_df = matches_selected_df.withColumnRenamed(
    "team_shortname",
    "team_name",
)

# =========================
# (7a) AUDIT COLUMNS
# =========================
# ingestion_date: timestamp of this load.
# source_file: path taken from the file-source `_metadata.file_path` field.
matches_final_df = (
    matches_renamed_df
    .withColumn("ingestion_date", current_timestamp())
    .withColumn("source_file", col("_metadata.file_path"))
)

# Fully qualified target: <catalog>.<schema>.match_events.
# Per the original author's note, Unity Catalog stores the table under the
# MANAGED LOCATION defined on the bronze schema.
target_table = f"{catalogo}.{esquema}.match_events"

# Full refresh: replace both the data and the schema of the Delta table.
(
    matches_final_df.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable(target_table)
)

# The count reported is the number of columns in the captured source schema
# (matches_schema); it does not include the two audit columns added above.
print(f"Éxito: Se han ingestada las {len(matches_schema)} columnas en {target_table}")

In [0]:
%sql
-- Preview the first 20 rows of the table written by the ingestion cell.
-- ${catalogo} and ${esquema} are substituted from the notebook widgets.
-- NOTE(review): the ${...} widget substitution syntax is reportedly deprecated
-- on recent Databricks runtimes in favour of :param markers with IDENTIFIER();
-- confirm against the target runtime before migrating.
SELECT * FROM ${catalogo}.${esquema}.match_events
LIMIT 20