In [0]:
# Clear every widget left over from earlier runs. This is done in its own
# cell; the widgets are (re)created by the following cell.
dbutils.widgets.removeAll()

In [0]:
# NOTE: widgets are already cleared by `dbutils.widgets.removeAll()` in the
# previous cell. Databricks does not support removing widgets and creating
# new ones within the same cell, so the duplicate call that used to open this
# cell has been dropped.
from pyspark.sql.functions import *
from pyspark.sql.types import *

# =========================
# (1) PARAMETERS AND WIDGETS
# =========================
dbutils.widgets.text("storage_account", "adlssmartdata0303ev")
# The container default is 'dev' (the umbrella for the environment).
dbutils.widgets.text("container", "dev")
dbutils.widgets.text("catalogo", "catalog_footballdata")
dbutils.widgets.text("esquema", "bronze")

# Read the widget values back so the rest of the notebook uses plain strings.
storage_account = dbutils.widgets.get("storage_account")
container = dbutils.widgets.get("container")
catalogo = dbutils.widgets.get("catalogo")
esquema = dbutils.widgets.get("esquema")

# '/raw' is part of the physical path so it matches the new storage layout.
# The glob picks up every CSV file under datasets/physical.
ruta = f"abfss://{container}@{storage_account}.dfs.core.windows.net/raw/datasets/physical/*.csv"

# =========================
# (2) INITIAL READ (VALIDATION)
# =========================
# Exploratory load: the first CSV line is treated as the header and Spark
# infers the column types from the data.
# NOTE(review): df_physical is not referenced again in the visible cells.
df_physical = spark.read.csv(ruta, header=True, inferSchema=True)

# =========================
# (3) MANUAL SCHEMA DEFINITION (ALL COLUMNS)
# =========================
# Explicit mapping of the 42 columns of the physical data file. Every field
# is nullable; the (name, type) pairs are listed in file order.
physical_schema = StructType(fields=[
    StructField(column_name, spark_type(), True)
    for column_name, spark_type in [
        # Identification / context columns
        ("Unnamed: 0", IntegerType),
        ("player_name", StringType),
        ("player_short_name", StringType),
        ("player_id", IntegerType),
        ("player_birthdate", StringType),
        ("team_name", StringType),
        ("team_id", IntegerType),
        ("match_name", StringType),
        ("match_id", IntegerType),
        ("match_date", StringType),
        ("competition_name", StringType),
        ("competition_id", IntegerType),
        ("season_name", StringType),
        ("season_id", IntegerType),
        ("competition_edition_id", IntegerType),
        ("position", StringType),
        ("position_group", StringType),
        # Metric columns
        ("minutes_full_all", DoubleType),
        ("physical_check_passed", BooleanType),
        ("total_distance_full_all", DoubleType),
        ("total_metersperminute_full_all", DoubleType),
        ("running_distance_full_all", DoubleType),
        ("hsr_distance_full_all", DoubleType),
        ("hsr_count_full_all", DoubleType),
        ("sprint_distance_full_all", DoubleType),
        ("sprint_count_full_all", DoubleType),
        ("hi_distance_full_all", DoubleType),
        ("hi_count_full_all", DoubleType),
        ("psv99", DoubleType),
        ("medaccel_count_full_all", DoubleType),
        ("highaccel_count_full_all", DoubleType),
        ("meddecel_count_full_all", DoubleType),
        ("highdecel_count_full_all", DoubleType),
        ("explacceltohsr_count_full_all", DoubleType),
        ("timetohsr", DoubleType),
        ("timetohsrpostcod", DoubleType),
        ("explacceltosprint_count_full_all", DoubleType),
        ("timetosprint", DoubleType),
        ("timetosprintpostcod", DoubleType),
        ("cod_count_full_all", DoubleType),
        ("timeto505around90", DoubleType),
        ("timeto505around180", DoubleType),
    ]
])

# =========================
# (4) FINAL READ WITH SCHEMA
# =========================
# Same files as the initial read, but typed with the hand-written schema
# instead of relying on inference.
df_physical_final = spark.read.csv(ruta, schema=physical_schema, header=True)

# =========================
# (5) SELECTION AND INITIAL CLEAN-UP
# (6) TECHNICAL RENAMING
# =========================
# Step 5 discards the "Unnamed: 0" column. Step 6 currently applies no
# renames, so both names are bound to the same DataFrame.
physical_renamed_df = physical_selected_df = df_physical_final.drop("Unnamed: 0")

# =========================
# (7) AUDIT COLUMNS AND LOAD INTO UNITY CATALOG
# =========================
# Add the audit columns: load timestamp and the path of the originating file
# (taken from the reader's `_metadata.file_path` field).
physical_final_df = (
    physical_renamed_df
    .withColumn("ingestion_date", current_timestamp())
    .withColumn("source_file", col("_metadata.file_path"))
)

# Direct table write, replacing both data and schema of the target table.
# Per the original author's note, Unity Catalog is expected to place the data
# in the MANAGED LOCATION of the target schema (bronze in dev/prod).
physical_final_df.write.saveAsTable(
    f"{catalogo}.{esquema}.physical_performance",
    mode="overwrite",
    overwriteSchema="true",
)

print(f"Ingesta de 42 columnas completada en {catalogo}.{esquema}.physical_performance")

In [0]:
%sql
-- Preview up to 3 rows of the table written by the previous cell.
-- ${catalogo} and ${esquema} are expected to be filled in from the notebook
-- widgets of the same names.
SELECT * FROM ${catalogo}.${esquema}.physical_performance
LIMIT 3