In [None]:
from pyspark.sql.functions import col, year, month, to_date, when, count, lit, current_timestamp, md5, concat_ws, input_file_name
from pyspark.sql.types import *
from notebookutils import mssparkutils

# Notebook parameters.
# TAXI_TYPE selects the dataset; it is interpolated into the source path and
# the target table name further down.
# DATE_ID is the period of the file to process, interpolated into the source
# file name (the default value has the form "YYYY-MM").
# NOTE(review): presumably these defaults are overridden by the orchestrating
# pipeline when this cell is tagged as a parameter cell — confirm.
TAXI_TYPE = "yellow"
DATE_ID = "2023-01"

In [None]:
from delta.tables import DeltaTable

# Paths: the raw parquet file for this taxi type / period, and the name of the
# target table that receives the cleaned rows.
SOURCE_PATH = f"Files/raw_Data/{TAXI_TYPE}_Taxi/{TAXI_TYPE}_tripdata_{DATE_ID}.parquet"
TABLE_NAME = f"silver_{TAXI_TYPE}_taxi"

print(f"Procesando: {TAXI_TYPE} - {DATE_ID}")

# Load the Bronze (raw) data. If the read fails, stop the notebook and hand a
# message back to the caller instead of continuing without `df_bronze`.
try:
    df_bronze = spark.read.parquet(SOURCE_PATH)
except Exception as exc:
    # `except Exception` rather than a bare `except`, so KeyboardInterrupt and
    # SystemExit are not swallowed. The underlying error is printed because
    # the exit message below always reports a missing file, whatever the real
    # cause was. The exit message itself is intentionally left unchanged, in
    # case a caller matches on it.
    print(f"Error al leer {SOURCE_PATH}: {exc}")
    mssparkutils.notebook.exit(f"No se encontroel archivo en: {SOURCE_PATH}")

# Normalize column names: the pickup/dropoff timestamp columns arrive with a
# "tpep_" or "lpep_" prefix; rename them to pickup_time / dropoff_time.
# Only the first prefix whose pickup column is present is applied; if neither
# is present the DataFrame is left untouched.
for _prefix in ("tpep", "lpep"):
    if f"{_prefix}_pickup_datetime" not in df_bronze.columns:
        continue
    df_bronze = (
        df_bronze
        .withColumnRenamed(f"{_prefix}_pickup_datetime", "pickup_time")
        .withColumnRenamed(f"{_prefix}_dropoff_datetime", "dropoff_time")
    )
    break

# Transformation and selection: keep only the trip columns needed downstream,
# cast each to an explicit type, and append lineage columns.

# (source column, Spark type, output name or None to keep the source name)
_cast_specs = [
    ("pickup_time", "timestamp", None),
    ("dropoff_time", "timestamp", None),
    ("passenger_count", "int", None),
    ("trip_distance", "double", None),
    ("PULocationID", "int", "pickup_zone_id"),
    ("DOLocationID", "int", "dropoff_zone_id"),
    ("payment_type", "int", None),
    ("fare_amount", "double", None),
    ("tip_amount", "double", None),
    ("total_amount", "double", None),
]

# Only alias where the output name differs, exactly as the inline form did.
_trip_columns = [
    col(src).cast(dtype) if out is None else col(src).cast(dtype).alias(out)
    for src, dtype, out in _cast_specs
]

# Lineage: taxi type, a fixed ingestion-source tag, the input file each row
# was read from, and the processing timestamp.
_lineage_columns = [
    lit(TAXI_TYPE).alias("taxi_type"),
    lit("batch_pipeline").alias("ingestion_source"),
    input_file_name().alias("source_file"),
    current_timestamp().alias("processed_at"),
]

df_silver = df_bronze.select(*_trip_columns, *_lineage_columns)

# Data quality and unique ID generation.

# A trip is kept only if distance and total amount are strictly positive and
# the pickup happens strictly before the dropoff.
_is_valid_trip = (
    (col("trip_distance") > 0)
    & (col("total_amount") > 0)
    & (col("pickup_time") < col("dropoff_time"))
)

# trip_id is the MD5 of these columns joined with "-".
# NOTE(review): concat_ws skips NULL inputs, so two rows that differ only in
# which zone id is NULL would share a trip_id — confirm zone ids are never NULL.
_id_parts = [
    col(name)
    for name in (
        "taxi_type",
        "pickup_time",
        "dropoff_time",
        "pickup_zone_id",
        "dropoff_zone_id",
        "trip_distance",
        "total_amount",
    )
]
_trip_key = md5(concat_ws("-", *_id_parts))

df_final = (
    df_silver
    .filter(_is_valid_trip)
    .withColumn("trip_id", _trip_key)
    .withColumn("year", year(col("pickup_time")))
    .withColumn("month", month(col("pickup_time")))
    # Deduplicate after filtering so each trip_id appears at most once.
    .dropDuplicates(["trip_id"])
)

# Save with MERGE: create the Delta table on the first run, otherwise upsert
# into it keyed on trip_id.
if not spark.catalog.tableExists(TABLE_NAME):
    print(f"La tabla {TABLE_NAME} no existe. Creándola...")
    (
        df_final.write
        .format("delta")
        .mode("overwrite")
        .option("mergeSchema", "true")
        .saveAsTable(TABLE_NAME)
    )
else:
    print(f"La tabla {TABLE_NAME} ya existe. Aplicando MERGE...")
    dt = DeltaTable.forName(spark, TABLE_NAME)

    # Matched trip_ids are overwritten with the incoming row; unmatched ones
    # are inserted.
    _upsert = (
        dt.alias("target")
        .merge(df_final.alias("updates"), "target.trip_id = updates.trip_id")
        .whenMatchedUpdateAll()
        .whenNotMatchedInsertAll()
    )
    _upsert.execute()

print(f"Finalizado con éxito: {TABLE_NAME}")
# Return "OK" to the caller as the notebook's exit value.
mssparkutils.notebook.exit("OK")