In [0]:
import dlt
from pyspark.sql.functions import *

In [0]:
# Bronze layer: streaming table fed from the sample NYC taxi trips table.
@dlt.table(
    name="taxi_raw_records",
    comment="Bronze layer: Raw data ingestion from samples"
)
@dlt.expect_or_drop("valid_distance", "trip_distance>0.0")
def taxi_raw_records():
    """Stream `samples.nyctaxi.trips` into the bronze `taxi_raw_records` table.

    The `valid_distance` expectation (`trip_distance>0.0`) is attached with
    `expect_or_drop`, so rows violating it are dropped instead of being kept.

    Returns:
        A streaming DataFrame over `samples.nyctaxi.trips`.
    """
    # readStream (not read) because this dataset is meant to be a streaming table.
    # The source format is declared as Delta, matching the sample table.
    trips_stream_reader = spark.readStream.format("delta")
    return trips_stream_reader.table("samples.nyctaxi.trips")

In [0]:
# Silver layer: rides flagged as suspicious.
@dlt.table(
    name="flagged_rides",
    comment="Silver flagged layer: Rides with either suspicious fare or short trip but high fare"
)
def flagged_rides():
    """Keep only the rides from `taxi_raw_records` that look suspicious.

    A ride is flagged when its fare is above 50 and either
      * pickup and dropoff zip codes are identical, or
      * the trip distance is below 5.

    Returns:
        A streaming DataFrame with columns `week` (pickup timestamp truncated
        to the week), `zip` (the pickup zip), `fare_amount` and `trip_distance`.
    """
    rides = dlt.read_stream("taxi_raw_records")

    # Named predicates; combined below exactly as (A & C) | (B & C).
    high_fare = col("fare_amount") > 50
    same_zip_high_fare = (col("pickup_zip") == col("dropoff_zip")) & high_fare
    short_trip_high_fare = (col("trip_distance") < 5) & high_fare

    suspicious = rides.filter(same_zip_high_fare | short_trip_high_fare)

    # Bucket each ride by the week of its pickup time.
    with_week = suspicious.withColumn(
        "week", date_trunc("week", col("tpep_pickup_datetime"))
    )

    return with_week.select(
        "week",
        col("pickup_zip").alias("zip"),
        "fare_amount",
        "trip_distance",
    )


In [0]:
import dlt
from pyspark.sql.functions import date_trunc, avg, col

# A batch (non-streaming) query under @dlt.table is what defines a materialized
# view; @dlt.view would instead define a view that is not persisted.
@dlt.table(
    name="weekly_stats",
    comment="Silver layer 2: Weekly statistics with average fare and distance"
)
def weekly_stats():
    """Aggregate `taxi_raw_records` into one row per pickup week.

    Returns:
        A batch DataFrame with columns `week` (pickup timestamp truncated to
        the week), `avg_amount` (mean `fare_amount`) and `avg_distance`
        (mean `trip_distance`).
    """
    # Batch read (dlt.read, not dlt.read_stream): the aggregate is recomputed
    # over the whole bronze table rather than processed incrementally as a stream.
    df = dlt.read("taxi_raw_records")

    # No trailing orderBy: row order is not guaranteed once the result is
    # persisted as a table, so a global sort here would only add cost.
    # Consumers that need ordering should sort at query time.
    return (
        df.withColumn("week", date_trunc("week", col("tpep_pickup_datetime")))
          .groupBy("week")
          .agg(
              avg("fare_amount").alias("avg_amount"),
              avg("trip_distance").alias("avg_distance")
          )
    )


In [0]:
from pyspark.sql.functions import round as spark_round

In [0]:
import dlt
from pyspark.sql.functions import round as spark_round

@dlt.table(
    name="top_n",
    comment="Gold layer: Top N rides to investigate"
)
def top_n():
    """Select the highest-fare flagged rides, enriched with weekly averages.

    Each row of `flagged_rides` is left-joined to `weekly_stats` on `week`, so
    a flagged ride is kept even when no weekly statistics exist for its week
    (the averages are then null).

    Returns:
        A batch DataFrame of at most `top_n_rows` rows with columns `week`,
        `avg_amount` (rounded to 2 decimals), `avg_distance` (rounded to 3
        decimals), `fare_amount`, `trip_distance` and `zip`.
    """
    # Number of rides to surface (the "N" in top_n).
    top_n_rows = 10

    # Reading input tables
    flagged_df = dlt.read("flagged_rides")
    weekly_df = dlt.read("weekly_stats")

    # Left join on week, then project the columns exposed by the gold table.
    joined_df = (
        flagged_df.join(weekly_df, on="week", how="left")
        .select(
            flagged_df["week"],
            spark_round(weekly_df["avg_amount"], 2).alias("avg_amount"),
            spark_round(weekly_df["avg_distance"], 3).alias("avg_distance"),
            flagged_df["fare_amount"],
            flagged_df["trip_distance"],
            flagged_df["zip"]
        )
    )

    # Highest fare first. The extra sort keys break ties on fare_amount so the
    # rows picked at the cutoff are the same on every refresh; rows tied on all
    # four keys are identical in the output, so the result is fully determined.
    return (
        joined_df
        .orderBy(
            flagged_df["fare_amount"].desc(),
            "trip_distance",
            "week",
            "zip",
        )
        .limit(top_n_rows)
    )
