In [0]:
%skip
-- Manual reset helper: drops the bronze flights_stream table that the next cell reads.
-- NOTE(review): the %skip magic presumably keeps this cell out of normal runs -- confirm before removing it.
drop table aviation_project.bronze.flights_stream

In [0]:
# Read the bronze flights_stream table; the cleaning cell below uses this DataFrame as its input.
df_silver_flights_stream = spark.read.table("aviation_project.bronze.flights_stream")

In [0]:
from pyspark.sql.functions import (
    col, when, upper, coalesce, lit,
    current_timestamp
)

# Build the silver-layer flights DataFrame from the bronze input:
#   1. add derived columns (TOTAL_DELAY, DISTANCE_CATEGORY, DAY_TYPE,
#      FLIGHT_STATUS, WEATHER_IMPACT, ON_TIME_FLAG),
#   2. upper-case the airline / airport codes,
#   3. replace NULL delay components and cancellation reason with defaults,
#   4. stamp the load time and drop rows missing a key identifier.
df_silver_flights_stream_clean = (
    df_silver_flights_stream

    # --- DERIVED FIELD (create first; ON_TIME_FLAG below depends on it) ---
    # NULL when either delay is NULL, because NULL propagates through "+".
    .withColumn(
        "TOTAL_DELAY",
        col("DEPARTURE_DELAY") + col("ARRIVAL_DELAY")
    )

    # --- Distance Category ---
    # Every branch is an explicit comparison and there is no otherwise(),
    # so a NULL DISTANCE yields a NULL category instead of being
    # mislabelled as LONG_HAUL.
    .withColumn(
        "DISTANCE_CATEGORY",
        when(col("DISTANCE") < 500, "SHORT_HAUL")
        .when(col("DISTANCE") <= 1500, "MEDIUM_HAUL")
        .when(col("DISTANCE") > 1500, "LONG_HAUL")
    )

    # --- Day Type ---
    # The source data numbers DAY_OF_WEEK 1 = Monday ... 7 = Sunday
    # (US DOT on-time performance convention), so the weekend is 6 and 7.
    # A NULL DAY_OF_WEEK stays NULL rather than defaulting to WEEKDAY.
    .withColumn(
        "DAY_TYPE",
        when(col("DAY_OF_WEEK").isin(6, 7), "WEEKEND")
        .when(col("DAY_OF_WEEK").isNotNull(), "WEEKDAY")
    )

    # --- Flight Status ---
    # CANCELLED takes precedence over DIVERTED when both flags are set.
    .withColumn(
        "FLIGHT_STATUS",
        when(col("CANCELLED") == 1, "CANCELLED")
        .when(col("DIVERTED") == 1, "DIVERTED")
        .otherwise("COMPLETED")
    )

    # --- Weather Impact ---
    # Evaluated before WEATHER_DELAY is NULL-filled below; a NULL delay
    # falls through to NO_WEATHER_IMPACT, the same label a filled 0 would get.
    .withColumn(
        "WEATHER_IMPACT",
        when(col("WEATHER_DELAY") > 30, "WEATHER_HIGH")
        .when(col("WEATHER_DELAY") > 0, "WEATHER_LOW")
        .otherwise("NO_WEATHER_IMPACT")
    )

    # --- On-time Flag ---
    # 1 when TOTAL_DELAY <= 0, otherwise 0 (a NULL TOTAL_DELAY also gives 0).
    .withColumn(
        "ON_TIME_FLAG",
        when(col("TOTAL_DELAY") <= 0, lit(1))
        .otherwise(lit(0))
    )

    # --- UPPER CASE CLEANING ---
    .withColumn("AIRLINE", upper(col("AIRLINE")))
    .withColumn("ORIGIN_AIRPORT", upper(col("ORIGIN_AIRPORT")))
    .withColumn("DESTINATION_AIRPORT", upper(col("DESTINATION_AIRPORT")))

    # --- NULL HANDLING ---
    # Missing cancellation reason becomes the sentinel "NONE"; missing
    # delay components become 0.
    .withColumn("CANCELLATION_REASON", coalesce(col("CANCELLATION_REASON"), lit("NONE")))
    .withColumn("AIR_SYSTEM_DELAY", coalesce(col("AIR_SYSTEM_DELAY"), lit(0)))
    .withColumn("SECURITY_DELAY", coalesce(col("SECURITY_DELAY"), lit(0)))
    .withColumn("AIRLINE_DELAY", coalesce(col("AIRLINE_DELAY"), lit(0)))
    .withColumn("LATE_AIRCRAFT_DELAY", coalesce(col("LATE_AIRCRAFT_DELAY"), lit(0)))
    .withColumn("WEATHER_DELAY", coalesce(col("WEATHER_DELAY"), lit(0)))

    # --- LOAD TIMESTAMP ---
    .withColumn("LOAD_TIMESTAMP", current_timestamp())

    # --- DROP CRITICAL NULLS ---
    # upper() keeps NULLs as NULL, so rows missing any of these are removed here.
    .dropna(subset=["AIRLINE", "ORIGIN_AIRPORT", "DESTINATION_AIRPORT"])
)


In [0]:
%sql
-- Register the silver flights_stream Delta table at its storage location.
-- IF NOT EXISTS makes this a no-op when the table is already registered.
create table if not exists aviation_project.silver.flights_stream
using delta
location 'abfss://silver@revtraining.dfs.core.windows.net/Tables/flights_stream'

In [0]:
%python
df_silver_flights_stream_clean.write.mode("overwrite").option(
       "overwriteSchema", "true"
   ).format("delta").save("abfss://silver@revtraining.dfs.core.windows.net/Tables/flights_stream")

In [0]:
%sql
-- Spot-check a handful of flight numbers in the silver table.
SELECT *
FROM aviation_project.silver.flights_stream
WHERE flight_number IN (1010, 3444, 1509, 5658)
ORDER BY flight_number

%md

**Data quality checks:**
 - Null counts per column
 - Data types
 - Date validity check
 - Delay & duration validation
 - Cancellation check
 - Weather impact check

In [0]:
%sql
-- Null profile: one output column per silver column, holding the number of rows where it is NULL.
SELECT
  -- source date parts and flight identifiers
  SUM(IF(YEAR IS NULL, 1, 0))                AS YEAR_NULLS,
  SUM(IF(MONTH IS NULL, 1, 0))               AS MONTH_NULLS,
  SUM(IF(DAY IS NULL, 1, 0))                 AS DAY_NULLS,
  SUM(IF(DAY_OF_WEEK IS NULL, 1, 0))         AS DAY_OF_WEEK_NULLS,
  SUM(IF(AIRLINE IS NULL, 1, 0))             AS AIRLINE_NULLS,
  SUM(IF(FLIGHT_NUMBER IS NULL, 1, 0))       AS FLIGHT_NUMBER_NULLS,
  SUM(IF(TAIL_NUMBER IS NULL, 1, 0))         AS TAIL_NUMBER_NULLS,
  SUM(IF(ORIGIN_AIRPORT IS NULL, 1, 0))      AS ORIGIN_AIRPORT_NULLS,
  SUM(IF(DESTINATION_AIRPORT IS NULL, 1, 0)) AS DESTINATION_AIRPORT_NULLS,
  -- departure, en-route and arrival measurements
  SUM(IF(SCHEDULED_DEPARTURE IS NULL, 1, 0)) AS SCHEDULED_DEPARTURE_NULLS,
  SUM(IF(DEPARTURE_TIME IS NULL, 1, 0))      AS DEPARTURE_TIME_NULLS,
  SUM(IF(DEPARTURE_DELAY IS NULL, 1, 0))     AS DEPARTURE_DELAY_NULLS,
  SUM(IF(TAXI_OUT IS NULL, 1, 0))            AS TAXI_OUT_NULLS,
  SUM(IF(WHEELS_OFF IS NULL, 1, 0))          AS WHEELS_OFF_NULLS,
  SUM(IF(SCHEDULED_TIME IS NULL, 1, 0))      AS SCHEDULED_TIME_NULLS,
  SUM(IF(ELAPSED_TIME IS NULL, 1, 0))        AS ELAPSED_TIME_NULLS,
  SUM(IF(AIR_TIME IS NULL, 1, 0))            AS AIR_TIME_NULLS,
  SUM(IF(DISTANCE IS NULL, 1, 0))            AS DISTANCE_NULLS,
  SUM(IF(WHEELS_ON IS NULL, 1, 0))           AS WHEELS_ON_NULLS,
  SUM(IF(TAXI_IN IS NULL, 1, 0))             AS TAXI_IN_NULLS,
  SUM(IF(SCHEDULED_ARRIVAL IS NULL, 1, 0))   AS SCHEDULED_ARRIVAL_NULLS,
  SUM(IF(ARRIVAL_TIME IS NULL, 1, 0))        AS ARRIVAL_TIME_NULLS,
  SUM(IF(ARRIVAL_DELAY IS NULL, 1, 0))       AS ARRIVAL_DELAY_NULLS,
  -- diversion / cancellation flags and delay components
  SUM(IF(DIVERTED IS NULL, 1, 0))            AS DIVERTED_NULLS,
  SUM(IF(CANCELLED IS NULL, 1, 0))           AS CANCELLED_NULLS,
  SUM(IF(CANCELLATION_REASON IS NULL, 1, 0)) AS CANCELLATION_REASON_NULLS,
  SUM(IF(AIR_SYSTEM_DELAY IS NULL, 1, 0))    AS AIR_SYSTEM_DELAY_NULLS,
  SUM(IF(SECURITY_DELAY IS NULL, 1, 0))      AS SECURITY_DELAY_NULLS,
  SUM(IF(AIRLINE_DELAY IS NULL, 1, 0))       AS AIRLINE_DELAY_NULLS,
  SUM(IF(LATE_AIRCRAFT_DELAY IS NULL, 1, 0)) AS LATE_AIRCRAFT_DELAY_NULLS,
  SUM(IF(WEATHER_DELAY IS NULL, 1, 0))       AS WEATHER_DELAY_NULLS,
  -- columns added by the silver transformation
  SUM(IF(TOTAL_DELAY IS NULL, 1, 0))         AS TOTAL_DELAY_NULLS,
  SUM(IF(DISTANCE_CATEGORY IS NULL, 1, 0))   AS DISTANCE_CATEGORY_NULLS,
  SUM(IF(DAY_TYPE IS NULL, 1, 0))            AS DAY_TYPE_NULLS,
  SUM(IF(FLIGHT_STATUS IS NULL, 1, 0))       AS FLIGHT_STATUS_NULLS,
  SUM(IF(WEATHER_IMPACT IS NULL, 1, 0))      AS WEATHER_IMPACT_NULLS,
  SUM(IF(ON_TIME_FLAG IS NULL, 1, 0))        AS ON_TIME_FLAG_NULLS,
  SUM(IF(LOAD_TIMESTAMP IS NULL, 1, 0))      AS LOAD_TIMESTAMP_NULLS
FROM aviation_project.silver.flights_stream;


In [0]:
%sql
-- Date validity check: list rows whose date parts fall outside their allowed ranges.
-- Each part is range-checked independently; the YEAR/MONTH/DAY combination is not
-- validated as a calendar date.
SELECT *
FROM aviation_project.silver.flights_stream
WHERE
  YEAR NOT BETWEEN 2000 AND YEAR(current_date())
  OR (MONTH < 1 OR MONTH > 12)
  OR (DAY < 1 OR DAY > 31)
  OR (DAY_OF_WEEK < 1 OR DAY_OF_WEEK > 7);


In [0]:
%sql
-- Delay & duration validity check: list rows with implausible timing values.
SELECT *
FROM aviation_project.silver.flights_stream
WHERE
  -- a departure or arrival more than 60 minutes early
  (DEPARTURE_DELAY < -60 OR ARRIVAL_DELAY < -60)
  -- negative taxi durations
  OR (TAXI_OUT < 0 OR TAXI_IN < 0)
  -- air time must be positive and cannot exceed the total elapsed time
  OR AIR_TIME <= 0
  OR AIR_TIME > ELAPSED_TIME;


In [0]:
%sql
-- Cancellation rules: every row returned violates at least one rule below.
-- The silver load replaces a missing CANCELLATION_REASON with the sentinel
-- 'NONE', so the reason checks compare against 'NONE' rather than NULL
-- (an IS NULL test can never match in this table).
SELECT *
FROM aviation_project.silver.flights_stream
WHERE
  -- Rule 1: a cancelled flight must not carry actual departure/arrival times.
  (CANCELLED = 1 AND (DEPARTURE_TIME IS NOT NULL OR ARRIVAL_TIME IS NOT NULL))
  OR
  -- Rule 2: a cancelled flight must have a real cancellation reason.
  (CANCELLED = 1 AND CANCELLATION_REASON = 'NONE')
  OR
  -- Rule 3: a flight that was not cancelled must not have a cancellation reason.
  (CANCELLED = 0 AND CANCELLATION_REASON <> 'NONE');


In [0]:
%sql
-- Data type check: show the column names and types of the silver table.
DESCRIBE TABLE aviation_project.silver.flights_stream

In [0]:
%sql
-- Weather impact check: the WEATHER_IMPACT label must agree with WEATHER_DELAY.
-- WEATHER_IMPACT is a string label ('WEATHER_HIGH' / 'WEATHER_LOW' /
-- 'NO_WEATHER_IMPACT'), so it is compared against those labels, not against 1.
-- Every row returned is inconsistent.
SELECT *
FROM aviation_project.silver.flights_stream
WHERE
  -- labelled HIGH, but the delay is missing or not above 30
  (WEATHER_IMPACT = 'WEATHER_HIGH'
     AND (WEATHER_DELAY IS NULL OR WEATHER_DELAY <= 30))
  OR
  -- labelled LOW, but the delay is missing or outside (0, 30]
  (WEATHER_IMPACT = 'WEATHER_LOW'
     AND (WEATHER_DELAY IS NULL OR WEATHER_DELAY <= 0 OR WEATHER_DELAY > 30))
  OR
  -- labelled as not impacted, but a positive weather delay is recorded
  (WEATHER_IMPACT = 'NO_WEATHER_IMPACT' AND WEATHER_DELAY > 0);


In [0]:
%sql
-- List rows where arrival_time is not strictly after departure_time.
-- NOTE(review): if these columns hold local clock times, overnight or
-- time-zone-crossing flights would also match -- confirm before treating hits as errors.
SELECT *
FROM aviation_project.silver.flights_stream
WHERE arrival_time <= departure_time
  ;
