In [0]:
%sql
-- Register the root of the silver storage container as an external location,
-- accessed through the aviation_project_cred credential.
-- IF NOT EXISTS makes the statement a no-op when the location is already defined.
CREATE EXTERNAL LOCATION IF NOT EXISTS aviation_project_ext_lcn_silver
URL 'abfss://silver@revtraining.dfs.core.windows.net/'
WITH (CREDENTIAL aviation_project_cred);

In [0]:
%sql
-- Create the silver schema that the silver tables below are registered in (no-op if present).
CREATE SCHEMA IF NOT EXISTS aviation_project.silver

**Airlines**

In [0]:
# Read the bronze airlines table; this frame is the input to the silver cleaning step.
df_silver_airlines = spark.read.table("aviation_project.bronze.airlines")

In [0]:
from pyspark.sql.functions import col, when, initcap, length

# AIRLINE_NAME: carriers matched case-insensitively on a substring of AIRLINE get a
# fixed display name; every other value is passed through initcap unchanged otherwise.
airline_name_expr = (
    when(col("AIRLINE").ilike("%American Airlines%"), "American Airlines")
    .when(col("AIRLINE").ilike("%United%"), "United Airlines")
    .when(col("AIRLINE").ilike("%Delta%"), "Delta Airlines")
    .otherwise(initcap(col("AIRLINE")))
)

# AIRLINE_CATEGORY: tier keyed on IATA_CODE; any code in neither list
# (including a null code) falls through to REGIONAL.
major_codes = ["AA", "UA", "DL", "WN"]
low_cost_codes = ["B6", "NK", "F9", "VX"]
airline_category_expr = (
    when(col("IATA_CODE").isin(major_codes), "MAJOR")
    .when(col("IATA_CODE").isin(low_cost_codes), "LOW_COST")
    .otherwise("REGIONAL")
)

# CODE_VALIDATION: VALID only when IATA_CODE is exactly two characters long;
# a null code does not satisfy the comparison and is therefore INVALID.
code_validation_expr = (
    when(length(col("IATA_CODE")) == 2, "VALID")
    .otherwise("INVALID")
)

# Append the three derived columns; the original bronze columns are kept as they are.
df_silver_airlines_clean = (
    df_silver_airlines
    .withColumn("AIRLINE_NAME", airline_name_expr)
    .withColumn("AIRLINE_CATEGORY", airline_category_expr)
    .withColumn("CODE_VALIDATION", code_validation_expr)
)


In [0]:
%sql
-- Register silver.airlines as a Delta table stored under Tables/airlines in the
-- silver container; IF NOT EXISTS skips the statement when it is already registered.
-- NOTE(review): no column list is declared, so the schema presumably has to come from
-- Delta data already present at this location. The write to this path happens in a
-- later cell — confirm this statement succeeds on a first run against an empty path.
CREATE TABLE IF NOT EXISTS aviation_project.silver.airlines
   USING DELTA
   LOCATION 'abfss://silver@revtraining.dfs.core.windows.net/Tables/airlines'

In [0]:
%python
df_silver_airlines_clean.write.mode("overwrite").option(
       "overwriteSchema", "true"
   ).format("delta").save("abfss://silver@revtraining.dfs.core.windows.net/Tables/airlines")

**Airports**

In [0]:
# Read the bronze airports table; this frame is the input to the silver cleaning step.
df_silver_airports = spark.read.table("aviation_project.bronze.airports")
 

In [0]:
# Print the column names and types of the bronze airports frame for inspection.
df_silver_airports.printSchema()

In [0]:
 # Transformations: basic cleaning only
from pyspark.sql.functions import col, initcap, upper, when

# A location is VALID only when both coordinates are inside their legal ranges and
# CITY is present; a null coordinate fails the range test and yields INVALID.
location_is_valid = (
    col("LATITUDE").between(-90, 90)
    & col("LONGITUDE").between(-180, 180)
    & col("CITY").isNotNull()
)

# Rows lacking any one of these columns are removed from the silver output.
has_required_fields = (
    col("IATA_CODE").isNotNull()
    & col("CITY").isNotNull()
    & col("STATE").isNotNull()
)

df_silver_airports_clean = (
    df_silver_airports
    # Normalise casing first: title-case CITY, upper-case STATE and COUNTRY.
    .withColumn("CITY", initcap(col("CITY")))
    .withColumn("STATE", upper(col("STATE")))
    .withColumn("COUNTRY", upper(col("COUNTRY")))
    # Flag each row using the already-normalised columns.
    .withColumn(
        "LOCATION_VALIDATION",
        when(location_is_valid, "VALID").otherwise("INVALID"),
    )
    .filter(has_required_fields)
)

In [0]:
# Render the cleaned airports frame in the notebook for a visual check.
display(df_silver_airports_clean)

In [0]:
%sql
-- Register silver.airports as a Delta table stored under Tables/airports in the
-- silver container; IF NOT EXISTS skips the statement when it is already registered.
-- NOTE(review): no column list is declared, so the schema presumably has to come from
-- Delta data already present at this location. The write to this path happens in a
-- later cell — confirm this statement succeeds on a first run against an empty path.
CREATE TABLE IF NOT EXISTS aviation_project.silver.airports
   USING DELTA
   LOCATION 'abfss://silver@revtraining.dfs.core.windows.net/Tables/airports'

In [0]:
%python
df_silver_airports_clean.write.mode("overwrite").option(
       "overwriteSchema", "true"
   ).format("delta").save("abfss://silver@revtraining.dfs.core.windows.net/Tables/airports")

In [0]:
%sql
-- Query every column of the registered silver airports table.
SELECT * FROM aviation_project.silver.airports

**Flights**

In [0]:
# Read the bronze flights table; this frame is the input to the silver cleaning step.
df_silver_flights = spark.read.table("aviation_project.bronze.flights")

In [0]:
from pyspark.sql.functions import (
    col, when, upper, coalesce, lit,
    current_timestamp
)

# Build the silver flights frame: derive delay/category/status columns, normalise
# the code columns to upper case, fill nulls in the delay-reason columns, stamp the
# load time and drop rows that lack an airline or either airport.
df_silver_flights_clean = (
    df_silver_flights

    # --- DERIVED FIELD (create first) ---
    # Sum of departure and arrival delay; the result is null when either input is null.
    .withColumn(
        "TOTAL_DELAY",
        col("DEPARTURE_DELAY") + col("ARRIVAL_DELAY")
    )

    # --- Distance Category ---
    # <500 is short, 500-1500 (inclusive) is medium, everything else is long.
    # Note that a null DISTANCE matches neither test and therefore lands in LONG_HAUL.
    .withColumn(
        "DISTANCE_CATEGORY",
        when(col("DISTANCE") < 500, "SHORT_HAUL")
        .when(col("DISTANCE").between(500, 1500), "MEDIUM_HAUL")
        .otherwise("LONG_HAUL")
    )

    # --- Day Type ---
    # Fix: the weekend codes were (1, 7), which is Spark's dayofweek() convention
    # (1 = Sunday, 7 = Saturday). DAY_OF_WEEK here is a source column, assumed to use
    # the BTS on-time-performance coding 1 = Monday ... 7 = Sunday, so Saturday and
    # Sunday are 6 and 7. TODO confirm the coding against the bronze table.
    .withColumn(
        "DAY_TYPE",
        when(col("DAY_OF_WEEK").isin(6, 7), "WEEKEND")
        .otherwise("WEEKDAY")
    )

    # --- Flight Status ---
    # Cancellation takes precedence over diversion; a row with neither flag set to 1
    # (including null flags) is reported as COMPLETED.
    .withColumn(
        "FLIGHT_STATUS",
        when(col("CANCELLED") == 1, "CANCELLED")
        .when(col("DIVERTED") == 1, "DIVERTED")
        .otherwise("COMPLETED")
    )

    # --- Weather Impact ---
    # Evaluated on the raw WEATHER_DELAY, before the null fill further down; a null
    # delay fails both comparisons and ends up as NO_WEATHER_IMPACT, same as 0 would.
    .withColumn(
        "WEATHER_IMPACT",
        when(col("WEATHER_DELAY") > 30, "WEATHER_HIGH")
        .when(col("WEATHER_DELAY") > 0, "WEATHER_LOW")
        .otherwise("NO_WEATHER_IMPACT")
    )

    # --- On-time Flag ---
    # 1 when the combined delay is zero or negative; a null TOTAL_DELAY yields 0.
    .withColumn(
        "ON_TIME_FLAG",
        when(col("TOTAL_DELAY") <= 0, lit(1))
        .otherwise(lit(0))
    )

    # --- UPPER CASE CLEANING ---
    .withColumn("AIRLINE", upper(col("AIRLINE")))
    .withColumn("ORIGIN_AIRPORT", upper(col("ORIGIN_AIRPORT")))
    .withColumn("DESTINATION_AIRPORT", upper(col("DESTINATION_AIRPORT")))

    # --- NULL HANDLING ---
    # Missing cancellation reason becomes the literal "NONE"; missing delay
    # components become 0.
    .withColumn("CANCELLATION_REASON", coalesce(col("CANCELLATION_REASON"), lit("NONE")))
    .withColumn("AIR_SYSTEM_DELAY", coalesce(col("AIR_SYSTEM_DELAY"), lit(0)))
    .withColumn("SECURITY_DELAY", coalesce(col("SECURITY_DELAY"), lit(0)))
    .withColumn("AIRLINE_DELAY", coalesce(col("AIRLINE_DELAY"), lit(0)))
    .withColumn("LATE_AIRCRAFT_DELAY", coalesce(col("LATE_AIRCRAFT_DELAY"), lit(0)))
    .withColumn("WEATHER_DELAY", coalesce(col("WEATHER_DELAY"), lit(0)))

    # --- LOAD TIMESTAMP ---
    .withColumn("LOAD_TIMESTAMP", current_timestamp())

    # --- DROP CRITICAL NULLS ---
    # upper() keeps nulls as nulls, so rows missing any of these keys are removed here.
    .dropna(subset=["AIRLINE", "ORIGIN_AIRPORT", "DESTINATION_AIRPORT"])
)


In [0]:
%sql
-- Register silver.flights as a Delta table stored under Tables/flights in the
-- silver container; IF NOT EXISTS skips the statement when it is already registered.
-- NOTE(review): no column list is declared, so the schema presumably has to come from
-- Delta data already present at this location. The write to this path happens in a
-- later cell — confirm this statement succeeds on a first run against an empty path.
CREATE TABLE IF NOT EXISTS aviation_project.silver.flights
   USING DELTA
   LOCATION 'abfss://silver@revtraining.dfs.core.windows.net/Tables/flights'

In [0]:
%python
df_silver_flights_clean.write.mode("overwrite").option(
       "overwriteSchema", "true"
   ).format("delta").save("abfss://silver@revtraining.dfs.core.windows.net/Tables/flights")