In [0]:
# Source for the silver build: the airports table registered in the bronze schema.
df_silver_airports = spark.read.table("aviation_project.bronze.airports")
 

**Transformations and data quality fixes**

In [0]:

from pyspark.sql.functions import col, initcap, upper, when

# Data-quality fix: set LONGITUDE / LATITUDE for the airports UST, PBG and ECP.
#
# This cell must execute on "Run All": it holds the only import of
# col / initcap / upper / when and the only definition of
# `df_silver_airports_dq`, both of which the following cells use. It was
# previously marked %skip, which made those cells fail with NameError on a
# fresh session.
#
# NOTE(review): the overrides are applied to every row with a matching
# IATA_CODE, whether or not the existing coordinate is null.
df_silver_airports_dq = (
    df_silver_airports
    .withColumn(
        "LONGITUDE",
        when(col("IATA_CODE") == "UST", -81.339722)
        .when(col("IATA_CODE") == "PBG", -73.468139)
        .when(col("IATA_CODE") == "ECP", -85.795611)
        # All other airports keep the longitude loaded from bronze.
        .otherwise(col("LONGITUDE")),
    )
    .withColumn(
        "LATITUDE",
        when(col("IATA_CODE") == "UST", 29.959250)
        .when(col("IATA_CODE") == "PBG", 44.650944)
        .when(col("IATA_CODE") == "ECP", 30.358250)
        # All other airports keep the latitude loaded from bronze.
        .otherwise(col("LATITUDE")),
    )
)

In [0]:
 # Transformations: basic cleaning only


# A row's location is "VALID" when a city is present and both coordinates lie
# inside their geographic bounds; every other row is tagged "INVALID".
has_valid_location = (
    col("CITY").isNotNull()
    & col("LATITUDE").between(-90, 90)
    & col("LONGITUDE").between(-180, 180)
)

# Rows are kept only when IATA_CODE, CITY and STATE are all populated.
has_required_fields = (
    col("IATA_CODE").isNotNull()
    & col("CITY").isNotNull()
    & col("STATE").isNotNull()
)

df_silver_airports_clean = (
    df_silver_airports_dq
    # Normalise text casing: title-case city, upper-case state and country.
    .withColumn("CITY", initcap(col("CITY")))
    .withColumn("STATE", upper(col("STATE")))
    .withColumn("COUNTRY", upper(col("COUNTRY")))
    .withColumn(
        "LOCATION_VALIDATION",
        when(has_valid_location, "VALID").otherwise("INVALID"),
    )
    .where(has_required_fields)
)

In [0]:
# US states (plus DC) grouped by U.S. Census Bureau region. Previously only a
# handful of states were listed per region, so airports in any other US state
# (for example CO, AZ or NC) fell through to "INTERNATIONAL". The 15 states that
# were already listed keep the region they had before.
WEST_STATES = [
    "AK", "AZ", "CA", "CO", "HI", "ID", "MT", "NV", "NM", "OR", "UT", "WA", "WY",
]
NORTHEAST_STATES = [
    "CT", "ME", "MA", "NH", "NJ", "NY", "PA", "RI", "VT",
]
SOUTH_STATES = [
    "AL", "AR", "DC", "DE", "FL", "GA", "KY", "LA", "MD", "MS", "NC", "OK", "SC",
    "TN", "TX", "VA", "WV",
]
MIDWEST_STATES = [
    "IA", "IL", "IN", "KS", "MI", "MN", "MO", "ND", "NE", "OH", "SD", "WI",
]

df_enriched = (
    df_silver_airports_clean
    # ---------------- AIRPORT REGION ----------------
    # Any STATE value outside the four lists is still labelled "INTERNATIONAL".
    # NOTE(review): US territory codes (e.g. PR, GU, VI), if present, are not in
    # the lists and therefore also land in "INTERNATIONAL" - confirm that is intended.
    .withColumn(
        "AIRPORT_REGION",
        when(col("STATE").isin(*WEST_STATES), "WEST")
        .when(col("STATE").isin(*NORTHEAST_STATES), "NORTHEAST")
        .when(col("STATE").isin(*SOUTH_STATES), "SOUTH")
        .when(col("STATE").isin(*MIDWEST_STATES), "MIDWEST")
        .otherwise("INTERNATIONAL")
    )

    # ---------------- AIRPORT CONTINENT ----------------
    # Continent is derived from COUNTRY; codes not listed are labelled "OTHER".
    .withColumn(
        "AIRPORT_CONTINENT",
        when(col("COUNTRY").isin("USA", "CAN", "MEX"), "NORTH_AMERICA")
        .when(col("COUNTRY").isin("IND", "CHN", "JPN", "SGP"), "ASIA")
        .when(col("COUNTRY").isin("FRA", "DEU", "ESP", "ITA", "NLD"), "EUROPE")
        .otherwise("OTHER")
    )

    # ---------------- AIRPORT SIZE ----------------
    # NOTE(review): size is derived from STATE / COUNTRY only, not from any
    # traffic or capacity measure - confirm this proxy is what consumers expect.
    .withColumn(
        "AIRPORT_SIZE",
        when(col("STATE") == "CA", "LARGE")
        .when(col("STATE").isin("NY", "TX", "FL", "IL"), "MEDIUM")
        .when(col("COUNTRY") != "USA", "INTERNATIONAL")
        .otherwise("REGIONAL")
    )
)


In [0]:
%sql
-- Register the silver airports Delta table at its external storage location
-- (no-op when the table already exists).
CREATE TABLE IF NOT EXISTS aviation_project.silver.airports
USING DELTA
LOCATION 'abfss://silver@revtraining.dfs.core.windows.net/Tables/airports'

In [0]:
%python
df_enriched.write.mode("overwrite").option(
       "overwriteSchema", "true"
   ).format("delta").save("abfss://silver@revtraining.dfs.core.windows.net/Tables/airports")

In [0]:
%sql
-- Quick preview of the written table.
SELECT *
FROM aviation_project.silver.airports
LIMIT 2

**Data Quality checks:**
  - Null Records
  - IATA_CODE length (each airport code must be exactly 3 characters)
  - Duplicates

In [0]:
 
%sql

-- Count NULL values per column of the silver airports table.

SELECT
  SUM(IF(IATA_CODE IS NULL, 1, 0))           AS IATA_nulls,
  SUM(IF(Airport IS NULL, 1, 0))             AS Airport_nulls,
  SUM(IF(city IS NULL, 1, 0))                AS city_nulls,
  SUM(IF(state IS NULL, 1, 0))               AS state_nulls,
  SUM(IF(country IS NULL, 1, 0))             AS country_nulls,
  SUM(IF(latitude IS NULL, 1, 0))            AS latitude_nulls,
  SUM(IF(longitude IS NULL, 1, 0))           AS longitude_nulls,
  SUM(IF(location_validation IS NULL, 1, 0)) AS location_validation_nulls,
  SUM(IF(airport_region IS NULL, 1, 0))      AS airport_region_nulls,
  SUM(IF(airport_continent IS NULL, 1, 0))   AS airport_continent_nulls,
  SUM(IF(airport_size IS NULL, 1, 0))        AS airport_size_nulls
FROM aviation_project.silver.airports;


In [0]:
%sql

-- Airport IATA codes must be exactly 3 characters; list every row that is not.

SELECT *
FROM aviation_project.silver.airports
WHERE LENGTH(IATA_CODE) <> 3

In [0]:
%skip
-- Ad-hoc inspection: rows whose location_validation flag is 'INVALID'.
-- NOTE(review): this cell carries %skip and has no %sql magic, so it presumably does not run on "Run All" - confirm whether it should be enabled as a %sql check.
select * from aviation_project.silver.airports where location_validation = 'INVALID'

In [0]:
%sql

-- Duplicate check: list every combination of the source columns that occurs
-- more than once, with its occurrence count.

SELECT
  COUNT(*) AS dup_count,
  iata_code,
  airport,
  city,
  state,
  country,
  latitude,
  longitude
FROM aviation_project.silver.airports
GROUP BY
  iata_code,
  airport,
  city,
  state,
  country,
  latitude,
  longitude
HAVING COUNT(*) > 1