In [0]:
# Input for the silver transformation: the airlines table from the bronze schema.
df_silver_airlines = spark.read.table("aviation_project.bronze.airlines")

In [0]:
from pyspark.sql.functions import col, when, initcap, length, current_date

# Silver-layer enrichment of the airlines frame. Each derived column is built
# as a named expression first, then attached in a fixed order:
# AIRLINE_NAME, AIRLINE_CATEGORY, CODE_VALIDATION, RECORD_DATE.
airline = col("AIRLINE")
iata_code = col("IATA_CODE")

# Standardised carrier name: case-insensitive pattern matches collapse known
# spelling variants to one label; every other name is title-cased as-is.
airline_name_expr = (
    when(airline.ilike("%American Airlines%"), "American Airlines")
    .when(airline.ilike("%United%"), "United Airlines")
    .when(airline.ilike("%Delta%"), "Delta Airlines")
    .otherwise(initcap(airline))
)

# Carrier segment keyed on the IATA code; any code not listed in either group
# falls through to REGIONAL.
airline_category_expr = (
    when(iata_code.isin("AA", "UA", "DL", "WN"), "MAJOR")
    .when(iata_code.isin("B6", "NK", "F9", "VX"), "LOW_COST")
    .otherwise("REGIONAL")
)

# A code is VALID only when it is exactly two characters long; everything else
# (including a NULL code, for which the comparison is not true) is INVALID.
code_validation_expr = when(length(iata_code) == 2, "VALID").otherwise("INVALID")

df_silver_airlines_clean = (
    df_silver_airlines
    .withColumn("AIRLINE_NAME", airline_name_expr)
    .withColumn("AIRLINE_CATEGORY", airline_category_expr)
    .withColumn("CODE_VALIDATION", code_validation_expr)
    # Stamp every row with the date on which this transformation ran.
    .withColumn("RECORD_DATE", current_date())
)

In [0]:
%sql
-- Register the silver airlines table (only if it is not already registered)
-- as a Delta table backed by the given storage location. No column list is
-- declared here.
CREATE TABLE IF NOT EXISTS aviation_project.silver.airlines
USING DELTA
LOCATION 'abfss://silver@revtraining.dfs.core.windows.net/Tables/airlines'

In [0]:
%python
df_silver_airlines_clean.write.mode("overwrite").option(
       "overwriteSchema", "true"
   ).format("delta").save("abfss://silver@revtraining.dfs.core.windows.net/Tables/airlines")

In [0]:
%sql
-- Preview: show two rows of the silver airlines table.
SELECT *
FROM aviation_project.silver.airlines
LIMIT 2

**Data Quality Checks:**
 - Null values in each column
 - Duplicate records
 - IATA code character length for airlines (expected: 2 characters)
 - Column data types

In [0]:
 
%sql

-- Null audit: a single result row with one counter per table column, each
-- counting the rows in which that column IS NULL.
-- Every counter has its own distinct alias so that the result columns can be
-- told apart (the record_date counter is labelled record_date_nulls).

SELECT 
  SUM(CASE WHEN IATA_CODE IS NULL THEN 1 ELSE 0 END) AS IATA_nulls,
  SUM(CASE WHEN Airline IS NULL THEN 1 ELSE 0 END) AS Airline_nulls,
  SUM(CASE WHEN Airline_name IS NULL THEN 1 ELSE 0 END) AS Airline_name_nulls,
  SUM(CASE WHEN Airline_category IS NULL THEN 1 ELSE 0 END) AS Airline_category_nulls,
  SUM(CASE WHEN code_validation IS NULL THEN 1 ELSE 0 END) AS code_validation_nulls,
  SUM(CASE WHEN record_date IS NULL THEN 1 ELSE 0 END) AS record_date_nulls

FROM aviation_project.silver.airlines;

In [0]:
%sql
-- Exact-duplicate check: group on every column of the table and keep only the
-- combinations that occur more than once, together with their row count.
SELECT
  count(*),
  IATA_CODE,
  AIRLINE,
  AIRLINE_NAME,
  AIRLINE_CATEGORY,
  CODE_VALIDATION,
  RECORD_DATE
FROM aviation_project.silver.airlines
GROUP BY
  IATA_CODE,
  AIRLINE,
  AIRLINE_NAME,
  AIRLINE_CATEGORY,
  CODE_VALIDATION,
  RECORD_DATE
HAVING count(*) > 1

In [0]:
%sql
-- IATA length check: list every row whose code is not exactly two characters.
-- Rows with a NULL IATA_CODE are not returned here (the comparison evaluates
-- to NULL, not true); those are counted by the null audit query instead.
SELECT *
FROM aviation_project.silver.airlines
WHERE length(IATA_CODE) <> 2

In [0]:
%sql
-- Data type check: list the columns of the silver airlines table with their types.
DESCRIBE TABLE aviation_project.silver.airlines