In [0]:
%python
# Load the raw airports CSV from the bronze container into a DataFrame.
# The first line of the file is treated as the header, and column types are
# inferred from the data instead of being declared up front.
df_airports = (
    spark.read
    .options(header="true", inferSchema="true")
    .csv("abfss://bronze@revtraining.dfs.core.windows.net/dataset/airports.csv")
)

In [0]:
%sql
-- Register the bronze airports table over the Delta files at the storage path
-- below. IF NOT EXISTS makes the cell safe to re-run.
-- NOTE(review): no column list is declared here, so the schema presumably comes
-- from Delta data already present at the location - confirm the path has been
-- written before this cell runs in a fresh environment.
CREATE TABLE IF NOT EXISTS aviation_project.bronze.airports
USING DELTA
LOCATION 'abfss://bronze@revtraining.dfs.core.windows.net/Tables/airports'
;

In [0]:
%python
# Replace the Delta data behind the bronze airports table with the freshly read
# CSV contents. overwriteSchema lets the stored schema be replaced as well, so
# a change in the inferred column types does not block the overwrite.
(
    df_airports.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .save("abfss://bronze@revtraining.dfs.core.windows.net/Tables/airports")
)

In [0]:
%sql
-- Quick preview of the loaded table. There is no ORDER BY, so which two rows
-- come back is not guaranteed.
SELECT *
FROM aviation_project.bronze.airports
LIMIT 2

**Data Quality checks**
  - Null
  - Duplicate
  - Length of IATA code for airports (must be 3 characters)
  - Data Type of Columns

Null

In [0]:
 %sql

-- NULL profile of the bronze airports table.
-- Each expression contributes 1 for a row where the column is NULL and 0
-- otherwise, so every *_nulls output is the number of rows missing that value.
SELECT
    SUM(CASE WHEN IATA_CODE IS NOT NULL THEN 0 ELSE 1 END) AS IATA_nulls,
    SUM(CASE WHEN Airport   IS NOT NULL THEN 0 ELSE 1 END) AS Airport_nulls,
    SUM(CASE WHEN city      IS NOT NULL THEN 0 ELSE 1 END) AS city_nulls,
    SUM(CASE WHEN state     IS NOT NULL THEN 0 ELSE 1 END) AS state_nulls,
    SUM(CASE WHEN country   IS NOT NULL THEN 0 ELSE 1 END) AS country_nulls,
    SUM(CASE WHEN latitude  IS NOT NULL THEN 0 ELSE 1 END) AS latitude_nulls,
    SUM(CASE WHEN longitude IS NOT NULL THEN 0 ELSE 1 END) AS longitude_nulls
FROM aviation_project.bronze.airports;


In [0]:
%sql
-- View exposing every bronze airport row that has a NULL in at least one of
-- the profiled columns, so incomplete records can be inspected and fixed.
-- iata_code is part of the filter so the view covers the same seven columns
-- as the per-column NULL count; without it, a row missing only its IATA code
-- would never show up here.
-- SELECT * is kept deliberately so the view's column list is unchanged.
CREATE OR REPLACE VIEW aviation_project.bronze.airports_nulls AS
SELECT *
FROM aviation_project.bronze.airports
WHERE iata_code IS NULL
   OR airport   IS NULL
   OR city      IS NULL
   OR state     IS NULL
   OR country   IS NULL
   OR latitude  IS NULL
   OR longitude IS NULL

Duplicate records

In [0]:
%sql

-- Column-level duplicate check.
-- For each row, count how many rows share the same value in iata_code,
-- airport, latitude and longitude (each column checked independently), then
-- keep the rows where any of those counts is greater than 1.
-- COUNT(col) skips NULLs, so a row whose value is NULL gets a count of 0 for
-- that column and is not flagged by it.
WITH value_counts AS (
    SELECT
        iata_code,
        COUNT(iata_code) OVER (PARTITION BY iata_code) AS iata_code_dup,
        airport,
        COUNT(airport)   OVER (PARTITION BY airport)   AS airport_dup,
        latitude,
        COUNT(latitude)  OVER (PARTITION BY latitude)  AS latitude_dup,
        longitude,
        COUNT(longitude) OVER (PARTITION BY longitude) AS longitude_dup
    FROM aviation_project.bronze.airports
)
SELECT *
FROM value_counts
WHERE iata_code_dup > 1
   OR airport_dup   > 1
   OR latitude_dup  > 1
   OR longitude_dup > 1


In [0]:
%sql

-- Row-level duplicate check.
-- Group on all seven listed columns and report every combination of values
-- that occurs more than once, together with how many times it occurs.
SELECT
    COUNT(*) AS dup_count,
    iata_code,
    airport,
    city,
    state,
    country,
    latitude,
    longitude
FROM aviation_project.bronze.airports
GROUP BY
    iata_code,
    airport,
    city,
    state,
    country,
    latitude,
    longitude
HAVING COUNT(*) > 1

Length of IATA code for airports, which must be 3 letters

In [0]:
%sql

-- Return the airports whose IATA code is not exactly three characters long.
-- A NULL code makes the comparison evaluate to NULL rather than TRUE, so such
-- rows are not returned here.
SELECT *
FROM aviation_project.bronze.airports
WHERE LENGTH(IATA_CODE) <> 3

Data Type

In [0]:
%sql

-- Show the column metadata (name and data type) of the bronze airports table
-- so the inferred types can be reviewed.
DESCRIBE TABLE
    aviation_project.bronze.airports

**Data Fixes Required:**
  - Fill the null values in longitude and latitude