In [0]:
 %sql
-- Gold-layer flight dimension with row versioning: each version of a flight
-- carries an effective_start / effective_end window and an is_current flag.
CREATE OR REPLACE TABLE aviation_project.gold.dim_flights (
    -- Surrogate key; always generated by the table, never supplied by writers.
    dim_flight_id BIGINT GENERATED ALWAYS AS IDENTITY,

    -- Flight attributes copied from the silver layer.
    flight_number STRING,
    tail_number STRING,
    origin_airport STRING,
    destination_airport STRING,
    departure_time INT,  -- NOTE(review): looks like HHMM clock time; confirm
    arrival_time INT,    -- NOTE(review): looks like HHMM clock time; confirm
    total_delay INT,
    flight_status STRING,

    -- Versioning columns. Loaders in this notebook write NULL into
    -- effective_end and 'Y' into is_current for the open version, and
    -- set is_current to 'N' when a version is closed.
    effective_start TIMESTAMP,
    effective_end TIMESTAMP,
    is_current STRING
)
USING DELTA;


In [0]:
%sql

-- Initial load: copy every row of silver.flights into the dimension as an
-- open version (effective_end NULL, is_current 'Y'). dim_flight_id is left
-- out of the column list because it is an identity column.
INSERT INTO aviation_project.gold.dim_flights (
    flight_number,
    tail_number,
    origin_airport,
    destination_airport,
    departure_time,
    arrival_time,
    total_delay,
    flight_status,
    effective_start,
    effective_end,
    is_current
)
SELECT
    flight_number,
    tail_number,
    origin_airport,
    destination_airport,
    departure_time,
    arrival_time,
    total_delay,
    flight_status,
    CURRENT_TIMESTAMP(),
    NULL,
    'Y'
FROM aviation_project.silver.flights;

In [0]:
%sql
-- Show the column names and data types of the dimension table.
DESCRIBE TABLE aviation_project.gold.dim_flights

In [0]:
%sql
-- Total number of rows (all versions) in the dimension table.
SELECT COUNT(*)
FROM aviation_project.gold.dim_flights

`IS DISTINCT FROM` returns TRUE if `tgt.departure_time` and `src.departure_time` are different, even when one of them is NULL (two NULLs are treated as equal).


In [0]:

%sql
-- SCD Type 2 merge from the silver stream into the flight dimension.
-- Business key: (flight_number, tail_number); only the open version
-- (is_current = 'Y') of a key is eligible to match.
--   * matched and a tracked attribute changed -> the open version is closed
--   * no open version for the key             -> a new open version is inserted
-- A changed key is only closed by this statement: a source row that matched
-- an open target row never reaches the NOT MATCHED branch, so its replacement
-- version is not inserted in the same pass.
MERGE INTO aviation_project.gold.dim_flights tgt
USING aviation_project.silver.flights_stream src
    ON  tgt.flight_number = src.flight_number
    AND tgt.tail_number   = src.tail_number
    AND tgt.is_current    = 'Y'

-- IS DISTINCT FROM is NULL-safe, so a change to or from NULL also counts.
WHEN MATCHED AND (
       tgt.departure_time IS DISTINCT FROM src.departure_time
    OR tgt.arrival_time   IS DISTINCT FROM src.arrival_time
    OR tgt.total_delay    IS DISTINCT FROM src.total_delay
    OR tgt.flight_status  IS DISTINCT FROM src.flight_status
) THEN
    UPDATE SET
        effective_end = CURRENT_TIMESTAMP(),
        is_current    = 'N'

WHEN NOT MATCHED THEN
    INSERT (
        flight_number,
        tail_number,
        origin_airport,
        destination_airport,
        departure_time,
        arrival_time,
        total_delay,
        flight_status,
        effective_start,
        effective_end,
        is_current
    )
    VALUES (
        src.flight_number,
        src.tail_number,
        src.origin_airport,
        src.destination_airport,
        src.departure_time,
        src.arrival_time,
        src.total_delay,
        src.flight_status,
        CURRENT_TIMESTAMP(),
        NULL,
        'Y'
    );


In [0]:
%sql
-- Inspect every stored version of the four flights used in the SCD Type 2
-- scenarios. flight_number is a STRING column, so it is compared against
-- string literals rather than relying on implicit string-to-number coercion.
-- Ordering by key and then effective_start lists each flight's versions in
-- a stable, chronological order.
SELECT *
FROM aviation_project.gold.dim_flights
WHERE flight_number IN ('1010', '3444', '1509', '5658')
ORDER BY flight_number, tail_number, effective_start

In [0]:
%sql
-- List only the open (current) version of each flight.
SELECT *
FROM aviation_project.gold.dim_flights
WHERE is_current = 'Y'

In [0]:
%skip
-- Browse the copy of the silver stream table.
SELECT *
FROM aviation_project.silver.flights_stream_copy

**Data Quality Checks:**
  - SCD Type 2 validation
  - Primary key uniqueness
  - Null value checks
  - Departure time before arrival time

# SCD Type 2 Validation
**Scenario 1 (INSERT):** 

- Example: flight_number (1010) is different from the existing records.
- Example: flight_number (3444) is the same, but the tail number (N918M2) is different.

**Scenario 2 (UPDATE):**
- Example (cancelled due to weather conditions): flight number 1509 (tail number N6716C) - Cancelled
- Example (delayed due to late arrival): flight number 5658 (tail number N719SK) - arrived 30 minutes late

In [0]:
%sql
-- Test fixture for the SCD Type 2 validation: appends four rows to the silver
-- stream table, which is the source of the MERGE into gold.dim_flights.
-- The VALUES tuples are positional and follow the column list order below.
INSERT INTO aviation_project.silver.flights_stream (
    YEAR,
    MONTH,
    DAY,
    DAY_OF_WEEK,
    AIRLINE,
    FLIGHT_NUMBER,
    TAIL_NUMBER,
    ORIGIN_AIRPORT,
    DESTINATION_AIRPORT,
    SCHEDULED_DEPARTURE,
    DEPARTURE_TIME,
    DEPARTURE_DELAY,
    TAXI_OUT,
    WHEELS_OFF,
    SCHEDULED_TIME,
    ELAPSED_TIME,
    AIR_TIME,
    DISTANCE,
    WHEELS_ON,
    TAXI_IN,
    SCHEDULED_ARRIVAL,
    ARRIVAL_TIME,
    ARRIVAL_DELAY,
    DIVERTED,
    CANCELLED,
    CANCELLATION_REASON,
    AIR_SYSTEM_DELAY,
    SECURITY_DELAY,
    AIRLINE_DELAY,
    LATE_AIRCRAFT_DELAY,
    WEATHER_DELAY,
    TOTAL_DELAY,
    DISTANCE_CATEGORY,
    DAY_TYPE,
    FLIGHT_STATUS,
    WEATHER_IMPACT,
    ON_TIME_FLAG,
    LOAD_TIMESTAMP
)
VALUES
-- Flight 1509 / N6716C: FLIGHT_STATUS 'CANCELLED', WEATHER_IMPACT 'Heavy Rain'
-- (the weather-cancellation example of Scenario 2).
(
    2015, 1, 10, 6, 'DL', 1509, 'N6716C', 'MCO', 'MSP',
    1619, 1608, -11, 17, 1625, 215, 206, 184, 1310,
    1829, 5, 1854, 1834, -20, 0, 0,
    'NONE', 0, 0, 0, 0, 0,
    -31, 'MEDIUM_HAUL', 'WEEKDAY', 'CANCELLED',
    'Heavy Rain', 1,
    TIMESTAMP '2025-12-29 09:29:32.794'
),
-- Flight 3444 with tail number N918M2
-- (the "same flight number, different tail number" example of Scenario 1).
(
    2015, 1, 10, 6, 'MQ', 3444, 'N918M2', 'BMI', 'DFW',
    1618, 1615, -3, 13, 1628, 137, 140, 109, 690,
    1817, 18, 1835, 1835, 0, 0, 0,
    'NONE', 0, 0, 0, 0, 0,
    -3, 'MEDIUM_HAUL', 'WEEKDAY', 'COMPLETED',
    'NO_WEATHER_IMPACT', 1,
    TIMESTAMP '2025-12-29 09:29:32.794'
),
-- Flight 5658 / N719SK: ARRIVAL_TIME 1731 with ARRIVAL_DELAY 30 and
-- FLIGHT_STATUS 'DELAYED' (the late-arrival example of Scenario 2).
(
    2015, 1, 10, 6, 'OO', 5658, 'N719SK', 'DEN', 'COS',
    1620, 1620, 0, 18, 1638, 41, 44, 17, 73,
    1655, 9, 1701, 1731, 30, 0, 0,
    'NONE', 0, 0, 0, 0, 0,
    3, 'SHORT_HAUL', 'WEEKDAY', 'DELAYED',
    'NO_WEATHER_IMPACT', 0,
    TIMESTAMP '2025-12-29 09:29:32.794'
),
-- Flight 1010 (the "different flight number" example of Scenario 1).
(
    2015, 1, 10, 6, 'DL', 1010, 'N6716C', 'MCO', 'MSP',
    1619, 1608, -11, 17, 1625, 215, 206, 184, 1310,
    1829, 5, 1854, 1834, -20, 0, 0,
    'NONE', 0, 0, 0, 0, 0,
    -31, 'MEDIUM_HAUL', 'WEEKDAY', 'CANCELLED',
    'NO_WEATHER_IMPACT', 1,
    TIMESTAMP '2025-12-29 09:29:32.794'
);


**Question** - Is it good practice to repeat the same code in multiple cells, as we have done here with the MERGE statement?

In [0]:

%sql
-- SCD Type 2 merge from the silver stream into the flight dimension.
-- Business key: (flight_number, tail_number); only the open version
-- (is_current = 'Y') of a key is eligible to match.
--   * matched and a tracked attribute changed -> the open version is closed
--   * no open version for the key             -> a new open version is inserted
-- A changed key is only closed by this statement: a source row that matched
-- an open target row never reaches the NOT MATCHED branch, so its replacement
-- version is not inserted in the same pass.
MERGE INTO aviation_project.gold.dim_flights tgt
USING aviation_project.silver.flights_stream src
    ON  tgt.flight_number = src.flight_number
    AND tgt.tail_number   = src.tail_number
    AND tgt.is_current    = 'Y'

-- IS DISTINCT FROM is NULL-safe, so a change to or from NULL also counts.
WHEN MATCHED AND (
       tgt.departure_time IS DISTINCT FROM src.departure_time
    OR tgt.arrival_time   IS DISTINCT FROM src.arrival_time
    OR tgt.total_delay    IS DISTINCT FROM src.total_delay
    OR tgt.flight_status  IS DISTINCT FROM src.flight_status
) THEN
    UPDATE SET
        effective_end = CURRENT_TIMESTAMP(),
        is_current    = 'N'

WHEN NOT MATCHED THEN
    INSERT (
        flight_number,
        tail_number,
        origin_airport,
        destination_airport,
        departure_time,
        arrival_time,
        total_delay,
        flight_status,
        effective_start,
        effective_end,
        is_current
    )
    VALUES (
        src.flight_number,
        src.tail_number,
        src.origin_airport,
        src.destination_airport,
        src.departure_time,
        src.arrival_time,
        src.total_delay,
        src.flight_status,
        CURRENT_TIMESTAMP(),
        NULL,
        'Y'
    );


In [0]:

%skip
-- SCD Type 2 merge from the silver stream into the flight dimension.
-- Business key: (flight_number, tail_number); only the open version
-- (is_current = 'Y') of a key is eligible to match.
--   * matched and a tracked attribute changed -> the open version is closed
--   * no open version for the key             -> a new open version is inserted
-- Fixes relative to the previous draft of this cell: the dangling comma after
-- the last UPDATE SET assignment (a syntax error) and the duplicated
-- flight_number predicate in the ON clause are removed.
MERGE INTO aviation_project.gold.dim_flights tgt
USING aviation_project.silver.flights_stream src
    ON  tgt.flight_number = src.flight_number
    AND tgt.tail_number   = src.tail_number
    AND tgt.is_current    = 'Y'

-- IS DISTINCT FROM is NULL-safe, so a change to or from NULL also counts.
WHEN MATCHED AND (
       tgt.departure_time IS DISTINCT FROM src.departure_time
    OR tgt.arrival_time   IS DISTINCT FROM src.arrival_time
    OR tgt.total_delay    IS DISTINCT FROM src.total_delay
    OR tgt.flight_status  IS DISTINCT FROM src.flight_status
) THEN
    UPDATE SET
        effective_end = CURRENT_TIMESTAMP(),
        is_current    = 'N'

WHEN NOT MATCHED THEN
    INSERT (
        flight_number,
        tail_number,
        origin_airport,
        destination_airport,
        departure_time,
        arrival_time,
        total_delay,
        flight_status,
        effective_start,
        effective_end,
        is_current
    )
    VALUES (
        src.flight_number,
        src.tail_number,
        src.origin_airport,
        src.destination_airport,
        src.departure_time,
        src.arrival_time,
        src.total_delay,
        src.flight_status,
        CURRENT_TIMESTAMP(),
        NULL,
        'Y'
    );


The INSERT section of the MERGE statement is working fine, as shown below.

However, the UPDATE section only updates the existing record; we still need to insert the latest record with `is_current = 'Y'` and `effective_start` set to the current timestamp.

In [0]:
%sql
-- Inspect every stored version of the four flights used in the SCD Type 2
-- scenarios. flight_number is a STRING column, so it is compared against
-- string literals rather than relying on implicit string-to-number coercion.
-- Ordering by key and then effective_start lists each flight's versions in
-- a stable, chronological order.
SELECT *
FROM aviation_project.gold.dim_flights
WHERE flight_number IN ('1010', '3444', '1509', '5658')
ORDER BY flight_number, tail_number, effective_start

In the INSERT statement below, we have skipped dim_flight_id since it will be auto-generated incrementally.

In [0]:
%sql
-- Second half of the SCD Type 2 update: for every business key
-- (flight_number, tail_number) that has a closed version but no open one,
-- insert the source row as the new open version.
-- dim_flight_id is omitted because it is an identity column.
--
-- Both guards are correlated on the full business key. Checking
-- flight_number alone would let an open row for a different tail number
-- block the insert, and NOT EXISTS (unlike NOT IN) is not defeated by a
-- NULL flight_number among the open rows.
INSERT INTO aviation_project.gold.dim_flights (
    flight_number,
    tail_number,
    origin_airport,
    destination_airport,
    departure_time,
    arrival_time,
    total_delay,
    flight_status,
    effective_start,
    effective_end,
    is_current
)
-- DISTINCT collapses identical source rows so each yields one new version.
SELECT DISTINCT
    src.flight_number,
    src.tail_number,
    src.origin_airport,
    src.destination_airport,
    src.departure_time,
    src.arrival_time,
    src.total_delay,
    src.flight_status,
    CURRENT_TIMESTAMP(),
    NULL,
    'Y'
FROM aviation_project.silver.flights_stream src
WHERE EXISTS (
        -- the key has at least one closed version ...
        SELECT 1
        FROM aviation_project.gold.dim_flights closed_ver
        WHERE closed_ver.flight_number = src.flight_number
          AND closed_ver.tail_number   = src.tail_number
          AND closed_ver.is_current    = 'N'
    )
  AND NOT EXISTS (
        -- ... and no open version yet
        SELECT 1
        FROM aviation_project.gold.dim_flights open_ver
        WHERE open_ver.flight_number = src.flight_number
          AND open_ver.tail_number   = src.tail_number
          AND open_ver.is_current    = 'Y'
    )

In [0]:
%sql
-- Inspect every stored version of the four flights used in the SCD Type 2
-- scenarios. flight_number is a STRING column, so it is compared against
-- string literals rather than relying on implicit string-to-number coercion.
-- Ordering by key and then effective_start lists each flight's versions in
-- a stable, chronological order.
SELECT *
FROM aviation_project.gold.dim_flights
WHERE flight_number IN ('1010', '3444', '1509', '5658')
ORDER BY flight_number, tail_number, effective_start

In [0]:
%sql
-- SCD Type 2 consistency check: each business key (flight_number, tail_number)
-- may have at most one open version. Any row returned is a violation.
-- is_current is a STRING flag written as 'Y' / 'N', so it is compared to 'Y'
-- rather than to a boolean. The grouping uses the same key as the MERGE;
-- grouping by flight_number alone would report the same flight number
-- operated with two tail numbers as a duplicate.
SELECT
    flight_number,
    tail_number,
    COUNT(*) AS current_cnt
FROM aviation_project.gold.dim_flights
WHERE is_current = 'Y'
GROUP BY
    flight_number,
    tail_number
HAVING COUNT(*) > 1;


In [0]:
%sql
-- Show the column names and data types of the dimension table.
DESCRIBE TABLE aviation_project.gold.dim_flights

In [0]:
%sql
-- Null values check: one output row with the total row count and, per column,
-- how many rows hold NULL. `col IS NULL` is always TRUE or FALSE, so casting
-- it to INT yields 1 or 0 and the SUM counts the NULLs.
SELECT
    COUNT(*)                                     AS total_records,
    SUM(CAST(dim_flight_id       IS NULL AS INT)) AS dim_flight_id_nulls,
    SUM(CAST(flight_number       IS NULL AS INT)) AS flight_number_nulls,
    SUM(CAST(tail_number         IS NULL AS INT)) AS tail_number_nulls,
    SUM(CAST(origin_airport      IS NULL AS INT)) AS origin_airport_nulls,
    SUM(CAST(destination_airport IS NULL AS INT)) AS destination_airport_nulls,
    SUM(CAST(departure_time      IS NULL AS INT)) AS departure_time_nulls,
    SUM(CAST(arrival_time        IS NULL AS INT)) AS arrival_time_nulls,
    SUM(CAST(total_delay         IS NULL AS INT)) AS total_delay_nulls,
    SUM(CAST(flight_status       IS NULL AS INT)) AS flight_status_nulls,
    SUM(CAST(effective_start     IS NULL AS INT)) AS effective_start_nulls,
    SUM(CAST(effective_end       IS NULL AS INT)) AS effective_end_nulls,
    SUM(CAST(is_current          IS NULL AS INT)) AS is_current_nulls
FROM aviation_project.gold.dim_flights;


In [0]:
%sql

-- Primary key uniqueness check: lists every dim_flight_id value that occurs
-- on more than one row, together with its occurrence count.
-- An empty result means the surrogate key is unique.
SELECT
    COUNT(dim_flight_id) AS cnt,
    dim_flight_id
FROM aviation_project.gold.dim_flights
GROUP BY dim_flight_id
HAVING COUNT(dim_flight_id) > 1

In [0]:
%sql
-- Departure time must be before arrival time: returns the rows that break
-- this rule (arrival at or before departure). Rows where either value is
-- NULL are not returned.
-- NOTE(review): both columns are INT; if they hold HHMM clock times, a flight
-- that lands after midnight would also be listed here. Confirm the encoding.
SELECT *
FROM aviation_project.gold.dim_flights
WHERE arrival_time <= departure_time;
