# Entity Decomposition

The `airlines` entity in the raw dataset contains two types of properties: properties that change daily or each time there is a new flight event (e.g. `fl_date`, `dep_delay`, `actual_elapsed_time`, etc.), and properties that are relatively static and either don't change or change slowly over months or years (e.g. `op_carrier_fl_num`, `origin_airport_id`, `dest_airport_id`, etc.).

In [None]:
%%bigquery
-- Peek at five arbitrary rows of the raw airlines table (no ordering is applied).
select *
from airline_raw.airlines
limit 5

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,fl_date,op_carrier_airline_id,tail_num,op_carrier_fl_num,origin_airport_id,origin_airport_seq_id,origin_city_market_id,origin,dest_airport_id,dest_airport_seq_id,...,cancelled,cancellation_code,crs_elapsed_time,actual_elapsed_time,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay,load_time
0,2018/8/1,20368,241NV,1776,11697,1169706,32467,FLL,12544,1254403,...,0,,119,114,,,,,,2024-01-22 02:20:30.385902+00:00
1,2018/8/2,20368,224NV,1776,11697,1169706,32467,FLL,12544,1254403,...,0,,119,123,,,,,,2024-01-22 02:20:30.385902+00:00
2,2018/8/3,20368,222NV,1276,11697,1169706,32467,FLL,12544,1254403,...,0,,119,112,,,,,,2024-01-22 02:20:30.385902+00:00
3,2018/8/4,20368,229NV,1776,11697,1169706,32467,FLL,12544,1254403,...,0,,119,111,,,,,,2024-01-22 02:20:30.385902+00:00
4,2018/8/5,20368,229NV,1776,11697,1169706,32467,FLL,12544,1254403,...,0,,119,113,,,,,,2024-01-22 02:20:30.385902+00:00


In [None]:
%%bigquery
-- Show the first five distinct combinations of the slowly-changing flight attributes.
with flight_attributes as (
  -- One row per distinct combination of carrier, flight number, origin and destination fields.
  select distinct
    op_carrier_airline_id,
    op_carrier_fl_num,
    origin_airport_id,
    origin_airport_seq_id,
    origin_city_market_id,
    origin,
    dest_airport_id,
    dest_airport_seq_id,
    dest_city_market_id,
    dest
  from airline_raw.airlines
)
select *
from flight_attributes
order by
  op_carrier_airline_id,
  op_carrier_fl_num,
  origin,
  dest
limit 5

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,op_carrier_airline_id,op_carrier_fl_num,origin_airport_id,origin_airport_seq_id,origin_city_market_id,origin,dest_airport_id,dest_airport_seq_id,dest_city_market_id,dest
0,19393,1,10821,1082106,30852,BWI,14635,1463502,31714,RSW
1,19393,1,11259,1125903,30194,DAL,12191,1219102,31453,HOU
2,19393,1,12191,1219102,31453,HOU,10821,1082106,30852,BWI
3,19393,1,12191,1219102,31453,HOU,11140,1114008,31140,CRP
4,19393,2,10140,1014005,30140,ABQ,11292,1129202,30325,DEN


In [None]:
%%bigquery
-- Compare the number of raw rows with the number of distinct
-- combinations of the slowly-changing flight attributes.
with flight_attributes as (
  select distinct
    op_carrier_airline_id,
    op_carrier_fl_num,
    origin_airport_id,
    origin_airport_seq_id,
    origin_city_market_id,
    origin,
    dest_airport_id,
    dest_airport_seq_id,
    dest_city_market_id,
    dest
  from airline_raw.airlines
)
select
  (select count(*) from airline_raw.airlines) as total_flight_count,
  (select count(*) from flight_attributes) as unique_flight_count

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,total_flight_count,unique_flight_count
0,701352,54540


# Primary Key

Before we can take apart the airlines table, we need to decide what its primary key should be. Note that BIRD didn't specify a PK for this table. We would ideally like to have a single field represent the PK so we don't have to concatenate multiple fields together to refer to a unique flight.

In [None]:
%%bigquery
-- Preview a candidate single-column flight identifier built by concatenating
-- carrier id, carrier flight number, origin and destination.
-- NOTE(review): the parts are joined without a delimiter, and BigQuery's CONCAT
-- returns NULL when any argument is NULL — confirm neither matters for this data.
select
  op_carrier_airline_id,
  op_carrier_fl_num,
  origin,
  dest,
  concat(op_carrier_airline_id, op_carrier_fl_num, origin, dest) as flight_number
from airline_raw.airlines
limit 5

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,op_carrier_airline_id,op_carrier_fl_num,origin,dest,flight_number
0,20368,1776,FLL,USA,203681776FLLUSA
1,20368,1776,FLL,USA,203681776FLLUSA
2,20368,1276,FLL,USA,203681276FLLUSA
3,20368,1776,FLL,USA,203681776FLLUSA
4,20368,1776,FLL,USA,203681776FLLUSA


In [None]:
%%bigquery
-- Count the distinct (carrier, flight number, origin, dest) combinations together
-- with their concatenated identifier.
with flight_keys as (
  select distinct
    op_carrier_airline_id,
    op_carrier_fl_num,
    origin,
    dest,
    concat(op_carrier_airline_id, op_carrier_fl_num, origin, dest) as flight_number
  from airline_raw.airlines
)
select count(*) as unique_flights
from flight_keys

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,unique_flights
0,54540


Create an airlines staging table with a flight number to be able to identify a unique flight by one field. Note that this will be an intermediate table from which we will create the final staging tables. We will follow our naming convention of lowercasing raw and intermediate table names.

In [1]:
%%bigquery
-- Intermediate staging table: every raw row plus a single-column flight
-- identifier (fl_num) and a data_source tag.
create or replace table airline_stg.airlines as
select
  -- Single-field flight identifier: carrier id + carrier flight number + origin + dest.
  concat(src.op_carrier_airline_id, src.op_carrier_fl_num, src.origin, src.dest) as fl_num,

  -- Slowly-changing flight attributes.
  src.op_carrier_airline_id,
  src.op_carrier_fl_num,
  src.origin_airport_id,
  src.origin_airport_seq_id,
  src.origin_city_market_id,
  src.origin,
  src.dest_airport_id,
  src.dest_airport_seq_id,
  src.dest_city_market_id,
  src.dest,

  -- Per-flight-event attributes.
  src.fl_date,
  src.tail_num,
  src.crs_dep_time,
  src.dep_time,
  src.dep_delay,
  src.dep_delay_new,
  src.arr_time,
  src.arr_delay,
  src.arr_delay_new,
  src.cancelled,
  src.cancellation_code,
  src.crs_elapsed_time,
  src.actual_elapsed_time,
  src.carrier_delay,
  src.weather_delay,
  src.nas_delay,
  src.security_delay,
  src.late_aircraft_delay,

  -- Lineage columns.
  'bird' as data_source,
  src.load_time
from airline_raw.airlines as src


Query is running:   0%|          |

In [None]:
%%bigquery
-- Peek at five arbitrary rows of the intermediate staging table.
select *
from airline_stg.airlines
limit 5

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,fl_num,op_carrier_airline_id,op_carrier_fl_num,origin_airport_id,origin_airport_seq_id,origin_city_market_id,origin,dest_airport_id,dest_airport_seq_id,dest_city_market_id,...,cancelled,cancellation_code,crs_elapsed_time,actual_elapsed_time,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay,load_time
0,20368639USASFB,20368,639,12544,1254403,32544,USA,14761,1476106,34761,...,0,,87,80,,,,,,2024-01-26 22:23:44.946037+00:00
1,20368639USASFB,20368,639,12544,1254403,32544,USA,14761,1476106,34761,...,0,,87,77,,,,,,2024-01-26 22:23:44.946037+00:00
2,20368639USASFB,20368,639,12544,1254403,32544,USA,14761,1476106,34761,...,0,,87,86,0.0,0.0,0.0,0.0,23.0,2024-01-26 22:23:44.946037+00:00
3,20368639USASFB,20368,639,12544,1254403,32544,USA,14761,1476106,34761,...,0,,87,81,0.0,47.0,0.0,0.0,20.0,2024-01-26 22:23:44.946037+00:00
4,20368639USASFB,20368,639,12544,1254403,32544,USA,14761,1476106,34761,...,0,,87,82,,,,,,2024-01-26 22:23:44.946037+00:00


# Flight

Now we are ready to create the Flight table which will have a record for each unique flight across time:

In [2]:
%%bigquery
-- Flight: the distinct slowly-changing attributes of each flight, keyed by fl_num.
-- NOTE(review): data_source and load_time take part in the DISTINCT, so staging rows
-- for the same fl_num with different load_time values would yield several Flight rows.
create or replace table airline_stg.Flight as
select distinct
  stg.fl_num,
  stg.op_carrier_airline_id,
  stg.op_carrier_fl_num,
  stg.origin_airport_id,
  stg.origin_airport_seq_id,
  stg.origin_city_market_id,
  stg.origin,
  stg.dest_airport_id,
  stg.dest_airport_seq_id,
  stg.dest_city_market_id,
  stg.dest,
  stg.data_source,
  stg.load_time
from airline_stg.airlines as stg


Query is running:   0%|          |

In [None]:
%%bigquery
-- Peek at five arbitrary rows of the Flight table.
select *
from airline_stg.Flight
limit 5

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,fl_num,op_carrier_airline_id,op_carrier_fl_num,origin_airport_id,origin_airport_seq_id,origin_city_market_id,origin,dest_airport_id,dest_airport_seq_id,dest_city_market_id,dest,load_time
0,203044256ABEDTW,20304,4256,10135,1013505,30135,ABE,11433,1143302,31295,DTW,2024-01-26 22:23:44.946037+00:00
1,203044248ABEDTW,20304,4248,10135,1013505,30135,ABE,11433,1143302,31295,DTW,2024-01-26 22:23:44.946037+00:00
2,203044250ABEDTW,20304,4250,10135,1013505,30135,ABE,11433,1143302,31295,DTW,2024-01-26 22:23:44.946037+00:00
3,203044249ABEDTW,20304,4249,10135,1013505,30135,ABE,11433,1143302,31295,DTW,2024-01-26 22:23:44.946037+00:00
4,203682125ABESFB,20368,2125,10135,1013505,30135,ABE,14761,1476106,34761,SFB,2024-01-26 22:23:44.946037+00:00


In [None]:
%%bigquery
-- Row count of Flight, for comparison with the distinct-flight count computed on the raw data.
select
  count(*) as unique_flights
from airline_stg.Flight

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,unique_flights
0,54540


# Flight_History

This table will have one record for each instance of a flight, i.e. each occurrence of a flight on a given date:

In [3]:
%%bigquery
-- Flight_History: the per-event attributes of every staging row; fl_num links back to Flight.
create or replace table airline_stg.Flight_History as
select
  -- Event identity.
  stg.fl_date,
  stg.fl_num,
  stg.tail_num,

  -- Departure and arrival.
  stg.crs_dep_time,
  stg.dep_time,
  stg.dep_delay,
  stg.dep_delay_new,
  stg.arr_time,
  stg.arr_delay,
  stg.arr_delay_new,

  -- Cancellation and elapsed time.
  stg.cancelled,
  stg.cancellation_code,
  stg.crs_elapsed_time,
  stg.actual_elapsed_time,

  -- Delay breakdown.
  stg.carrier_delay,
  stg.weather_delay,
  stg.nas_delay,
  stg.security_delay,
  stg.late_aircraft_delay,

  -- Lineage columns.
  stg.data_source,
  stg.load_time
from airline_stg.airlines as stg

Query is running:   0%|          |

In [4]:
%%bigquery
-- Peek at five arbitrary rows of the Flight_History table.
select *
from airline_stg.Flight_History
limit 5

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,fl_date,fl_num,tail_num,crs_dep_time,dep_time,dep_delay,dep_delay_new,arr_time,arr_delay,arr_delay_new,...,cancellation_code,crs_elapsed_time,actual_elapsed_time,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay,data_source,load_time
0,2018/8/2,203635018JFKDTW,N279PQ,1655,,,,,,,...,C,150,,,,,,,bird,2024-01-26 22:23:44.946037+00:00
1,2018/8/8,205006259IADDTW,N656CA,956,,,,,,,...,A,94,,,,,,,bird,2024-01-26 22:23:44.946037+00:00
2,2018/8/10,196872198SEAPDX,N624QX,2230,,,,,,,...,C,55,,,,,,,bird,2024-01-26 22:23:44.946037+00:00
3,2018/8/28,196872478STSPDX,N421QX,1830,,,,,,,...,A,98,,,,,,,bird,2024-01-26 22:23:44.946037+00:00
4,2018/8/17,199771551EWRPDX,,1915,,,,,,,...,B,356,,,,,,,bird,2024-01-26 22:23:44.946037+00:00


In [5]:
%%bigquery
-- Flight_History should have exactly as many rows as the staging table it was built from.
select
  (
    select count(*)
    from airline_stg.Flight_History
  ) as flight_history_count,
  (
    select count(*)
    from airline_stg.airlines
  ) as original_flight_count

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,flight_history_count,original_flight_count
0,701352,701352


# Primary Keys

In [6]:
%%bigquery
-- Declare fl_num as the primary key of Flight. The constraint is declared
-- "not enforced", so uniqueness still has to be checked with a query.
alter table airline_stg.Flight
add primary key (fl_num) not enforced;

Query is running:   0%|          |

In [7]:
%%bigquery
-- List any fl_num that appears more than once in Flight, worst offenders first;
-- an empty result means fl_num is unique.
select
  fl_num,
  count(*) as duplicate_records
from airline_stg.Flight
group by fl_num
having count(*) > 1
order by duplicate_records desc

Query is running:   0%|          |

Downloading: |          |

Unnamed: 0,fl_num,duplicate_records


In [8]:
%%bigquery
-- Declare (fl_date, fl_num) as the composite primary key of Flight_History.
-- The constraint is declared "not enforced", so uniqueness still has to be checked with a query.
alter table airline_stg.Flight_History
add primary key (fl_date, fl_num) not enforced;

Query is running:   0%|          |

In [9]:
%%bigquery
-- List any (fl_date, fl_num) pair that appears more than once in Flight_History;
-- an empty result means the composite key is unique.
select
  fl_date,
  fl_num,
  count(*) as duplicate_records
from airline_stg.Flight_History
group by
  fl_date,
  fl_num
having count(*) > 1


Query is running:   0%|          |

Downloading: |          |

Unnamed: 0,fl_date,fl_num,duplicate_records


# Foreign Keys

In [10]:
%%bigquery
-- Declare Flight_History.fl_num as a foreign key to Flight.fl_num. The constraint
-- is declared "not enforced", so orphans still have to be checked with a query.
alter table airline_stg.Flight_History
add foreign key (fl_num)
references airline_stg.Flight (fl_num) not enforced;

Query is running:   0%|          |

In [11]:
%%bigquery
-- Count Flight_History rows whose fl_num has no matching row in Flight.
-- NOT EXISTS is used instead of NOT IN: with NOT IN, a single NULL fl_num in Flight
-- makes the predicate evaluate to NULL for every row, so the check would report
-- 0 orphans regardless of the data. With NOT EXISTS, a Flight_History row whose
-- own fl_num is NULL is also counted, since it matches no Flight row.
select count(*) as orphan_records
from airline_stg.Flight_History as fh
where not exists (
  select 1
  from airline_stg.Flight as f
  where f.fl_num = fh.fl_num
)

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,orphan_records
0,0


In [13]:
%%bigquery
-- Declare Flight.op_carrier_airline_id as a foreign key to Air_Carrier.airline_id.
-- The constraint is declared "not enforced", so orphans still have to be checked with a query.
alter table airline_stg.Flight
add foreign key (op_carrier_airline_id)
references airline_stg.Air_Carrier (airline_id) not enforced;

Query is running:   0%|          |

In [None]:
%%bigquery
-- Count Flight rows whose op_carrier_airline_id has no matching row in Air_Carrier.
-- NOT EXISTS is used instead of NOT IN: with NOT IN, a single NULL airline_id in
-- Air_Carrier makes the predicate evaluate to NULL for every row, so the check would
-- report 0 orphans regardless of the data. With NOT EXISTS, a Flight row whose
-- op_carrier_airline_id is NULL is also counted, since it matches no carrier.
select count(*) as orphan_records
from airline_stg.Flight as f
where not exists (
  select 1
  from airline_stg.Air_Carrier as ac
  where ac.airline_id = f.op_carrier_airline_id
)

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,orphan_records
0,0


Note: We can't define and check the foreign keys `Flight.origin` and `Flight.dest` at this point because we have not yet created the combined `Airport` table. This work will be done as part of Project 3.

# Cleanup

Delete the intermediate staging table:

In [14]:
%%bigquery
-- Remove the intermediate staging table now that Flight and Flight_History exist.
-- IF EXISTS keeps this cleanup cell re-runnable: a second run is a no-op instead of an error.
drop table if exists airline_stg.airlines

Query is running:   0%|          |