# Entity Decomposition

The `airlines` entity in the raw dataset contains two types of properties: those that change daily or with each new flight event (e.g. `fl_date`, `dep_delay`, `actual_elapsed_time`) and those that are relatively static, changing slowly over months or years if at all (e.g. `op_carrier_fl_num`, `origin_airport_id`, `dest_airport_id`).

In [None]:
%%bigquery
-- Peek at five raw rows to see which columns the source table carries.
select src.*
from airline_raw.airlines as src
limit 5

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,fl_date,op_carrier_airline_id,tail_num,op_carrier_fl_num,origin_airport_id,origin_airport_seq_id,origin_city_market_id,origin,dest_airport_id,dest_airport_seq_id,...,cancelled,cancellation_code,crs_elapsed_time,actual_elapsed_time,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay,load_time
0,2018/8/1,20368,241NV,1776,11697,1169706,32467,FLL,12544,1254403,...,0,,119,114,,,,,,2024-01-22 02:20:30.385902+00:00
1,2018/8/2,20368,224NV,1776,11697,1169706,32467,FLL,12544,1254403,...,0,,119,123,,,,,,2024-01-22 02:20:30.385902+00:00
2,2018/8/3,20368,222NV,1276,11697,1169706,32467,FLL,12544,1254403,...,0,,119,112,,,,,,2024-01-22 02:20:30.385902+00:00
3,2018/8/4,20368,229NV,1776,11697,1169706,32467,FLL,12544,1254403,...,0,,119,111,,,,,,2024-01-22 02:20:30.385902+00:00
4,2018/8/5,20368,229NV,1776,11697,1169706,32467,FLL,12544,1254403,...,0,,119,113,,,,,,2024-01-22 02:20:30.385902+00:00


In [None]:
%%bigquery
-- Sample the distinct combinations of the slowly changing attributes
-- (carrier, flight number, origin and destination identifiers).
with route_attributes as (
  select distinct
    op_carrier_airline_id,
    op_carrier_fl_num,
    origin_airport_id,
    origin_airport_seq_id,
    origin_city_market_id,
    origin,
    dest_airport_id,
    dest_airport_seq_id,
    dest_city_market_id,
    dest
  from airline_raw.airlines
)
select *
from route_attributes
order by
  op_carrier_airline_id,
  op_carrier_fl_num,
  origin,
  dest
limit 5

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,op_carrier_airline_id,op_carrier_fl_num,origin_airport_id,origin_airport_seq_id,origin_city_market_id,origin,dest_airport_id,dest_airport_seq_id,dest_city_market_id,dest
0,19393,1,10821,1082106,30852,BWI,14635,1463502,31714,RSW
1,19393,1,11259,1125903,30194,DAL,12191,1219102,31453,HOU
2,19393,1,12191,1219102,31453,HOU,10821,1082106,30852,BWI
3,19393,1,12191,1219102,31453,HOU,11140,1114008,31140,CRP
4,19393,2,10140,1014005,30140,ABQ,11292,1129202,30325,DEN


In [None]:
%%bigquery
-- Compare the number of raw flight events with the number of distinct
-- combinations of the slowly changing attributes.
with distinct_flights as (
  select distinct
    op_carrier_airline_id,
    op_carrier_fl_num,
    origin_airport_id,
    origin_airport_seq_id,
    origin_city_market_id,
    origin,
    dest_airport_id,
    dest_airport_seq_id,
    dest_city_market_id,
    dest
  from airline_raw.airlines
)
select
  (select count(*) from airline_raw.airlines) as total_flight_count,
  (select count(*) from distinct_flights) as unique_flight_count

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,total_flight_count,unique_flight_count
0,701352,54540


# Primary Key

Before we can take apart the airlines table, we need to decide what its primary key should be. Note that BIRD didn't specify a PK for this table. We would ideally like to have a single field represent the PK so we don't have to concatenate multiple fields together to refer to a unique flight.

In [None]:
%%bigquery
-- Try out a single-column flight identifier built by concatenating
-- carrier id, carrier flight number, origin and destination.
select
  op_carrier_airline_id,
  op_carrier_fl_num,
  origin,
  dest,
  concat(op_carrier_airline_id, op_carrier_fl_num, origin, dest) as flight_number
from airline_raw.airlines
limit 5

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,op_carrier_airline_id,op_carrier_fl_num,origin,dest,flight_number
0,20368,1776,FLL,USA,203681776FLLUSA
1,20368,1776,FLL,USA,203681776FLLUSA
2,20368,1276,FLL,USA,203681276FLLUSA
3,20368,1776,FLL,USA,203681776FLLUSA
4,20368,1776,FLL,USA,203681776FLLUSA


In [None]:
%%bigquery
-- Count the distinct (carrier, flight number, origin, dest) combinations
-- together with their concatenated identifier.
with candidate_keys as (
  select distinct
    op_carrier_airline_id,
    op_carrier_fl_num,
    origin,
    dest,
    concat(op_carrier_airline_id, op_carrier_fl_num, origin, dest) as flight_number
  from airline_raw.airlines
)
select count(*) as unique_flights
from candidate_keys

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,unique_flights
0,54540


Create an airlines staging table with a flight number to be able to identify a unique flight by one field. Note that this will be an intermediate table from which we will create the final staging tables. We will follow our naming convention of lowercasing raw and intermediate table names.

In [20]:
%%bigquery
-- Intermediate staging copy of the raw table with a derived single-column
-- flight identifier (fl_num) and a constant data_source tag.
-- NOTE(review): fl_num is concatenated without a delimiter; it stays
-- unambiguous only if the id and airport codes are fixed-width — TODO confirm.
create or replace table airline_stg.airlines as
  select
    concat(src.op_carrier_airline_id, src.op_carrier_fl_num, src.origin, src.dest) as fl_num,
    -- slowly changing flight attributes
    src.op_carrier_airline_id,
    src.op_carrier_fl_num,
    src.origin_airport_id,
    src.origin_airport_seq_id,
    src.origin_city_market_id,
    src.origin,
    src.dest_airport_id,
    src.dest_airport_seq_id,
    src.dest_city_market_id,
    src.dest,
    -- per-event attributes
    src.fl_date,
    src.tail_num,
    src.crs_dep_time,
    src.dep_time,
    src.dep_delay,
    src.dep_delay_new,
    src.arr_time,
    src.arr_delay,
    src.arr_delay_new,
    src.cancelled,
    src.cancellation_code,
    src.crs_elapsed_time,
    src.actual_elapsed_time,
    src.carrier_delay,
    src.weather_delay,
    src.nas_delay,
    src.security_delay,
    src.late_aircraft_delay,
    -- lineage columns
    'bird' as data_source,
    src.load_time
  from airline_raw.airlines as src


Query is running:   0%|          |

In [3]:
%%bigquery
-- Spot-check five rows of the intermediate staging table.
select stg.*
from airline_stg.airlines as stg
limit 5

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,fl_num,op_carrier_airline_id,op_carrier_fl_num,origin_airport_id,origin_airport_seq_id,origin_city_market_id,origin,dest_airport_id,dest_airport_seq_id,dest_city_market_id,...,cancellation_code,crs_elapsed_time,actual_elapsed_time,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay,data_source,load_time
0,19790572ABEATL,19790,572,10135,1013505,30135,ABE,10397,1039707,30397,...,,121,125,,,,,,bird,2024-01-26 22:23:44.946037+00:00
1,19790572ABEATL,19790,572,10135,1013505,30135,ABE,10397,1039707,30397,...,,121,115,,,,,,bird,2024-01-26 22:23:44.946037+00:00
2,19790572ABEATL,19790,572,10135,1013505,30135,ABE,10397,1039707,30397,...,,121,121,,,,,,bird,2024-01-26 22:23:44.946037+00:00
3,19790572ABEATL,19790,572,10135,1013505,30135,ABE,10397,1039707,30397,...,,121,118,,,,,,bird,2024-01-26 22:23:44.946037+00:00
4,19790572ABEATL,19790,572,10135,1013505,30135,ABE,10397,1039707,30397,...,,121,112,,,,,,bird,2024-01-26 22:23:44.946037+00:00


# Flight

Now we are ready to create the Flight table which will have only one record per flight (regardless of flight schedule). We want the destination table to only contain usable fields, so we are going to exclude the fields `origin_airport_id`, `origin_airport_seq_id`, `origin_city_market_id`,
`dest_airport_id`, `dest_airport_seq_id`,
`dest_city_market_id` from the destination table as those identifiers don't appear in any of the parent tables.

In [4]:
%%bigquery
-- Build the Flight table from the staging rows, keeping only the flight-level
-- attributes. Grouping on those attributes (rather than `select distinct`
-- over every column, load_time included) prevents a flight from being emitted
-- more than once when its staging rows carry different load_time values;
-- the earliest load_time is kept for each flight.
create or replace table airline_stg.Flight as
  select fl_num,
    op_carrier_airline_id,
    op_carrier_fl_num,
    origin as origin_airport,
    dest as dest_airport,
    data_source,
    min(load_time) as load_time
  from airline_stg.airlines
  group by fl_num,
    op_carrier_airline_id,
    op_carrier_fl_num,
    origin,
    dest,
    data_source


Query is running:   0%|          |

In [5]:
%%bigquery
-- Spot-check five rows of the Flight table.
select f.*
from airline_stg.Flight as f
limit 5

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,fl_num,op_carrier_airline_id,op_carrier_fl_num,origin_airport,dest_airport,data_source,load_time
0,202374645ORDABE,20237,4645,ORD,ABE,bird,2024-01-26 22:23:44.946037+00:00
1,19790949ATLABE,19790,949,ATL,ABE,bird,2024-01-26 22:23:44.946037+00:00
2,197901794ATLABE,19790,1794,ATL,ABE,bird,2024-01-26 22:23:44.946037+00:00
3,200464852ORDABE,20046,4852,ORD,ABE,bird,2024-01-26 22:23:44.946037+00:00
4,200464830ORDABE,20046,4830,ORD,ABE,bird,2024-01-26 22:23:44.946037+00:00


In [6]:
%%bigquery
-- Count the rows in Flight.
select count(*) as unique_flights
from airline_stg.Flight as f

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,unique_flights
0,54540


# Flight_History

This table will track the flights over time and have a record per scheduled flight.

Before creating the destination table, we want to check and see if there are any fields that only contain null values. We will exclude any fields which can't be derived and are completely null.

In [23]:
%%bigquery
-- Number of staging rows with a dep_time value; count(col) skips nulls.
select count(dep_time) as dep_time_not_null
from airline_stg.airlines

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,dep_time_not_null
0,687789


In [24]:
%%bigquery
-- Number of staging rows with a dep_delay value; count(col) skips nulls.
select count(dep_delay) as dep_delay_not_null
from airline_stg.airlines

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,dep_delay_not_null
0,687153


In [26]:
%%bigquery
-- Number of staging rows with a dep_delay_new value; count(col) skips nulls.
select count(dep_delay_new) as dep_delay_new_not_null
from airline_stg.airlines

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,dep_delay_new_not_null
0,687153


In [27]:
%%bigquery
-- Number of staging rows with a cancelled value; count(col) skips nulls.
select count(cancelled) as cancelled_not_null
from airline_stg.airlines

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,cancelled_not_null
0,701352


In [28]:
%%bigquery
-- Number of staging rows with a cancellation_code value; count(col) skips nulls.
select count(cancellation_code) as cancellation_code_not_null
from airline_stg.airlines

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,cancellation_code_not_null
0,14270


In [29]:
%%bigquery
-- Number of staging rows with a crs_elapsed_time value; count(col) skips nulls.
select count(crs_elapsed_time) as crs_elapsed_time_not_null
from airline_stg.airlines

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,crs_elapsed_time_not_null
0,701352


In [30]:
%%bigquery
-- Number of staging rows with an actual_elapsed_time value; count(col) skips nulls.
select count(actual_elapsed_time) as actual_elapsed_time_not_null
from airline_stg.airlines

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,actual_elapsed_time_not_null
0,684838


In [31]:
%%bigquery
-- Number of staging rows with a carrier_delay value; count(col) skips nulls.
select count(carrier_delay) as carrier_delay_not_null
from airline_stg.airlines

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,carrier_delay_not_null
0,157214


In [32]:
%%bigquery
-- Number of staging rows with a weather_delay value; count(col) skips nulls.
select count(weather_delay) as weather_delay_not_null
from airline_stg.airlines

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,weather_delay_not_null
0,157214


In [33]:
%%bigquery
-- Number of staging rows with a nas_delay value; count(col) skips nulls.
select count(nas_delay) as nas_delay_not_null
from airline_stg.airlines

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,nas_delay_not_null
0,157214


In [34]:
%%bigquery
-- Number of staging rows with a security_delay value; count(col) skips nulls.
select count(security_delay) as security_delay_not_null
from airline_stg.airlines

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,security_delay_not_null
0,157214


In [35]:
%%bigquery
-- Number of staging rows with a late_aircraft_delay value; count(col) skips nulls.
select count(late_aircraft_delay) as late_aircraft_delay_not_null
from airline_stg.airlines

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,late_aircraft_delay_not_null
0,157214


None of the checked fields is entirely null, so we will preserve all of them in the destination table.

In [7]:
%%bigquery
-- Build Flight_History: every staging row is carried over, keyed by
-- fl_date + fl_num, with the per-event measurements and lineage columns.
create or replace table airline_stg.Flight_History as
  select
    stg.fl_date,
    stg.fl_num,
    stg.tail_num,
    -- departure
    stg.crs_dep_time,
    stg.dep_time,
    stg.dep_delay,
    stg.dep_delay_new,
    -- arrival
    stg.arr_time,
    stg.arr_delay,
    stg.arr_delay_new,
    -- cancellation
    stg.cancelled,
    stg.cancellation_code,
    -- durations
    stg.crs_elapsed_time,
    stg.actual_elapsed_time,
    -- delay breakdown
    stg.carrier_delay,
    stg.weather_delay,
    stg.nas_delay,
    stg.security_delay,
    stg.late_aircraft_delay,
    -- lineage
    stg.data_source,
    stg.load_time
  from airline_stg.airlines as stg

Query is running:   0%|          |

In [8]:
%%bigquery
-- Spot-check five rows of the Flight_History table.
select h.*
from airline_stg.Flight_History as h
limit 5

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,fl_date,fl_num,tail_num,crs_dep_time,dep_time,dep_delay,dep_delay_new,arr_time,arr_delay,arr_delay_new,...,cancellation_code,crs_elapsed_time,actual_elapsed_time,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay,data_source,load_time
0,2018/8/21,202374675ABEORD,,1735,,,,,,,...,C,135,,,,,,,bird,2024-01-26 22:23:44.946037+00:00
1,2018/8/17,197902155DTWEWR,N946AT,2002,,,,,,,...,A,110,,,,,,,bird,2024-01-26 22:23:44.946037+00:00
2,2018/8/6,203983623DTWLGA,N834AE,1844,,,,,,,...,A,120,,,,,,,bird,2024-01-26 22:23:44.946037+00:00
3,2018/8/29,203983888DTWLGA,N840AE,1737,,,,,,,...,A,113,,,,,,,bird,2024-01-26 22:23:44.946037+00:00
4,2018/8/13,204523410DTWEWR,N733YX,1315,,,,,,,...,C,106,,,,,,,bird,2024-01-26 22:23:44.946037+00:00


In [9]:
%%bigquery
-- Put the Flight_History row count next to the staging row count so the two
-- can be compared in a single result row.
select
  h.flight_history_count,
  s.original_flight_count
from (
  select count(*) as flight_history_count
  from airline_stg.Flight_History
) as h
cross join (
  select count(*) as original_flight_count
  from airline_stg.airlines
) as s

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,flight_history_count,original_flight_count
0,701352,701352


# Primary Keys

In [10]:
%%bigquery
-- Declare fl_num as the primary key of Flight. The constraint is declared
-- `not enforced`, so uniqueness is checked by a separate query.
alter table airline_stg.Flight add primary key (fl_num) not enforced;

Query is running:   0%|          |

In [11]:
%%bigquery
-- List every fl_num that occurs more than once in Flight, worst offenders
-- first. An empty result means fl_num is unique.
select
  fl_num,
  count(*) as duplicate_records
from airline_stg.Flight
group by fl_num
having duplicate_records > 1
order by duplicate_records desc

Query is running:   0%|          |

Downloading: |          |

Unnamed: 0,fl_num,duplicate_records


In [12]:
%%bigquery
-- Declare (fl_date, fl_num) as the composite primary key of Flight_History.
-- The constraint is declared `not enforced`, so uniqueness is checked by a
-- separate query.
alter table airline_stg.Flight_History add primary key (fl_date, fl_num) not enforced;

Query is running:   0%|          |

In [13]:
%%bigquery
-- List every (fl_date, fl_num) pair that occurs more than once in
-- Flight_History. An empty result means the composite key is unique.
select
  fl_date,
  fl_num,
  count(*) as duplicate_records
from airline_stg.Flight_History
group by
  fl_date,
  fl_num
having duplicate_records > 1


Query is running:   0%|          |

Downloading: |          |

Unnamed: 0,fl_date,fl_num,duplicate_records


# Foreign Keys

In [14]:
%%bigquery
-- Declare Flight_History.fl_num as a foreign key to Flight.fl_num.
-- The constraint gets an explicit name so it can be found and dropped later,
-- and `if not exists` lets this cell be re-run without failing or adding a
-- second copy. It is declared `not enforced`, so orphans are checked by a
-- separate query.
alter table airline_stg.Flight_History
  add constraint if not exists flight_history_fl_num_fk foreign key (fl_num)
  references airline_stg.Flight (fl_num) not enforced;

Query is running:   0%|          |

In [15]:
%%bigquery
-- Count Flight_History rows whose fl_num has no matching row in Flight.
-- `not exists` replaces `not in (subquery)`: with `not in`, a single null
-- fl_num in Flight makes the predicate unknown for every non-matching row,
-- so the check would report 0 orphans regardless of the data.
-- Rows whose own fl_num is null have no match and are counted as orphans.
select count(*) as orphan_records
from airline_stg.Flight_History as h
where not exists (
  select 1
  from airline_stg.Flight as f
  where f.fl_num = h.fl_num
)

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,orphan_records
0,0


In [16]:
%%bigquery
-- Declare Flight.op_carrier_airline_id as a foreign key to
-- Air_Carrier.airline_id. The constraint gets an explicit name so it can be
-- found and dropped later, and `if not exists` lets this cell be re-run
-- without failing or adding a second copy. It is declared `not enforced`, so
-- orphans are checked by a separate query.
alter table airline_stg.Flight
  add constraint if not exists flight_op_carrier_airline_id_fk foreign key (op_carrier_airline_id)
  references airline_stg.Air_Carrier (airline_id) not enforced;

Query is running:   0%|          |

In [17]:
%%bigquery
-- Count Flight rows whose op_carrier_airline_id has no matching row in
-- Air_Carrier. `not exists` replaces `not in (subquery)`: with `not in`, a
-- single null airline_id in Air_Carrier makes the predicate unknown for every
-- non-matching row, so the check would report 0 orphans regardless of the data.
-- Rows whose own op_carrier_airline_id is null have no match and are counted
-- as orphans.
select count(*) as orphan_records
from airline_stg.Flight as f
where not exists (
  select 1
  from airline_stg.Air_Carrier as c
  where c.airline_id = f.op_carrier_airline_id
)

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,orphan_records
0,0


Note: We can't define and check the foreign keys on `Flight.origin_airport` and `Flight.dest_airport` at this point because we have not yet created the combined `Airport` table. This work will be done as part of Project 3.

# Cleanup

Delete the intermediate staging table:

In [18]:
%%bigquery
-- Remove the intermediate staging table now that Flight and Flight_History
-- have been built from it. `if exists` keeps the cell re-runnable after the
-- table has already been dropped.
drop table if exists airline_stg.airlines

Query is running:   0%|          |