consumer airfare report:
https://data.transportation.gov/Aviation/Consumer-Airfare-Report-Table-2-Top-1-000-City-Pai/wqw2-rjgd/about_data

on time monthly data:
https://www.transtats.bts.gov/Fields.asp?gnoyr_VQ=FGK

airline codes:
https://www.bansard.com/sites/default/files/download_documents/Bansard-airlines-codes-IATA-ICAO.xlsx&ved=2ahUKEwj2zPThgpSIAxWLITQIHdmMGsEQFnoECAwQAQ&usg=AOvVaw1t6UbPkojIZrVMuOTDghg9

In [1]:
import sys
import os

# Get the project root directory
PROJECT_ROOT = os.path.abspath(os.path.join(os.pardir))

# Add the project root to sys.path
sys.path.append(PROJECT_ROOT)

# Now you can import the config module
from config.config import DATA_PATH

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from urllib.request import urlretrieve
import holidays
import re


In [3]:
pd.set_option('display.max_columns', None)

# Import Flight Performance Data

In [4]:
import pandas as pd
import os

def read_and_combine_csv(base_path, data_folder):
    """
    Reads multiple CSV files from a specified directory and combines them into a single DataFrame.

    Args:
    base_path (str): The base directory path where data folders are stored.
    data_folder (str): The folder name containing the CSV files to read.

    Returns:
    DataFrame: A combined DataFrame containing data from all CSV files in the specified directory.
    """
    # Construct the full path to the folder containing CSV files
    folder_path = os.path.join(base_path, data_folder)
    
    # Initialize an empty list to store individual DataFrames
    dataframes_list = []
    
    # Iterate through each file in the directory
    for filename in os.listdir(folder_path):
        if filename.endswith('.csv'):
            file_path = os.path.join(folder_path, filename)
            df = pd.read_csv(file_path, low_memory=False)
            dataframes_list.append(df)
    
    # Concatenate all DataFrames in the list into one DataFrame
    combined_df = pd.concat(dataframes_list, ignore_index=True)
    
    return combined_df

In [5]:
# Assume DATA_PATH is already imported from your config
flight_data = '2023-performance-data'
df_2023 = read_and_combine_csv(DATA_PATH+'/raw', flight_data)

# Check the combined DataFrame
print(df_2023.head())

   Year  Quarter  Month  DayofMonth  DayOfWeek  FlightDate  \
0  2023        3      8           6          7  2023-08-06   
1  2023        3      8           7          1  2023-08-07   
2  2023        3      8           9          3  2023-08-09   
3  2023        3      8          10          4  2023-08-10   
4  2023        3      8          12          6  2023-08-12   

  Marketing_Airline_Network Operated_or_Branded_Code_Share_Partners  \
0                        NK                                      NK   
1                        NK                                      NK   
2                        NK                                      NK   
3                        NK                                      NK   
4                        NK                                      NK   

   DOT_ID_Marketing_Airline IATA_Code_Marketing_Airline  \
0                     20416                          NK   
1                     20416                          NK   
2                     2

In [6]:
df_codes = pd.read_csv(DATA_PATH + '/raw/L_AIRLINE_ID.csv')

## Inspect Data Quality

In [7]:
df_2023.shape

(7278739, 120)

In [8]:
df_2023.head()

Unnamed: 0,Year,Quarter,Month,DayofMonth,DayOfWeek,FlightDate,Marketing_Airline_Network,Operated_or_Branded_Code_Share_Partners,DOT_ID_Marketing_Airline,IATA_Code_Marketing_Airline,Flight_Number_Marketing_Airline,Originally_Scheduled_Code_Share_Airline,DOT_ID_Originally_Scheduled_Code_Share_Airline,IATA_Code_Originally_Scheduled_Code_Share_Airline,Flight_Num_Originally_Scheduled_Code_Share_Airline,Operating_Airline,DOT_ID_Operating_Airline,IATA_Code_Operating_Airline,Tail_Number,Flight_Number_Operating_Airline,OriginAirportID,OriginAirportSeqID,OriginCityMarketID,Origin,OriginCityName,OriginState,OriginStateFips,OriginStateName,OriginWac,DestAirportID,DestAirportSeqID,DestCityMarketID,Dest,DestCityName,DestState,DestStateFips,DestStateName,DestWac,CRSDepTime,DepTime,DepDelay,DepDelayMinutes,DepDel15,DepartureDelayGroups,DepTimeBlk,TaxiOut,WheelsOff,WheelsOn,TaxiIn,CRSArrTime,ArrTime,ArrDelay,ArrDelayMinutes,ArrDel15,ArrivalDelayGroups,ArrTimeBlk,Cancelled,CancellationCode,Diverted,CRSElapsedTime,ActualElapsedTime,AirTime,Flights,Distance,DistanceGroup,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,FirstDepTime,TotalAddGTime,LongestAddGTime,DivAirportLandings,DivReachedDest,DivActualElapsedTime,DivArrDelay,DivDistance,Div1Airport,Div1AirportID,Div1AirportSeqID,Div1WheelsOn,Div1TotalGTime,Div1LongestGTime,Div1WheelsOff,Div1TailNum,Div2Airport,Div2AirportID,Div2AirportSeqID,Div2WheelsOn,Div2TotalGTime,Div2LongestGTime,Div2WheelsOff,Div2TailNum,Div3Airport,Div3AirportID,Div3AirportSeqID,Div3WheelsOn,Div3TotalGTime,Div3LongestGTime,Div3WheelsOff,Div3TailNum,Div4Airport,Div4AirportID,Div4AirportSeqID,Div4WheelsOn,Div4TotalGTime,Div4LongestGTime,Div4WheelsOff,Div4TailNum,Div5Airport,Div5AirportID,Div5AirportSeqID,Div5WheelsOn,Div5TotalGTime,Div5LongestGTime,Div5WheelsOff,Div5TailNum,Duplicate,Unnamed: 119
0,2023,3,8,6,7,2023-08-06,NK,NK,20416,NK,2252,,,,,NK,20416,NK,N978NK,2252,12889,1288904,32211,LAS,"Las Vegas, NV",NV,32,Nevada,85,11057,1105703,31057,CLT,"Charlotte, NC",NC,37,North Carolina,36,2359,,,,,,2300-2359,,,,,722,,,,,,0700-0759,1.0,B,0.0,263.0,,,1.0,1916.0,8,,,,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,N,
1,2023,3,8,7,1,2023-08-07,NK,NK,20416,NK,2252,,,,,NK,20416,NK,N974NK,2252,12889,1288904,32211,LAS,"Las Vegas, NV",NV,32,Nevada,85,11057,1105703,31057,CLT,"Charlotte, NC",NC,37,North Carolina,36,2359,115.0,76.0,76.0,1.0,5.0,2300-2359,19.0,134.0,822.0,9.0,722,831.0,69.0,69.0,1.0,4.0,0700-0759,0.0,,0.0,263.0,256.0,228.0,1.0,1916.0,8,3.0,0.0,1.0,0.0,65.0,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,N,
2,2023,3,8,9,3,2023-08-09,NK,NK,20416,NK,2252,,,,,NK,20416,NK,N519NK,2252,12889,1288904,32211,LAS,"Las Vegas, NV",NV,32,Nevada,85,11057,1105703,31057,CLT,"Charlotte, NC",NC,37,North Carolina,36,2310,2259.0,-11.0,0.0,0.0,-1.0,2300-2359,14.0,2313.0,605.0,10.0,628,615.0,-13.0,0.0,0.0,-1.0,0600-0659,0.0,,0.0,258.0,256.0,232.0,1.0,1916.0,8,,,,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,N,
3,2023,3,8,10,4,2023-08-10,NK,NK,20416,NK,2252,,,,,NK,20416,NK,N532NK,2252,12889,1288904,32211,LAS,"Las Vegas, NV",NV,32,Nevada,85,11057,1105703,31057,CLT,"Charlotte, NC",NC,37,North Carolina,36,2310,2302.0,-8.0,0.0,0.0,-1.0,2300-2359,12.0,2314.0,551.0,7.0,628,558.0,-30.0,0.0,0.0,-2.0,0600-0659,0.0,,0.0,258.0,236.0,217.0,1.0,1916.0,8,,,,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,N,
4,2023,3,8,12,6,2023-08-12,NK,NK,20416,NK,2252,,,,,NK,20416,NK,N529NK,2252,12889,1288904,32211,LAS,"Las Vegas, NV",NV,32,Nevada,85,11057,1105703,31057,CLT,"Charlotte, NC",NC,37,North Carolina,36,2310,2314.0,4.0,4.0,0.0,0.0,2300-2359,18.0,2332.0,618.0,6.0,628,624.0,-4.0,0.0,0.0,-1.0,0600-0659,0.0,,0.0,258.0,250.0,226.0,1.0,1916.0,8,,,,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,N,


In [9]:
df_2023.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7278739 entries, 0 to 7278738
Columns: 120 entries, Year to Unnamed: 119
dtypes: float64(71), int64(22), object(27)
memory usage: 6.5+ GB


In [10]:
df_2023.columns = df_2023.columns.str.strip()

## Missing Values

In [11]:
null_vals = df_2023.isna().sum().reset_index()
null_vals.columns = ['column', 'num_missing']
null_vals['perc_missing'] = round(null_vals['num_missing'] / len(df_2023) *100, 3)
missing_over_50 = null_vals[null_vals['perc_missing'] > 50].sort_values('perc_missing', ascending=False)
print(missing_over_50.shape)
missing_over_50

(58, 3)


Unnamed: 0,column,num_missing,perc_missing
119,Unnamed: 119,7278739,100.0
106,Div4TotalGTime,7278739,100.0
94,Div3Airport,7278737,100.0
95,Div3AirportID,7278737,100.0
96,Div3AirportSeqID,7278737,100.0
97,Div3WheelsOn,7278737,100.0
98,Div3TotalGTime,7278737,100.0
99,Div3LongestGTime,7278737,100.0
100,Div3WheelsOff,7278738,100.0
101,Div3TailNum,7278738,100.0


Fifty-eight columns are missing over 50% of their values, 53 of which are missing over 90% of their values. Most of these columns are dealing with details about diverted flights. There is a column, `Diverted`, which captures whether or not a flight was diverted and it does not contain any missing values. At this point, it will be best to remove the columns with over 90% of their values missing. 

The different delay types (`LateAircraftDelay`, `SecurityDelay`, `NASDelay`, `WeatherDelay`, `CarrierDelay`) may come in handy as I would expect most flights do not experience delays. These different types of delays may inform some segmentation of delayed flights later on. 

In [12]:
# drop columns with over 80% missing values
cols_to_drop = null_vals[null_vals['perc_missing'] >= 80]['column'].tolist()

df = df_2023.drop(columns=cols_to_drop)

In [13]:
missing_under_50 = null_vals[(null_vals['perc_missing'] <= 50) & (null_vals['perc_missing'] > 0)].sort_values('perc_missing', ascending=False)
print(missing_under_50.shape)
missing_under_50


(17, 3)


Unnamed: 0,column,num_missing,perc_missing
61,AirTime,111695,1.535
60,ActualElapsedTime,111695,1.535
54,ArrivalDelayGroups,111695,1.535
53,ArrDel15,111695,1.535
52,ArrDelayMinutes,111695,1.535
51,ArrDelay,111695,1.535
48,TaxiIn,95647,1.314
50,ArrTime,95646,1.314
47,WheelsOn,95647,1.314
46,WheelsOff,93119,1.279


Seventeen columns have less than 2% of their values missing. 

Earlier I deleted rows with missing values because of the % being insignificant. However, some of these missing observations likely overlap with the 11,032 cancelled flights. Others are just missing informaiton for some reason. I found this out because when I wanted to see the counts of cancelled vs not cancelled flights, I only saw details for not cancelled flights. 

It would make sense that cancelled flights would have missing information in other columns. I want to identify which flights were cancelled and how many there were. 

In [14]:
df['Cancelled'].value_counts()

Cancelled
0.0    7184842
1.0      93897
Name: count, dtype: int64

In [15]:
cancelled_flights = df['Cancelled'] == 1.0
print('Total number of cancelled flights: ', len(df[cancelled_flights]))

Total number of cancelled flights:  93897


In [16]:
not_cancelled = df[~cancelled_flights]
not_cancelled_missing = not_cancelled.isna().sum().reset_index()
not_cancelled_missing.columns = ['Column', 'Count_Missing']
not_cancelled_missing[not_cancelled_missing['Count_Missing'] > 0]

Unnamed: 0,Column,Count_Missing
43,WheelsOn,1750
44,TaxiIn,1750
46,ArrTime,1749
47,ArrDelay,17798
48,ArrDelayMinutes,17798
49,ArrDel15,17798
50,ArrivalDelayGroups,17798
54,CRSElapsedTime,1
55,ActualElapsedTime,17798
56,AirTime,17798


There were over 93,000 cancelled flights in 2023. However, some of the data is still missing information for several columns even after excluding the cancelled flights. 

I still maintain that the different delay types (`LateAircraftDelay`, `SecurityDelay`, `NASDelay`, `WeatherDelay`, `CarrierDelay`) may come in handy as I would expect most flights do not experience delays.

However, I believe I should remove observations that are missing data in the other columns. 

In [17]:
missing_critical_data = ((df['Cancelled'] == 0) & (df['ArrTime'].isna())) | ((df['Cancelled'] == 0) & (df['ArrDelayMinutes'].isna()))
print('Total non-cancelled flights with missing data: ', missing_critical_data.sum())

Total non-cancelled flights with missing data:  17798


In [18]:
# drop rows missing less than 3% of values
df = df[~missing_critical_data]

In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7260941 entries, 0 to 7278738
Data columns (total 67 columns):
 #   Column                                   Dtype  
---  ------                                   -----  
 0   Year                                     int64  
 1   Quarter                                  int64  
 2   Month                                    int64  
 3   DayofMonth                               int64  
 4   DayOfWeek                                int64  
 5   FlightDate                               object 
 6   Marketing_Airline_Network                object 
 7   Operated_or_Branded_Code_Share_Partners  object 
 8   DOT_ID_Marketing_Airline                 int64  
 9   IATA_Code_Marketing_Airline              object 
 10  Flight_Number_Marketing_Airline          int64  
 11  Operating_Airline                        object 
 12  DOT_ID_Operating_Airline                 int64  
 13  IATA_Code_Operating_Airline              object 
 14  Tail_Number            

## Data Types

The `FlightDate` column will need to be converted to a datetime object. I'll be able to see if there are any inconsistencies with the range of dates. I expect only flight details for January 2023.

In [20]:
# convert FlightDate to datetime object
df['FlightDate'] = pd.to_datetime(df['FlightDate'], errors='coerce')
df['FlightDate'].info()

<class 'pandas.core.series.Series'>
Index: 7260941 entries, 0 to 7278738
Series name: FlightDate
Non-Null Count    Dtype         
--------------    -----         
7260941 non-null  datetime64[ns]
dtypes: datetime64[ns](1)
memory usage: 110.8 MB


In [21]:
df['FlightDate'].describe()

count                          7260941
mean     2023-07-03 18:25:02.977754112
min                2023-01-01 00:00:00
25%                2023-04-05 00:00:00
50%                2023-07-05 00:00:00
75%                2023-10-02 00:00:00
max                2023-12-31 00:00:00
Name: FlightDate, dtype: object

The range of flight dates is as expected, ranging from January 1st to December 31st. 

## Redundant Columns with Flight Operations

The first few rows of the dataframe seemed to have similar information for `Marketing_Airline_Network`, `Operated_or_Branded_Code_Share_Partners`, and `IATA_Code_Marketing_Airline`. There also appear to be identical data for the `Operating_Airline` and `IATA_Code_Operating_Airline` columns. I want to take a closer look at these columns. 

In [22]:
df[['Marketing_Airline_Network', 'Operated_or_Branded_Code_Share_Partners', 'IATA_Code_Marketing_Airline', 'Operating_Airline', 'IATA_Code_Operating_Airline']]

Unnamed: 0,Marketing_Airline_Network,Operated_or_Branded_Code_Share_Partners,IATA_Code_Marketing_Airline,Operating_Airline,IATA_Code_Operating_Airline
0,NK,NK,NK,NK,NK
1,NK,NK,NK,NK,NK
2,NK,NK,NK,NK,NK
3,NK,NK,NK,NK,NK
4,NK,NK,NK,NK,NK
...,...,...,...,...,...
7278734,AA,AA_CODESHARE,AA,OH,OH
7278735,AA,AA_CODESHARE,AA,OH,OH
7278736,AA,AA_CODESHARE,AA,OH,OH
7278737,AA,AA_CODESHARE,AA,OH,OH


After reading documentation, the Marketing Airline Network and/or IATA Code is not always unique as it can be assigned to different airlines over time. Also, these codes may not always be unique. The identify unique airline carriers, the `DOT_ID_Marketing_Airline` column should be used as this is unique to the airline carrier and is defined as one holding and reporting under the same DOT certificate regardless of its Code, Name, or holding company/corporation.

The `Operated_or_Branded_Code_Share_Partners` column identifies whether the flight was operate by one of the airline's code share partners. A codeshare flight is an agreement between airlines to sell seats on each other’s flights. This gives the appearance of airlines flying to more destinations. By doing so, the airlines typically share the revenue on that ticket. The `DOT_ID_Operating_Airline` column should be used to identify which airline carrier is oeprating the flight. 

Creating a new column denoting whether a flight was a code share flight will potentially allow for segregating flights based on differing operations. 

In [23]:
# Compare the two columns to find where they are different
code_share_flights = (df['Marketing_Airline_Network'] != df['Operated_or_Branded_Code_Share_Partners'])
print('The total number of code share flights: ', len(df[code_share_flights]))
print('The percent of flights that are code share flights: ', round(len(df[code_share_flights])/len(df) * 100, 2), '%')


The total number of code share flights:  2019136
The percent of flights that are code share flights:  27.81 %


Over a quarter of all flights in 2023 were code share flights. This seems like a significant proportion. 

In [24]:
df['Code_Share_Flight'] = code_share_flights
df['Code_Share_Flight'] = df['Code_Share_Flight'].astype(int)

In [25]:
df['Code_Share_Flight'].head()

0    0
1    0
2    0
3    0
4    0
Name: Code_Share_Flight, dtype: int64

I will drop the `Operated_or_Branded_Code_Share_Partners` column now that I have capture whether or not a flight was a code share flight. 

In [26]:
ops_columns_to_drop = ['Operated_or_Branded_Code_Share_Partners']

In [27]:
# Compare the two columns to find where they are different
differences = df['Marketing_Airline_Network'] != df['IATA_Code_Marketing_Airline']

# Count the number of differences
count_differences = differences.sum()

print(f"Number of differences between the two columns: {count_differences}")


Number of differences between the two columns: 0


Since Marketing Airline Network Code and the IATA Code don't have any differences, keeping both columns is unnecessary. I will drop the `Marketing_Airline_Network`column.

In [28]:
ops_columns_to_drop.append('Marketing_Airline_Network')

In [29]:
# Compare the two columns to find where they are different
differences = df['Flight_Number_Marketing_Airline'] != df['Flight_Number_Operating_Airline']

# Count the number of differences
count_differences = differences.sum()

print(f"Number of differences between the two columns: {count_differences}")


Number of differences between the two columns: 575


The flight numbers are identical for nearly all observations except for 575 flights. I will only keep the `Flight_Number_Marketing_Airline` column.

In [30]:
ops_columns_to_drop.append('Flight_Number_Operating_Airline')

In [31]:
# Compare the two columns to find where they are different
differences = df['Operating_Airline'] != df['IATA_Code_Operating_Airline']

# Count the number of differences
count_differences = differences.sum()

print(f"Number of differences between the two columns: {count_differences}")


Number of differences between the two columns: 0


Since Operating Airline Code and the IATA Code don't have any differences, keeping both columns is unnecessary. I will drop the `Operating_Airline`column.

In [32]:
ops_columns_to_drop.append('Operating_Airline')

In [33]:
df = df.drop(columns=ops_columns_to_drop)

In [34]:
df.head()

Unnamed: 0,Year,Quarter,Month,DayofMonth,DayOfWeek,FlightDate,DOT_ID_Marketing_Airline,IATA_Code_Marketing_Airline,Flight_Number_Marketing_Airline,DOT_ID_Operating_Airline,IATA_Code_Operating_Airline,Tail_Number,OriginAirportID,OriginAirportSeqID,OriginCityMarketID,Origin,OriginCityName,OriginState,OriginStateFips,OriginStateName,OriginWac,DestAirportID,DestAirportSeqID,DestCityMarketID,Dest,DestCityName,DestState,DestStateFips,DestStateName,DestWac,CRSDepTime,DepTime,DepDelay,DepDelayMinutes,DepDel15,DepartureDelayGroups,DepTimeBlk,TaxiOut,WheelsOff,WheelsOn,TaxiIn,CRSArrTime,ArrTime,ArrDelay,ArrDelayMinutes,ArrDel15,ArrivalDelayGroups,ArrTimeBlk,Cancelled,Diverted,CRSElapsedTime,ActualElapsedTime,AirTime,Flights,Distance,DistanceGroup,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,DivAirportLandings,Duplicate,Code_Share_Flight
0,2023,3,8,6,7,2023-08-06,20416,NK,2252,20416,NK,N978NK,12889,1288904,32211,LAS,"Las Vegas, NV",NV,32,Nevada,85,11057,1105703,31057,CLT,"Charlotte, NC",NC,37,North Carolina,36,2359,,,,,,2300-2359,,,,,722,,,,,,0700-0759,1.0,0.0,263.0,,,1.0,1916.0,8,,,,,,0.0,N,0
1,2023,3,8,7,1,2023-08-07,20416,NK,2252,20416,NK,N974NK,12889,1288904,32211,LAS,"Las Vegas, NV",NV,32,Nevada,85,11057,1105703,31057,CLT,"Charlotte, NC",NC,37,North Carolina,36,2359,115.0,76.0,76.0,1.0,5.0,2300-2359,19.0,134.0,822.0,9.0,722,831.0,69.0,69.0,1.0,4.0,0700-0759,0.0,0.0,263.0,256.0,228.0,1.0,1916.0,8,3.0,0.0,1.0,0.0,65.0,0.0,N,0
2,2023,3,8,9,3,2023-08-09,20416,NK,2252,20416,NK,N519NK,12889,1288904,32211,LAS,"Las Vegas, NV",NV,32,Nevada,85,11057,1105703,31057,CLT,"Charlotte, NC",NC,37,North Carolina,36,2310,2259.0,-11.0,0.0,0.0,-1.0,2300-2359,14.0,2313.0,605.0,10.0,628,615.0,-13.0,0.0,0.0,-1.0,0600-0659,0.0,0.0,258.0,256.0,232.0,1.0,1916.0,8,,,,,,0.0,N,0
3,2023,3,8,10,4,2023-08-10,20416,NK,2252,20416,NK,N532NK,12889,1288904,32211,LAS,"Las Vegas, NV",NV,32,Nevada,85,11057,1105703,31057,CLT,"Charlotte, NC",NC,37,North Carolina,36,2310,2302.0,-8.0,0.0,0.0,-1.0,2300-2359,12.0,2314.0,551.0,7.0,628,558.0,-30.0,0.0,0.0,-2.0,0600-0659,0.0,0.0,258.0,236.0,217.0,1.0,1916.0,8,,,,,,0.0,N,0
4,2023,3,8,12,6,2023-08-12,20416,NK,2252,20416,NK,N529NK,12889,1288904,32211,LAS,"Las Vegas, NV",NV,32,Nevada,85,11057,1105703,31057,CLT,"Charlotte, NC",NC,37,North Carolina,36,2310,2314.0,4.0,4.0,0.0,0.0,2300-2359,18.0,2332.0,618.0,6.0,628,624.0,-4.0,0.0,0.0,-1.0,0600-0659,0.0,0.0,258.0,250.0,226.0,1.0,1916.0,8,,,,,,0.0,N,0


In [35]:
airport_details = df[['OriginAirportID', 'OriginAirportSeqID', 'OriginCityMarketID', 'Origin', 'OriginCityName', 'OriginState', 'OriginStateFips', 'OriginStateName']].drop_duplicates()
airport_details = airport_details.rename(columns={
    'OriginAirportID':'Airport_ID',
    'OriginAirportSeqID':'Airport_Seq_ID',
    'Origin_City_Market_ID':'City_Market_ID',
    'Origin':'Airport_Code',
    'OriginCityName':'City_Name',
    'OriginState':'State_Code',
    'OriginStateFips':'State_Fips', 
    'OriginStateName':'State_Name'
})

In [36]:
airport_details

Unnamed: 0,Airport_ID,Airport_Seq_ID,OriginCityMarketID,Airport_Code,City_Name,State_Code,State_Fips,State_Name
0,12889,1288904,32211,LAS,"Las Vegas, NV",NV,32,Nevada
24,11697,1169706,32467,FLL,"Fort Lauderdale, FL",FL,12,Florida
55,13204,1320402,31454,MCO,"Orlando, FL",FL,12,Florida
77,10397,1039707,30397,ATL,"Atlanta, GA",GA,13,Georgia
120,10423,1042302,30423,AUS,"Austin, TX",TX,48,Texas
...,...,...,...,...,...,...,...,...
3872263,11315,1131503,31315,DIK,"Dickinson, ND",ND,38,North Dakota
3888468,13256,1325602,33256,MFE,"Mission/McAllen/Edinburg, TX",TX,48,Texas
4187478,13983,1398305,33983,OWB,"Owensboro, KY",KY,21,Kentucky
4294964,13459,1345903,33459,MQT,"Marquette, MI",MI,26,Michigan


## Redundant Columns with Origin and Destination Details


`OriginAirportID` and `OriginAirportSeqID` both provide identification numbers used by the DOT to identify a unique airport. The additional numbers in the `OriginAirportSeqID` column provide information about the airport at a given point in time. Since both of these point to the same unique airport, we only need one of the columns. Similarly, `DestAirportID` and `DestAirportSeqID` provide information about the unique destination airport. We only need one of the columns. I will drop both columns with SeqID information. 

In [37]:
location_details_to_drop = ['OriginAirportSeqID', 'DestAirportSeqID']

The `OriginCityName` and `DestCityName` include the state or country names after the comma. Since this information is already tracked in the `OriginState` column, we don't also need state informatino in the city column.

In [38]:
# Split the 'OriginCityName' into two columns but handle cases where there might not be a comma
df['Origin_City'] = df['OriginCityName'].str.split(',').str[0]
df['PotentialState'] = df['OriginCityName'].str.split(',').str[1]

# Fill NaNs in 'PotentialState' with an empty string or a placeholder if no comma was present
df['PotentialState'] = df['PotentialState'].fillna('')

# Trim any whitespace that might be around the state names
df['PotentialState'] = df['PotentialState'].str.strip()

# Proceed with your state comparison and adjustment
# This assumes you have an 'OriginState' column to compare with
df['CheckStateMatch'] = df['PotentialState'] == df['OriginState']
df['Origin_City'] = df.apply(lambda x: x['OriginCityName'] if not x['CheckStateMatch'] else x['Origin_City'], axis=1)


In [39]:
df[df['CheckStateMatch'] == False][['Origin', 'Origin_City', 'OriginState', 'PotentialState']].value_counts()

Origin  Origin_City     OriginState  PotentialState
DCA     Washington, DC  VA           DC                143090
IAD     Washington, DC  VA           DC                 75090
CVG     Cincinnati, OH  KY           OH                 41702
Name: count, dtype: int64

In [40]:
df['Origin_City'] = df['Origin_City'].str.replace(r'Cincinnati, OH', 'Hebron', regex=True)

In [41]:
df[df['CheckStateMatch'] == False][['Origin_City', 'OriginState']].value_counts()

Origin_City     OriginState
Washington, DC  VA             218180
Hebron          KY              41702
Name: count, dtype: int64

In [42]:
# Split the 'OriginCityName' into two columns but handle cases where there might not be a comma
df['Destination_City'] = df['DestCityName'].str.split(',').str[0]
df['PotentialState_D'] = df['DestCityName'].str.split(',').str[1]

# Fill NaNs in 'PotentialState' with an empty string or a placeholder if no comma was present
df['PotentialState_D'] = df['PotentialState_D'].fillna('')

# Trim any whitespace that might be around the state names
df['PotentialState_D'] = df['PotentialState_D'].str.strip()

# Proceed with your state comparison and adjustment
# This assumes you have an 'OriginState' column to compare with
df['CheckStateMatch_D'] = df['PotentialState_D'] == df['DestState']
df['Destination_City'] = df.apply(lambda x: x['DestCityName'] if not x['CheckStateMatch_D'] else x['Destination_City'], axis=1)


In [43]:
df['Destination_City'].head()

0    Charlotte
1    Charlotte
2    Charlotte
3    Charlotte
4    Charlotte
Name: Destination_City, dtype: object

In [44]:
df[df['CheckStateMatch'] == False][['Destination_City', 'DestState']].value_counts()
df['Destination_City'] = df['Destination_City'].str.replace(r'Cincinnati, OH', 'Hebron', regex=True)
df[df['CheckStateMatch_D'] == False][['Destination_City', 'DestState']].value_counts()

Destination_City  DestState
Washington, DC    VA           218077
Hebron            KY            41735
Name: count, dtype: int64

In [45]:
location_details_to_drop.extend(['OriginCityName', 'PotentialState', 'CheckStateMatch', 'DestCityName', 'PotentialState_D', 'CheckStateMatch_D'])

In [46]:
origin_states = df.groupby(['OriginState', 'OriginStateName', 'OriginStateFips', 'OriginWac']).size().reset_index()
origin_states.rename(columns={0:'Count'}, inplace=True)
origin_states.sort_values('OriginStateName').head()

Unnamed: 0,OriginState,OriginStateName,OriginStateFips,OriginWac,Count
1,AL,Alabama,1,51,36418
0,AK,Alaska,2,1,42728
3,AZ,Arizona,4,81,205794
2,AR,Arkansas,5,71,28750
4,CA,California,6,91,747100


In [47]:
dest_states = df.groupby(['DestState', 'DestStateName', 'DestStateFips', 'DestWac']).size().reset_index()
dest_states.rename(columns={0:'Count'}, inplace=True)
states_df = origin_states.merge(dest_states, left_on=['OriginStateFips', 'OriginWac'], right_on=['DestStateFips', 'DestWac'], suffixes=('_origin', '_dest'), how='outer')
states_df

Unnamed: 0,OriginState,OriginStateName,OriginStateFips,OriginWac,Count_origin,DestState,DestStateName,DestStateFips,DestWac,Count_dest
0,AK,Alaska,2,1,42728,AK,Alaska,2,1,42707
1,AL,Alabama,1,51,36418,AL,Alabama,1,51,36443
2,AR,Arkansas,5,71,28750,AR,Arkansas,5,71,28748
3,AZ,Arizona,4,81,205794,AZ,Arizona,4,81,205985
4,CA,California,6,91,747100,CA,California,6,91,747618
5,CO,Colorado,8,82,334351,CO,Colorado,8,82,333737
6,CT,Connecticut,9,11,22858,CT,Connecticut,9,11,22877
7,FL,Florida,12,33,612995,FL,Florida,12,33,612351
8,GA,Georgia,13,34,363670,GA,Georgia,13,34,363813
9,HI,Hawaii,15,2,132336,HI,Hawaii,15,2,132270


There don't appear to be any data entry issues with state abbreviations, state names or their FIPS codes. I will remove the FIPS and State Names columns and only keep the abbreviations. The FIPS and State Names will be preserved in a dictionary for use later if needed. 

In [48]:
states_dict = {row['OriginState']: (row['OriginStateName'], row['OriginStateFips'], row['OriginWac'])
               for index, row in origin_states.iterrows()}
states_dict

{'AK': ('Alaska', 2, 1),
 'AL': ('Alabama', 1, 51),
 'AR': ('Arkansas', 5, 71),
 'AZ': ('Arizona', 4, 81),
 'CA': ('California', 6, 91),
 'CO': ('Colorado', 8, 82),
 'CT': ('Connecticut', 9, 11),
 'FL': ('Florida', 12, 33),
 'GA': ('Georgia', 13, 34),
 'HI': ('Hawaii', 15, 2),
 'IA': ('Iowa', 19, 61),
 'ID': ('Idaho', 16, 83),
 'IL': ('Illinois', 17, 41),
 'IN': ('Indiana', 18, 42),
 'KS': ('Kansas', 20, 62),
 'KY': ('Kentucky', 21, 52),
 'LA': ('Louisiana', 22, 72),
 'MA': ('Massachusetts', 25, 13),
 'MD': ('Maryland', 24, 35),
 'ME': ('Maine', 23, 12),
 'MI': ('Michigan', 26, 43),
 'MN': ('Minnesota', 27, 63),
 'MO': ('Missouri', 29, 64),
 'MS': ('Mississippi', 28, 53),
 'MT': ('Montana', 30, 84),
 'NC': ('North Carolina', 37, 36),
 'ND': ('North Dakota', 38, 66),
 'NE': ('Nebraska', 31, 65),
 'NH': ('New Hampshire', 33, 14),
 'NJ': ('New Jersey', 34, 21),
 'NM': ('New Mexico', 35, 86),
 'NV': ('Nevada', 32, 85),
 'NY': ('New York', 36, 22),
 'OH': ('Ohio', 39, 44),
 'OK': ('Oklahoma

In [49]:
location_details_to_drop.extend(['OriginStateName', 'OriginStateFips', 'OriginWac', 'DestStateName', 'DestStateFips', 'DestWac'])
df = df.drop(columns=location_details_to_drop)

## Cleaning Departure and Arrival Performance Data

I will be able to apply data cleaning steps to the Depature-related columns. I will then need to separate out the cancelled flights to apply the changes to the Arrival-related columns. Cancelled flights are missing information about arrivals, so I need to preserve these NaN values for now.

In [50]:
df.head()

Unnamed: 0,Year,Quarter,Month,DayofMonth,DayOfWeek,FlightDate,DOT_ID_Marketing_Airline,IATA_Code_Marketing_Airline,Flight_Number_Marketing_Airline,DOT_ID_Operating_Airline,IATA_Code_Operating_Airline,Tail_Number,OriginAirportID,OriginCityMarketID,Origin,OriginState,DestAirportID,DestCityMarketID,Dest,DestState,CRSDepTime,DepTime,DepDelay,DepDelayMinutes,DepDel15,DepartureDelayGroups,DepTimeBlk,TaxiOut,WheelsOff,WheelsOn,TaxiIn,CRSArrTime,ArrTime,ArrDelay,ArrDelayMinutes,ArrDel15,ArrivalDelayGroups,ArrTimeBlk,Cancelled,Diverted,CRSElapsedTime,ActualElapsedTime,AirTime,Flights,Distance,DistanceGroup,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,DivAirportLandings,Duplicate,Code_Share_Flight,Origin_City,Destination_City
0,2023,3,8,6,7,2023-08-06,20416,NK,2252,20416,NK,N978NK,12889,32211,LAS,NV,11057,31057,CLT,NC,2359,,,,,,2300-2359,,,,,722,,,,,,0700-0759,1.0,0.0,263.0,,,1.0,1916.0,8,,,,,,0.0,N,0,Las Vegas,Charlotte
1,2023,3,8,7,1,2023-08-07,20416,NK,2252,20416,NK,N974NK,12889,32211,LAS,NV,11057,31057,CLT,NC,2359,115.0,76.0,76.0,1.0,5.0,2300-2359,19.0,134.0,822.0,9.0,722,831.0,69.0,69.0,1.0,4.0,0700-0759,0.0,0.0,263.0,256.0,228.0,1.0,1916.0,8,3.0,0.0,1.0,0.0,65.0,0.0,N,0,Las Vegas,Charlotte
2,2023,3,8,9,3,2023-08-09,20416,NK,2252,20416,NK,N519NK,12889,32211,LAS,NV,11057,31057,CLT,NC,2310,2259.0,-11.0,0.0,0.0,-1.0,2300-2359,14.0,2313.0,605.0,10.0,628,615.0,-13.0,0.0,0.0,-1.0,0600-0659,0.0,0.0,258.0,256.0,232.0,1.0,1916.0,8,,,,,,0.0,N,0,Las Vegas,Charlotte
3,2023,3,8,10,4,2023-08-10,20416,NK,2252,20416,NK,N532NK,12889,32211,LAS,NV,11057,31057,CLT,NC,2310,2302.0,-8.0,0.0,0.0,-1.0,2300-2359,12.0,2314.0,551.0,7.0,628,558.0,-30.0,0.0,0.0,-2.0,0600-0659,0.0,0.0,258.0,236.0,217.0,1.0,1916.0,8,,,,,,0.0,N,0,Las Vegas,Charlotte
4,2023,3,8,12,6,2023-08-12,20416,NK,2252,20416,NK,N529NK,12889,32211,LAS,NV,11057,31057,CLT,NC,2310,2314.0,4.0,4.0,0.0,0.0,2300-2359,18.0,2332.0,618.0,6.0,628,624.0,-4.0,0.0,0.0,-1.0,0600-0659,0.0,0.0,258.0,250.0,226.0,1.0,1916.0,8,,,,,,0.0,N,0,Las Vegas,Charlotte


In [51]:
df['CRSDepTime'].min()

1

In [52]:
df['CRSArrTime'].max()

2400

Several time columns are currently represented as integers and floats instead of time objects. 

In [53]:
"""def convert_to_time(df, col):

    # conver time columns to integers, then strings and fill left spaces with 0
    df[col] = df[col].astype(int).astype(str).str.zfill(4)
    
    # insert a colon to create a time string format
    df[col] = df[col].str[:2] + ':' + df[col].str[2:]
    
    # Replace "24:00" with "00:00" to avoid conversion errors
    df[col] = df[col].replace('24:00', '00:00')

    # convert string to datetime time object
    df[col] = pd.to_datetime(df[col], format='%H:%M').dt.time

df_not_cancelled = df[~cancelled_flights].copy()

df_cancelled = df[cancelled_flights].copy()

# convert departure and arrival columns for not cancelled flights
columns_to_convert = ['CRSDepTime', 'DepTime', 'WheelsOff', 'WheelsOn', 'CRSArrTime', 'ArrTime']

for col in columns_to_convert:
    convert_to_time(df_not_cancelled, col)

# convert scheduled departure and arrival columns
columns_to_convert = ['CRSDepTime', 'CRSArrTime']

for col in columns_to_convert:
    convert_to_time(df_cancelled, col)    

# convert columns into integers since they represent boolean details
df_not_cancelled[['Cancelled', 'Diverted', 'DepDel15', 'ArrDel15']] = df_not_cancelled[['Cancelled', 'Diverted', 'DepDel15', 'ArrDel15']].astype(int)
# convert columns into integers since they represent boolean details
df_cancelled[['Cancelled', 'Diverted']] = df_cancelled[['Cancelled', 'Diverted']].astype(int)

# merge
df_merged = pd.concat([df_not_cancelled, df_cancelled])
df_merged"""

'def convert_to_time(df, col):\n\n    # conver time columns to integers, then strings and fill left spaces with 0\n    df[col] = df[col].astype(int).astype(str).str.zfill(4)\n    \n    # insert a colon to create a time string format\n    df[col] = df[col].str[:2] + \':\' + df[col].str[2:]\n    \n    # Replace "24:00" with "00:00" to avoid conversion errors\n    df[col] = df[col].replace(\'24:00\', \'00:00\')\n\n    # convert string to datetime time object\n    df[col] = pd.to_datetime(df[col], format=\'%H:%M\').dt.time\n\ndf_not_cancelled = df[~cancelled_flights].copy()\n\ndf_cancelled = df[cancelled_flights].copy()\n\n# convert departure and arrival columns for not cancelled flights\ncolumns_to_convert = [\'CRSDepTime\', \'DepTime\', \'WheelsOff\', \'WheelsOn\', \'CRSArrTime\', \'ArrTime\']\n\nfor col in columns_to_convert:\n    convert_to_time(df_not_cancelled, col)\n\n# convert scheduled departure and arrival columns\ncolumns_to_convert = [\'CRSDepTime\', \'CRSArrTime\']\n\nfor col 

The above code has been refactored to deal with flights that depart but return and are cancelled. I added an if / else clause in the `convert_to_time` function to handle the null values during the conversion. 

In [54]:
"""def convert_to_time(df, col):
    # Apply the conversion only to non-NaN values
    df[col] = df[col].apply(lambda x: str(int(x)).zfill(4) if pd.notna(x) else x)
    
    # Insert a colon to create a time string format
    df[col] = df[col].apply(lambda x: x[:2] + ':' + x[2:] if pd.notna(x) else x)
    
    # Replace "24:00" with "00:00" to avoid conversion errors
    df[col] = df[col].replace('24:00', '00:00')
    
    # Convert string to datetime time object, but skip NaN values
    df[col] = pd.to_datetime(df[col], format='%H:%M', errors='coerce').dt.time


# convert scheduled departure and arrival columns
columns_to_convert = ['CRSDepTime', 'DepTime', 'WheelsOff', 'WheelsOn', 'CRSArrTime', 'ArrTime']

for col in columns_to_convert:
    convert_to_time(df, col)"""

'def convert_to_time(df, col):\n    # Apply the conversion only to non-NaN values\n    df[col] = df[col].apply(lambda x: str(int(x)).zfill(4) if pd.notna(x) else x)\n    \n    # Insert a colon to create a time string format\n    df[col] = df[col].apply(lambda x: x[:2] + \':\' + x[2:] if pd.notna(x) else x)\n    \n    # Replace "24:00" with "00:00" to avoid conversion errors\n    df[col] = df[col].replace(\'24:00\', \'00:00\')\n    \n    # Convert string to datetime time object, but skip NaN values\n    df[col] = pd.to_datetime(df[col], format=\'%H:%M\', errors=\'coerce\').dt.time\n\n\n# convert scheduled departure and arrival columns\ncolumns_to_convert = [\'CRSDepTime\', \'DepTime\', \'WheelsOff\', \'WheelsOn\', \'CRSArrTime\', \'ArrTime\']\n\nfor col in columns_to_convert:\n    convert_to_time(df, col)'

In [55]:
cols_to_convert = ['Cancelled', 'Diverted']

for col in cols_to_convert:
    df[col] = df[col].apply(lambda x: int(x) if pd.notna(x) else x)

For the `DepDelay` vs `DepDelayMinutes` and `ArrDelay` vs `ArrDelayMinutes` columns, the only difference is that in the Minutes columns, early departures or arrivals are set to 0 instead of a negative number. This is redundant information so we can remove these columns.

In [56]:
# delete redundant columns
df_merged = df.drop(columns=['DepDelayMinutes', 'ArrDelayMinutes'])

In [57]:
df_merged['DepDelay'].describe().apply(lambda x: round(x, 2))

count    7170781.00
mean          12.16
std           55.71
min          -99.00
25%           -6.00
50%           -2.00
75%            9.00
max         5764.00
Name: DepDelay, dtype: float64

The CSRDepTime, DepTime, CSRArrTime and ArrTime are all in the local times for their respective airports. I will need to convert all date and time information to UTC in order to later merge with weather data. First, I want to convert the columns I mentioned to datetime columns by merging the time columns with FlightDate. I will then need to adjust rows that have next day arrivals (or several days later due to prolonged delays).

In [58]:
df_merged.head()

Unnamed: 0,Year,Quarter,Month,DayofMonth,DayOfWeek,FlightDate,DOT_ID_Marketing_Airline,IATA_Code_Marketing_Airline,Flight_Number_Marketing_Airline,DOT_ID_Operating_Airline,IATA_Code_Operating_Airline,Tail_Number,OriginAirportID,OriginCityMarketID,Origin,OriginState,DestAirportID,DestCityMarketID,Dest,DestState,CRSDepTime,DepTime,DepDelay,DepDel15,DepartureDelayGroups,DepTimeBlk,TaxiOut,WheelsOff,WheelsOn,TaxiIn,CRSArrTime,ArrTime,ArrDelay,ArrDel15,ArrivalDelayGroups,ArrTimeBlk,Cancelled,Diverted,CRSElapsedTime,ActualElapsedTime,AirTime,Flights,Distance,DistanceGroup,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,DivAirportLandings,Duplicate,Code_Share_Flight,Origin_City,Destination_City
0,2023,3,8,6,7,2023-08-06,20416,NK,2252,20416,NK,N978NK,12889,32211,LAS,NV,11057,31057,CLT,NC,2359,,,,,2300-2359,,,,,722,,,,,0700-0759,1,0,263.0,,,1.0,1916.0,8,,,,,,0.0,N,0,Las Vegas,Charlotte
1,2023,3,8,7,1,2023-08-07,20416,NK,2252,20416,NK,N974NK,12889,32211,LAS,NV,11057,31057,CLT,NC,2359,115.0,76.0,1.0,5.0,2300-2359,19.0,134.0,822.0,9.0,722,831.0,69.0,1.0,4.0,0700-0759,0,0,263.0,256.0,228.0,1.0,1916.0,8,3.0,0.0,1.0,0.0,65.0,0.0,N,0,Las Vegas,Charlotte
2,2023,3,8,9,3,2023-08-09,20416,NK,2252,20416,NK,N519NK,12889,32211,LAS,NV,11057,31057,CLT,NC,2310,2259.0,-11.0,0.0,-1.0,2300-2359,14.0,2313.0,605.0,10.0,628,615.0,-13.0,0.0,-1.0,0600-0659,0,0,258.0,256.0,232.0,1.0,1916.0,8,,,,,,0.0,N,0,Las Vegas,Charlotte
3,2023,3,8,10,4,2023-08-10,20416,NK,2252,20416,NK,N532NK,12889,32211,LAS,NV,11057,31057,CLT,NC,2310,2302.0,-8.0,0.0,-1.0,2300-2359,12.0,2314.0,551.0,7.0,628,558.0,-30.0,0.0,-2.0,0600-0659,0,0,258.0,236.0,217.0,1.0,1916.0,8,,,,,,0.0,N,0,Las Vegas,Charlotte
4,2023,3,8,12,6,2023-08-12,20416,NK,2252,20416,NK,N529NK,12889,32211,LAS,NV,11057,31057,CLT,NC,2310,2314.0,4.0,0.0,0.0,2300-2359,18.0,2332.0,618.0,6.0,628,624.0,-4.0,0.0,-1.0,0600-0659,0,0,258.0,250.0,226.0,1.0,1916.0,8,,,,,,0.0,N,0,Las Vegas,Charlotte


In [59]:
df_merged[['CRSDepTime', 'DepTime', 'DepDelay', 'CRSArrTime', 'ArrTime', 'ArrDelay']].isna().sum()

CRSDepTime        0
DepTime       90100
DepDelay      90160
CRSArrTime        0
ArrTime       93897
ArrDelay      93897
dtype: int64

In [60]:
df_merged[['CRSDepTime', 'DepTime', 'DepDelay', 'CRSArrTime', 'ArrTime', 'ArrDelay']].describe().apply(lambda x: round(x, 2))

Unnamed: 0,CRSDepTime,DepTime,DepDelay,CRSArrTime,ArrTime,ArrDelay
count,7260941.0,7170841.0,7170781.0,7260941.0,7167044.0,7167044.0
mean,1332.12,1334.32,12.16,1486.99,1457.96,6.63
std,493.84,508.97,55.71,522.36,546.11,57.57
min,1.0,1.0,-99.0,1.0,1.0,-119.0
25%,911.0,913.0,-6.0,1100.0,1043.0,-15.0
50%,1324.0,1326.0,-2.0,1515.0,1500.0,-6.0
75%,1740.0,1748.0,9.0,1923.0,1916.0,9.0
max,2359.0,2400.0,5764.0,2400.0,2400.0,5780.0


#### ***Why is there a difference between the number of flights missing departure data vs arrival data?***

In [61]:
dep_missing = df_merged['DepTime'].isna()
df_merged[dep_missing].groupby('Cancelled')['DepTime'].size()

Cancelled
1    90100
Name: DepTime, dtype: int64

In [62]:
arr_missing = df_merged['ArrTime'].isna()
df_merged[arr_missing].groupby('Cancelled')['ArrTime'].size()

Cancelled
1    93897
Name: ArrTime, dtype: int64

In [63]:
# find the difference between missing ArrTime data and DepTime data
arr_dep_diff = df_merged['ArrTime'].isna().sum() - df_merged['DepTime'].isna().sum()
arr_dep_diff

3797

In [64]:
missing_critical_data = ((df_merged['ArrTime'].isna()) & (df_merged['DepTime'].notna()))

df_merged[missing_critical_data]['Cancelled'].value_counts()

Cancelled
1    3797
Name: count, dtype: int64

All the rows misisng `DepTime` are cancelled flights. Similarly all the rows missing `ArrTime` are cancelled flights. There are 3,797 rows that have `DepTime` data and are missing `ArrTime` data, and all of these flights were cancelled flights. It is likely that these flights were going to take off, left the gate, but did not actually get in the air and were subsequently cancelled, therefore they would have depature data but not arrival data. I'll replace the `DepTime` for these rows with np.nan values since they are all cancelled flights.

In [65]:
def create_scheduled_actual_datetime(df, date_col, scheduled_dep_time_col, scheduled_arr_time_col, dep_delay_col, arr_delay_col):
    
    # Replace '2400' with '0000' to represent midnight
    df[scheduled_dep_time_col] = df[scheduled_dep_time_col].replace(2400, 0)
    df[scheduled_arr_time_col] = df[scheduled_arr_time_col].replace(2400, 0)

    # Combine FlightDate with CSRDepTime and CSRArrTime to create datetime objects for scheduled departures and arrivals
    df['scheduled_departure_datetime'] = pd.to_datetime(df[date_col].astype(str) + ' ' + df[scheduled_dep_time_col].astype(str).str.zfill(4), format='%Y-%m-%d %H%M', utc=False)
    df['scheduled_arrival_datetime'] = pd.to_datetime(df[date_col].astype(str) + ' ' + df[scheduled_arr_time_col].astype(str).str.zfill(4), format='%Y-%m-%d %H%M', utc=False)

    df.loc[df['scheduled_arrival_datetime'] < df['scheduled_departure_datetime'], 'scheduled_arrival_datetime'] += pd.Timedelta(days=1)

    # Replace actual departure times with NaN for canceled flights with DepTime but no ArrTime
    df.loc[(df['ArrTime'].isna()) & (df['DepTime'].notna()) & (df['Cancelled'] == 1), 'DepTime'] = np.nan
    
    # Only process rows where DepDelay and ArrDelay are not missing (skip canceled flights or incomplete records)
    df_completed_flights = df.dropna(subset=[dep_delay_col, arr_delay_col]).copy()

    # Adjust for delays in actual departure and arrival times (accounting for multi-day delays)
    df_completed_flights['actual_departure_datetime'] = df_completed_flights['scheduled_departure_datetime'] + pd.to_timedelta(df_completed_flights[dep_delay_col], unit='m')
    df_completed_flights['actual_arrival_datetime'] = df_completed_flights['scheduled_arrival_datetime'] + pd.to_timedelta(df_completed_flights[arr_delay_col], unit='m')

    # Add the actual datetime columns to the original dataframe
    df['actual_departure_datetime'] = df_completed_flights['actual_departure_datetime']
    df['actual_arrival_datetime'] = df_completed_flights['actual_arrival_datetime']
    
    return df

In [66]:
df_dt_fixed = create_scheduled_actual_datetime(df_merged, 'FlightDate', 'CRSDepTime', 'CRSArrTime', 'DepDelay', 'ArrDelay')

In [67]:
df_dt_fixed[['FlightDate', 'CRSDepTime', 'scheduled_departure_datetime', 'CRSArrTime', 'scheduled_arrival_datetime', 'DepDelay', 'ArrDelay', 'DepTime', 'actual_departure_datetime', 'ArrTime', 'actual_arrival_datetime']].head(20)

Unnamed: 0,FlightDate,CRSDepTime,scheduled_departure_datetime,CRSArrTime,scheduled_arrival_datetime,DepDelay,ArrDelay,DepTime,actual_departure_datetime,ArrTime,actual_arrival_datetime
0,2023-08-06,2359,2023-08-06 23:59:00,722,2023-08-07 07:22:00,,,,NaT,,NaT
1,2023-08-07,2359,2023-08-07 23:59:00,722,2023-08-08 07:22:00,76.0,69.0,115.0,2023-08-08 01:15:00,831.0,2023-08-08 08:31:00
2,2023-08-09,2310,2023-08-09 23:10:00,628,2023-08-10 06:28:00,-11.0,-13.0,2259.0,2023-08-09 22:59:00,615.0,2023-08-10 06:15:00
3,2023-08-10,2310,2023-08-10 23:10:00,628,2023-08-11 06:28:00,-8.0,-30.0,2302.0,2023-08-10 23:02:00,558.0,2023-08-11 05:58:00
4,2023-08-12,2310,2023-08-12 23:10:00,628,2023-08-13 06:28:00,4.0,-4.0,2314.0,2023-08-12 23:14:00,624.0,2023-08-13 06:24:00
5,2023-08-13,2310,2023-08-13 23:10:00,628,2023-08-14 06:28:00,53.0,72.0,3.0,2023-08-14 00:03:00,740.0,2023-08-14 07:40:00
6,2023-08-14,2310,2023-08-14 23:10:00,628,2023-08-15 06:28:00,-4.0,0.0,2306.0,2023-08-14 23:06:00,628.0,2023-08-15 06:28:00
7,2023-08-15,2310,2023-08-15 23:10:00,628,2023-08-16 06:28:00,2.0,4.0,2312.0,2023-08-15 23:12:00,632.0,2023-08-16 06:32:00
8,2023-08-16,2310,2023-08-16 23:10:00,628,2023-08-17 06:28:00,6.0,8.0,2316.0,2023-08-16 23:16:00,636.0,2023-08-17 06:36:00
9,2023-08-17,2310,2023-08-17 23:10:00,628,2023-08-18 06:28:00,5.0,19.0,2315.0,2023-08-17 23:15:00,647.0,2023-08-18 06:47:00


In [68]:
"""def cross_validate_datetime(df, date_col, dep_time_col, arr_time_col, cancelled_col):
    errors = []

    # Replace '2400' with '0000' to represent midnight
    df[dep_time_col] = df[dep_time_col].replace(2400, 0)
    df[arr_time_col] = df[arr_time_col].replace(2400, 0)

    # Only convert DepTime and ArrTime for non-canceled flights
    df_valid_times = df[df[cancelled_col] == 0].copy()

    # Convert DepTime and ArrTime to integers and then to datetime
    df_valid_times[dep_time_col] = df_valid_times[dep_time_col].dropna().astype(int)
    df_valid_times[arr_time_col] = df_valid_times[arr_time_col].dropna().astype(int)

    df_valid_times['DepTime_converted'] = pd.to_datetime(
        df_valid_times[date_col].astype(str) + ' ' + df_valid_times[dep_time_col].astype(str).str.zfill(4),
        format='%Y-%m-%d %H%M', errors='coerce'
    )
    df_valid_times['ArrTime_converted'] = pd.to_datetime(
        df_valid_times[date_col].astype(str) + ' ' + df_valid_times[arr_time_col].astype(str).str.zfill(4),
        format='%Y-%m-%d %H%M', errors='coerce'
    )
    # Adjust for next-day departures (positive DepDelay and DepTime earlier than CRSDepTime)
    df_valid_times.loc[
        (df_valid_times['DepDelay'] > 0) & 
        (df_valid_times['DepTime_converted'] < df_valid_times['scheduled_departure_datetime']), 
        'DepTime_converted'
    ] += pd.Timedelta(days=1)

    # Adjust for next-day arrivals (similar to the original function)
    df_valid_times.loc[df_valid_times['ArrTime_converted'] < df_valid_times['DepTime_converted'], 'ArrTime_converted'] += pd.Timedelta(days=1)

     # Validate: DepTime should equal actual_departure_datetime, accounting for NaN vs NaT
    incorrect_departure = df_valid_times[
        (~(df_valid_times['DepTime_converted'].isna() & df_valid_times['actual_departure_datetime'].isna())) & 
        (df_valid_times['DepTime_converted'] != df_valid_times['actual_departure_datetime'])
    ]
    if not incorrect_departure.empty:
        errors.append(f"Found {len(incorrect_departure)} rows where DepTime does not match actual_departure_datetime.")

    # Validate: ArrTime should equal actual_arrival_datetime, accounting for NaN vs NaT
    incorrect_arrival = df_valid_times[
        (~(df_valid_times['ArrTime_converted'].isna() & df_valid_times['actual_arrival_datetime'].isna())) & 
        (df_valid_times['ArrTime_converted'] != df_valid_times['actual_arrival_datetime'])
    ]
    if not incorrect_arrival.empty:
        errors.append(f"Found {len(incorrect_arrival)} rows where ArrTime does not match actual_arrival_datetime.")

    
    # Clean up temporary columns
    #df_valid_times.drop(columns=['DepTime_converted', 'ArrTime_converted'], inplace=True)

    df['DepTime_converted'] = df_valid_times['DepTime_converted']
    df['ArrTime_converted'] = df_valid_times['ArrTime_converted']
     
    if errors:
        return errors
    else:
        return "All validations passed."""


'def cross_validate_datetime(df, date_col, dep_time_col, arr_time_col, cancelled_col):\n    errors = []\n\n    # Replace \'2400\' with \'0000\' to represent midnight\n    df[dep_time_col] = df[dep_time_col].replace(2400, 0)\n    df[arr_time_col] = df[arr_time_col].replace(2400, 0)\n\n    # Only convert DepTime and ArrTime for non-canceled flights\n    df_valid_times = df[df[cancelled_col] == 0].copy()\n\n    # Convert DepTime and ArrTime to integers and then to datetime\n    df_valid_times[dep_time_col] = df_valid_times[dep_time_col].dropna().astype(int)\n    df_valid_times[arr_time_col] = df_valid_times[arr_time_col].dropna().astype(int)\n\n    df_valid_times[\'DepTime_converted\'] = pd.to_datetime(\n        df_valid_times[date_col].astype(str) + \' \' + df_valid_times[dep_time_col].astype(str).str.zfill(4),\n        format=\'%Y-%m-%d %H%M\', errors=\'coerce\'\n    )\n    df_valid_times[\'ArrTime_converted\'] = pd.to_datetime(\n        df_valid_times[date_col].astype(str) + \' \' + df

In [69]:
# Split the 'DepTimeBlk' into two columns to see if all blocks are 59 minutes long or are there some that are longer and shorter
df_dt_fixed['DeptTimeBlk_Start'] = df_dt_fixed['DepTimeBlk'].str.split('-').str[0].astype(int)
df_dt_fixed['DeptTimeBlk_End'] = df_dt_fixed['DepTimeBlk'].str.split('-').str[1].astype(int)

# Split the 'ArrTimeBlk' into two columns to see if all blocks are 59 minutes long or are there some that are longer and shorter
df_dt_fixed['ArrTimeBlk_Start'] = df_dt_fixed['ArrTimeBlk'].str.split('-').str[0].astype(int)
df_dt_fixed['ArrTimeBlk_End'] = df_dt_fixed['ArrTimeBlk'].str.split('-').str[1].astype(int)

In [70]:
df_dt_fixed['ArrivalBlock_Total'] = df_dt_fixed['ArrTimeBlk_End'] - df_dt_fixed['ArrTimeBlk_Start']
df_dt_fixed['DepartureBlock_Total'] = df_dt_fixed['DeptTimeBlk_End'] - df_dt_fixed['DeptTimeBlk_Start']

In [71]:
df_dt_fixed[['DepartureBlock_Total', 'ArrivalBlock_Total']].describe().apply(lambda x: x.apply('{0:.2f}'.format))

Unnamed: 0,DepartureBlock_Total,ArrivalBlock_Total
count,7260941.0,7260941.0
mean,74.18,72.08
std,85.69,79.72
min,59.0,59.0
25%,59.0,59.0
50%,59.0,59.0
75%,59.0,59.0
max,558.0,558.0


I'm curious to know why some flights have 558 minutes for their scheduled block. Let's take a look.

In [72]:
df_dt_fixed.sort_values('DepartureBlock_Total', ascending=False).head(10)

Unnamed: 0,Year,Quarter,Month,DayofMonth,DayOfWeek,FlightDate,DOT_ID_Marketing_Airline,IATA_Code_Marketing_Airline,Flight_Number_Marketing_Airline,DOT_ID_Operating_Airline,IATA_Code_Operating_Airline,Tail_Number,OriginAirportID,OriginCityMarketID,Origin,OriginState,DestAirportID,DestCityMarketID,Dest,DestState,CRSDepTime,DepTime,DepDelay,DepDel15,DepartureDelayGroups,DepTimeBlk,TaxiOut,WheelsOff,WheelsOn,TaxiIn,CRSArrTime,ArrTime,ArrDelay,ArrDel15,ArrivalDelayGroups,ArrTimeBlk,Cancelled,Diverted,CRSElapsedTime,ActualElapsedTime,AirTime,Flights,Distance,DistanceGroup,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,DivAirportLandings,Duplicate,Code_Share_Flight,Origin_City,Destination_City,scheduled_departure_datetime,scheduled_arrival_datetime,actual_departure_datetime,actual_arrival_datetime,DeptTimeBlk_Start,DeptTimeBlk_End,ArrTimeBlk_Start,ArrTimeBlk_End,ArrivalBlock_Total,DepartureBlock_Total
4564220,2023,2,5,28,7,2023-05-28,19790,DL,1116,19790,DL,N681DA,15304,33195,TPA,FL,10397,30397,ATL,GA,520,513.0,-7.0,0.0,-1.0,0001-0559,12.0,525.0,626.0,5.0,650,631.0,-19.0,0.0,-2.0,0600-0659,0,0,90.0,78.0,61.0,1.0,406.0,2,,,,,,0.0,N,0,Tampa,Atlanta,2023-05-28 05:20:00,2023-05-28 06:50:00,2023-05-28 05:13:00,2023-05-28 06:31:00,1,559,600,659,59,558
1631850,2023,3,7,25,2,2023-07-25,19393,WN,3486,19393,WN,N8511K,13232,30977,MDW,IL,13796,32457,OAK,CA,525,528.0,3.0,0.0,0.0,0001-0559,9.0,537.0,730.0,7.0,750,737.0,-13.0,0.0,-1.0,0700-0759,0,0,265.0,249.0,233.0,1.0,1844.0,8,,,,,,0.0,N,0,Chicago,Oakland,2023-07-25 05:25:00,2023-07-25 07:50:00,2023-07-25 05:28:00,2023-07-25 07:37:00,1,559,700,759,59,558
1631812,2023,3,7,25,2,2023-07-25,19393,WN,3781,19393,WN,N8643A,13232,30977,MDW,IL,12953,31703,LGA,NY,520,517.0,-3.0,0.0,-1.0,0001-0559,10.0,527.0,803.0,7.0,825,810.0,-15.0,0.0,-1.0,0800-0859,0,0,125.0,113.0,96.0,1.0,725.0,3,,,,,,0.0,N,0,Chicago,New York,2023-07-25 05:20:00,2023-07-25 08:25:00,2023-07-25 05:17:00,2023-07-25 08:10:00,1,559,800,859,59,558
2971774,2023,2,6,9,5,2023-06-09,19393,WN,1653,19393,WN,N8560Z,13198,33198,MCI,MO,12191,31453,HOU,TX,520,524.0,4.0,0.0,0.0,0001-0559,13.0,537.0,708.0,6.0,720,714.0,-6.0,0.0,-1.0,0700-0759,0,0,120.0,110.0,91.0,1.0,666.0,3,,,,,,0.0,N,0,Kansas City,Houston,2023-06-09 05:20:00,2023-06-09 07:20:00,2023-06-09 05:24:00,2023-06-09 07:14:00,1,559,700,759,59,558
1631821,2023,3,7,25,2,2023-07-25,19393,WN,3773,19393,WN,N7841A,13232,30977,MDW,IL,13198,33198,MCI,MO,550,544.0,-6.0,0.0,-1.0,0001-0559,9.0,553.0,650.0,4.0,710,654.0,-16.0,0.0,-2.0,0700-0759,0,0,80.0,70.0,57.0,1.0,405.0,2,,,,,,0.0,N,0,Chicago,Kansas City,2023-07-25 05:50:00,2023-07-25 07:10:00,2023-07-25 05:44:00,2023-07-25 06:54:00,1,559,700,759,59,558
1631825,2023,3,7,25,2,2023-07-25,19393,WN,1865,19393,WN,N408WN,13232,30977,MDW,IL,13204,31454,MCO,FL,530,551.0,21.0,1.0,1.0,0001-0559,8.0,559.0,902.0,7.0,900,909.0,9.0,0.0,0.0,0900-0959,0,0,150.0,138.0,123.0,1.0,990.0,4,,,,,,0.0,N,0,Chicago,Orlando,2023-07-25 05:30:00,2023-07-25 09:00:00,2023-07-25 05:51:00,2023-07-25 09:09:00,1,559,900,959,59,558
2971759,2023,2,6,9,5,2023-06-09,19393,WN,1126,19393,WN,N8853Q,13198,33198,MCI,MO,11259,30194,DAL,TX,540,540.0,0.0,0.0,0.0,0001-0559,15.0,555.0,659.0,4.0,710,703.0,-7.0,0.0,-1.0,0700-0759,0,0,90.0,83.0,64.0,1.0,461.0,2,,,,,,0.0,N,0,Kansas City,Dallas,2023-06-09 05:40:00,2023-06-09 07:10:00,2023-06-09 05:40:00,2023-06-09 07:03:00,1,559,700,759,59,558
2971756,2023,2,6,9,5,2023-06-09,19393,WN,3352,19393,WN,N400WN,13198,33198,MCI,MO,10821,30852,BWI,MD,530,533.0,3.0,0.0,0.0,0001-0559,12.0,545.0,854.0,2.0,850,856.0,6.0,0.0,0.0,0800-0859,0,0,140.0,143.0,129.0,1.0,967.0,4,,,,,,0.0,N,0,Kansas City,Baltimore,2023-06-09 05:30:00,2023-06-09 08:50:00,2023-06-09 05:33:00,2023-06-09 08:56:00,1,559,800,859,59,558
1631846,2023,3,7,25,2,2023-07-25,19393,WN,3667,19393,WN,N244WN,13232,30977,MDW,IL,13495,33495,MSY,LA,545,539.0,-6.0,0.0,-1.0,0001-0559,10.0,549.0,733.0,4.0,800,737.0,-23.0,0.0,-2.0,0800-0859,0,0,135.0,118.0,104.0,1.0,825.0,4,,,,,,0.0,N,0,Chicago,New Orleans,2023-07-25 05:45:00,2023-07-25 08:00:00,2023-07-25 05:39:00,2023-07-25 07:37:00,1,559,800,859,59,558
2971743,2023,2,6,9,5,2023-06-09,19393,WN,1615,19393,WN,N8550Q,13158,33158,MAF,TX,12191,31453,HOU,TX,530,,-7.0,0.0,-1.0,0001-0559,,,,,655,,,,,0600-0659,1,0,85.0,,,1.0,441.0,2,,,,,,0.0,N,0,Midland/Odessa,Houston,2023-06-09 05:30:00,2023-06-09 06:55:00,NaT,NaT,1,559,600,659,59,558


In [73]:
from datetime import time

after_midnight_flights = df_dt_fixed[df_dt_fixed['scheduled_departure_datetime'].dt.time > time(0, 0)]
after_midnight_flights.sort_values('scheduled_departure_datetime').head()

Unnamed: 0,Year,Quarter,Month,DayofMonth,DayOfWeek,FlightDate,DOT_ID_Marketing_Airline,IATA_Code_Marketing_Airline,Flight_Number_Marketing_Airline,DOT_ID_Operating_Airline,IATA_Code_Operating_Airline,Tail_Number,OriginAirportID,OriginCityMarketID,Origin,OriginState,DestAirportID,DestCityMarketID,Dest,DestState,CRSDepTime,DepTime,DepDelay,DepDel15,DepartureDelayGroups,DepTimeBlk,TaxiOut,WheelsOff,WheelsOn,TaxiIn,CRSArrTime,ArrTime,ArrDelay,ArrDel15,ArrivalDelayGroups,ArrTimeBlk,Cancelled,Diverted,CRSElapsedTime,ActualElapsedTime,AirTime,Flights,Distance,DistanceGroup,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,DivAirportLandings,Duplicate,Code_Share_Flight,Origin_City,Destination_City,scheduled_departure_datetime,scheduled_arrival_datetime,actual_departure_datetime,actual_arrival_datetime,DeptTimeBlk_Start,DeptTimeBlk_End,ArrTimeBlk_Start,ArrTimeBlk_End,ArrivalBlock_Total,DepartureBlock_Total
5798101,2023,1,1,1,7,2023-01-01,19930,AS,150,19930,AS,N588AS,10299,30299,ANC,AK,12892,32575,LAX,CA,5,2357.0,-8.0,0.0,-1.0,0001-0559,12.0,9.0,557.0,17.0,620,614.0,-6.0,0.0,-1.0,0600-0659,0,0,315.0,317.0,288.0,1.0,2345.0,10,,,,,,0.0,N,0,Anchorage,Los Angeles,2023-01-01 00:05:00,2023-01-01 06:20:00,2022-12-31 23:57:00,2023-01-01 06:14:00,1,559,600,659,59,558
5634850,2023,1,1,1,7,2023-01-01,20436,F9,1524,20436,F9,N381FR,14771,32457,SFO,CA,10397,30397,ATL,GA,9,130.0,81.0,1.0,5.0,0001-0559,12.0,142.0,843.0,15.0,749,858.0,69.0,1.0,4.0,0700-0759,0,0,280.0,268.0,241.0,1.0,2139.0,9,49.0,0.0,0.0,0.0,20.0,0.0,N,0,San Francisco,Atlanta,2023-01-01 00:09:00,2023-01-01 07:49:00,2023-01-01 01:30:00,2023-01-01 08:58:00,1,559,700,759,59,558
5823363,2023,1,1,1,7,2023-01-01,20409,B6,956,20409,B6,N594JB,11292,30325,DEN,CO,12953,31703,LGA,NY,20,18.0,-2.0,0.0,-1.0,0001-0559,11.0,29.0,527.0,7.0,558,534.0,-24.0,0.0,-2.0,0001-0559,0,0,218.0,196.0,178.0,1.0,1620.0,7,,,,,,0.0,N,0,Denver,New York,2023-01-01 00:20:00,2023-01-01 05:58:00,2023-01-01 00:18:00,2023-01-01 05:34:00,1,559,1,559,558,558
5719531,2023,1,1,1,7,2023-01-01,19805,AA,500,19805,AA,N457AM,11292,30325,DEN,CO,13303,32467,MIA,FL,20,18.0,-2.0,0.0,-1.0,0001-0559,12.0,30.0,540.0,4.0,607,544.0,-23.0,0.0,-2.0,0600-0659,0,0,227.0,206.0,190.0,1.0,1709.0,7,,,,,,0.0,N,0,Denver,Miami,2023-01-01 00:20:00,2023-01-01 06:07:00,2023-01-01 00:18:00,2023-01-01 05:44:00,1,559,600,659,59,558
5610819,2023,1,1,1,7,2023-01-01,19790,DL,1099,19790,DL,N381DZ,12892,32575,LAX,CA,13487,31650,MSP,MN,20,13.0,-7.0,0.0,-1.0,0001-0559,11.0,24.0,515.0,9.0,603,524.0,-39.0,0.0,-2.0,0600-0659,0,0,223.0,191.0,171.0,1.0,1535.0,7,,,,,,0.0,N,0,Los Angeles,Minneapolis,2023-01-01 00:20:00,2023-01-01 06:03:00,2023-01-01 00:13:00,2023-01-01 05:24:00,1,559,600,659,59,558


Flights that depart or arrive after midnight and before 6AM have 558 minutes blocked for their departure/arrival time. All other flights only have 59 minutes for their scheduled block. This is likely due to allow for logistics and scheduling for airport operations for these overnight/really early flights. It may be helpful to categorize flights as overnight departure or overnight arrival events compared to all other flights. 

In [74]:
overnight_departures = (df_dt_fixed['DepartureBlock_Total'] > 60)
overnight_arrivals = (df_dt_fixed['ArrivalBlock_Total'] > 60)

In [75]:
df_dt_fixed['Overnight_Depature'] = overnight_departures.astype(int)
df_dt_fixed['Overnight_Arrival'] = overnight_arrivals.astype(int)

In [76]:
df_dt_fixed.head()

Unnamed: 0,Year,Quarter,Month,DayofMonth,DayOfWeek,FlightDate,DOT_ID_Marketing_Airline,IATA_Code_Marketing_Airline,Flight_Number_Marketing_Airline,DOT_ID_Operating_Airline,IATA_Code_Operating_Airline,Tail_Number,OriginAirportID,OriginCityMarketID,Origin,OriginState,DestAirportID,DestCityMarketID,Dest,DestState,CRSDepTime,DepTime,DepDelay,DepDel15,DepartureDelayGroups,DepTimeBlk,TaxiOut,WheelsOff,WheelsOn,TaxiIn,CRSArrTime,ArrTime,ArrDelay,ArrDel15,ArrivalDelayGroups,ArrTimeBlk,Cancelled,Diverted,CRSElapsedTime,ActualElapsedTime,AirTime,Flights,Distance,DistanceGroup,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,DivAirportLandings,Duplicate,Code_Share_Flight,Origin_City,Destination_City,scheduled_departure_datetime,scheduled_arrival_datetime,actual_departure_datetime,actual_arrival_datetime,DeptTimeBlk_Start,DeptTimeBlk_End,ArrTimeBlk_Start,ArrTimeBlk_End,ArrivalBlock_Total,DepartureBlock_Total,Overnight_Depature,Overnight_Arrival
0,2023,3,8,6,7,2023-08-06,20416,NK,2252,20416,NK,N978NK,12889,32211,LAS,NV,11057,31057,CLT,NC,2359,,,,,2300-2359,,,,,722,,,,,0700-0759,1,0,263.0,,,1.0,1916.0,8,,,,,,0.0,N,0,Las Vegas,Charlotte,2023-08-06 23:59:00,2023-08-07 07:22:00,NaT,NaT,2300,2359,700,759,59,59,0,0
1,2023,3,8,7,1,2023-08-07,20416,NK,2252,20416,NK,N974NK,12889,32211,LAS,NV,11057,31057,CLT,NC,2359,115.0,76.0,1.0,5.0,2300-2359,19.0,134.0,822.0,9.0,722,831.0,69.0,1.0,4.0,0700-0759,0,0,263.0,256.0,228.0,1.0,1916.0,8,3.0,0.0,1.0,0.0,65.0,0.0,N,0,Las Vegas,Charlotte,2023-08-07 23:59:00,2023-08-08 07:22:00,2023-08-08 01:15:00,2023-08-08 08:31:00,2300,2359,700,759,59,59,0,0
2,2023,3,8,9,3,2023-08-09,20416,NK,2252,20416,NK,N519NK,12889,32211,LAS,NV,11057,31057,CLT,NC,2310,2259.0,-11.0,0.0,-1.0,2300-2359,14.0,2313.0,605.0,10.0,628,615.0,-13.0,0.0,-1.0,0600-0659,0,0,258.0,256.0,232.0,1.0,1916.0,8,,,,,,0.0,N,0,Las Vegas,Charlotte,2023-08-09 23:10:00,2023-08-10 06:28:00,2023-08-09 22:59:00,2023-08-10 06:15:00,2300,2359,600,659,59,59,0,0
3,2023,3,8,10,4,2023-08-10,20416,NK,2252,20416,NK,N532NK,12889,32211,LAS,NV,11057,31057,CLT,NC,2310,2302.0,-8.0,0.0,-1.0,2300-2359,12.0,2314.0,551.0,7.0,628,558.0,-30.0,0.0,-2.0,0600-0659,0,0,258.0,236.0,217.0,1.0,1916.0,8,,,,,,0.0,N,0,Las Vegas,Charlotte,2023-08-10 23:10:00,2023-08-11 06:28:00,2023-08-10 23:02:00,2023-08-11 05:58:00,2300,2359,600,659,59,59,0,0
4,2023,3,8,12,6,2023-08-12,20416,NK,2252,20416,NK,N529NK,12889,32211,LAS,NV,11057,31057,CLT,NC,2310,2314.0,4.0,0.0,0.0,2300-2359,18.0,2332.0,618.0,6.0,628,624.0,-4.0,0.0,-1.0,0600-0659,0,0,258.0,250.0,226.0,1.0,1916.0,8,,,,,,0.0,N,0,Las Vegas,Charlotte,2023-08-12 23:10:00,2023-08-13 06:28:00,2023-08-12 23:14:00,2023-08-13 06:24:00,2300,2359,600,659,59,59,0,0


In [77]:
flight_perf_to_drop = ['FlightDate', 'CRSDepTime', 'DepTime', 'WheelsOff', 'WheelsOn', 'CRSArrTime', 'ArrTime',
                       'DepTimeBlk', 'ArrTimeBlk', 'DeptTimeBlk_Start', 'DeptTimeBlk_End', 'ArrTimeBlk_Start', 'ArrTimeBlk_End', 'ArrivalBlock_Total', 'DepartureBlock_Total']
flights_reduced = df_dt_fixed.drop(columns=flight_perf_to_drop)

In [78]:
flights_reduced.head()

Unnamed: 0,Year,Quarter,Month,DayofMonth,DayOfWeek,DOT_ID_Marketing_Airline,IATA_Code_Marketing_Airline,Flight_Number_Marketing_Airline,DOT_ID_Operating_Airline,IATA_Code_Operating_Airline,Tail_Number,OriginAirportID,OriginCityMarketID,Origin,OriginState,DestAirportID,DestCityMarketID,Dest,DestState,DepDelay,DepDel15,DepartureDelayGroups,TaxiOut,TaxiIn,ArrDelay,ArrDel15,ArrivalDelayGroups,Cancelled,Diverted,CRSElapsedTime,ActualElapsedTime,AirTime,Flights,Distance,DistanceGroup,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,DivAirportLandings,Duplicate,Code_Share_Flight,Origin_City,Destination_City,scheduled_departure_datetime,scheduled_arrival_datetime,actual_departure_datetime,actual_arrival_datetime,Overnight_Depature,Overnight_Arrival
0,2023,3,8,6,7,20416,NK,2252,20416,NK,N978NK,12889,32211,LAS,NV,11057,31057,CLT,NC,,,,,,,,,1,0,263.0,,,1.0,1916.0,8,,,,,,0.0,N,0,Las Vegas,Charlotte,2023-08-06 23:59:00,2023-08-07 07:22:00,NaT,NaT,0,0
1,2023,3,8,7,1,20416,NK,2252,20416,NK,N974NK,12889,32211,LAS,NV,11057,31057,CLT,NC,76.0,1.0,5.0,19.0,9.0,69.0,1.0,4.0,0,0,263.0,256.0,228.0,1.0,1916.0,8,3.0,0.0,1.0,0.0,65.0,0.0,N,0,Las Vegas,Charlotte,2023-08-07 23:59:00,2023-08-08 07:22:00,2023-08-08 01:15:00,2023-08-08 08:31:00,0,0
2,2023,3,8,9,3,20416,NK,2252,20416,NK,N519NK,12889,32211,LAS,NV,11057,31057,CLT,NC,-11.0,0.0,-1.0,14.0,10.0,-13.0,0.0,-1.0,0,0,258.0,256.0,232.0,1.0,1916.0,8,,,,,,0.0,N,0,Las Vegas,Charlotte,2023-08-09 23:10:00,2023-08-10 06:28:00,2023-08-09 22:59:00,2023-08-10 06:15:00,0,0
3,2023,3,8,10,4,20416,NK,2252,20416,NK,N532NK,12889,32211,LAS,NV,11057,31057,CLT,NC,-8.0,0.0,-1.0,12.0,7.0,-30.0,0.0,-2.0,0,0,258.0,236.0,217.0,1.0,1916.0,8,,,,,,0.0,N,0,Las Vegas,Charlotte,2023-08-10 23:10:00,2023-08-11 06:28:00,2023-08-10 23:02:00,2023-08-11 05:58:00,0,0
4,2023,3,8,12,6,20416,NK,2252,20416,NK,N529NK,12889,32211,LAS,NV,11057,31057,CLT,NC,4.0,0.0,0.0,18.0,6.0,-4.0,0.0,-1.0,0,0,258.0,250.0,226.0,1.0,1916.0,8,,,,,,0.0,N,0,Las Vegas,Charlotte,2023-08-12 23:10:00,2023-08-13 06:28:00,2023-08-12 23:14:00,2023-08-13 06:24:00,0,0


Ok all of the time columns have been converted. Now all the times and dates are in local times for each airport. I will need to add timezone information after getting the timezones for each airport. 

## Check for duplicates

In [79]:
flights_reduced.duplicated().sum()

0

In [80]:
flights_reduced['Duplicate'].value_counts()

Duplicate
N    7260941
Name: count, dtype: int64

In [81]:
flights_reduced = flights_reduced.drop(columns='Duplicate')

## Check column values

In [82]:
flights_reduced['Flights'].value_counts()

Flights
1.0    7260941
Name: count, dtype: int64

The `Flights` column only contains the same value across all rows and is therefore not providing any new information.

In [83]:
flights_reduced.drop(columns='Flights', inplace=True)

In [84]:
# check unique values for day of
np.sort(flights_reduced['DayOfWeek'].unique())

array([1, 2, 3, 4, 5, 6, 7])

There are only 7 unique values for day of week, as I would expect. According to the data dictionary, 1 = Monday and 7 = Sunday. They reserve 9 for unknown day of week, however, there are no occurences of an unknown day of week in this dataset. 

In [85]:
flights_reduced.head()

Unnamed: 0,Year,Quarter,Month,DayofMonth,DayOfWeek,DOT_ID_Marketing_Airline,IATA_Code_Marketing_Airline,Flight_Number_Marketing_Airline,DOT_ID_Operating_Airline,IATA_Code_Operating_Airline,Tail_Number,OriginAirportID,OriginCityMarketID,Origin,OriginState,DestAirportID,DestCityMarketID,Dest,DestState,DepDelay,DepDel15,DepartureDelayGroups,TaxiOut,TaxiIn,ArrDelay,ArrDel15,ArrivalDelayGroups,Cancelled,Diverted,CRSElapsedTime,ActualElapsedTime,AirTime,Distance,DistanceGroup,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,DivAirportLandings,Code_Share_Flight,Origin_City,Destination_City,scheduled_departure_datetime,scheduled_arrival_datetime,actual_departure_datetime,actual_arrival_datetime,Overnight_Depature,Overnight_Arrival
0,2023,3,8,6,7,20416,NK,2252,20416,NK,N978NK,12889,32211,LAS,NV,11057,31057,CLT,NC,,,,,,,,,1,0,263.0,,,1916.0,8,,,,,,0.0,0,Las Vegas,Charlotte,2023-08-06 23:59:00,2023-08-07 07:22:00,NaT,NaT,0,0
1,2023,3,8,7,1,20416,NK,2252,20416,NK,N974NK,12889,32211,LAS,NV,11057,31057,CLT,NC,76.0,1.0,5.0,19.0,9.0,69.0,1.0,4.0,0,0,263.0,256.0,228.0,1916.0,8,3.0,0.0,1.0,0.0,65.0,0.0,0,Las Vegas,Charlotte,2023-08-07 23:59:00,2023-08-08 07:22:00,2023-08-08 01:15:00,2023-08-08 08:31:00,0,0
2,2023,3,8,9,3,20416,NK,2252,20416,NK,N519NK,12889,32211,LAS,NV,11057,31057,CLT,NC,-11.0,0.0,-1.0,14.0,10.0,-13.0,0.0,-1.0,0,0,258.0,256.0,232.0,1916.0,8,,,,,,0.0,0,Las Vegas,Charlotte,2023-08-09 23:10:00,2023-08-10 06:28:00,2023-08-09 22:59:00,2023-08-10 06:15:00,0,0
3,2023,3,8,10,4,20416,NK,2252,20416,NK,N532NK,12889,32211,LAS,NV,11057,31057,CLT,NC,-8.0,0.0,-1.0,12.0,7.0,-30.0,0.0,-2.0,0,0,258.0,236.0,217.0,1916.0,8,,,,,,0.0,0,Las Vegas,Charlotte,2023-08-10 23:10:00,2023-08-11 06:28:00,2023-08-10 23:02:00,2023-08-11 05:58:00,0,0
4,2023,3,8,12,6,20416,NK,2252,20416,NK,N529NK,12889,32211,LAS,NV,11057,31057,CLT,NC,4.0,0.0,0.0,18.0,6.0,-4.0,0.0,-1.0,0,0,258.0,250.0,226.0,1916.0,8,,,,,,0.0,0,Las Vegas,Charlotte,2023-08-12 23:10:00,2023-08-13 06:28:00,2023-08-12 23:14:00,2023-08-13 06:24:00,0,0


# Review IATA Codes Data

I'll inpsect the IATA Codes dataset and fill in the Marketing and Operating Airline codes with the airline names for readability.

In [86]:
df_codes.head()

Unnamed: 0,Code,Description
0,19031,Mackey International Inc.: MAC
1,19032,Munz Northern Airlines Inc.: XY
2,19033,Cochise Airlines Inc.: COC
3,19034,Golden Gate Airlines Inc.: GSA
4,19035,Aeromech Inc.: RZZ


In [87]:
df_codes.shape

(1737, 2)

In [88]:
df_codes['Airline_Name'] = df_codes['Description'].str.split(':').str[0].str.strip()
df_codes['IATA_Code'] = df_codes['Description'].str.split(':').str[1].str.strip()

In [89]:
df_codes.head()

Unnamed: 0,Code,Description,Airline_Name,IATA_Code
0,19031,Mackey International Inc.: MAC,Mackey International Inc.,MAC
1,19032,Munz Northern Airlines Inc.: XY,Munz Northern Airlines Inc.,XY
2,19033,Cochise Airlines Inc.: COC,Cochise Airlines Inc.,COC
3,19034,Golden Gate Airlines Inc.: GSA,Golden Gate Airlines Inc.,GSA
4,19035,Aeromech Inc.: RZZ,Aeromech Inc.,RZZ


In [90]:
duplicate_iata = df_codes[df_codes['IATA_Code'].duplicated(keep=False)]
sorted_duplicates = duplicate_iata.sort_values(by='IATA_Code')
sorted_duplicates

Unnamed: 0,Code,Description,Airline_Name,IATA_Code


## Replace IATA Codes in flights df with airline names

In [91]:
# create dictionary of codes and airlines
iata_dict = df_codes.set_index('IATA_Code')['Airline_Name'].to_dict()
iata_dict

{'MAC': 'Mackey International Inc.',
 'XY': 'Munz Northern Airlines Inc.',
 'COC': 'Cochise Airlines Inc.',
 'GSA': 'Golden Gate Airlines Inc.',
 'RZZ': 'Aeromech Inc.',
 'GLW': 'Golden West Airlines Co.',
 'PRN': 'Puerto Rico Intl Airlines',
 'STZ': 'Air America Inc.',
 'SWT': 'Swift Aire Lines Inc.',
 'TSF': 'American Central Airlines',
 'VEZ': 'Valdez Airlines',
 'WEB': 'Southeast Alaska Airlines',
 'AAR': 'Altair Airlines Inc.',
 'CHI': 'Chitina Air Service',
 'MRC': 'Marco Island Airways Inc.',
 'OHZ': 'Caribbean Air Services Inc.',
 'PRO': 'Sundance Airlines',
 'SAI': 'Seair Alaska Airlines Inc.',
 'SLZ': 'Southeast Airlines Inc.',
 'AAZ': 'Alaska Aeronautical Indust.',
 'IMP': 'Imperial Airlines Inc.',
 'TWA': 'Trans Western Airlines Utah',
 'WRT': 'Wright Airlines Inc.',
 'COL': 'Presidential Express',
 'MVA': 'Mississippi Valley Airlines',
 'CHL': 'Channel Flying Inc.',
 'JCZ': 'Rocky Mountain Airways Inc.',
 'MIS': 'Midstate Airlines Inc.',
 'STG': 'Sedalia Marshall Boonvl St

In [92]:
flights_iata = flights_reduced

In [93]:
flights_iata['Airline_Mkt'] = flights_iata['IATA_Code_Marketing_Airline'].map(iata_dict)
flights_iata['Airline_Ops'] = flights_iata['IATA_Code_Operating_Airline'].map(iata_dict)

In [94]:
flights_iata[['Airline_Mkt']].isna().sum()

Airline_Mkt    0
dtype: int64

In [95]:
flights_iata[['Airline_Ops']].isna().sum()

Airline_Ops    0
dtype: int64

## Merge IATA Code data to flight performance data

In [96]:
# drop IATA codes because we have a dictionary with these details if needed
flights_iata_clean = flights_iata.drop(columns=['IATA_Code_Marketing_Airline', 'IATA_Code_Operating_Airline'])

In [97]:
flights_iata_clean.head()

Unnamed: 0,Year,Quarter,Month,DayofMonth,DayOfWeek,DOT_ID_Marketing_Airline,Flight_Number_Marketing_Airline,DOT_ID_Operating_Airline,Tail_Number,OriginAirportID,OriginCityMarketID,Origin,OriginState,DestAirportID,DestCityMarketID,Dest,DestState,DepDelay,DepDel15,DepartureDelayGroups,TaxiOut,TaxiIn,ArrDelay,ArrDel15,ArrivalDelayGroups,Cancelled,Diverted,CRSElapsedTime,ActualElapsedTime,AirTime,Distance,DistanceGroup,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,DivAirportLandings,Code_Share_Flight,Origin_City,Destination_City,scheduled_departure_datetime,scheduled_arrival_datetime,actual_departure_datetime,actual_arrival_datetime,Overnight_Depature,Overnight_Arrival,Airline_Mkt,Airline_Ops
0,2023,3,8,6,7,20416,2252,20416,N978NK,12889,32211,LAS,NV,11057,31057,CLT,NC,,,,,,,,,1,0,263.0,,,1916.0,8,,,,,,0.0,0,Las Vegas,Charlotte,2023-08-06 23:59:00,2023-08-07 07:22:00,NaT,NaT,0,0,Spirit Air Lines,Spirit Air Lines
1,2023,3,8,7,1,20416,2252,20416,N974NK,12889,32211,LAS,NV,11057,31057,CLT,NC,76.0,1.0,5.0,19.0,9.0,69.0,1.0,4.0,0,0,263.0,256.0,228.0,1916.0,8,3.0,0.0,1.0,0.0,65.0,0.0,0,Las Vegas,Charlotte,2023-08-07 23:59:00,2023-08-08 07:22:00,2023-08-08 01:15:00,2023-08-08 08:31:00,0,0,Spirit Air Lines,Spirit Air Lines
2,2023,3,8,9,3,20416,2252,20416,N519NK,12889,32211,LAS,NV,11057,31057,CLT,NC,-11.0,0.0,-1.0,14.0,10.0,-13.0,0.0,-1.0,0,0,258.0,256.0,232.0,1916.0,8,,,,,,0.0,0,Las Vegas,Charlotte,2023-08-09 23:10:00,2023-08-10 06:28:00,2023-08-09 22:59:00,2023-08-10 06:15:00,0,0,Spirit Air Lines,Spirit Air Lines
3,2023,3,8,10,4,20416,2252,20416,N532NK,12889,32211,LAS,NV,11057,31057,CLT,NC,-8.0,0.0,-1.0,12.0,7.0,-30.0,0.0,-2.0,0,0,258.0,236.0,217.0,1916.0,8,,,,,,0.0,0,Las Vegas,Charlotte,2023-08-10 23:10:00,2023-08-11 06:28:00,2023-08-10 23:02:00,2023-08-11 05:58:00,0,0,Spirit Air Lines,Spirit Air Lines
4,2023,3,8,12,6,20416,2252,20416,N529NK,12889,32211,LAS,NV,11057,31057,CLT,NC,4.0,0.0,0.0,18.0,6.0,-4.0,0.0,-1.0,0,0,258.0,250.0,226.0,1916.0,8,,,,,,0.0,0,Las Vegas,Charlotte,2023-08-12 23:10:00,2023-08-13 06:28:00,2023-08-12 23:14:00,2023-08-13 06:24:00,0,0,Spirit Air Lines,Spirit Air Lines


# Import geographical data for airports

https://www.transtats.bts.gov/DL_SelectFields.aspx?gnoyr_VQ=FLL&QO_fu146_anzr=N8vn6v10%20f722146%20gnoyr5

In [98]:
df_airports = pd.read_csv(DATA_PATH + '/raw/T_MASTER_CORD.csv')

In [99]:
df_airports.head()

Unnamed: 0,AIRPORT_SEQ_ID,AIRPORT_ID,AIRPORT,DISPLAY_AIRPORT_NAME,DISPLAY_AIRPORT_CITY_NAME_FULL,AIRPORT_WAC,AIRPORT_COUNTRY_NAME,AIRPORT_COUNTRY_CODE_ISO,AIRPORT_STATE_NAME,AIRPORT_STATE_CODE,AIRPORT_STATE_FIPS,CITY_MARKET_ID,DISPLAY_CITY_MARKET_NAME_FULL,CITY_MARKET_WAC,LAT_DEGREES,LAT_HEMISPHERE,LAT_MINUTES,LAT_SECONDS,LATITUDE,LON_DEGREES,LON_HEMISPHERE,LON_MINUTES,LON_SECONDS,LONGITUDE,AIRPORT_START_DATE,AIRPORT_THRU_DATE,AIRPORT_IS_CLOSED,AIRPORT_IS_LATEST
0,1000101,10001,01A,Afognak Lake Airport,"Afognak Lake, AK",1,United States,US,Alaska,AK,2.0,30001,"Afognak Lake, AK",1,58.0,N,6.0,34.0,58.109444,152.0,W,54.0,24.0,-152.906667,7/1/2007 12:00:00 AM,,0,1
1,1000301,10003,03A,Bear Creek Mining Strip,"Granite Mountain, AK",1,United States,US,Alaska,AK,2.0,30003,"Granite Mountain, AK",1,65.0,N,32.0,53.0,65.548056,161.0,W,4.0,18.0,-161.071667,7/1/2007 12:00:00 AM,,0,1
2,1000401,10004,04A,Lik Mining Camp,"Lik, AK",1,United States,US,Alaska,AK,2.0,30004,"Lik, AK",1,68.0,N,5.0,0.0,68.083333,163.0,W,10.0,0.0,-163.166667,7/1/2007 12:00:00 AM,,0,1
3,1000501,10005,05A,Little Squaw Airport,"Little Squaw, AK",1,United States,US,Alaska,AK,2.0,30005,"Little Squaw, AK",1,67.0,N,34.0,12.0,67.57,148.0,W,11.0,2.0,-148.183889,8/1/2007 12:00:00 AM,,0,1
4,1000601,10006,06A,Kizhuyak Bay,"Kizhuyak, AK",1,United States,US,Alaska,AK,2.0,30006,"Kizhuyak, AK",1,57.0,N,44.0,43.0,57.745278,152.0,W,52.0,58.0,-152.882778,10/1/2007 12:00:00 AM,,0,1


In [100]:
df_airports['AIRPORT_COUNTRY_NAME'].value_counts()

AIRPORT_COUNTRY_NAME
United States     7293
Canada             935
Australia          581
Brazil             458
France             449
                  ... 
Carriacou            1
Wake Island          1
Kyrgyzstan           1
Norfolk Island       1
Macau                1
Name: count, Length: 256, dtype: int64

In [101]:
df_airports.shape

(19197, 28)

We only need US airports for the purpose of this project. I will drop all rows for international airports.

In [102]:
df_airports = df_airports[df_airports['AIRPORT_COUNTRY_NAME'] == 'United States']

In [103]:
df_airports.columns

Index(['AIRPORT_SEQ_ID', 'AIRPORT_ID', 'AIRPORT', 'DISPLAY_AIRPORT_NAME',
       'DISPLAY_AIRPORT_CITY_NAME_FULL', 'AIRPORT_WAC', 'AIRPORT_COUNTRY_NAME',
       'AIRPORT_COUNTRY_CODE_ISO', 'AIRPORT_STATE_NAME', 'AIRPORT_STATE_CODE',
       'AIRPORT_STATE_FIPS', 'CITY_MARKET_ID', 'DISPLAY_CITY_MARKET_NAME_FULL',
       'CITY_MARKET_WAC', 'LAT_DEGREES', 'LAT_HEMISPHERE', 'LAT_MINUTES',
       'LAT_SECONDS', 'LATITUDE', 'LON_DEGREES', 'LON_HEMISPHERE',
       'LON_MINUTES', 'LON_SECONDS', 'LONGITUDE', 'AIRPORT_START_DATE',
       'AIRPORT_THRU_DATE', 'AIRPORT_IS_CLOSED', 'AIRPORT_IS_LATEST'],
      dtype='object')

In [104]:
df_airports.drop(columns=[
    'AIRPORT_COUNTRY_NAME', 
    'AIRPORT_COUNTRY_CODE_ISO', 
    'LAT_DEGREES', 
    'LAT_HEMISPHERE', 
    'LAT_MINUTES',
    'LAT_SECONDS', 
    'LON_DEGREES', 
    'LON_HEMISPHERE',
    'LON_MINUTES', 
    'LON_SECONDS'
],
inplace=True)

In [105]:
df_airports.head()

Unnamed: 0,AIRPORT_SEQ_ID,AIRPORT_ID,AIRPORT,DISPLAY_AIRPORT_NAME,DISPLAY_AIRPORT_CITY_NAME_FULL,AIRPORT_WAC,AIRPORT_STATE_NAME,AIRPORT_STATE_CODE,AIRPORT_STATE_FIPS,CITY_MARKET_ID,DISPLAY_CITY_MARKET_NAME_FULL,CITY_MARKET_WAC,LATITUDE,LONGITUDE,AIRPORT_START_DATE,AIRPORT_THRU_DATE,AIRPORT_IS_CLOSED,AIRPORT_IS_LATEST
0,1000101,10001,01A,Afognak Lake Airport,"Afognak Lake, AK",1,Alaska,AK,2.0,30001,"Afognak Lake, AK",1,58.109444,-152.906667,7/1/2007 12:00:00 AM,,0,1
1,1000301,10003,03A,Bear Creek Mining Strip,"Granite Mountain, AK",1,Alaska,AK,2.0,30003,"Granite Mountain, AK",1,65.548056,-161.071667,7/1/2007 12:00:00 AM,,0,1
2,1000401,10004,04A,Lik Mining Camp,"Lik, AK",1,Alaska,AK,2.0,30004,"Lik, AK",1,68.083333,-163.166667,7/1/2007 12:00:00 AM,,0,1
3,1000501,10005,05A,Little Squaw Airport,"Little Squaw, AK",1,Alaska,AK,2.0,30005,"Little Squaw, AK",1,67.57,-148.183889,8/1/2007 12:00:00 AM,,0,1
4,1000601,10006,06A,Kizhuyak Bay,"Kizhuyak, AK",1,Alaska,AK,2.0,30006,"Kizhuyak, AK",1,57.745278,-152.882778,10/1/2007 12:00:00 AM,,0,1


In [106]:
df_airports['AIRPORT'].nunique()

2964

In [107]:
len(df_airports['AIRPORT'])

7293

In [108]:
df_airports.sort_values('AIRPORT', ascending=False).head(20)

Unnamed: 0,AIRPORT_SEQ_ID,AIRPORT_ID,AIRPORT,DISPLAY_AIRPORT_NAME,DISPLAY_AIRPORT_CITY_NAME_FULL,AIRPORT_WAC,AIRPORT_STATE_NAME,AIRPORT_STATE_CODE,AIRPORT_STATE_FIPS,CITY_MARKET_ID,DISPLAY_CITY_MARKET_NAME_FULL,CITY_MARKET_WAC,LATITUDE,LONGITUDE,AIRPORT_START_DATE,AIRPORT_THRU_DATE,AIRPORT_IS_CLOSED,AIRPORT_IS_LATEST
19196,9999901,99999,ZZZ,Unknown Point in Alaska,Unknown Point in Alaska,1,Alaska,AK,2.0,99999,Unknown Point in Alaska,1,,,1/1/1950 12:00:00 AM,,0,1
18104,1636101,16361,ZZV,Zanesville Municipal,"Zanesville, OH",44,Ohio,OH,39.0,36361,"Zanesville, OH",44,39.942778,-81.8925,1/1/1950 12:00:00 AM,6/30/2011 12:00:00 AM,0,0
18108,1636105,16361,ZZV,Zanesville Municipal,"Zanesville, OH",44,Ohio,OH,39.0,36361,"Zanesville, OH",44,39.944444,-81.891944,12/1/2017 12:00:00 AM,9/30/2022 12:00:00 AM,0,0
18107,1636104,16361,ZZV,Zanesville Municipal,"Zanesville, OH",44,Ohio,OH,39.0,36361,"Zanesville, OH",44,39.944444,-81.892222,11/1/2017 12:00:00 AM,11/30/2017 12:00:00 AM,0,0
18106,1636103,16361,ZZV,Zanesville Municipal,"Zanesville, OH",44,Ohio,OH,39.0,36361,"Zanesville, OH",44,39.944444,-81.891944,10/1/2017 12:00:00 AM,10/31/2017 12:00:00 AM,0,0
18105,1636102,16361,ZZV,Zanesville Municipal,"Zanesville, OH",44,Ohio,OH,39.0,36361,"Zanesville, OH",44,39.944444,-81.892222,7/1/2011 12:00:00 AM,9/30/2017 12:00:00 AM,0,0
18109,1636106,16361,ZZV,Zanesville Municipal,"Zanesville, OH",44,Ohio,OH,39.0,36361,"Zanesville, OH",44,39.944444,-81.892222,10/1/2022 12:00:00 AM,,0,1
18097,1635802,16358,ZXZ,Waterville Airport,"Waterville, WA",93,Washington,WA,53.0,36358,"Waterville, WA",93,47.656111,-120.056389,7/1/2011 12:00:00 AM,,0,1
18096,1635801,16358,ZXZ,Waterville Airport,"Waterville, WA",93,Washington,WA,53.0,36358,"Waterville, WA",93,47.654167,-120.054167,3/1/1990 12:00:00 AM,6/30/2011 12:00:00 AM,0,0
18095,1635703,16357,ZXY,Blake Field,"Delta, CO",82,Colorado,CO,8.0,36357,"Delta, CO",82,38.785556,-108.061944,7/1/2019 12:00:00 AM,,0,1


An airport can have several records based on slight changes such as their registered longitude and latitude. I only want to keep active airports and their latest location. All other airport records will not be necesary at this time. 

In [109]:
current_airports = (df_airports['AIRPORT_IS_LATEST'] == 1) & (df_airports['AIRPORT_IS_CLOSED'] == 0)
df_airports[current_airports]['AIRPORT'].nunique()

2804

In [110]:
airports_clean = df_airports[current_airports].dropna(subset=['LATITUDE', 'LONGITUDE'])

### Adding timezone information for each airport

In [111]:
from timezonefinder import TimezoneFinder


In [112]:
def assign_timezones(df, lon_col, lat_col):
    ''''''
    tf = TimezoneFinder()

    df['TIMEZONE'] = df.apply(lambda row: tf.timezone_at(lng=row[lon_col], lat=row[lat_col]), axis=1)
    return df

In [113]:
airports = assign_timezones(airports_clean, 'LONGITUDE', 'LATITUDE')

In [114]:
airports.head()

Unnamed: 0,AIRPORT_SEQ_ID,AIRPORT_ID,AIRPORT,DISPLAY_AIRPORT_NAME,DISPLAY_AIRPORT_CITY_NAME_FULL,AIRPORT_WAC,AIRPORT_STATE_NAME,AIRPORT_STATE_CODE,AIRPORT_STATE_FIPS,CITY_MARKET_ID,DISPLAY_CITY_MARKET_NAME_FULL,CITY_MARKET_WAC,LATITUDE,LONGITUDE,AIRPORT_START_DATE,AIRPORT_THRU_DATE,AIRPORT_IS_CLOSED,AIRPORT_IS_LATEST,TIMEZONE
0,1000101,10001,01A,Afognak Lake Airport,"Afognak Lake, AK",1,Alaska,AK,2.0,30001,"Afognak Lake, AK",1,58.109444,-152.906667,7/1/2007 12:00:00 AM,,0,1,America/Anchorage
1,1000301,10003,03A,Bear Creek Mining Strip,"Granite Mountain, AK",1,Alaska,AK,2.0,30003,"Granite Mountain, AK",1,65.548056,-161.071667,7/1/2007 12:00:00 AM,,0,1,America/Anchorage
2,1000401,10004,04A,Lik Mining Camp,"Lik, AK",1,Alaska,AK,2.0,30004,"Lik, AK",1,68.083333,-163.166667,7/1/2007 12:00:00 AM,,0,1,America/Nome
3,1000501,10005,05A,Little Squaw Airport,"Little Squaw, AK",1,Alaska,AK,2.0,30005,"Little Squaw, AK",1,67.57,-148.183889,8/1/2007 12:00:00 AM,,0,1,America/Anchorage
4,1000601,10006,06A,Kizhuyak Bay,"Kizhuyak, AK",1,Alaska,AK,2.0,30006,"Kizhuyak, AK",1,57.745278,-152.882778,10/1/2007 12:00:00 AM,,0,1,America/Anchorage


In [115]:
airports['TIMEZONE'].value_counts()

TIMEZONE
America/New_York                  703
America/Chicago                   675
America/Anchorage                 412
America/Los_Angeles               345
America/Denver                    203
America/Nome                      100
America/Sitka                      86
America/Phoenix                    59
America/Detroit                    53
America/Juneau                     38
America/Indiana/Indianapolis       29
America/Boise                      28
Pacific/Honolulu                   21
America/Puerto_Rico                14
America/Adak                        5
America/St_Thomas                   5
Pacific/Pago_Pago                   4
Pacific/Guam                        3
America/Menominee                   3
America/Kentucky/Louisville         3
Pacific/Saipan                      3
America/Metlakatla                  2
America/Yakutat                     2
America/Indiana/Vincennes           2
Pacific/Wake                        1
Etc/GMT+10                          1
Pac

In [116]:
airports.to_csv(DATA_PATH + '/interim/airports.csv', index=False)

In [117]:
airports_sub = airports[['AIRPORT_ID', 'AIRPORT', 'TIMEZONE']]
airports_sub.head()

Unnamed: 0,AIRPORT_ID,AIRPORT,TIMEZONE
0,10001,01A,America/Anchorage
1,10003,03A,America/Anchorage
2,10004,04A,America/Nome
3,10005,05A,America/Anchorage
4,10006,06A,America/Anchorage


#### Save updated flights df that has timezone corrections and UTC conversion

In [118]:
flights_iata_clean.head()

Unnamed: 0,Year,Quarter,Month,DayofMonth,DayOfWeek,DOT_ID_Marketing_Airline,Flight_Number_Marketing_Airline,DOT_ID_Operating_Airline,Tail_Number,OriginAirportID,OriginCityMarketID,Origin,OriginState,DestAirportID,DestCityMarketID,Dest,DestState,DepDelay,DepDel15,DepartureDelayGroups,TaxiOut,TaxiIn,ArrDelay,ArrDel15,ArrivalDelayGroups,Cancelled,Diverted,CRSElapsedTime,ActualElapsedTime,AirTime,Distance,DistanceGroup,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,DivAirportLandings,Code_Share_Flight,Origin_City,Destination_City,scheduled_departure_datetime,scheduled_arrival_datetime,actual_departure_datetime,actual_arrival_datetime,Overnight_Depature,Overnight_Arrival,Airline_Mkt,Airline_Ops
0,2023,3,8,6,7,20416,2252,20416,N978NK,12889,32211,LAS,NV,11057,31057,CLT,NC,,,,,,,,,1,0,263.0,,,1916.0,8,,,,,,0.0,0,Las Vegas,Charlotte,2023-08-06 23:59:00,2023-08-07 07:22:00,NaT,NaT,0,0,Spirit Air Lines,Spirit Air Lines
1,2023,3,8,7,1,20416,2252,20416,N974NK,12889,32211,LAS,NV,11057,31057,CLT,NC,76.0,1.0,5.0,19.0,9.0,69.0,1.0,4.0,0,0,263.0,256.0,228.0,1916.0,8,3.0,0.0,1.0,0.0,65.0,0.0,0,Las Vegas,Charlotte,2023-08-07 23:59:00,2023-08-08 07:22:00,2023-08-08 01:15:00,2023-08-08 08:31:00,0,0,Spirit Air Lines,Spirit Air Lines
2,2023,3,8,9,3,20416,2252,20416,N519NK,12889,32211,LAS,NV,11057,31057,CLT,NC,-11.0,0.0,-1.0,14.0,10.0,-13.0,0.0,-1.0,0,0,258.0,256.0,232.0,1916.0,8,,,,,,0.0,0,Las Vegas,Charlotte,2023-08-09 23:10:00,2023-08-10 06:28:00,2023-08-09 22:59:00,2023-08-10 06:15:00,0,0,Spirit Air Lines,Spirit Air Lines
3,2023,3,8,10,4,20416,2252,20416,N532NK,12889,32211,LAS,NV,11057,31057,CLT,NC,-8.0,0.0,-1.0,12.0,7.0,-30.0,0.0,-2.0,0,0,258.0,236.0,217.0,1916.0,8,,,,,,0.0,0,Las Vegas,Charlotte,2023-08-10 23:10:00,2023-08-11 06:28:00,2023-08-10 23:02:00,2023-08-11 05:58:00,0,0,Spirit Air Lines,Spirit Air Lines
4,2023,3,8,12,6,20416,2252,20416,N529NK,12889,32211,LAS,NV,11057,31057,CLT,NC,4.0,0.0,0.0,18.0,6.0,-4.0,0.0,-1.0,0,0,258.0,250.0,226.0,1916.0,8,,,,,,0.0,0,Las Vegas,Charlotte,2023-08-12 23:10:00,2023-08-13 06:28:00,2023-08-12 23:14:00,2023-08-13 06:24:00,0,0,Spirit Air Lines,Spirit Air Lines


In [119]:
def add_timezones(flights_df, airports_df):

    origin_merge = pd.merge(flights_df, airports_df, how='left', left_on=['OriginAirportID', 'Origin'], right_on=['AIRPORT_ID', 'AIRPORT'])
    origin_merge.rename(columns={'TIMEZONE':'Origin_Timezone'}, inplace=True)
    origin_merge = origin_merge.drop(columns=['AIRPORT_ID', 'AIRPORT'])
    destination_merge = pd.merge(origin_merge, airports_df, how='left', left_on=['DestAirportID', 'Dest'], right_on=['AIRPORT_ID', 'AIRPORT'])
    destination_merge.rename(columns={'TIMEZONE':'Destination_Timezone'}, inplace=True)
    flights_local_time = destination_merge.drop(columns=['AIRPORT_ID', 'AIRPORT'])

    return flights_local_time

In [120]:
flights_local_time = add_timezones(flights_iata_clean, airports_sub)

In [121]:
flights_local_time[(flights_local_time['OriginState'] == 'WA') & (flights_local_time['DestState'] == 'MN')].head()

Unnamed: 0,Year,Quarter,Month,DayofMonth,DayOfWeek,DOT_ID_Marketing_Airline,Flight_Number_Marketing_Airline,DOT_ID_Operating_Airline,Tail_Number,OriginAirportID,OriginCityMarketID,Origin,OriginState,DestAirportID,DestCityMarketID,Dest,DestState,DepDelay,DepDel15,DepartureDelayGroups,TaxiOut,TaxiIn,ArrDelay,ArrDel15,ArrivalDelayGroups,Cancelled,Diverted,CRSElapsedTime,ActualElapsedTime,AirTime,Distance,DistanceGroup,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,DivAirportLandings,Code_Share_Flight,Origin_City,Destination_City,scheduled_departure_datetime,scheduled_arrival_datetime,actual_departure_datetime,actual_arrival_datetime,Overnight_Depature,Overnight_Arrival,Airline_Mkt,Airline_Ops,Origin_Timezone,Destination_Timezone
171429,2023,3,8,1,2,19790,360,19790,N921DU,14747,30559,SEA,WA,13487,31650,MSP,MN,-1.0,0.0,-1.0,36.0,6.0,-14.0,0.0,-1.0,0,0,212.0,199.0,157.0,1399.0,6,,,,,,0.0,0,Seattle,Minneapolis,2023-08-01 08:11:00,2023-08-01 13:43:00,2023-08-01 08:10:00,2023-08-01 13:29:00,0,0,Delta Air Lines Inc.,Delta Air Lines Inc.,America/Los_Angeles,America/Chicago
171430,2023,3,8,2,3,19790,360,19790,N853DN,14747,30559,SEA,WA,13487,31650,MSP,MN,-8.0,0.0,-1.0,18.0,16.0,-27.0,0.0,-2.0,0,0,212.0,193.0,159.0,1399.0,6,,,,,,0.0,0,Seattle,Minneapolis,2023-08-02 08:11:00,2023-08-02 13:43:00,2023-08-02 08:03:00,2023-08-02 13:16:00,0,0,Delta Air Lines Inc.,Delta Air Lines Inc.,America/Los_Angeles,America/Chicago
171431,2023,3,8,3,4,19790,360,19790,N806DN,14747,30559,SEA,WA,13487,31650,MSP,MN,825.0,1.0,12.0,30.0,12.0,822.0,1.0,12.0,0,0,212.0,209.0,167.0,1399.0,6,822.0,0.0,0.0,0.0,0.0,0.0,0,Seattle,Minneapolis,2023-08-03 08:11:00,2023-08-03 13:43:00,2023-08-03 21:56:00,2023-08-04 03:25:00,0,0,Delta Air Lines Inc.,Delta Air Lines Inc.,America/Los_Angeles,America/Chicago
171432,2023,3,8,4,5,19790,360,19790,N848DN,14747,30559,SEA,WA,13487,31650,MSP,MN,-3.0,0.0,-1.0,28.0,8.0,-10.0,0.0,-1.0,0,0,212.0,205.0,169.0,1399.0,6,,,,,,0.0,0,Seattle,Minneapolis,2023-08-04 08:11:00,2023-08-04 13:43:00,2023-08-04 08:08:00,2023-08-04 13:33:00,0,0,Delta Air Lines Inc.,Delta Air Lines Inc.,America/Los_Angeles,America/Chicago
171433,2023,3,8,5,6,19790,360,19790,N868DN,14747,30559,SEA,WA,13487,31650,MSP,MN,-4.0,0.0,-1.0,19.0,4.0,-15.0,0.0,-1.0,0,0,212.0,201.0,178.0,1399.0,6,,,,,,0.0,0,Seattle,Minneapolis,2023-08-05 08:11:00,2023-08-05 13:43:00,2023-08-05 08:07:00,2023-08-05 13:28:00,0,0,Delta Air Lines Inc.,Delta Air Lines Inc.,America/Los_Angeles,America/Chicago


In [122]:
flights_local_time[['scheduled_departure_datetime', 'scheduled_arrival_datetime', 'actual_departure_datetime', 'actual_arrival_datetime']].dtypes

scheduled_departure_datetime    datetime64[ns]
scheduled_arrival_datetime      datetime64[ns]
actual_departure_datetime       datetime64[ns]
actual_arrival_datetime         datetime64[ns]
dtype: object

In [123]:
def convert_flight_times_to_utc(df):
    """
    Converts flight scheduled and actual departure/arrival times to UTC and rounds 
    the scheduled times to the nearest hour for weather data joining.

    The function handles timezone conversions for both departure and arrival times 
    based on the airport timezones (Origin_Timezone and Destination_Timezone). It 
    also addresses ambiguous and nonexistent times during daylight savings transitions.

    Parameters:
    -----------
    df : pandas.DataFrame
        DataFrame containing the following columns:
        - 'scheduled_departure_datetime': datetime column for the scheduled departure time.
        - 'actual_departure_datetime': datetime column for the actual departure time.
        - 'scheduled_arrival_datetime': datetime column for the scheduled arrival time.
        - 'actual_arrival_datetime': datetime column for the actual arrival time.
        - 'Origin_Timezone': column containing the timezone of the origin airport.
        - 'Destination_Timezone': column containing the timezone of the destination airport.

    Returns:
    --------
    df : pandas.DataFrame
        The original DataFrame with the following additional columns:
        - 'scheduled_departure_datetime_utc': converted and rounded scheduled departure time in UTC.
        - 'actual_departure_datetime_utc': converted actual departure time in UTC.
        - 'scheduled_arrival_datetime_utc': converted and rounded scheduled arrival time in UTC.
        - 'actual_arrival_datetime_utc': converted actual arrival time in UTC.

    Notes:
    ------
    - The 'ambiguous' parameter is set to True, which assumes the earlier time for ambiguous DST transitions.
    - The 'nonexistent' parameter is set to 'shift_forward' to handle nonexistent times during the DST jump forward.
    - The scheduled times are rounded to the nearest hour after conversion to UTC for easier alignment with external data (e.g., weather data).
    """
    
    # Localize scheduled depature based on Origin_Timezone
    df['scheduled_departure_datetime'] = df['scheduled_departure_datetime'].dt.tz_localize(None)
    # Convert scheduled departure to UTC
    df['scheduled_departure_datetime_utc'] = df.groupby('Origin_Timezone')['scheduled_departure_datetime'].transform(
        lambda x: x.dt.tz_localize(x.name, ambiguous=True, nonexistent='shift_forward').dt.tz_convert('UTC')
    )
    # Round scheduled departure time to nearest hour for joining with weather data
    df['scheduled_departure_datetime_utc'] = df['scheduled_departure_datetime_utc'].round('h')
    
    # Localize actual depature based on Origin_Timezone
    df['actual_departure_datetime'] = df['actual_departure_datetime'].dt.tz_localize(None)
    # Convert actual departure to UTC
    df['actual_departure_datetime_utc'] = df.groupby('Origin_Timezone')['actual_departure_datetime'].transform(
        lambda x: x.dt.tz_localize(x.name, ambiguous=True, nonexistent='shift_forward').dt.tz_convert('UTC')
    )

    # Localize scheduled arrival based on Destination_Timezone
    df['scheduled_arrival_datetime'] = df['scheduled_arrival_datetime'].dt.tz_localize(None)
    # Convert scheduled arrival to UTC
    df['scheduled_arrival_datetime_utc'] = df.groupby('Destination_Timezone')['scheduled_arrival_datetime'].transform(
        lambda x: x.dt.tz_localize(x.name, ambiguous=True, nonexistent='shift_forward').dt.tz_convert('UTC')
    )
    # Round scheduled arrival time to nearest hour for joining with weather data
    df['scheduled_arrival_datetime_utc'] = df['scheduled_arrival_datetime_utc'].round('h')
    
    # Localize actual arrival based on Destination_Timezone
    df['actual_arrival_datetime'] = df['actual_arrival_datetime'].dt.tz_localize(None)
    # Convert actual arrival to UTC
    df['actual_arrival_datetime_utc'] = df.groupby('Destination_Timezone')['actual_arrival_datetime'].transform(
        lambda x: x.dt.tz_localize(x.name, ambiguous=True, nonexistent='shift_forward').dt.tz_convert('UTC')
    )
    
    return df

In [124]:
flights_utc = convert_flight_times_to_utc(flights_local_time)

In [125]:
flights_utc.head()

Unnamed: 0,Year,Quarter,Month,DayofMonth,DayOfWeek,DOT_ID_Marketing_Airline,Flight_Number_Marketing_Airline,DOT_ID_Operating_Airline,Tail_Number,OriginAirportID,OriginCityMarketID,Origin,OriginState,DestAirportID,DestCityMarketID,Dest,DestState,DepDelay,DepDel15,DepartureDelayGroups,TaxiOut,TaxiIn,ArrDelay,ArrDel15,ArrivalDelayGroups,Cancelled,Diverted,CRSElapsedTime,ActualElapsedTime,AirTime,Distance,DistanceGroup,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,DivAirportLandings,Code_Share_Flight,Origin_City,Destination_City,scheduled_departure_datetime,scheduled_arrival_datetime,actual_departure_datetime,actual_arrival_datetime,Overnight_Depature,Overnight_Arrival,Airline_Mkt,Airline_Ops,Origin_Timezone,Destination_Timezone,scheduled_departure_datetime_utc,actual_departure_datetime_utc,scheduled_arrival_datetime_utc,actual_arrival_datetime_utc
0,2023,3,8,6,7,20416,2252,20416,N978NK,12889,32211,LAS,NV,11057,31057,CLT,NC,,,,,,,,,1,0,263.0,,,1916.0,8,,,,,,0.0,0,Las Vegas,Charlotte,2023-08-06 23:59:00,2023-08-07 07:22:00,NaT,NaT,0,0,Spirit Air Lines,Spirit Air Lines,America/Los_Angeles,America/New_York,2023-08-07 07:00:00+00:00,NaT,2023-08-07 11:00:00+00:00,NaT
1,2023,3,8,7,1,20416,2252,20416,N974NK,12889,32211,LAS,NV,11057,31057,CLT,NC,76.0,1.0,5.0,19.0,9.0,69.0,1.0,4.0,0,0,263.0,256.0,228.0,1916.0,8,3.0,0.0,1.0,0.0,65.0,0.0,0,Las Vegas,Charlotte,2023-08-07 23:59:00,2023-08-08 07:22:00,2023-08-08 01:15:00,2023-08-08 08:31:00,0,0,Spirit Air Lines,Spirit Air Lines,America/Los_Angeles,America/New_York,2023-08-08 07:00:00+00:00,2023-08-08 08:15:00+00:00,2023-08-08 11:00:00+00:00,2023-08-08 12:31:00+00:00
2,2023,3,8,9,3,20416,2252,20416,N519NK,12889,32211,LAS,NV,11057,31057,CLT,NC,-11.0,0.0,-1.0,14.0,10.0,-13.0,0.0,-1.0,0,0,258.0,256.0,232.0,1916.0,8,,,,,,0.0,0,Las Vegas,Charlotte,2023-08-09 23:10:00,2023-08-10 06:28:00,2023-08-09 22:59:00,2023-08-10 06:15:00,0,0,Spirit Air Lines,Spirit Air Lines,America/Los_Angeles,America/New_York,2023-08-10 06:00:00+00:00,2023-08-10 05:59:00+00:00,2023-08-10 10:00:00+00:00,2023-08-10 10:15:00+00:00
3,2023,3,8,10,4,20416,2252,20416,N532NK,12889,32211,LAS,NV,11057,31057,CLT,NC,-8.0,0.0,-1.0,12.0,7.0,-30.0,0.0,-2.0,0,0,258.0,236.0,217.0,1916.0,8,,,,,,0.0,0,Las Vegas,Charlotte,2023-08-10 23:10:00,2023-08-11 06:28:00,2023-08-10 23:02:00,2023-08-11 05:58:00,0,0,Spirit Air Lines,Spirit Air Lines,America/Los_Angeles,America/New_York,2023-08-11 06:00:00+00:00,2023-08-11 06:02:00+00:00,2023-08-11 10:00:00+00:00,2023-08-11 09:58:00+00:00
4,2023,3,8,12,6,20416,2252,20416,N529NK,12889,32211,LAS,NV,11057,31057,CLT,NC,4.0,0.0,0.0,18.0,6.0,-4.0,0.0,-1.0,0,0,258.0,250.0,226.0,1916.0,8,,,,,,0.0,0,Las Vegas,Charlotte,2023-08-12 23:10:00,2023-08-13 06:28:00,2023-08-12 23:14:00,2023-08-13 06:24:00,0,0,Spirit Air Lines,Spirit Air Lines,America/Los_Angeles,America/New_York,2023-08-13 06:00:00+00:00,2023-08-13 06:14:00+00:00,2023-08-13 10:00:00+00:00,2023-08-13 10:24:00+00:00


In [126]:
def add_holidays(df, date_col):
    """
    Adds a new column 'is_holiday' to the DataFrame, indicating whether the given date
    in the specified column is a US holiday.

    This function iterates over the specified date column and checks if each date is
    a recognized holiday in the United States, as defined by the `holidays.US()` library.
    If the date is a holiday, the corresponding value in the new 'is_holiday' column
    will be 1, otherwise it will be 0.

    Parameters:
    -----------
    df : pandas.DataFrame
        The DataFrame containing a date column.
    date_col : str
        The column name in the DataFrame that contains the dates to check.

    Returns:
    --------
    df : pandas.DataFrame
        The original DataFrame with an additional 'is_holiday' column, where
        1 indicates a holiday and 0 indicates a non-holiday.

    """
    
    us_holidays = holidays.US()

    df['is_holiday'] = [1 if date in us_holidays else 0 for date in df[date_col]]

    return df

In [127]:
flights_transformed = add_holidays(flights_utc, 'scheduled_departure_datetime')
flights_transformed['is_holiday'].value_counts()

is_holiday
0    7015530
1     245411
Name: count, dtype: int64

In [128]:
flights_transformed.head()

Unnamed: 0,Year,Quarter,Month,DayofMonth,DayOfWeek,DOT_ID_Marketing_Airline,Flight_Number_Marketing_Airline,DOT_ID_Operating_Airline,Tail_Number,OriginAirportID,OriginCityMarketID,Origin,OriginState,DestAirportID,DestCityMarketID,Dest,DestState,DepDelay,DepDel15,DepartureDelayGroups,TaxiOut,TaxiIn,ArrDelay,ArrDel15,ArrivalDelayGroups,Cancelled,Diverted,CRSElapsedTime,ActualElapsedTime,AirTime,Distance,DistanceGroup,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,DivAirportLandings,Code_Share_Flight,Origin_City,Destination_City,scheduled_departure_datetime,scheduled_arrival_datetime,actual_departure_datetime,actual_arrival_datetime,Overnight_Depature,Overnight_Arrival,Airline_Mkt,Airline_Ops,Origin_Timezone,Destination_Timezone,scheduled_departure_datetime_utc,actual_departure_datetime_utc,scheduled_arrival_datetime_utc,actual_arrival_datetime_utc,is_holiday
0,2023,3,8,6,7,20416,2252,20416,N978NK,12889,32211,LAS,NV,11057,31057,CLT,NC,,,,,,,,,1,0,263.0,,,1916.0,8,,,,,,0.0,0,Las Vegas,Charlotte,2023-08-06 23:59:00,2023-08-07 07:22:00,NaT,NaT,0,0,Spirit Air Lines,Spirit Air Lines,America/Los_Angeles,America/New_York,2023-08-07 07:00:00+00:00,NaT,2023-08-07 11:00:00+00:00,NaT,0
1,2023,3,8,7,1,20416,2252,20416,N974NK,12889,32211,LAS,NV,11057,31057,CLT,NC,76.0,1.0,5.0,19.0,9.0,69.0,1.0,4.0,0,0,263.0,256.0,228.0,1916.0,8,3.0,0.0,1.0,0.0,65.0,0.0,0,Las Vegas,Charlotte,2023-08-07 23:59:00,2023-08-08 07:22:00,2023-08-08 01:15:00,2023-08-08 08:31:00,0,0,Spirit Air Lines,Spirit Air Lines,America/Los_Angeles,America/New_York,2023-08-08 07:00:00+00:00,2023-08-08 08:15:00+00:00,2023-08-08 11:00:00+00:00,2023-08-08 12:31:00+00:00,0
2,2023,3,8,9,3,20416,2252,20416,N519NK,12889,32211,LAS,NV,11057,31057,CLT,NC,-11.0,0.0,-1.0,14.0,10.0,-13.0,0.0,-1.0,0,0,258.0,256.0,232.0,1916.0,8,,,,,,0.0,0,Las Vegas,Charlotte,2023-08-09 23:10:00,2023-08-10 06:28:00,2023-08-09 22:59:00,2023-08-10 06:15:00,0,0,Spirit Air Lines,Spirit Air Lines,America/Los_Angeles,America/New_York,2023-08-10 06:00:00+00:00,2023-08-10 05:59:00+00:00,2023-08-10 10:00:00+00:00,2023-08-10 10:15:00+00:00,0
3,2023,3,8,10,4,20416,2252,20416,N532NK,12889,32211,LAS,NV,11057,31057,CLT,NC,-8.0,0.0,-1.0,12.0,7.0,-30.0,0.0,-2.0,0,0,258.0,236.0,217.0,1916.0,8,,,,,,0.0,0,Las Vegas,Charlotte,2023-08-10 23:10:00,2023-08-11 06:28:00,2023-08-10 23:02:00,2023-08-11 05:58:00,0,0,Spirit Air Lines,Spirit Air Lines,America/Los_Angeles,America/New_York,2023-08-11 06:00:00+00:00,2023-08-11 06:02:00+00:00,2023-08-11 10:00:00+00:00,2023-08-11 09:58:00+00:00,0
4,2023,3,8,12,6,20416,2252,20416,N529NK,12889,32211,LAS,NV,11057,31057,CLT,NC,4.0,0.0,0.0,18.0,6.0,-4.0,0.0,-1.0,0,0,258.0,250.0,226.0,1916.0,8,,,,,,0.0,0,Las Vegas,Charlotte,2023-08-12 23:10:00,2023-08-13 06:28:00,2023-08-12 23:14:00,2023-08-13 06:24:00,0,0,Spirit Air Lines,Spirit Air Lines,America/Los_Angeles,America/New_York,2023-08-13 06:00:00+00:00,2023-08-13 06:14:00+00:00,2023-08-13 10:00:00+00:00,2023-08-13 10:24:00+00:00,0


In [129]:
def rename_flights_columns(df):
    """
    Renames specific columns in the DataFrame and applies two renaming patterns to other columns.

    This function first explicitly renames certain columns with predefined names, then applies the following transformations:
    
    1. Columns that are in camelCase or PascalCase (e.g., 'DayOfWeek') are converted to snake_case (e.g., 'day_of_week').
       
    2. Columns that contain underscores but are in mixed case (e.g., 'Overnight_Departure') are converted to lowercase (e.g., 'overnight_departure').
    
    All other columns are left unchanged.

    Parameters:
    -----------
    df : pandas.DataFrame
        The DataFrame containing the columns to be renamed.
    
    Returns:
    --------
    pandas.DataFrame
        The DataFrame with renamed columns.
    """

    df.rename(columns={
        'DayofMonth': 'day_of_month',
        'NASDelay': 'nas_delay',
        'DOT_ID_Marketing_Airline': 'marketing_airline_id',
        'DOT_ID_Operating_Airline': 'operating_airline_id',
        'CRSElapsedTime': 'scheduled_elapsed_time' 
    }, inplace=True)

    def convert_to_snake_case(name):
        # Convert 'DayOfWeek' to 'day_of_week'
        snake_case = re.sub(r'(?<!^)(?=[A-Z])', '_', name).lower()
        snake_case = snake_case.replace('_i_d', '_id')
        return snake_case

    new_columns = []
    
    for col in df.columns:
        # First pattern: Capitalized words followed by another capitalized word without spaces
        if re.match(r'^[A-Z][a-z]+[A-Z]', col):
            new_columns.append(convert_to_snake_case(col))
        # Second pattern: Columns with underscores and mixed case
        elif '_' in col and not col.islower():
            new_columns.append(col.lower())
        else:
            # Leave other columns unchanged
            new_columns.append(col.lower())

    # Rename the columns in the DataFrame
    df.columns = new_columns
    return df


In [130]:
flights_clean = rename_flights_columns(flights_transformed)

In [131]:
flights_clean.columns

Index(['year', 'quarter', 'month', 'day_of_month', 'day_of_week',
       'marketing_airline_id', 'flight_number_marketing_airline',
       'operating_airline_id', 'tail_number', 'origin_airport_id',
       'origin_city_market_id', 'origin', 'origin_state', 'dest_airport_id',
       'dest_city_market_id', 'dest', 'dest_state', 'dep_delay', 'dep_del15',
       'departure_delay_groups', 'taxi_out', 'taxi_in', 'arr_delay',
       'arr_del15', 'arrival_delay_groups', 'cancelled', 'diverted',
       'scheduled_elapsed_time', 'actual_elapsed_time', 'air_time', 'distance',
       'distance_group', 'carrier_delay', 'weather_delay', 'nas_delay',
       'security_delay', 'late_aircraft_delay', 'div_airport_landings',
       'code_share_flight', 'origin_city', 'destination_city',
       'scheduled_departure_datetime', 'scheduled_arrival_datetime',
       'actual_departure_datetime', 'actual_arrival_datetime',
       'overnight_depature', 'overnight_arrival', 'airline_mkt', 'airline_ops',
       'o

In [132]:
flights_clean.to_csv(DATA_PATH + '/interim/2023-performance-data-clean.csv', index=False)
flights_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7260941 entries, 0 to 7260940
Data columns (total 56 columns):
 #   Column                            Dtype              
---  ------                            -----              
 0   year                              int64              
 1   quarter                           int64              
 2   month                             int64              
 3   day_of_month                      int64              
 4   day_of_week                       int64              
 5   marketing_airline_id              int64              
 6   flight_number_marketing_airline   int64              
 7   operating_airline_id              int64              
 8   tail_number                       object             
 9   origin_airport_id                 int64              
 10  origin_city_market_id             int64              
 11  origin                            object             
 12  origin_state                      object             
 1

# Import Aircraft Registration Data

https://www.faa.gov/licenses_certificates/aircraft_certification/aircraft_registry/releasable_aircraft_download

In [133]:
aircraft_registration = pd.read_csv(DATA_PATH + '/raw/ReleasableAircraft-2023/MASTER.txt')

  aircraft_registration = pd.read_csv(DATA_PATH + '/raw/ReleasableAircraft-2023/MASTER.txt')


In [134]:
aircraft_ref = pd.read_csv(DATA_PATH + '/raw/ReleasableAircraft-2023/ACFTREF.txt')

In [135]:
engine_ref = pd.read_csv(DATA_PATH + '/raw/ReleasableAircraft-2023/ENGINE.txt')

In [136]:
aircraft_registration.head()

Unnamed: 0,N-NUMBER,SERIAL NUMBER,MFR MDL CODE,ENG MFR MDL,YEAR MFR,TYPE REGISTRANT,NAME,STREET,STREET2,CITY,STATE,ZIP CODE,REGION,COUNTY,COUNTRY,LAST ACTION DATE,CERT ISSUE DATE,CERTIFICATION,TYPE AIRCRAFT,TYPE ENGINE,STATUS CODE,MODE S CODE,FRACT OWNER,AIR WORTH DATE,OTHER NAMES(1),OTHER NAMES(2),OTHER NAMES(3),OTHER NAMES(4),OTHER NAMES(5),EXPIRATION DATE,UNIQUE ID,KIT MFR,KIT MODEL,MODE S CODE HEX,Unnamed: 34
0,1,680-0519,2076811,52041.0,2014.0,7,TENAX AEROSPACE LLC ...,400 W PARKWAY PL STE 201,,RIDGELAND,MS,391576005,2,89,US,20230823,20211130,1T,5,5,V,50000001,,20140325.0,...,...,...,...,...,20281130,1141371,,,A00001,
1,100,5334,7100510,17003.0,1940.0,1,BENE MARY D ...,PO BOX 329,,KETCHUM,OK,743490329,2,97,US,20230122,20050506,1,4,1,V,50002263,,19540430.0,...,...,...,...,...,20270430,600060,,,A004B3,
2,10001,A28,9601202,67007.0,1928.0,1,STOOS ROBERT A ...,PO BOX 1056,,LAKELAND,FL,338021056,7,105,US,20230718,20190227,1,4,1,V,50003446,,,...,...,...,...,...,20290228,432072,,,A00726,
3,10004,T18208245,2072738,,,7,ETOS AIR LLC ...,PO BOX 288,,NEW LONDON,TX,756820288,2,401,US,20230722,20130312,,4,1,V,50003451,,,...,...,...,...,...,20290331,102879,,,A00729,
4,10006,BG-72,1152020,17026.0,1955.0,1,COUTCHES ROBERT HERCULES DBA ...,550 AIRWAY BLVD,,LIVERMORE,CA,945519533,4,1,US,20230421,19980826,1U,4,1,V,50003453,,19710909.0,AERO FLIGHT AVIATION ...,...,...,...,...,20280229,480110,,,A0072B,


In [137]:
aircraft_registration.shape

(293465, 35)

In [138]:
aircraft_registration.columns

Index(['N-NUMBER', 'SERIAL NUMBER', 'MFR MDL CODE', 'ENG MFR MDL', 'YEAR MFR',
       'TYPE REGISTRANT', 'NAME', 'STREET', 'STREET2', 'CITY', 'STATE',
       'ZIP CODE', 'REGION', 'COUNTY', 'COUNTRY', 'LAST ACTION DATE',
       'CERT ISSUE DATE', 'CERTIFICATION', 'TYPE AIRCRAFT', 'TYPE ENGINE',
       'STATUS CODE', 'MODE S CODE', 'FRACT OWNER', 'AIR WORTH DATE',
       'OTHER NAMES(1)', 'OTHER NAMES(2)', 'OTHER NAMES(3)', 'OTHER NAMES(4)',
       'OTHER NAMES(5)', 'EXPIRATION DATE', 'UNIQUE ID', 'KIT MFR',
       ' KIT MODEL', 'MODE S CODE HEX', 'Unnamed: 34'],
      dtype='object')

In [139]:
aircraft_registration.drop(columns=[
    'FRACT OWNER',
    'OTHER NAMES(1)', 
    'OTHER NAMES(2)', 
    'OTHER NAMES(3)', 
    'OTHER NAMES(4)',
    'OTHER NAMES(5)',
    'KIT MFR',
    ' KIT MODEL',
    'Unnamed: 34'], 
    inplace=True)

I'll filter the aircraft registration file for tail numbers in the monthly performance data since we don't need data about any other aircrafts from the FAA.

In [140]:
# check for number of aircrafts on FAA list for commercial flights
df_merged.Tail_Number.nunique()

6326

In [141]:
df_merged.Tail_Number.head()

0    N978NK
1    N974NK
2    N519NK
3    N532NK
4    N529NK
Name: Tail_Number, dtype: object

In [142]:
df_merged.Tail_Number.isna().sum()

19648

Just over 19,000 flights are missing Tail Numbers. 

In [143]:
# remove 'N' from Tail Number to compare to aircraft registrations
df_merged['Tail_Number'] = df['Tail_Number'].apply(lambda x: x[1:] if isinstance(x, str) and x.startswith('N') else x)
df_merged.Tail_Number.head()

0    978NK
1    974NK
2    519NK
3    532NK
4    529NK
Name: Tail_Number, dtype: object

In [144]:
tail_numbers = df_merged.Tail_Number

In [145]:
aircraft_registration = aircraft_registration[aircraft_registration['N-NUMBER'].isin(tail_numbers)]

In [146]:
aircraft_registration.shape

(6273, 26)

In [147]:
6326-6273

53

The 2023 aircraft registration is missing registrations for 53 planes. These might be in older registrations. I will download the 2022 data to compare. 

In [148]:
aircraft_registration.head()

Unnamed: 0,N-NUMBER,SERIAL NUMBER,MFR MDL CODE,ENG MFR MDL,YEAR MFR,TYPE REGISTRANT,NAME,STREET,STREET2,CITY,STATE,ZIP CODE,REGION,COUNTY,COUNTRY,LAST ACTION DATE,CERT ISSUE DATE,CERTIFICATION,TYPE AIRCRAFT,TYPE ENGINE,STATUS CODE,MODE S CODE,AIR WORTH DATE,EXPIRATION DATE,UNIQUE ID,MODE S CODE HEX
645,10156,145786,3260214,54582,2004,3,UNITED AIRLINES INC ...,233 S WACKER DR,,CHICAGO,IL,606067147,C,31,US,20230804,20190531,1T,5,5,V,50005621,20040215,20290531,90729,A00B91
807,101DQ,9409,3940005,13101,2020,3,DELTA AIR LINES INC ...,1775 M H JACKSON SERVICE RD,DEPT 595 AIRCRAFT REGISTRATIONS,ATLANTA,GA,30354,7,121,US,20231007,20200213,1T,5,5,V,50004305,20200213,20300228,1312557,A008C5
810,101DU,50020,1400012,52315,2018,3,BANK OF UTAH TRUSTEE ...,50 S 200 E STE 110,,SALT LAKE CITY,UT,841111617,S,35,US,20230906,20200808,1T,5,5,V,50004311,20181025,20270831,1285810,A008C9
866,101HQ,17000156,3260121,30061,2007,3,REPUBLIC AIRWAYS INC ...,8909 PURDUE RD STE 300,,INDIANAPOLIS,IN,462683152,C,97,US,20230818,20070227,1T,5,5,V,50004451,20070228,20290731,328579,A00929
938,101NN,5834,3930402,34601,2013,3,WILMINGTON TRUST CO TRUSTEE ...,1100 N MARKET ST,,WILMINGTON,DE,198901605,1,3,US,20230421,20210222,1T,5,5,V,50004644,20131122,20280229,1119973,A009A4


In [149]:
mask = df_merged['Tail_Number'].isin(aircraft_registration['N-NUMBER'])
missing_registration = df_merged[~mask]

In [150]:
missing_registration.head()

Unnamed: 0,Year,Quarter,Month,DayofMonth,DayOfWeek,FlightDate,DOT_ID_Marketing_Airline,IATA_Code_Marketing_Airline,Flight_Number_Marketing_Airline,DOT_ID_Operating_Airline,IATA_Code_Operating_Airline,Tail_Number,OriginAirportID,OriginCityMarketID,Origin,OriginState,DestAirportID,DestCityMarketID,Dest,DestState,CRSDepTime,DepTime,DepDelay,DepDel15,DepartureDelayGroups,DepTimeBlk,TaxiOut,WheelsOff,WheelsOn,TaxiIn,CRSArrTime,ArrTime,ArrDelay,ArrDel15,ArrivalDelayGroups,ArrTimeBlk,Cancelled,Diverted,CRSElapsedTime,ActualElapsedTime,AirTime,Flights,Distance,DistanceGroup,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,DivAirportLandings,Duplicate,Code_Share_Flight,Origin_City,Destination_City,scheduled_departure_datetime,scheduled_arrival_datetime,actual_departure_datetime,actual_arrival_datetime,DeptTimeBlk_Start,DeptTimeBlk_End,ArrTimeBlk_Start,ArrTimeBlk_End,ArrivalBlock_Total,DepartureBlock_Total,Overnight_Depature,Overnight_Arrival
11878,2023,3,8,30,3,2023-08-30,19805,AA,5016,20397,OH,,12451,31136,JAX,FL,11278,30852,DCA,VA,1202,,,,,1200-1259,,,,,1358,,,,,1300-1359,1,0,116.0,,,1.0,634.0,3,,,,,,0.0,N,1,Jacksonville,"Washington, DC",2023-08-30 12:02:00,2023-08-30 13:58:00,NaT,NaT,1200,1259,1300,1359,59,59,0,0
12397,2023,3,8,8,2,2023-08-08,19805,AA,5036,20397,OH,,13931,33667,ORF,VA,11057,31057,CLT,NC,635,,,,,0600-0659,,,,,802,,,,,0800-0859,1,0,87.0,,,1.0,290.0,2,,,,,,0.0,N,1,Norfolk,Charlotte,2023-08-08 06:35:00,2023-08-08 08:02:00,NaT,NaT,600,659,800,859,59,59,0,0
12898,2023,3,8,31,4,2023-08-31,19805,AA,5047,20397,OH,,14685,34685,SAV,GA,11298,30194,DFW,TX,726,,,,,0700-0759,,,,,921,,,,,0900-0959,1,0,175.0,,,1.0,925.0,4,,,,,,0.0,N,1,Savannah,Dallas/Fort Worth,2023-08-31 07:26:00,2023-08-31 09:21:00,NaT,NaT,700,759,900,959,59,59,0,0
12953,2023,3,8,31,4,2023-08-31,19805,AA,5049,20397,OH,,14685,34685,SAV,GA,11057,31057,CLT,NC,741,,,,,0700-0759,,,,,904,,,,,0900-0959,1,0,83.0,,,1.0,213.0,1,,,,,,0.0,N,1,Savannah,Charlotte,2023-08-31 07:41:00,2023-08-31 09:04:00,NaT,NaT,700,759,900,959,59,59,0,0
13344,2023,3,8,30,3,2023-08-30,19805,AA,5059,20397,OH,,14100,34100,PHL,PA,12451,31136,JAX,FL,907,,,,,0900-0959,,,,,1128,,,,,1100-1159,1,0,141.0,,,1.0,742.0,3,,,,,,0.0,N,1,Philadelphia,Jacksonville,2023-08-30 09:07:00,2023-08-30 11:28:00,NaT,NaT,900,959,1100,1159,59,59,0,0


In [151]:
df_ac_reg_2022 = pd.read_csv(DATA_PATH + '/raw/ReleasableAircraft-2022/MASTER.txt')

  df_ac_reg_2022 = pd.read_csv(DATA_PATH + '/raw/ReleasableAircraft-2022/MASTER.txt')


In [152]:
aircraft_registration_2022 = df_ac_reg_2022[df_ac_reg_2022['N-NUMBER'].isin(missing_registration['Tail_Number'])]
aircraft_registration_2022.shape

(22, 35)

In [153]:
df_ac_reg_2021 = pd.read_csv(DATA_PATH + '/raw/ReleasableAircraft-2021/MASTER.txt')

  df_ac_reg_2021 = pd.read_csv(DATA_PATH + '/raw/ReleasableAircraft-2021/MASTER.txt')


In [154]:
aircraft_registration_2021 = df_ac_reg_2021[df_ac_reg_2021['N-NUMBER'].isin(missing_registration['Tail_Number'])]
aircraft_registration_2021.shape

(22, 35)

In [155]:
aircraft_registration_2021.merge(aircraft_registration_2022, on='N-NUMBER').shape

(22, 69)

2021 and 2022 aircraft registrations contain the same aircrafts that are missing from the 2023 df. I will add these. 

In [156]:
aircraft_registration_2022.drop(columns=[
    'FRACT OWNER',
    'OTHER NAMES(1)', 
    'OTHER NAMES(2)', 
    'OTHER NAMES(3)', 
    'OTHER NAMES(4)',
    'OTHER NAMES(5)',
    'KIT MFR',
    ' KIT MODEL',
    'Unnamed: 34'], 
    inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  aircraft_registration_2022.drop(columns=[


In [157]:
aircraft_registration = pd.concat([aircraft_registration, aircraft_registration_2022], axis=0)

In [158]:
aircraft_registration.shape

(6295, 26)

In [159]:
aircraft_registration.head(2)

Unnamed: 0,N-NUMBER,SERIAL NUMBER,MFR MDL CODE,ENG MFR MDL,YEAR MFR,TYPE REGISTRANT,NAME,STREET,STREET2,CITY,STATE,ZIP CODE,REGION,COUNTY,COUNTRY,LAST ACTION DATE,CERT ISSUE DATE,CERTIFICATION,TYPE AIRCRAFT,TYPE ENGINE,STATUS CODE,MODE S CODE,AIR WORTH DATE,EXPIRATION DATE,UNIQUE ID,MODE S CODE HEX
645,10156,145786,3260214,54582,2004,3,UNITED AIRLINES INC ...,233 S WACKER DR,,CHICAGO,IL,606067147,C,31,US,20230804,20190531,1T,5,5,V,50005621,20040215,20290531,90729,A00B91
807,101DQ,9409,3940005,13101,2020,3,DELTA AIR LINES INC ...,1775 M H JACKSON SERVICE RD,DEPT 595 AIRCRAFT REGISTRATIONS,ATLANTA,GA,30354,7,121,US,20231007,20200213,1T,5,5,V,50004305,20200213,20300228,1312557,A008C5


In [160]:
aircraft_ref.head()

Unnamed: 0,CODE,MFR,MODEL,TYPE-ACFT,TYPE-ENG,AC-CAT,BUILD-CERT-IND,NO-ENG,NO-SEATS,AC-WEIGHT,SPEED,TC-DATA-SHEET,TC-DATA-HOLDER,Unnamed: 13
0,0020901,AAR AIRLIFT GROUP INC,UH-60A,6,3,1,0,2,15,CLASS 3,0,,...,
1,0030109,EXLINE ACE-C,ACE-C,4,1,1,1,1,1,CLASS 1,82,,...,
2,003010D,DELEBAUGH,P,4,1,1,1,1,1,CLASS 1,82,,...,
3,003010H,DAL PORTO,BABY ACE D,4,1,1,1,1,1,CLASS 1,82,,...,
4,003010P,DUNN,BABY ACE,4,1,1,1,1,1,CLASS 1,82,,...,


In [161]:
aircraft_ref.drop(columns=['TC-DATA-SHEET', 'TC-DATA-HOLDER', 'Unnamed: 13'], inplace=True)

In [162]:
engine_ref.head()

Unnamed: 0,CODE,MFR,MODEL,TYPE,HORSEPOWER,THRUST,Unnamed: 6
0,0,NONE,NONE,0,0,0,
1,401,A.C.E.,HIDR MARK III,1,95,0,
2,402,A.C.E.,UPRI MARK III,1,100,0,
3,450,AEROMOMENT,AM13 SERIES,8,100,0,
4,452,AEROMOMENT,AM15 SERIES,8,117,0,


In [163]:
engine_ref.drop(columns='Unnamed: 6', inplace=True)

In [164]:
aircraft_registration_merged = aircraft_registration.merge(aircraft_ref, left_on='MFR MDL CODE', right_on='CODE', how='left')

In [165]:
aircraft_registration_merged['ENG MFR MDL'] = aircraft_registration_merged['ENG MFR MDL'].replace('     ', 0).astype(int)
aircraft_registration_merged = aircraft_registration_merged.merge(engine_ref, left_on='ENG MFR MDL', right_on='CODE', how='left', suffixes=['_aircraft', '_engine'])

In [166]:
aircraft_registration_merged.head()

Unnamed: 0,N-NUMBER,SERIAL NUMBER,MFR MDL CODE,ENG MFR MDL,YEAR MFR,TYPE REGISTRANT,NAME,STREET,STREET2,CITY,STATE,ZIP CODE,REGION,COUNTY,COUNTRY,LAST ACTION DATE,CERT ISSUE DATE,CERTIFICATION,TYPE AIRCRAFT,TYPE ENGINE,STATUS CODE,MODE S CODE,AIR WORTH DATE,EXPIRATION DATE,UNIQUE ID,MODE S CODE HEX,CODE_aircraft,MFR_aircraft,MODEL_aircraft,TYPE-ACFT,TYPE-ENG,AC-CAT,BUILD-CERT-IND,NO-ENG,NO-SEATS,AC-WEIGHT,SPEED,CODE_engine,MFR_engine,MODEL_engine,TYPE,HORSEPOWER,THRUST
0,10156,145786,3260214,54582,2004,3,UNITED AIRLINES INC ...,233 S WACKER DR,,CHICAGO,IL,606067147,C,31,US,20230804,20190531,1T,5,5,V,50005621,20040215,20290531,90729,A00B91,3260214,EMBRAER,EMB-145XR,5,5,1,0,2,55,CLASS 2,0,54582,ROLLS-ROYC,AE3007 SER,5,0,6442
1,101DQ,9409,3940005,13101,2020,3,DELTA AIR LINES INC ...,1775 M H JACKSON SERVICE RD,DEPT 595 AIRCRAFT REGISTRATIONS,ATLANTA,GA,30354,7,121,US,20231007,20200213,1T,5,5,V,50004305,20200213,20300228,1312557,A008C5,3940005,AIRBUS,A321-211,5,5,1,0,2,199,CLASS 3,0,13101,CFM INTL,CFM56-5B3/3,5,0,32000
2,101DU,50020,1400012,52315,2018,3,BANK OF UTAH TRUSTEE ...,50 S 200 E STE 110,,SALT LAKE CITY,UT,841111617,S,35,US,20230906,20200808,1T,5,5,V,50004311,20181025,20270831,1285810,A008C9,1400012,C SERIES AIRCRAFT LTD PTNRSP,BD-500-1A10,5,5,1,0,2,133,CLASS 3,0,52315,P & W,PW1519G,5,0,19775
3,101HQ,17000156,3260121,30061,2007,3,REPUBLIC AIRWAYS INC ...,8909 PURDUE RD STE 300,,INDIANAPOLIS,IN,462683152,C,97,US,20230818,20070227,1T,5,5,V,50004451,20070228,20290731,328579,A00929,3260121,EMBRAER-EMPRESA BRASILEIRA DE,ERJ 170-200 LR,5,5,1,0,2,80,CLASS 3,0,30061,GE,CF34-8E5,5,0,14510
4,101NN,5834,3930402,34601,2013,3,WILMINGTON TRUST CO TRUSTEE ...,1100 N MARKET ST,,WILMINGTON,DE,198901605,1,3,US,20230421,20210222,1T,5,5,V,50004644,20131122,20280229,1119973,A009A4,3930402,AIRBUS INDUSTRIE,A321-231,5,5,1,0,2,379,CLASS 3,0,34601,IAE,V2500SERIES,4,0,25000


In [167]:
aircraft_registration_merged.to_csv(DATA_PATH + '/interim/aircraft_registration.csv', index=False)