In [1]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
import geopandas as gpd
from shapely.geometry import Point

In [2]:
# Load the raw flight sample; the later cleaning cells all start from this frame.
flights = pd.read_csv(filepath_or_buffer='./data/flights_sample_3m.csv')
# Preview the first ten records as a quick check that the file parsed as expected.
flights.head(n=10)

Unnamed: 0,FL_DATE,AIRLINE,AIRLINE_DOT,AIRLINE_CODE,DOT_CODE,FL_NUMBER,ORIGIN,ORIGIN_CITY,DEST,DEST_CITY,...,DIVERTED,CRS_ELAPSED_TIME,ELAPSED_TIME,AIR_TIME,DISTANCE,DELAY_DUE_CARRIER,DELAY_DUE_WEATHER,DELAY_DUE_NAS,DELAY_DUE_SECURITY,DELAY_DUE_LATE_AIRCRAFT
0,2019-01-09,United Air Lines Inc.,United Air Lines Inc.: UA,UA,19977,1562,FLL,"Fort Lauderdale, FL",EWR,"Newark, NJ",...,0.0,186.0,176.0,153.0,1065.0,,,,,
1,2022-11-19,Delta Air Lines Inc.,Delta Air Lines Inc.: DL,DL,19790,1149,MSP,"Minneapolis, MN",SEA,"Seattle, WA",...,0.0,235.0,236.0,189.0,1399.0,,,,,
2,2022-07-22,United Air Lines Inc.,United Air Lines Inc.: UA,UA,19977,459,DEN,"Denver, CO",MSP,"Minneapolis, MN",...,0.0,118.0,112.0,87.0,680.0,,,,,
3,2023-03-06,Delta Air Lines Inc.,Delta Air Lines Inc.: DL,DL,19790,2295,MSP,"Minneapolis, MN",SFO,"San Francisco, CA",...,0.0,260.0,285.0,249.0,1589.0,0.0,0.0,24.0,0.0,0.0
4,2020-02-23,Spirit Air Lines,Spirit Air Lines: NK,NK,20416,407,MCO,"Orlando, FL",DFW,"Dallas/Fort Worth, TX",...,0.0,181.0,182.0,153.0,985.0,,,,,
5,2019-07-31,Southwest Airlines Co.,Southwest Airlines Co.: WN,WN,19393,665,DAL,"Dallas, TX",OKC,"Oklahoma City, OK",...,0.0,60.0,54.0,36.0,181.0,141.0,0.0,0.0,0.0,0.0
6,2023-06-11,American Airlines Inc.,American Airlines Inc.: AA,AA,19805,2134,DCA,"Washington, DC",BOS,"Boston, MA",...,0.0,109.0,89.0,58.0,399.0,,,,,
7,2019-07-08,Republic Airline,Republic Airline: YX,YX,20452,4464,HSV,"Huntsville, AL",DCA,"Washington, DC",...,0.0,122.0,151.0,88.0,613.0,0.0,0.0,23.0,0.0,0.0
8,2023-02-12,Spirit Air Lines,Spirit Air Lines: NK,NK,20416,590,IAH,"Houston, TX",LAX,"Los Angeles, CA",...,0.0,227.0,219.0,200.0,1379.0,,,,,
9,2020-08-22,Alaska Airlines Inc.,Alaska Airlines Inc.: AS,AS,19930,223,SEA,"Seattle, WA",FAI,"Fairbanks, AK",...,0.0,210.0,220.0,198.0,1533.0,,,,,


In [3]:
# Print the frame's structure (row range, column names and dtypes) before any cleaning.
flights.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000000 entries, 0 to 2999999
Data columns (total 32 columns):
 #   Column                   Dtype  
---  ------                   -----  
 0   FL_DATE                  object 
 1   AIRLINE                  object 
 2   AIRLINE_DOT              object 
 3   AIRLINE_CODE             object 
 4   DOT_CODE                 int64  
 5   FL_NUMBER                int64  
 6   ORIGIN                   object 
 7   ORIGIN_CITY              object 
 8   DEST                     object 
 9   DEST_CITY                object 
 10  CRS_DEP_TIME             int64  
 11  DEP_TIME                 float64
 12  DEP_DELAY                float64
 13  TAXI_OUT                 float64
 14  WHEELS_OFF               float64
 15  WHEELS_ON                float64
 16  TAXI_IN                  float64
 17  CRS_ARR_TIME             int64  
 18  ARR_TIME                 float64
 19  ARR_DELAY                float64
 20  CANCELLED                float64
 21  CANCELLA

In [4]:
# Per-column missing-value summary: absolute count, share of all rows, and dtype.
miss_ser = flights.isna().sum()

miss_val = pd.DataFrame({
    'Missing Values': miss_ser.values,
    # Human-readable percentage; kept as a string purely for display.
    'Miss rate': round((miss_ser / flights.shape[0]) * 100, 4).astype(str) + '%',
    'Datatype': flights.dtypes
})

# 'Miss rate' holds formatted strings, so sorting by it compares text
# lexicographically (e.g. '9.5%' would rank above '82.2%'). Sort by the integer
# count instead: it is numeric and orders the columns exactly as the true rate does.
miss_val.sort_values(by='Missing Values', ascending=False)

Unnamed: 0,Missing Values,Miss rate,Datatype
CANCELLATION_CODE,2920860,97.362%,object
DELAY_DUE_LATE_AIRCRAFT,2466137,82.2046%,float64
DELAY_DUE_SECURITY,2466137,82.2046%,float64
DELAY_DUE_NAS,2466137,82.2046%,float64
DELAY_DUE_WEATHER,2466137,82.2046%,float64
DELAY_DUE_CARRIER,2466137,82.2046%,float64
AIR_TIME,86198,2.8733%,float64
ELAPSED_TIME,86198,2.8733%,float64
ARR_DELAY,86198,2.8733%,float64
WHEELS_ON,79944,2.6648%,float64


The table above shows that more than 97% of the values in the CANCELLATION_CODE feature are missing, and that this feature is stored as a string object. Since the missing codes cannot be inferred without additional information, we will remove this feature rather than fill it in based on unverified assumptions.

Additionally, the five delay-cause columns (`DELAY_DUE_*`) are each missing in more than 80% of the entries. Given that these values are numerical, we can reasonably infer that a missing entry indicates that no delay of that type occurred. Therefore, we will replace these missing values with 0 to simplify calculations while analyzing the delay data.

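Lastly, the missing values in the other flight-logistics features account for only about 3% of the entries. These values are significant enough to warrant consideration, so we will impute them rather than drop them. kNN imputation (filling each gap with the mean of the 'k' nearest neighbors) would be one option, but because of the size of the dataset we apply the cheaper median imputation further below, after narrowing the data to a single airline.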

In [5]:
# Remove CANCELLATION_CODE, which the missing-value table shows to be ~97% empty.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it after the column is already gone is a no-op
# rather than a KeyError.
flights = flights.drop(columns='CANCELLATION_CODE', errors='ignore')
flights.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000000 entries, 0 to 2999999
Data columns (total 31 columns):
 #   Column                   Dtype  
---  ------                   -----  
 0   FL_DATE                  object 
 1   AIRLINE                  object 
 2   AIRLINE_DOT              object 
 3   AIRLINE_CODE             object 
 4   DOT_CODE                 int64  
 5   FL_NUMBER                int64  
 6   ORIGIN                   object 
 7   ORIGIN_CITY              object 
 8   DEST                     object 
 9   DEST_CITY                object 
 10  CRS_DEP_TIME             int64  
 11  DEP_TIME                 float64
 12  DEP_DELAY                float64
 13  TAXI_OUT                 float64
 14  WHEELS_OFF               float64
 15  WHEELS_ON                float64
 16  TAXI_IN                  float64
 17  CRS_ARR_TIME             int64  
 18  ARR_TIME                 float64
 19  ARR_DELAY                float64
 20  CANCELLED                float64
 21  DIVERTED

In [6]:
# The five cause-specific delay columns; a missing value here is treated as
# "no delay of this kind" and is therefore replaced with zero.
delay_cols = [
    'DELAY_DUE_LATE_AIRCRAFT',
    'DELAY_DUE_SECURITY',
    'DELAY_DUE_NAS',
    'DELAY_DUE_WEATHER',
    'DELAY_DUE_CARRIER',
]

flights[delay_cols] = flights[delay_cols].fillna(value=0)
# Remaining missing values per column after the fill.
flights.isna().sum()

FL_DATE                        0
AIRLINE                        0
AIRLINE_DOT                    0
AIRLINE_CODE                   0
DOT_CODE                       0
FL_NUMBER                      0
ORIGIN                         0
ORIGIN_CITY                    0
DEST                           0
DEST_CITY                      0
CRS_DEP_TIME                   0
DEP_TIME                   77615
DEP_DELAY                  77644
TAXI_OUT                   78806
WHEELS_OFF                 78806
WHEELS_ON                  79944
TAXI_IN                    79944
CRS_ARR_TIME                   0
ARR_TIME                   79942
ARR_DELAY                  86198
CANCELLED                      0
DIVERTED                       0
CRS_ELAPSED_TIME              14
ELAPSED_TIME               86198
AIR_TIME                   86198
DISTANCE                       0
DELAY_DUE_CARRIER              0
DELAY_DUE_WEATHER              0
DELAY_DUE_NAS                  0
DELAY_DUE_SECURITY             0
DELAY_DUE_

Given the substantial size of the dataset, performing any form of imputation or data cleansing could result in a significant computational burden.

Therefore, we will extract a subset of the dataset that corresponds to the airline with the most operating flights. This process involves counting the instances for each airline represented in the dataset and selecting the one that has the greatest number of entries.

In [7]:
# Number of flights per airline (code, name and DOT label counted together),
# ordered from the most to the least frequent combination.
flights.value_counts(subset=['AIRLINE_CODE', 'AIRLINE', 'AIRLINE_DOT'])

AIRLINE_CODE  AIRLINE                             AIRLINE_DOT                           
WN            Southwest Airlines Co.              Southwest Airlines Co.: WN                576470
DL            Delta Air Lines Inc.                Delta Air Lines Inc.: DL                  395239
AA            American Airlines Inc.              American Airlines Inc.: AA                383106
OO            SkyWest Airlines Inc.               SkyWest Airlines Inc.: OO                 343737
UA            United Air Lines Inc.               United Air Lines Inc.: UA                 254504
YX            Republic Airline                    Republic Airline: YX                      143107
MQ            Envoy Air                           Envoy Air: MQ                             121256
B6            JetBlue Airways                     JetBlue Airways: B6                       112844
9E            Endeavor Air Inc.                   Endeavor Air Inc.: 9E                     112463
OH            PSA Ai

Based on the observations mentioned earlier, it is clear that __Southwest Airlines__ accounts for the largest portion of our dataset, representing approximately `20%` of the total entries. This abundance of data allows us to concentrate our analysis on a single airline, facilitating more accurate
insights specific to this airline operator.

In [8]:
# Keep only the rows whose carrier code is 'WN' and renumber them from zero.
is_southwest = flights['AIRLINE_CODE'].eq('WN')
fl_sw = flights[is_southwest].reset_index(drop=True)

# Transposed preview so that all columns of the first few flights are visible.
display(fl_sw.head().T)

# Each airline identifier should now contain a single distinct value.
for airline_col in ('AIRLINE', 'AIRLINE_DOT', 'AIRLINE_CODE'):
    display(fl_sw[airline_col].value_counts())

Unnamed: 0,0,1,2,3,4
FL_DATE,2019-07-31,2021-08-05,2020-04-07,2022-05-01,2021-12-25
AIRLINE,Southwest Airlines Co.,Southwest Airlines Co.,Southwest Airlines Co.,Southwest Airlines Co.,Southwest Airlines Co.
AIRLINE_DOT,Southwest Airlines Co.: WN,Southwest Airlines Co.: WN,Southwest Airlines Co.: WN,Southwest Airlines Co.: WN,Southwest Airlines Co.: WN
AIRLINE_CODE,WN,WN,WN,WN,WN
DOT_CODE,19393,19393,19393,19393,19393
FL_NUMBER,665,430,687,1011,2516
ORIGIN,DAL,MDW,SJC,BWI,DAL
ORIGIN_CITY,"Dallas, TX","Chicago, IL","San Jose, CA","Baltimore, MD","Dallas, TX"
DEST,OKC,BNA,LAX,BDL,MSY
DEST_CITY,"Oklahoma City, OK","Nashville, TN","Los Angeles, CA","Hartford, CT","New Orleans, LA"


AIRLINE
Southwest Airlines Co.    576470
Name: count, dtype: int64

AIRLINE_DOT
Southwest Airlines Co.: WN    576470
Name: count, dtype: int64

AIRLINE_CODE
WN    576470
Name: count, dtype: int64

Having extracted and created a subset of the dataset for Southwest Airlines, we can now perform our exploratory data analysis on the refined dataset.

In [9]:
# Structure of the single-airline subset, including non-null counts per column.
fl_sw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 576470 entries, 0 to 576469
Data columns (total 31 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   FL_DATE                  576470 non-null  object 
 1   AIRLINE                  576470 non-null  object 
 2   AIRLINE_DOT              576470 non-null  object 
 3   AIRLINE_CODE             576470 non-null  object 
 4   DOT_CODE                 576470 non-null  int64  
 5   FL_NUMBER                576470 non-null  int64  
 6   ORIGIN                   576470 non-null  object 
 7   ORIGIN_CITY              576470 non-null  object 
 8   DEST                     576470 non-null  object 
 9   DEST_CITY                576470 non-null  object 
 10  CRS_DEP_TIME             576470 non-null  int64  
 11  DEP_TIME                 557105 non-null  float64
 12  DEP_DELAY                557105 non-null  float64
 13  TAXI_OUT                 557032 non-null  float64
 14  WHEE

Let's focus on addressing the columns with missing values by applying the __median imputation method__.

In [10]:
# Names of the columns that still contain at least one missing value,
# in the same order as they appear in the frame.
miss_cols = fl_sw.columns[fl_sw.isna().any()].tolist()
miss_cols

['DEP_TIME',
 'DEP_DELAY',
 'TAXI_OUT',
 'WHEELS_OFF',
 'WHEELS_ON',
 'TAXI_IN',
 'ARR_TIME',
 'ARR_DELAY',
 'ELAPSED_TIME',
 'AIR_TIME']

In [11]:
# Fill every gap in the affected columns with that column's median.
# NOTE(review): several of these columns (DEP_TIME, WHEELS_OFF, WHEELS_ON,
# ARR_TIME) look like HHMM clock readings rather than durations, and the gaps
# presumably belong to flights that never departed or arrived -- confirm that a
# median fill is appropriate for them.
imp = SimpleImputer(copy=True, strategy='median')
fl_sw[miss_cols] = imp.fit(fl_sw[miss_cols]).transform(fl_sw[miss_cols])
# Verify that no missing values remain in any column.
fl_sw.isna().sum()

FL_DATE                    0
AIRLINE                    0
AIRLINE_DOT                0
AIRLINE_CODE               0
DOT_CODE                   0
FL_NUMBER                  0
ORIGIN                     0
ORIGIN_CITY                0
DEST                       0
DEST_CITY                  0
CRS_DEP_TIME               0
DEP_TIME                   0
DEP_DELAY                  0
TAXI_OUT                   0
WHEELS_OFF                 0
WHEELS_ON                  0
TAXI_IN                    0
CRS_ARR_TIME               0
ARR_TIME                   0
ARR_DELAY                  0
CANCELLED                  0
DIVERTED                   0
CRS_ELAPSED_TIME           0
ELAPSED_TIME               0
AIR_TIME                   0
DISTANCE                   0
DELAY_DUE_CARRIER          0
DELAY_DUE_WEATHER          0
DELAY_DUE_NAS              0
DELAY_DUE_SECURITY         0
DELAY_DUE_LATE_AIRCRAFT    0
dtype: int64

In order to prepare the dataset for analysis, we must adjust the data types of certain features to align with their provided descriptions. Additionally, we will remove some string columns that are redundant since their information is already represented as codes for each entry. Lastly, we will sort the records in chronological order and check for any missing readings.

In [12]:
# Drop the descriptive string columns whose information is already carried by
# AIRLINE_CODE, ORIGIN and DEST.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it after the columns are gone is a no-op rather
# than a KeyError.
fl_sw = fl_sw.drop(
    columns=['AIRLINE', 'AIRLINE_DOT', 'ORIGIN_CITY', 'DEST_CITY'],
    errors='ignore',
)
fl_sw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 576470 entries, 0 to 576469
Data columns (total 27 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   FL_DATE                  576470 non-null  object 
 1   AIRLINE_CODE             576470 non-null  object 
 2   DOT_CODE                 576470 non-null  int64  
 3   FL_NUMBER                576470 non-null  int64  
 4   ORIGIN                   576470 non-null  object 
 5   DEST                     576470 non-null  object 
 6   CRS_DEP_TIME             576470 non-null  int64  
 7   DEP_TIME                 576470 non-null  float64
 8   DEP_DELAY                576470 non-null  float64
 9   TAXI_OUT                 576470 non-null  float64
 10  WHEELS_OFF               576470 non-null  float64
 11  WHEELS_ON                576470 non-null  float64
 12  TAXI_IN                  576470 non-null  float64
 13  CRS_ARR_TIME             576470 non-null  int64  
 14  ARR_

It is evident that the _flight date_ feature is currently categorized as an `'object'` datatype, while it should actually be in the `'datetime'` format. The remaining datatypes appear to align with their respective feature descriptions.

In [13]:
# Convert FL_DATE from text to a proper datetime column; the explicit format
# matches the 'YYYY-MM-DD' strings seen in the raw data.
fl_sw = fl_sw.assign(
    FL_DATE=pd.to_datetime(fl_sw['FL_DATE'], format='%Y-%m-%d')
)
fl_sw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 576470 entries, 0 to 576469
Data columns (total 27 columns):
 #   Column                   Non-Null Count   Dtype         
---  ------                   --------------   -----         
 0   FL_DATE                  576470 non-null  datetime64[ns]
 1   AIRLINE_CODE             576470 non-null  object        
 2   DOT_CODE                 576470 non-null  int64         
 3   FL_NUMBER                576470 non-null  int64         
 4   ORIGIN                   576470 non-null  object        
 5   DEST                     576470 non-null  object        
 6   CRS_DEP_TIME             576470 non-null  int64         
 7   DEP_TIME                 576470 non-null  float64       
 8   DEP_DELAY                576470 non-null  float64       
 9   TAXI_OUT                 576470 non-null  float64       
 10  WHEELS_OFF               576470 non-null  float64       
 11  WHEELS_ON                576470 non-null  float64       
 12  TAXI_IN         

The entries in the date column are not in chronological order and need to be sorted. This can be achieved with the `sort_values` method, which orders the rows chronologically once the feature is stored in date-time format.

In [14]:
# Order the flights chronologically and renumber the rows from zero in one step.
fl_sw = fl_sw.sort_values('FL_DATE', ignore_index=True)
fl_sw

Unnamed: 0,FL_DATE,AIRLINE_CODE,DOT_CODE,FL_NUMBER,ORIGIN,DEST,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,TAXI_OUT,...,DIVERTED,CRS_ELAPSED_TIME,ELAPSED_TIME,AIR_TIME,DISTANCE,DELAY_DUE_CARRIER,DELAY_DUE_WEATHER,DELAY_DUE_NAS,DELAY_DUE_SECURITY,DELAY_DUE_LATE_AIRCRAFT
0,2019-01-01,WN,19393,2040,SJC,SEA,1550,1609.0,19.0,13.0,...,0.0,125.0,131.0,112.0,696.0,16.0,0.0,6.0,0.0,3.0
1,2019-01-01,WN,19393,1550,ELP,LAS,1510,1508.0,-2.0,9.0,...,0.0,110.0,95.0,82.0,583.0,0.0,0.0,0.0,0.0,0.0
2,2019-01-01,WN,19393,2096,MCO,PVD,2215,2238.0,23.0,8.0,...,0.0,155.0,136.0,125.0,1072.0,0.0,0.0,0.0,0.0,0.0
3,2019-01-01,WN,19393,2002,SJC,BUR,1040,1035.0,-5.0,12.0,...,0.0,60.0,62.0,47.0,296.0,0.0,0.0,0.0,0.0,0.0
4,2019-01-01,WN,19393,223,RNO,SJC,1740,1743.0,3.0,7.0,...,0.0,65.0,51.0,41.0,188.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
576465,2023-08-31,WN,19393,198,LAS,LAX,2220,2259.0,39.0,24.0,...,0.0,65.0,69.0,40.0,236.0,29.0,0.0,4.0,0.0,10.0
576466,2023-08-31,WN,19393,2544,KOA,LAX,1205,1211.0,6.0,11.0,...,0.0,325.0,313.0,295.0,2504.0,0.0,0.0,0.0,0.0,0.0
576467,2023-08-31,WN,19393,4577,PHX,ONT,1910,1927.0,17.0,12.0,...,0.0,70.0,63.0,46.0,325.0,0.0,0.0,0.0,0.0,0.0
576468,2023-08-31,WN,19393,1981,BNA,DEN,1510,1534.0,24.0,16.0,...,0.0,165.0,157.0,135.0,1014.0,0.0,0.0,0.0,0.0,16.0


Based on the review conducted, it appears that the data is complete and ready for our analysis. To further ensure data integrity, we can also verify if there are any duplicate entries present in the dataset.

In [15]:
# Tally fully duplicated rows: False = first occurrence of a row, True = exact
# repeat of an earlier row. Only a 'False' entry means there are no duplicates.
fl_sw.duplicated().value_counts()

False    576470
Name: count, dtype: int64

Having obtained the final cleaned dataset, we can now perform a distribution analysis to better comprehend the data characteristics, as well as carry out various statistical tests.

Specifically, we will examine daily logistical operations through metrics such as taxiing durations, delays in arrivals and departures, and overall flight times for all flights operated by Southwest Airlines.

In [16]:
# Summary statistics (count, mean, min, quartiles, max, std) for the numeric and
# datetime columns.
# NOTE(review): CRS_DEP_TIME, DEP_TIME, WHEELS_OFF, WHEELS_ON and similar columns
# appear to be HHMM clock encodings (max 2400), so their mean/std are presumably
# not meaningful as durations -- confirm before interpreting them.
fl_sw.describe()

Unnamed: 0,FL_DATE,DOT_CODE,FL_NUMBER,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,TAXI_OUT,WHEELS_OFF,WHEELS_ON,TAXI_IN,...,DIVERTED,CRS_ELAPSED_TIME,ELAPSED_TIME,AIR_TIME,DISTANCE,DELAY_DUE_CARRIER,DELAY_DUE_WEATHER,DELAY_DUE_NAS,DELAY_DUE_SECURITY,DELAY_DUE_LATE_AIRCRAFT
count,576470,576470.0,576470.0,576470.0,576470.0,576470.0,576470.0,576470.0,576470.0,576470.0,...,576470.0,576470.0,576470.0,576470.0,576470.0,576470.0,576470.0,576470.0,576470.0,576470.0
mean,2021-05-21 18:04:54.434749184,19393.0,2076.226718,1329.52413,1339.797016,10.45359,12.167507,1358.718244,1457.560055,5.475839,...,0.001971,128.060706,120.091516,102.353906,740.286402,3.195269,0.234345,1.458631,0.032036,4.579414
min,2019-01-01 00:00:00,19393.0,1.0,500.0,1.0,-52.0,1.0,1.0,1.0,1.0,...,0.0,35.0,22.0,13.0,73.0,0.0,0.0,0.0,0.0,0.0
25%,2020-01-14 00:00:00,19393.0,920.0,910.0,924.0,-3.0,9.0,935.0,1050.0,3.0,...,0.0,85.0,78.0,61.0,386.0,0.0,0.0,0.0,0.0,0.0
50%,2021-07-11 00:00:00,19393.0,1818.0,1325.0,1330.0,0.0,11.0,1342.0,1502.0,4.0,...,0.0,115.0,108.0,90.0,639.0,0.0,0.0,0.0,0.0,0.0
75%,2022-08-28 00:00:00,19393.0,2841.0,1740.0,1741.0,12.0,14.0,1753.0,1902.0,6.0,...,0.0,155.0,147.0,129.0,972.0,0.0,0.0,0.0,0.0,0.0
max,2023-08-31 00:00:00,19393.0,6999.0,2355.0,2400.0,648.0,177.0,2400.0,2400.0,176.0,...,1.0,480.0,488.0,409.0,2979.0,648.0,581.0,457.0,366.0,597.0
std,,0.0,1495.32409,484.532861,490.704258,29.703128,6.117327,490.742752,532.476864,4.679094,...,0.044348,56.439209,55.599871,54.436681,459.708089,15.249557,5.282894,9.046317,1.703886,18.861252


In [17]:
# Operational metrics that are averaged per calendar day.
daily_metric_cols = [
    'TAXI_IN', 'TAXI_OUT',
    'CRS_ELAPSED_TIME', 'AIR_TIME', 'ELAPSED_TIME',
    'ARR_DELAY', 'DEP_DELAY',
    'DELAY_DUE_CARRIER', 'DELAY_DUE_WEATHER', 'DELAY_DUE_NAS',
    'DELAY_DUE_SECURITY', 'DELAY_DUE_LATE_AIRCRAFT',
]

# One row per flight date holding the mean of each metric across that day's flights.
fl_sw_op = fl_sw.groupby(by=['FL_DATE'])[daily_metric_cols].agg('mean')
fl_sw_op

Unnamed: 0_level_0,TAXI_IN,TAXI_OUT,CRS_ELAPSED_TIME,AIR_TIME,ELAPSED_TIME,ARR_DELAY,DEP_DELAY,DELAY_DUE_CARRIER,DELAY_DUE_WEATHER,DELAY_DUE_NAS,DELAY_DUE_SECURITY,DELAY_DUE_LATE_AIRCRAFT
FL_DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2019-01-01,5.734417,12.116531,132.831978,108.037940,125.921409,6.084011,13.067751,2.750678,0.406504,1.439024,0.051491,5.512195
2019-01-02,5.847500,12.220000,132.000000,107.815000,125.882500,13.200000,19.317500,5.052500,0.060000,1.602500,0.022500,8.880000
2019-01-03,5.216931,12.267196,124.563492,99.404762,116.931217,6.042328,14.026455,4.071429,0.000000,0.775132,0.000000,4.830688
2019-01-04,5.566416,12.303258,133.020050,106.842105,124.711779,-2.265664,6.042607,2.581454,0.000000,0.571429,0.000000,1.969925
2019-01-05,5.132653,12.510204,132.363946,106.224490,123.904762,-1.897959,6.469388,1.340136,0.085034,1.207483,0.000000,2.744898
...,...,...,...,...,...,...,...,...,...,...,...,...
2023-08-27,5.722477,13.685780,120.126147,96.337156,115.736239,6.199541,10.440367,2.965596,0.190367,2.637615,0.000000,4.924312
2023-08-28,5.430913,13.149883,125.409836,101.711944,120.299766,1.215457,6.426230,1.833724,0.327869,1.292740,0.000000,3.355972
2023-08-29,5.208333,12.840278,122.615741,97.636574,115.803241,-2.333333,3.555556,1.252315,0.171296,1.108796,0.000000,2.141204
2023-08-30,5.685230,13.975787,121.622276,97.980630,117.757869,4.864407,8.154964,2.399516,0.382567,4.106538,0.000000,3.569007


In [18]:
# Load the airport reference table, used below to look up coordinates by IATA code.
us_airports = pd.read_csv(filepath_or_buffer='./data/us_airports.csv')
# Preview the first ten airports.
us_airports.head(n=10)

Unnamed: 0,id,ident,type,name,latitude_deg,longitude_deg,elevation_ft,continent,country_name,iso_country,...,scheduled_service,gps_code,icao_code,iata_code,local_code,home_link,wikipedia_link,keywords,score,last_updated
0,3632,KLAX,large_airport,Los Angeles International Airport,33.942501,-118.407997,125.0,,United States,US,...,1,KLAX,KLAX,LAX,LAX,https://www.flylax.com/,https://en.wikipedia.org/wiki/Los_Angeles_Inte...,Tom Bradley,1335475,2024-04-02T16:36:13+00:00
1,3754,KORD,large_airport,Chicago O'Hare International Airport,41.9786,-87.9048,680.0,,United States,US,...,1,KORD,KORD,ORD,ORD,https://www.flychicago.com/ohare/home/pages/de...,https://en.wikipedia.org/wiki/O'Hare_Internati...,"CHI, Orchard Place",1503175,2024-03-09T23:28:49+00:00
2,3622,KJFK,large_airport,John F Kennedy International Airport,40.639447,-73.779317,13.0,,United States,US,...,1,KJFK,KJFK,JFK,JFK,https://www.jfkairport.com/,https://en.wikipedia.org/wiki/John_F._Kennedy_...,"Manhattan, New York City, NYC, Idlewild, IDL, ...",1052075,2022-10-18T18:49:55+00:00
3,3384,KATL,large_airport,Hartsfield Jackson Atlanta International Airport,33.6367,-84.428101,1026.0,,United States,US,...,1,KATL,KATL,ATL,ATL,http://www.atlanta-airport.com/,https://en.wikipedia.org/wiki/Hartsfield–Jacks...,,2002475,2024-04-02T16:26:01+00:00
4,3878,KSFO,large_airport,San Francisco International Airport,37.619806,-122.374821,13.0,,United States,US,...,1,KSFO,KSFO,SFO,SFO,http://www.flysfo.com/,https://en.wikipedia.org/wiki/San_Francisco_In...,"QSF, QBA",1112475,2024-04-02T16:43:50+00:00
5,3521,KEWR,large_airport,Newark Liberty International Airport,40.692501,-74.168701,18.0,,United States,US,...,1,KEWR,KEWR,EWR,EWR,http://www.panynj.gov/CommutingTravel/airports...,https://en.wikipedia.org/wiki/Newark_Liberty_I...,"Manhattan, New York City, NYC",1064475,2022-09-17T20:28:21+00:00
6,3488,KDFW,large_airport,Dallas Fort Worth International Airport,32.896801,-97.038002,607.0,,United States,US,...,1,KDFW,KDFW,DFW,DFW,https://www.dfwairport.com/,https://en.wikipedia.org/wiki/Dallas/Fort_Wort...,QDF,1203175,2018-09-19T14:53:02+00:00
7,3631,KLAS,large_airport,Harry Reid International Airport,36.083361,-115.151817,2181.0,,United States,US,...,1,KLAS,KLAS,LAS,LAS,https://www.harryreidairport.com/,https://en.wikipedia.org/wiki/Harry_Reid_Inter...,McCarran International Airport,1068475,2022-10-18T19:05:10+00:00
8,3670,KMCO,large_airport,Orlando International Airport,28.429399,-81.308998,96.0,,United States,US,...,1,KMCO,KMCO,MCO,MCO,http://www.orlandoairports.net/,https://en.wikipedia.org/wiki/Orlando_Internat...,"Disney World,Epcot Center",1044075,2008-06-13T14:30:04+00:00
9,3486,KDEN,large_airport,Denver International Airport,39.861698,-104.672997,5431.0,,United States,US,...,1,KDEN,KDEN,DEN,DEN,http://www.flydenver.com/,https://en.wikipedia.org/wiki/Denver_Internati...,"DVX, KVDX",1103275,2015-11-13T09:28:42+00:00


In [19]:
# Coordinate lookup keyed by IATA code. Rows without a code cannot identify an
# origin airport, and a repeated code would fan every matching flight out into
# several rows (inflating per-airport statistics), so both are removed before
# the join. When a code repeats, the first row in file order is kept.
airport_coords = (
    us_airports[['iata_code', 'latitude_deg', 'longitude_deg']]
    .dropna(subset=['iata_code'])
    .drop_duplicates(subset='iata_code', keep='first')
)

# Attach the origin airport's coordinates to every flight. The left join keeps
# flights whose ORIGIN has no match (their coordinates are NaN), and
# validate='many_to_one' makes pandas raise if the lookup key is not unique.
fl_sw_geo = (
    fl_sw
    .merge(airport_coords, left_on='ORIGIN', right_on='iata_code',
           how='left', validate='many_to_one')
    .drop(columns=['iata_code'])
    .rename(columns={'latitude_deg': 'ORIGIN_LATITUDE', 'longitude_deg': 'ORIGIN_LONGITUDE'})
)

# A left join against a unique key must yield exactly one row per flight.
assert len(fl_sw_geo) == len(fl_sw), "merge changed the number of flight rows"
fl_sw_geo.head(10)

Unnamed: 0,FL_DATE,AIRLINE_CODE,DOT_CODE,FL_NUMBER,ORIGIN,DEST,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,TAXI_OUT,...,ELAPSED_TIME,AIR_TIME,DISTANCE,DELAY_DUE_CARRIER,DELAY_DUE_WEATHER,DELAY_DUE_NAS,DELAY_DUE_SECURITY,DELAY_DUE_LATE_AIRCRAFT,ORIGIN_LATITUDE,ORIGIN_LONGITUDE
0,2019-01-01,WN,19393,2040,SJC,SEA,1550,1609.0,19.0,13.0,...,131.0,112.0,696.0,16.0,0.0,6.0,0.0,3.0,37.362452,-121.929188
1,2019-01-01,WN,19393,1550,ELP,LAS,1510,1508.0,-2.0,9.0,...,95.0,82.0,583.0,0.0,0.0,0.0,0.0,0.0,31.807199,-106.377998
2,2019-01-01,WN,19393,2096,MCO,PVD,2215,2238.0,23.0,8.0,...,136.0,125.0,1072.0,0.0,0.0,0.0,0.0,0.0,28.429399,-81.308998
3,2019-01-01,WN,19393,2002,SJC,BUR,1040,1035.0,-5.0,12.0,...,62.0,47.0,296.0,0.0,0.0,0.0,0.0,0.0,37.362452,-121.929188
4,2019-01-01,WN,19393,223,RNO,SJC,1740,1743.0,3.0,7.0,...,51.0,41.0,188.0,0.0,0.0,0.0,0.0,0.0,39.4991,-119.767998
5,2019-01-01,WN,19393,3158,CMH,FLL,1650,1647.0,-3.0,7.0,...,157.0,135.0,973.0,0.0,0.0,0.0,0.0,0.0,39.998001,-82.891899
6,2019-01-01,WN,19393,614,DCA,CMH,1905,1933.0,28.0,7.0,...,68.0,58.0,323.0,13.0,0.0,0.0,0.0,3.0,38.8521,-77.037697
7,2019-01-01,WN,19393,802,AUS,ATL,640,640.0,0.0,10.0,...,115.0,97.0,813.0,0.0,0.0,0.0,0.0,0.0,30.197535,-97.662015
8,2019-01-01,WN,19393,1277,EWR,MDW,1750,1842.0,52.0,19.0,...,156.0,132.0,711.0,1.0,0.0,6.0,0.0,51.0,40.692501,-74.168701
9,2019-01-01,WN,19393,1750,LAX,OAK,1115,1114.0,-1.0,10.0,...,71.0,57.0,337.0,0.0,0.0,0.0,0.0,0.0,33.942501,-118.407997


In [25]:
# One row per origin airport: the mean departure delay plus the airport's
# coordinates (taken from the first non-null value within each group).
fl_sw_geo_agg = (
    fl_sw_geo
    .groupby("ORIGIN")
    .agg(
        AVG_DELAY=pd.NamedAgg(column="DEP_DELAY", aggfunc="mean"),
        ORIGIN_LATITUDE=pd.NamedAgg(column="ORIGIN_LATITUDE", aggfunc="first"),
        ORIGIN_LONGITUDE=pd.NamedAgg(column="ORIGIN_LONGITUDE", aggfunc="first"),
    )
    .reset_index()
)
fl_sw_geo_agg.head(10)

Unnamed: 0,ORIGIN,AVG_DELAY,ORIGIN_LATITUDE,ORIGIN_LONGITUDE
0,ABQ,8.984209,35.039976,-106.608925
1,ALB,7.217391,42.748299,-73.801697
2,AMA,5.094089,35.219398,-101.706001
3,ATL,11.148153,33.6367,-84.428101
4,AUS,10.678088,30.197535,-97.662015
5,BDL,7.676019,41.93851,-72.688066
6,BHM,8.73022,33.562901,-86.753502
7,BLI,12.52451,48.792801,-122.538002
8,BNA,9.550921,36.1245,-86.6782
9,BOI,4.813139,43.5644,-116.223


In [28]:
# Build one point geometry per origin airport (x = longitude, y = latitude).
origin_points = gpd.points_from_xy(
    x=fl_sw_geo_agg['ORIGIN_LONGITUDE'],
    y=fl_sw_geo_agg['ORIGIN_LATITUDE'],
)

# Pair each airport's code and average delay with its point, tagged as EPSG:4326.
gdf = gpd.GeoDataFrame(
    fl_sw_geo_agg[["ORIGIN", "AVG_DELAY"]],
    geometry=origin_points,
    crs="EPSG:4326",
)

# Write the result to disk as GeoJSON.
gdf.to_file('./data/fl_sw_agg.geojson', driver='GeoJSON')