In [54]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [55]:
# Reads each of the raw data files, drops any rows with missing values, duplicates, or rows that are not from JFK, and saves the cleaned data to a new file in the clean directory
for yy in range(10,19):
    for mm in range(1,13):
        if yy == 18 and mm == 8:
            break
        if mm < 10:
            mm = f'0{mm}'

        df = pd.read_csv(f'data/raw/JFK_{mm}_{yy}.csv', sep=',')
        df = df.dropna()
        df = df.drop_duplicates()
        df = df.drop(df[df['ORIGIN'] != 'JFK'].index)
        df.to_csv(f'data/clean/JFK_{mm}_{yy}.csv', index=False)

In [56]:
# Concatenates all of the cleaned data files into one large dataset
full_df = pd.DataFrame()

for yy in range(10,19):
    for mm in range(1,13):
        if yy == 18 and mm == 8:
            break
        if mm < 10:
            mm = f'0{mm}'

        df = pd.read_csv(f'data/clean/JFK_{mm}_{yy}.csv', sep=',')
        full_df = pd.concat([full_df, df])
        full_df.to_csv('data/clean/full.csv', index=False)

In [57]:
full_df = pd.read_csv('data/clean/full.csv', sep=',')
full_df

Unnamed: 0,FL_DATE,ORIGIN,DEST,DEP_TIME,DEP_DELAY,DEP_DELAY_NEW,DEP_DEL15,CANCELLED
0,01/03/2010 00:00,JFK,BTV,106.0,146.0,146.0,1.0,1.0
1,01/10/2010 00:00,JFK,MCO,1037.0,66.0,66.0,1.0,1.0
2,1/22/2010 12:00:00 AM,JFK,SAN,1925.0,-5.0,0.0,0.0,1.0
3,1/26/2010 12:00:00 AM,JFK,MIA,1459.0,-6.0,0.0,0.0,1.0
4,1/29/2010 12:00:00 AM,JFK,SFO,815.0,75.0,75.0,1.0,1.0
...,...,...,...,...,...,...,...,...
863962,7/31/2018 12:00:00 AM,JFK,TPA,1053.0,-7.0,0.0,0.0,0.0
863963,7/31/2018 12:00:00 AM,JFK,TPA,1340.0,10.0,10.0,0.0,0.0
863964,7/31/2018 12:00:00 AM,JFK,TPA,1525.0,-5.0,0.0,0.0,0.0
863965,7/31/2018 12:00:00 AM,JFK,TPA,1954.0,29.0,29.0,1.0,0.0


In [58]:
weather_df = pd.read_csv('data/JFK_weather_10_18/jfk_weather_cleaned.csv', sep=',')
weather_df

Unnamed: 0,DATE,HOURLYVISIBILITY,HOURLYDRYBULBTEMPF,HOURLYWETBULBTEMPF,HOURLYDewPointTempF,HOURLYRelativeHumidity,HOURLYWindSpeed,HOURLYStationPressure,HOURLYSeaLevelPressure,HOURLYPrecip,HOURLYAltimeterSetting,HOURLYWindDirectionSin,HOURLYWindDirectionCos,HOURLYPressureTendencyIncr,HOURLYPressureTendencyDecr,HOURLYPressureTendencyCons
0,2010-01-01 01:00:00,6.0,33.0,32.0,31.0,92.0,0.0,29.97,29.99,0.01,29.99,0.000000,1.000000,0,1,0
1,2010-01-01 02:00:00,6.0,33.0,33.0,32.0,96.0,0.0,29.97,29.99,0.02,29.99,0.000000,1.000000,0,1,0
2,2010-01-01 03:00:00,5.0,33.0,33.0,32.0,96.0,0.0,29.97,29.99,0.00,29.99,0.000000,1.000000,0,1,0
3,2010-01-01 04:00:00,5.0,33.0,33.0,32.0,96.0,0.0,29.95,29.97,0.00,29.97,0.000000,1.000000,0,1,0
4,2010-01-01 05:00:00,5.0,33.0,32.0,31.0,92.0,0.0,29.93,29.96,0.00,29.95,0.000000,1.000000,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75114,2018-07-27 19:00:00,10.0,76.0,73.0,72.0,88.0,3.0,30.00,30.02,0.00,30.02,-0.766044,-0.642788,1,0,0
75115,2018-07-27 20:00:00,4.0,69.0,69.0,69.0,100.0,13.0,29.99,30.01,1.16,30.01,0.642788,0.766044,1,0,0
75116,2018-07-27 21:00:00,10.0,71.0,70.0,70.0,96.0,0.0,30.02,30.04,0.01,30.04,0.000000,1.000000,1,0,0
75117,2018-07-27 22:00:00,10.0,72.0,71.0,70.0,94.0,5.0,30.00,30.02,0.00,30.02,0.766044,0.642788,1,0,0


In [59]:
# reformatting departure times to date times
full_df['DEP_TIME'] = full_df['DEP_TIME'].apply(lambda x: f"{int(x):04d}")
full_df['DEP_TIME'] = full_df['DEP_TIME'].apply(lambda x: f"{x[:2]}:{x[2:]}:00")
full_df

Unnamed: 0,FL_DATE,ORIGIN,DEST,DEP_TIME,DEP_DELAY,DEP_DELAY_NEW,DEP_DEL15,CANCELLED
0,01/03/2010 00:00,JFK,BTV,01:06:00,146.0,146.0,1.0,1.0
1,01/10/2010 00:00,JFK,MCO,10:37:00,66.0,66.0,1.0,1.0
2,1/22/2010 12:00:00 AM,JFK,SAN,19:25:00,-5.0,0.0,0.0,1.0
3,1/26/2010 12:00:00 AM,JFK,MIA,14:59:00,-6.0,0.0,0.0,1.0
4,1/29/2010 12:00:00 AM,JFK,SFO,08:15:00,75.0,75.0,1.0,1.0
...,...,...,...,...,...,...,...,...
863962,7/31/2018 12:00:00 AM,JFK,TPA,10:53:00,-7.0,0.0,0.0,0.0
863963,7/31/2018 12:00:00 AM,JFK,TPA,13:40:00,10.0,10.0,0.0,0.0
863964,7/31/2018 12:00:00 AM,JFK,TPA,15:25:00,-5.0,0.0,0.0,0.0
863965,7/31/2018 12:00:00 AM,JFK,TPA,19:54:00,29.0,29.0,1.0,0.0


In [60]:
# reformatting FL_DATE to correct date-time format
# remove AM/PM
full_df['FL_DATE'] = full_df['FL_DATE'].str.replace(r'AM|PM', '', regex=True)

# convert to date time
full_df['FL_DATE'] = pd.to_datetime(full_df['FL_DATE'], format='mixed')
full_df

Unnamed: 0,FL_DATE,ORIGIN,DEST,DEP_TIME,DEP_DELAY,DEP_DELAY_NEW,DEP_DEL15,CANCELLED
0,2010-01-03 00:00:00,JFK,BTV,01:06:00,146.0,146.0,1.0,1.0
1,2010-01-10 00:00:00,JFK,MCO,10:37:00,66.0,66.0,1.0,1.0
2,2010-01-22 12:00:00,JFK,SAN,19:25:00,-5.0,0.0,0.0,1.0
3,2010-01-26 12:00:00,JFK,MIA,14:59:00,-6.0,0.0,0.0,1.0
4,2010-01-29 12:00:00,JFK,SFO,08:15:00,75.0,75.0,1.0,1.0
...,...,...,...,...,...,...,...,...
863962,2018-07-31 12:00:00,JFK,TPA,10:53:00,-7.0,0.0,0.0,0.0
863963,2018-07-31 12:00:00,JFK,TPA,13:40:00,10.0,10.0,0.0,0.0
863964,2018-07-31 12:00:00,JFK,TPA,15:25:00,-5.0,0.0,0.0,0.0
863965,2018-07-31 12:00:00,JFK,TPA,19:54:00,29.0,29.0,1.0,0.0


In [61]:
# Handle "24:00:00" in DEP_TIME
mask = full_df['DEP_TIME'] == '24:00:00'  # Find rows with "24:00:00"

# Replace "24:00:00" with "00:00:00"
full_df.loc[mask, 'DEP_TIME'] = '00:00:00'

# Add one day to FL_DATE where DEP_TIME was "24:00:00"
full_df.loc[mask, 'FL_DATE'] += pd.Timedelta(days=1)

# Convert to proper datetime format
full_df['DEP_DATE_TIME'] = pd.to_datetime(full_df['FL_DATE'].dt.date.astype(str) + ' ' + full_df['DEP_TIME'])
full_df

Unnamed: 0,FL_DATE,ORIGIN,DEST,DEP_TIME,DEP_DELAY,DEP_DELAY_NEW,DEP_DEL15,CANCELLED,DEP_DATE_TIME
0,2010-01-03 00:00:00,JFK,BTV,01:06:00,146.0,146.0,1.0,1.0,2010-01-03 01:06:00
1,2010-01-10 00:00:00,JFK,MCO,10:37:00,66.0,66.0,1.0,1.0,2010-01-10 10:37:00
2,2010-01-22 12:00:00,JFK,SAN,19:25:00,-5.0,0.0,0.0,1.0,2010-01-22 19:25:00
3,2010-01-26 12:00:00,JFK,MIA,14:59:00,-6.0,0.0,0.0,1.0,2010-01-26 14:59:00
4,2010-01-29 12:00:00,JFK,SFO,08:15:00,75.0,75.0,1.0,1.0,2010-01-29 08:15:00
...,...,...,...,...,...,...,...,...,...
863962,2018-07-31 12:00:00,JFK,TPA,10:53:00,-7.0,0.0,0.0,0.0,2018-07-31 10:53:00
863963,2018-07-31 12:00:00,JFK,TPA,13:40:00,10.0,10.0,0.0,0.0,2018-07-31 13:40:00
863964,2018-07-31 12:00:00,JFK,TPA,15:25:00,-5.0,0.0,0.0,0.0,2018-07-31 15:25:00
863965,2018-07-31 12:00:00,JFK,TPA,19:54:00,29.0,29.0,1.0,0.0,2018-07-31 19:54:00


In [62]:
# Ensure datetime format
weather_df['DATE'] = pd.to_datetime(weather_df['DATE'])
full_df['DEP_DATE_TIME'] = pd.to_datetime(full_df['DEP_DATE_TIME'])

# Round DEP_DATE_TIME to the nearest hour for merging
full_df['DEP_DATE_TIME_HOURLY'] = full_df['DEP_DATE_TIME'].dt.round('H')

# Sort both dataframes by time (required for merge_asof)
weather_df = weather_df.sort_values('DATE')
full_df = full_df.sort_values('DEP_DATE_TIME_HOURLY')

# Merge, taking the nearest past weather observation
merged_df = pd.merge_asof(
    full_df, 
    weather_df, 
    left_on='DEP_DATE_TIME_HOURLY',  # Use rounded departure time for merging
    right_on='DATE', 
    direction='backward'  # Match the most recent weather before departure
)

# Drop the helper column
merged_df.drop(columns=['DEP_DATE_TIME_HOURLY'], inplace=True)

merged_df

  full_df['DEP_DATE_TIME_HOURLY'] = full_df['DEP_DATE_TIME'].dt.round('H')


Unnamed: 0,FL_DATE,ORIGIN,DEST,DEP_TIME,DEP_DELAY,DEP_DELAY_NEW,DEP_DEL15,CANCELLED,DEP_DATE_TIME,DATE,...,HOURLYWindSpeed,HOURLYStationPressure,HOURLYSeaLevelPressure,HOURLYPrecip,HOURLYAltimeterSetting,HOURLYWindDirectionSin,HOURLYWindDirectionCos,HOURLYPressureTendencyIncr,HOURLYPressureTendencyDecr,HOURLYPressureTendencyCons
0,2010-01-01 00:00:00,JFK,BUF,00:23:00,84.0,84.0,1.0,0.0,2010-01-01 00:23:00,NaT,...,,,,,,,,,,
1,2010-01-01 00:00:00,JFK,SYR,01:29:00,164.0,164.0,1.0,0.0,2010-01-01 01:29:00,2010-01-01 01:00:00,...,0.0,29.97,29.99,0.01,29.99,0.0,1.0,0.0,1.0,0.0
2,2010-01-01 00:00:00,JFK,PBI,06:15:00,15.0,15.0,1.0,0.0,2010-01-01 06:15:00,2010-01-01 06:00:00,...,0.0,29.95,29.97,0.00,29.97,0.0,1.0,0.0,1.0,0.0
3,2010-01-01 00:00:00,JFK,MIA,05:57:00,12.0,12.0,0.0,0.0,2010-01-01 05:57:00,2010-01-01 06:00:00,...,0.0,29.95,29.97,0.00,29.97,0.0,1.0,0.0,1.0,0.0
4,2010-01-01 00:00:00,JFK,LAX,06:27:00,2.0,2.0,0.0,0.0,2010-01-01 06:27:00,2010-01-01 06:00:00,...,0.0,29.95,29.97,0.00,29.97,0.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
863962,2018-07-31 12:00:00,JFK,BOS,23:10:00,100.0,100.0,1.0,0.0,2018-07-31 23:10:00,2018-07-27 23:00:00,...,0.0,30.01,30.03,0.00,30.03,0.0,1.0,1.0,0.0,0.0
863963,2018-07-31 12:00:00,JFK,BOS,22:54:00,-1.0,0.0,0.0,0.0,2018-07-31 22:54:00,2018-07-27 23:00:00,...,0.0,30.01,30.03,0.00,30.03,0.0,1.0,1.0,0.0,0.0
863964,2018-07-31 12:00:00,JFK,LAX,22:43:00,-7.0,0.0,0.0,0.0,2018-07-31 22:43:00,2018-07-27 23:00:00,...,0.0,30.01,30.03,0.00,30.03,0.0,1.0,1.0,0.0,0.0
863965,2018-07-31 12:00:00,JFK,BTV,23:35:00,50.0,50.0,1.0,0.0,2018-07-31 23:35:00,2018-07-27 23:00:00,...,0.0,30.01,30.03,0.00,30.03,0.0,1.0,1.0,0.0,0.0
