In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
def CleanFlightData(airport):
    """Clean every monthly raw flight file of ``airport`` and save the result.

    Files are named ``{airport}_MM_YY.csv`` and cover month 01 of year 10
    through month 07 of year 18. Each one is read from ``data/raw/{airport}/``
    and written, under the same name, to ``data/clean/{airport}/``.

    Cleaning applied to each file, in this order:
      1. exact duplicate rows are removed;
      2. only rows whose ORIGIN equals ``airport`` are kept;
      3. rows with DEP_DELAY_GROUP > 0 and a missing value in any of the five
         delay-reason columns are removed;
      4. every remaining missing value is replaced with 0.
    """
    delay_reason_cols = ['CARRIER_DELAY', 'WEATHER_DELAY', 'NAS_DELAY', 'SECURITY_DELAY', 'LATE_AIRCRAFT_DELAY']

    # (year, month) pairs in chronological order, stopping after month 7 of year 18
    periods = [
        (yy, month)
        for yy in range(10, 19)
        for month in range(1, 13)
        if (yy, month) < (18, 8)
    ]

    for yy, month in periods:
        # File names use a zero-padded, two-digit month
        mm = str(month).zfill(2)

        flights = pd.read_csv(f'data/raw/{airport}/{airport}_{mm}_{yy}.csv', sep=',').drop_duplicates()

        # Keep only flights departing from the target airport
        flights = flights[flights['ORIGIN'] == f'{airport}']

        # A delayed flight without a recorded delay reason is unusable: drop it
        is_delayed = flights['DEP_DELAY_GROUP'] > 0
        reason_missing = flights[delay_reason_cols].isna().any(axis=1)
        flights = flights[~(is_delayed & reason_missing)]

        flights.fillna(0).to_csv(f'data/clean/{airport}/{airport}_{mm}_{yy}.csv', index=False)

In [3]:
def MergeFiles(airport):
    """Concatenate all cleaned monthly flight files of ``airport`` into one CSV.

    Reads ``data/clean/{airport}/{airport}_MM_YY.csv`` for every month from
    01/10 through 07/18 (e.g. ``JFK_01_10.csv``), stacks them in chronological
    order and writes the result to ``data/clean/{airport}/{airport}_full.csv``
    without the index column.

    The frames are collected first and concatenated/written exactly once.
    Concatenating and re-writing the growing file after every month repeats
    the same work for each of the monthly files and yields the same final CSV.
    """
    monthly_frames = []

    for yy in range(10, 19):
        for month in range(1, 13):
            # The data ends with month 7 of year 18
            if yy == 18 and month == 8:
                break
            # File names use a zero-padded, two-digit month
            mm = f'{month:02d}'

            monthly_frames.append(pd.read_csv(f'data/clean/{airport}/{airport}_{mm}_{yy}.csv', sep=','))

    full_df = pd.concat(monthly_frames)
    full_df.to_csv(f'data/clean/{airport}/{airport}_full.csv', index=False)

In [4]:
def CleanWeatherData(airport):
    """Clean the weather file of ``airport`` and overwrite it in place.

    Reads ``data/Weather/{airport}_Weather_cleaned.csv``, replaces every
    missing value with 0, removes duplicate rows (compared across all columns
    present in the file, i.e. before the column selection) and keeps only the
    columns listed below. The result is written back to the same path without
    the index column.
    """
    kept_columns = [
        'DATE',
        'HOURLYVISIBILITY',
        'HOURLYDRYBULBTEMPF',
        'HOURLYWETBULBTEMPF',
        'HOURLYDewPointTempF',
        'HOURLYRelativeHumidity',
        'HOURLYWindSpeed',
        'HOURLYStationPressure',
        'HOURLYSeaLevelPressure',
        'HOURLYPrecip',
        'HOURLYAltimeterSetting',
        'HOURLYWindDirectionSin',
        'HOURLYWindDirectionCos',
        'HOURLYPressureTendencyIncr',
        'HOURLYPressureTendencyDecr',
        'HOURLYPressureTendencyCons',
    ]

    weather = (
        pd.read_csv(f'data/Weather/{airport}_Weather_cleaned.csv', sep=',')
        .fillna(0)
        .drop_duplicates()
    )

    # The cleaned data replaces the file it was read from
    weather[kept_columns].to_csv(f'data/Weather/{airport}_Weather_cleaned.csv', index=False)

In [5]:
def FormatTimes(full_df):
    """Normalise the departure-time columns and build ``DEP_DATE_TIME``.

    The frame is modified in place and also returned.

    Parameters
    ----------
    full_df : pandas.DataFrame
        Must contain ``CRS_DEP_TIME`` and ``DEP_TIME`` as values convertible
        with ``int()`` in HHMM form (e.g. 930 for 09:30) and ``FL_DATE`` as
        date strings, optionally carrying an AM/PM marker.

    Returns
    -------
    pandas.DataFrame
        The same frame with:
        * ``CRS_DEP_TIME`` / ``DEP_TIME`` rewritten as ``"HH:MM:00"`` strings,
          where ``"24:00:00"`` is normalised to ``"00:00:00"``;
        * ``FL_DATE`` converted to datetime, moved one day forward on rows
          whose scheduled time was ``"24:00:00"``;
        * a new ``DEP_DATE_TIME`` datetime column combining the date part of
          ``FL_DATE`` with ``CRS_DEP_TIME``.
    """
    def _to_clock(hhmm):
        # 930 -> "0930" -> "09:30:00"
        digits = f"{int(hhmm):04d}"
        return f"{digits[:2]}:{digits[2:]}:00"

    for column in ('CRS_DEP_TIME', 'DEP_TIME'):
        full_df[column] = full_df[column].apply(_to_clock)

    # Strip AM/PM markers so the strings parse; only the date part is used below
    full_df['FL_DATE'] = full_df['FL_DATE'].str.replace(r'AM|PM', '', regex=True)
    full_df['FL_DATE'] = pd.to_datetime(full_df['FL_DATE'], format='mixed')

    # "24:00:00" is not a valid clock time for pd.to_datetime; find it per column
    scheduled_is_24h = full_df['CRS_DEP_TIME'] == '24:00:00'
    actual_is_24h = full_df['DEP_TIME'] == '24:00:00'

    # Each mask must rewrite its OWN column. Crossing them leaves a scheduled
    # "24:00:00" in place (the datetime conversion below then fails) and
    # overwrites valid scheduled times whenever the actual time was "24:00:00".
    full_df.loc[scheduled_is_24h, 'CRS_DEP_TIME'] = '00:00:00'
    full_df.loc[actual_is_24h, 'DEP_TIME'] = '00:00:00'

    # A scheduled 24:00 is 00:00 of the following day. DEP_DATE_TIME is built
    # from the scheduled time, so only the scheduled mask shifts the date.
    full_df.loc[scheduled_is_24h, 'FL_DATE'] += pd.Timedelta(days=1)

    full_df['DEP_DATE_TIME'] = pd.to_datetime(full_df['FL_DATE'].dt.date.astype(str) + ' ' + full_df['CRS_DEP_TIME'])

    return full_df

In [6]:
def MergeDatasets(airport):
    """Join the combined flight data of ``airport`` with its weather data.

    Reads ``data/clean/{airport}/{airport}_full.csv`` and
    ``data/Weather/{airport}_Weather_cleaned.csv``, formats the flight times
    with ``FormatTimes`` and attaches to every flight the latest weather row
    whose ``DATE`` is at or before the flight's scheduled departure rounded
    to the nearest hour.

    Returns
    -------
    pandas.DataFrame
        Flight columns plus weather columns, sorted by the rounded departure
        hour, with ``FL_DATE`` removed and ``DEP_DATE_TIME_HOURLY`` as the
        first column.
    """
    full_df = pd.read_csv(f'data/clean/{airport}/{airport}_full.csv', sep=',')
    weather_df = pd.read_csv(f'data/Weather/{airport}_Weather_cleaned.csv', sep=',')

    full_df = FormatTimes(full_df)

    # Both merge keys must be datetimes
    weather_df['DATE'] = pd.to_datetime(weather_df['DATE'])
    full_df['DEP_DATE_TIME'] = pd.to_datetime(full_df['DEP_DATE_TIME'])

    # Round the scheduled departure to the nearest hour for merging.
    # 'h' is the current hourly alias; the uppercase 'H' spelling is deprecated.
    full_df['DEP_DATE_TIME_HOURLY'] = full_df['DEP_DATE_TIME'].dt.round('h')

    # merge_asof requires both frames to be sorted by their key
    weather_df = weather_df.sort_values('DATE')
    full_df = full_df.sort_values('DEP_DATE_TIME_HOURLY')

    # NOTE(review): the lookup is relative to the ROUNDED hour, so a departure
    # that rounds up can be paired with an observation taken after the actual
    # scheduled time — confirm this is intended.
    merged_df = pd.merge_asof(
        full_df,
        weather_df,
        left_on='DEP_DATE_TIME_HOURLY',
        right_on='DATE',
        direction='backward'  # latest weather row at or before the rounded hour
    )

    # FL_DATE is superseded by the departure datetime columns
    merged_df = merged_df.drop(columns=['FL_DATE'])

    # Move the rounded departure hour to the first column
    merged_df.insert(0, "DEP_DATE_TIME_HOURLY", merged_df.pop("DEP_DATE_TIME_HOURLY"))

    return merged_df

In [7]:
def TidyData(merged_df):
    """Produce the final modelling table from the merged flight/weather data.

    The input frame is not modified. Steps, in order:

    * drop every cancelled flight (rows where ``CANCELLED`` is not 0);
    * convert the dry-bulb, wet-bulb and dew-point temperatures from
      Fahrenheit to Celsius (new ``...C`` columns replace the ``...F`` ones);
    * drop ``DEP_DATE_TIME_HOURLY``, ``CRS_DEP_TIME`` and ``DATE``;
    * make ``DEP_DATE_TIME`` the first column and ``DEP_TIME`` (renamed to
      ``ACC_DEP_TIME``) the second;
    * collapse negative ``DEP_DELAY_GROUP`` values ("early") to 0;
    * add ``DEP_HOUR``, ``DEP_DAY``, ``DEP_MONTH`` and ``DEP_WEEKDAY`` taken
      from ``DEP_DATE_TIME`` (which must be a datetime column);
    * cast the delay and cancellation columns to ``int``;
    * reset the row index to 0..n-1.

    Returns
    -------
    pandas.DataFrame
        The tidied copy.
    """
    # Explicit copy: assigning columns to a boolean-filtered frame otherwise
    # triggers SettingWithCopyWarning
    flights = merged_df[merged_df['CANCELLED'] == 0].copy()

    # Fahrenheit -> Celsius; the new columns are appended after the existing ones
    celsius_name = {
        'HOURLYDRYBULBTEMPF': 'HOURLYDRYBULBTEMPC',
        'HOURLYWETBULBTEMPF': 'HOURLYWETBULBTEMPC',
        'HOURLYDewPointTempF': 'HOURLYDewPointTempC',
    }
    for fahrenheit_col, celsius_col in celsius_name.items():
        flights[celsius_col] = (flights[fahrenheit_col] - 32) * 5/9

    # Remove the Fahrenheit originals and the columns only needed for merging
    flights = flights.drop(columns=[*celsius_name, 'DEP_DATE_TIME_HOURLY', 'CRS_DEP_TIME', 'DATE'])

    # Move DEP_DATE_TIME and DEP_TIME to the front, then rename the latter
    for position, column in enumerate(['DEP_DATE_TIME', 'DEP_TIME']):
        flights.insert(position, column, flights.pop(column))
    flights = flights.rename(columns={'DEP_TIME': 'ACC_DEP_TIME'})

    # Remove the 'early' classification: negative delay groups become 0
    flights.loc[flights['DEP_DELAY_GROUP'] < 0, 'DEP_DELAY_GROUP'] = 0

    # Time-related features of the scheduled departure
    flights['DEP_HOUR'] = flights['DEP_DATE_TIME'].dt.hour
    flights['DEP_DAY'] = flights['DEP_DATE_TIME'].dt.day
    flights['DEP_MONTH'] = flights['DEP_DATE_TIME'].dt.month
    flights['DEP_WEEKDAY'] = flights['DEP_DATE_TIME'].dt.weekday

    integer_columns = [
        'DEP_DELAY_NEW', 'DEP_DELAY_GROUP', 'CANCELLED', 'CARRIER_DELAY',
        'WEATHER_DELAY', 'NAS_DELAY', 'SECURITY_DELAY', 'LATE_AIRCRAFT_DELAY',
    ]
    flights = flights.astype({column: int for column in integer_columns})

    # Filtering left gaps in the row index; renumber it from 0
    return flights.reset_index(drop=True)

In [8]:
def ProcessData(airport):
    """Run the whole preparation pipeline for one airport.

    Stages, in order: clean the raw monthly flight files, merge them into a
    single file, clean the weather file, join flights with weather, tidy the
    joined table. The final table is saved to
    ``data/complete/{airport}_final_df.csv`` without the index column.
    """
    # File-based stages: each one writes the files the next stage reads
    CleanFlightData(airport)
    MergeFiles(airport)
    CleanWeatherData(airport)

    # In-memory stages: join flights with weather, then tidy the result
    final_df = TidyData(MergeDatasets(airport))
    final_df.to_csv(f'data/complete/{airport}_final_df.csv', index=False)

In [None]:
# Run the full pipeline for a single airport (ORD).
# The commented-out loop below is the multi-airport variant. It presumably needs
# the raw flight files and the weather file of every listed airport to be
# present under data/ — verify before enabling it.
#airports = ['JFK','LAX','EWR','MIA','ORD']
#for airport in airports:
#    ProcessData(airport)
ProcessData('ORD')