### 1) Data Preparation

In [360]:
import pandas as pd
import numpy as np
import zipfile
import os
import datetime

In [361]:
def combine_zipped_data(root_data_folder_path):
    '''
    This function opens all zip files in a given folder and combines any csv data found
    within them into a single Pandas DataFrame.
    
    Zip files are processed in alphabetical order of their file names, so the row order of
    the result is reproducible across runs and machines.
    
    Parameters
    ----------
    root_data_folder_path: A string containing the path to a folder containing zip files
                           with csv data.
    
    Returns
    -------
    A Pandas Dataframe with all csv data found in the given folder, combined together along
    the index (0) axis. Each csv keeps its own row labels, so index values may repeat.
    
    Raises
    ------
    ValueError: If no csv data is found in any zip file of the given folder.
    '''
    # Creating empty list of DataFrames
    data_to_combine = []
    # Collecting folder entries up front; os.scandir yields them in arbitrary order, so they
    # are sorted by name to keep the combined row order deterministic
    with os.scandir(root_data_folder_path) as root_data_folder:
        folder_entries = sorted(root_data_folder, key=lambda entry: entry.name)
    total_files = len(folder_entries)
    for current_progress, entry in enumerate(folder_entries, start=1):
        # Displaying current progress
        print(f"Processing file {current_progress}/{total_files} ...", end="\r")
        # Skipping anything that is not a zip file
        if not (entry.name.endswith(".zip") and entry.is_file()):
            continue
        # Opening zipped data folders
        with zipfile.ZipFile(entry.path, "r") as zipped:
            for name in zipped.namelist():
                # Searching for csv files in zipped folders
                if name.endswith('.csv'):
                    with zipped.open(name) as delay_data:
                        # Reading csv and adding to list of datasets
                        data_to_combine.append(pd.read_csv(delay_data, low_memory=False))
    # Failing with an explicit message instead of the generic pd.concat error
    if not data_to_combine:
        raise ValueError(f"No csv data found in any zip file under {root_data_folder_path!r}")
    # Combining and returning collected data
    print('All files unpacked. Combining data...', end='\r')
    combined_data = pd.concat(data_to_combine)
    print('Data successfully combined!           ')
    return combined_data

# Loading all zipped BTS csv data and keeping only rows that have both a flight date
# and a departure time
flight_df = combine_zipped_data("BTS_Data").dropna(subset=["FlightDate", "DepTime"])

Data successfully combined!           


In [362]:
flight_df = (
    flight_df
    # Keeping only the columns in which fewer than 5% (1 in 20) of the values are NaN
    .loc[:, lambda flights: flights.isna().sum() < flights.shape[0] / 20]
    # Discarding rows with NaN in the target delay column (ArrDel15)
    .dropna(subset="ArrDel15")
)

In [363]:
def combine_weather_data(root_data_folder_path):
    '''
    This function reads every csv file found directly inside a given folder and combines
    them into a single Pandas DataFrame. Each row is tagged with an "airport_code" taken
    from the name of the file it came from (the file name without ".csv"), and missing
    values in the "gust" column are replaced with 0.
    
    Files are processed in alphabetical order of their file names, so the row order of the
    result is reproducible across runs and machines.
    
    Parameters
    ----------
    root_data_folder_path: A string containing the path to a folder containing one csv file
                           per airport, named "<airport_code>.csv".
    
    Returns
    -------
    A Pandas Dataframe with all csv data found in the given folder, combined together along
    the index (0) axis, with an added "airport_code" column. Each csv keeps its own row
    labels, so index values may repeat.
    
    Raises
    ------
    ValueError: If the given folder contains no csv files.
    '''
    data_to_combine = []
    # Collecting folder entries up front; os.scandir yields them in arbitrary order, so they
    # are sorted by name to keep the combined row order deterministic
    with os.scandir(root_data_folder_path) as root_data_folder:
        folder_entries = sorted(root_data_folder, key=lambda entry: entry.name)
    total_files = len(folder_entries)
    for current_progress, entry in enumerate(folder_entries, start=1):
        # Displaying current progress
        print(f"Processing file {current_progress}/{total_files} ...", end="\r")
        # Skipping anything that is not a csv file
        if not (entry.is_file() and entry.name[-4:] == ".csv"):
            continue
        airport_df = pd.read_csv(entry.path)
        # The file name (minus its extension) is the only place the airport code is recorded
        airport_df["airport_code"] = entry.name[:-4]
        # Missing gust values are replaced with 0
        airport_df = airport_df.fillna(value={"gust":0})
        data_to_combine.append(airport_df)
    # Failing with an explicit message instead of the generic pd.concat error
    if not data_to_combine:
        raise ValueError(f"No csv files found under {root_data_folder_path!r}")
    # Combining and returning collected data
    print('All files unpacked. Combining data...', end='\r')
    combined_data = pd.concat(data_to_combine)
    print('Data successfully combined!           ')
    return combined_data

# Folder holding one cleaned weather csv per airport, named after the airport code
root_data_folder_path = "WeatherData_Clean"
weather_df = combine_weather_data(root_data_folder_path=root_data_folder_path)

Data successfully combined!           


In [364]:
# Only an extremely small number of rows have null windspeed/dewpoint/temp values,
# so any row containing a null is simply discarded
weather_df = weather_df.dropna(axis=0, how="any")

In [365]:
def match_flight_and_weather_data(flight_df, weather_df):
    '''
    This function attaches to every flight the first weather record of its origin airport
    that was measured at or after the flight's departure time.
    
    Parameters
    ----------
    flight_df: A Pandas DataFrame of flights with "Origin" (airport code), "FlightDate"
               ("YYYY-MM-DD" strings) and "DepTime" (numeric hhmm departure time, no missing
               values) columns. A departure time of 2400 is treated as 2359.
    weather_df: A Pandas DataFrame of weather records with "airport_code" and
                "record_start_date" ("YYYY-MM-DD HH:MM:SS" strings) columns.
    
    Returns
    -------
    A copy of flight_df with its index reset (the previous index is kept as a column) and
    every column of weather_df appended. The attached "record_start_date" holds the parsed
    timestamp. Weather columns are left as NA for flights whose origin has no weather data
    and for flights that depart after their origin's last weather record.
    '''
    # Preparing flight dataframe to receive weather data rows
    flight_df = flight_df.reset_index()
    weather_column_names = list(weather_df.columns)
    for column_name in weather_column_names:
        flight_df[column_name] = pd.NA
    # Recording airport counts to display function's current progress
    origin_codes = flight_df.Origin.unique()
    num_airports_to_inspect = len(origin_codes)
    airports_with_known_weather_data = set(weather_df.airport_code.unique())
    for ind, airport_code in enumerate(origin_codes):
        if airport_code not in airports_with_known_weather_data:
            continue
        print(f'Currently processing airport code {airport_code}. {ind}/{num_airports_to_inspect}  ', end='\r')
        # Building departure timestamps for the current airport's flights
        airport_flight_df = flight_df[flight_df.Origin == airport_code]
        departure_clock = (
            airport_flight_df.DepTime
            .astype(int)
            .astype(str)
            .str.zfill(4)
            # "2400" is not a valid hour/minute pair, so it is moved to the last valid minute
            .replace("2400", "2359")
        )
        departure_times = pd.to_datetime(
            airport_flight_df.FlightDate + " " + departure_clock, format="%Y-%m-%d %H%M"
        )
        # Obtaining weather data for current airport and sorting by measurement time
        # (copied so the caller's weather_df is never written to)
        airport_weather_df = weather_df[weather_df.airport_code == airport_code].copy()
        airport_weather_df["record_start_date"] = pd.to_datetime(
            airport_weather_df.record_start_date, format="%Y-%m-%d %H:%M:%S"
        )
        airport_weather_df = airport_weather_df.sort_values(by="record_start_date", kind="mergesort")
        # For each flight, locating the first weather record measured at or after departure
        weather_positions = np.searchsorted(
            airport_weather_df.record_start_date.to_numpy(dtype="datetime64[ns]"),
            departure_times.to_numpy(dtype="datetime64[ns]"),
            side="left",
        )
        # A position past the end means the flight departs after the last weather record;
        # those flights keep their NA weather values instead of raising an IndexError
        has_match = weather_positions < len(airport_weather_df)
        if has_match.any():
            flight_df.loc[airport_flight_df.index[has_match], weather_column_names] = (
                airport_weather_df.iloc[weather_positions[has_match]].to_numpy(dtype=object)
            )
    print("Data successfully attached!                                       ", end="\r")
    return flight_df

relevant_flights_df = (
    flight_df
    # Keeping only flights whose origin airport has known weather data
    .loc[lambda flights: flights.Origin.isin(weather_df.airport_code.unique())]
    # Narrowing to a small demo subset for testing purposes (NOT FINAL):
    # Seattle departures in May, June and July of 2022
    .loc[lambda flights: flights.Origin == "SEA"]
    .loc[lambda flights: flights.Year == 2022]
    .loc[lambda flights: flights.Month.isin((5,6,7))]
)
demo_df = match_flight_and_weather_data(relevant_flights_df, weather_df)

Data successfully attached!                                       

In [366]:
# Persisting the combined flight + weather demo data as a pickle file
demo_df.to_pickle(path="combined_flight_data")