### Flight Cleaning Notebook
This notebook merges and cleans the monthly On-Time Performance reports for flights departing San Diego International Airport (SAN), covering two years (January 2023 through December 2024).

In [None]:
# import libraries
import pandas as pd

In [None]:
# Load the 24 monthly on-time reports (Jan 2023 - Dec 2024), one DataFrame per month
_RAW_DIR = 'Raw_data/raw_on-time'
_MONTHS = ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
           'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec')
# Files are read year by year, month by month, so the frames come out in chronological order
_monthly_frames = [
    pd.read_csv(f'{_RAW_DIR}/{month}{year}_OT_report.csv')
    for year in ('23', '24')
    for month in _MONTHS
]
# Bind each frame to its month name so later cells can refer to it directly
(Jan23, Feb23, Mar23, Apr23, May23, Jun23,
 Jul23, Aug23, Sep23, Oct23, Nov23, Dec23,
 Jan24, Feb24, Mar24, Apr24, May24, Jun24,
 Jul24, Aug24, Sep24, Oct24, Nov24, Dec24) = _monthly_frames

In [None]:
# Collect the monthly frames year by year, keeping chronological order
frames_2023 = [Jan23, Feb23, Mar23, Apr23, May23, Jun23,
               Jul23, Aug23, Sep23, Oct23, Nov23, Dec23]
frames_2024 = [Jan24, Feb24, Mar24, Apr24, May24, Jun24,
               Jul24, Aug24, Sep24, Oct24, Nov24, Dec24]
dfs = frames_2023 + frames_2024

In [None]:
# Stack the monthly frames row-wise into a single frame with a fresh 0..n-1 index
merged_df = pd.concat(objs=dfs, axis=0, ignore_index=True)

In [None]:
# Keep only the rows whose ORIGIN is 'SAN' (flights leaving San Diego),
# then renumber the remaining rows from 0, discarding the old index
is_san_departure = merged_df['ORIGIN'].eq('SAN')
SAN_df = merged_df[is_san_departure].reset_index(drop=True)

In [None]:
# Inspect the size of the SAN-only frame as a (rows, columns) tuple
SAN_df.shape

In [None]:
# Inspect the dtype pandas assigned to each column, to see which ones need converting
SAN_df.dtypes

In [None]:
# Preview the first five rows of the filtered frame before any conversions
print(SAN_df.head(n=5))

In [None]:
# Parse FL_DATE as datetimes, then keep only the calendar-date part of each value
parsed_flight_dates = pd.to_datetime(SAN_df['FL_DATE'])
SAN_df['FL_DATE'] = parsed_flight_dates.dt.date

In [None]:
# Turn CRS_DEP_TIME into text, left-padded with zeros to 4 characters (e.g. '930' -> '0930')
crs_text = SAN_df['CRS_DEP_TIME'].astype(str)
SAN_df['CRS_DEP_TIME'] = crs_text.str.zfill(4)
# Parse the padded HHMM text and keep only the time-of-day part
crs_parsed = pd.to_datetime(SAN_df['CRS_DEP_TIME'], format='%H%M')
SAN_df['CRS_DEP_TIME'] = crs_parsed.dt.time

In [None]:
# Preview the first five rows to check the FL_DATE and CRS_DEP_TIME conversions
print(SAN_df.head(n=5))

In [None]:
# create function to handle missing DEP_TIME for cancelled flights
def convert_dep_time(val):
    """Convert a raw HHMM departure time into a ``datetime.time``.

    Args:
        val: Raw ``DEP_TIME`` cell: a number or numeric string in HHMM form
            (e.g. ``930.0`` for 09:30), or a missing value.

    Returns:
        The matching ``datetime.time``, or ``None`` when ``val`` is missing
        (as it is for cancelled flights) or is not a valid HHMM time.
    """
    from datetime import time

    if pd.isnull(val):
        return None
    try:
        # Go through float first so values such as 930.0 or '930.0' are accepted.
        hhmm = int(float(val))
    except (TypeError, ValueError, OverflowError):
        # Not numeric (or infinite): nothing sensible to parse.
        return None
    # Midnight may be recorded as 2400, which is not a valid hour; normalise
    # it to 00:00 instead of silently discarding the departure time.
    if hhmm == 2400:
        hhmm = 0
    hours, minutes = divmod(hhmm, 100)
    try:
        return time(hours, minutes)
    except (ValueError, OverflowError):
        # Hour or minute out of range (e.g. 2360, negative or oversized values).
        return None

In [None]:
# Run convert_dep_time on every DEP_TIME value and store the results back in the column
raw_dep_times = SAN_df['DEP_TIME']
SAN_df['DEP_TIME'] = raw_dep_times.apply(convert_dep_time)

In [None]:
# Preview the first five rows to check the DEP_TIME conversion
print(SAN_df.head(n=5))

In [None]:
# Replace missing DEP_DEL15 values with 0, then store the column as integers
dep_del15_filled = SAN_df['DEP_DEL15'].fillna(0)
SAN_df['DEP_DEL15'] = dep_del15_filled.astype(int)

In [None]:
# Replace missing CANCELLED values with 0, then store the column as integers
cancelled_filled = SAN_df['CANCELLED'].fillna(0)
SAN_df['CANCELLED'] = cancelled_filled.astype(int)

In [None]:
# Count how often each cancellation code occurs (missing values are left out of the tally)
SAN_df['CANCELLATION_CODE'].value_counts(dropna=True)

In [None]:
# Label rows that have no cancellation code with the literal string 'None'
# NOTE(review): pandas.read_csv lists 'None' among its default missing-value markers,
# so this label may be read back as NaN when the cleaned CSV is reloaded - confirm
# how downstream loaders are configured.
cancellation_codes = SAN_df['CANCELLATION_CODE']
SAN_df['CANCELLATION_CODE'] = cancellation_codes.fillna(value='None')

In [None]:
# Count how often each CARRIER_DELAY value occurs (missing values are left out of the tally)
SAN_df['CARRIER_DELAY'].value_counts(dropna=True)


In [None]:
# Replace missing CARRIER_DELAY values with 0, then store the column as integers
SAN_df['CARRIER_DELAY'] = (
    SAN_df['CARRIER_DELAY']
    .fillna(value=0)
    .astype(int)
)

In [None]:
# Replace missing WEATHER_DELAY values with 0, then store the column as integers
SAN_df['WEATHER_DELAY'] = (
    SAN_df['WEATHER_DELAY']
    .fillna(value=0)
    .astype(int)
)

In [None]:
# Replace missing NAS_DELAY values with 0, then store the column as integers
SAN_df['NAS_DELAY'] = (
    SAN_df['NAS_DELAY']
    .fillna(value=0)
    .astype(int)
)

In [None]:
# Replace missing SECURITY_DELAY values with 0, then store the column as integers
SAN_df['SECURITY_DELAY'] = (
    SAN_df['SECURITY_DELAY']
    .fillna(value=0)
    .astype(int)
)

In [None]:
# Replace missing LATE_AIRCRAFT_DELAY values with 0, then store the column as integers
SAN_df['LATE_AIRCRAFT_DELAY'] = (
    SAN_df['LATE_AIRCRAFT_DELAY']
    .fillna(value=0)
    .astype(int)
)

In [None]:
# Select the output columns with the two departure-time columns right after the date;
# any column not listed here is left out of the result
ordered_columns = [
    'FL_DATE', 'CRS_DEP_TIME', 'DEP_TIME',
    'OP_UNIQUE_CARRIER', 'OP_CARRIER_FL_NUM',
    'ORIGIN', 'DEST',
    'DEP_DEL15', 'CANCELLED', 'CANCELLATION_CODE',
    'CARRIER_DELAY', 'WEATHER_DELAY', 'NAS_DELAY',
    'SECURITY_DELAY', 'LATE_AIRCRAFT_DELAY',
]
SAN_df = SAN_df[ordered_columns]
# Preview the first five rows in the new column order
print(SAN_df.head(n=5))

In [None]:
# Save the cleaned df to a csv file, without writing the row index as a column
from pathlib import Path

output_path = Path('clean_data') / 'SAN_OT_report.csv'
# to_csv does not create missing folders, so make sure the target folder exists
# first (e.g. on a fresh checkout) rather than failing at the very last step
output_path.parent.mkdir(parents=True, exist_ok=True)
SAN_df.to_csv(output_path, index=False)