In [None]:
import os

import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [None]:
# Folder that holds the Combined_Flights_<year>.csv files. Set the
# FLIGHT_DATASET_PATH environment variable to run the notebook against another
# location; the original local folder remains the fallback when it is unset or
# empty. os.path.join(..., "") guarantees a trailing separator, because the
# loading cells build file names with plain string concatenation.
DATASET_PATH = os.path.join(
    os.environ.get("FLIGHT_DATASET_PATH") or "D:/flight dataset/", ""
)

In [None]:
# Columns kept for the 2018 exploration and for the model further down.
# Commented-out entries are presumably other columns available in the CSV that
# are deliberately left out; uncomment one to load it.
# NOTE(review): DepTime / WheelsOff / TaxiOut / TaxiIn look like values that are
# only known once a flight has left -- confirm they are legitimate predictors
# for the departure-delay target built later in the notebook.
intr_cols = [
    #"FlightDate",
    #"Airline",
    "Origin",
    "Dest",
    #"Cancelled",
    #"Diverted",
    "DepTime",
    "DepDelayMinutes",
    #"ArrTime",
    #"ArrDelayMinutes",
    #"AirTime",
    #"ActualElapsedTime",
    "Distance",
    #"Year",
    "Month",
    "DayofMonth",
    "DayOfWeek",
    #"Operating_Airline",
    "DOT_ID_Operating_Airline",
    "Tail_Number",
    "Flight_Number_Operating_Airline",
    #"OriginCityName",
    #"OriginStateName",
    #"OriginWac",
    #"DestCityName",
    #"DestStateName",
    #"DestWac",
    "WheelsOff",
    #"WheelsOn",
    "TaxiOut",
    "TaxiIn"
    #"DivAirportLandings"
]

# usecols makes the parser skip every column that is not needed, instead of
# loading the whole table and discarding most of it afterwards. read_csv
# returns the columns in file order, so re-select to keep the intr_cols order.
df = pd.read_csv(DATASET_PATH + "Combined_Flights_2018.csv", usecols=intr_cols)[intr_cols]



In [None]:
# CORR PLOT
# "Year" is commented out of intr_cols, so a plain drop("Year") raises
# KeyError; errors="ignore" removes the column only when it is present.
corrDf = df.drop(columns="Year", errors="ignore")
# Origin, Dest and Tail_Number are still text here (they are label-encoded
# only in the preprocessing cell), so correlate the numeric columns only.
corrMatrix = corrDf.select_dtypes(include="number").corr()
sns.heatmap(corrMatrix, annot=False)
plt.title("Correlation between numeric flight columns")
plt.show()

In [None]:
# Load all dataframes
# NOTE: this rebinds `df`; the 2018 column selection loaded earlier is no
# longer reachable under that name after this cell has run.
files = ["Combined_Flights_2018.csv", "Combined_Flights_2019.csv", "Combined_Flights_2021.csv", "Combined_Flights_2022.csv"]

# The only columns the delay charts below read.
delayCols = ["DepDelayMinutes", "DayOfWeek", "OriginStateName"]

# Read each year once (skipping unused columns via usecols) and concatenate a
# single time. Growing `df` with pd.concat inside a loop re-copies every row
# loaded so far on each pass and starts from an empty frame.
df = pd.concat(
    [pd.read_csv(DATASET_PATH + dataFile, usecols=delayCols)[delayCols] for dataFile in files],
    ignore_index=True,
)

# 2020 stays in its own frame so it can be charted separately from the other years.
df2020 = pd.read_csv(DATASET_PATH + "Combined_Flights_2020.csv", usecols=delayCols)[delayCols]

In [None]:
# BAR PLOT DELAY
def plotDelayType(df, title):
    """Draw a bar chart of how many rows fall into each departure-delay bucket.

    Buckets, in minutes of ``DepDelayMinutes``: on time (== 0), small (0, 15],
    medium (15, 30] and large (> 30). Rows whose delay is missing match no
    bucket and are therefore not counted.

    Args:
        df: DataFrame with a numeric ``DepDelayMinutes`` column.
        title: Title drawn above the chart.
    """
    delay = df["DepDelayMinutes"]

    # Count rows with boolean masks. DataFrame.size is rows * columns, so using
    # it on the filtered frames inflated every bar by the number of columns.
    counts = [
        int((delay == 0).sum()),
        int(((delay > 0) & (delay <= 15)).sum()),
        int(((delay > 15) & (delay <= 30)).sum()),
        int((delay > 30).sum()),
    ]

    plt.bar(["On Time", "Small Delay", "Medium Delay", "Large Delay"], counts, color = ["purple", "violet", "slateblue", "royalblue"])
    plt.ylabel("Number of flights")
    plt.title(title)
    plt.show()
# Draw the chart twice: first for the pooled years, then for 2020 alone.
for frame, chartTitle in (
    (df, "Type of Delay\n2020 excluded"),
    (df2020, "Type of Delay 2020"),
):
    plotDelayType(frame, chartTitle)

In [None]:
# BAR PLOT: Average Delay by Week
def weeklyDelay(df, title):
    """Draw a bar chart of the mean departure delay for each day of the week.

    Args:
        df: DataFrame with a ``DayOfWeek`` column coded 1-7 (drawn here as
            Mon-Sun) and a numeric ``DepDelayMinutes`` column.
        title: Title drawn above the chart.
    """
    weekRange = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]

    # mean() divides by the number of rows that actually have a delay value.
    # The previous sum() / DataFrame.size divided by rows * columns, which
    # shrank every average by the number of columns in df.
    # reindex keeps the bars aligned with weekRange even when a day has no
    # rows; that day's value is NaN.
    daily = (
        df.groupby("DayOfWeek")["DepDelayMinutes"]
        .mean()
        .reindex(range(1, len(weekRange) + 1))
    )

    plt.bar(weekRange, daily.to_numpy(), color = ["springgreen", "lightgreen", "mediumseagreen", "limegreen", "seagreen",  "forestgreen",  "darkgreen"])
    plt.ylabel("Average departure delay (minutes)")
    plt.title(title)
    plt.show()
# Draw the chart twice: first for the pooled years, then for 2020 alone.
for frame, chartTitle in (
    (df, "Average Delay by Week\n2020 excluded"),
    (df2020, "Average Delay by Week 2020"),
):
    weeklyDelay(frame, chartTitle)


In [None]:
# Average departure delay per origin state: the summed delay of a state divided
# by its number of rows (rows with a missing delay still count in the divisor).
groupDf = df.loc[:, ["DepDelayMinutes", "OriginStateName"]].groupby(["OriginStateName"])
rowsPerState = groupDf.size().rename("size")
joined = groupDf.sum().join(rowsPerState)
avgDelState = joined["DepDelayMinutes"].div(joined["size"]).sort_values()
# Tall, narrow canvas so every state label fits on the horizontal bars.
plt.figure(figsize=(5, 10))
avgDelState.plot.barh(color="teal");


In [None]:
# Dataset preprocessing
# The plotting cells above rebind `df` to a three-column frame, so the
# modelling columns are read again here. Starting from the raw file also makes
# this cell safe to re-run: binarising an already binarised target would turn
# every label into 0.
df = pd.read_csv(DATASET_PATH + "Combined_Flights_2018.csv", usecols=intr_cols)[intr_cols]

# Drop incomplete rows before labelling. NaN > 15 evaluates to False, which
# would silently label rows with no recorded delay as "not delayed", and
# scikit-learn's LogisticRegression does not accept NaN inputs.
df = df.dropna().reset_index(drop=True)

# Encode the string columns as integer codes.
# Origin and Dest share one encoder so that an airport gets the same code in
# both columns.
leAirport = LabelEncoder()
leTail = LabelEncoder()
leAirport.fit(pd.concat([df["Origin"], df["Dest"]], ignore_index=True))
df["Origin"] = leAirport.transform(df["Origin"])
df["Dest"] = leAirport.transform(df["Dest"])
df["Tail_Number"] = leTail.fit_transform(df["Tail_Number"])

# Encode delayed result: 1 when the departure delay exceeds 15 minutes, else 0.
df["DepDelayMinutes"] = (df["DepDelayMinutes"] > 15).astype("int")

In [None]:
# Missing values per column, labelled with the column names. DataFrame.isna()
# works for every dtype, whereas np.isnan(df.values) raises TypeError as soon
# as the frame contains an object (string) column.
print(df.isna().sum())
df.info()

In [None]:
# Split dataset
# Target: the DepDelayMinutes column; features: every other column of df.
targetCol = "DepDelayMinutes"
X = df.drop(columns=[targetCol]).values
y = df[targetCol].values
# Hold out a quarter of the rows; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
    test_size=0.25,
)


In [None]:
# TODO: enable the scaler + classifier below once the NaN report at the end of
# this cell shows zero for every feature; fitting is disabled until then.
#sc = StandardScaler()
#X_train = sc.fit_transform(X_train)
#X_test = sc.transform(X_test)

#classifier = LogisticRegression()
#classifier.fit(X_train, y_train)

# Number of NaNs in each feature column of the training matrix. A bare
# np.isnan(X_train) echoes the entire boolean matrix as the cell output, which
# is unreadable at this size; the per-column totals answer the same question.
np.isnan(X_train).sum(axis=0)