In [None]:
import os

import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from tqdm import tqdm

In [None]:
# Folder that holds the Combined_Flights_<year>.csv files. Set the
# FLIGHT_DATASET_PATH environment variable to point somewhere else without
# editing the notebook; the original location stays the default.
# os.path.join(..., "") guarantees a trailing separator, which matters because
# the loaders build file paths with plain `DATASET_PATH + file_name`.
DATASET_PATH = os.path.join(os.environ.get("FLIGHT_DATASET_PATH") or "D:/flight dataset/", "")

In [None]:
# Columns of interest kept from every flight CSV (order matters: it becomes the
# column order of the loaded frames).
# Candidate columns that are currently NOT selected:
#   FlightDate, Airline, Diverted, ArrTime, ArrDelayMinutes, AirTime,
#   ActualElapsedTime, Year, Operating_Airline, OriginCityName,
#   OriginStateName, OriginWac, DestCityName, DestStateName, DestWac,
#   WheelsOn, TaxiIn, DivAirportLandings
intr_cols = [
    "Origin", "Dest",
    "Cancelled",
    "DepTime", "DepDelayMinutes",
    "Distance",
    "Month", "DayofMonth", "DayOfWeek",
    "DOT_ID_Operating_Airline", "Tail_Number", "Flight_Number_Operating_Airline",
    "WheelsOff", "TaxiOut",
]

def loadDataset(csvName):
    """Load one flight CSV, keeping only the columns listed in ``intr_cols``.

    Parameters
    ----------
    csvName : str
        File name appended to ``DATASET_PATH``.

    Returns
    -------
    pandas.DataFrame
        The selected columns, in ``intr_cols`` order.

    Raises
    ------
    ValueError
        Raised by ``pd.read_csv`` when a column of ``intr_cols`` is missing
        from the file.
    """
    # usecols lets the parser skip the unused columns instead of materialising
    # the whole file and throwing most of it away afterwards.
    df = pd.read_csv(DATASET_PATH + csvName, usecols=intr_cols)
    # read_csv ignores the order given in usecols, so re-select to restore it.
    return df[intr_cols]

In [None]:
# TODO: compute the correlation over all datasets
# CORR PLOT: pairwise correlation of the numeric columns of the 2018 file
df = loadDataset("Combined_Flights_2018.csv").drop(columns="Cancelled")
plt.figure(figsize=(11,8))
# Origin, Dest and Tail_Number are string columns (they are label-encoded later
# in the notebook); numeric_only=True leaves them out explicitly. Without it,
# DataFrame.corr raises on non-numeric data in pandas >= 2.0.
sns.heatmap(df.corr(numeric_only=True), annot=True)
#sns.pairplot(df)
plt.show()

In [None]:
# TODO: share the CSV-reading code with loadDataset
# Load all dataframes
def loadAll(lstFiles, lstCol):
    """Read several flight CSVs and stack them into a single DataFrame.

    Parameters
    ----------
    lstFiles : iterable of str
        File names, each appended to ``DATASET_PATH``.
    lstCol : list of str
        Columns to keep; also the column order of the result.

    Returns
    -------
    pandas.DataFrame
        Rows of all files in ``lstFiles`` order with a fresh 0..n-1 index.
        An empty DataFrame when ``lstFiles`` is empty.
    """
    # Read only the requested columns and collect the per-file frames first.
    # A single concat at the end avoids re-copying the accumulated rows on
    # every iteration, and avoids concatenating onto an empty seed frame.
    frames = [
        pd.read_csv(DATASET_PATH + dataFile, usecols=lstCol)[lstCol]
        for dataFile in tqdm(lstFiles)
    ]
    if not frames:
        # pd.concat refuses an empty list; keep the original "no files" result.
        return pd.DataFrame({})
    return pd.concat(frames, ignore_index=True)

In [None]:
# Columns needed by the delay plots below; defined once and shared by both loads.
lstCol = ["DepDelayMinutes", "DayOfWeek", "OriginStateName"]

# 2020 is kept in its own frame because it is plotted separately from the other years.
# usecols avoids parsing the columns that are discarded anyway.
df2020 = pd.read_csv(DATASET_PATH + "Combined_Flights_2020.csv", usecols=lstCol)[lstCol]

files = ["Combined_Flights_2018.csv", "Combined_Flights_2019.csv", "Combined_Flights_2021.csv", "Combined_Flights_2022.csv"]
df = loadAll(files, lstCol)

In [None]:
# BAR PLOT DELAY
def plotDelayType(df, title):
    """Draw a bar chart with the number of flights per departure-delay class.

    Classes, based on ``DepDelayMinutes``: on time (== 0), small (0, 15],
    medium (15, 30], large (> 30). Rows whose delay is missing match none of
    the comparisons and are therefore not counted.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``DepDelayMinutes`` column.
    title : str
        Title of the figure.
    """
    delay = df["DepDelayMinutes"]
    # Count rows with boolean masks. DataFrame.size is rows * columns, so it
    # overstated every bar by the number of columns in the frame.
    counts = [
        int((delay == 0).sum()),
        int(((delay > 0) & (delay <= 15)).sum()),
        int(((delay > 15) & (delay <= 30)).sum()),
        int((delay > 30).sum()),
    ]

    plt.bar(["On Time", "Small Delay", "Medium Delay", "Large Delay"], counts, color = ["purple", "violet", "slateblue", "royalblue"])
    plt.title(title)
    plt.ylabel("Number of flights")
    plt.show()
plotDelayType(df, "Type of Delay\n2020 excluded")
plotDelayType(df2020, "Type of Delay 2020")

In [None]:
# BAR PLOT: average departure delay per day of the week
def weeklyDelay(df, title):
    """Draw a bar chart of the mean ``DepDelayMinutes`` for each day of the week.

    ``DayOfWeek`` codes 1..7 are labelled Mon..Sun. Rows with a missing delay
    are ignored by the mean; a day without any row is plotted as an empty bar.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``DayOfWeek`` and ``DepDelayMinutes`` columns.
    title : str
        Title of the figure.
    """
    weekRange = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]

    # groupby().mean() divides by the number of flights with a recorded delay.
    # The previous sum() / DataFrame.size divided by rows * columns, which
    # shrank every average by the number of columns in the frame.
    daily = (
        df.groupby("DayOfWeek")["DepDelayMinutes"]
        .mean()
        .reindex(range(1, len(weekRange) + 1))  # fixed Mon..Sun order, NaN if a day is absent
    )

    plt.bar(weekRange, daily.to_numpy(), color = ["springgreen", "lightgreen", "mediumseagreen", "limegreen", "seagreen",  "forestgreen",  "darkgreen"])
    plt.title(title)
    plt.ylabel("Average departure delay (minutes)")
    plt.show()
weeklyDelay(df, "Average Delay by Week\n2020 excluded")
weeklyDelay(df2020, "Average Delay by Week 2020")

In [None]:
# BAR PLOT: average departure delay per origin state (df holds every year except 2020)
# mean() averages over the flights that have a recorded delay. The previous
# sum() / size() counted rows with a missing delay in the denominator only,
# which pulled every state's average down.
avgDelState = (
    df.groupby("OriginStateName")["DepDelayMinutes"]
    .mean()
    .sort_values(ascending=True)
)
fig, ax = plt.subplots(figsize=(5,10))
avgDelState.plot.barh(color="teal", ax=ax)
ax.set_title("Average departure delay by origin state\n2020 excluded")
ax.set_xlabel("Average departure delay (minutes)")
plt.show()

In [None]:
# Dataset preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np

files = [
    "Combined_Flights_2018.csv",
    "Combined_Flights_2019.csv",
    "Combined_Flights_2021.csv",
    "Combined_Flights_2022.csv",
]
df = loadAll(files, intr_cols)

# Keep the flights that were not cancelled, drop the now-constant flag column
# and discard every row that still contains a missing value.
df = df[df["Cancelled"].eq(False)].drop(columns="Cancelled").dropna()

# Replace the string identifiers with integer codes.
# A single encoder is fitted on the union of origin and destination codes so
# that the same airport gets the same integer in both columns.
airport_codes = np.unique(df[["Origin", "Dest"]].to_numpy().ravel())
leAirport = LabelEncoder().fit(airport_codes)
leTail = LabelEncoder()
for airport_col in ("Origin", "Dest"):
    df[airport_col] = leAirport.transform(df[airport_col])
df["Tail_Number"] = leTail.fit_transform(df["Tail_Number"])

# Turn the delay into the binary target: 1 when the departure delay exceeds
# 15 minutes, 0 otherwise (stored back into the same column).
df["DepDelayMinutes"] = df["DepDelayMinutes"].gt(15).astype("int")

In [None]:
from sklearn.preprocessing import StandardScaler

# Separate the binary target from the predictors
target_col = "DepDelayMinutes"
y = df[target_col].values
feature_df = df.drop(columns=target_col)
X = feature_df.values
features_name = list(feature_df.columns.values)
del feature_df  # only needed to derive X and the feature names

# Hold out 25% of the rows for testing (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

# Standardise both splits with the statistics of the training split
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [None]:
# Train model
from sklearn.linear_model import LogisticRegression

# Logistic regression on the standardised training split; fit() returns the
# estimator itself, so construction and fitting are chained.
classifier = LogisticRegression(solver="saga", random_state=0).fit(X_train, y_train)

In [None]:
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, roc_auc_score, accuracy_score

# Hard 0/1 predictions are used for accuracy and the confusion matrix
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print ("Accuracy : ", accuracy_score(y_test, y_pred))
print(cm)
print(classifier.coef_)
# The model was fitted on standardised features, so the coefficients are already
# on a common scale and can be ranked directly. Multiplying them by the standard
# deviation of the raw, unscaled X (as done before) rescaled them a second time.
# Observation from an earlier run: DepTime, Distance and TaxiOut have the most influence.
coefByFeature = pd.Series(classifier.coef_[0], index=features_name)
print(coefByFeature.reindex(coefByFeature.abs().sort_values(ascending=False).index))
# Plot ROC
# roc_curve / roc_auc_score need a continuous score. With hard 0/1 predictions
# the curve collapses to a single operating point and the AUC is understated.
y_score = classifier.predict_proba(X_test)[:, 1]  # probability of the positive (delayed) class
fpRate, tpRate, thresholds = roc_curve(y_test, y_score)
auc = roc_auc_score(y_test, y_score)
plt.plot(fpRate, tpRate, label=f"AUC: {auc}")
plt.xlabel("False positive rate")
plt.ylabel("True positive rate")
plt.title("ROC curve - logistic regression")
plt.legend()
plt.show()

In [None]:
# Decision Tree model
from sklearn.tree import DecisionTreeClassifier

# random_state is fixed like in the train/test split and the logistic
# regression, so that re-running the notebook rebuilds the same tree.
dModel = DecisionTreeClassifier(random_state=0)
dModel.fit(X_train, y_train)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

# Fit the k sweep on the first `num_training` rows of the training split only
num_training = 10000
X_train_sub, y_train_sub = X_train[:num_training], y_train[:num_training]

# Candidate neighbour counts: 5, 6, ..., 10
k_range = range(5, 11)
# Per-k results, filled by the sweep
scores, scores_list = {}, []

In [None]:
# Start from empty accumulators so that re-running this cell does not append a
# second set of results (which would no longer line up with k_range).
scores = {}
scores_list = []
for k in tqdm(k_range):
    kCl = KNeighborsClassifier(n_neighbors=k)
    kFit = kCl.fit(X_train_sub, y_train_sub)
    pred = kFit.predict(X_test)
    scores[k] = pred  # test-set predictions for this k
    correct = (pred == y_test).sum()
    # Accuracy is correct predictions over the number of TEST samples; dividing
    # by the training-subsample size produced values that were not accuracies.
    scores_list.append(correct / y_test.shape[0])

In [None]:
# Accuracy on the test split for every k tried in the sweep
fig, ax = plt.subplots()
ax.plot(k_range, scores_list)
ax.set_xlabel('Value of K for KNN')
ax.set_ylabel('Testing Accuracy')

In [None]:
# now pick the best k and train on the whole training set
# The best k is taken from the predictions stored by the sweep instead of being
# hard-coded. Ties go to the smallest k; 10 (the previous fixed value) is used
# when no sweep results are available.
# NOTE(review): k is selected on the test split, so the accuracy reported for
# the final model is optimistic - consider a separate validation split.
best_k = max(scores, key=lambda cand: (scores[cand] == y_test).mean(), default=10)
print("Selected k:", best_k)
knn = KNeighborsClassifier(n_neighbors=best_k)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

In [None]:
# Share of test samples the final KNN model classified correctly
print(metrics.accuracy_score(y_true=y_test, y_pred=y_pred))