### Can we cluster airports based on their delay patterns?

We are curious if we can cluster airports based on DepDelay (Departure Delay) and ArrDelay (Arrival Delay)

We want to identify the best airports (low delay) and the least reliable airports (high delay). A third cluster will capture mixed-performance airports (medium delay).

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import kagglehub

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

In [None]:
# Fetch the Kaggle dataset via kagglehub and load the delayed-flights CSV
# from the returned download location.
dataset_handle = "giovamata/airlinedelaycauses"
path = kagglehub.dataset_download(dataset_handle)
csv_file = path + '/DelayedFlights.csv'
df = pd.read_csv(csv_file)

### Quick plotting settings

In [None]:
# Global look for every figure in this notebook: seaborn whitegrid theme
# and a default 6x4 figure size.
sns.set_theme(style='whitegrid', context='notebook')
plt.rcParams['figure.figsize'] = (6, 4)

### Data Exploration

In [None]:
# Print the frame's column names, dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics for the frame's columns (shown as the cell output).
df.describe()

### Data Preprocessing

This dataset only has data from 2008, so we can drop that column

In [None]:
# 'Year' is dropped here; rebinding df leaves the other columns untouched.
df = df.drop(columns=['Year'])

KMeans does not accept NaN values so we need to remove them from our input variables

In [None]:
# Keep only rows where both DepDelay and ArrDelay are present.
df = df.dropna(subset=['DepDelay', 'ArrDelay'])


In [None]:
# Checking for outliers
plt.boxplot([df['DepDelay'], df['ArrDelay']])
plt.xticks([1, 2], ['DepDelay', 'ArrDelay'])
plt.show()

from scipy import stats
z_scores = stats.zscore(df[['DepDelay', 'ArrDelay']])
outliers = (abs(z_scores) > 3).any(axis=1)
print("Number of outliers:", outliers.sum())

I want to try one more method to visualize outliers

In [None]:
from scipy.stats import zscore


def zscore1(s):
    """Return the z-scores of *s*, leaving NaN values out of the statistics."""
    return zscore(s, nan_policy='omit')


# Work on the two delay columns only and standardise each one separately.
df_num = df.loc[:, ['ArrDelay', 'DepDelay']]
numeric_vars = df_num.columns.to_numpy()

df_scaled = df_num.apply(zscore1)

In [None]:
# Smallest and largest z-score per column, one row per variable.
df_scaled.agg(['min', 'max']).round(2).T

In [None]:
# Pairwise scatter/distribution plots of the selected numeric columns.
# The trailing semicolon suppresses the notebook's textual cell output.
sns.pairplot(df_num);

In [None]:
# Clip both delay columns to their own 1st-99th percentile range, then
# standardise them and fit a 3-cluster K-Means model.
delay_cols = ['DepDelay', 'ArrDelay']

# Each unpacked row holds one quantile for (DepDelay, ArrDelay), in that order.
q01, q99 = df[delay_cols].quantile([0.01, 0.99]).values

for col, lower, upper in zip(delay_cols, q01, q99):
    df[col] = df[col].clip(lower, upper)

X = StandardScaler().fit_transform(df[delay_cols])
kmeans = KMeans(n_clusters=3, random_state=0)
clusters = kmeans.fit_predict(X)

In [None]:
# Side-by-side view of departure vs. arrival delay: plain points on the
# left, the same points coloured by K-Means cluster on the right.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

# df's delay columns were already clipped to the 1st-99th percentile before
# this cell runs, so this panel shows clipped values, not raw ones.
ax1.scatter(df['DepDelay'], df['ArrDelay'], alpha=0.5)
ax1.set_title('Clipped Data')

ax2.scatter(df['DepDelay'], df['ArrDelay'], alpha=0.5, c=clusters)
ax2.set_title('Clipped Data with Clusters')

# Both panels share the same axes meaning.
for ax in (ax1, ax2):
    ax.set_xlabel('Departure Delay')
    ax.set_ylabel('Arrival Delay')

plt.tight_layout()
plt.show()


In [None]:
# Keep the rows whose largest absolute z-score across both delay columns
# is at most 3.
z_scores = stats.zscore(df[['DepDelay', 'ArrDelay']])
within_3_sigma = np.abs(z_scores).max(axis=1) <= 3
outliers_removed = df[within_3_sigma]

### Machine Learning

In [None]:
# Feature matrix for clustering: the two delay columns, in this order.
X = df[['DepDelay', 'ArrDelay']]

In [None]:
# Standardise the features, then cluster the standardised values.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Fit on X_scaled (not the raw X) so the scaling step above actually
# influences the clustering. As a consequence kmeans.cluster_centers_ are in
# standardised units; scaler.inverse_transform() converts them back.
kmeans = KMeans(n_clusters=3, random_state=0)
cluster_nums = kmeans.fit_predict(X_scaled)

Sanity check: Are the clusters too skewed?

In [None]:
# Count the members of each cluster once. minlength=3 guarantees three
# entries even if a cluster ended up empty, so the indexing below cannot fail.
# (The former bare `cluster_nums[:100]` expression displayed nothing because
# it was not the last statement of the cell, so it is omitted.)
cluster_sizes = np.bincount(cluster_nums, minlength=3)

for number, size in enumerate(cluster_sizes[:3], start=1):
    print(f"Cluster {number}:", size)

The clusters are somewhat skewed, but this is real-world data, so that may be acceptable.

In [None]:
# Cluster centres as a table. The columns follow the feature order used to
# build X, i.e. ['DepDelay', 'ArrDelay'].
pd.DataFrame(kmeans.cluster_centers_, columns=['DepDelay', 'ArrDelay'])

I also want to see the clusters

In [None]:
# Horizontal bar chart of the cluster centres. The column names must match
# the feature order of X (['DepDelay', 'ArrDelay']); otherwise the bars are
# attributed to the wrong delay type.
centers = pd.DataFrame(kmeans.cluster_centers_,
                       columns=['DepDelay', 'ArrDelay'])

centers.plot.barh()
plt.title('Cluster centers')
plt.xlabel('value')
plt.ylabel('cluster number')

Now in 2D

In [None]:
# K-Means cluster ids are arbitrary, so derive the Low/Medium/High names from
# the data: rank the clusters by the mean of their scaled feature values.
cluster_ids = np.unique(clusters)
mean_scaled_delay = [X_scaled[clusters == cid].mean() for cid in cluster_ids]
ranked_ids = cluster_ids[np.argsort(mean_scaled_delay)]

cluster_labels = {
    int(cid): name
    for cid, name in zip(ranked_ids,
                         ['Low Delay', 'Medium Delay', 'High Delay'])
}

labels = [cluster_labels[c] for c in clusters]

points = plt.scatter(X_scaled[:, 0], X_scaled[:, 1], c=clusters)

# One legend entry per cluster colour. With num=None, legend_elements()
# returns a handle for every unique value of `c` in sorted order, which is
# the same order as cluster_ids.
handles, _ = points.legend_elements(num=None)
plt.legend(handles, [cluster_labels[int(cid)] for cid in cluster_ids])
plt.tight_layout()

In [None]:
# Project the features onto their two principal components for plotting.
X2 = PCA(n_components=2).fit_transform(X)

# Cluster ids carry no order, so name each cluster by ranking the mean of its
# feature values (X holds the two delay columns) instead of hard-coding a
# legend order.
delay_names = ['Low Delay', 'Medium Delay', 'High Delay']
X_values = np.asarray(X)
cluster_ids = np.unique(cluster_nums)
mean_delay = [X_values[cluster_nums == cid].mean() for cid in cluster_ids]
ranked_ids = cluster_ids[np.argsort(mean_delay)]
used_names = delay_names[:len(ranked_ids)]

# Lookup table: cluster id -> descriptive name, applied to every point.
name_lookup = np.empty(cluster_ids.max() + 1, dtype=object)
name_lookup[ranked_ids] = used_names
point_names = name_lookup[cluster_nums]

# Passing the names as hue lets seaborn build a legend that is guaranteed to
# match the colours; hue_order fixes the Low -> High ordering.
ax = sns.scatterplot(x=X2[:, 0], y=X2[:, 1], hue=point_names,
                     hue_order=used_names, palette='Set1', s=20, legend=True)
ax.set_title('Flight delays')
ax.set_xlabel('1st principal component')
ax.set_ylabel('2nd principal component');

I don't think these predictor variables are working out. I'm going to try distance and airport instead.

In [None]:
from scipy.stats import zscore


def zscore1(s):
    """Standardise *s* to z-scores; NaN entries are omitted from the stats."""
    return zscore(s, nan_policy='omit')


# Switch the working columns to flight distance and departure delay, then
# z-score each column.
selected_columns = ['Distance', 'DepDelay']
df_num = df[selected_columns]
numeric_vars = df_num.columns.to_numpy()

df_scaled = df_num.apply(zscore1)

In [None]:
# Per-origin-airport averages of flight distance and carrier delay; airports
# with a missing average are dropped before scaling.
per_airport_means = df.groupby('Origin')[['Distance', 'CarrierDelay']].mean()
airport_stats = per_airport_means.dropna()

# Re-fit the existing scaler on the airport-level table.
X_scaled = scaler.fit_transform(airport_stats)

In [None]:
# Cluster the scaled airport statistics into three groups and store each
# airport's cluster id next to its averages.
kmeans = KMeans(random_state=0, n_clusters=3)
airport_cluster_ids = kmeans.fit_predict(X_scaled)
airport_stats['Cluster'] = airport_cluster_ids

In [None]:
# Colour meaning: green = good (lowest delay), blue = decent, red = bad
# (highest delay). K-Means ids are arbitrary, so the colours are assigned by
# ranking the clusters on their mean scaled delay (second column of X_scaled)
# rather than by cluster id.
palette_low_to_high = ['#2ca02c', '#1f77b4', '#d62728']  # green, blue, red

cluster_of_airport = airport_stats['Cluster'].to_numpy()
cluster_ids = np.unique(cluster_of_airport)
mean_delay = [X_scaled[cluster_of_airport == cid, 1].mean()
              for cid in cluster_ids]
ranked_ids = cluster_ids[np.argsort(mean_delay)]

# colors[cluster_id] -> hex colour, so it can still be indexed by cluster id.
colors = [None] * (int(cluster_ids.max()) + 1)
for cid, hex_color in zip(ranked_ids, palette_low_to_high):
    colors[int(cid)] = hex_color

plt.scatter(X_scaled[:, 0], X_scaled[:, 1],
            c=[colors[i] for i in cluster_of_airport],
            s=30, alpha=0.3)

# Airports far from the origin of the scaled space are treated as outliers
# and get a text label with their airport code.
distances = np.hypot(X_scaled[:, 0], X_scaled[:, 1])
outlier_threshold = np.percentile(distances, 90)  # Top 10% as outliers
outliers = distances > outlier_threshold

for txt, is_outlier, (x_pos, y_pos) in zip(airport_stats.index, outliers,
                                           X_scaled):
    if is_outlier:
        plt.annotate(txt, (x_pos, y_pos),
                     fontsize=8, xytext=(5, 5),
                     textcoords='offset points')

plt.xlabel('Average Flight Distance (scaled)')
plt.ylabel('Average Airport Delay (scaled)')
plt.title('Airport Clusters by Distance and Delay')
plt.show()

Next up, let's try using a regression algorithm to predict flight delay. First off, we perform a test/train split on our data. Let's use a test size of 0.15 to preserve a good amount of data for training

In [None]:
from sklearn.model_selection import train_test_split

# The goal is to predict flight delay, so the delay is the target (y) and
# Distance is the predictor. X is selected with a list so it stays a 2-D
# frame, which scikit-learn estimators require for their feature input.
X = df[['Distance']]
y = df['DepDelay']

# Hold out 15% of the rows for testing; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15, random_state=42
)