# 🎯 Airport Clustering Analysis

Segment airports based on delay patterns and congestion using unsupervised learning.

**Objectives:**
- Cluster airports by delay characteristics
- Identify high-risk vs low-risk airports
- Visualize cluster patterns


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
import warnings
# Silence every warning category for the remainder of the notebook session.
warnings.filterwarnings(action='ignore')

# Shared hex palette; the plotting cells pick per-cluster colours from it.
COLORS = [
    '#2E86AB',
    '#A23B72',
    '#18A558',
    '#F18F01',
    '#C73E1D',
    '#6C757D',
]
print("âœ“ Libraries imported")


In [None]:
# Load the cleaned flight records.
df = pd.read_csv('../data/processed/flights_cleaned.csv')

# Resolve the column that identifies the origin airport: prefer 'origin',
# then 'Origin', and as a last resort derive it from a route-like column.
if 'origin' in df.columns:
    origin_col = 'origin'
elif 'Origin' in df.columns:
    origin_col = 'Origin'
else:
    route_col = [c for c in df.columns if 'route' in c.lower()]
    if not route_col:
        # Fail here with an actionable message rather than letting groupby
        # raise a bare KeyError('Origin') further down.
        raise KeyError(
            "No origin airport column found: expected 'origin', 'Origin', "
            "or a column whose name contains 'route'"
        )
    # The text before the first '-' is taken as the origin airport.
    # NOTE(review): assumes routes look like '<origin>-<destination>' - confirm.
    df['origin'] = df[route_col[0]].str.split('-').str[0]
    origin_col = 'origin'

# Per-airport delay statistics. Named aggregation binds every output column
# to its source column and function, so the result does not depend on the
# positional order of a flattened MultiIndex.
airport_stats = (
    df.groupby(origin_col)
    .agg(
        avg_delay=('arrival_delay', 'mean'),
        std_delay=('arrival_delay', 'std'),
        total_flights=('arrival_delay', 'count'),
        delay_rate=('is_delayed', 'mean'),
    )
    .reset_index()
    .rename(columns={origin_col: 'airport'})
)

# Convert the mean of 'is_delayed' into a percentage.
airport_stats['delay_rate'] = airport_stats['delay_rate'] * 100

print(f"âœ“ Created statistics for {len(airport_stats)} airports")
display(airport_stats.head(10))


## K-Means Clustering


In [None]:
# Prepare features for clustering; missing values (e.g. an undefined standard
# deviation) are replaced with 0 before scaling.
features = ['avg_delay', 'std_delay', 'total_flights', 'delay_rate']
X_cluster = airport_stats[features].fillna(0)

# Standardise the features so no single column dominates the distances.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_cluster)

# Candidate cluster counts: 2 .. 7, capped at (number of airports - 1) because
# the silhouette score needs fewer clusters than samples.
k_range = range(2, min(8, len(airport_stats)))
if not k_range:
    # Without this guard the loop below is skipped, an empty figure is saved,
    # and np.argmax fails on an empty list with an unrelated message.
    raise ValueError(
        f"Need at least 3 airports to compare cluster counts, "
        f"got {len(airport_stats)}"
    )

inertias = []
silhouettes = []

# Fit one K-Means model per candidate K and record both quality measures.
for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(X_scaled)
    inertias.append(kmeans.inertia_)
    silhouettes.append(silhouette_score(X_scaled, kmeans.labels_))

# Plot elbow (inertia) and silhouette curves side by side.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

axes[0].plot(k_range, inertias, 'bo-', linewidth=2)
axes[0].set_xlabel('Number of Clusters (K)')
axes[0].set_ylabel('Inertia')
axes[0].set_title('Elbow Method')

axes[1].plot(k_range, silhouettes, 'go-', linewidth=2)
axes[1].set_xlabel('Number of Clusters (K)')
axes[1].set_ylabel('Silhouette Score')
axes[1].set_title('Silhouette Analysis')

plt.tight_layout()
plt.savefig('../reports/figures/clustering_optimization.png', dpi=150)
plt.show()

# Select the K with the highest silhouette score (first one on ties).
optimal_k = k_range[np.argmax(silhouettes)]
print(f"âœ“ Optimal K: {optimal_k} (silhouette: {max(silhouettes):.3f})")


In [None]:
# Final clustering with the selected number of clusters.
kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
airport_stats['cluster'] = kmeans.fit_predict(X_scaled)

# Cluster profiles: mean of every clustering feature per cluster.
print("\nðŸ“Š Cluster Profiles:")
cluster_profiles = airport_stats.groupby('cluster')[features].mean()
display(cluster_profiles.round(2))

# One colour per cluster, in sorted cluster order. The palette is cycled with
# a modulo because K selection may choose up to 7 clusters while COLORS holds
# only 6 entries; plain COLORS[i] would raise IndexError for the 7th cluster.
cluster_ids = sorted(airport_stats['cluster'].unique())
cluster_colors = [COLORS[i % len(COLORS)] for i in range(len(cluster_ids))]

# Visualize clusters
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# Scatter: Delay vs Volume
ax1 = axes[0]
for cluster, color in zip(cluster_ids, cluster_colors):
    mask = airport_stats['cluster'] == cluster
    ax1.scatter(airport_stats.loc[mask, 'total_flights'],
                airport_stats.loc[mask, 'avg_delay'],
                color=color, label=f'Cluster {cluster}', s=100, alpha=0.7)
ax1.set_xlabel('Total Flights')
ax1.set_ylabel('Average Delay (min)')
ax1.set_title('Airport Clusters: Volume vs Delay')
ax1.legend()

# Cluster distribution; bars follow the same sorted cluster order, so the
# colours match the scatter plot.
ax2 = axes[1]
cluster_counts = airport_stats['cluster'].value_counts().sort_index()
ax2.bar(cluster_counts.index, cluster_counts.values, color=cluster_colors)
ax2.set_xlabel('Cluster')
ax2.set_ylabel('Number of Airports')
ax2.set_title('Airports per Cluster')

plt.tight_layout()
plt.savefig('../reports/figures/airport_clusters.png', dpi=150)
plt.show()

print("\nâœ“ Clustering complete!")
