In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
# Read the raw flight records from CSV into the working DataFrame `df`.
df = pd.read_csv(
    filepath_or_buffer="flight_delays.csv",  # Replace with your actual dataset path
)
print("Dataset Loaded")

In [None]:
# Initial overview: schema/dtypes, summary statistics, and the first rows.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()
print(df.describe())
print(df.head())

In [None]:
# Target variable: 50-bin histogram of DelayMinutes with a KDE overlay.
fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(data=df, x="DelayMinutes", bins=50, kde=True, ax=ax)
ax.set_title("Distribution of Flight Delay (minutes)")
ax.set_xlabel("Delay in Minutes")
ax.set_ylabel("Frequency")
plt.show()

In [None]:
# Boxplot of DelayMinutes to eyeball outliers in the target.
fig, ax = plt.subplots(figsize=(6, 4))
sns.boxplot(data=df, x="DelayMinutes", ax=ax)
ax.set_title("Boxplot of Delay Minutes")
plt.show()

In [None]:
# Number of rows (flights) per DayOfWeek value.
ax = sns.countplot(data=df, x="DayOfWeek")
ax.set_title("Flights by Day of Week")
ax.set_xlabel("Day")
ax.set_ylabel("Flight Count")
plt.show()

In [None]:
# Bar chart of how many rows each Airline value has (value_counts order).
airline_counts = df["Airline"].value_counts()
ax = airline_counts.plot.bar(figsize=(8, 4), color="skyblue")
ax.set_title("Flight Count per Airline")
ax.set_ylabel("Count")
plt.xticks(rotation=45)
plt.show()

In [None]:
# Distance on the x-axis against DelayMinutes on the y-axis, one point per row.
fig, ax = plt.subplots(figsize=(6, 4))
sns.scatterplot(data=df, x="Distance", y="DelayMinutes", ax=ax)
ax.set_title("Flight Distance vs Delay")
ax.set_xlabel("Distance (km)")
ax.set_ylabel("Delay (min)")
plt.show()

In [None]:
# Encode the categorical columns in place on `df`:
#   * at most two distinct values -> integer codes via LabelEncoder
#   * anything else               -> one-hot dummies, first level dropped
# NOTE: this rebinds/mutates `df`, so the cell is not re-runnable without
# reloading the data (one-hot encoded source columns no longer exist).
categorical_cols = ["Airline", "Origin", "Destination", "Weather"]
label_encoded = []
for col in categorical_cols:
    is_binary = df[col].nunique() <= 2
    if not is_binary:
        # Columns are expanded one at a time, in list order.
        df = pd.get_dummies(df, columns=[col], drop_first=True)
        continue
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoded.append(col)

In [None]:
# Everything not label-encoded above went through one-hot encoding.
one_hot_encoded = [col for col in categorical_cols if col not in label_encoded]
print(f"Label Encoded: {label_encoded} | One-Hot Encoded: {one_hot_encoded}")

In [None]:
# Standardize the listed numeric columns in place with StandardScaler.
# Note the list includes DelayMinutes, so that column is rescaled as well.
numeric_cols = ["Distance", "DepartureTime", "ArrivalTime", "DelayMinutes"]
scaler = StandardScaler()
scaler.fit(df[numeric_cols])
df[numeric_cols] = scaler.transform(df[numeric_cols])
print("Numerical features standardized")

In [None]:
# Correlation heatmap restricted to float64/int64 columns; dummy columns of
# any other dtype are therefore left out.
corr_cols = [name for name, dtype in df.dtypes.items() if dtype in ['float64', 'int64']]
corr_matrix = df[corr_cols].corr()
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()

In [None]:
# Variance Inflation Factor for every feature except the target column.
# The design matrix is limited to numeric/bool columns and cast to float:
# mixing bool dummy columns (the pandas >= 2 get_dummies default) or leftover
# non-numeric columns with floats makes `.values` an object array, which the
# least-squares fit inside variance_inflation_factor cannot handle.
X_vif = (
    df.drop(columns=["DelayMinutes"])  # Target column
    .select_dtypes(include=["number", "bool"])
    .astype(float)
)
design_matrix = X_vif.to_numpy()  # built once instead of once per feature
vif_data = pd.DataFrame()
vif_data["Feature"] = X_vif.columns
vif_data["VIF"] = [
    variance_inflation_factor(design_matrix, i) for i in range(design_matrix.shape[1])
]
print("VIF Scores:")
print(vif_data)

Model Training, Evaluation & Selection

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import joblib

In [None]:
# Rebind `df` to the preprocessed table read from CSV.
df = pd.read_csv(
    filepath_or_buffer="processed_flight_data.csv",  # Replace with your updated CSV
)


In [None]:
# Target is the DelayMinutes column; features are every remaining column.
y = df["DelayMinutes"]
X = df.drop("DelayMinutes", axis=1)  # Adjust target if needed

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Candidate regressors keyed by display name (insertion order is preserved),
# plus an empty registry that the evaluation loop fills with metrics.
models = dict(
    LinearRegression=LinearRegression(),
    DecisionTree=DecisionTreeRegressor(max_depth=10, random_state=42),
    RandomForest=RandomForestRegressor(n_estimators=100, random_state=42),
)

results = dict()

In [None]:
# Fit each candidate on the training split, then record train/test MSE and
# test R² under the model's name in `results`.
for name, model in models.items():
    model.fit(X_train, y_train)

    # Predict on both splits so train and test error can be compared.
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)

    metrics = {
        "Train MSE": mean_squared_error(y_train, y_pred_train),
        "Test MSE": mean_squared_error(y_test, y_pred_test),
        "R2 Score": r2_score(y_test, y_pred_test),
    }
    results[name] = metrics

    train_mse = metrics["Train MSE"]
    test_mse = metrics["Test MSE"]
    r2 = metrics["R2 Score"]
    print(f"{name} — Test MSE: {test_mse:.2f}, R²: {r2:.2f}")

In [None]:
# Per-sample test residuals (actual minus predicted) for every fitted model,
# drawn against the positional sample index on a shared axes.
sample_positions = np.arange(len(y_test))
for name, model in models.items():
    y_pred_test = model.predict(X_test)
    residuals = y_test - y_pred_test
    plt.plot(sample_positions, residuals, label=name)

plt.legend()
plt.title("Residuals / Loss Curves")
plt.xlabel("Sample Index")
plt.ylabel("Residuals")
plt.show()

In [None]:
# Pick the model with the lowest test MSE and compare its test predictions
# with the actual values; the dashed red diagonal marks perfect prediction.
best_model_name = min(results.items(), key=lambda item: item[1]["Test MSE"])[0]
best_model = models[best_model_name]
y_best_pred = best_model.predict(X_test)

diagonal = [y_test.min(), y_test.max()]

fig, ax = plt.subplots(figsize=(6, 6))
ax.scatter(y_test, y_best_pred, alpha=0.5)
ax.plot(diagonal, diagonal, '--r')
ax.set_title(f"{best_model_name}: Actual vs Predicted Delay")
ax.set_xlabel("Actual Delay (min)")
ax.set_ylabel("Predicted Delay (min)")
ax.grid(True)
plt.show()

In [None]:
# Persist the selected estimator to disk with joblib.
joblib.dump(value=best_model, filename="best_model.pkl")
print(f"✅ Saved best model: {best_model_name}")

Old Code

In [None]:
# Calculate derived features.
# DepartureDelay is actual minus scheduled departure (both parsed with
# pd.to_datetime); DepartureDelayMinutes is that difference in minutes.
# The original first statement was missing its closing parenthesis and did
# not parse.
df["DepartureDelay"] = pd.to_datetime(df["ActualDeparture"]) - pd.to_datetime(df["ScheduledDeparture"])
df["DepartureDelayMinutes"] = df["DepartureDelay"].dt.total_seconds() / 60

In [None]:
# Categorical overview: horizontal count bars per Airline, ordered by the
# value_counts ranking.
airline_order = df["Airline"].value_counts().index
fig, ax = plt.subplots(figsize=(10, 4))
sns.countplot(data=df, y="Airline", order=airline_order, ax=ax)
ax.set_title("Flight Count per Airline")
ax.set_xlabel("Count")
ax.set_ylabel("Airline")
plt.show()

In [None]:
# Count of rows per Origin value, ordered by the value_counts ranking.
origin_order = df["Origin"].value_counts().index
fig, ax = plt.subplots(figsize=(12, 4))
sns.countplot(data=df, x="Origin", order=origin_order, ax=ax)
ax.set_title("Departure Flights by Origin Airport")
ax.set_xlabel("Origin")
ax.set_ylabel("Count")
plt.xticks(rotation=45)
plt.show()


In [None]:
# 50-bin histogram with KDE overlay of the derived DepartureDelayMinutes column.
ax = sns.histplot(data=df, x="DepartureDelayMinutes", bins=50, kde=True)
ax.set_title("Distribution of Departure Delays (minutes)")
ax.set_xlabel("Departure Delay (min)")
ax.set_ylabel("Frequency")
plt.show()

In [None]:
# 50-bin orange histogram with KDE overlay of the DelayMinutes column.
ax = sns.histplot(data=df, x="DelayMinutes", bins=50, color="orange", kde=True)
ax.set_title("Reported Delay Minutes Distribution")
ax.set_xlabel("Delay Minutes")
ax.set_ylabel("Frequency")
plt.show()

In [None]:
# One box of EarlyArrival values per Airline category.
ax = sns.boxplot(data=df, x="Airline", y="EarlyArrival")
plt.xticks(rotation=45)
ax.set_title("Early Arrival Distribution by Airline")
ax.set_ylabel("Early Arrival (min)")
plt.show()

In [None]:
# One-hot encode the categorical columns into a new frame (`df` itself is
# left untouched); the first level of each column is dropped.
cat_cols = ["Airline", "Origin", "Destination"]
df_encoded = pd.get_dummies(
    data=df,
    columns=cat_cols,
    drop_first=True,
)

In [None]:
# Standardize the listed numeric columns of df_encoded with StandardScaler,
# writing the scaled values back into the same columns.
num_cols = ["DelayMinutes", "EarlyArrival", "DepartureDelayMinutes"]
scaler = StandardScaler()
scaler.fit(df_encoded[num_cols])
df_encoded[num_cols] = scaler.transform(df_encoded[num_cols])

In [None]:
# Summary statistics of the scaled columns, printed to confirm standardization.
scaled_summary = df_encoded.loc[:, num_cols].describe()
print(scaled_summary)

In [None]:
# Histogram (with KDE) of each standardized column, one figure per column.
# In this section the scaled values live in `df_encoded[num_cols]`; the
# original looped over `numeric_cols` and plotted `df[col]`, i.e. a column
# list from a different section and the frame that was never scaled here.
# NOTE(review): confirm `num_cols` / `df_encoded` is the intended pairing.
for col in num_cols:
    sns.histplot(df_encoded[col], kde=True)
    plt.title(f'Distribution of {col} After Scaling')
    plt.xlabel(col)
    plt.ylabel('Frequency')
    plt.show()