# 0-1 Dataset

In [22]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
import lightgbm as lgb
from sklearn.metrics import mean_squared_error

In [23]:
# Read the CSV at `file_path` (relative to the notebook's working directory) into a DataFrame
file_path = "data/flight_data_0-1.csv"
data = pd.read_csv(filepath_or_buffer=file_path)

In [24]:
# Columns that are treated as categorical and therefore need encoding
categorical_columns = ['AIRLINE', 'ORIGIN', 'DEST']

# Target is ARR_DELAY; every other column is a feature
y = data['ARR_DELAY']
X = data.drop(columns='ARR_DELAY')

# Fit the ordinal encoder on the categorical columns, then overwrite those
# columns in the feature matrix with their encoded values
encoder = OrdinalEncoder()
encoder.fit(X[categorical_columns])
X[categorical_columns] = encoder.transform(X[categorical_columns])

In [25]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [26]:
# GBDT regressor with a fixed seed so repeated runs build the same trees
model = lgb.LGBMRegressor(
    objective='regression',
    boosting_type='gbdt',
    random_state=42,
)

# Fit on the training split (left as the cell's last expression, as before)
model.fit(
    X_train,
    y_train,
)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.055363 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1147
[LightGBM] [Info] Number of data points in the train set: 678251, number of used features: 11
[LightGBM] [Info] Start training from score 0.209418


In [27]:
# Predict the target for the held-out test rows
y_pred = model.predict(X=X_test)

In [28]:
# RMSE = square root of the mean squared error between truth and prediction
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = pow(mse, 0.5)
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")

Root Mean Squared Error (RMSE): 0.3641


In [29]:
# Pair each feature column with the importance value reported by the fitted model
feature_importance = pd.DataFrame(
    data={'Feature': X.columns, 'Importance': model.feature_importances_}
)
# Largest importance first
feature_importance = feature_importance.sort_values('Importance', ascending=False)

# Heading and table on separate lines
print("Feature Importance:", feature_importance, sep="\n")

Feature Importance:
                    Feature  Importance
3              ELAPSED_TIME         920
5                  DISTANCE         657
4                  AIR_TIME         258
1                    ORIGIN         222
0                   AIRLINE         215
10    ROUTE_DELAY_INDICATOR         202
8                  DEP_HOUR         180
2                      DEST         165
9   MONTHLY_DELAY_INDICATOR          80
6                     MONTH          53
7               DAY_OF_WEEK          48


# 15+ Dataset

In [30]:
# Read the CSV at `file_path_15_plus` (relative to the notebook's working directory) into a DataFrame
file_path_15_plus = "data/flight_data_15+.csv"
data_15_plus = pd.read_csv(filepath_or_buffer=file_path_15_plus)

In [31]:
# Target is ARR_DELAY; every other column is a feature
y_15_plus = data_15_plus['ARR_DELAY']
X_15_plus = data_15_plus.drop(columns='ARR_DELAY')

In [32]:
# Encode categorical features.
#
# This cell previously ran the same "separate features / fit encoder" sequence
# twice (a copy-paste duplicate), so the first encoder fit was thrown away and
# the feature/target separation from the previous cell was repeated.
# A single pass is enough.
#
# The encoder is fitted on the raw columns of `data_15_plus` (which is never
# modified) rather than on `X_15_plus`, so re-running this cell always encodes
# the original values and yields the same result.
encoder_15_plus = OrdinalEncoder()
X_15_plus[categorical_columns] = encoder_15_plus.fit_transform(
    data_15_plus[categorical_columns]
)

In [33]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
(
    X_train_15_plus,
    X_test_15_plus,
    y_train_15_plus,
    y_test_15_plus,
) = train_test_split(X_15_plus, y_15_plus, random_state=42, test_size=0.2)

In [34]:
# GBDT regressor with a fixed seed so repeated runs build the same trees
model_15_plus = lgb.LGBMRegressor(
    objective='regression',
    boosting_type='gbdt',
    random_state=42,
)

# Fit on the training split
model_15_plus.fit(
    X_train_15_plus,
    y_train_15_plus,
)

# Predict the target for the held-out test rows
y_pred_15_plus = model_15_plus.predict(X=X_test_15_plus)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.049390 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1147
[LightGBM] [Info] Number of data points in the train set: 678251, number of used features: 11
[LightGBM] [Info] Start training from score 13.208501


In [35]:
# RMSE = square root of the mean squared error between truth and prediction
mse_15_plus = mean_squared_error(y_true=y_test_15_plus, y_pred=y_pred_15_plus)
rmse_15_plus = pow(mse_15_plus, 0.5)
print(f"Root Mean Squared Error (RMSE) for 15+ dataset: {rmse_15_plus:.4f}")

Root Mean Squared Error (RMSE) for 15+ dataset: 36.1211


In [36]:
# Pair each feature column with the importance value reported by the fitted model
feature_importance_15_plus = pd.DataFrame(
    data={'Feature': X_15_plus.columns, 'Importance': model_15_plus.feature_importances_}
)
# Largest importance first
feature_importance_15_plus = feature_importance_15_plus.sort_values(
    'Importance', ascending=False
)

# Heading and table on separate lines
print("Feature Importance for 15+ dataset:", feature_importance_15_plus, sep="\n")

Feature Importance for 15+ dataset:
                    Feature  Importance
3              ELAPSED_TIME         831
5                  DISTANCE         505
4                  AIR_TIME         349
10    ROUTE_DELAY_INDICATOR         278
8                  DEP_HOUR         227
0                   AIRLINE         220
1                    ORIGIN         139
9   MONTHLY_DELAY_INDICATOR         137
2                      DEST         116
6                     MONTH         104
7               DAY_OF_WEEK          94


# No encoders (pandas `category` dtype + LightGBM `categorical_feature`)

In [37]:
import pandas as pd
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.metrics import mean_squared_error

# Start again from the raw CSV so this section does not reuse the encoded frame
file_path = "data/flight_data_0-1.csv"
data = pd.read_csv(filepath_or_buffer=file_path)

# Columns handed to LightGBM as categorical features
categorical_columns = ['AIRLINE', 'ORIGIN', 'DEST']

# Cast those columns to pandas 'category' dtype (no separate encoder is used here)
data = data.astype(dict.fromkeys(categorical_columns, 'category'))

# Target is ARR_DELAY; every other column is a feature
y = data['ARR_DELAY']
X = data.drop(columns='ARR_DELAY')

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# GBDT regressor with a fixed seed
model = lgb.LGBMRegressor(
    objective='regression',
    boosting_type='gbdt',
    random_state=42,
)

# Fit on the training split, naming the categorical columns explicitly
model.fit(X_train, y_train, categorical_feature=categorical_columns)

# Predict the target for the held-out test rows
y_pred = model.predict(X=X_test)

# RMSE = square root of the mean squared error between truth and prediction
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = pow(mse, 0.5)
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")

# Pair each feature column with its reported importance, largest first
feature_importance = pd.DataFrame(
    data={'Feature': X.columns, 'Importance': model.feature_importances_}
)
feature_importance = feature_importance.sort_values('Importance', ascending=False)

# Heading and table on separate lines
print("Feature Importance:", feature_importance, sep="\n")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.053728 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1147
[LightGBM] [Info] Number of data points in the train set: 678251, number of used features: 11
[LightGBM] [Info] Start training from score 0.209418
Root Mean Squared Error (RMSE): 0.3613
Feature Importance:
                    Feature  Importance
3              ELAPSED_TIME         886
5                  DISTANCE         583
1                    ORIGIN         443
2                      DEST         385
0                   AIRLINE         185
8                  DEP_HOUR         142
4                  AIR_TIME         141
9   MONTHLY_DELAY_INDICATOR          80
10    ROUTE_DELAY_INDICATOR          65
6                     MONTH          46
7               DAY_OF_WEEK          44


In [38]:
import pandas as pd
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.metrics import mean_squared_error

# Start again from the raw 15+ CSV so this section does not reuse the encoded frame
file_path = "data/flight_data_15+.csv"
data = pd.read_csv(filepath_or_buffer=file_path)

# Columns handed to LightGBM as categorical features
categorical_columns = ['AIRLINE', 'ORIGIN', 'DEST']

# Cast those columns to pandas 'category' dtype (no separate encoder is used here)
data = data.astype(dict.fromkeys(categorical_columns, 'category'))

# Target is ARR_DELAY; every other column is a feature
y = data['ARR_DELAY']
X = data.drop(columns='ARR_DELAY')

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# GBDT regressor with a fixed seed
model = lgb.LGBMRegressor(
    objective='regression',
    boosting_type='gbdt',
    random_state=42,
)

# Fit on the training split, naming the categorical columns explicitly
model.fit(X_train, y_train, categorical_feature=categorical_columns)

# Predict the target for the held-out test rows
y_pred = model.predict(X=X_test)

# RMSE = square root of the mean squared error between truth and prediction
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = pow(mse, 0.5)
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")

# Pair each feature column with its reported importance, largest first
feature_importance = pd.DataFrame(
    data={'Feature': X.columns, 'Importance': model.feature_importances_}
)
feature_importance = feature_importance.sort_values('Importance', ascending=False)

# Heading and table on separate lines
print("Feature Importance:", feature_importance, sep="\n")


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.015213 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1147
[LightGBM] [Info] Number of data points in the train set: 678251, number of used features: 11
[LightGBM] [Info] Start training from score 13.208501
Root Mean Squared Error (RMSE): 36.0279
Feature Importance:
                    Feature  Importance
3              ELAPSED_TIME         754
5                  DISTANCE         416
1                    ORIGIN         410
2                      DEST         353
4                  AIR_TIME         251
0                   AIRLINE         223
8                  DEP_HOUR         214
10    ROUTE_DELAY_INDICATOR         128
9   MONTHLY_DELAY_INDICATOR         103
6                     MONTH          88
7               DAY_OF_WEEK          60
