In [56]:
# Install all dependencies
%pip install pandas lightgbm scikit-learn numpy matplotlib

# load libraries
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import lightgbm as lgb
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix

Note: you may need to restart the kernel to use updated packages.


In [57]:
# Load the raw flight records from flights.csv
flights = pd.read_csv('data/flights.csv')

# Columns used as model inputs, in the order the model is trained on.
feature_columns = [
    'Month',
    'DayofMonth',
    'Carrier',
    'OriginAirportID',
    'DestAirportID',
    'DepDelay',
]

# Select the feature columns in a single vectorized step instead of iterating
# over rows in Python. `.copy()` makes `features` independent of `flights`,
# so the Carrier re-encoding below does not write into the raw frame.
features = flights[feature_columns].copy()

# Encode Carrier as integer category codes (missing values become -1).
features['Carrier'] = features['Carrier'].astype('category').cat.codes

# Binary target taken from the ArrDel15 column.
labels = flights['ArrDel15'].astype(int).values

# Hold out 20% of the rows as a test set; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

In [58]:
# Wrap the train/test splits as LightGBM datasets; the evaluation set is
# tied to the training set through `reference`.
lgb_train = lgb.Dataset(data=X_train, label=y_train)
lgb_eval = lgb.Dataset(data=X_test, label=y_test, reference=lgb_train)

# Booster configuration: binary classification scored with log loss.
params = dict(
    objective='binary',
    metric='binary_logloss',
    verbosity=-1,
    boosting_type='gbdt',
    random_state=42,
)

# Fit for at most 100 rounds. Metrics are logged every 10 rounds, and
# training stops early if the validation score fails to improve for 10 rounds.
gbm = lgb.train(
    params=params,
    train_set=lgb_train,
    num_boost_round=100,
    valid_sets=[lgb_train, lgb_eval],
    callbacks=[
        lgb.early_stopping(stopping_rounds=10),
        lgb.log_evaluation(period=10),
    ],
)


Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.295655	valid_1's binary_logloss: 0.293816
[20]	training's binary_logloss: 0.253636	valid_1's binary_logloss: 0.252521
[30]	training's binary_logloss: 0.241665	valid_1's binary_logloss: 0.241199
[40]	training's binary_logloss: 0.237002	valid_1's binary_logloss: 0.237243
[50]	training's binary_logloss: 0.234381	valid_1's binary_logloss: 0.235345
[60]	training's binary_logloss: 0.232579	valid_1's binary_logloss: 0.234291
[70]	training's binary_logloss: 0.231233	valid_1's binary_logloss: 0.233745
[80]	training's binary_logloss: 0.230092	valid_1's binary_logloss: 0.233391
[90]	training's binary_logloss: 0.229025	valid_1's binary_logloss: 0.233063
[100]	training's binary_logloss: 0.228236	valid_1's binary_logloss: 0.232966
Did not meet early stopping. Best iteration is:
[100]	training's binary_logloss: 0.228236	valid_1's binary_logloss: 0.232966


## Explanation of LightGBM Training Results

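- The model was trained for the full 100 boosting rounds: early stopping (patience of 10 rounds) was not triggered because the validation loss was still improving, if only slightly, at round 100. A larger `num_boost_round` might therefore lower the loss a little further.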
- `binary_logloss` is the loss function for binary classification; lower values indicate better model fit.
- Both training and validation logloss decreased steadily, showing the model is learning and not overfitting.
- Final logloss values:  
  - Training: **0.228**  
  - Validation: **0.233**
- The small gap between training and validation logloss suggests good generalization to unseen data.

In [59]:
# Evaluate the trained booster on the held-out test split.
# The booster outputs probabilities; anything above 0.5 is labelled 1.
y_pred_proba = gbm.predict(X_test)
y_pred = (y_pred_proba > 0.5).astype(int)

# Fraction of test rows whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Test set accuracy: {accuracy * 100:.2f}%")

Test set accuracy: 91.54%


In [60]:
# Persist the fitted booster as a LightGBM text model so it can be reloaded
# later without retraining.
gbm.save_model(filename='flight_delay_lgbm.txt')
print("Model saved to flight_delay_lgbm.txt")

Model saved to flight_delay_lgbm.txt


In [25]:
# Load the saved LightGBM model
loaded_gbm = lgb.Booster(model_file='flight_delay_lgbm.txt')

def predict_delay(origin_airport_id, dest_airport_id, day_of_week):
    """Predict the ArrDel15 class for a route using the saved model.

    Features that are not supplied by the caller are filled from the
    training data: the median for numeric columns and the most frequent
    encoded value for ``Carrier``.

    Parameters
    ----------
    origin_airport_id : int
        Value for the ``OriginAirportID`` feature.
    dest_airport_id : int
        Value for the ``DestAirportID`` feature.
    day_of_week : int
        Value for a ``DayOfWeek`` feature. It is used only if ``DayOfWeek``
        is one of the training columns; otherwise it cannot influence the
        prediction and a ``UserWarning`` is emitted instead of silently
        discarding it.

    Returns
    -------
    tuple
        ``(prediction, proba)`` where ``proba`` is the model output for the
        sample and ``prediction`` is ``1`` if ``proba > 0.5`` else ``0``.
    """
    import warnings

    # Column order must match the order used at training time.
    columns = X_train.columns

    # Baseline sample: median of every numeric training column.
    sample = X_train.median(numeric_only=True).to_dict()
    # Carrier holds category codes, so a median is meaningless; use the mode.
    sample['Carrier'] = X_train['Carrier'].mode()[0]

    requested = {
        'OriginAirportID': origin_airport_id,
        'DestAirportID': dest_airport_id,
        'DayOfWeek': day_of_week,
    }

    # Building the frame with `columns=` drops unknown keys without any
    # error, so surface inputs the model was never trained on.
    ignored = [name for name in requested if name not in columns]
    if ignored:
        warnings.warn(
            "predict_delay: ignoring inputs that are not model features: "
            + ", ".join(ignored),
            UserWarning,
            stacklevel=2,
        )

    sample.update(
        {name: value for name, value in requested.items() if name in columns}
    )

    sample_df = pd.DataFrame([sample], columns=columns)

    # Predict probability and threshold it at 0.5.
    proba = loaded_gbm.predict(sample_df)[0]
    prediction = int(proba > 0.5)
    return prediction, proba

# Example usage:
# pred, prob = predict_delay(15304, 12478, 1)
# print(f"Predicted delay: {pred} (probability: {prob:.2f})")

In [None]:
# Try the prediction helper on one example route.
pred, prob = predict_delay(
    origin_airport_id=12478,
    dest_airport_id=11057,
    day_of_week=5,
)
print(f"Predicted delay: {pred} (probability: {prob * 100:.2f}%)")

Predicted delay: 0 (probability: 2.89%)
Confusion Matrix:
[[41815   951]
 [ 3649  7973]]

Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.98      0.95     42766
           1       0.89      0.69      0.78     11622

    accuracy                           0.92     54388
   macro avg       0.91      0.83      0.86     54388
weighted avg       0.91      0.92      0.91     54388



In [55]:
# Break the test-set results down per class: raw counts first, then
# precision / recall / F1 for each label.
print("Confusion Matrix:")
print(confusion_matrix(y_true=y_test, y_pred=y_pred))

print("\nClassification Report:")
print(classification_report(y_true=y_test, y_pred=y_pred))

Confusion Matrix:
[[41815   951]
 [ 3649  7973]]

Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.98      0.95     42766
           1       0.89      0.69      0.78     11622

    accuracy                           0.92     54388
   macro avg       0.91      0.83      0.86     54388
weighted avg       0.91      0.92      0.91     54388

