In [27]:
import warnings
warnings.filterwarnings('ignore')

In [28]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [29]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

# Read the CSV and Perform Basic Data Cleaning

In [31]:
# Load the data
file_path = Path('mlfinal_df.csv')
df = pd.read_csv(file_path)


df.head()

Unnamed: 0,DAY_OF_WEEK,ORIGIN_AIRPORT_ID,DEST_AIRPORT_ID,DEP_TIME,ARR_TIME,DEP_DEL15,DISTANCE,DISTANCE_GROUP,AWND,PRCP,TAVG,WDF2,WDF5,WSF2,WSF5
0,7,10397,11150,1643,1720,0,83,1,6.49,0.0,53.0,320.0,320.0,13.0,15.0
1,7,10397,10980,1631,1719,0,106,1,6.49,0.0,53.0,320.0,320.0,13.0,15.0
2,7,10397,11150,1018,1105,0,83,1,6.49,0.0,53.0,320.0,320.0,13.0,15.0
3,7,10397,15249,1629,1725,0,223,1,6.49,0.0,53.0,320.0,320.0,13.0,15.0
4,7,10397,10990,2128,2300,0,457,2,6.49,0.0,53.0,320.0,320.0,13.0,15.0


# Split the Data into Training and Testing

In [33]:
# Create our features
X = pd.get_dummies(df.drop('DEP_DEL15', axis=1))

# Create our target
y = df["DEP_DEL15"]

In [34]:
X.describe()

Unnamed: 0,DAY_OF_WEEK,ORIGIN_AIRPORT_ID,DEST_AIRPORT_ID,DEP_TIME,ARR_TIME,DISTANCE,DISTANCE_GROUP,AWND,PRCP,TAVG,WDF2,WDF5,WSF2,WSF5
count,372868.0,372868.0,372868.0,372868.0,372868.0,372868.0,372868.0,372868.0,372868.0,372868.0,372868.0,372868.0,372868.0,372868.0
mean,3.838492,12608.079833,12732.823358,1366.45879,1511.935004,878.087382,3.978719,8.607354,0.09792,44.049133,211.519546,213.073206,18.727928,24.649397
std,1.920216,1458.701082,1524.685104,497.787017,533.379405,636.495386,2.501331,4.420188,0.276636,16.440001,101.940382,101.419552,7.230996,9.796632
min,1.0,10397.0,10135.0,1.0,1.0,66.0,1.0,1.34,0.0,-22.0,10.0,10.0,6.0,6.9
25%,2.0,11278.0,11292.0,942.0,1124.0,404.0,2.0,4.92,0.0,33.0,130.0,130.0,13.0,17.0
50%,4.0,12889.0,12892.0,1351.0,1537.0,719.0,3.0,7.83,0.0,44.0,220.0,230.0,17.0,23.0
75%,5.0,13930.0,14100.0,1759.0,1944.0,1118.0,5.0,11.18,0.03,56.0,300.0,300.0,23.0,30.0
max,7.0,15376.0,16218.0,2400.0,2400.0,4983.0,11.0,26.62,2.32,79.0,360.0,360.0,45.0,63.1


In [35]:
# Check the balance of our target values
y.value_counts()

0    306329
1     66539
Name: DEP_DEL15, dtype: int64

In [36]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)
X_train.shape

(279651, 14)

### Balanced Random Forest Classifier

In [37]:
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
random_forest_classifer = BalancedRandomForestClassifier(n_estimators=100, random_state=1)
random_forest_classifier= random_forest_classifer.fit(X_train, y_train)

In [38]:
# Calculated the balanced accuracy score
from sklearn.metrics import accuracy_score
y_pred = random_forest_classifier.predict(X_test)
accuracy_score(y_test, y_pred)

0.7149983372131693

In [39]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[55598, 20984],
       [ 5583, 11052]], dtype=int64)

In [40]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.91      0.73      0.66      0.81      0.69      0.49     76582
          1       0.34      0.66      0.73      0.45      0.69      0.48     16635

avg / total       0.81      0.71      0.68      0.74      0.69      0.48     93217



In [41]:
# List the features sorted in descending order by feature importance
features = sorted(zip(random_forest_classifer.feature_importances_, X.columns), reverse=True)
features

[(0.2064866167777747, 'DEP_TIME'),
 (0.18764576457929624, 'ARR_TIME'),
 (0.1214045756826761, 'DISTANCE'),
 (0.11554878094578111, 'DEST_AIRPORT_ID'),
 (0.05199626007964711, 'TAVG'),
 (0.04752276919585736, 'DISTANCE_GROUP'),
 (0.04329920134341289, 'AWND'),
 (0.04175211065670366, 'ORIGIN_AIRPORT_ID'),
 (0.03769932401049945, 'WSF5'),
 (0.0348039050557218, 'WDF5'),
 (0.033917038150990744, 'WDF2'),
 (0.030267261472523673, 'WSF2'),
 (0.024719976716958955, 'DAY_OF_WEEK'),
 (0.022936415332156335, 'PRCP')]

### Easy Ensemble AdaBoost Classifier

In [42]:
# Train the EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier
easy_ensemble_classifier = EasyEnsembleClassifier(random_state=1)
easy_ensemble_classifier.fit(X_train, y_train)

EasyEnsembleClassifier(random_state=1)

In [43]:
# Calculated the balanced accuracy score
y_pred = easy_ensemble_classifier.predict(X_test)
accuracy_score(y_test, y_pred)

0.648604868210734

In [44]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[49641, 26941],
       [ 5815, 10820]], dtype=int64)

In [45]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.90      0.65      0.65      0.75      0.65      0.42     76582
          1       0.29      0.65      0.65      0.40      0.65      0.42     16635

avg / total       0.79      0.65      0.65      0.69      0.65      0.42     93217

