 # Predicting Airline Delays
 

In [3]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

In [6]:
file_path = Path('final_df.csv')
df = pd.read_csv(file_path, index_col=0)
df.head()

Unnamed: 0,MONTH,DAY_OF_MONTH,DATE,DAY_OF_WEEK,ORIGIN_AIRPORT_ID,DEST_AIRPORT_ID,DEST_CITY_NAME,DEP_TIME,DEP_DELAY_NEW,ARR_TIME,...,DISTANCE,DISTANCE_GROUP,NAME,AWND,PRCP,TAVG,WDF2,WDF5,WSF2,WSF5
0,1,6,1/6/2019,7,10397,11150,"Columbus, GA",1643,0,1720,...,83,1,ATLANTA HARTSFIELD JACKSON INTERNATIONAL AIRPO...,6.49,0.0,53.0,320.0,320.0,13.0,15.0
1,1,6,1/6/2019,7,10397,10980,"Chattanooga, TN",1631,0,1719,...,106,1,ATLANTA HARTSFIELD JACKSON INTERNATIONAL AIRPO...,6.49,0.0,53.0,320.0,320.0,13.0,15.0
2,1,6,1/6/2019,7,10397,11150,"Columbus, GA",1018,0,1105,...,83,1,ATLANTA HARTSFIELD JACKSON INTERNATIONAL AIRPO...,6.49,0.0,53.0,320.0,320.0,13.0,15.0
3,1,6,1/6/2019,7,10397,15249,"Tallahassee, FL",1629,12,1725,...,223,1,ATLANTA HARTSFIELD JACKSON INTERNATIONAL AIRPO...,6.49,0.0,53.0,320.0,320.0,13.0,15.0
4,1,6,1/6/2019,7,10397,10990,"Charlottesville, VA",2128,8,2300,...,457,2,ATLANTA HARTSFIELD JACKSON INTERNATIONAL AIRPO...,6.49,0.0,53.0,320.0,320.0,13.0,15.0


In [35]:
columns = ["DAY_OF_WEEK", "ORIGIN_AIRPORT_ID", "DEST_AIRPORT_ID","DEP_TIME", "ARR_TIME", "DEP_DEL15", "DISTANCE", "DISTANCE_GROUP", "AWND", "PRCP", "TAVG", "WDF2", "WDF5", "WSF2", "WSF5"]
ml_df= df[columns]
ml_df.head()

Unnamed: 0,DAY_OF_WEEK,ORIGIN_AIRPORT_ID,DEST_AIRPORT_ID,DEP_TIME,ARR_TIME,DEP_DEL15,DISTANCE,DISTANCE_GROUP,AWND,PRCP,TAVG,WDF2,WDF5,WSF2,WSF5
0,7,10397,11150,1643,1720,0,83,1,6.49,0.0,53.0,320.0,320.0,13.0,15.0
1,7,10397,10980,1631,1719,0,106,1,6.49,0.0,53.0,320.0,320.0,13.0,15.0
2,7,10397,11150,1018,1105,0,83,1,6.49,0.0,53.0,320.0,320.0,13.0,15.0
3,7,10397,15249,1629,1725,0,223,1,6.49,0.0,53.0,320.0,320.0,13.0,15.0
4,7,10397,10990,2128,2300,0,457,2,6.49,0.0,53.0,320.0,320.0,13.0,15.0


In [36]:
#ml_df.to_csv('mlfinal_df.csv', index=False)

 ## Separate the Features (X) from the Target (y)

In [37]:
# Create our features
X = pd.get_dummies(ml_df.drop("DEP_DEL15", axis=1))

# Create our target
y = ml_df["DEP_DEL15"]


In [38]:
#Scale the data
from sklearn.preprocessing import StandardScaler
data_scaler = StandardScaler()
flight_data_scaled = data_scaler.fit_transform(ml_df)

In [39]:
flight_data_scaled[:5]

array([[ 1.64643511, -1.51578882, -1.03813271,  0.55554197,  0.39008869,
        -0.46606241, -1.24916608, -1.19085537, -0.47901952, -0.35396975,
         0.54445733,  1.06415728,  1.054303  , -0.79213642, -0.9849722 ],
       [ 1.64643511, -1.51578882, -1.14963129,  0.53143524,  0.38821385,
        -0.46606241, -1.21303066, -1.19085537, -0.47901952, -0.35396975,
         0.54445733,  1.06415728,  1.054303  , -0.79213642, -0.9849722 ],
       [ 1.64643511, -1.51578882, -1.03813271, -0.70001676, -0.76293825,
        -0.46606241, -1.24916608, -1.19085537, -0.47901952, -0.35396975,
         0.54445733,  1.06415728,  1.054303  , -0.79213642, -0.9849722 ],
       [ 1.64643511, -1.51578882,  1.65029488,  0.52741746,  0.3994629 ,
        -0.46606241, -1.02921133, -1.19085537, -0.47901952, -0.35396975,
         0.54445733,  1.06415728,  1.054303  , -0.79213642, -0.9849722 ],
       [ 1.64643511, -1.51578882, -1.14307255,  1.52985555,  1.47749622,
        -0.46606241, -0.66157266, -0.79106762, 

In [40]:
print(np.mean(flight_data_scaled[:,0]))
print(np.std(flight_data_scaled[:,0]))

4.1466175510206776e-17
0.9999999999999999


In [41]:
flight_df = flight_data_scaled

In [42]:
flight_data_scaled.shape

(372868, 15)

 ## Split our data into training and testing

In [43]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1, 
                                                    stratify=y)
X_train.shape

(279651, 14)

 ## Create a Logistic Regression Model

In [44]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver='lbfgs',
                                max_iter=1000,
                                random_state=1)

 ## Fit (train) or model using the training data

In [45]:
classifier.fit(X_train, y_train)

LogisticRegression(max_iter=1000, random_state=1)

 ## Make predictions

In [46]:
y_pred = classifier.predict(X_test)
results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test}).reset_index(drop=True)
results.head(20)

Unnamed: 0,Prediction,Actual
0,0,0
1,0,0
2,0,0
3,0,0
4,0,1
5,0,0
6,0,0
7,0,1
8,0,0
9,0,1


In [47]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))

0.8239913320531662


In [48]:
confusion_matrix(y_test, y_pred)

array([[76358,   224],
       [16183,   452]], dtype=int64)

In [49]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.83      1.00      0.03      0.90      0.16      0.03     76582
          1       0.67      0.03      1.00      0.05      0.16      0.02     16635

avg / total       0.80      0.82      0.20      0.75      0.16      0.03     93217

