In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import tensorflow as tf
from keras.layers import Dense, Dropout




In [2]:
flights_df=pd.read_csv('./Resources/Atlanta_Flight_Status.csv').drop('Unnamed: 0',axis=1)

flights_df

Unnamed: 0,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,MKT_UNIQUE_CARRIER,MKT_CARRIER_FL_NUM,ORIGIN_CITY_NAME,DEST_CITY_NAME,CRS_DEP_TIME,DEP_TIME,DEP_DELAY_NEW,DEP_DELAY_GROUP,DISTANCE_GROUP,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY
0,1,1,7,AA,1006,"Miami, FL","Atlanta, GA",2100,2058.0,0.0,-1.0,3,,,,,
1,1,1,7,AA,1045,"Miami, FL","Atlanta, GA",1925,1922.0,0.0,-1.0,3,,,,,
2,1,1,7,AA,1051,"Atlanta, GA","Dallas/Fort Worth, TX",1817,1816.0,0.0,-1.0,3,,,,,
3,1,1,7,AA,1051,"Dallas/Fort Worth, TX","Atlanta, GA",1425,1420.0,0.0,-1.0,3,,,,,
4,1,1,7,AA,1151,"Atlanta, GA","Miami, FL",602,553.0,0.0,-1.0,3,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
593798,9,30,6,WN,778,"Indianapolis, IN","Atlanta, GA",810,806.0,0.0,-1.0,2,,,,,
593799,9,30,6,WN,854,"Miami, FL","Atlanta, GA",510,505.0,0.0,-1.0,3,,,,,
593800,9,30,6,WN,888,"Baltimore, MD","Savannah, GA",1505,1527.0,22.0,1.0,3,,,,,
593801,9,30,6,WN,924,"Atlanta, GA","West Palm Beach/Palm Beach, FL",1055,1058.0,3.0,0.0,3,,,,,


In [3]:
flights_df.describe()

Unnamed: 0,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,MKT_CARRIER_FL_NUM,CRS_DEP_TIME,DEP_TIME,DEP_DELAY_NEW,DEP_DELAY_GROUP,DISTANCE_GROUP,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY
count,593803.0,593803.0,593803.0,593803.0,593803.0,587287.0,587287.0,587287.0,593803.0,121231.0,121231.0,121231.0,121231.0,121231.0
mean,4.893544,15.83798,3.979845,2110.437495,1345.712629,1344.986775,15.866375,0.213134,3.181274,30.026231,4.26852,12.346479,0.11304,22.934654
std,2.504191,8.772622,1.993397,1342.466807,511.045747,527.795295,52.036827,2.322178,1.892323,79.21078,32.03912,30.364191,3.125093,51.516719
min,1.0,1.0,1.0,2.0,1.0,1.0,0.0,-2.0,1.0,0.0,0.0,0.0,0.0,0.0
25%,3.0,8.0,2.0,1054.0,910.0,910.0,0.0,-1.0,2.0,0.0,0.0,0.0,0.0,0.0
50%,5.0,16.0,4.0,2034.0,1327.0,1329.0,0.0,-1.0,3.0,8.0,0.0,0.0,0.0,0.0
75%,7.0,23.0,6.0,2727.0,1755.0,1759.0,10.0,0.0,4.0,29.0,0.0,15.0,0.0,26.0
max,9.0,31.0,7.0,9685.0,2359.0,2400.0,3221.0,12.0,11.0,3221.0,1197.0,1300.0,323.0,2216.0


In [4]:
flights_df['MKT_UNIQUE_CARRIER'].unique()

array(['AA', 'AS', 'B6', 'DL', 'F9', 'NK', 'UA', 'WN', 'G4'], dtype=object)

In [5]:
#Only look at flights leaving Atlanta on Delta, AA, WN
DL_flights_df = flights_df.loc[(flights_df['ORIGIN_CITY_NAME']=='Atlanta, GA')]
#Seperate the data into features and targets.
X=DL_flights_df[['MONTH','DAY_OF_MONTH','DAY_OF_WEEK','DEST_CITY_NAME','CRS_DEP_TIME','MKT_UNIQUE_CARRIER']]

X=X.reset_index(drop=True)
X

Unnamed: 0,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,DEST_CITY_NAME,CRS_DEP_TIME,MKT_UNIQUE_CARRIER
0,1,1,7,"Dallas/Fort Worth, TX",1817,AA
1,1,1,7,"Miami, FL",602,AA
2,1,1,7,"Washington, DC",1424,AA
3,1,1,7,"Miami, FL",1822,AA
4,1,1,7,"Charlotte, NC",1253,AA
...,...,...,...,...,...,...
280337,9,30,6,"Milwaukee, WI",825,WN
280338,9,30,6,"Phoenix, AZ",1755,WN
280339,9,30,6,"Orlando, FL",1030,WN
280340,9,30,6,"West Palm Beach/Palm Beach, FL",1055,WN


In [6]:
dates = []
for row in X.iterrows():
    if len(str(row[1]['DAY_OF_MONTH'])) == 1:
        dates.append(f"0{int(row[1]['MONTH'])}-0{int(row[1]['DAY_OF_MONTH'])}")
    else:
        dates.append(f"0{int(row[1]['MONTH'])}-{int(row[1]['DAY_OF_MONTH'])}")
        
X['Date'] = dates

In [7]:
X=X.drop(['MONTH','DAY_OF_MONTH'],axis = 1)
X

Unnamed: 0,DAY_OF_WEEK,DEST_CITY_NAME,CRS_DEP_TIME,MKT_UNIQUE_CARRIER,Date
0,7,"Dallas/Fort Worth, TX",1817,AA,01-01
1,7,"Miami, FL",602,AA,01-01
2,7,"Washington, DC",1424,AA,01-01
3,7,"Miami, FL",1822,AA,01-01
4,7,"Charlotte, NC",1253,AA,01-01
...,...,...,...,...,...
280337,6,"Milwaukee, WI",825,WN,09-30
280338,6,"Phoenix, AZ",1755,WN,09-30
280339,6,"Orlando, FL",1030,WN,09-30
280340,6,"West Palm Beach/Palm Beach, FL",1055,WN,09-30


In [8]:
X["DAY_OF_WEEK"].unique()

array([7, 1, 2, 3, 4, 5, 6], dtype=int64)

In [9]:
day_of_weeks = []
days_of_the_week = ['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday', 'Sunday']
for row in X.iterrows():
    day_of_weeks.append(days_of_the_week[row[1]['DAY_OF_WEEK']-1])

In [10]:
X['DAY_OF_WEEK'] = day_of_weeks
X

Unnamed: 0,DAY_OF_WEEK,DEST_CITY_NAME,CRS_DEP_TIME,MKT_UNIQUE_CARRIER,Date
0,Sunday,"Dallas/Fort Worth, TX",1817,AA,01-01
1,Sunday,"Miami, FL",602,AA,01-01
2,Sunday,"Washington, DC",1424,AA,01-01
3,Sunday,"Miami, FL",1822,AA,01-01
4,Sunday,"Charlotte, NC",1253,AA,01-01
...,...,...,...,...,...
280337,Saturday,"Milwaukee, WI",825,WN,09-30
280338,Saturday,"Phoenix, AZ",1755,WN,09-30
280339,Saturday,"Orlando, FL",1030,WN,09-30
280340,Saturday,"West Palm Beach/Palm Beach, FL",1055,WN,09-30


In [11]:
X = pd.get_dummies(X)

In [12]:
X

Unnamed: 0,CRS_DEP_TIME,DAY_OF_WEEK_Friday,DAY_OF_WEEK_Monday,DAY_OF_WEEK_Saturday,DAY_OF_WEEK_Sunday,DAY_OF_WEEK_Thursday,DAY_OF_WEEK_Tuesday,DAY_OF_WEEK_Wednesday,"DEST_CITY_NAME_Albany, GA","DEST_CITY_NAME_Albany, NY",...,Date_09-21,Date_09-22,Date_09-23,Date_09-24,Date_09-25,Date_09-26,Date_09-27,Date_09-28,Date_09-29,Date_09-30
0,1817,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,602,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1424,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1822,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1253,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
280337,825,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
280338,1755,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
280339,1030,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
280340,1055,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [41]:
testing = pd.concat([X,X.tail(1)])
testing

Unnamed: 0,CRS_DEP_TIME,DAY_OF_WEEK_Friday,DAY_OF_WEEK_Monday,DAY_OF_WEEK_Saturday,DAY_OF_WEEK_Sunday,DAY_OF_WEEK_Thursday,DAY_OF_WEEK_Tuesday,DAY_OF_WEEK_Wednesday,"DEST_CITY_NAME_Albany, GA","DEST_CITY_NAME_Albany, NY",...,Date_09-21,Date_09-22,Date_09-23,Date_09-24,Date_09-25,Date_09-26,Date_09-27,Date_09-28,Date_09-29,Date_09-30
0,1817,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,602,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1424,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1822,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1253,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
280338,1755,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
280339,1030,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
280340,1055,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
280341,1510,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [13]:
X.to_csv('./predictions/Resources/X.csv')

In [39]:
#Scale the feature data, X
from sklearn.preprocessing import StandardScaler
# Scaling the numeric columns
scaler = StandardScaler()
scaler.fit(testing)
transformed_data = scaler.transform(testing)
# Creating a DataFrame with with the scaled data
transformed_data_df = pd.DataFrame(transformed_data)

transformed_data_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,427,428,429,430,431,432,433,434,435,436
0,0.714999,-0.422605,-0.41619,-0.380506,2.443443,-0.4225,-0.399988,-0.40598,-0.050708,-0.054886,...,-0.060578,-0.060399,-0.05531,-0.060191,-0.060518,-0.057505,-0.057661,-0.060637,-0.06037,-0.055310
1,-1.693684,-0.422605,-0.41619,-0.380506,2.443443,-0.4225,-0.399988,-0.40598,-0.050708,-0.054886,...,-0.060578,-0.060399,-0.05531,-0.060191,-0.060518,-0.057505,-0.057661,-0.060637,-0.06037,-0.055310
2,-0.064106,-0.422605,-0.41619,-0.380506,2.443443,-0.4225,-0.399988,-0.40598,-0.050708,-0.054886,...,-0.060578,-0.060399,-0.05531,-0.060191,-0.060518,-0.057505,-0.057661,-0.060637,-0.06037,-0.055310
3,0.724911,-0.422605,-0.41619,-0.380506,2.443443,-0.4225,-0.399988,-0.40598,-0.050708,-0.054886,...,-0.060578,-0.060399,-0.05531,-0.060191,-0.060518,-0.057505,-0.057661,-0.060637,-0.06037,-0.055310
4,-0.403106,-0.422605,-0.41619,-0.380506,2.443443,-0.4225,-0.399988,-0.40598,-0.050708,-0.054886,...,-0.060578,-0.060399,-0.05531,-0.060191,-0.060518,-0.057505,-0.057661,-0.060637,-0.06037,-0.055310
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
280338,0.592086,-0.422605,-0.41619,2.628076,-0.409259,-0.4225,-0.399988,-0.40598,-0.050708,-0.054886,...,-0.060578,-0.060399,-0.05531,-0.060191,-0.060518,-0.057505,-0.057661,-0.060637,-0.06037,18.080004
280339,-0.845194,-0.422605,-0.41619,2.628076,-0.409259,-0.4225,-0.399988,-0.40598,-0.050708,-0.054886,...,-0.060578,-0.060399,-0.05531,-0.060191,-0.060518,-0.057505,-0.057661,-0.060637,-0.06037,18.080004
280340,-0.795632,-0.422605,-0.41619,2.628076,-0.409259,-0.4225,-0.399988,-0.40598,-0.050708,-0.054886,...,-0.060578,-0.060399,-0.05531,-0.060191,-0.060518,-0.057505,-0.057661,-0.060637,-0.06037,18.080004
280341,0.106385,-0.422605,-0.41619,2.628076,-0.409259,-0.4225,-0.399988,-0.40598,-0.050708,-0.054886,...,-0.060578,-0.060399,-0.05531,-0.060191,-0.060518,-0.057505,-0.057661,-0.060637,-0.06037,18.080004


In [40]:
transformed_data_df.tail(1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,427,428,429,430,431,432,433,434,435,436
280342,0.106385,-0.422605,-0.41619,2.628076,-0.409259,-0.4225,-0.399988,-0.40598,-0.050708,-0.054886,...,-0.060578,-0.060399,-0.05531,-0.060191,-0.060518,-0.057505,-0.057661,-0.060637,-0.06037,18.080004


In [15]:
#Seperate the data into features and targets
is_delayed=DL_flights_df['DEP_DELAY_GROUP'].tolist()
y=[]
#Determine if the flight was delayed using the DEP_DELAY_NEW column and
#convert to boolean values
for i in is_delayed:
    if i <= 0:
        y.append(0)
    else:
        y.append(1)
y = pd.DataFrame(y)
y.value_counts()

0    212161
1     68181
dtype: int64

In [16]:
y.to_csv('./predictions/Resources/y.csv')

In [17]:
# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(transformed_data_df, y, random_state=78)

In [18]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.

nn_model = tf.keras.models.Sequential()

# First hidden layer
nn_model.add(tf.keras.layers.Dense(units=64, activation="tanh", input_dim=437))
nn_model.add(Dropout(0.2))
# Second hidden layer
nn_model.add(tf.keras.layers.Dense(units=64, activation="relu"))
# Second hidden layer
nn_model.add(tf.keras.layers.Dense(units=64, activation="relu"))
nn_model.add(Dropout(0.2))
# Third hidden layer
nn_model.add(tf.keras.layers.Dense(units=64, activation="sigmoid"))
# Third hidden layer
nn_model.add(tf.keras.layers.Dense(units=64, activation="sigmoid"))
nn_model.add(Dropout(0.2))
# Output layer
nn_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn_model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 64)                28032     
                                                                 
 dropout (Dropout)           (None, 64)                0         
                                                                 
 dense_1 (Dense)             (None, 64)                4160      
                                                                 
 dense_2 (Dense)             (None, 64)                4160      
                                                                 
 dropout_1 (Dropout)         (None, 64)                0         
                                                                 
 dense_3 (Dense)             (None, 64)                4160      
                                                                 
 dense_4 (Dense)             (None, 64)                

In [19]:
# Compile the model
nn_model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])




In [20]:
# Train the model
fit_model = nn_model.fit(X_train, y_train, epochs=10)

Epoch 1/10


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [21]:
# Check the model's loss and accuracy 
model_loss, model_accuracy = nn_model.evaluate(X_test,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

2191/2191 - 1s - loss: 0.4697 - accuracy: 0.7937 - 1s/epoch - 563us/step
Loss: 0.46974867582321167, Accuracy: 0.7936820387840271


In [22]:
nn_model.save('./predictions/Resources/nn_model.keras')

In [23]:
y_test.value_counts()

0    53157
1    16929
dtype: int64

In [24]:
#Compare the test data to the predictions
predictions = nn_model.predict(X_test).reshape(70086).tolist()
predictions_df = pd.DataFrame({'y_test':y_test[0],'predictions':predictions})



In [25]:
predictions_df.describe()

Unnamed: 0,y_test,predictions
count,70086.0,70086.0
mean,0.241546,0.228895
std,0.428024,0.201429
min,0.0,0.055682
25%,0.0,0.077609
50%,0.0,0.146956
75%,0.0,0.326862
max,1.0,0.908992


In [26]:
predictions_df.sort_values(by = 'predictions').tail(50)

Unnamed: 0,y_test,predictions
227509,1,0.90831
227764,1,0.908321
2369,1,0.908327
212577,1,0.908328
227843,1,0.908335
227376,1,0.90834
227752,1,0.908346
187762,1,0.908349
227908,1,0.908352
234012,0,0.908402


In [27]:
delayed_predictions = []
for i in predictions:
    if (i >=.5):
        delayed_predictions.append(i)
delayed_predictions

[0.6958984732627869,
 0.5225949287414551,
 0.8432033061981201,
 0.6515814661979675,
 0.5019915699958801,
 0.6013199090957642,
 0.6877286434173584,
 0.8888720273971558,
 0.8938652276992798,
 0.9070649147033691,
 0.6762658357620239,
 0.5806198716163635,
 0.9044972062110901,
 0.6967586874961853,
 0.5221697688102722,
 0.589235246181488,
 0.6408214569091797,
 0.7386317253112793,
 0.888862669467926,
 0.7035972476005554,
 0.6870138645172119,
 0.7351451516151428,
 0.90102219581604,
 0.6876509785652161,
 0.5091992616653442,
 0.6249498128890991,
 0.7151979804039001,
 0.5012924075126648,
 0.5155817866325378,
 0.89371657371521,
 0.6491725444793701,
 0.8500484824180603,
 0.6654386520385742,
 0.8300435543060303,
 0.5751740336418152,
 0.6319141983985901,
 0.7364382147789001,
 0.626079797744751,
 0.6324559450149536,
 0.6478130221366882,
 0.538375973701477,
 0.6132471561431885,
 0.6988605260848999,
 0.6889851093292236,
 0.833091676235199,
 0.5318971872329712,
 0.6836946606636047,
 0.6054195761680603,
 

In [28]:
len(delayed_predictions)

8365

## LogisticRegression

In [29]:

from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report

from sklearn.linear_model import LogisticRegression

In [30]:
model = LogisticRegression(random_state=1)

# Fit the model using training data
model.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


In [31]:
model.score(X_train, y_train)

0.7808766456129671

In [32]:
# Make a prediction using the testing data
predictions = model.predict(X_test)
predictions

array([0, 1, 0, ..., 0, 0, 1], dtype=int64)

In [33]:
# Print the balanced_accuracy score of the model
balanced_accuracy_score(y_test,predictions)

0.5979941086517138

In [34]:
# Generate a confusion matrix for the model
confusion_matrix(y_test,predictions)

array([[50877,  2280],
       [12885,  4044]], dtype=int64)

In [35]:
# Print the classification report for the model
tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()
accuracy = (tp + tn) / (tp + fp + tn + fn)
accuracy

0.7836229774848044