In [56]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KDTree
from sklearn.metrics import f1_score

In [2]:
# Columns discarded immediately after loading.
# NOTE(review): presumably these are dropped because they are only known once the
# flight has actually departed/landed — confirm.
dropped_columns = [
    'DESTINATION_AIRPORT', 'ARRIVAL_TIME',
    'TAXI_OUT', 'WHEELS_OFF', 'ELAPSED_TIME', 'AIR_TIME', 'WHEELS_ON', 'TAXI_IN',
    'DEPARTURE_TIME', 'DEPARTURE_DELAY',
]
# The first CSV column is used as the row index.
lax = pd.read_csv('./flight-delays/lax.csv', index_col=0).drop(columns=dropped_columns)

In [3]:
# Binary target: True when the flight arrived later than scheduled (delay > 0).
# NOTE(review): a missing ARRIVAL_DELAY compares False here, so such rows would be
# labelled "not delayed" — confirm whether the data contains any.
lax['DELAYED'] = lax['ARRIVAL_DELAY'].gt(0)

In [4]:
# Encode the boolean target as 0/1 integers.
# astype(int) converts directly; the previous replace({True: 1, False: 0}) went
# through an object-dtype intermediate and relied on pandas silently downcasting
# the result, which newer pandas versions deprecate.
lax['DELAYED'] = lax['DELAYED'].astype(int)

In [5]:
# ARRIVAL_DELAY is the column the DELAYED label was computed from, so it must not
# remain as a feature; YEAR is dropped as well.
lax = lax.drop(['ARRIVAL_DELAY', 'YEAR'], axis='columns')

In [6]:
# Preview the first ten flights, transposed so each column name reads as a row.
lax.head(n=10).transpose()

Unnamed: 0,101,103,129,156,195,196,197,330,342,344
MONTH,1,1,1,1,1,1,1,1,1,1
DAY,1,1,1,1,1,1,1,1,1,1
DAY_OF_WEEK,4,4,4,4,4,4,4,4,4,4
AIRLINE,OO,OO,OO,UA,UA,UA,UA,OO,OO,OO
FLIGHT_NUMBER,6512,4986,2599,1222,1224,1296,1431,2622,5242,5466
TAIL_NUMBER,N925SW,N719SK,N435SW,N68452,N87531,N37471,N36207,N492SW,N744SK,N568SW
ORIGIN_AIRPORT,FAT,AUS,DEN,MCO,SFO,SAT,BOS,SMF,BOI,CLD
SCHEDULED_DEPARTURE,535,538,545,555,600,600,600,600,600,600
SCHEDULED_TIME,75,212,150,341,88,207,404,90,130,51
DISTANCE,209,1242,862,2218,337,1211,2611,373,674,86


In [7]:
def time_to_minute(time):
    """Convert a clock time written as HHMM into minutes since midnight.

    Parameters
    ----------
    time : int, float or str
        Time of day encoded as HHMM, e.g. ``535`` for 05:35. Leading zeros may
        be missing (``5`` means 00:05). Integral floats such as ``535.0`` are
        accepted as well, which is how pandas represents an integer column once
        it contains missing values; any fractional part is truncated.

    Returns
    -------
    int
        ``hour * 60 + minute``.

    Raises
    ------
    ValueError
        If ``time`` is NaN or cannot be parsed as a number.
    """
    # Normalise through float() so 535, 535.0, "0535" and "535.0" all become 535;
    # the earlier string-slicing version raised on any float input.
    hhmm = int(float(time))
    # The last two decimal digits are the minutes, everything above is the hour.
    hour, minute = divmod(hhmm, 100)
    return hour * 60 + minute

In [8]:
# Re-express both HHMM schedule columns as hour*60+minute values.
# Not idempotent: re-running this cell would convert already-converted values again.
for schedule_col in ('SCHEDULED_ARRIVAL', 'SCHEDULED_DEPARTURE'):
    lax[schedule_col] = lax[schedule_col].apply(time_to_minute)

In [125]:
# Features used by the logistic-regression experiments, plus the DELAYED label.
model_columns = ['MONTH', 'AIRLINE', 'SCHEDULED_DEPARTURE', 'SCHEDULED_TIME', 'FLIGHT_NUMBER',
                 'SCHEDULED_ARRIVAL', 'DISTANCE', 'LATITUDE', 'LONGITUDE', 'DELAYED']
delayed_lax = lax[lax['DELAYED'] == 1][model_columns]
not_delayed_lax = lax[lax['DELAYED'] == 0][model_columns]


def _split_80_10_10(frame):
    """Split one class subset into train / test / validation parts.

    20% of the rows are held out first and that hold-out is then halved, both
    times with ``random_state=1``. Returns
    ``(X_train, X_test, X_valid, y_train, y_test, y_valid)``.
    """
    features = frame.drop('DELAYED', axis=1)
    labels = frame['DELAYED']
    X_tr, X_rest, y_tr, y_rest = train_test_split(features, labels,
                                                  test_size=0.2, random_state=1)
    X_te, X_va, y_te, y_va = train_test_split(X_rest, y_rest,
                                              test_size=0.5, random_state=1)
    return X_tr, X_te, X_va, y_tr, y_te, y_va


# Each class is split separately so every partition keeps the same class ratio.
X_train_d, X_test_d, X_valid_d, y_train_d, y_test_d, y_valid_d = _split_80_10_10(delayed_lax)
X_train, X_test, X_valid, y_train, y_test, y_valid = _split_80_10_10(not_delayed_lax)

# Stack the delayed rows first, then the not-delayed rows, for every partition.
X_train = pd.concat([X_train_d, X_train])
X_test = pd.concat([X_test_d, X_test])
X_valid = pd.concat([X_valid_d, X_valid])

y_train = pd.concat([y_train_d, y_train])
y_test = pd.concat([y_test_d, y_test])
y_valid = pd.concat([y_valid_d, y_valid])


In [64]:
# Compare LogisticRegression solvers on the validation set.
# (The list keeps the name `c_val` because the summary cell below reads it.)
# NOTE(review): the solvers are run with default iteration limits on unscaled
# pass-through features, so score differences may reflect convergence rather
# than solver quality — confirm.
c_val = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
acc_scores, f1_scores = [], []
for solver in c_val:
    # One one-hot encoder per categorical column; all other columns pass through.
    encoders = [(step_name, OneHotEncoder(handle_unknown='ignore'), [column])
                for step_name, column in (('one_hot_airline', 'AIRLINE'),
                                          ('one_hot_month', 'MONTH'),
                                          ('one_hot_flight_num', 'FLIGHT_NUMBER'))]
    col_trans = ColumnTransformer(encoders, remainder='passthrough')
    pip = Pipeline(steps=[('col', col_trans),
                          ('logistic', LogisticRegression(solver=solver))])
    y_predict = pip.fit(X_train, y_train).predict(X_valid)
    acc_scores.append(accuracy_score(y_valid, y_predict))
    f1_scores.append(f1_score(y_valid, y_predict))



In [65]:
# Tabulate the validation scores, one row per solver.
solver_summary = {'solver': c_val, 'f1_score': f1_scores, 'accuracy score': acc_scores}
c_df = pd.DataFrame(data=solver_summary)
c_df.set_index(keys='solver')

Unnamed: 0_level_0,f1_score,accuracy score
solver,Unnamed: 1_level_1,Unnamed: 2_level_1
newton-cg,0.502011,0.639136
lbfgs,0.458613,0.62217
liblinear,0.461653,0.623731
sag,0.42629,0.611137
saga,0.423903,0.60916


In [107]:
# Validation sweep over the inverse regularisation strength C, solver fixed to newton-cg.
c_val = [0.01, 0.1, 1, 10, 100]
acc_scores = []
f1_scores = []
for c_value in c_val:
    col_trans = ColumnTransformer(
        transformers=[
            ('one_hot_airline', OneHotEncoder(handle_unknown='ignore'), ['AIRLINE']),
            ('one_hot_month', OneHotEncoder(handle_unknown='ignore'), ['MONTH']),
            ('one_hot_flight_num', OneHotEncoder(handle_unknown='ignore'), ['FLIGHT_NUMBER']),
        ],
        remainder='passthrough',
    )
    classifier = LogisticRegression(solver='newton-cg', C=c_value)
    pip = Pipeline(steps=[('col', col_trans), ('logistic', classifier)])
    pip.fit(X_train, y_train)
    y_predict = pip.predict(X_valid)
    acc_scores += [accuracy_score(y_valid, y_predict)]
    f1_scores += [f1_score(y_valid, y_predict)]



In [108]:
# Tabulate the validation scores, one row per C value.
c_summary = {'C value': c_val, 'f1_score': f1_scores, 'accuracy score': acc_scores}
c_df = pd.DataFrame(data=c_summary)
c_df.set_index(keys='C value')

Unnamed: 0_level_0,f1_score,accuracy score
C value,Unnamed: 1_level_1,Unnamed: 2_level_1
0.01,0.472638,0.631382
0.1,0.493438,0.638407
1.0,0.502011,0.639136
10.0,0.502009,0.63872
100.0,0.501579,0.638616


In [39]:
# Logistic Regression

In [126]:
# Final logistic-regression model (solver='newton-cg', C=1), scored on the test set.
col_trans = ColumnTransformer([('one_hot_airline', OneHotEncoder(handle_unknown='ignore'), ['AIRLINE']),
                               ('one_hot_month', OneHotEncoder(handle_unknown='ignore'), ['MONTH']),
                               ('one_hot_flight_num', OneHotEncoder(handle_unknown='ignore'), ['FLIGHT_NUMBER'])],
                              remainder='passthrough')
pip = Pipeline(steps=[('col', col_trans),
                      ('logistic', LogisticRegression(solver='newton-cg', C=1))])
pip.fit(X_train, y_train)
# Predict once and reuse the result for both metrics instead of running the
# whole pipeline over the test set twice.
y_test_pred = pip.predict(X_test)
print(accuracy_score(y_test, y_test_pred))
print(f1_score(y_test, y_test_pred))

0.6344662468120543
0.4950751312100079


Logistic regression is one of the most commonly used statistical models for predicting a binary dependent variable. The features taken into account are MONTH, AIRLINE, SCHEDULED_DEPARTURE, SCHEDULED_TIME, FLIGHT_NUMBER, SCHEDULED_ARRIVAL, DISTANCE, LATITUDE, and LONGITUDE, with one-hot encoding applied to MONTH, AIRLINE, and FLIGHT_NUMBER. Across the different solvers, "newton-cg" gives a higher f1_score and accuracy score than the other solvers, which suggests that it is a good choice for this larger dataset. Across the different C values, logistic regression achieves its best accuracy score and f1_score when C equals 1. Although the accuracy score improves with logistic regression, the f1_score still remains low. **TODO (fernie):** elaborate on the f1_score...
In the end, with C=1 and solver='newton-cg', the accuracy score on the test dataset is 0.634 and the f1_score is 0.495.

In [67]:
# Refit newton-cg logistic regression with C left at the estimator default,
# reusing the `col_trans` transformer built in an earlier cell, and report
# accuracy on the test set.
logistic_default = LogisticRegression(solver='newton-cg')
pip = Pipeline(steps=[('col', col_trans), ('logistic', logistic_default)])
accuracy_score(y_test, pip.fit(X_train, y_train).predict(X_test))

0.6344662468120543

In [22]:
from sklearn.ensemble import RandomForestClassifier

In [127]:
# Rebuild the train/test/validation partitions without FLIGHT_NUMBER for the
# models defined in the following cells.
reduced_columns = ['MONTH', 'AIRLINE', 'SCHEDULED_DEPARTURE', 'SCHEDULED_TIME',
                   'SCHEDULED_ARRIVAL', 'DISTANCE', 'LATITUDE', 'LONGITUDE', 'DELAYED']
delayed_lax = lax.loc[lax['DELAYED'] == 1, reduced_columns]
not_delayed_lax = lax.loc[lax['DELAYED'] == 0, reduced_columns]

# 20% is held out first, then that hold-out is halved into test and validation.
holdout_split = dict(test_size=0.2, random_state=1)
halve_split = dict(test_size=0.5, random_state=1)

# Delayed flights.
X_train_d, X_test_d, y_train_d, y_test_d = train_test_split(
    delayed_lax.drop(columns='DELAYED'), delayed_lax['DELAYED'], **holdout_split)
X_test_d, X_valid_d, y_test_d, y_valid_d = train_test_split(X_test_d, y_test_d, **halve_split)

# Not-delayed flights, split the same way so each partition keeps the class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    not_delayed_lax.drop(columns='DELAYED'), not_delayed_lax['DELAYED'], **holdout_split)
X_test, X_valid, y_test, y_valid = train_test_split(X_test, y_test, **halve_split)

# Combine per partition: delayed rows first, then not-delayed rows.
X_train, X_test, X_valid = (
    pd.concat(parts)
    for parts in ([X_train_d, X_train], [X_test_d, X_test], [X_valid_d, X_valid])
)
y_train, y_test, y_valid = (
    pd.concat(parts)
    for parts in ([y_train_d, y_train], [y_test_d, y_test], [y_valid_d, y_valid])
)

In [84]:
# Random-forest pipeline: one-hot encode AIRLINE and MONTH, pass the remaining
# columns through unchanged.
col_trans = ColumnTransformer([('one_hot_airline', OneHotEncoder(handle_unknown='ignore'), ['AIRLINE']),
                               ('one_hot_month', OneHotEncoder(handle_unknown='ignore'), ['MONTH'])],
                              remainder='passthrough')
# random_state is pinned (same seed as the data splits) so the forest, and the
# scores reported below, are reproducible from run to run.
# The step keeps the name 'logistic' so any existing access by step name still works.
pip1 = Pipeline(steps=[('col', col_trans),
                       ('logistic', RandomForestClassifier(n_jobs=7, max_leaf_nodes=6500,
                                                           random_state=1))])

In [85]:
# Fit the encoders and the random forest on the training partition.
pip1.fit(X=X_train, y=y_train)



Pipeline(memory=None,
         steps=[('col',
                 ColumnTransformer(n_jobs=None, remainder='passthrough',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('one_hot_airline',
                                                  OneHotEncoder(categorical_features=None,
                                                                categories=None,
                                                                drop=None,
                                                                dtype=<class 'numpy.float64'>,
                                                                handle_unknown='ignore',
                                                                n_values=None,
                                                                sparse=True),
                                                  ['AIRLINE']),
                                          

In [86]:
# Random forest: accuracy on the validation partition.
accuracy_score(y_true=y_valid, y_pred=pip1.predict(X_valid))

0.652615144418423

In [87]:
# Random forest: accuracy on the test partition.
accuracy_score(y_true=y_test, y_pred=pip1.predict(X_test))

0.6511216363920262

In [90]:
# Random forest: F1 score on the validation partition.
f1_score(y_true=y_valid, y_pred=pip1.predict(X_valid))

0.5364261407042156

In [89]:
# Random forest: F1 score on the test partition.
f1_score(y_true=y_test, y_pred=pip1.predict(X_test))

0.5338340635649211

In [40]:
# Random Forest Classifier

In [28]:
from sklearn.neighbors import KNeighborsClassifier

In [128]:
# k-nearest-neighbours pipeline (k=11) on one-hot AIRLINE/MONTH plus pass-through columns.
# NOTE(review): the pass-through columns are not rescaled, so presumably the
# large-valued ones dominate the neighbour distances — confirm whether that is intended.
knn_encoders = [
    ('one_hot_airline', OneHotEncoder(handle_unknown='ignore'), ['AIRLINE']),
    ('one_hot_month', OneHotEncoder(handle_unknown='ignore'), ['MONTH']),
]
col_trans = ColumnTransformer(transformers=knn_encoders, remainder='passthrough')
knn_classifier = KNeighborsClassifier(n_neighbors=11)
pip2 = Pipeline(steps=[('col', col_trans), ('logistic', knn_classifier)])

In [129]:
# Fit the encoders and the k-nearest-neighbours model on the training partition.
pip2.fit(X=X_train, y=y_train)

Pipeline(memory=None,
         steps=[('col',
                 ColumnTransformer(n_jobs=None, remainder='passthrough',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('one_hot_airline',
                                                  OneHotEncoder(categorical_features=None,
                                                                categories=None,
                                                                drop=None,
                                                                dtype=<class 'numpy.float64'>,
                                                                handle_unknown='ignore',
                                                                n_values=None,
                                                                sparse=True),
                                                  ['AIRLINE']),
                                          

In [130]:
# pip2: accuracy on the validation partition (the recorded run was interrupted).
accuracy_score(y_true=y_valid, y_pred=pip2.predict(X_valid))

KeyboardInterrupt: 

In [131]:
# pip2: accuracy on the test partition.
accuracy_score(y_true=y_test, y_pred=pip2.predict(X_test))

0.6371727476187997

In [None]:
# K-Nearest Neighbors Classifier

In [132]:
# pip2: F1 score on the test partition.
f1_score(y_true=y_test, y_pred=pip2.predict(X_test))

0.5365334751678745

In [68]:
from sklearn.ensemble import GradientBoostingClassifier

In [74]:
# Validation sweep over the gradient-boosting learning rate.
lr_list = [0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1]
acc = []
f1 = []
for i in lr_list:
    col_trans = ColumnTransformer([('one_hot_airline', OneHotEncoder(handle_unknown='ignore'), ['AIRLINE']),
                                   ('one_hot_month', OneHotEncoder(handle_unknown='ignore'), ['MONTH'])],
                                  remainder='passthrough')
    # random_state is pinned (same seed as the data splits) so that differences
    # between rows of the sweep come from the learning rate, not from run-to-run
    # randomness in tree construction.
    # The step keeps the name 'logistic' so any existing access by step name still works.
    pip2 = Pipeline(steps=[('col', col_trans),
                           ('logistic', GradientBoostingClassifier(learning_rate=i,
                                                                   random_state=1))])
    pip2.fit(X_train, y_train)
    y_predict = pip2.predict(X_valid)
    acc.append(accuracy_score(y_valid, y_predict))
    f1.append(f1_score(y_valid, y_predict))

In [75]:
# Tabulate the validation scores, one row per learning rate.
learning_rate_summary = {'Learning Rate': lr_list, 'f1_score': f1, 'accuracy score': acc}
c_df = pd.DataFrame(data=learning_rate_summary)
c_df.set_index(keys='Learning Rate')

Unnamed: 0_level_0,f1_score,accuracy score
Learning Rate,Unnamed: 1_level_1,Unnamed: 2_level_1
0.05,0.4496,0.634556
0.075,0.462996,0.637106
0.1,0.475381,0.640125
0.25,0.493638,0.650013
0.5,0.512768,0.652459
0.75,0.518172,0.651574
1.0,0.516692,0.648139


In [139]:
# Validation sweep over the number of boosting stages, learning rate fixed at 0.75.
n_est = [500, 750, 1000, 2000, 3000]
acc = []
f1 = []
for i in n_est:
    col_trans = ColumnTransformer([('one_hot_airline', OneHotEncoder(handle_unknown='ignore'), ['AIRLINE']),
                                   ('one_hot_month', OneHotEncoder(handle_unknown='ignore'), ['MONTH'])],
                                  remainder='passthrough')
    # random_state is pinned (same seed as the data splits) so that differences
    # between rows of the sweep come from n_estimators, not from run-to-run
    # randomness in tree construction.
    # The step keeps the name 'logistic' so any existing access by step name still works.
    pip2 = Pipeline(steps=[('col', col_trans),
                           ('logistic', GradientBoostingClassifier(learning_rate=0.75,
                                                                   n_estimators=i,
                                                                   random_state=1))])
    pip2.fit(X_train, y_train)
    y_predict = pip2.predict(X_valid)
    acc.append(accuracy_score(y_valid, y_predict))
    f1.append(f1_score(y_valid, y_predict))

In [140]:
# Tabulate the validation scores, one row per n_estimators value.
n_estimators_summary = {'n_estimators': n_est, 'f1 score': f1, 'accuracy score': acc}
c_df = pd.DataFrame(data=n_estimators_summary)
c_df.set_index(keys='n_estimators')

Unnamed: 0_level_0,f1 score,accuracy score
n_estimators,Unnamed: 1_level_1,Unnamed: 2_level_1
500,0.540798,0.659381
750,0.544089,0.658808
1000,0.546287,0.659745
2000,0.54578,0.656622
3000,0.551246,0.656154


In [142]:
# Final gradient-boosting model (learning_rate=0.75, n_estimators=1000), scored on the test set.
col_trans = ColumnTransformer([('one_hot_airline', OneHotEncoder(handle_unknown='ignore'), ['AIRLINE']),
                               ('one_hot_month', OneHotEncoder(handle_unknown='ignore'), ['MONTH'])],
                              remainder='passthrough')
# random_state is pinned (same seed as the data splits) so the reported test
# scores can be reproduced on a fresh run.
# The step keeps the name 'logistic' so any existing access by step name still works.
pip2 = Pipeline(steps=[('col', col_trans),
                       ('logistic', GradientBoostingClassifier(learning_rate=0.75,
                                                               n_estimators=1000,
                                                               random_state=1))])
pip2.fit(X_train, y_train)
accuracy_score(y_test, pip2.predict(X_test))


0.6511216363920262

In [143]:
# pip2: F1 score on the test partition.
f1_score(y_true=y_test, y_pred=pip2.predict(X_test))

0.5384562418233148