In [1]:
import numpy as np
import pandas as pd
import datetime as dt
import category_encoders as ce
import warnings
warnings.filterwarnings('ignore')

In [2]:
def divide_time_of_day(df: pd.DataFrame)->pd.DataFrame:    
    
    columns = ['DATE_DEPARTURE_LCL', 'DATE_ARRIVAL_LCL']     
    time_columns = ['TIME_DEPARTURE_LCL', 'TIME_ARRIVAL_LCL']
    
    for col in columns:
        #Converting string timestamp columns to datetime timestamp columns
        date_time = pd.to_datetime(df[col])
        
        #Splitting timestamp columns into date column and time column
        df[col] = pd.to_datetime(date_time.dt.strftime('%Y-%m-%d')).dt.weekday       
        
        df[col.replace('DATE', 'TIME')] = date_time.dt.strftime('%H:%M:%S')
        
        #Categorizing time column
    for col in time_columns:
        index = pd.DatetimeIndex(df[col])
        df[col].iloc[index.indexer_between_time('04:00:00', '11:59:59')] = "morning"
        df[col].iloc[index.indexer_between_time('12:00:00', '19:59:59')] = "afternoon"
        df[col].iloc[index.indexer_between_time('20:00:00', '23:59:59')] = "night"   
        df[col].iloc[index.indexer_between_time('00:00:00', '03:59:59')] = "night"
        
    return df

In [3]:
train_df = pd.read_csv('data/train.csv')
X_train, y_train = train_df.drop('ARR_DEL15', axis=1), train_df.ARR_DEL15.copy()
X_test = pd.read_csv('data/test.csv')

train_df.DATE_DEPARTURE_LCL.nunique()

26687

In [4]:
train_df_1 = divide_time_of_day(train_df)
train_df_1.head(3)

Unnamed: 0,ID,DATE_DEPARTURE_UTC,DATE_ARRIVAL_UTC,DATE_DEPARTURE_LCL,DATE_ARRIVAL_LCL,ORIGIN,ORIGIN_AIRPORT_ID,DEST,DEST_AIRPORT_ID,TAIL_NUM,OP_CARRIER,OP_CARRIER_AIRLINE_ID,OP_CARRIER_FL_NUM,DISTANCE,CANCELLED,DEP_DEL15,ARR_DEL15,TIME_DEPARTURE_LCL,TIME_ARRIVAL_LCL
0,345355,2020-01-18 22:10:00,2020-01-18 23:59:00,5,5,SLC,14869,SNA,14908,N128DU,DL,19790,1056,588.0,0,0.0,0,afternoon,afternoon
1,323447,2020-01-17 21:47:00,2020-01-18 04:10:00,4,4,BOS,10721,SEA,14747,N486AS,AS,19930,15,2496.0,0,,0,afternoon,night
2,242237,2020-01-13 13:26:00,2020-01-13 14:26:00,0,0,DAL,11259,HOU,12191,N901WN,WN,19393,3,239.0,0,,0,morning,morning


In [6]:
#From Fernanda

#Feature engineering
def create_columns(_df):
    mod_df = _df.copy()
    
    mod_df.drop(['ORIGIN_AIRPORT_ID', 'DEST_AIRPORT_ID', 'CANCELLED'], inplace = True, axis = 1)
    
    #transform string to datetime
    mod_df['DATE_DEPARTURE_UTC_ts'] = pd.to_datetime(mod_df['DATE_DEPARTURE_UTC'])
    mod_df['DATE_ARRIVAL_UTC_ts'] = pd.to_datetime(mod_df['DATE_ARRIVAL_UTC'])
    mod_df['DATE_DEPARTURE_LCL_ts'] = pd.to_datetime(mod_df['DATE_DEPARTURE_LCL'])
    mod_df['DATE_ARRIVAL_LCL_ts'] = pd.to_datetime(mod_df['DATE_ARRIVAL_LCL'])

    #duration of the flight
    mod_df['Duration_UTC'] = (mod_df['DATE_ARRIVAL_UTC_ts'] - mod_df['DATE_DEPARTURE_UTC_ts']).dt.seconds/60

    #Federal holidays
    mod_df['NewYear_Flag'] = mod_df['DATE_DEPARTURE_UTC'].str.contains('01-01').map({True: 1, False: 0})
    mod_df['MartinLutherKingDay_Flag'] = mod_df['DATE_DEPARTURE_UTC'].str.contains('01-20').map({True: 1, False: 0})
    mod_df['PresidentsDay_Flag'] = mod_df['DATE_DEPARTURE_UTC'].str.contains('02-17').map({True: 1, False: 0})
    mod_df['MemorialDay_Flag'] = mod_df['DATE_DEPARTURE_UTC'].str.contains('05-25').map({True: 1, False: 0})
    mod_df['IndependenceDay_Flag'] = mod_df['DATE_DEPARTURE_UTC'].str.contains('07-04').map({True: 1, False: 0})
    mod_df['ColumbusDay_Flag'] = mod_df['DATE_DEPARTURE_UTC'].str.contains('10-12').map({True: 1, False: 0})
    mod_df['VeteransDay_Flag'] = mod_df['DATE_DEPARTURE_UTC'].str.contains('11-11').map({True: 1, False: 0})
    mod_df['ThanksgivingDay_Flag'] = mod_df['DATE_DEPARTURE_UTC'].str.contains('11-26').map({True: 1, False: 0})
    mod_df['ChristmasDay_Flag'] = mod_df['DATE_DEPARTURE_UTC'].str.contains('12-25').map({True: 1, False: 0})
    
    #NaN values
    mod_df['DEP_DEL15_cat'] = mod_df['DEP_DEL15'].fillna(2).astype('category').map({1: 'delayed', 0: 'not delayed', 2: 'unknown'})
   
    mod_df = mod_df.join(pd.get_dummies(mod_df['DEP_DEL15_cat']))

    return mod_df
    
train_adj = create_columns(train_df)    
train_adj.columns

Index(['ID', 'DATE_DEPARTURE_UTC', 'DATE_ARRIVAL_UTC', 'DATE_DEPARTURE_LCL',
       'DATE_ARRIVAL_LCL', 'ORIGIN', 'DEST', 'TAIL_NUM', 'OP_CARRIER',
       'OP_CARRIER_AIRLINE_ID', 'OP_CARRIER_FL_NUM', 'DISTANCE', 'DEP_DEL15',
       'ARR_DEL15', 'TIME_DEPARTURE_LCL', 'TIME_ARRIVAL_LCL',
       'DATE_DEPARTURE_UTC_ts', 'DATE_ARRIVAL_UTC_ts', 'DATE_DEPARTURE_LCL_ts',
       'DATE_ARRIVAL_LCL_ts', 'Duration_UTC', 'NewYear_Flag',
       'MartinLutherKingDay_Flag', 'PresidentsDay_Flag', 'MemorialDay_Flag',
       'IndependenceDay_Flag', 'ColumbusDay_Flag', 'VeteransDay_Flag',
       'ThanksgivingDay_Flag', 'ChristmasDay_Flag', 'DEP_DEL15_cat',
       'not delayed', 'delayed', 'unknown'],
      dtype='object')

In [7]:
#From Fernanda

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

#sort by DATE_DEPARTURE_UTC_ts
train_adj = train_adj.sort_values(by="DATE_DEPARTURE_UTC_ts")

#features to be used in the baseline model
features_baseline = ['Duration_UTC', 'NewYear_Flag', 'MartinLutherKingDay_Flag','not delayed', 'delayed', 'unknown']

X = train_adj[features_baseline]
y = train_adj['ARR_DEL15']

X_train = X.head(294487)
X_test = X.tail(125000)

y_train = y.head(294487)
y_test =  y.tail(125000)

In [None]:
#Linear Regression

from sklearn.linear_model import LinearRegression

lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_test)

lr_score = lr.score(X_test, y_test)

In [None]:
#Logistic Regression

from sklearn.linear_model import LogisticRegression

lgr = LogisticRegression(penalty='l2', random_state=1)
lgr.fit(X_train, y_train)

y_pred = lgr.predict(X_test)

lgr_score = lgr.score(X_test, y_test)

In [None]:
#Support Vector Classifier

from sklearn.svm import SVC

svc = SVC()
svc.fit(X_train, y_train)

y_pred = svc.predict(X_test)

svc_score = svc.score(X_test, y_test)

In [None]:
# Support Vector Regressor

from sklearn.svm import SVR

svr = SVR()
svr.fit(X_train, y_train)

y_pred = svr.predict(X_test)

svr_score = svr.score(X_test, y_test)

In [None]:
# Decision Tree Regressor

from sklearn.tree import DecisionTreeRegressor

dtr = DecisionTreeRegressor()
dtr.fit(X_train, y_train)

y_pred = dtr.predict(X_test)

dtr_score = dtr.score(X_test, y_test)

In [None]:
# Decision Tree Classifier

from sklearn.tree import DecisionTreeClassifier

dtc = DecisionTreeClassifier()
dtc.fit(X_train, y_train)

y_pred = dtc.predict(X_test)

dtc_score = dtc.score(X_test, y_test)

In [None]:
# Random Forest Classifier

from sklearn.tree import RandomForestClassifier

rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)

y_pred = rfc.predict(X_test)

rfc_score = rfc.score(X_test, y_test)

In [None]:
# Random Forest Regressor

from sklearn.tree import RandomForestRegressor

rfr = RandomForestRegressor()
rfr.fit(X_train, y_train)

y_pred = rfr.predict(X_test)

rfr_score = rfr.score(X_test, y_test)

In [None]:
#Cross Validation
from sklearn.model_selection import cross_val_score

classifier = ...
scores = cross_val_score(classifier, X, y, cv=5)