In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt

import datetime as dt
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import Normalizer

In [None]:
df_airlines = pd.read_csv('airlines.csv', na_values=' ')
df_airlines.head()

In [None]:
df_airports = pd.read_csv('airports.csv', na_values=' ')
df_airports.head()

In [None]:
df_hdata = pd.read_csv('historic_data.csv', low_memory = False, na_values=' ', on_bad_lines='skip')
df_hdata.head()

In [None]:
df_fdata = pd.read_csv('future_data.csv', low_memory = False, na_values=' ')
df_fdata.head()

In [None]:
df_air_or = df_airports.rename(columns={'IATA_CODE':'ORIGIN_AIRPORT', 'CITY': 'OR_CITY', 'STATE': 'OR_STATE', 'COUNTRY': 'OR_COUNTRY', 'LATITUDE': 'OR_LATITUDE', 'LONGITUDE': 'OR_LONGITUDE'}, index={'ONE': 'Row_1'})
df_air_des = df_airports.rename(columns={'IATA_CODE':'DESTINATION_AIRPORT', 'CITY': 'DES_CITY', 'STATE': 'DES_STATE', 'COUNTRY': 'DES_COUNTRY', 'LATITUDE': 'DES_LATITUDE', 'LONGITUDE': 'DES_LONGITUDE'}, index={'ONE': 'Row_1'})

In [None]:
df_final1 = df_hdata.merge(df_air_or, how='left', on='ORIGIN_AIRPORT', left_index=False, right_index=False, sort=False, suffixes=('_x', '_y'), copy=True, indicator=False, validate=None)
df_final2 = df_final1.merge(df_air_des, how='left', on='DESTINATION_AIRPORT', left_index=False, right_index=False, sort=False, suffixes=('_x', '_y'), copy=True, indicator=False, validate=None)

In [None]:
df_final = df_final2.drop(['AIRPORT_y','AIRPORT_x'], axis = 1)
df_final.head()

In [None]:
# Parse the schedule columns, then derive calendar features from the
# scheduled departure timestamp.
df_final['SCHEDULED_DEPARTURE'] = pd.to_datetime(df_final['SCHEDULED_DEPARTURE'], format='%Y-%m-%d %H:%M:%S')
df_final['SCHEDULED_ARRIVAL'] = pd.to_datetime(df_final['SCHEDULED_ARRIVAL'], format='%H:%M:%S')
df_final['TIME_AR'] = df_final['SCHEDULED_ARRIVAL'].dt.time
df_final['TIME_DE'] = df_final['SCHEDULED_DEPARTURE'].dt.time
df_final['YEAR_DE'] = df_final['SCHEDULED_DEPARTURE'].dt.year
df_final['MONTH_DE'] = df_final['SCHEDULED_DEPARTURE'].dt.month
df_final['DAY_DE'] = df_final['SCHEDULED_DEPARTURE'].dt.day

df_final['WEEKDAY_DE'] = df_final['SCHEDULED_DEPARTURE'].dt.weekday
# A weekend dummy may be more relevant than the raw weekday.
# dt.weekday numbers Monday as 0 and Sunday as 6, so the weekend is
# Saturday (5) and Sunday (6). Rows without a weekday compare False -> 0.
df_final['WEEKEND_DE'] = (df_final['WEEKDAY_DE'] >= 5).astype(int)

df_final['HOUR_DE'] = df_final['SCHEDULED_DEPARTURE'].dt.hour

In [None]:
# Series.max() skips missing values. The builtin max() compares elements
# pairwise, and every comparison against NaN is False, so its result depends
# on where the NaNs sit in the column.
max_departure = df_final['DEPARTURE_DELAY'].max()
max_schedule = df_final['SCHEDULED_TIME'].max()

In [None]:
fmt="%Y/%m/%d %H:%M:%S"
fmt2="%H:%M:%S"

In [None]:
df_final['DEP_DATE_TIME']=pd.to_datetime(df_final['SCHEDULED_DEPARTURE'], format = fmt) + pd.to_timedelta(df_final['DEPARTURE_DELAY'], 'm')
df_final['ARR_DATE_TIME']=df_final['DEP_DATE_TIME']+ pd.to_timedelta(df_final['TAXI_IN'],'m') + pd.to_timedelta(df_final['TAXI_OUT'], 'm') + pd.to_timedelta(df_final['AIR_TIME'], 'm')
df_final['SCH_ARR_DATE_TIME']=pd.to_datetime(df_final['SCHEDULED_DEPARTURE'], format = fmt) + pd.to_timedelta(df_final['SCHEDULED_TIME'], 'm')
df_final['ARRIVAL_DELAY_v1']=(df_final['ARR_DATE_TIME'] - df_final['SCH_ARR_DATE_TIME']).astype('timedelta64[m]')
df_final['ARRIVAL_DELAY_v2']=(pd.to_datetime(df_final['ARRIVAL_TIME'], format=fmt2) - pd.to_datetime(df_final['SCHEDULED_ARRIVAL'], format = fmt2)).astype('timedelta64[m]')
df_final['ARRIVAL_DELAY'] = (df_final['ARRIVAL_DELAY_v1'] > 15).astype(int)
df_final['SPEED'] = df_final['DISTANCE']/df_final['SCHEDULED_TIME']
df_final = df_final.drop(['SCHEDULED_DEPARTURE','DEPARTURE_TIME','SCHEDULED_ARRIVAL','ARRIVAL_TIME','TAXI_OUT','WHEELS_OFF','ELAPSED_TIME','AIR_TIME','WHEELS_ON','TAXI_IN','OR_COUNTRY','DES_COUNTRY','YEAR_DE','WEEKDAY_DE','DEP_DATE_TIME','ARR_DATE_TIME','SCH_ARR_DATE_TIME','ARRIVAL_DELAY_v2'], axis = 1)
df_final['DEPARTURE_DELAY'] = df_final['DEPARTURE_DELAY'].fillna(0)
df_final['CANCELLATION_REASON'] = df_final['CANCELLATION_REASON'].fillna(0)
df_final['AIR_SYSTEM_DELAY'] = df_final['AIR_SYSTEM_DELAY'].fillna(0)
df_final['SECURITY_DELAY'] = df_final['SECURITY_DELAY'].fillna(0)
df_final['AIRLINE_DELAY'] = df_final['AIRLINE_DELAY'].fillna(0)
df_final['LATE_AIRCRAFT_DELAY'] = df_final['LATE_AIRCRAFT_DELAY'].fillna(0)
df_final['WEATHER_DELAY'] = df_final['WEATHER_DELAY'].fillna(0)
df_final['ARRIVAL_DELAY_v1'] = df_final['ARRIVAL_DELAY_v1'].fillna(0)

In [None]:
df_final = df_final.dropna(axis=0, how='any')


In [None]:
memory_usage = df_final.memory_usage(deep=True) / 1024 ** 2
memory_usage.sum()

In [None]:

def reduce_memory_usage(df, verbose=True, allow_float16=False):
    """Downcast the numeric columns of ``df`` in place to a narrower dtype.

    Signed-integer columns are cast to the smallest of int8/int16/int32/int64
    whose range contains the column's min and max (bounds inclusive). Float
    columns are cast to float32 when their range fits, otherwise float64.
    Columns of any other dtype are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        Print the final size in MiB and the percentage saved.
    allow_float16 : bool, default False
        Also consider float16 for float columns. Off by default because
        float16 keeps only about three significant decimal digits (4983.0
        becomes 4984.0), which silently alters the data.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    numerics = ["int8", "int16", "int32", "int64", "float16", "float32", "float64"]
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32) if allow_float16 else (np.float32,)

    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type).startswith("int"):
            # Narrowest integer type that can hold both extremes. An empty
            # column has NaN extremes, matches nothing and keeps its dtype.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # Fall back to float64 when no narrower type fits. That includes
            # all-NaN columns, whose extremes compare False against any bound.
            target = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)
    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        # Guard the percentage against a frame that reports zero bytes.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print(
            "Mem. usage decreased to {:.2f} Mb ({:.1f}% reduction)".format(
                end_mem, reduction
            )
        )
    return df

In [None]:
reduced_df = reduce_memory_usage(df_final, verbose=True)

In [None]:
memory_usage = df_final.memory_usage(deep=True) / 1024 ** 2
memory_usage.sum()

In [None]:
# Draw 20% of the rows. The seed is fixed so that the sample, and every result
# computed from it, is identical after Restart Kernel -> Run All.
sample_df = df_final.sample(n=int(len(df_final) * 0.2), random_state=123)
sample_df.shape

In [None]:
fig, ax = plt.subplots(figsize=(12, 9))

# Overlay the arrival-delay histogram of the full data and of the sample to
# check that the sample follows the same distribution.
layers = [
    (df_final, "Original data", "red"),
    (sample_df, "Sample data", "green"),
]
for frame, label, color in layers:
    sns.histplot(data=frame, x="ARRIVAL_DELAY_v1", label=label, color=color, alpha=0.3, bins=15)

plt.legend()
plt.show();

We're going to use the sample as the main dataset; otherwise training would take too much time.

In [None]:
# From here on `df_final` refers to the sampled rows. It is the same object as
# `sample_df`, not a copy, and the full frame is no longer reachable under this name.
df_final = sample_df 

In [None]:
df_model1 = df_final[['AIRLINE','ORIGIN_AIRPORT','DESTINATION_AIRPORT','DEPARTURE_DELAY',
                      'SCHEDULED_TIME','DISTANCE','AIR_SYSTEM_DELAY','SECURITY_DELAY',
                      'AIRLINE_DELAY','LATE_AIRCRAFT_DELAY','WEATHER_DELAY',
                      'OR_LATITUDE','OR_LONGITUDE','DES_LATITUDE','DES_LONGITUDE',
                      'WEEKEND_DE','HOUR_DE','SPEED','ARRIVAL_DELAY_v1','MONTH_DE']]
df_features = df_model1[['AIRLINE','ORIGIN_AIRPORT','DESTINATION_AIRPORT','DEPARTURE_DELAY',
                         'SCHEDULED_TIME','DISTANCE','AIR_SYSTEM_DELAY','SECURITY_DELAY',
                         'AIRLINE_DELAY','LATE_AIRCRAFT_DELAY','WEATHER_DELAY',
                         'OR_LATITUDE','OR_LONGITUDE','DES_LATITUDE','DES_LONGITUDE',
                         'WEEKEND_DE','HOUR_DE','SPEED']]

df_target = df_model1['ARRIVAL_DELAY_v1']
X_type = np.array(df_features.dtypes)
print(X_type)

In [None]:
df_features['WEEKEND_DE'] = df_features['WEEKEND_DE'].astype(str)
df_features['DISTANCE'] = df_features['DISTANCE'].astype(float)
df_features['HOUR_DE'] = df_features['HOUR_DE'].astype(float)

In [None]:
numerical_ix = X_type==np.dtype('float64')
numerical_cols = np.where(numerical_ix)

categorical_ix = X_type==np.dtype('O')
categorical_cols = np.where(categorical_ix)

print('categorical cols:',categorical_cols[0])
print('numerical cols:',numerical_cols[0])

In [None]:
t = [('cat', OneHotEncoder(), categorical_cols[0]), ('num', StandardScaler(), numerical_cols[0])]
col_transform = ColumnTransformer(transformers=t)
df_features = col_transform.fit_transform(df_features)

In [None]:
# Shuffled 60/40 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    df_features,
    df_target,
    test_size=0.4,
    shuffle=True,
    random_state=123,
)

In [None]:
def Factor_model(model, X, y):
    """Estimate a model's out-of-sample RMSE with 5-fold cross-validation.

    The rows are cut into five contiguous, unshuffled folds (the same
    partition an unshuffled 5-fold split produces). For each fold the model
    is refitted on the other four folds and scored on the held-out one.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is refitted once
        per fold, so on return it holds the fit of the last fold.
    X : DataFrame, ndarray or sparse matrix
        Feature rows. DataFrames are indexed by position.
    y : Series or ndarray
        Target values aligned row by row with ``X``.

    Returns
    -------
    float
        Mean of the five per-fold RMSE values (also printed).

    Raises
    ------
    ValueError
        If ``X`` has fewer than five rows.
    """
    n_splits = 5
    n_samples = X.shape[0]
    if n_samples < n_splits:
        raise ValueError(
            "Cannot build {0} folds from {1} samples".format(n_splits, n_samples)
        )

    def take_rows(data, rows):
        # pandas objects need positional .iloc; arrays and sparse matrices
        # accept the index array directly.
        return data.iloc[rows] if hasattr(data, "iloc") else data[rows]

    # Seeds numpy's global generator for models that draw from it.
    np.random.seed(123)
    all_rows = np.arange(n_samples)
    RMSE = []
    for test_index in np.array_split(all_rows, n_splits):
        # all_rows[i] == i, so deleting the held-out positions leaves the training rows.
        train_index = np.delete(all_rows, test_index)
        model.fit(take_rows(X, train_index), take_rows(y, train_index))
        pred_y = model.predict(take_rows(X, test_index))

        truth = np.asarray(take_rows(y, test_index), dtype=float)
        # Align the prediction with the target so that an (n, 1) output does
        # not broadcast against an (n,) target.
        residual = truth - np.asarray(pred_y, dtype=float).reshape(truth.shape)
        fold_rmse = float(np.sqrt(np.mean(residual ** 2)))
        RMSE.append(fold_rmse)
        print('RMSE:', fold_rmse)

    mean_rmse = float(np.mean(RMSE))
    print("Cross-Validation RMSE {0}".format(mean_rmse))
    return mean_rmse

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

model=LinearRegression()
model=model.fit(X_train,y_train)
slope=model.coef_
coef=model.intercept_
print(slope.flatten())
print(coef)

In [None]:
y_pred=model.predict(X_train)

In [None]:
from sklearn.metrics import r2_score
r2_score(y_train,y_pred)

In [None]:
#!pip install scikit-learn-intelex

In [None]:
from sklearn.linear_model import SGDRegressor
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
from sklearn.linear_model import LinearRegression
from sklearnex import patch_sklearn


In [None]:
X_train.shape

In [None]:
patch_sklearn()
lr = SGDRegressor()
sfs = SFS(lr, k_features='best', forward=True, floating=False, 
          scoring='neg_mean_squared_error', cv=10)
model = sfs.fit(X_train, y_train)

fig = plot_sfs(sfs.get_metric_dict(), kind='std_err')

plt.title('Sequential Forward Selection (w. StdErr)')
plt.grid()
plt.show()

In [None]:
print('Selected features:', sfs.k_feature_idx_)

In [None]:
from sklearn import ensemble,gaussian_process,linear_model,naive_bayes,neighbors,svm,tree

In [None]:
# Regressors to benchmark. `n_jobs` is passed only to the estimators whose
# constructor accepts it (passing it to the others raises TypeError), and the
# randomised estimators get a fixed seed so the comparison is reproducible.
MLA = [
    #Ensemble Methods
    ensemble.AdaBoostRegressor(random_state = 123),
    ensemble.BaggingRegressor(n_jobs = -1, random_state = 123),
    ensemble.ExtraTreesRegressor(n_jobs = -1, random_state = 123),
    ensemble.GradientBoostingRegressor(random_state = 123),
    #ensemble.RandomForestRegressor(n_jobs = -1),
    #Nearest Neighbor
    neighbors.KNeighborsRegressor(n_jobs = -1),
    #Trees
    tree.DecisionTreeRegressor(random_state = 123),
    tree.ExtraTreeRegressor(random_state = 123)
    ]

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_curve, roc_auc_score,precision_score,recall_score,auc

In [None]:
MLA_columns = ['MLA Name', 'MLA Train Accuracy', 'MLA Test Accuracy', 'MLA AUC']
results=[]
comparison_rows = []

# roc_curve needs binary labels, but the target is a delay in minutes.
# A flight counts as delayed when it arrives more than 15 minutes late, the
# same threshold used for the ARRIVAL_DELAY flag; the predicted delay is the
# ranking score.
y_test_delayed = (y_test > 15).astype(int)

for row_index, alg in enumerate(MLA):

    cv_results = cross_val_score(alg, X_train, y_train, cv=10)
    results.append(cv_results)
    predicted = alg.fit(X_train, y_train).predict(X_test)
    fp, tp, th = roc_curve(y_test_delayed, predicted)
    MLA_name = alg.__class__.__name__
    comparison_rows.append({
        'MLA Name': MLA_name,
        # For scikit-learn regressors score() is R^2, not classification
        # accuracy; the column names are kept for compatibility.
        'MLA Train Accuracy': round(alg.score(X_train, y_train), 4),
        'MLA Test Accuracy': round(alg.score(X_test, y_test), 4),
        'MLA AUC': auc(fp, tp),
    })

# Build the table once from the collected rows and rank by test score.
MLA_compare = pd.DataFrame(comparison_rows, columns=MLA_columns).sort_values(
    by=['MLA Test Accuracy'], ascending=False
)
MLA_compare