# Model Development: Airline Delay
<img src="../assets/airline-pic.jpg" alt="Title-Pic">

# Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import warnings

from datetime import datetime
from pandas import Series, DataFrame
from imblearn.over_sampling import BorderlineSMOTE
from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, f1_score, accuracy_score


# Print floats in fixed-point notation instead of scientific notation.
np.set_printoptions(suppress=True)
# Global seaborn settings: default figure size plus title / axis-label font options.
sns.set(rc={"figure.figsize": (10, 6), "axes.titlesize": 20, "axes.titleweight": "bold", "axes.labelsize": 15})
# NOTE(review): this silences every warning for the rest of the notebook; consider narrowing the filter.
warnings.filterwarnings('ignore')

# Data

In [2]:
def export_model(model, model_name, models_dir="../trained_models"):
    """Pickle ``model`` into ``models_dir`` under a timestamp-prefixed file name.

    Parameters
    ----------
    model : object
        Any picklable object (e.g. a fitted estimator).
    model_name : str
        Suffix of the file name; the file is written as
        ``<YYYYmmddHHMMSS>_<model_name>``.
    models_dir : str, optional
        Target directory. It is created (including parents) when missing, so
        the export also works on a fresh checkout.

    Returns
    -------
    str
        Path of the written file.
    """
    import os  # local import: the notebook's import cell does not provide `os`

    # Same 14-digit stamp as before, built with strftime instead of string slicing.
    timestamp_string = datetime.now().strftime("%Y%m%d%H%M%S")
    os.makedirs(models_dir, exist_ok=True)
    export_path = f"{models_dir}/{timestamp_string}_{model_name}"
    with open(export_path, "wb") as model_file:
        pickle.dump(model, model_file)

    return export_path

In [3]:
def load_model(model_path):
    """Return the object unpickled from the file at ``model_path``.

    Unpickling can execute code embedded in the file, so only pass paths to
    files you trust (for example ones written by ``export_model``).
    """
    with open(model_path, "rb") as pickled_file:
        restored_model = pickle.load(pickled_file)
    return restored_model

In [8]:
def print_top_rankings(model, top=10):
    """Print the best-ranked hyperparameter candidates of a fitted search.

    Reads ``params``, ``mean_test_score`` and ``rank_test_score`` from
    ``model.cv_results_``, orders the candidates by ascending rank and prints
    rank, score and parameters for at most ``top`` of them.
    """
    results = model.cv_results_
    candidates = list(
        zip(results["params"], results["mean_test_score"], results["rank_test_score"])
    )
    # list.sort is stable, so candidates sharing a rank keep their original order.
    candidates.sort(key=lambda candidate: candidate[2])

    for position, (params, score, rank) in enumerate(candidates):
        if position >= top:
            return
        print(f"Rank: {rank}\tScore: {score}\nParam: {params}\n")
        

In [4]:
# Load the preprocessed feature matrix and target vector from the .npz archive.
# NOTE(review): allow_pickle=True lets the archive run pickled code on load; only use it with trusted files.
MODEL_DATA = np.load("../model_data/nasa_prep_data.npz", allow_pickle=True)
inputs, targets = MODEL_DATA["inputs"], MODEL_DATA["targets"]

# 80 / 10 / 10 split: hold out 20 % first, then halve that hold-out into test and validation.
X_train, X_tmp_test, y_train, y_tmp_test = train_test_split(
    inputs, targets, test_size=0.2, random_state=101
)
X_test, X_val, y_test, y_val = train_test_split(
    X_tmp_test, y_tmp_test, test_size=0.5, random_state=101
)

# Sanity check: shapes of the three feature matrices and the three target vectors.
(X_train.shape, X_test.shape, X_val.shape), (y_train.shape, y_test.shape, y_val.shape)

(((478590, 613), (59824, 613), (59824, 613)), ((478590,), (59824,), (59824,)))

In [5]:
# Stack features and target into one array (target becomes the last column) and
# shuffle the rows with a fixed seed so the subsets sliced from it are reproducible.
# NOTE(review): `test_arr` is built from the full `inputs`/`targets`, so it also
# contains the rows that went into `X_train`; scores computed on slices of it are
# not clean hold-out scores.
test_arr = np.hstack([inputs, targets.reshape(-1, 1)])
np.random.default_rng(101).shuffle(test_arr)
test_arr[:50000]

array([[ 0.56563027, -1.69404951,  0.        , ...,  0.        ,
         0.        ,  1.        ],
       [-1.59228874, -1.99543203,  0.        , ...,  0.        ,
         1.        ,  0.        ],
       [-1.51676157, -0.55738213,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [-1.11447689,  1.27141407,  0.        , ...,  1.        ,
         0.        ,  1.        ],
       [ 0.137643  ,  1.83987034,  0.        , ...,  0.        ,
         1.        ,  1.        ],
       [-1.53834076,  0.5082208 ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [11]:
# Slice the shuffled array into a head subset (first SIZE rows) and the remainder.
# The last column is the target; here it is kept as a column vector of shape (n, 1).
SIZE = 10000
test_X_train, test_y_train = test_arr[:SIZE, :-1], test_arr[:SIZE, -1].reshape(-1, 1)
test_X_test, test_y_test = test_arr[SIZE:, :-1], test_arr[SIZE:, -1].reshape(-1, 1)

In [12]:
# Mean of the target column in the head subset; with 0/1 labels this is the share of class 1.
test_y_train.mean()

0.5012

# Build the Model

## Model1: Gradient Boosting

In [46]:
# Base estimator; every hyperparameter is supplied through the grid below.
xgb_tmp_model = XGBClassifier()

# Single-candidate grid: each list holds exactly one value, so the search
# cross-validates one configuration only.
# NOTE(review): XGBoost documents `eta` as an alias of `learning_rate`; both are
# set here with different values (0.05 vs 0.01) — confirm which one is intended.
xgb_grid_params = {
    "eval_metric": ["logloss"],
    "n_estimators": [900],
    "learning_rate": [0.01],
    "min_child_weight": [5],
    "eta": [0.05],
    "gamma": [6],
    "subsample": [0.5],
    "max_depth": [10],
    "colsample_bytree": [0.5]
}

In [50]:
# 2-fold cross-validated search over the XGBoost grid, scored with micro-averaged F1.
xgb_grid_model = GridSearchCV(
    estimator=xgb_tmp_model,
    param_grid=xgb_grid_params,
    scoring="f1_micro",
    cv=2,
    verbose=3,
)

# Trailing `;` hides the estimator repr; the verbose CV log is still printed.
xgb_grid_model.fit(X_train, y_train);

Fitting 2 folds for each of 1 candidates, totalling 2 fits
[CV 1/2] END colsample_bytree=0.5, eta=0.05, eval_metric=logloss, gamma=6, learning_rate=0.01, max_depth=10, min_child_weight=5, n_estimators=900, subsample=0.5;, score=0.652 total time=13.8min
[CV 2/2] END colsample_bytree=0.5, eta=0.05, eval_metric=logloss, gamma=6, learning_rate=0.01, max_depth=10, min_child_weight=5, n_estimators=900, subsample=0.5;, score=0.653 total time=13.6min


In [51]:
# Report on the head subset (first SIZE rows) of the shuffled `test_arr`.
# NOTE(review): `test_arr` is drawn from the full dataset, so these rows overlap with `X_train`; treat the numbers as optimistic.
print(classification_report(test_y_train, xgb_grid_model.best_estimator_.predict(test_X_train)))

              precision    recall  f1-score   support

         0.0       0.65      0.71      0.68      4982
         1.0       0.68      0.62      0.65      5018

    accuracy                           0.66     10000
   macro avg       0.67      0.66      0.66     10000
weighted avg       0.67      0.66      0.66     10000



In [52]:
# Report on the remaining rows of `test_arr` (everything after the first SIZE rows).
# NOTE(review): these rows come from the full dataset and therefore include training rows; this is not a hold-out score.
print(classification_report(test_y_test, xgb_grid_model.best_estimator_.predict(test_X_test)))

              precision    recall  f1-score   support

         0.0       0.65      0.71      0.68    294137
         1.0       0.68      0.61      0.64    294101

    accuracy                           0.66    588238
   macro avg       0.66      0.66      0.66    588238
weighted avg       0.66      0.66      0.66    588238



In [62]:
# F1 on the held-out test split (f1_score's default averaging is "binary", i.e. the positive class).
f1_score(y_test, xgb_grid_model.best_estimator_.predict(X_test))

0.6402358763755067

In [53]:
# List up to six ranked candidates; the grid above holds a single candidate, so one entry is printed.
print_top_rankings(xgb_grid_model, top=6)

Rank: 1	Score: 0.6525021417079337
Param: {'colsample_bytree': 0.5, 'eta': 0.05, 'eval_metric': 'logloss', 'gamma': 6, 'learning_rate': 0.01, 'max_depth': 10, 'min_child_weight': 5, 'n_estimators': 900, 'subsample': 0.5}



In [54]:
# Hyperparameters of the best-ranked candidate.
xgb_grid_model.best_params_

{'colsample_bytree': 0.5,
 'eta': 0.05,
 'eval_metric': 'logloss',
 'gamma': 6,
 'learning_rate': 0.01,
 'max_depth': 10,
 'min_child_weight': 5,
 'n_estimators': 900,
 'subsample': 0.5}

In [55]:
# NOTE(review): this cell repeats the previous one verbatim; one of the two can be dropped.
xgb_grid_model.best_params_

{'colsample_bytree': 0.5,
 'eta': 0.05,
 'eval_metric': 'logloss',
 'gamma': 6,
 'learning_rate': 0.01,
 'max_depth': 10,
 'min_child_weight': 5,
 'n_estimators': 900,
 'subsample': 0.5}

In [58]:
# Pickle the best XGBoost estimator; `export_model` returns the path it wrote to.
export_model(xgb_grid_model.best_estimator_, "xgb_model")

In [16]:
# Reload a previously exported model and predict on the validation split.
# NOTE(review): the path is hard-coded to one specific export run; a fresh
# Restart & Run All writes a new timestamp, so this file may be stale or missing.
xgb_loaded_model = load_model("../trained_models/20220724221507_xgb_model")
xgb_val_preds = xgb_loaded_model.predict(X_val)

# Trailing `;` suppresses the repr, so this line displays nothing.
xgb_loaded_model;

In [17]:
# Per-class precision / recall / F1 of the reloaded XGBoost model on the validation split.
print(classification_report(y_val, xgb_val_preds))

              precision    recall  f1-score   support

           0       0.64      0.71      0.67     29868
           1       0.68      0.61      0.64     29956

    accuracy                           0.66     59824
   macro avg       0.66      0.66      0.66     59824
weighted avg       0.66      0.66      0.66     59824



## Model2: SVM

In [8]:
# Re-slice the shuffled array for the SVM experiments: first SIZE rows vs. the rest.
# Both target vectors are kept 1-D so they share one shape convention (the
# original left `test_y_test` as an (n, 1) column while `test_y_train` was 1-D).
SIZE = 10000
test_X_train = test_arr[:SIZE, :-1]
test_y_train = test_arr[:SIZE, -1]
test_X_test = test_arr[SIZE:, :-1]
test_y_test = test_arr[SIZE:, -1]

In [9]:
# Compare the shape of the subset targets with the full training targets.
test_y_train.shape, y_train.shape

((10000,), (478590,))

In [10]:
# Base estimator; every hyperparameter is supplied through the grid below.
svm_tmp_model = SVC()

# Single-candidate grid: a degree-8 polynomial kernel with C=7, gamma="auto" and coef0=1.
svm_grid_params = {
    "C": [7],
    "gamma": ["auto"],
    "kernel": ["poly"],
    "degree": [8],
    "coef0": [1]
}

In [12]:
# 2-fold cross-validated search over the SVM grid, scored with macro-averaged F1
# (the XGBoost search earlier in this notebook uses "f1_micro").
svm_grid_model = GridSearchCV(svm_tmp_model, svm_grid_params, cv=2, scoring="f1_macro", verbose=3)

# NOTE(review): this fits on the full training split, although a SIZE-row subset is
# prepared just above; the recorded log shows no completed CV fold — confirm which
# data the SVM is meant to be trained on.
svm_grid_model.fit(X_train, y_train)

Fitting 2 folds for each of 1 candidates, totalling 2 fits


In [None]:
# The ranking helper in this notebook is named `print_top_rankings`; the bare
# `top_rankings` name is not defined and raised a NameError.
print_top_rankings(svm_grid_model, top=10)

In [None]:
# Hyperparameters of the best-ranked SVM candidate.
svm_grid_model.best_params_

{'C': 7, 'coef0': 1, 'degree': 8, 'gamma': 'auto', 'kernel': 'poly'}

In [None]:
# Pickle the best SVM estimator; `export_model` returns the path it wrote to.
export_model(svm_grid_model.best_estimator_, "svm_model")

In [None]:
# Per-class precision / recall / F1 of the best SVM estimator on the validation split.
# NOTE(review): the recorded output lists a support of 10000 while `y_val` has 59824 rows, so it appears to come from an earlier version of this cell.
print(classification_report(y_val, svm_grid_model.best_estimator_.predict(X_val)))

              precision    recall  f1-score   support

         0.0       0.64      0.68      0.66      4941
         1.0       0.67      0.62      0.64      5059

    accuracy                           0.65     10000
   macro avg       0.65      0.65      0.65     10000
weighted avg       0.65      0.65      0.65     10000



In [22]:
# F1 on the held-out test split (f1_score's default averaging is "binary", i.e. the positive class).
f1_score(y_test, svm_grid_model.best_estimator_.predict(X_test))

0.5902748854643898

In [None]:
# `svm_model` is never defined in this notebook (NameError on a fresh kernel);
# export the grid search's best estimator instead.
export_model(svm_grid_model.best_estimator_, "svm_model")

In [None]:
# Evaluate the tuned SVM on the held-out test split. `svm_model` is never defined
# in this notebook, so the grid search's best estimator is used directly.
print(classification_report(y_test, svm_grid_model.best_estimator_.predict(X_test)))

## Model3: TODO

## Model4: TODO

## Model5: TODO

# Final Model Selection

# Export the Model