In [167]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os, re, operator, warnings
warnings.filterwarnings('ignore')
import pickle

In [2]:
#Define the location of the Data
path = '../data/'

# Import Feature Engineered Dataset

In [3]:
df = pd.read_csv(f'{path}/interim/2017FeatEng.csv')

In [4]:
df.head()

Unnamed: 0,Quarter,Month,DayofMonth,DayOfWeek,FlightDate,Reporting_Airline,Tail_Number,Flight_Number_Reporting_Airline,Origin,OriginState,...,ARR_HOUR,DEP_HOUR,DOM_DIRECTION,flight_freq,LoadFactor,mf_name,mf_year,plane_model,eng_model,plane_age
0,2,5,6,6,2017-05-06 00:00:00,WN,N7824A,4652,SJC,CA,...,8,6,East,17,0.745438,BOEING,2001.0,737-7BK,CFM56 SERIES,16.0
1,2,5,6,6,2017-05-06 00:00:00,WN,N8522P,4971,SJC,CA,...,18,17,East,7,0.745438,BOEING,2017.0,737-800,CFM56-7B27E/F,0.0
2,2,5,6,6,2017-05-06 00:00:00,WN,N8617E,5113,SJC,CA,...,12,11,East,8,0.745438,BOEING,2013.0,737-8H4,CFM56-7B27E,4.0
3,2,5,6,6,2017-05-06 00:00:00,WN,N450WN,5150,SJC,CA,...,21,20,East,3,0.745438,BOEING,2004.0,737-7H4,CFM56 SERIES,13.0
4,2,5,6,6,2017-05-06 00:00:00,WN,N498WN,5711,SJC,CA,...,14,13,East,10,0.745438,BOEING,2005.0,737-7H4,CFM56 SERIES,12.0


In [6]:
df.shape

(5579279, 30)

In [5]:
#create a list with top 20 airport origins by flight volume
top_20 = list(df.groupby('Origin').size().sort_values(ascending=False)[0:20].index)
#create a list with top 5 airlines by flight volume
top_5 = list(df.groupby('Reporting_Airline').size().sort_values(ascending=False)[0:5].index)
#only use top 20 origin airports
top_df = df[df.Origin.isin(top_20)]
#only use top 5 airlines
top_df = top_df[top_df.Reporting_Airline.isin(top_5)]

In [7]:
top_df.shape

(2458906, 30)

In [8]:
#Add a column classifying if the flight is delayed or not using 15 minutes as delayed
top_df['STATUS'] = top_df.ArrDelay.apply(lambda x: 1 if x >= 15 else 0)

# Label Encoding Categorical Values

In [9]:
from sklearn.preprocessing import LabelEncoder

In [11]:
#Ensure the columns are strings
top_df['mf_name'] = top_df['mf_name'].astype(str)
top_df['mf_year'] = top_df['mf_year'].astype(str)
top_df['plane_model'] = top_df['plane_model'].astype(str)
top_df['eng_model'] = top_df['eng_model'].astype(str)

In [13]:
le = LabelEncoder()
top_df['Reporting_Airline'] = le.fit_transform(top_df.Reporting_Airline.values)
top_df['Tail_Number'] = le.fit_transform(top_df.Tail_Number.values)
top_df['Origin'] = le.fit_transform(top_df.Origin.values)
top_df['Dest'] = le.fit_transform(top_df.Dest.values)
top_df['DOM_DIRECTION'] = le.fit_transform(top_df.DOM_DIRECTION.values)
top_df['OriginState'] = le.fit_transform(top_df.OriginState.values)
top_df['DestState'] = le.fit_transform(top_df.DestState.values)
top_df['mf_name'] = le.fit_transform(top_df.mf_name.values)
top_df['mf_year'] = le.fit_transform(top_df.mf_year.values)
top_df['plane_model'] = le.fit_transform(top_df.plane_model.values)
top_df['eng_model'] = le.fit_transform(top_df.eng_model.values)

In [14]:
top_df = top_df.drop(columns='FlightDate')

In [15]:
top_df.head()

Unnamed: 0,Quarter,Month,DayofMonth,DayOfWeek,Reporting_Airline,Tail_Number,Flight_Number_Reporting_Airline,Origin,OriginState,DestState,...,DEP_HOUR,DOM_DIRECTION,flight_freq,LoadFactor,mf_name,mf_year,plane_model,eng_model,plane_age,STATUS
25,2,5,6,6,4,1691,435,19,15,4,...,18,2,1,0.340491,12,39,48,43,16.0,0
26,2,5,6,6,4,1691,451,19,15,4,...,8,2,22,0.340491,12,39,48,43,16.0,0
27,2,5,6,6,4,3116,4720,19,15,18,...,9,0,23,0.805389,12,54,69,57,1.0,0
28,2,5,6,6,4,3048,2435,19,15,42,...,14,0,7,0.833898,12,52,69,56,3.0,0
29,2,5,6,6,4,2598,4757,19,15,5,...,13,0,15,0.795838,12,39,57,35,16.0,0


# Looking at the data Imbalance

In [23]:
down = top_df[top_df.STATUS == 1]
up = top_df[top_df.STATUS == 0]
down = down.Reporting_Airline.count()
up = up.Reporting_Airline.count()
print(f'Delay percentage: {down/(up+down)}')

Delay percentage: 0.1848797798695843


# Resampling the data to deal with Imbalance

In [20]:
from sklearn.utils import resample

In [24]:
#let's first separate majority class and minority class and resample

df_majority = top_df[top_df.STATUS == 0]
df_minority = top_df[top_df.STATUS == 1]


# Downsample majority class
df_majority_downsampled = resample(df_majority, 
                                 replace=False,    # sample without replacement
                                 n_samples=down,     # to match minority class
                                 random_state=42) # reproducible results
# combine the new dataframes
df_downsampled = pd.concat([df_majority_downsampled, df_minority])

df_downsampled.STATUS.value_counts()

1    454602
0    454602
Name: STATUS, dtype: int64

# Splitting the Data into Train / Test Split

In [25]:
df_downsampled.columns

Index(['Quarter', 'Month', 'DayofMonth', 'DayOfWeek', 'Reporting_Airline',
       'Tail_Number', 'Flight_Number_Reporting_Airline', 'Origin',
       'OriginState', 'DestState', 'Dest', 'CRSElapsedTime', 'ArrDelay',
       'Distance', 'DistanceGroup', 'tempF', 'wind', 'ave_vis', 'precip_sum',
       'ARR_HOUR', 'DEP_HOUR', 'DOM_DIRECTION', 'flight_freq', 'LoadFactor',
       'mf_name', 'mf_year', 'plane_model', 'eng_model', 'plane_age',
       'STATUS'],
      dtype='object')

In [None]:
df_downsampled.describe()

In [191]:
df_downsampled['tempF'] = df_downsampled.tempF.apply(lambda x: (x - 32)*(5/9) + 273.15)

In [181]:
df_downsampled['plane_age'] = df_downsampled.plane_age.apply(lambda x: 0 if x < 0 else x)

In [196]:
from sklearn.model_selection import train_test_split
y = df_downsampled.STATUS.values
X = df_downsampled.drop(columns=['STATUS', 'ArrDelay']).values

In [206]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42)

In [29]:
#clean up memory
del df, top_df

# Modeling (untuned)

## Neural Network Classifier

In [96]:
from sklearn.neural_network import MLPClassifier

In [97]:
mlp = MLPClassifier(random_state=seed)

In [98]:
mlp.fit(X_train, y_train)
y_pred_mlp = mlp.predict(X_test)
# Evaluate
accuracy = accuracy_score(y_test, y_pred_mlp)
precision = precision_score(y_test, y_pred_mlp)
recall = recall_score(y_test, y_pred_mlp)
cm = confusion_matrix(y_test, y_pred_mlp)
# Evaluate clf's accuracy on the test set
print(f'MLP: accuracy: {accuracy}, precision: {precision},\
          recall: {recall}, confusion matrix: {cm}')

name: AdaBoost, accuracy: 0.6047810999719535, precision: 0.5916942475892525,          recall: 0.6809998463126001, confusion matrix: [[47939 42808]
 [29059 62035]]


In [99]:
parameter_space = {
    'hidden_layer_sizes': [(50,50,50), (50,100,50), (100,)],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.0001, 0.05],
    'learning_rate': ['constant','adaptive'],
}

In [100]:
from sklearn.model_selection import GridSearchCV

clf_mlp = GridSearchCV(mlp, parameter_space, n_jobs=-1, cv=3)
clf_mlp.fit(X_train, y_train)

GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=MLPClassifier(activation='relu', alpha=0.0001,
                                     batch_size='auto', beta_1=0.9,
                                     beta_2=0.999, early_stopping=False,
                                     epsilon=1e-08, hidden_layer_sizes=(100,),
                                     learning_rate='constant',
                                     learning_rate_init=0.001, max_iter=200,
                                     momentum=0.9, n_iter_no_change=10,
                                     nesterovs_momentum=True, power_t=0.5,
                                     random_sta...
                                     solver='adam', tol=0.0001,
                                     validation_fraction=0.1, verbose=False,
                                     warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid={'activation': ['tanh', 'relu'],
                         'alph

In [101]:
# Best paramete set
print('Best parameters found:\n', clf_mlp.best_params_)

# All results
means = clf_mlp.cv_results_['mean_test_score']
stds = clf_mlp.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf_mlp.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

Best parameters found:
 {'activation': 'relu', 'alpha': 0.0001, 'hidden_layer_sizes': (50, 100, 50), 'learning_rate': 'constant', 'solver': 'adam'}
0.521 (+/-0.011) for {'activation': 'tanh', 'alpha': 0.0001, 'hidden_layer_sizes': (50, 50, 50), 'learning_rate': 'constant', 'solver': 'sgd'}
0.548 (+/-0.039) for {'activation': 'tanh', 'alpha': 0.0001, 'hidden_layer_sizes': (50, 50, 50), 'learning_rate': 'constant', 'solver': 'adam'}
0.525 (+/-0.006) for {'activation': 'tanh', 'alpha': 0.0001, 'hidden_layer_sizes': (50, 50, 50), 'learning_rate': 'adaptive', 'solver': 'sgd'}
0.548 (+/-0.039) for {'activation': 'tanh', 'alpha': 0.0001, 'hidden_layer_sizes': (50, 50, 50), 'learning_rate': 'adaptive', 'solver': 'adam'}
0.520 (+/-0.008) for {'activation': 'tanh', 'alpha': 0.0001, 'hidden_layer_sizes': (50, 100, 50), 'learning_rate': 'constant', 'solver': 'sgd'}
0.523 (+/-0.006) for {'activation': 'tanh', 'alpha': 0.0001, 'hidden_layer_sizes': (50, 100, 50), 'learning_rate': 'constant', 'solver

In [102]:
y_true, y_pred = y_test, clf_mlp.predict(X_test)

from sklearn.metrics import classification_report
print('Results on the test set:')
print(classification_report(y_true, y_pred))

Results on the test set:
              precision    recall  f1-score   support

           0       0.63      0.60      0.61     90747
           1       0.62      0.64      0.63     91094

    accuracy                           0.62    181841
   macro avg       0.62      0.62      0.62    181841
weighted avg       0.62      0.62      0.62    181841



## LightGBM

In [134]:
import lightgbm as lgb
params = {'learning_rate':0.01,
          'objective' :'binary',
          'num_leaves' : 100,
          'feature_fraction': 0.75, 
          'bagging_fraction': 0.8, 
          'bagging_freq':1,
          'boosting_type' : 'gbdt',
          'metric': 'binary_logloss'}
train_data = lgb.Dataset(X_train, label=y_train)

lgbm = lgb.train(params, train_data, 100)

y_pred_lgb = lgbm.predict(X_test)

In [135]:
y_pred_lgb

array([0.49025986, 0.49985671, 0.48524547, ..., 0.495948  , 0.34475376,
       0.47941924])

In [136]:
y_binary = np.where(y_pred_lgb < 0.5, 0, 1)

In [137]:
print(classification_report(y_test, y_binary))

              precision    recall  f1-score   support

           0       0.64      0.68      0.66     90747
           1       0.66      0.62      0.64     91094

    accuracy                           0.65    181841
   macro avg       0.65      0.65      0.65    181841
weighted avg       0.65      0.65      0.65    181841



In [133]:
# Evaluate
accuracy = accuracy_score(y_test, y_binary)
precision = precision_score(y_test, y_binary)
recall = recall_score(y_test, y_binary)
cm = confusion_matrix(y_test, y_binary)
# Evaluate clf's accuracy on the test set
print(f'LightGBM: accuracy: {accuracy}, precision: {precision},\
        recall: {recall}, confusion matrix: {cm}')

LightGBM: accuracy: 0.6498809399420373, precision: 0.6603526495486647,        recall: 0.6199749709091708, confusion matrix: [[61699 29048]
 [34618 56476]]


## Scikit-Learn Classifiers and XGBoost

In [152]:
#Import classifiers
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier

In [153]:
#instantiate models
seed = 42
clf = RandomForestClassifier(random_state=seed)
xgb = XGBClassifier(random_state=seed)
ada = AdaBoostClassifier(random_state=seed)
xtr = ExtraTreesClassifier(random_state=seed)

In [154]:
classifiers = [('Random Forest', clf), ('XGBoost', xgb), ('ExtraTrees', xtr)]

In [155]:
from sklearn.metrics import confusion_matrix, accuracy_score,\
precision_score, recall_score, precision_recall_curve, classification_report

In [156]:
# Iterate over the pre-defined list of classifiers
for clf_name, clf_algo in classifiers:    
 
    # Fit clf to the training set
    clf_algo.fit(X_train, y_train)    
   
    # Predict y_pred
    y_pred = clf_algo.predict(X_test)
    
    # Evaluate
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)
   
    # Evaluate clf's accuracy on the test set
    print(f'name: {clf_name}')
    print(classification_report(y_test, y_pred))
    # Evaluate clf's accuracy on the test set 
    print(f'accuracy: {accuracy}')
    print(f'precision: {precision}')
    print(f'recall: {recall}')
    print(f'confusion matrix: {cm}')
    print('------------------------------')

name: Random Forest
              precision    recall  f1-score   support

           0       0.62      0.72      0.67     90747
           1       0.67      0.57      0.62     91094

    accuracy                           0.64    181841
   macro avg       0.65      0.64      0.64    181841
weighted avg       0.65      0.64      0.64    181841

accuracy: 0.644420125274278
precision: 0.6708415732805977
recall: 0.5697521241794191
confusion matrix: [[65281 25466]
 [39193 51901]]
------------------------------
name: XGBoost
              precision    recall  f1-score   support

           0       0.64      0.66      0.65     90747
           1       0.65      0.62      0.64     91094

    accuracy                           0.64    181841
   macro avg       0.64      0.64      0.64    181841
weighted avg       0.64      0.64      0.64    181841

accuracy: 0.6436392232774787
precision: 0.6502714751100188
recall: 0.6245087492041188
confusion matrix: [[60151 30596]
 [34205 56889]]
------------

In [158]:
# Import VotingClassifier from sklearn.ensemble
from sklearn.ensemble import VotingClassifier

# Instantiate a VotingClassifier vc
vc = VotingClassifier(estimators=classifiers)     

# Fit vc to the training set
vc.fit(X_train, y_train)   

# Evaluate the test set predictions
y_pred = vc.predict(X_test)

# Calculate accuracy score

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)


# Evaluate clf's accuracy on the test set
print(f'name: Voting Classifier')
print(classification_report(y_test, y_pred))
# Evaluate clf's accuracy on the test set 
print(f'accuracy: {accuracy}')
print(f'precision: {precision}')
print(f'recall: {recall}')
print(f'confusion matrix: {cm}')
print('------------------------------')

name: Voting Classifier
              precision    recall  f1-score   support

           0       0.64      0.73      0.68     90747
           1       0.69      0.60      0.64     91094

    accuracy                           0.66    181841
   macro avg       0.66      0.66      0.66    181841
weighted avg       0.66      0.66      0.66    181841

accuracy: 0.6621718974268729
precision: 0.6864354581222581
recall: 0.5994686807034492
confusion matrix: [[65802 24945]
 [36486 54608]]
------------------------------


# Tuning

In [46]:
from sklearn.model_selection import RandomizedSearchCV
from pprint import pprint

In [163]:
# Number of trees for tree ensambles
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 150, num = 2)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(50, 105, num = 5)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [5, 10, 20]
# Minimum number of samples required at each leaf node
min_samples_leaf = [6, 8, 12]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid

## Random Forest

In [77]:
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [50, 63, 77, 91, 105, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [6, 8, 12],
 'min_samples_split': [5, 10, 20],
 'n_estimators': [70, 85, 100]}


In [78]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
# Random search of parameters, using 3 fold cross validation, 
rf_random = RandomizedSearchCV(estimator = clf, param_distributions = random_grid, n_iter = 20, \
                               cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed: 37.0min
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed: 64.5min finished


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
                                                    n_estimators='warn',
                                                    n_jobs=None

In [79]:
rf_random.best_params_

{'n_estimators': 100,
 'min_samples_split': 10,
 'min_samples_leaf': 8,
 'max_features': 'sqrt',
 'max_depth': None,
 'bootstrap': False}

### Save the best params

In [170]:
#define and store ExtraTreeClassifier best parameters
rf_params = rf_random.best_params_
with open('rf_params', 'wb') as f:
    pickle.dump(rf_params, f)

## XGBoost

In [80]:
parameters = {'nthread':[4], #when use hyperthread, xgboost may become slower
              #'objective':['reg:linear'],
              'learning_rate': [0.01, 0.1],
              'max_depth': [6, 12], 
              'min_child_weight': [2, 6],
              #'verbosity': [1],
              'subsample': [1.0],
              'colsample_bytree': [0.3, 0.5],
              'gamma': [0],
              'n_estimators': [100, 150]}

xgb_rand = RandomizedSearchCV(xgb,
                              parameters,
                              cv = 3,
                              n_jobs = 4,
                              verbose=True)

xgb_rand.fit(X_train, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed: 28.2min finished


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
                   estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                           colsample_bylevel=1,
                                           colsample_bytree=1, gamma=0,
                                           learning_rate=0.1, max_delta_step=0,
                                           max_depth=3, min_child_weight=1,
                                           missing=None, n_estimators=100,
                                           n_jobs=1, nthread=None,
                                           objective='binary:logistic',
                                           random_state=42, reg_alpha=0,
                                           reg_lambda=1, scal...
                   iid='warn', n_iter=10, n_jobs=4,
                   param_distributions={'colsample_bytree': [0.3, 0.5],
                                        'learning_rate': [0.01, 0.1],
                           

In [81]:
xgb_rand.best_params_

{'subsample': 0.7,
 'silent': 1,
 'objective': 'reg:linear',
 'nthread': 4,
 'n_estimators': 100,
 'min_child_weight': 6,
 'max_depth': 10,
 'learning_rate': 0.1,
 'colsample_bytree': 0.3}

### Save the best params

In [169]:
#define and store ExtraTreeClassifier best parameters
xgb_params = xgb_rand.best_params_
with open('xgb_params', 'wb') as f:
    pickle.dump(xgb_params, f)

## ExtraTrees

In [162]:
pprint(xtr.get_params)

<bound method BaseEstimator.get_params of ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
                     max_depth=None, max_features='auto', max_leaf_nodes=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
                     oob_score=False, random_state=42, verbose=0,
                     warm_start=False)>


In [164]:
param_dist = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap
             }

In [165]:
xtr_tune = RandomizedSearchCV(estimator = xtr, param_distributions = param_dist, n_iter = 50, \
                               cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
xtr_tune.fit(X_train, y_train)
xtr_tune.best_params_

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed: 26.0min
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed: 107.1min finished


{'n_estimators': 85,
 'min_samples_split': 10,
 'min_samples_leaf': 6,
 'max_features': 'sqrt',
 'max_depth': 77,
 'bootstrap': False}

### Save the best params

In [168]:
#define and store ExtraTreeClassifier best parameters
xtr_params = xtr_tune.best_params_
with open('xtr_params', 'wb') as f:
    pickle.dump(xtr_params, f)

# Loading best parameters

Random Forest

In [None]:
#import Random Forest best parameters
with open('rf_params', 'rb') as f:
    rf_params = pickle.load(f)

XGBoost

In [None]:
#import XGBoost best parameters
with open('xgb_params', 'rb') as f:
    xgb_params = pickle.load(f)

ExtraTreeClassifier

In [None]:
#import ExtraTreeClassifier best parameters
with open('xtr_params', 'rb') as f:
    xtr_params = pickle.load(f)

# Modeling (tuned)

In [174]:
#instantiate models
seed = 42
new_clf = RandomForestClassifier(**rf_random.best_params_, random_state=seed)
new_xgb = XGBClassifier(**xgb_rand.best_params_, random_state=seed)
new_xtr = ExtraTreesClassifier(**xtr_tune.best_params_, random_state=seed)

In [175]:
classifiers = [('Random Forest', new_clf), ('XGBoost', new_xgb), ('ExtraTrees', new_xtr)]

In [176]:
# Iterate over the pre-defined list of classifiers
for clf_name, clf_algo in classifiers:    

    # Fit clf to the training set
    clf_algo.fit(X_train, y_train)    
    
    # Predict y_pred
    y_pred = clf_algo.predict(X_test)
          
    # Evaluate
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)
    print(f'name: {clf_name}')
    print(classification_report(y_test, y_pred))
    # Evaluate clf's accuracy on the test set 
    print(f'accuracy: {accuracy}')
    print(f'precision: {precision}')
    print(f'recall: {recall}')
    print(f'confusion matrix: {cm}')

name: Random Forest
              precision    recall  f1-score   support

           0       0.68      0.71      0.69     90747
           1       0.70      0.66      0.68     91094

    accuracy                           0.69    181841
   macro avg       0.69      0.69      0.69    181841
weighted avg       0.69      0.69      0.69    181841

accuracy: 0.6856154552603648
precision: 0.6950487535645294
recall: 0.6635673041034535
confusion matrix: [[64226 26521]
 [30647 60447]]
name: XGBoost
              precision    recall  f1-score   support

           0       0.68      0.71      0.70     90747
           1       0.70      0.67      0.68     91094

    accuracy                           0.69    181841
   macro avg       0.69      0.69      0.69    181841
weighted avg       0.69      0.69      0.69    181841

accuracy: 0.6894264769771393
precision: 0.6983737694397011
recall: 0.6689573407688761
confusion matrix: [[64428 26319]
 [30156 60938]]
name: ExtraTrees
              precision  

In [177]:
# Instantiate a VotingClassifier vc
vc = VotingClassifier(estimators=classifiers)     

# Fit vc to the training set
vc.fit(X_train, y_train)   

# Evaluate the test set predictions
y_pred = vc.predict(X_test)

# Calculate accuracy score

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
print(f'Voting Classifier: ')
print(classification_report(y_test, y_pred))
    # Evaluate clf's accuracy on the test set 
print(f'accuracy: {accuracy}')
print(f'precision: {precision}')
print(f'recall: {recall}')
print(f'confusion matrix: {cm}')

Voting Classifier: 
              precision    recall  f1-score   support

           0       0.68      0.71      0.69     90747
           1       0.70      0.67      0.68     91094

    accuracy                           0.69    181841
   macro avg       0.69      0.69      0.69    181841
weighted avg       0.69      0.69      0.69    181841

accuracy: 0.6892120038935113
precision: 0.6978623088894992
recall: 0.6694403583111951
confusion matrix: [[64345 26402]
 [30112 60982]]


# Results Summary

## Untuned

 - Random Forest
 - accuracy: 0.6444
 - precision: 0.6708
 - recall: 0.5697
 - confusion matrix: [[65281 25466]
 [39193 51901]]

 
 |' | Predicted On-time | Predicted Delayed |
 |-- | --- | --- |
 |Actual On-time |65281|25466|
 |Actual Delayed |39193|51901|

 - XGBoost: 
 - accuracy: 0.6436
 - precision: 0.6502
 - recall: 0.6245
 - confusion matrix: [[60151 30596]
 [34205 56889]]

 
 |' | Predicted On-time | Predicted Delayed |
 |-- | --- | --- |
 |Actual On-time |60151|30596|
 |Actual Delayed |34205|56889|

 - Extremely Randomized Trees: 
 - accuracy: 0.6424
 - precision: 0.6659
 - recall: 0.5744
 - confusion matrix: [[64499 26248]
 [38767 52327]]

 
 |' | Predicted On-time | Predicted Delayed |
 |-- | --- | --- |
 |Actual On-time |64499|26248|
 |Actual Delayed |38767|52327|

## Tuned

 - Random Forest, 
 - accuracy: 0.6856
 - precision: 0.6950
 - recall: 0.6635
 - confusion matrix: [[64226 26521]
 [30647 60447]]

 
 |' | Predicted On-time | Predicted Delayed |
 |-- | --- | --- |
 |Actual On-time |64226|26521|
 |Actual Delayed |30647|60447|

 - XGBoost, 
 - accuracy: 0.6894
 - precision: 0.6983
 - recall: 0.6689
 - confusion matrix: [[64428 26319]
 [30156 60938]]

 |' | Predicted On-time | Predicted Delayed |
 |-- | --- | --- |
 |Actual On-time |64428|26319|
 |Actual Delayed |30156|60938|

 - Extremely Randomized Trees
 - accuracy: 0.6824
 - precision: 0.6895
 - recall: 0.6657
 - confusion matrix: [[63448 27299]
 [30449 60645]]

 |' | Predicted On-time | Predicted Delayed |
 |-- | --- | --- |
 |Actual On-time |63448|27299|
 |Actual Delayed |30449|60645|

## Voting Classifier

 - Voting Classifier:
 - accuracy: 0.6892
 - precision: 0.6978
 - recall: 0.6694
 - confusion matrix: [[64345 26402]
 [30112 60982]]

 |' | Predicted On-time | Predicted Delayed |
 |-- | --- | --- |
 |Actual On-time |64345|26402|
 |Actual Delayed |30112|60982|