In [2]:
import pandas as pd 
import numpy as np 
from scipy.stats import loguniform
from sklearn import metrics, svm
import xgboost
from xgboost.sklearn import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
import warnings
warnings.filterwarnings("ignore")

In [3]:
x_train_df = pd.read_excel('./X_train_encoded.xlsx')
x_val_df = pd.read_excel('./X_val_encoded.xlsx')
x_test_df = pd.read_excel('./X_test_encoded.xlsx')

In [4]:
y_train_df = pd.read_excel('./Y_train.xlsx')
y_train = y_train_df.values.ravel()
y_val_df = pd.read_excel('./Y_val.xlsx')
y_val = y_val_df.values.ravel()
y_test_df = pd.read_excel('./Y_test.xlsx')
y_test = y_test_df.values.ravel()

### XGBoost

In [5]:
#Randomized search for xgboost parameters
xgb_params = {
    'n_estimators': range(50, 400, 50),
    'min_child_weight': range(1, 10, 1),
    'max_depth': range(3, 10, 1),
    'eta': [0.01, 0.05, 0.1, 0.3, 0.5],
    'colsample_bytree': np.arange(0.5, 1, 0.1),
    'subsample': np.arange(0.5, 1, 0.1),
    'gamma': [1, 3, 5, 10, 20, 50]
}

search_xgb = RandomizedSearchCV(XGBClassifier(objective='binary:logistic', seed=105), param_distributions=xgb_params, n_iter=100, 
                                scoring='f1_weighted', n_jobs=-1, verbose=4, random_state=105, refit=True)

search_xgb.fit(x_train_df, y_train)
xgb = search_xgb.best_estimator_
print(f'Best model: {xgb}')

y_pred_train_xgb = xgb.predict(x_train_df)
xgb_train_f1 = metrics.f1_score(y_train, y_pred_train_xgb, average="weighted")
xgb_train_precision = metrics.precision_score(y_train, y_pred_train_xgb, average="weighted")
xgb_train_recall = metrics.recall_score(y_train, y_pred_train_xgb, average="weighted")
print(f'train f1 score: {xgb_train_f1}')
print(f'train precision: {xgb_train_precision}')
print(f'train recall: {xgb_train_recall}')

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best model: XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.5, early_stopping_rounds=None,
              enable_categorical=False, eta=0.05, eval_metric=None,
              feature_types=None, gamma=5, gpu_id=None, grow_policy=None,
              importance_type=None, interaction_constraints=None,
              learning_rate=None, max_bin=None, max_cat_threshold=None,
              max_cat_to_onehot=None, max_delta_step=None, max_depth=9,
              max_leaves=None, min_child_weight=5, missing=nan,
              monotone_constraints=None, n_estimators=250, n_jobs=None,
              num_parallel_tree=None, predictor=None, ...)
train f1 score: 0.644545991653195
train precision: 0.6451200017402402
train recall: 0.6448214285714285


In [6]:
#Implement best model on validate set
xgb.fit(x_train_df, y_train)
y_pred_xgb = xgb.predict(x_val_df)

xgb_validate_f1 = metrics.f1_score(y_val, y_pred_xgb, average="weighted")
xgb_validate_precision = metrics.precision_score(y_val, y_pred_xgb, average="weighted")
xgb_validate_recall = metrics.recall_score(y_val, y_pred_xgb, average="weighted")
print(f'validation f1 score: {xgb_validate_f1}')
print(f'validation precision: {xgb_validate_precision}')
print(f'validation recall: {xgb_validate_recall}')

validation f1 score: 0.6244221119975949
validation precision: 0.6261525243706334
validation recall: 0.6248333333333334


### Logistic Regression

In [7]:
#Randomized search for logistic regression parameters
log_params = {
    'solver'      : ['newton-cg', 'lbfgs', 'sag'],
    'penalty'     : [None,'l2'],
    'C'           : [100, 10, 1.0, 0.1, 0.01],
}

search_log = RandomizedSearchCV(LogisticRegression(), param_distributions=log_params, n_iter=100, scoring='f1_weighted', verbose=4, random_state=105, refit=True)
search_log.fit(x_train_df, y_train)
log = search_log.best_estimator_
print(f'Best model: {log}')

y_pred_train_log = log.predict(x_train_df)
log_train_f1 = metrics.f1_score(y_train, y_pred_train_log, average="weighted")
log_train_precision = metrics.precision_score(y_train, y_pred_train_log, average="weighted")
log_train_recall = metrics.recall_score(y_train, y_pred_train_log, average="weighted")
print(f'train f1 score: {log_train_f1}')
print(f'train precision: {log_train_precision}')
print(f'train recall: {log_train_recall}')

Fitting 5 folds for each of 30 candidates, totalling 150 fits
[CV 1/5] END C=100, penalty=None, solver=newton-cg;, score=0.605 total time=   0.0s
[CV 2/5] END C=100, penalty=None, solver=newton-cg;, score=0.601 total time=   0.0s
[CV 3/5] END C=100, penalty=None, solver=newton-cg;, score=0.609 total time=   0.0s
[CV 4/5] END C=100, penalty=None, solver=newton-cg;, score=0.606 total time=   0.0s
[CV 5/5] END C=100, penalty=None, solver=newton-cg;, score=0.599 total time=   0.0s
[CV 1/5] END .C=100, penalty=None, solver=lbfgs;, score=0.605 total time=   0.0s
[CV 2/5] END .C=100, penalty=None, solver=lbfgs;, score=0.601 total time=   0.0s
[CV 3/5] END .C=100, penalty=None, solver=lbfgs;, score=0.609 total time=   0.0s
[CV 4/5] END .C=100, penalty=None, solver=lbfgs;, score=0.606 total time=   0.0s
[CV 5/5] END .C=100, penalty=None, solver=lbfgs;, score=0.599 total time=   0.0s
[CV 1/5] END ...C=100, penalty=None, solver=sag;, score=0.605 total time=   0.1s
[CV 2/5] END ...C=100, penalty=N

In [8]:
#Implement best model on validate set
log.fit(x_train_df, y_train)
y_pred_log = log.predict(x_val_df)

log_validate_f1 = metrics.f1_score(y_val, y_pred_log, average="weighted")
log_validate_precision = metrics.precision_score(y_val, y_pred_log, average="weighted")
log_validate_recall = metrics.recall_score(y_val, y_pred_log, average="weighted")
print(f'validation f1 score: {log_validate_f1}')
print(f'validation precision: {log_validate_precision}')
print(f'validation recall: {log_validate_recall}')

validation f1 score: 0.6018285954261583
validation precision: 0.6168562982072094
validation recall: 0.6076666666666667


### RBF SVM

In [9]:
#Randomized search for rbf svm classifier parameters
rbfSVM_params = {
    "kernel" : ["rbf"],
    "gamma" : np.logspace(-3, 8, 10)
}

search_rbfSVM = RandomizedSearchCV(svm.SVC(), param_distributions=rbfSVM_params, n_iter=100, scoring="f1_weighted", n_jobs=-1, verbose=4, random_state=105)
search_rbfSVM.fit(x_train_df, y_train)
rbfSVM = search_rbfSVM.best_estimator_
print(f'Best model: {rbfSVM}')

y_pred_train_rbfsvm = rbfSVM.predict(x_train_df)
rbfsvm_train_f1 = metrics.f1_score(y_train, y_pred_train_rbfsvm, average="weighted")
rbfsvm_train_precision = metrics.precision_score(y_train, y_pred_train_rbfsvm, average="weighted")
rbfsvm_train_recall = metrics.recall_score(y_train, y_pred_train_rbfsvm, average="weighted")
print(f'train f1 score: {rbfsvm_train_f1}')
print(f'train precision: {rbfsvm_train_precision}')
print(f'train recall: {rbfsvm_train_recall}')

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best model: SVC(gamma=77.42636826811278)
train f1 score: 0.6295778659164355
train precision: 0.6333656477205505
train recall: 0.6312857142857143


In [10]:
#Implement best model on validate set
rbfSVM.fit(x_train_df, y_train)
y_pred_rbfsvm = rbfSVM.predict(x_val_df)

rbfsvm_validate_f1 = metrics.f1_score(y_val, y_pred_rbfsvm, average="weighted")
rbfsvm_validate_precision = metrics.precision_score(y_val, y_pred_rbfsvm, average="weighted")
rbfsvm_validate_recall = metrics.recall_score(y_val, y_pred_rbfsvm, average="weighted")
print(f'validation f1 score: {rbfsvm_validate_f1}')
print(f'validation precision: {rbfsvm_validate_precision}')
print(f'validation recall: {rbfsvm_validate_recall}')

validation f1 score: 0.6068139006814125
validation precision: 0.6125852963195944
validation recall: 0.6088333333333333


### Model Results Summary

In [11]:
results = {
    'Train F1-score': [xgb_train_f1, log_train_f1, rbfsvm_train_f1],
    'Validation F1-score': [xgb_validate_f1, log_validate_f1, rbfsvm_validate_f1],
    'Train Precision': [xgb_train_precision, log_train_precision, rbfsvm_train_precision],
    'Validation Precision': [xgb_validate_precision, log_validate_precision, rbfsvm_validate_precision],
    'Train Recall': [xgb_train_recall, log_train_recall, rbfsvm_train_recall],
    'Validation Recall': [xgb_validate_recall, log_validate_recall, rbfsvm_validate_recall]
}

results_df = pd.DataFrame(results, index=['XGBoost', 'Logistic Regression', 'RBF SVM'])
results_df

Unnamed: 0,Train F1-score,Validation F1-score,Train Precision,Validation Precision,Train Recall,Validation Recall
XGBoost,0.644546,0.624422,0.64512,0.626153,0.644821,0.624833
Logistic Regression,0.603482,0.601829,0.614305,0.616856,0.608804,0.607667
RBF SVM,0.629578,0.606814,0.633366,0.612585,0.631286,0.608833


In [12]:
xgb.fit(x_test_df, y_test)
y_pred_xgb_test = xgb.predict(x_test_df)

xgb_test_f1 = metrics.f1_score(y_test, y_pred_xgb_test, average="weighted")
xgb_test_precision = metrics.precision_score(y_test, y_pred_xgb_test, average="weighted")
xgb_test_recall = metrics.recall_score(y_test, y_pred_xgb_test, average="weighted")
print(f'test f1 score: {xgb_test_f1}')
print(f'test precision: {xgb_test_precision}')
print(f'test recall: {xgb_test_recall}')

test f1 score: 0.6670612954517966
test precision: 0.6672497191812925
test recall: 0.6671666666666667


In [17]:
from sklearn.metrics import classification_report

report_val = classification_report(y_val, y_pred_xgb, labels=[0,1], target_names=["No delay", "Delay"])
print(report_val)

              precision    recall  f1-score   support

    No delay       0.64      0.59      0.61      6089
       Delay       0.61      0.66      0.63      5911

    accuracy                           0.62     12000
   macro avg       0.63      0.63      0.62     12000
weighted avg       0.63      0.62      0.62     12000



In [19]:
report_test = classification_report(y_test, y_pred_xgb_test, labels=[0,1], target_names=["No delay", "Delay"])
print(report_test)

              precision    recall  f1-score   support

    No delay       0.66      0.68      0.67      6039
       Delay       0.67      0.65      0.66      5961

    accuracy                           0.67     12000
   macro avg       0.67      0.67      0.67     12000
weighted avg       0.67      0.67      0.67     12000

