In [1]:
import sys
import os
import time

# Get the current working directory
current_dir = os.getcwd()

# Add the parent directory to sys.path
parent_dir = os.path.abspath(os.path.join(current_dir, '..'))
sys.path.insert(0, parent_dir)
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import ScalarFormatter
from urllib.request import urlretrieve
from scipy.stats import pearsonr

from config.config import DATA_PATH
pd.set_option('display.max_columns', None)

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, roc_auc_score, average_precision_score, precision_recall_curve, classification_report, confusion_matrix
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit, GridSearchCV, RandomizedSearchCV, cross_validate
from imblearn.under_sampling import RandomUnderSampler, NearMiss, TomekLinks
from imblearn.ensemble import EasyEnsembleClassifier, BalancedRandomForestClassifier

from sklearn.decomposition import PCA
from sklearn.svm import SVC


In [3]:
X_train = pd.read_csv(DATA_PATH + '/modeling/X_train.csv')
X_test = pd.read_csv(DATA_PATH + '/modeling/X_test.csv')
y_train = pd.read_csv(DATA_PATH + '/modeling/y_train.csv')
y_test = pd.read_csv(DATA_PATH + '/modeling/y_test.csv')

In [4]:
y_train = y_train['cancelled']
y_test = y_test['cancelled']

In [5]:
X_train.head()

Unnamed: 0,quarter,month,day_of_month,day_of_week,hour_of_day,scheduled_elapsed_time,distance,dep_delay_mean_10D,dep_delay_median_10D,dep_delay_max_10D,cancelled_sum_10D,div_airport_landings_sum_10D,n_flights_10D,dep_delay_mean_30D,dep_delay_median_30D,dep_delay_max_30D,cancelled_sum_30D,div_airport_landings_sum_30D,cancelled_sum_90D,div_airport_landings_sum_90D,temperature_2m_origin,precipitation_origin,snow_depth_origin,visibility_origin,wind_speed_10m_origin,wind_direction_10m_origin,wind_gusts_10m_origin,temperature_2m_dest,precipitation_dest,snow_depth_dest,visibility_dest,wind_speed_10m_dest,wind_direction_10m_dest,wind_gusts_10m_dest,origin_weather_impact_category,dest_weather_impact_category,origin_encoded,dest_encoded,airline_mkt_encoded,airline_ops_encoded,origin_division_encoded,dest_division_encoded,code_share_flight,is_holiday,dep_window_early afternoon,dep_window_early morning,dep_window_evening,dep_window_late night,dep_window_midday,dep_window_morning,dep_window_night,dep_window_overnight
0,-0.462372,-0.742801,1.627043,1.513039,-1.191404,0.962666,0.793271,-0.637097,-0.433316,-0.848334,-0.321698,-0.026067,-0.485351,-0.255407,-0.357089,1.020906,-0.455582,-0.044988,-0.613999,-0.075349,-0.378699,-0.059103,-0.09847,-0.803722,-1.054689,0.527084,0.099992,-0.765814,0.196989,-0.097696,-1.568529,2.247381,-1.538187,1.803467,-0.240644,4.093543,-0.392666,0.727335,0.79095,0.114362,-0.140199,1.415694,0,0,0,0,0,0,0,1,0,0
1,-1.361143,-1.035081,1.741076,0.51336,1.374386,0.864121,0.490951,0.410658,0.711883,-0.172918,-0.321698,-0.026067,-0.375945,-0.130465,0.046913,-0.854923,-0.455582,-0.044988,-0.613999,-0.075349,0.764768,-0.059103,-0.09847,-0.577887,0.383432,-0.133266,1.007014,-0.568275,-0.060985,-0.097696,-0.492279,-1.0064,-1.37721,-1.049016,-0.240644,-0.240942,-0.392666,2.555209,0.79095,0.114362,-0.140199,2.167403,0,0,0,0,0,0,0,0,1,0
2,-1.361143,-1.619639,1.627043,-1.485998,0.190175,2.879816,2.311011,-0.816002,-0.540678,-1.135745,-0.321698,-0.026067,-0.922973,-0.922882,-0.312199,-1.741141,-0.455582,-0.044988,-0.613999,-0.075349,-0.591698,-0.059103,-0.09847,1.214675,-0.078821,-1.023739,0.156681,-1.160893,-0.060985,-0.097696,-0.651376,-0.928929,0.725541,-0.441804,-0.240644,-0.240942,-0.503124,1.580374,-0.491204,-0.702297,-1.015747,2.167403,0,0,1,0,0,0,0,0,0,0
3,0.436398,0.426315,-0.19748,-1.485998,0.979649,0.21014,-0.346952,-0.292467,-0.075441,-0.560923,-0.321698,-0.026067,-0.485351,1.470541,0.226469,1.109528,1.075347,-0.044988,1.225667,-0.075349,2.266969,-0.059103,-0.09847,1.186446,-1.593984,-0.983718,-1.147163,1.102344,-0.060985,-0.097696,-0.69817,-0.903105,-0.260438,-0.921925,-0.240644,-0.240942,-0.392666,0.001885,0.79095,0.114362,-0.140199,-0.406655,0,0,0,0,1,0,0,0,0,0
4,1.335169,1.303153,1.627043,0.01352,-0.007193,-0.775311,-0.265617,-0.303767,-0.325953,-0.48907,-0.321698,-0.026067,-0.485351,-0.771946,-0.26731,-1.179869,-0.455582,-0.044988,-0.613999,-0.075349,-0.2722,-0.059103,-0.09847,1.054709,-0.258586,-0.133266,-0.353519,-1.262485,-0.060985,-0.097696,0.209624,0.465549,1.359384,-0.060532,-0.240644,-0.240942,-0.900817,-0.343188,-0.491204,-0.702297,-0.363483,-0.376839,0,0,1,0,0,0,0,0,0,0


## Model Selection, Evaluation and GridSearch Functions

In [6]:
cross_val_results = pd.DataFrame(columns=['model_name', 'model', 'mean_f1', 'mean_recall', 'mean_precision'])

In [9]:
def cross_validate_model(model_name, model, X_train, y_train):

    kf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    
    scoring = ['f1', 'recall', 'precision']

    scores = cross_validate(model, X_train, y_train, scoring=scoring, cv=kf, n_jobs=-1)

    mean_f1 = scores['test_f1'].mean()
    mean_recall = scores['test_recall'].mean()
    mean_precision = scores['test_precision'].mean()

    cross_val_results.loc[len(cross_val_results)] = [model_name, model, mean_f1, mean_recall, mean_precision]

    return cross_val_results


In [10]:
def evaluate_model(model_name, model, X_train, y_train, X_test, y_test, params=None):


    start_time = time.time()

    if params:
        model.set_params(**params)

    model.fit(X_train, y_train)
    
    # Predict
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)

    end_time = time.time()
    elapsed_time = end_time - start_time

    # calculate metrics
    precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='binary')
    accuracy = accuracy_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred_proba[:,1])
    auprc = average_precision_score(y_test, y_pred_proba[:,1])

    model_params = model.get_params()

    
    metrics =  {
        'model_name': model_name,
        'model': model, 
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'roc_auc': roc_auc,
        'auprc': auprc,
        'parameters': params if params else dict(model.get_params()),
        'training_testing_time_seconds': elapsed_time

    }

    print(f'Results for {model_name} model:')
    print('Classifcation report: \n', classification_report(y_test, y_pred))
    print()
    print('Confusion matrix: \n', confusion_matrix(y_test, y_pred))

    return pd.DataFrame([metrics]), y_pred, y_pred_proba



In [11]:
def grid_search_model(model, params, X_train, y_train):

    kf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

    gs = GridSearchCV(model, param_grid=params, cv=kf, scoring='recall', n_jobs=-1)

    gs_fit = gs.fit(X_train, y_train)

    best_estimator, best_params = gs_fit.best_estimator_, gs_fit.best_params_

    return best_estimator, best_params

In [12]:
from sklearn.model_selection import RandomizedSearchCV

def random_grid_search_model(model, params, X_train, y_train):

    kf = StratifiedKFold(n_splits=3, shuffle=True, random_state=40)

    rs  = RandomizedSearchCV(model, params, n_iter=50, n_jobs=-1, cv=kf)

    rs_fit = rs.fit(X_train, y_train)

    best_estimator, best_params = rs_fit.best_estimator_, rs_fit.best_params_

    return best_estimator, best_params

## Class Imbalance Techniques

In [49]:
undersampler = RandomUnderSampler(random_state=40)
X_train_rus, y_train_rus = undersampler.fit_resample(X_train, y_train)

rf_metrics, y_pred, y_pred_proba = evaluate_model(model_name='RF_random_undersample', 
                                                  model=RandomForestClassifier(random_state=40), 
                                                  X_train=X_train_rus, 
                                                  y_train=y_train_rus, 
                                                  X_test=X_test, 
                                                  y_test=y_test, 
                                                  params=best_params)
    
model_results = pd.concat([model_results, rf_metrics], axis=0, ignore_index=True)

Results for RF_under_sample model:
Classifcation report: 
               precision    recall  f1-score   support

           0       1.00      0.88      0.94    441812
           1       0.10      1.00      0.19      6318

    accuracy                           0.88    448130
   macro avg       0.55      0.94      0.56    448130
weighted avg       0.99      0.88      0.92    448130


Confusion matrix: 
 [[388040  53772]
 [    25   6293]]


In [14]:
undersampler_nm = NearMiss(version=1)

X_train_undersampled, y_train_undersampled = undersampler_nm.fit_resample(X_train, y_train)

In [42]:
rf_metrics, y_pred, y_pred_proba = evaluate_model(model_name='RF_undersample_nm_1', 
                                                  model=RandomForestClassifier(random_state=40), 
                                                  X_train=X_train_undersampled, 
                                                  y_train=y_train_undersampled, 
                                                  X_test=X_test, 
                                                  y_test=y_test, 
                                                  params=best_params)
    
model_results = pd.concat([model_results, rf_metrics], axis=0, ignore_index=True)

Results for RF_undersample_nm_bp model:
Classifcation report: 
               precision    recall  f1-score   support

           0       1.00      0.92      0.96    441812
           1       0.13      0.82      0.22      6318

    accuracy                           0.92    448130
   macro avg       0.56      0.87      0.59    448130
weighted avg       0.98      0.92      0.95    448130


Confusion matrix: 
 [[406063  35749]
 [  1166   5152]]


In [None]:
tomek = TomekLinks()

X_train_tomek, y_train_tomek = tomek.fit_resample(X_train, y_train)

rf_metrics, y_pred, y_pred_proba = evaluate_model(model_name='RF_tomek', 
                                                  model=RandomForestClassifier(random_state=40), 
                                                  X_train=X_train_tomek, 
                                                  y_train=y_train_tomek, 
                                                  X_test=X_test, 
                                                  y_test=y_test, 
                                                  params=best_params)
    
model_results = pd.concat([model_results, rf_metrics], axis=0, ignore_index=True)

# Baseline Models & Performance

In [None]:
model_dict = {
    'LogReg_base_cv': LogisticRegression(),
    'RF_base_cv': RandomForestClassifier(),
    'GB_base_cv': GradientBoostingClassifier(),
    'Balanced_RF_cv': BalancedRandomForestClassifier(),
    'Easy_Ensemble_cv': EasyEnsembleClassifier()
}

In [14]:
cross_val_results

Unnamed: 0,model_name,model,mean_f1,mean_recall,mean_precision
0,LogReg_base_cv,LogisticRegression(),0.075863,0.044381,0.261396


# Evaluate Best Base Models

In [9]:
logreg_model = LogisticRegression()

log_reg_base_result, y_pred, y_pred_proba = evaluate_model('LogReg_base', logreg_model, X_train, y_train, X_test, y_test)

model_results = pd.DataFrame(log_reg_base_result)

Results for LogReg_base model:
Classifcation report: 
               precision    recall  f1-score   support

           0       0.99      1.00      0.99    441812
           1       0.25      0.04      0.08      6318

    accuracy                           0.98    448130
   macro avg       0.62      0.52      0.53    448130
weighted avg       0.98      0.98      0.98    448130


Confusion matrix: 
 [[440969    843]
 [  6035    283]]


In [10]:
rf_model = RandomForestClassifier(random_state=40)

rf_base_result, y_pred, y_pred_proba = evaluate_model('RF_base', rf_model, X_train, y_train, X_test, y_test)
model_results = pd.concat([model_results, rf_base_result], axis=0, ignore_index=True)

Results for RF_base model:
Classifcation report: 
               precision    recall  f1-score   support

           0       0.99      1.00      0.99    441812
           1       0.83      0.16      0.27      6318

    accuracy                           0.99    448130
   macro avg       0.91      0.58      0.63    448130
weighted avg       0.99      0.99      0.98    448130


Confusion matrix: 
 [[441603    209]
 [  5290   1028]]


In [11]:
svm_model = SVC(random_state=44)

svm_base_result, y_pred, y_pred_proba = evaluate_model('SVM_base', svm_model, X_train, y_train, X_test, y_test)
model_results = pd.concat([model_results, svm_base_result], axis=0, ignore_index=True)

AttributeError: This 'SVC' has no attribute 'predict_proba'

In [12]:
gb_model = GradientBoostingClassifier(random_state=44)

gb_base_result, y_pred, y_pred_proba = evaluate_model('GB_base', gb_model, X_train, y_train, X_test, y_test)
model_results = pd.concat([model_results, gb_base_result], axis=0, ignore_index=True)

Results for GB_base model:
Classifcation report: 
               precision    recall  f1-score   support

           0       0.99      1.00      0.99    441812
           1       0.73      0.09      0.17      6318

    accuracy                           0.99    448130
   macro avg       0.86      0.55      0.58    448130
weighted avg       0.98      0.99      0.98    448130


Confusion matrix: 
 [[441597    215]
 [  5724    594]]


In [13]:
knn_model = KNeighborsClassifier(n_neighbors=6)

knn_base_result, y_pred, y_pred_proba = evaluate_model('KNN_base', knn_model, X_train, y_train, X_test, y_test)
model_results = pd.concat([model_results, knn_base_result], axis=0, ignore_index=True)

Results for KNN_base model:
Classifcation report: 
               precision    recall  f1-score   support

           0       0.99      1.00      0.99    441812
           1       0.71      0.07      0.12      6318

    accuracy                           0.99    448130
   macro avg       0.85      0.53      0.56    448130
weighted avg       0.98      0.99      0.98    448130


Confusion matrix: 
 [[441637    175]
 [  5895    423]]


In [50]:
model_results.to_csv('./model_results/model_results.10.25_v1.csv', index=False)

In [44]:
model_results.sort_values('recall', ascending=False)

Unnamed: 0,model_name,model,accuracy,precision,recall,f1,roc_auc,auprc,parameters,training_testing_time_seconds
13,Easy_Ensemble,"((RandomUnderSampler(random_state=1839144522),...",0.875699,0.101869,1.0,0.184902,0.963964,0.250732,"{'estimator': None, 'n_estimators': 10, 'n_job...",25.250789
12,Balanced_RF,"(DecisionTreeClassifier(max_features='sqrt', r...",0.877431,0.103082,0.99905,0.186881,0.970026,0.312254,"{'bootstrap': 'warn', 'ccp_alpha': 0.0, 'class...",13.858661
1,RF_base,"(DecisionTreeClassifier(max_features='sqrt', r...",0.987729,0.831043,0.16271,0.272138,0.973997,0.46951,"{'bootstrap': True, 'ccp_alpha': 0.0, 'class_w...",73.010128
10,RF_balanced,"(DecisionTreeClassifier(max_features='sqrt', r...",0.987149,0.824623,0.112377,0.197799,0.970189,0.429369,"{'bootstrap': True, 'ccp_alpha': 0.0, 'class_w...",59.886441
11,RF_balanced_sub,"(DecisionTreeClassifier(max_features='sqrt', r...",0.987142,0.830952,0.110478,0.195027,0.970756,0.428888,"{'bootstrap': True, 'ccp_alpha': 0.0, 'class_w...",66.992011
2,GB_base,([DecisionTreeRegressor(criterion='friedman_ms...,0.986747,0.73424,0.094017,0.16669,0.970053,0.360844,"{'ccp_alpha': 0.0, 'criterion': 'friedman_mse'...",268.339116
9,GradientBoosting_vif_features,([DecisionTreeRegressor(criterion='friedman_ms...,0.986723,0.727723,0.093067,0.165029,0.969271,0.353383,"{'ccp_alpha': 0.0, 'criterion': 'friedman_mse'...",187.907525
7,RandomForest_red_features,"(DecisionTreeClassifier(max_features='sqrt', r...",0.986303,0.611386,0.078189,0.138647,0.963682,0.295461,"{'bootstrap': True, 'ccp_alpha': 0.0, 'class_w...",44.700374
3,KNN_base,KNeighborsClassifier(n_neighbors=6),0.986455,0.707358,0.066952,0.122325,0.751843,0.186499,"{'algorithm': 'auto', 'leaf_size': 30, 'metric...",404.518375
0,LogReg_base,LogisticRegression(),0.984652,0.251332,0.044793,0.076034,0.905328,0.16409,"{'C': 1.0, 'class_weight': None, 'dual': False...",1.937835


In [41]:
model_results = model_results.drop([9,10])

# Hyper Parameter Tuning

In [24]:
rf_param_grid = {
    'n_estimators':[350, 500],
    'criterion': ['gini', 'entropy'],
    'max_depth':[3, 5, None],
    'max_features':['sqrt', 'log2', None],
    'class_weight': [None, 'balanced', 'balanced_subsample'],
    'n_jobs': [-1], 
    'random_state':[40]}

best_model, best_params = grid_search_model(RandomForestClassifier(), params=rf_param_grid, X_train=X_train_reduced, y_train=y_train)

KeyboardInterrupt: 

In [150]:
rf_metrics, y_pred, y_pred_proba = evaluate_model(model_name='RF_reduced_grid_search', 
                                                  model=best_model, 
                                                  X_train=X_train_reduced, 
                                                  y_train=y_train, 
                                                  X_test=X_test_reduced, 
                                                  y_test=y_test, 
                                                  params=best_params)
    
model_results = pd.concat([model_results, rf_metrics], axis=0, ignore_index=True)

Results for RF_reduced_grid_search model:
Classifcation report: 
               precision    recall  f1-score   support

           0       0.99      1.00      0.99    441812
           1       0.68      0.06      0.10      6318

    accuracy                           0.99    448130
   macro avg       0.83      0.53      0.55    448130
weighted avg       0.98      0.99      0.98    448130


Confusion matrix: 
 [[441650    162]
 [  5970    348]]


In [151]:
model_results

Unnamed: 0,model_name,model,accuracy,precision,recall,f1,roc_auc,auprc,parameters
0,LogReg_base,LogisticRegression(),0.984652,0.251332,0.044793,0.076034,0.905328,0.16409,"{'C': 1.0, 'class_weight': None, 'dual': False..."
1,RF_base,"(DecisionTreeClassifier(max_features='sqrt', r...",0.98778,0.840065,0.164609,0.275278,0.973177,0.469523,"{'bootstrap': True, 'ccp_alpha': 0.0, 'class_w..."
2,GB_base,([DecisionTreeRegressor(criterion='friedman_ms...,0.986747,0.73424,0.094017,0.16669,0.970053,0.360844,"{'ccp_alpha': 0.0, 'criterion': 'friedman_mse'..."
3,LogReg_pca_base,LogisticRegression(),0.984667,0.24183,0.040994,0.070104,0.899958,0.15883,"{'C': 1.0, 'class_weight': None, 'dual': False..."
4,RF_pca_base,"(DecisionTreeClassifier(max_features='sqrt', r...",0.986234,0.752542,0.035138,0.06714,0.940529,0.276127,"{'bootstrap': True, 'ccp_alpha': 0.0, 'class_w..."
5,GB_pca_base,([DecisionTreeRegressor(criterion='friedman_ms...,0.98577,0.447415,0.039728,0.072976,0.942209,0.197646,"{'ccp_alpha': 0.0, 'criterion': 'friedman_mse'..."
6,LinearRegression,LogisticRegression(random_state=44),0.98454,0.195,0.030864,0.053293,0.916535,0.139606,"{'C': 1.0, 'class_weight': None, 'dual': False..."
7,RandomForest,"(DecisionTreeClassifier(max_features='sqrt', r...",0.986303,0.611386,0.078189,0.138647,0.963682,0.295461,"{'bootstrap': True, 'ccp_alpha': 0.0, 'class_w..."
8,GradientBoosting,([DecisionTreeRegressor(criterion='friedman_ms...,0.986107,0.664286,0.02944,0.056381,0.963288,0.263669,"{'ccp_alpha': 0.0, 'criterion': 'friedman_mse'..."
9,RF_reduced_grid_search,"(DecisionTreeClassifier(criterion='entropy', m...",0.986316,0.682353,0.055081,0.101933,0.968305,0.319051,"{'n_estimators': 200, 'max_features': 'sqrt', ..."


In [25]:
rf_param_grid = {
    'n_estimators':[350, 500],
    'criterion': ['gini', 'entropy'],
    'max_depth':[3, 5, None],
    'max_features':['sqrt', 'log2', None],
    'class_weight': [None, 'balanced', 'balanced_subsample'],
    'n_jobs': [-1], 
    'random_state':[40]}

best_model_r, best_params_r = random_grid_search_model(RandomForestClassifier(), params=rf_param_grid, X_train=X_train, y_train=y_train)



In [26]:
rf_metrics, y_pred, y_pred_proba = evaluate_model(model_name='RF_grid_search', 
                                                  model=best_model, 
                                                  X_train=X_train, 
                                                  y_train=y_train, 
                                                  X_test=X_test, 
                                                  y_test=y_test, 
                                                  params=best_params)
    
model_results = pd.concat([model_results, rf_metrics], axis=0, ignore_index=True)

Results for RF_grid_search model:
Classifcation report: 
               precision    recall  f1-score   support

           0       0.99      1.00      0.99    441812
           1       0.78      0.23      0.36      6318

    accuracy                           0.99    448130
   macro avg       0.88      0.62      0.68    448130
weighted avg       0.99      0.99      0.99    448130


Confusion matrix: 
 [[441385    427]
 [  4835   1483]]


In [27]:
model_results.sort_values('recall', ascending=False)

Unnamed: 0,model_name,model,accuracy,precision,recall,f1,roc_auc,auprc,parameters
7,RF_grid_search,"(DecisionTreeClassifier(criterion='entropy', r...",0.988258,0.77644,0.234726,0.360476,0.977791,0.506163,"{'random_state': 40, 'n_jobs': -1, 'n_estimato..."
1,RF_base,"(DecisionTreeClassifier(max_features='sqrt', r...",0.987729,0.831043,0.16271,0.272138,0.973997,0.46951,"{'bootstrap': True, 'ccp_alpha': 0.0, 'class_w..."
2,GB_base,([DecisionTreeRegressor(criterion='friedman_ms...,0.986747,0.73424,0.094017,0.16669,0.970053,0.360844,"{'ccp_alpha': 0.0, 'criterion': 'friedman_mse'..."
5,RandomForest_red_features,"(DecisionTreeClassifier(max_features='sqrt', r...",0.986303,0.611386,0.078189,0.138647,0.963682,0.295461,"{'bootstrap': True, 'ccp_alpha': 0.0, 'class_w..."
3,KNN_base,KNeighborsClassifier(n_neighbors=6),0.986455,0.707358,0.066952,0.122325,0.751843,0.186499,"{'algorithm': 'auto', 'leaf_size': 30, 'metric..."
0,LogReg_base,LogisticRegression(),0.984652,0.251332,0.044793,0.076034,0.905328,0.16409,"{'C': 1.0, 'class_weight': None, 'dual': False..."
4,LinearRegression_red_features,LogisticRegression(random_state=44),0.98454,0.195,0.030864,0.053293,0.916535,0.139606,"{'C': 1.0, 'class_weight': None, 'dual': False..."
6,GradientBoosting_red_features,([DecisionTreeRegressor(criterion='friedman_ms...,0.986107,0.664286,0.02944,0.056381,0.963288,0.263669,"{'ccp_alpha': 0.0, 'criterion': 'friedman_mse'..."


In [43]:
model_results.sort_values('recall', ascending=False)

Unnamed: 0,model_name,model,accuracy,precision,recall,f1,roc_auc,auprc,parameters
8,RF_under_sample,"(DecisionTreeClassifier(criterion='entropy', r...",0.878881,0.104122,0.998259,0.188575,0.970602,0.34883,"{'random_state': 40, 'n_jobs': -1, 'n_estimato..."
10,RF_undersample_nm_p3,"(DecisionTreeClassifier(criterion='entropy', r...",0.917745,0.126184,0.815923,0.218566,0.958633,0.213455,"{'random_state': 40, 'n_jobs': -1, 'n_estimato..."
11,RF_undersample_nm_bp,"(DecisionTreeClassifier(criterion='entropy', r...",0.917624,0.125963,0.815448,0.218217,0.958597,0.213159,"{'random_state': 40, 'n_jobs': -1, 'n_estimato..."
9,RF_undersample_nm_p2,"(DecisionTreeClassifier(criterion='entropy', r...",0.917589,0.125877,0.815131,0.218077,0.958621,0.213478,"{'random_state': 40, 'n_jobs': -1, 'n_estimato..."
7,RF_grid_search,"(DecisionTreeClassifier(criterion='entropy', r...",0.988258,0.77644,0.234726,0.360476,0.977791,0.506163,"{'random_state': 40, 'n_jobs': -1, 'n_estimato..."
1,RF_base,"(DecisionTreeClassifier(max_features='sqrt', r...",0.987729,0.831043,0.16271,0.272138,0.973997,0.46951,"{'bootstrap': True, 'ccp_alpha': 0.0, 'class_w..."
2,GB_base,([DecisionTreeRegressor(criterion='friedman_ms...,0.986747,0.73424,0.094017,0.16669,0.970053,0.360844,"{'ccp_alpha': 0.0, 'criterion': 'friedman_mse'..."
5,RandomForest_red_features,"(DecisionTreeClassifier(max_features='sqrt', r...",0.986303,0.611386,0.078189,0.138647,0.963682,0.295461,"{'bootstrap': True, 'ccp_alpha': 0.0, 'class_w..."
3,KNN_base,KNeighborsClassifier(n_neighbors=6),0.986455,0.707358,0.066952,0.122325,0.751843,0.186499,"{'algorithm': 'auto', 'leaf_size': 30, 'metric..."
0,LogReg_base,LogisticRegression(),0.984652,0.251332,0.044793,0.076034,0.905328,0.16409,"{'C': 1.0, 'class_weight': None, 'dual': False..."


In [44]:
model_results.to_csv('./model_results/model_results.10.25.csv', index=False)

In [38]:
params_2 = {'random_state': 40,
 'n_jobs': -1,
 'n_estimators': 600,
 'max_features': None,
 'max_depth': None,
 'criterion': 'entropy',
 'class_weight': 'balanced'}

params_3 = {'random_state': 40,
 'n_jobs': -1,
 'n_estimators': 600,
 'max_features': None,
 'max_depth': None,
 'criterion': 'entropy',
 'class_weight': 'balanced_subsample'}

In [21]:
model_results = pd.read_csv('./model_results/model_results.10.25.csv')


In [23]:
undersampler_nm_3 = NearMiss(version=3, n_neighbors=10)

X_train_undersampled_3, y_train_undersampled_3 = undersampler_nm_3.fit_resample(X_train, y_train)

In [25]:
rf_metrics, y_pred, y_pred_proba = evaluate_model(model_name='RF_undersample_nm_3', 
                                                  model=RandomForestClassifier(random_state=40), 
                                                  X_train=X_train_undersampled_3, 
                                                  y_train=y_train_undersampled_3, 
                                                  X_test=X_test, 
                                                  y_test=y_test)
    
model_results = pd.concat([model_results, rf_metrics], axis=0, ignore_index=True)

Results for RF_undersample_nm_3 model:
Classifcation report: 
               precision    recall  f1-score   support

           0       1.00      0.90      0.95    441812
           1       0.12      0.89      0.21      6318

    accuracy                           0.90    448130
   macro avg       0.56      0.90      0.58    448130
weighted avg       0.99      0.90      0.94    448130


Confusion matrix: 
 [[399629  42183]
 [   712   5606]]


In [32]:
model_results.to_csv('./model_results/model_results.10.25.csv')

In [14]:
model_results.sort_values('recall', ascending=False)

Unnamed: 0.1,Unnamed: 0,model_name,model,accuracy,precision,recall,f1,roc_auc,auprc,parameters
8,8.0,RF_under_sample,"RandomForestClassifier(criterion='entropy', ma...",0.878881,0.104122,0.998259,0.188575,0.970602,0.34883,"{'random_state': 40, 'n_jobs': -1, 'n_estimato..."
12,,RF_undersample_nm_3,"(DecisionTreeClassifier(max_features='sqrt', r...",0.90428,0.117307,0.887306,0.207219,0.947417,0.123457,"{'bootstrap': True, 'ccp_alpha': 0.0, 'class_w..."
10,10.0,RF_undersample_nm_p3,RandomForestClassifier(class_weight='balanced_...,0.917745,0.126184,0.815923,0.218566,0.958633,0.213455,"{'random_state': 40, 'n_jobs': -1, 'n_estimato..."
11,11.0,RF_undersample_nm_bp,"RandomForestClassifier(criterion='entropy', ma...",0.917624,0.125963,0.815448,0.218217,0.958597,0.213159,"{'random_state': 40, 'n_jobs': -1, 'n_estimato..."
9,9.0,RF_undersample_nm_p2,RandomForestClassifier(class_weight='balanced'...,0.917589,0.125877,0.815131,0.218077,0.958621,0.213478,"{'random_state': 40, 'n_jobs': -1, 'n_estimato..."
7,7.0,RF_grid_search,"RandomForestClassifier(criterion='entropy', ma...",0.988258,0.77644,0.234726,0.360476,0.977791,0.506163,"{'random_state': 40, 'n_jobs': -1, 'n_estimato..."
1,1.0,RF_base,RandomForestClassifier(random_state=40),0.987729,0.831043,0.16271,0.272138,0.973997,0.46951,"{'bootstrap': True, 'ccp_alpha': 0.0, 'class_w..."
2,2.0,GB_base,GradientBoostingClassifier(random_state=44),0.986747,0.73424,0.094017,0.16669,0.970053,0.360844,"{'ccp_alpha': 0.0, 'criterion': 'friedman_mse'..."
5,5.0,RandomForest_red_features,RandomForestClassifier(random_state=44),0.986303,0.611386,0.078189,0.138647,0.963682,0.295461,"{'bootstrap': True, 'ccp_alpha': 0.0, 'class_w..."
3,3.0,KNN_base,KNeighborsClassifier(n_neighbors=6),0.986455,0.707358,0.066952,0.122325,0.751843,0.186499,"{'algorithm': 'auto', 'leaf_size': 30, 'metric..."


In [141]:
gb_model = GradientBoostingClassifier(random_state=44)

gb_PCA_result, y_pred, y_pred_proba = evaluate_model('GB_pca_base', gb_model, X_train_final, y_train, X_test_final, y_test)
model_results = pd.concat([model_results, gb_PCA_result], axis=0, ignore_index=True)


Results for GB_pca_base model:
Classifcation report: 
               precision    recall  f1-score   support

           0       0.99      1.00      0.99    441812
           1       0.45      0.04      0.07      6318

    accuracy                           0.99    448130
   macro avg       0.72      0.52      0.53    448130
weighted avg       0.98      0.99      0.98    448130


Confusion matrix: 
 [[441502    310]
 [  6067    251]]
