In [1]:
import pickle
import os
import pandas as pd
import numpy as np
from numpy import mean
from numpy import std
from tqdm import tqdm
import itertools
from scipy import sparse
from scipy.sparse import hstack
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import StackingClassifier , RandomForestClassifier
from matplotlib import pyplot
from sklearn import metrics
from sklearn import preprocessing
from mlxtend.classifier import StackingCVClassifier 
from sklearn.preprocessing import StandardScaler  
project_folder = "."



In [2]:
with open(os.path.join(project_folder,"train_test.pkl"), "rb") as f:
    X_train, X_test, y_train, y_test = pickle.load(f)
  
print("X_train",X_train.shape)
print("y_train",y_train.shape)
print("X_test",X_test.shape)
print("y_test",y_test.shape)

FileNotFoundError: [Errno 2] No such file or directory: '.\\train_test.pkl'

In [3]:
X_train_array = X_train.toarray()
X_test_array = X_test.toarray()

In [4]:
scaler = StandardScaler()
X_train_array = scaler.fit_transform(X_train_array)
X_test_array = scaler.transform(X_test_array)

In [6]:
X_train_array.shape

(128000, 68)

In [8]:
# get a stacking ensemble of models
clf1 = KNeighborsClassifier(n_neighbors=3)
clf2 = RandomForestClassifier(random_state=76244)
clf3 = LogisticRegression(max_iter=1000,random_state=76244)
lr = LogisticRegression(max_iter=1000)
params = {'kneighborsclassifier__n_neighbors': [2, 5,7],
          'kneighborsclassifier__leaf_size' : [10,30,50],
          'randomforestclassifier__n_estimators': [10, 50,100],
          'randomforestclassifier__max_features' : ['auto', 'sqrt'],
          'logisticregression__C':[0.01,.02,0.05,1],
          'meta_classifier__C': [0.1,1, 10]}

sclf = StackingCVClassifier(classifiers=[clf1, clf2, clf3], 
                            meta_classifier=lr,
                            shuffle=False,
                            random_state=76244)

In [9]:
probas = list(np.arange(0.01, 1.01, 0.01))

    # Initialize GridSearchCV
grid = RandomizedSearchCV(estimator = sclf, 
                        param_distributions = params, 
                        cv = 2,
                        scoring = "roc_auc",
                        verbose = 2,
                        n_jobs = -1)
    
# Fit GridSearchCV
grid.fit(X_train_array, y_train)



Fitting 2 folds for each of 10 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 out of  20 | elapsed: 141.0min remaining: 35.3min
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed: 145.2min finished


RandomizedSearchCV(cv=2,
                   estimator=StackingCVClassifier(classifiers=[KNeighborsClassifier(n_neighbors=3),
                                                               RandomForestClassifier(random_state=76244),
                                                               LogisticRegression(max_iter=1000,
                                                                                  random_state=76244)],
                                                  meta_classifier=LogisticRegression(max_iter=1000),
                                                  random_state=76244,
                                                  shuffle=False),
                   n_jobs=-1,
                   param_distributions={'kneighborsclassifier__leaf_size': [10,
                                                                            30,
                                                                            50],
                                        'kneighborsclassifi

In [10]:
best_estimator = grid.best_estimator_
best_estimator.fit(X_train_array, y_train)

StackingCVClassifier(classifiers=[KNeighborsClassifier(leaf_size=50),
                                  RandomForestClassifier(max_features='sqrt',
                                                         random_state=76244),
                                  LogisticRegression(C=1, max_iter=1000,
                                                     random_state=76244)],
                     meta_classifier=LogisticRegression(C=1, max_iter=1000),
                     random_state=76244, shuffle=False)

In [11]:
best_estimator

StackingCVClassifier(classifiers=[KNeighborsClassifier(leaf_size=50),
                                  RandomForestClassifier(max_features='sqrt',
                                                         random_state=76244),
                                  LogisticRegression(C=1, max_iter=1000,
                                                     random_state=76244)],
                     meta_classifier=LogisticRegression(C=1, max_iter=1000),
                     random_state=76244, shuffle=False)

In [12]:
pkl_filename = "best_estimator.pkl"
with open(pkl_filename, 'wb') as file:
    pickle.dump(best_estimator, file)



In [24]:

thresholds = list(np.arange(0.01, 1.01, 0.01))

 

y_hat_proba = best_estimator.predict_proba(X_test_array)

 

#Pass in a single dim array from predict
# y_hat[:,1]
def calcCost(truth, probabilities, threshold, fp = 10, fn = 500):
    y_hat = (probabilities > threshold).astype(int)
    mt = metrics.confusion_matrix(truth, y_hat)
    acc = metrics.accuracy_score(truth, y_hat)
    recall = metrics.recall_score(truth, y_hat)
    return {
        "probability":threshold,
        "accuracy": acc,
        "recall": recall,
        "fn":mt[1,0],
        "fn_cost":mt[1,0]*fn,
        "fp":mt[0,1],
        "fp_cost":mt[0,1]*fp
    }

 



In [88]:
cost = [ calcCost(y_test, y_hat_proba[:,1], threshold) for threshold in tqdm(thresholds) ]
    


  0%|                                                                                          | 0/100 [00:00<?, ?it/s][A
  3%|██▍                                                                               | 3/100 [00:00<00:03, 25.91it/s][A
  6%|████▉                                                                             | 6/100 [00:00<00:03, 26.74it/s][A
  9%|███████▍                                                                          | 9/100 [00:00<00:03, 27.28it/s][A
 12%|█████████▋                                                                       | 12/100 [00:00<00:03, 27.49it/s][A
 15%|████████████▏                                                                    | 15/100 [00:00<00:03, 27.68it/s][A
 18%|██████████████▌                                                                  | 18/100 [00:00<00:03, 27.27it/s][A
 21%|█████████████████                                                                | 21/100 [00:00<00:02, 27.52it/s][A
 24%|██████████

In [89]:
cost_df = pd.DataFrame(cost)

cost_df["total_cost"] = cost_df.fn_cost + cost_df.fp_cost




In [90]:
cost_df.sort_values(by=["total_cost"], ascending=True).head(10)

Unnamed: 0,probability,accuracy,recall,fn,fn_cost,fp,fp_cost,total_cost
0,0.01,0.398938,1.0,0,0,19234,192340,192340
1,0.02,0.398938,1.0,0,0,19234,192340,192340
2,0.03,0.398938,1.0,0,0,19234,192340,192340
3,0.04,0.398938,1.0,0,0,19234,192340,192340
4,0.05,0.398938,1.0,0,0,19234,192340,192340
5,0.06,0.398938,1.0,0,0,19234,192340,192340
6,0.07,0.46575,0.988172,151,75500,16945,169450,244950
19,0.2,0.874812,0.908429,1169,584500,2837,28370,612870
18,0.19,0.874812,0.908429,1169,584500,2837,28370,612870
16,0.17,0.874812,0.908429,1169,584500,2837,28370,612870


In [23]:
cost_df.to_csv("stacking_model_cost.csv")

In [24]:
cost_df.tail

<bound method NDFrame.tail of     probability  accuracy  precision     fn  fn_cost     fp  fp_cost  \
0          0.01  0.398938   0.398938      0        0  19234   192340   
1          0.02  0.398938   0.398938      0        0  19234   192340   
2          0.03  0.398938   0.398938      0        0  19234   192340   
3          0.04  0.398938   0.398938      0        0  19234   192340   
4          0.05  0.398938   0.398938      0        0  19234   192340   
..          ...       ...        ...    ...      ...    ...      ...   
95         0.96  0.601063   0.000000  12766  6383000      0        0   
96         0.97  0.601063   0.000000  12766  6383000      0        0   
97         0.98  0.601063   0.000000  12766  6383000      0        0   
98         0.99  0.601063   0.000000  12766  6383000      0        0   
99         1.00  0.601063   0.000000  12766  6383000      0        0   

    total_cost  
0       192340  
1       192340  
2       192340  
3       192340  
4       192340  
.. 

## Run Grid Search for Randomforest, KNN and Logistic Regrssion

In [35]:
ModelsDict = {'RandomForestClassifier':RandomForestClassifier(random_state=76244), 
            'LogisticRegression': LogisticRegression(max_iter=1000,random_state=76244), 
            'KNeighborsClassifier': KNeighborsClassifier()
            }
           

In [36]:
ParamDict = {'RandomForestClassifier': {"min_samples_split": [2,3,4],
                                        "n_estimators": [10,12,15],'max_features': ['auto', 'sqrt', 'log2'] },
            'LogisticRegression': {"tol": [0.001,0.01,0.1], "C": [0.001, 0.01, 1, 3, 100],
                                  "penalty":["none","l2"]},
           'KNeighborsClassifier': {"n_neighbors": [4,5,6,10], 
                                    "leaf_size":[20,30,40], "algorithm":["auto", "ball_tree", "kd_tree"]}}                     


In [40]:
class MyGridSearch:

    def __init__(self, models, params):
        if not set(models.keys()).issubset(set(params.keys())):
            missing_params = list(set(models.keys()) - set(params.keys()))
            raise ValueError("Some estimators are missing parameters: %s" % missing_params)
        self.models = models
        self.params = params
        self.keys = models.keys()
        self.grid_searches = {}

    def fit(self, X, y, cv=3, n_jobs=-1, verbose=1, scoring=None, refit=False):
        for key in self.keys:
            print("Running GridSearchCV for %s." % key)
            model = self.models[key]
            params = self.params[key]
            gs = GridSearchCV(model, params, cv=cv, n_jobs=n_jobs,
                              verbose=verbose, scoring= "roc_auc", refit=refit,
                              return_train_score=True)
            gs.fit(X,y)
            self.grid_searches[key] = gs  
            
       

    def score_summary(self, sort_by='mean_score'):
        def row(key, scores, params):
            d = {
                 'estimator': key,
                 'min_score': min(scores),
                 'max_score': max(scores),
                 'mean_score': np.mean(scores),
                 'std_score': np.std(scores),
            }
            return {**params,**d}

        rows = []
        for k in self.grid_searches:
            print(k)
            params = self.grid_searches[k].cv_results_['params']
            scores = []
            for i in range(self.grid_searches[k].cv):
                key = "split{}_test_score".format(i)
                r = self.grid_searches[k].cv_results_[key]        
                scores.append(r.reshape(len(params),1))

            all_scores = np.hstack(scores)
            for p, s in zip(params,all_scores):
                rows.append((row(k, s, p)))
        return rows
        

In [41]:
# run the Grid Search evaluation method and export results in a table
GSresult = MyGridSearch(ModelsDict, ParamDict)
GSresult.fit(X_train_array, y_train, scoring='roc_auc', n_jobs=-1)


Running GridSearchCV for RandomForestClassifier.
Fitting 3 folds for each of 27 candidates, totalling 81 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:  3.2min finished


Running GridSearchCV for LogisticRegression.
Fitting 3 folds for each of 30 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   38.5s
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:  2.3min finished


Running GridSearchCV for KNeighborsClassifier.
Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed: 533.7min
[Parallel(n_jobs=-1)]: Done 108 out of 108 | elapsed: 1354.4min finished


In [42]:
print("Grid Search Result, Below is the top 5 classifiers and hyperparameter sets:")

results=GSresult.score_summary()
print(results)

Grid Search Result, Below is the top 5 classifiers and hyperparameter sets:
RandomForestClassifier
LogisticRegression
KNeighborsClassifier
[{'max_features': 'auto', 'min_samples_split': 2, 'n_estimators': 10, 'estimator': 'RandomForestClassifier', 'min_score': 0.9456843994882311, 'max_score': 0.9477854874920015, 'mean_score': 0.9469719383005558, 'std_score': 0.0009209206699177587}, {'max_features': 'auto', 'min_samples_split': 2, 'n_estimators': 12, 'estimator': 'RandomForestClassifier', 'min_score': 0.9505594731433529, 'max_score': 0.952190964760545, 'mean_score': 0.9516096421746423, 'std_score': 0.0007439999848816254}, {'max_features': 'auto', 'min_samples_split': 2, 'n_estimators': 15, 'estimator': 'RandomForestClassifier', 'min_score': 0.954876289699377, 'max_score': 0.9569507307043326, 'mean_score': 0.9561777179734422, 'std_score': 0.0009256506218013736}, {'max_features': 'auto', 'min_samples_split': 3, 'n_estimators': 10, 'estimator': 'RandomForestClassifier', 'min_score': 0.9480

In [49]:
grid_df = pd.DataFrame(GSresult.score_summary(sort_by='mean_score'))

RandomForestClassifier
LogisticRegression
KNeighborsClassifier


In [97]:
grid_df.to_csv("GridSearch_result.csv")

In [None]:
grid_df.groupby(['estimator']).max()['mean_score']

In [98]:
grid_df.sort_values(by='mean_score',ascending=False).groupby(['estimator']).head(1)

Unnamed: 0,max_features,min_samples_split,n_estimators,estimator,min_score,max_score,mean_score,std_score,C,penalty,tol,algorithm,leaf_size,n_neighbors
17,sqrt,4.0,15.0,RandomForestClassifier,0.956097,0.958818,0.957197,0.00117,,,,,,
88,,,,KNeighborsClassifier,0.900857,0.905903,0.903941,0.002208,,,,kd_tree,30.0,10.0
44,,,,LogisticRegression,0.757219,0.761613,0.759457,0.001795,1.0,l2,0.1,,,


## Finding best stack model combination of Randomforest, KNN and Logistic Regrssion

In [78]:
estimators = [('rf', RandomForestClassifier(random_state=76244,max_features='sqrt',min_samples_split=4,n_estimators=15)),
              ('lr',  LogisticRegression(max_iter=1000,random_state=76244,C=1,tol=0.1)),
              ('knn',KNeighborsClassifier(algorithm='kd_tree',leaf_size=30,n_neighbors=10))]
    
combo_classifiers = []
for ii in range(2, len(estimators)+1):
    for subset in itertools.combinations(estimators, ii):
        combo_classifiers.append(subset)

In [79]:
combo_classifiers

[(('rf',
   RandomForestClassifier(max_features='sqrt', min_samples_split=4,
                          n_estimators=15, random_state=76244)),
  ('lr', LogisticRegression(C=1, max_iter=1000, random_state=76244, tol=0.1))),
 (('rf',
   RandomForestClassifier(max_features='sqrt', min_samples_split=4,
                          n_estimators=15, random_state=76244)),
  ('knn', KNeighborsClassifier(algorithm='kd_tree', n_neighbors=10))),
 (('lr', LogisticRegression(C=1, max_iter=1000, random_state=76244, tol=0.1)),
  ('knn', KNeighborsClassifier(algorithm='kd_tree', n_neighbors=10))),
 (('rf',
   RandomForestClassifier(max_features='sqrt', min_samples_split=4,
                          n_estimators=15, random_state=76244)),
  ('lr', LogisticRegression(C=1, max_iter=1000, random_state=76244, tol=0.1)),
  ('knn', KNeighborsClassifier(algorithm='kd_tree', n_neighbors=10)))]

In [109]:
def evaluate_model(model):
    cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=1, random_state=76244)
    scores = cross_val_score(model, X_train_array, y_train, scoring='roc_auc', cv=cv, n_jobs=-1, error_score='raise')
    return scores
for combo in combo_classifiers:
    clf = StackingClassifier( estimators=combo, final_estimator=LogisticRegression(max_iter=1000,random_state=76244)
                             ,cv=2,n_jobs=-1,verbose=2)
    score = evaluate_model(model=clf)
    print(f"AUC of stack {combo}: {score.mean():.3f}")


AUC of stack (('rf', RandomForestClassifier(max_features='sqrt', min_samples_split=4,
                       n_estimators=15, random_state=76244)), ('lr', LogisticRegression(C=1, max_iter=1000, random_state=76244, tol=0.1))): 0.958
AUC of stack (('rf', RandomForestClassifier(max_features='sqrt', min_samples_split=4,
                       n_estimators=15, random_state=76244)), ('knn', KNeighborsClassifier(algorithm='kd_tree', n_neighbors=10))): 0.959
AUC of stack (('lr', LogisticRegression(C=1, max_iter=1000, random_state=76244, tol=0.1)), ('knn', KNeighborsClassifier(algorithm='kd_tree', n_neighbors=10))): 0.869
AUC of stack (('rf', RandomForestClassifier(max_features='sqrt', min_samples_split=4,
                       n_estimators=15, random_state=76244)), ('lr', LogisticRegression(C=1, max_iter=1000, random_state=76244, tol=0.1)), ('knn', KNeighborsClassifier(algorithm='kd_tree', n_neighbors=10))): 0.960


## Best Stack Model Randomforest + KNN + Logistic Regrssion

In [110]:
best_model =  StackingClassifier( estimators=combo_classifiers[3], final_estimator=LogisticRegression(max_iter=1000,random_state=76244)
                             ,cv=2,n_jobs=-1,verbose=2)

In [111]:
best_model.fit(X_train_array,y_train)

StackingClassifier(cv=2,
                   estimators=(('rf',
                                RandomForestClassifier(max_features='sqrt',
                                                       min_samples_split=4,
                                                       n_estimators=15,
                                                       random_state=76244)),
                               ('lr',
                                LogisticRegression(C=1, max_iter=1000,
                                                   random_state=76244,
                                                   tol=0.1)),
                               ('knn',
                                KNeighborsClassifier(algorithm='kd_tree',
                                                     n_neighbors=10))),
                   final_estimator=LogisticRegression(max_iter=1000,
                                                      random_state=76244),
                   n_jobs=-1, verbose=2)

In [129]:
with open('best_stack_model.pkl', 'wb') as file:
    pickle.dump(best_model, file)

In [112]:
y_hat_proba = best_model.predict_proba(X_test_array)

In [113]:
cost_stack_model = [ calcCost(y_test, y_hat_proba[:,1], threshold) for threshold in tqdm(thresholds) ]

100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:04<00:00, 24.01it/s]


In [114]:
cost_stack_model_df = pd.DataFrame(cost_stack_model)

cost_stack_model_df["total_cost"] = cost_stack_model_df.fn_cost + cost_stack_model_df.fp_cost
cost_stack_model_df.sort_values(by=["total_cost"], ascending=True).head(10)

Unnamed: 0,probability,accuracy,recall,fn,fn_cost,fp,fp_cost,total_cost
1,0.02,0.613125,0.995065,63,31500,12317,123170,154670
2,0.03,0.67975,0.991227,112,56000,10136,101360,157360
3,0.04,0.722938,0.987858,155,77500,8711,87110,164610
4,0.05,0.752406,0.985273,188,94000,7735,77350,171350
0,0.01,0.487844,0.998747,16,8000,16373,163730,171730
5,0.06,0.775344,0.981905,231,115500,6958,69580,185080
6,0.07,0.792406,0.978458,275,137500,6368,63680,201180
7,0.08,0.806438,0.975403,314,157000,5880,58800,215800
8,0.09,0.818969,0.972035,357,178500,5436,54360,232860
9,0.1,0.827219,0.968823,398,199000,5131,51310,250310


## Randomforest Model

In [116]:
rf =  RandomForestClassifier(max_features='sqrt', min_samples_split=4,
                                                       n_estimators=15,
                                                       random_state=76244)
rf.fit(X_train_array,y_train)

RandomForestClassifier(max_features='sqrt', min_samples_split=4,
                       n_estimators=15, random_state=76244)

In [117]:
rf_y_hat_proba = rf.predict_proba(X_test_array)

In [119]:
cost_rf= [ calcCost(y_test, rf_y_hat_proba[:,1], threshold) for threshold in tqdm(thresholds) ]

100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:03<00:00, 30.07it/s]


In [120]:
cost_rf_df = pd.DataFrame(cost_rf)

cost_rf_df["total_cost"] = cost_rf_df.fn_cost + cost_rf_df.fp_cost
cost_rf_df.sort_values(by=["total_cost"], ascending=True).head(10)

Unnamed: 0,probability,accuracy,recall,fn,fn_cost,fp,fp_cost,total_cost
10,0.11,0.638938,0.99342,84,42000,11470,114700,156700
9,0.1,0.625875,0.994047,76,38000,11896,118960,156960
8,0.09,0.623781,0.994047,76,38000,11963,119630,157630
11,0.12,0.651563,0.992558,95,47500,11055,110550,158050
7,0.08,0.610719,0.994595,69,34500,12388,123880,158380
6,0.07,0.607563,0.994752,67,33500,12491,124910,158410
12,0.13,0.655563,0.992167,100,50000,10922,109220,159220
13,0.14,0.715,0.988642,145,72500,8975,89750,162250
14,0.15,0.721281,0.987858,155,77500,8764,87640,165140
5,0.06,0.524813,0.997102,37,18500,15169,151690,170190


## XGBOOST Model (just trying)

In [5]:
import xgboost as xgb
from xgboost import XGBClassifier
dtrain = xgb.DMatrix(X_train_array, label=y_train)
dtest = xgb.DMatrix(X_test_array, label=y_test)

In [6]:
estimator = XGBClassifier(
    objective= 'binary:logistic',
    nthread=4,
    seed=76244
)

In [13]:
parameters = {
    'max_depth': range (1, 12, 3),
    'n_estimators': range(60, 220, 80),
    'learning_rate': [0.1, 0.01, 0.05],
    'min_child_weight':[5,6,7]
}

In [14]:
grid_search = GridSearchCV(
    estimator=estimator,
    param_grid=parameters,
    scoring = 'roc_auc',
    n_jobs = -1,
    cv = 2,
    verbose=True
)

In [15]:
grid_search.fit(X_train_array, y_train)

Fitting 2 folds for each of 72 candidates, totalling 144 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed: 10.3min
[Parallel(n_jobs=-1)]: Done 144 out of 144 | elapsed: 58.7min finished


GridSearchCV(cv=2,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=100, n_jobs...,
                                     num_parallel_tree=None, random_state=None,
                                     reg_alpha=None, reg_lambda=None,
                                     scale_pos_weight=None, seed=76244,
                                     subsample=None, tree_me

In [28]:
best_model = grid_search.best_estimator_

In [29]:
best_model 

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=10,
              min_child_weight=7, missing=nan, monotone_constraints='()',
              n_estimators=140, n_jobs=4, nthread=4, num_parallel_tree=1,
              random_state=76244, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              seed=76244, subsample=1, tree_method='exact',
              validate_parameters=1, verbosity=None)

In [40]:
eval_set = [(X_test_array, y_test)]
best_model .fit(X_train_array, y_train, early_stopping_rounds=10, eval_metric="aucpr", eval_set=eval_set, verbose=True)

[0]	validation_0-aucpr:0.87925
Will train until validation_0-aucpr hasn't improved in 10 rounds.
[1]	validation_0-aucpr:0.90897
[2]	validation_0-aucpr:0.91733
[3]	validation_0-aucpr:0.92788
[4]	validation_0-aucpr:0.92981
[5]	validation_0-aucpr:0.93371
[6]	validation_0-aucpr:0.93760
[7]	validation_0-aucpr:0.94107
[8]	validation_0-aucpr:0.94260
[9]	validation_0-aucpr:0.94430
[10]	validation_0-aucpr:0.94674
[11]	validation_0-aucpr:0.94784
[12]	validation_0-aucpr:0.94890
[13]	validation_0-aucpr:0.94955
[14]	validation_0-aucpr:0.95030
[15]	validation_0-aucpr:0.95208
[16]	validation_0-aucpr:0.95328
[17]	validation_0-aucpr:0.95431
[18]	validation_0-aucpr:0.95521
[19]	validation_0-aucpr:0.95580
[20]	validation_0-aucpr:0.95690
[21]	validation_0-aucpr:0.95788
[22]	validation_0-aucpr:0.95844
[23]	validation_0-aucpr:0.95928
[24]	validation_0-aucpr:0.96040
[25]	validation_0-aucpr:0.96100
[26]	validation_0-aucpr:0.96161
[27]	validation_0-aucpr:0.96256
[28]	validation_0-aucpr:0.96299
[29]	validation_

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=10,
              min_child_weight=7, missing=nan, monotone_constraints='()',
              n_estimators=140, n_jobs=4, nthread=4, num_parallel_tree=1,
              random_state=76244, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              seed=76244, subsample=1, tree_method='exact',
              validate_parameters=1, verbosity=None)

In [41]:
xgb_y_pred1 = best_model.predict_proba(X_test_array)

In [42]:
cost_xgb_model1 = [ calcCost(y_test, xgb_y_pred1[:,1], threshold) for threshold in tqdm(thresholds) ]

100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:06<00:00, 15.84it/s]


In [43]:
cost_xgb_df1 = pd.DataFrame(cost_xgb_model1)

cost_xgb_df1["total_cost"] = cost_xgb_df1.fn_cost + cost_xgb_df1.fp_cost
cost_xgb_df1.sort_values(by=["total_cost"], ascending=True).head(10)

Unnamed: 0,probability,accuracy,recall,fn,fn_cost,fp,fp_cost,total_cost
5,0.06,0.788969,0.992793,92,46000,6661,66610,112610
4,0.05,0.765938,0.99389,78,39000,7412,74120,113120
6,0.07,0.809719,0.991305,111,55500,5978,59780,115280
3,0.04,0.733812,0.994752,67,33500,8451,84510,118010
7,0.08,0.823063,0.989817,130,65000,5532,55320,120320
8,0.09,0.836,0.98872,144,72000,5104,51040,123040
2,0.03,0.690719,0.995692,55,27500,9842,98420,125920
9,0.1,0.8475,0.987623,158,79000,4722,47220,126220
10,0.11,0.857875,0.98637,174,87000,4374,43740,130740
11,0.12,0.866031,0.98543,186,93000,4101,41010,134010


In [35]:
model = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.2, max_delta_step=0, max_depth=10,
              min_child_weight=8, monotone_constraints='()',
              n_estimators=140, n_jobs=4, nthread=4, num_parallel_tree=1,
              random_state=76244, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              seed=76244, subsample=1, tree_method='exact',
              validate_parameters=1, verbosity=None)

In [36]:
eval_set = [(X_test_array, y_test)]
model.fit(X_train_array, y_train, early_stopping_rounds=10, eval_metric="aucpr", eval_set=eval_set, verbose=True)

[0]	validation_0-aucpr:0.87894
Will train until validation_0-aucpr hasn't improved in 10 rounds.
[1]	validation_0-aucpr:0.91237
[2]	validation_0-aucpr:0.92263
[3]	validation_0-aucpr:0.93059
[4]	validation_0-aucpr:0.93827
[5]	validation_0-aucpr:0.94264
[6]	validation_0-aucpr:0.94452
[7]	validation_0-aucpr:0.94818
[8]	validation_0-aucpr:0.95056
[9]	validation_0-aucpr:0.95276
[10]	validation_0-aucpr:0.95589
[11]	validation_0-aucpr:0.95749
[12]	validation_0-aucpr:0.95877
[13]	validation_0-aucpr:0.96028
[14]	validation_0-aucpr:0.96185
[15]	validation_0-aucpr:0.96269
[16]	validation_0-aucpr:0.96399
[17]	validation_0-aucpr:0.96464
[18]	validation_0-aucpr:0.96502
[19]	validation_0-aucpr:0.96563
[20]	validation_0-aucpr:0.96597
[21]	validation_0-aucpr:0.96641
[22]	validation_0-aucpr:0.96708
[23]	validation_0-aucpr:0.96825
[24]	validation_0-aucpr:0.96894
[25]	validation_0-aucpr:0.96950
[26]	validation_0-aucpr:0.96965
[27]	validation_0-aucpr:0.97017
[28]	validation_0-aucpr:0.97075
[29]	validation_

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.2, max_delta_step=0, max_depth=10,
              min_child_weight=8, missing=nan, monotone_constraints='()',
              n_estimators=140, n_jobs=4, nthread=4, num_parallel_tree=1,
              random_state=76244, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              seed=76244, subsample=1, tree_method='exact',
              validate_parameters=1, verbosity=None)

In [37]:
xgb_y_pred = model.predict_proba(X_test_array)

In [38]:
cost_xgb_model = [ calcCost(y_test, xgb_y_pred[:,1], threshold) for threshold in tqdm(thresholds) ]

100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:05<00:00, 16.86it/s]


In [39]:
cost_xgb_df = pd.DataFrame(cost_xgb_model)

cost_xgb_df["total_cost"] = cost_xgb_df.fn_cost + cost_xgb_df.fp_cost
cost_xgb_df.sort_values(by=["total_cost"], ascending=True).head(10)

Unnamed: 0,probability,accuracy,recall,fn,fn_cost,fp,fp_cost,total_cost
4,0.05,0.802,0.992245,99,49500,6237,62370,111870
5,0.06,0.821375,0.991227,112,56000,5604,56040,112040
3,0.04,0.776438,0.993263,86,43000,7068,70680,113680
6,0.07,0.834125,0.990052,127,63500,5181,51810,115310
2,0.03,0.741375,0.99436,72,36000,8204,82040,118040
7,0.08,0.845313,0.988172,151,75500,4799,47990,123490
1,0.02,0.690375,0.995457,58,29000,9850,98500,127500
8,0.09,0.855437,0.986448,173,86500,4453,44530,131030
9,0.1,0.864437,0.98543,186,93000,4152,41520,134520
10,0.11,0.871875,0.983785,207,103500,3893,38930,142430


## Second best Stack model Randomforest + KNN

In [131]:
second_best_stack_model = StackingClassifier( estimators=combo_classifiers[1], final_estimator=LogisticRegression(max_iter=1000,random_state=76244)
                             ,cv=2,n_jobs=-1,verbose=2)

In [132]:
second_best_stack_model.fit(X_train_array,y_train)

StackingClassifier(cv=2,
                   estimators=(('rf',
                                RandomForestClassifier(max_features='sqrt',
                                                       min_samples_split=4,
                                                       n_estimators=15,
                                                       random_state=76244)),
                               ('knn',
                                KNeighborsClassifier(algorithm='kd_tree',
                                                     n_neighbors=10))),
                   final_estimator=LogisticRegression(max_iter=1000,
                                                      random_state=76244),
                   n_jobs=-1, verbose=2)

In [144]:
with open('second_best_stack_model.pkl', 'wb') as file:
    pickle.dump(second_best_stack_model, file)

In [135]:
second_y_hat_proba = second_best_stack_model.predict_proba(X_test_array)

In [136]:
cost_secon_stack_model = [ calcCost(y_test, second_y_hat_proba[:,1], threshold) for threshold in tqdm(thresholds) ]


  0%|                                                                                          | 0/100 [00:00<?, ?it/s][A
  3%|██▍                                                                               | 3/100 [00:00<00:03, 25.44it/s][A
  6%|████▉                                                                             | 6/100 [00:00<00:03, 25.33it/s][A
  9%|███████▍                                                                          | 9/100 [00:00<00:03, 25.09it/s][A
 12%|█████████▋                                                                       | 12/100 [00:00<00:03, 25.15it/s][A
 15%|████████████▏                                                                    | 15/100 [00:00<00:03, 24.93it/s][A
 18%|██████████████▌                                                                  | 18/100 [00:00<00:03, 24.79it/s][A
 21%|█████████████████                                                                | 21/100 [00:00<00:03, 24.81it/s][A
 24%|██████████

In [137]:
cost_secon_stack_df = pd.DataFrame(cost_secon_stack_model)

cost_secon_stack_df["total_cost"] = cost_secon_stack_df.fn_cost + cost_secon_stack_df.fp_cost
cost_secon_stack_df.sort_values(by=["total_cost"], ascending=True).head(10)

Unnamed: 0,probability,accuracy,recall,fn,fn_cost,fp,fp_cost,total_cost
1,0.02,0.6085,0.995065,63,31500,12465,124650,156150
2,0.03,0.680406,0.991305,111,55500,10116,101160,156660
3,0.04,0.717156,0.988485,147,73500,8904,89040,162540
4,0.05,0.749719,0.985508,185,92500,7824,78240,170740
0,0.01,0.476406,0.998825,15,7500,16740,167400,174900
5,0.06,0.7695,0.982218,227,113500,7149,71490,184990
6,0.07,0.790656,0.979477,262,131000,6437,64370,195370
7,0.08,0.806063,0.975638,311,155500,5895,58950,214450
8,0.09,0.816656,0.972897,346,173000,5521,55210,228210
9,0.1,0.825937,0.969763,386,193000,5184,51840,244840
