Import all the needed libraries.

In [1]:
import os
from pathlib import Path

import joblib
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC

# Switch to the parent of the notebook's start directory so that `src` and
# the relative `data/` / `models/` paths used below resolve.
os.chdir(Path(globals()['_dh'][0]).parent)
from src.myncv import *

Load the data and redo the cleaning / normalization / labeling (since this is a separate notebook).
(As we saw previously, there are no duplicates to drop.)

In [2]:
# Load the raw table and fill missing numeric cells with each column's median.
data = pd.read_csv('data/breast_cancer.csv')
columnMedians = data.median(numeric_only=True)
data = data.fillna(columnMedians)

# Feature matrix: every column except the identifier and the target.
# NOTE(review): no scaling/normalization is applied in this cell — confirm
# whether it is handled elsewhere (e.g. inside RepeatedNestedCV).
X = data.drop(columns=['id', 'diagnosis']).values

# Target vector: the 'diagnosis' labels mapped to integer codes.
labelEncoder = LabelEncoder()
y = labelEncoder.fit_transform(data['diagnosis'])

Form the estimators and parameters to be passed to our helper class.

In [3]:
# Candidate classifiers, keyed by the name used to look up their search space.
estimators = {
    'LogisticRegression': LogisticRegression(random_state=42, solver='saga', max_iter=10000),
    'GaussianNB': GaussianNB(),
    'LDA': LinearDiscriminantAnalysis(),
    'SVM': SVC(random_state=42, probability=True),
    'RandomForest': RandomForestClassifier(random_state=42),
    'LightGBM': lgb.LGBMClassifier(random_state=42, verbose=-1, force_col_wise=True),
}

# Hyperparameter search space per estimator. Each entry describes one
# parameter: its kind ('float' / 'int' / 'categorical'), its bounds or
# allowed values, and optionally whether to sample on a log scale.
# NOTE(review): the exact schema is defined by RepeatedNestedCV in src/myncv —
# confirm field names there before adding new kinds.
paramSpaces = {
    'LogisticRegression': dict(
        C=dict(type='float', low=0.001, high=10, log=True),
        l1_ratio=dict(type='float', low=0, high=1),
        # Single-valued categorical: pins the penalty to elastic net.
        penalty=dict(type='categorical', values=['elasticnet']),
    ),
    # Empty space: nothing is tuned for Gaussian naive Bayes.
    'GaussianNB': dict(),
    'LDA': dict(
        solver=dict(type='categorical', values=['svd', 'lsqr', 'eigen']),
    ),
    'SVM': dict(
        C=dict(type='float', low=0.001, high=10, log=True),
        kernel=dict(type='categorical', values=['rbf', 'linear']),
        gamma=dict(type='float', low=1e-4, high=1, log=True),
    ),
    'RandomForest': dict(
        n_estimators=dict(type='int', low=50, high=200),
        max_depth=dict(type='int', low=2, high=10),
    ),
    'LightGBM': dict(
        num_leaves=dict(type='int', low=10, high=100),
        learning_rate=dict(type='float', low=0.01, high=0.2),
        n_estimators=dict(type='int', low=50, high=200),
    ),
}

Run the whole thing.

In [4]:
# Repeated nested cross-validation over every estimator / search-space pair.
# NOTE(review): nOuter / nInner look like the outer and inner fold counts and
# nTrials the number of tuning trials per search — confirm in src/myncv.
rncv = RepeatedNestedCV(
    estimators,
    paramSpaces,
    nRounds=10,
    nTrials=5,
    nOuter=5,
    nInner=3,
    randomState=42,
)
rncv.fit(X, y)

[I 2025-05-09 21:04:22,071] A new study created in memory with name: no-name-856886fd-7074-4e52-91f7-7ebe222b0d1a


Now starting round 0.





[I 2025-05-09 21:04:22,886] Trial 0 finished with value: 0.7996580214149427 and parameters: {'C': 0.061136113540150964, 'l1_ratio': 0.2358293251688287, 'penalty': 'elasticnet'}. Best is trial 0 with value: 0.7996580214149427.
[I 2025-05-09 21:04:23,693] Trial 1 finished with value: 0.7996580214149427 and parameters: {'C': 0.35194765702319913, 'l1_ratio': 0.730012490603647, 'penalty': 'elasticnet'}. Best is trial 0 with value: 0.7996580214149427.
[I 2025-05-09 21:04:24,352] Trial 2 finished with value: 0.7947073770772364 and parameters: {'C': 0.014748769229698743, 'l1_ratio': 0.9909310152375185, 'penalty': 'elasticnet'}. Best is trial 0 with value: 0.7996580214149427.
[I 2025-05-09 21:04:25,194] Trial 3 finished with value: 0.7996580214149427 and parameters: {'C': 3.8633044528242784, 'l1_ratio': 0.7588077043741085, 'penalty': 'elasticnet'}. Best is trial 0 with value: 0.7996580214149427.
[I 2025-05-09 21:04:26,030] Trial 4 finished with value: 0.7996580214149427 and parameters: {'C': 2.

Now starting round 1.





[I 2025-05-09 21:05:21,684] Trial 0 finished with value: 0.8060467054728854 and parameters: {'C': 0.0018186483368875252, 'l1_ratio': 0.4402859708628162, 'penalty': 'elasticnet'}. Best is trial 0 with value: 0.8060467054728854.
[I 2025-05-09 21:05:22,496] Trial 1 finished with value: 0.8270972744819153 and parameters: {'C': 6.324953866584793, 'l1_ratio': 0.9958185260414374, 'penalty': 'elasticnet'}. Best is trial 1 with value: 0.8270972744819153.
[I 2025-05-09 21:05:23,008] Trial 2 finished with value: 0.800884289662854 and parameters: {'C': 0.003206388502937231, 'l1_ratio': 0.826111165045817, 'penalty': 'elasticnet'}. Best is trial 1 with value: 0.8270972744819153.
[I 2025-05-09 21:05:23,713] Trial 3 finished with value: 0.8164272949066076 and parameters: {'C': 0.010603036890692684, 'l1_ratio': 0.6643222192976618, 'penalty': 'elasticnet'}. Best is trial 1 with value: 0.8270972744819153.
[I 2025-05-09 21:05:24,525] Trial 4 finished with value: 0.8270972744819153 and parameters: {'C': 4.

Now starting round 2.





[I 2025-05-09 21:07:41,922] Trial 0 finished with value: 0.8183125620303452 and parameters: {'C': 0.025071615020302255, 'l1_ratio': 0.35105096931744095, 'penalty': 'elasticnet'}. Best is trial 0 with value: 0.8183125620303452.
[I 2025-05-09 21:07:42,806] Trial 1 finished with value: 0.8183125620303452 and parameters: {'C': 0.09585510299118126, 'l1_ratio': 0.9574920577732119, 'penalty': 'elasticnet'}. Best is trial 0 with value: 0.8183125620303452.
[I 2025-05-09 21:07:43,649] Trial 2 finished with value: 0.8238367229700087 and parameters: {'C': 0.0056727136123577515, 'l1_ratio': 0.0848540578368645, 'penalty': 'elasticnet'}. Best is trial 2 with value: 0.8238367229700087.
[I 2025-05-09 21:07:44,289] Trial 3 finished with value: 0.8185711800821913 and parameters: {'C': 0.003986077133838899, 'l1_ratio': 0.6924323517042081, 'penalty': 'elasticnet'}. Best is trial 2 with value: 0.8238367229700087.
[I 2025-05-09 21:07:45,240] Trial 4 finished with value: 0.8174974365671027 and parameters: {'C

Now starting round 3.





[I 2025-05-09 21:09:17,195] Trial 0 finished with value: 0.8225093200166934 and parameters: {'C': 0.11526000176626917, 'l1_ratio': 0.6205458747359709, 'penalty': 'elasticnet'}. Best is trial 0 with value: 0.8225093200166934.
[I 2025-05-09 21:09:18,033] Trial 1 finished with value: 0.8225093200166934 and parameters: {'C': 0.027267767053228933, 'l1_ratio': 0.0334554542327975, 'penalty': 'elasticnet'}. Best is trial 0 with value: 0.8225093200166934.
[I 2025-05-09 21:09:18,651] Trial 2 finished with value: 0.7904877947155903 and parameters: {'C': 0.0019479112192236356, 'l1_ratio': 0.2218680309472023, 'penalty': 'elasticnet'}. Best is trial 0 with value: 0.8225093200166934.
[I 2025-05-09 21:09:19,489] Trial 3 finished with value: 0.8225093200166934 and parameters: {'C': 0.2029005743446041, 'l1_ratio': 0.46012502249317777, 'penalty': 'elasticnet'}. Best is trial 0 with value: 0.8225093200166934.
[I 2025-05-09 21:09:20,423] Trial 4 finished with value: 0.8170243456337919 and parameters: {'C':

Now starting round 4.





[I 2025-05-09 21:10:47,707] Trial 0 finished with value: 0.8128008886326082 and parameters: {'C': 0.025169747508110206, 'l1_ratio': 0.38190428499700413, 'penalty': 'elasticnet'}. Best is trial 0 with value: 0.8128008886326082.
[I 2025-05-09 21:10:48,391] Trial 1 finished with value: 0.8072012955254154 and parameters: {'C': 0.005111042398773974, 'l1_ratio': 0.8712473718602644, 'penalty': 'elasticnet'}. Best is trial 0 with value: 0.8128008886326082.
[I 2025-05-09 21:10:49,271] Trial 2 finished with value: 0.8128008886326082 and parameters: {'C': 0.28123787513575876, 'l1_ratio': 0.07846267672542084, 'penalty': 'elasticnet'}. Best is trial 0 with value: 0.8128008886326082.
[I 2025-05-09 21:10:50,116] Trial 3 finished with value: 0.8128008886326082 and parameters: {'C': 0.1423398693022496, 'l1_ratio': 0.741556092161361, 'penalty': 'elasticnet'}. Best is trial 0 with value: 0.8128008886326082.
[I 2025-05-09 21:10:51,009] Trial 4 finished with value: 0.8128008886326082 and parameters: {'C': 

Now starting round 5.





[I 2025-05-09 21:13:27,515] Trial 0 finished with value: 0.8115169476435146 and parameters: {'C': 0.0054289488283640495, 'l1_ratio': 0.9607056719738192, 'penalty': 'elasticnet'}. Best is trial 0 with value: 0.8115169476435146.
[I 2025-05-09 21:13:28,323] Trial 1 finished with value: 0.7937304846609444 and parameters: {'C': 0.02226129359960574, 'l1_ratio': 0.9595606132238076, 'penalty': 'elasticnet'}. Best is trial 0 with value: 0.8115169476435146.
[I 2025-05-09 21:13:29,131] Trial 2 finished with value: 0.7990375180531983 and parameters: {'C': 0.03419715785836491, 'l1_ratio': 0.31686593493901394, 'penalty': 'elasticnet'}. Best is trial 0 with value: 0.8115169476435146.
[I 2025-05-09 21:13:29,939] Trial 3 finished with value: 0.7990375180531983 and parameters: {'C': 0.04038788169642813, 'l1_ratio': 0.461218881371839, 'penalty': 'elasticnet'}. Best is trial 0 with value: 0.8115169476435146.
[I 2025-05-09 21:13:30,674] Trial 4 finished with value: 0.7942406676828592 and parameters: {'C': 

Now starting round 6.





[I 2025-05-09 21:15:22,482] Trial 0 finished with value: 0.7956941902434059 and parameters: {'C': 0.37083985846976153, 'l1_ratio': 0.8510003787256845, 'penalty': 'elasticnet'}. Best is trial 0 with value: 0.7956941902434059.
[I 2025-05-09 21:15:23,340] Trial 1 finished with value: 0.7956941902434059 and parameters: {'C': 0.029952759658581154, 'l1_ratio': 0.15421964233553442, 'penalty': 'elasticnet'}. Best is trial 0 with value: 0.7956941902434059.
[I 2025-05-09 21:15:24,312] Trial 2 finished with value: 0.8008480633259382 and parameters: {'C': 4.567028504593941, 'l1_ratio': 0.27949568542479186, 'penalty': 'elasticnet'}. Best is trial 2 with value: 0.8008480633259382.
[I 2025-05-09 21:15:25,256] Trial 3 finished with value: 0.8008480633259382 and parameters: {'C': 0.7659969309136252, 'l1_ratio': 0.6078675583485268, 'penalty': 'elasticnet'}. Best is trial 2 with value: 0.8008480633259382.
[I 2025-05-09 21:15:26,240] Trial 4 finished with value: 0.8008480633259382 and parameters: {'C': 0.

Now starting round 7.





[I 2025-05-09 21:16:49,204] Trial 0 finished with value: 0.7950197403227737 and parameters: {'C': 0.0025648185295435283, 'l1_ratio': 0.43445137030909975, 'penalty': 'elasticnet'}. Best is trial 0 with value: 0.7950197403227737.
[I 2025-05-09 21:16:50,224] Trial 1 finished with value: 0.7839395362274209 and parameters: {'C': 5.449996232500905, 'l1_ratio': 0.7375491541436952, 'penalty': 'elasticnet'}. Best is trial 0 with value: 0.7950197403227737.
[I 2025-05-09 21:16:51,228] Trial 2 finished with value: 0.7839395362274209 and parameters: {'C': 0.4321238474429528, 'l1_ratio': 0.12314221819616278, 'penalty': 'elasticnet'}. Best is trial 0 with value: 0.7950197403227737.
[I 2025-05-09 21:16:52,122] Trial 3 finished with value: 0.7889190622704141 and parameters: {'C': 0.028416626300713906, 'l1_ratio': 0.9498320695412132, 'penalty': 'elasticnet'}. Best is trial 0 with value: 0.7950197403227737.
[I 2025-05-09 21:16:53,084] Trial 4 finished with value: 0.7839395362274209 and parameters: {'C': 

Now starting round 8.





[I 2025-05-09 21:19:29,199] Trial 0 finished with value: 0.8267154832994165 and parameters: {'C': 0.031061142891275408, 'l1_ratio': 0.5558715786473594, 'penalty': 'elasticnet'}. Best is trial 0 with value: 0.8267154832994165.
[I 2025-05-09 21:19:29,998] Trial 1 finished with value: 0.832715122697838 and parameters: {'C': 0.00434019727741795, 'l1_ratio': 0.3290132614049881, 'penalty': 'elasticnet'}. Best is trial 1 with value: 0.832715122697838.
[I 2025-05-09 21:19:30,891] Trial 2 finished with value: 0.8261907837950794 and parameters: {'C': 1.924707779680767, 'l1_ratio': 0.3263558167629609, 'penalty': 'elasticnet'}. Best is trial 1 with value: 0.832715122697838.
[I 2025-05-09 21:19:31,771] Trial 3 finished with value: 0.8261907837950794 and parameters: {'C': 0.8373602994927696, 'l1_ratio': 0.9901576424513591, 'penalty': 'elasticnet'}. Best is trial 1 with value: 0.832715122697838.
[I 2025-05-09 21:19:32,629] Trial 4 finished with value: 0.8267154832994165 and parameters: {'C': 0.031985

Now starting round 9.





[I 2025-05-09 21:21:15,035] Trial 0 finished with value: 0.7894903861552193 and parameters: {'C': 0.0026326429135257725, 'l1_ratio': 0.2887303390205195, 'penalty': 'elasticnet'}. Best is trial 0 with value: 0.7894903861552193.
[I 2025-05-09 21:21:15,841] Trial 1 finished with value: 0.8001340162034647 and parameters: {'C': 0.03530106374291589, 'l1_ratio': 0.95940487412946, 'penalty': 'elasticnet'}. Best is trial 1 with value: 0.8001340162034647.
[I 2025-05-09 21:21:16,696] Trial 2 finished with value: 0.8002441344170226 and parameters: {'C': 0.21361286490972142, 'l1_ratio': 0.6712952335275224, 'penalty': 'elasticnet'}. Best is trial 2 with value: 0.8002441344170226.
[I 2025-05-09 21:21:17,532] Trial 3 finished with value: 0.7948337868169927 and parameters: {'C': 0.13411655582675866, 'l1_ratio': 0.8918400633757382, 'penalty': 'elasticnet'}. Best is trial 2 with value: 0.8002441344170226.
[I 2025-05-09 21:21:18,443] Trial 4 finished with value: 0.8002441344170226 and parameters: {'C': 9.

<src.myncv.RepeatedNestedCV at 0x7fec5e3922d0>

Then retrieve and display our results.

In [8]:
# Collect the metric summary from the fitted nested-CV helper and render it.
# NOTE(review): each cell appears to hold (point estimate, [lower, upper]) per
# metric and estimator — confirm the exact statistics in src/myncv.
results = rncv.getResults()
display(results)

Unnamed: 0,LogisticRegression,GaussianNB,LDA,SVM,RandomForest,LightGBM
MCC,"(0.8185610964404444, [0.708175993372916, 0.876...","(0.8545865420294878, [0.7888634213269922, 0.93...","(0.8986132476675199, [0.8179669523347212, 0.95...","(0.8955476521397039, [0.7931990809775066, 0.93...","(0.8959004429201081, [0.8126711590701251, 0.97...","(0.9169613440442888, [0.8526078664776954, 0.97..."
AUC,"(0.9614488444669366, [0.92810523152834, 0.9884...","(0.9878700657894737, [0.9688939144736842, 0.99...","(0.9932154605263158, [0.9754214638157894, 0.99...","(0.9901579369095816, [0.9687088815789474, 0.99...","(0.988486842105263, [0.971225645242915, 0.9994...","(0.9923930921052632, [0.9747841282894736, 1.0])"
BalancedAccuracy,"(0.8999209261133604, [0.8370147499156546, 0.94...","(0.9258392375168691, [0.8845383666497976, 0.96...","(0.9395559210526316, [0.881578947368421, 0.979...","(0.9449013157894737, [0.8797800164473684, 0.97...","(0.9488075657894737, [0.9045358362854251, 0.98...","(0.9568256578947368, [0.919788240131579, 0.990..."
F1,"(0.8828571428571428, [0.8024324324324325, 0.92...","(0.9078787878787878, [0.8580357142857142, 0.95...","(0.9315068493150684, [0.8656716417910447, 0.97...","(0.9333333333333333, [0.857183257918552, 0.961...","(0.935064935064935, [0.8807012987012987, 0.983...","(0.9473684210526315, [0.9046849315068493, 0.98..."
F2,"(0.8597950268817205, [0.7489639037433155, 0.92...","(0.8994708994708994, [0.8227459016393442, 0.95...","(0.9090909090909091, [0.8011049723756906, 0.97...","(0.9259259259259259, [0.8120396495396495, 0.96...","(0.9317467754382096, [0.8666988416988417, 0.97...","(0.9358288770053476, [0.8834203036053131, 0.99..."
Recall,"(0.8421052631578947, [0.7121963562753036, 0.94...","(0.8947368421052632, [0.7953947368421053, 0.94...","(0.8947368421052632, [0.7631578947368421, 0.97...","(0.9210526315789473, [0.780921052631579, 0.974...","(0.9230769230769231, [0.8480263157894736, 0.97...","(0.9230769230769231, [0.868421052631579, 1.0])"
Specificity,"(0.961298076923077, [0.9108834134615384, 0.996...","(0.96875, [0.90625, 0.9965384615384615])","(1.0, [0.984375, 1.0])","(0.96875, [0.921875, 1.0])","(0.9765625, [0.9221454326923078, 1.0])","(0.984375, [0.953125, 1.0])"
Precision,"(0.9260752688172043, [0.8435406698564593, 0.99...","(0.9444444444444444, [0.8486263736263736, 0.99...","(1.0, [0.96796875, 1.0])","(0.9473684210526315, [0.8738553113553114, 1.0])","(0.9575551782682512, [0.8725160256410257, 1.0])","(0.9722222222222222, [0.918918918918919, 1.0])"
PRAUC,"(0.8395793354859089, [0.7486422910005747, 0.89...","(0.8754104512618445, [0.8001999363776586, 0.93...","(0.9188896107949882, [0.8507292825152545, 0.96...","(0.9022744666436993, [0.8041296538858459, 0.94...","(0.9006777675508325, [0.8201063861464685, 0.97...","(0.9260673563421618, [0.8614062979388057, 0.97..."
NPV,"(0.9111062335381914, [0.8481512762762763, 0.96...","(0.9384615384615385, [0.8895142916321459, 0.96...","(0.9402985074626866, [0.8767123287671232, 0.98...","(0.9538461538461539, [0.8835597826086956, 0.98...","(0.9545454545454546, [0.9155632411067193, 0.98...","(0.9555531167690957, [0.9256200614574188, 1.0])"


From these results, we see that the best contenders are LDA and LightGBM. LightGBM seems to have slightly better results, especially when looking at the 95% percentile intervals.

We will use it to train our model for step 4.

In [6]:
# Candidate values tried exhaustively for the selected model.
paramGrid = dict(
    num_leaves=[5, 10, 31, 63, 100],
    learning_rate=(0.01, 0.15, 0.3, 0.4, 0.5, 0.6, 0.8),
    n_estimators=[50, 100, 200, 300],
)

# Untuned LightGBM instance that the grid search clones for each candidate.
winner = lgb.LGBMClassifier(random_state=42, verbose=-1, force_col_wise=True)

# Shuffled, stratified 5-fold split with a fixed seed for repeatable folds.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Score every grid point by ROC AUC on the full dataset.
gridSearch = GridSearchCV(estimator=winner, param_grid=paramGrid, scoring='roc_auc', cv=cv)
gridSearch.fit(X, y)
print("Best hyperparameters:", gridSearch.best_params_)


Best hyperparameters: {'learning_rate': 0.5, 'n_estimators': 100, 'num_leaves': 31}


Now that we know the best hyperparameters, we will train our final model.

In [7]:
# Refit LightGBM on the full dataset using the hyperparameters chosen by the
# grid search, keeping the same seed and logging options as before.
finalModel = lgb.LGBMClassifier(**gridSearch.best_params_, random_state=42, verbose=-1, force_col_wise=True)
finalModel.fit(X, y)

# joblib.dump does not create missing directories, so make sure the target
# folder exists before persisting the model (a fresh checkout may lack it).
modelPath = './models/final_model.pkl'
Path(modelPath).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(finalModel, modelPath)

['./models/final_model.pkl']