In [1]:
from utils.loader import FactoryLoader
from utils.ml import MLPipeline
from utils.preprocessing import PreprocessingFactory
from utils.feature_extraction import *
from utils.utils import *

VAL_PATH = r"C:\Users\gimes\Src\repos\CADx-Project\dataset\binary\val"
TRAIN_PATH = r"C:\Users\gimes\Src\repos\CADx-Project\dataset\binary\train"

In [2]:
percent = 100
random = False

# Initialize the FactoryLoader
factory = PreprocessingFactory()
factory.gaussian_smoothing(3)
factory.clahe(clip_limit=2)
factory.pad2square(fill=0)
factory.resize((150,150))
factory.normalize2float()

factory_loader = FactoryLoader(path=VAL_PATH, batch_size=32, factory=factory, percentage=percent, shuffle=random)

# Create the feature extraction pipeline strategy and add desired features
strategy = FeatureExtractionStrategy()

strategy.add_extractor(MeanExtractor())  # Add mean feature
strategy.add_extractor(StdExtractor())   # Add standard deviation feature
strategy.add_extractor(VarExtractor())   # Add variance feature

strategy.add_extractor(MeanExtractor("lab"))  # Add mean feature
strategy.add_extractor(StdExtractor("lab"))   # Add standard deviation feature
strategy.add_extractor(VarExtractor("lab"))   # Add variance feature

strategy.add_extractor(MeanExtractor("hsv"))  # Add mean feature
strategy.add_extractor(StdExtractor("hsv"))   # Add standard deviation feature
strategy.add_extractor(VarExtractor("hsv"))   # Add variance feature

strategy.add_extractor(LBPExtractor(radius=1, n_points=8))
strategy.add_extractor(LBPExtractor(radius=1, n_points=16))

strategy.add_extractor(GLCMExtractor(properties=['contrast', 'dissimilarity', 'homogeneity', 'energy', 'correlation',]))

pipeline = MLPipeline(dataset_path=TRAIN_PATH, preprocessing_factory=factory, feature_strategy=strategy, classifiers=[], percentage=percent, verbose=True, shuffle=random)

INFO:utils.ml:MLPipeline initialized with dataset path: C:\Users\gimes\Src\repos\CADx-Project\dataset\binary\train
INFO:utils.ml:Preprocessing steps


In [3]:
pipeline.loader.get_transformation_steps()

{'smoothing': {'kernel_size': 3},
 'clahe': {'clip_limit': 2, 'tile_grid_size': (8, 8)},
 'pad2square': {'fill': 0},
 'resize': {'size': (150, 150)},
 '01_norm': {}}

In [4]:
pipeline.get_feature_names()

['mean_rgb_channel_0',
 'mean_rgb_channel_1',
 'mean_rgb_channel_2',
 'std_rgb_channel_0',
 'std_rgb_channel_1',
 'std_rgb_channel_2',
 'var_rgb_channel_0',
 'var_rgb_channel_1',
 'var_rgb_channel_2',
 'mean_lab_channel_0',
 'mean_lab_channel_1',
 'mean_lab_channel_2',
 'std_lab_channel_0',
 'std_lab_channel_1',
 'std_lab_channel_2',
 'var_lab_channel_0',
 'var_lab_channel_1',
 'var_lab_channel_2',
 'mean_hsv_channel_0',
 'mean_hsv_channel_1',
 'mean_hsv_channel_2',
 'std_hsv_channel_0',
 'std_hsv_channel_1',
 'std_hsv_channel_2',
 'var_hsv_channel_0',
 'var_hsv_channel_1',
 'var_hsv_channel_2',
 'lbp_rad1_bins8_0',
 'lbp_rad1_bins8_1',
 'lbp_rad1_bins8_2',
 'lbp_rad1_bins8_3',
 'lbp_rad1_bins8_4',
 'lbp_rad1_bins8_5',
 'lbp_rad1_bins8_6',
 'lbp_rad1_bins8_7',
 'lbp_rad1_bins8_8',
 'lbp_rad1_bins8_9',
 'lbp_rad1_bins16_0',
 'lbp_rad1_bins16_1',
 'lbp_rad1_bins16_2',
 'lbp_rad1_bins16_3',
 'lbp_rad1_bins16_4',
 'lbp_rad1_bins16_5',
 'lbp_rad1_bins16_6',
 'lbp_rad1_bins16_7',
 'lbp_rad1_

In [5]:
pipeline.run_feature_extraction()

INFO:utils.ml:Running feature extraction...


Processed 5/634 batches.
Processed 10/634 batches.
Processed 15/634 batches.
Processed 20/634 batches.
Processed 25/634 batches.
Processed 30/634 batches.
Processed 35/634 batches.
Processed 40/634 batches.
Processed 45/634 batches.
Processed 50/634 batches.
Processed 55/634 batches.
Processed 60/634 batches.
Processed 65/634 batches.
Processed 70/634 batches.
Processed 75/634 batches.
Processed 80/634 batches.
Processed 85/634 batches.
Processed 90/634 batches.
Processed 95/634 batches.
Processed 100/634 batches.
Processed 105/634 batches.
Processed 110/634 batches.
Processed 115/634 batches.
Processed 120/634 batches.
Processed 125/634 batches.
Processed 130/634 batches.
Processed 135/634 batches.
Processed 140/634 batches.
Processed 145/634 batches.
Processed 150/634 batches.
Processed 155/634 batches.
Processed 160/634 batches.
Processed 165/634 batches.
Processed 170/634 batches.
Processed 175/634 batches.
Processed 180/634 batches.
Processed 185/634 batches.
Processed 190/634 bat

INFO:utils.ml:Feature extraction completed. Extracted 15195 features.


Processed 634/634 batches.


In [6]:
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier

rf1 = RandomForestClassifier(n_estimators=100)
rf2 = RandomForestClassifier(n_estimators=150)
rf3 = RandomForestClassifier(n_estimators=250)

xgb1 = XGBClassifier(n_estimators=250)
xgb2 = XGBClassifier(n_estimators=350)
xgb3 = XGBClassifier(n_estimators=450)

pipeline.classifiers = [rf1, rf2, rf3,
                        # svm1, svm2, svm3, svm6, svm7, svm8,
                        xgb1, xgb2, xgb3]

In [7]:
pipeline.fit_classifiers()

INFO:utils.ml:Fitting classifiers...
INFO:utils.ml:Fitting classifier: RandomForestClassifier0
INFO:utils.ml:Fitted classifier: RandomForestClassifier0; Done in 8.328920841217041 seconds
INFO:utils.ml:Fitting classifier: RandomForestClassifier1
INFO:utils.ml:Fitted classifier: RandomForestClassifier1; Done in 12.497383832931519 seconds
INFO:utils.ml:Fitting classifier: RandomForestClassifier2
INFO:utils.ml:Fitted classifier: RandomForestClassifier2; Done in 21.323915004730225 seconds
INFO:utils.ml:Fitting classifier: XGBClassifier3
INFO:utils.ml:Fitted classifier: XGBClassifier3; Done in 0.8184385299682617 seconds
INFO:utils.ml:Fitting classifier: XGBClassifier4
INFO:utils.ml:Fitted classifier: XGBClassifier4; Done in 1.1810214519500732 seconds
INFO:utils.ml:Fitting classifier: XGBClassifier5
INFO:utils.ml:Fitted classifier: XGBClassifier5; Done in 1.4790241718292236 seconds
INFO:utils.ml:Fitting completed in 45.66 seconds.


In [8]:
pipeline.predict_with_classifiers(VAL_PATH)

INFO:utils.ml:Predicting with classifiers on dataset: C:\Users\gimes\Src\repos\CADx-Project\dataset\binary\val


Processed 5/159 batches.
Processed 10/159 batches.
Processed 15/159 batches.
Processed 20/159 batches.
Processed 25/159 batches.
Processed 30/159 batches.
Processed 35/159 batches.
Processed 40/159 batches.
Processed 45/159 batches.
Processed 50/159 batches.
Processed 55/159 batches.
Processed 60/159 batches.
Processed 65/159 batches.
Processed 70/159 batches.
Processed 75/159 batches.
Processed 80/159 batches.
Processed 85/159 batches.
Processed 90/159 batches.
Processed 95/159 batches.
Processed 100/159 batches.
Processed 105/159 batches.
Processed 110/159 batches.
Processed 115/159 batches.
Processed 120/159 batches.
Processed 125/159 batches.
Processed 130/159 batches.
Processed 135/159 batches.
Processed 140/159 batches.
Processed 145/159 batches.
Processed 150/159 batches.
Processed 155/159 batches.


INFO:utils.ml:Predictions made with classifier: RandomForestClassifier0
INFO:utils.ml:Predictions made with classifier: RandomForestClassifier1


Processed 159/159 batches.


INFO:utils.ml:Predictions made with classifier: RandomForestClassifier2
INFO:utils.ml:Predictions made with classifier: XGBClassifier3
INFO:utils.ml:Predictions made with classifier: XGBClassifier4
INFO:utils.ml:Predictions made with classifier: XGBClassifier5


{'GT': array([0, 0, 0, ..., 1, 1, 1]),
 'RandomForestClassifier0': array([0, 0, 1, ..., 0, 0, 1]),
 'RandomForestClassifier1': array([0, 0, 1, ..., 0, 0, 1]),
 'RandomForestClassifier2': array([0, 0, 1, ..., 1, 0, 1]),
 'XGBClassifier3': array([0, 0, 0, ..., 0, 0, 1]),
 'XGBClassifier4': array([0, 0, 0, ..., 0, 0, 1]),
 'XGBClassifier5': array([0, 0, 0, ..., 0, 0, 1])}

In [10]:
import pandas as pd

df = pd.DataFrame(data=pipeline.calculate_metrics())
df.T

INFO:utils.ml:Metrics for classifier GT: {'accuracy': 1.0, 'precision': 1.0, 'recall': 1.0, 'f1': 1.0}
INFO:utils.ml:Metrics for classifier RandomForestClassifier0: {'accuracy': 0.7982086406743941, 'precision': 0.7983525605310837, 'recall': 0.7982086406743941, 'f1': 0.7982238781806194}
INFO:utils.ml:Metrics for classifier RandomForestClassifier1: {'accuracy': 0.8045310853530031, 'precision': 0.8046749490476727, 'recall': 0.8045310853530031, 'f1': 0.8045458454438899}
INFO:utils.ml:Metrics for classifier RandomForestClassifier2: {'accuracy': 0.8061116965226555, 'precision': 0.8062153197555775, 'recall': 0.8061116965226555, 'f1': 0.8061256917761043}
INFO:utils.ml:Metrics for classifier XGBClassifier3: {'accuracy': 0.8018967334035827, 'precision': 0.8019885603456933, 'recall': 0.8018967334035827, 'f1': 0.8019105930748737}
INFO:utils.ml:Metrics for classifier XGBClassifier4: {'accuracy': 0.8045310853530031, 'precision': 0.8045727269245491, 'recall': 0.8045310853530031, 'f1': 0.8045409626514

Unnamed: 0,accuracy,precision,recall,f1
GT,1.0,1.0,1.0,1.0
RandomForestClassifier0,0.798209,0.798353,0.798209,0.798224
RandomForestClassifier1,0.804531,0.804675,0.804531,0.804546
RandomForestClassifier2,0.806112,0.806215,0.806112,0.806126
XGBClassifier3,0.801897,0.801989,0.801897,0.801911
XGBClassifier4,0.804531,0.804573,0.804531,0.804541
XGBClassifier5,0.807165,0.807158,0.807165,0.80716


In [13]:
pipeline.get_feature_names()

['mean_rgb_channel_0',
 'mean_rgb_channel_1',
 'mean_rgb_channel_2',
 'std_rgb_channel_0',
 'std_rgb_channel_1',
 'std_rgb_channel_2',
 'var_rgb_channel_0',
 'var_rgb_channel_1',
 'var_rgb_channel_2',
 'mean_lab_channel_0',
 'mean_lab_channel_1',
 'mean_lab_channel_2',
 'std_lab_channel_0',
 'std_lab_channel_1',
 'std_lab_channel_2',
 'var_lab_channel_0',
 'var_lab_channel_1',
 'var_lab_channel_2',
 'mean_hsv_channel_0',
 'mean_hsv_channel_1',
 'mean_hsv_channel_2',
 'std_hsv_channel_0',
 'std_hsv_channel_1',
 'std_hsv_channel_2',
 'var_hsv_channel_0',
 'var_hsv_channel_1',
 'var_hsv_channel_2',
 'lbp_rad1_bins8_0',
 'lbp_rad1_bins8_1',
 'lbp_rad1_bins8_2',
 'lbp_rad1_bins8_3',
 'lbp_rad1_bins8_4',
 'lbp_rad1_bins8_5',
 'lbp_rad1_bins8_6',
 'lbp_rad1_bins8_7',
 'lbp_rad1_bins8_8',
 'lbp_rad1_bins8_9',
 'lbp_rad1_bins16_0',
 'lbp_rad1_bins16_1',
 'lbp_rad1_bins16_2',
 'lbp_rad1_bins16_3',
 'lbp_rad1_bins16_4',
 'lbp_rad1_bins16_5',
 'lbp_rad1_bins16_6',
 'lbp_rad1_bins16_7',
 'lbp_rad1_

In [12]:
pipeline.save_feature_matrix_to_excel()

INFO:utils.ml:Saving feature matrix to Excel...
INFO:utils.ml:Feature matrix saved to ./features_smoothing_clahe_pad2square_resize_01_norm.xlsx


'./features_smoothing_clahe_pad2square_resize_01_norm.xlsx'

# New XGB

In [14]:
# XGBClassifier with lower learning rate and increased complexity with more estimators
xgb4 = XGBClassifier(learning_rate=0.05, n_estimators=400, max_depth=3, min_child_weight=1, subsample=0.8, colsample_bytree=0.8)
# XGBClassifier with higher max depth and adjusted subsample for more complex patterns
xgb5 = XGBClassifier(learning_rate=0.1, n_estimators=400, max_depth=7, min_child_weight=3, subsample=0.7, colsample_bytree=0.7)
# XGBClassifier with regularization through gamma and higher min_child_weight
xgb6 = XGBClassifier(learning_rate=0.1, n_estimators=400, max_depth=5, min_child_weight=5, gamma=0.2, subsample=0.8, colsample_bytree=0.8)
# XGBClassifier configured for imbalanced classes with adjusted scale_pos_weight
xgb7 = XGBClassifier(learning_rate=0.1, n_estimators=400, max_depth=5, min_child_weight=3, subsample=0.8, colsample_bytree=0.8, scale_pos_weight=10)
# XGBClassifier with high regularization parameters for overfitting control
xgb8 = XGBClassifier(learning_rate=0.1, n_estimators=400, max_depth=5, min_child_weight=1, subsample=0.8, colsample_bytree=0.8, reg_alpha=0.1, reg_lambda=0.1)
# XGBClassifier with a low learning rate and high number of estimators for gradual learning
xgb9 = XGBClassifier(learning_rate=0.01, n_estimators=400, max_depth=3, min_child_weight=1, subsample=1.0, colsample_bytree=1.0)


pipeline.classifiers.extend([
    xgb4, xgb5, xgb6, xgb7, xgb8, xgb9
])

In [15]:
pipeline.fit_classifiers()

INFO:utils.ml:Fitting classifiers...
INFO:utils.ml:Fitting classifier: RandomForestClassifier0
INFO:utils.ml:Fitted classifier: RandomForestClassifier0; Done in 12.388845443725586 seconds
INFO:utils.ml:Fitting classifier: RandomForestClassifier1
INFO:utils.ml:Fitted classifier: RandomForestClassifier1; Done in 19.391811847686768 seconds
INFO:utils.ml:Fitting classifier: RandomForestClassifier2
INFO:utils.ml:Fitted classifier: RandomForestClassifier2; Done in 30.037739992141724 seconds
INFO:utils.ml:Fitting classifier: XGBClassifier3
INFO:utils.ml:Fitted classifier: XGBClassifier3; Done in 2.3493878841400146 seconds
INFO:utils.ml:Fitting classifier: XGBClassifier4
INFO:utils.ml:Fitted classifier: XGBClassifier4; Done in 2.3169169425964355 seconds
INFO:utils.ml:Fitting classifier: XGBClassifier5
INFO:utils.ml:Fitted classifier: XGBClassifier5; Done in 8.055252313613892 seconds
INFO:utils.ml:Fitting classifier: XGBClassifier6
INFO:utils.ml:Fitted classifier: XGBClassifier6; Done in 5.8151

In [17]:
pipeline.predict_with_classifiers(VAL_PATH)
df = pd.DataFrame(data=pipeline.calculate_metrics())
df.T

INFO:utils.ml:Predicting with classifiers on dataset: C:\Users\gimes\Src\repos\CADx-Project\dataset\binary\val


Processed 5/159 batches.
Processed 10/159 batches.
Processed 15/159 batches.
Processed 20/159 batches.
Processed 25/159 batches.
Processed 30/159 batches.
Processed 35/159 batches.
Processed 40/159 batches.
Processed 45/159 batches.
Processed 50/159 batches.
Processed 55/159 batches.
Processed 60/159 batches.
Processed 65/159 batches.
Processed 70/159 batches.
Processed 75/159 batches.
Processed 80/159 batches.
Processed 85/159 batches.
Processed 90/159 batches.
Processed 95/159 batches.
Processed 100/159 batches.
Processed 105/159 batches.
Processed 110/159 batches.
Processed 115/159 batches.
Processed 120/159 batches.
Processed 125/159 batches.
Processed 130/159 batches.
Processed 135/159 batches.
Processed 140/159 batches.
Processed 145/159 batches.
Processed 150/159 batches.
Processed 155/159 batches.


INFO:utils.ml:Predictions made with classifier: RandomForestClassifier0
INFO:utils.ml:Predictions made with classifier: RandomForestClassifier1


Processed 159/159 batches.


INFO:utils.ml:Predictions made with classifier: RandomForestClassifier2
INFO:utils.ml:Predictions made with classifier: XGBClassifier3
INFO:utils.ml:Predictions made with classifier: XGBClassifier4
INFO:utils.ml:Predictions made with classifier: XGBClassifier5
INFO:utils.ml:Predictions made with classifier: XGBClassifier6
INFO:utils.ml:Predictions made with classifier: XGBClassifier7
INFO:utils.ml:Predictions made with classifier: XGBClassifier8
INFO:utils.ml:Predictions made with classifier: XGBClassifier9
INFO:utils.ml:Predictions made with classifier: XGBClassifier10
INFO:utils.ml:Predictions made with classifier: XGBClassifier11
INFO:utils.ml:Metrics for classifier GT: {'accuracy': 1.0, 'precision': 1.0, 'recall': 1.0, 'f1': 1.0}
INFO:utils.ml:Metrics for classifier RandomForestClassifier0: {'accuracy': 0.797154899894626, 'precision': 0.7974540523827921, 'recall': 0.797154899894626, 'f1': 0.7971661617466502}
INFO:utils.ml:Metrics for classifier RandomForestClassifier1: {'accuracy':

Unnamed: 0,accuracy,precision,recall,f1
GT,1.0,1.0,1.0,1.0
RandomForestClassifier0,0.797155,0.797454,0.797155,0.797166
RandomForestClassifier1,0.801106,0.801182,0.801106,0.801119
RandomForestClassifier2,0.80216,0.802258,0.80216,0.802174
XGBClassifier3,0.801897,0.801989,0.801897,0.801911
XGBClassifier4,0.804531,0.804573,0.804531,0.804541
XGBClassifier5,0.807165,0.807158,0.807165,0.80716
XGBClassifier6,0.789515,0.789956,0.789515,0.789518
XGBClassifier7,0.806375,0.806367,0.806375,0.806368
XGBClassifier8,0.803477,0.803528,0.803477,0.803488
