In [20]:
from utils.loader import FactoryLoader
from utils.ml import MLPipeline
from utils.preprocessing import PreprocessingFactory
from utils.feature_extraction import *
from utils.utils import *

VAL_PATH = r"C:\Users\gimes\Src\repos\CADx-Project\dataset\binary\val"
TRAIN_PATH = r"C:\Users\gimes\Src\repos\CADx-Project\dataset\binary\train"
TEST_PATH = r"C:\Users\gimes\Src\repos\CADx-Project\dataset\binary\test"

In [2]:
percent = 100
random = False
batch_size = 24

# Initialize the FactoryLoader
factory = PreprocessingFactory()
factory.pad2square(fill=np.nan)
factory.resize((240,240))
factory.normalize2float()

# Create the feature extraction pipeline strategy and add desired features
strategy = FeatureExtractionStrategy()

strategy.add_extractor(GradientExtractor()) # Add gradient feature

strategy.add_extractor(ColorMomentsExtractor("rgb", ))   # Add color moments feature
strategy.add_extractor(ColorMomentsExtractor("lab", ))   # Add color moments feature
strategy.add_extractor(ColorMomentsExtractor("hsv", ))   # Add color moments feature

strategy.add_extractor(LBPExtractor(radius=1, n_points=8))
strategy.add_extractor(LBPExtractor(radius=2, n_points=16))
strategy.add_extractor(LBPExtractor(radius=3, n_points=24))
strategy.add_extractor(LBPExtractor(radius=4, n_points=32))
strategy.add_extractor(LBPExtractor(radius=5, n_points=40))

strategy.add_extractor(FourierTransformExtractor())
strategy.add_extractor(FFTExtractor())

strategy.add_extractor(GLCMExtractor(properties=['contrast', 'dissimilarity', 'homogeneity', 'energy', 'correlation', "ASM"]))

pipeline = MLPipeline(dataset_path=TRAIN_PATH, preprocessing_factory=factory, feature_strategy=strategy, classifiers=[], percentage=percent, verbose=True, shuffle=random, batch_size=batch_size)

INFO:utils.ml:MLPipeline initialized with dataset path: C:\Users\gimes\Src\repos\CADx-Project\dataset\binary\train
INFO:utils.ml:Preprocessing steps


In [4]:
pipeline.run_feature_extraction()

INFO:utils.ml:Running feature extraction...


Processed 5/634 batches.
Processed 10/634 batches.
Processed 15/634 batches.
Processed 20/634 batches.
Processed 25/634 batches.
Processed 30/634 batches.
Processed 35/634 batches.
Processed 40/634 batches.
Processed 45/634 batches.
Processed 50/634 batches.
Processed 55/634 batches.
Processed 60/634 batches.
Processed 65/634 batches.
Processed 70/634 batches.
Processed 75/634 batches.
Processed 80/634 batches.
Processed 85/634 batches.
Processed 90/634 batches.
Processed 95/634 batches.
Processed 100/634 batches.
Processed 105/634 batches.
Processed 110/634 batches.
Processed 115/634 batches.
Processed 120/634 batches.
Processed 125/634 batches.
Processed 130/634 batches.
Processed 135/634 batches.
Processed 140/634 batches.
Processed 145/634 batches.
Processed 150/634 batches.
Processed 155/634 batches.
Processed 160/634 batches.
Processed 165/634 batches.
Processed 170/634 batches.
Processed 175/634 batches.
Processed 180/634 batches.
Processed 185/634 batches.
Processed 190/634 bat

INFO:utils.ml:Feature extraction completed. Extracted 15195 features.


Processed 634/634 batches.


In [14]:
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier

xgb0 = XGBClassifier(learning_rate=0.1, n_estimators=500, max_depth=7, min_child_weight=1, subsample=0.8, colsample_bytree=0.8)
xgb1 = XGBClassifier(learning_rate=0.1, n_estimators=1250, max_depth=7, min_child_weight=1, subsample=0.8, colsample_bytree=0.8)
xgb2 = XGBClassifier(learning_rate=0.1, n_estimators=1250, max_depth=7, min_child_weight=1, subsample=0.8, colsample_bytree=0.8)
xgb3 = XGBClassifier(learning_rate=0.1, n_estimators=1250, min_child_weight=1, subsample=0.8, colsample_bytree=0.8)
xgb4 = XGBClassifier(learning_rate=0.1, n_estimators=1500, max_depth=7, min_child_weight=1, subsample=0.8, colsample_bytree=0.8)
xgb5 = XGBClassifier(learning_rate=0.1, n_estimators=1500, max_depth=7, min_child_weight=1, subsample=0.8, colsample_bytree=0.8)
xgb6 = XGBClassifier(learning_rate=0.1, n_estimators=1500, min_child_weight=1, subsample=0.8, colsample_bytree=0.8)
xgb7 = XGBClassifier(learning_rate=0.1, n_estimators=1000, max_depth=7, min_child_weight=1, subsample=0.8, colsample_bytree=0.8)
xgb8 = XGBClassifier(learning_rate=0.1, n_estimators=1000, max_depth=7, min_child_weight=1, subsample=0.8, colsample_bytree=0.8)
xgb9 = XGBClassifier(learning_rate=0.1, n_estimators=1000, min_child_weight=1, subsample=0.8, colsample_bytree=0.8)
xgb10 = XGBClassifier(learning_rate=0.1, n_estimators=1250, max_depth=9, min_child_weight=1, subsample=0.8, colsample_bytree=0.8)
xgb11 = XGBClassifier(learning_rate=0.1, n_estimators=1250, max_depth=9, min_child_weight=2, subsample=0.8, colsample_bytree=0.8)

pipeline.classifiers = [xgb1, xgb2, xgb3, xgb4, xgb5, xgb6, xgb7, xgb8, xgb9, xgb10, xgb11]
pipeline.fitted_classifiers = {}

In [15]:
pipeline.fit_classifiers()

INFO:utils.ml:Fitting classifiers...
INFO:utils.ml:Fitting classifier: XGBClassifier0
INFO:utils.ml:Top 10 features for XGBClassifier0: [('fft_radial_mean_121', 0.03146014), ('color_moments_lab_A_min', 0.027667955), ('lbp_rad2_bins16_8', 0.011241277), ('fft_radial_variance_60', 0.010746265), ('fft_radial_variance_62', 0.009892712), ('color_moments_lab_B_skew', 0.008092633), ('color_moments_hsv_H_max', 0.0077363444), ('color_moments_hsv_V_min', 0.0068387813), ('color_moments_hsv_H_min', 0.0067093405), ('lbp_rad2_bins16_1', 0.0065408354)]
INFO:utils.ml:Fitted classifier: XGBClassifier0; Done in 44.57280397415161 seconds
INFO:utils.ml:Fitting classifier: XGBClassifier1
INFO:utils.ml:Top 10 features for XGBClassifier1: [('fft_radial_mean_121', 0.03146014), ('color_moments_lab_A_min', 0.027667955), ('lbp_rad2_bins16_8', 0.011241277), ('fft_radial_variance_60', 0.010746265), ('fft_radial_variance_62', 0.009892712), ('color_moments_lab_B_skew', 0.008092633), ('color_moments_hsv_H_max', 0.0077

In [16]:
pipeline.predict_with_classifiers(VAL_PATH)

INFO:utils.ml:Predicting with classifiers on dataset: C:\Users\gimes\Src\repos\CADx-Project\dataset\binary\val


Processed 5/159 batches.
Processed 10/159 batches.
Processed 15/159 batches.
Processed 20/159 batches.
Processed 25/159 batches.
Processed 30/159 batches.
Processed 35/159 batches.
Processed 40/159 batches.
Processed 45/159 batches.
Processed 50/159 batches.
Processed 55/159 batches.
Processed 60/159 batches.
Processed 65/159 batches.
Processed 70/159 batches.
Processed 75/159 batches.
Processed 80/159 batches.
Processed 85/159 batches.
Processed 90/159 batches.
Processed 95/159 batches.
Processed 100/159 batches.
Processed 105/159 batches.
Processed 110/159 batches.
Processed 115/159 batches.
Processed 120/159 batches.
Processed 125/159 batches.
Processed 130/159 batches.
Processed 135/159 batches.
Processed 140/159 batches.
Processed 145/159 batches.
Processed 150/159 batches.
Processed 155/159 batches.


INFO:utils.ml:Predictions made with classifier: XGBClassifier0
INFO:utils.ml:Predictions made with classifier: XGBClassifier1
INFO:utils.ml:Predictions made with classifier: XGBClassifier2
INFO:utils.ml:Predictions made with classifier: XGBClassifier3
INFO:utils.ml:Predictions made with classifier: XGBClassifier4
INFO:utils.ml:Predictions made with classifier: XGBClassifier5
INFO:utils.ml:Predictions made with classifier: XGBClassifier6


Processed 159/159 batches.


INFO:utils.ml:Predictions made with classifier: XGBClassifier7
INFO:utils.ml:Predictions made with classifier: XGBClassifier8
INFO:utils.ml:Predictions made with classifier: XGBClassifier9
INFO:utils.ml:Predictions made with classifier: XGBClassifier10


{'GT': array([0, 0, 0, ..., 1, 1, 1]),
 'XGBClassifier0': array([0, 0, 0, ..., 0, 0, 1]),
 'XGBClassifier1': array([0, 0, 0, ..., 0, 0, 1]),
 'XGBClassifier2': array([0, 0, 0, ..., 1, 0, 1]),
 'XGBClassifier3': array([0, 0, 0, ..., 0, 0, 1]),
 'XGBClassifier4': array([0, 0, 0, ..., 0, 0, 1]),
 'XGBClassifier5': array([0, 0, 0, ..., 1, 0, 1]),
 'XGBClassifier6': array([0, 0, 0, ..., 0, 0, 1]),
 'XGBClassifier7': array([0, 0, 0, ..., 0, 0, 1]),
 'XGBClassifier8': array([0, 0, 0, ..., 1, 0, 1]),
 'XGBClassifier9': array([0, 0, 0, ..., 0, 1, 1]),
 'XGBClassifier10': array([0, 0, 0, ..., 0, 0, 1])}

In [17]:
import pandas as pd

df = pd.DataFrame(data=pipeline.calculate_metrics())
df = df.T
df

INFO:utils.ml:Metrics for classifier GT: {'accuracy': 1.0, 'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'kappa': 1.0}
INFO:utils.ml:Metrics for classifier XGBClassifier0: {'accuracy': 0.8516859852476291, 'precision': 0.8516364274480883, 'recall': 0.8516548816196305, 'f1': 0.851645122221862, 'kappa': 0.7032904298145117}
INFO:utils.ml:Metrics for classifier XGBClassifier1: {'accuracy': 0.8516859852476291, 'precision': 0.8516364274480883, 'recall': 0.8516548816196305, 'f1': 0.851645122221862, 'kappa': 0.7032904298145117}
INFO:utils.ml:Metrics for classifier XGBClassifier2: {'accuracy': 0.8501053740779768, 'precision': 0.8501300108342362, 'recall': 0.8499729959750812, 'f1': 0.8500301790168034, 'kappa': 0.7000678759170469}
INFO:utils.ml:Metrics for classifier XGBClassifier3: {'accuracy': 0.8516859852476291, 'precision': 0.8516528925619835, 'recall': 0.8516182283415918, 'f1': 0.851634081533124, 'kappa': 0.703268678061175}
INFO:utils.ml:Metrics for classifier XGBClassifier4: {'accuracy': 0.851

Unnamed: 0,accuracy,precision,recall,f1,kappa
GT,1.0,1.0,1.0,1.0,1.0
XGBClassifier0,0.851686,0.851636,0.851655,0.851645,0.70329
XGBClassifier1,0.851686,0.851636,0.851655,0.851645,0.70329
XGBClassifier2,0.850105,0.85013,0.849973,0.85003,0.700068
XGBClassifier3,0.851686,0.851653,0.851618,0.851634,0.703269
XGBClassifier4,0.851686,0.851653,0.851618,0.851634,0.703269
XGBClassifier5,0.851686,0.851723,0.851545,0.851608,0.703225
XGBClassifier6,0.849315,0.849261,0.849315,0.849282,0.698567
XGBClassifier7,0.849315,0.849261,0.849315,0.849282,0.698567
XGBClassifier8,0.849842,0.849832,0.849742,0.849779,0.69956


In [18]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

highest = df[df["accuracy"] == df.iloc[1:]["accuracy"].max()]
idx = highest.index[0]

print(classification_report(pipeline.predictions["GT"], pipeline.predictions[idx]))
print(confusion_matrix(pipeline.predictions["GT"], pipeline.predictions[idx]))


              precision    recall  f1-score   support

           0       0.85      0.85      0.85      1931
           1       0.85      0.85      0.85      1865

    accuracy                           0.85      3796
   macro avg       0.85      0.85      0.85      3796
weighted avg       0.85      0.85      0.85      3796

[[1648  283]
 [ 280 1585]]


In [19]:
df_sorted = df.sort_values("accuracy", ascending=False)
df_sorted.head(8).index
df_sorted

Unnamed: 0,accuracy,precision,recall,f1,kappa
GT,1.0,1.0,1.0,1.0,1.0
XGBClassifier0,0.851686,0.851636,0.851655,0.851645,0.70329
XGBClassifier1,0.851686,0.851636,0.851655,0.851645,0.70329
XGBClassifier3,0.851686,0.851653,0.851618,0.851634,0.703269
XGBClassifier4,0.851686,0.851653,0.851618,0.851634,0.703269
XGBClassifier5,0.851686,0.851723,0.851545,0.851608,0.703225
XGBClassifier9,0.850369,0.850352,0.850278,0.850309,0.70062
XGBClassifier2,0.850105,0.85013,0.849973,0.85003,0.700068
XGBClassifier8,0.849842,0.849832,0.849742,0.849779,0.69956
XGBClassifier6,0.849315,0.849261,0.849315,0.849282,0.698567


In [34]:
test_loader = FactoryLoader(TEST_PATH, pipeline.batch_size, factory, shuffle=False)

# test_loader.show_images(100)

test_feature_matrix, _ = pipeline.feature_strategy.run(test_loader.get_loader())

Processed 5/265 batches.
Processed 10/265 batches.
Processed 15/265 batches.
Processed 20/265 batches.
Processed 25/265 batches.
Processed 30/265 batches.
Processed 35/265 batches.
Processed 40/265 batches.
Processed 45/265 batches.
Processed 50/265 batches.
Processed 55/265 batches.
Processed 60/265 batches.
Processed 65/265 batches.
Processed 70/265 batches.
Processed 75/265 batches.
Processed 80/265 batches.
Processed 85/265 batches.
Processed 90/265 batches.
Processed 95/265 batches.
Processed 100/265 batches.
Processed 105/265 batches.
Processed 110/265 batches.
Processed 115/265 batches.
Processed 120/265 batches.
Processed 125/265 batches.
Processed 130/265 batches.
Processed 135/265 batches.
Processed 140/265 batches.
Processed 145/265 batches.
Processed 150/265 batches.
Processed 155/265 batches.
Processed 160/265 batches.
Processed 165/265 batches.
Processed 170/265 batches.
Processed 175/265 batches.
Processed 180/265 batches.
Processed 185/265 batches.
Processed 190/265 bat

In [35]:
test_predictions = pipeline.fitted_classifiers["XGBClassifier1"].predict(test_feature_matrix)
test_predictions.shape

(6340,)

In [37]:
np.histogram(test_predictions, 2)

(array([3204, 3136], dtype=int64), array([0. , 0.5, 1. ]))