In [1]:
import numpy as np
from sklearn.metrics import confusion_matrix, recall_score, cohen_kappa_score

from utils.loader import FactoryLoader
from utils.ml import MLPipeline
from utils.preprocessing import PreprocessingFactory
from utils.feature_extraction import *
from utils.utils import *

import os

# Dataset locations. The defaults are the original author's local checkout;
# set the CADX_VAL_PATH / CADX_TRAIN_PATH environment variables to run the
# notebook on another machine without editing this cell.
VAL_PATH = os.environ.get(
    "CADX_VAL_PATH",
    r"C:\Users\gimes\Src\repos\CADx-Project\dataset\multiclass\val",
)
TRAIN_PATH = os.environ.get(
    "CADX_TRAIN_PATH",
    r"C:\Users\gimes\Src\repos\CADx-Project\dataset\multiclass\train",
)

INFO:numexpr.utils:Note: NumExpr detected 16 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
INFO:numexpr.utils:NumExpr defaulting to 8 threads.


In [2]:
# --- Run configuration -------------------------------------------------------
percent = 100     # forwarded to MLPipeline as `percentage`
random = False    # forwarded to MLPipeline as `shuffle`
                  # NOTE(review): this name would shadow the stdlib `random` module if it is in scope
batch_size = 24
th = 0.1          # `threshold` argument shared by the colour-moment extractors

# --- Preprocessing -----------------------------------------------------------
# Steps are registered on the PreprocessingFactory in this order:
# pad2square (fill value NaN), resize to (200, 200), normalize2float.
factory = PreprocessingFactory()
factory.pad2square(fill=np.nan)
factory.resize((200, 200))
factory.normalize2float()

# --- Feature extraction ------------------------------------------------------
# Extractors are added in the same order as before: colour moments, LBP,
# Fourier/FFT, GLCM.
strategy = FeatureExtractionStrategy()

# Gradient extractor is currently disabled:
# strategy.add_extractor(GradientExtractor(threshold=th))

# Colour moments, one extractor per colour space.
for color_space in ("rgb", "lab", "hsv"):
    strategy.add_extractor(ColorMomentsExtractor(color_space, threshold=th))

# LBP at radii 1..5 with n_points = 8 * radius (8, 16, 24, 32, 40).
for lbp_radius in range(1, 6):
    strategy.add_extractor(LBPExtractor(radius=lbp_radius, n_points=8 * lbp_radius))

# Frequency-domain extractors, both with default arguments.
for spectral_extractor in (FourierTransformExtractor, FFTExtractor):
    strategy.add_extractor(spectral_extractor())

# GLCM texture properties.
glcm_properties = ['contrast', 'dissimilarity', 'homogeneity', 'energy', 'correlation', 'ASM']
strategy.add_extractor(GLCMExtractor(properties=glcm_properties))

# --- Pipeline ----------------------------------------------------------------
# No classifiers yet; they are attached in a later cell.
pipeline = MLPipeline(
    dataset_path=TRAIN_PATH,
    preprocessing_factory=factory,
    feature_strategy=strategy,
    classifiers=[],
    percentage=percent,
    verbose=True,
    shuffle=random,
    batch_size=batch_size,
)

INFO:utils.ml:MLPipeline initialized with dataset path: C:\Users\gimes\Src\repos\CADx-Project\dataset\multiclass\train
INFO:utils.ml:Preprocessing steps


In [3]:
# Drop any feature matrix already stored on the pipeline, then run feature
# extraction on the dataset the pipeline was created with (TRAIN_PATH).
# NOTE(review): presumably the reset forces a full re-extraction instead of
# reusing a cached matrix — confirm in MLPipeline.
pipeline.feature_matrix = None
pipeline.run_feature_extraction()

INFO:utils.ml:Running feature extraction...


Processed 5/212 batches.
Processed 10/212 batches.
Processed 15/212 batches.
Processed 20/212 batches.
Processed 25/212 batches.
Processed 30/212 batches.
Processed 35/212 batches.
Processed 40/212 batches.
Processed 45/212 batches.
Processed 50/212 batches.
Processed 55/212 batches.
Processed 60/212 batches.
Processed 65/212 batches.
Processed 70/212 batches.
Processed 75/212 batches.
Processed 80/212 batches.
Processed 85/212 batches.
Processed 90/212 batches.
Processed 95/212 batches.
Processed 100/212 batches.
Processed 105/212 batches.
Processed 110/212 batches.
Processed 115/212 batches.
Processed 120/212 batches.
Processed 125/212 batches.
Processed 130/212 batches.
Processed 135/212 batches.
Processed 140/212 batches.
Processed 145/212 batches.
Processed 150/212 batches.
Processed 155/212 batches.
Processed 160/212 batches.
Processed 165/212 batches.
Processed 170/212 batches.
Processed 175/212 batches.
Processed 180/212 batches.
Processed 185/212 batches.
Processed 190/212 bat

INFO:utils.ml:Feature extraction completed. Extracted 5082 features.


Processed 212/212 batches.


In [4]:
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier

# Candidate XGBoost configurations. Every candidate uses learning_rate=0.1;
# they differ in n_estimators and in one of the parameter "recipes" below.
_RECIPE_SUBSAMPLE_07 = dict(max_depth=7, min_child_weight=3, subsample=0.7, colsample_bytree=0.7)
_RECIPE_GAMMA = dict(max_depth=5, min_child_weight=5, gamma=0.2, subsample=0.8, colsample_bytree=0.8)
_RECIPE_REG = dict(max_depth=5, min_child_weight=1, subsample=0.8, colsample_bytree=0.8, reg_alpha=0.1, reg_lambda=0.1)
_RECIPE_DEPTH_7 = dict(max_depth=7, min_child_weight=1, subsample=0.8, colsample_bytree=0.8)
_RECIPE_DEPTH_9 = dict(max_depth=9, min_child_weight=1, subsample=0.8, colsample_bytree=0.8)

# (n_estimators, recipe) per candidate, in the original xgb0..xgb16 order.
# NOTE(review): xgb13..xgb16 repeat the exact parameters of xgb9..xgb12; they
# are kept because later cells and outputs refer to all 17 candidates.
_XGB_GRID = [
    (400, _RECIPE_SUBSAMPLE_07),   # xgb0
    (400, _RECIPE_GAMMA),          # xgb1
    (400, _RECIPE_REG),            # xgb2
    (400, _RECIPE_DEPTH_7),        # xgb3  -- marked "THIS!!" by the author
    (500, _RECIPE_DEPTH_7),        # xgb4
    (750, _RECIPE_SUBSAMPLE_07),   # xgb5
    (750, _RECIPE_GAMMA),          # xgb6  -- marked "THIS" by the author
    (750, _RECIPE_REG),            # xgb7  -- marked "THIS" by the author
    (1000, _RECIPE_SUBSAMPLE_07),  # xgb8
    (1000, _RECIPE_GAMMA),         # xgb9
    (1000, _RECIPE_REG),           # xgb10
    (1000, _RECIPE_DEPTH_7),       # xgb11
    (1000, _RECIPE_DEPTH_9),       # xgb12
    (1000, _RECIPE_GAMMA),         # xgb13 (same parameters as xgb9)
    (1000, _RECIPE_REG),           # xgb14 (same parameters as xgb10)
    (1000, _RECIPE_DEPTH_7),       # xgb15 (same parameters as xgb11)
    (1000, _RECIPE_DEPTH_9),       # xgb16 (same parameters as xgb12)
]

CLFS = [
    XGBClassifier(learning_rate=0.1, n_estimators=n_trees, **recipe)
    for n_trees, recipe in _XGB_GRID
]

# Keep the individual names available for any cell that refers to them.
(xgb0, xgb1, xgb2, xgb3, xgb4, xgb5, xgb6, xgb7, xgb8,
 xgb9, xgb10, xgb11, xgb12, xgb13, xgb14, xgb15, xgb16) = CLFS

# Hand the pipeline its own list (same estimator objects, new list) and start
# with an empty registry of fitted classifiers.
pipeline.classifiers = list(CLFS)
pipeline.fitted_classifiers = {}

___
# Balance the data

In [5]:
# Backup: store copies of the current feature matrix and labels on the
# pipeline. Later cells overwrite pipeline.feature_matrix / pipeline.labels
# with resampled data and read these backups to get the original values.
pipeline.backup_feature_matrix = pipeline.feature_matrix.copy()
pipeline.backup_labels = pipeline.labels.copy()

In [6]:
from imblearn.over_sampling import SMOTE

# Oversample the training data with SMOTE (fixed seed for repeatability).
smote = SMOTE(random_state=42)

# Resample from the backup copies (as the Borderline SMOTE cell does) instead
# of from pipeline.feature_matrix. The cell is then idempotent: re-running it,
# even after another resampling experiment has overwritten
# pipeline.feature_matrix, always starts from the original extracted features.
# np.nan_to_num replaces NaN entries with 0 before they reach SMOTE.
pipeline.feature_matrix, pipeline.labels = smote.fit_resample(
    np.nan_to_num(pipeline.backup_feature_matrix), pipeline.backup_labels)

In [7]:
# Shapes before (backup) and after resampling: feature matrix first, labels second.
for before, after in (
    (pipeline.backup_feature_matrix, pipeline.feature_matrix),
    (pipeline.backup_labels, pipeline.labels),
):
    print(before.shape)
    print(after.shape)

(5082, 515)
(8139, 515)
(5082,)
(8139,)


In [8]:
# Reset the registry of fitted classifiers and give the pipeline a new list of
# the CLFS estimators, then fit them.
# Note: CLFS.copy() is a shallow copy — the list is new, but it holds the same
# estimator objects as CLFS.
# NOTE(review): presumably fit_classifiers() trains on the pipeline's current
# feature_matrix / labels (the SMOTE-resampled data at this point) — confirm.
pipeline.fitted_classifiers = {}
pipeline.classifiers = CLFS.copy()
pipeline.fit_classifiers()

INFO:utils.ml:Fitting classifiers...
INFO:utils.ml:Fitting classifier: XGBClassifier0
INFO:utils.ml:Top 10 features for XGBClassifier0: [('color_moments_rgb_B_iqr', 0.028615378), ('color_moments_lab_L_iqr', 0.01334318), ('fft_radial_variance_60', 0.010134835), ('fft_radial_mean_124', 0.009946537), ('color_moments_lab_B_var', 0.009753419), ('fft_radial_mean_136', 0.009314835), ('fft_radial_mean_119', 0.008991812), ('lbp_rad4_bins32_20', 0.008428258), ('lbp_rad4_bins32_6', 0.007591425), ('fft_high_freq_energy', 0.0075581158)]
INFO:utils.ml:Fitted classifier: XGBClassifier0; Done in 24.457500219345093 seconds
INFO:utils.ml:Fitting classifier: XGBClassifier1
INFO:utils.ml:Top 10 features for XGBClassifier1: [('color_moments_rgb_B_iqr', 0.030106395), ('color_moments_lab_L_iqr', 0.015473072), ('color_moments_lab_B_var', 0.01024112), ('lbp_rad4_bins32_6', 0.008722955), ('lbp_rad1_bins8_4', 0.008626933), ('lbp_rad4_bins32_20', 0.008139322), ('fft_radial_variance_35', 0.0076102023), ('color_mom

In [9]:
# Predict on the validation dataset with the fitted classifiers. `percent` is
# forwarded as the second argument (presumably the share of the dataset to
# load — confirm against MLPipeline). The returned dict, which holds the
# ground truth under "GT" plus one array per classifier, is the cell output.
pipeline.predict_with_classifiers(VAL_PATH, percent)

INFO:utils.ml:Predicting with classifiers on dataset: C:\Users\gimes\Src\repos\CADx-Project\dataset\multiclass\val


Processed 5/53 batches.
Processed 10/53 batches.
Processed 15/53 batches.
Processed 20/53 batches.
Processed 25/53 batches.
Processed 30/53 batches.
Processed 35/53 batches.
Processed 40/53 batches.
Processed 45/53 batches.
Processed 50/53 batches.


INFO:utils.ml:Predictions made with classifier: XGBClassifier0
INFO:utils.ml:Predictions made with classifier: XGBClassifier1
INFO:utils.ml:Predictions made with classifier: XGBClassifier2
INFO:utils.ml:Predictions made with classifier: XGBClassifier3
INFO:utils.ml:Predictions made with classifier: XGBClassifier4
INFO:utils.ml:Predictions made with classifier: XGBClassifier5
INFO:utils.ml:Predictions made with classifier: XGBClassifier6
INFO:utils.ml:Predictions made with classifier: XGBClassifier7
INFO:utils.ml:Predictions made with classifier: XGBClassifier8
INFO:utils.ml:Predictions made with classifier: XGBClassifier9
INFO:utils.ml:Predictions made with classifier: XGBClassifier10
INFO:utils.ml:Predictions made with classifier: XGBClassifier11
INFO:utils.ml:Predictions made with classifier: XGBClassifier12
INFO:utils.ml:Predictions made with classifier: XGBClassifier13
INFO:utils.ml:Predictions made with classifier: XGBClassifier14


Processed 53/53 batches.


INFO:utils.ml:Predictions made with classifier: XGBClassifier15
INFO:utils.ml:Predictions made with classifier: XGBClassifier16


{'GT': array([0, 0, 0, ..., 2, 2, 2]),
 'XGBClassifier0': array([0, 0, 2, ..., 1, 2, 0], dtype=int64),
 'XGBClassifier1': array([0, 0, 2, ..., 0, 2, 2], dtype=int64),
 'XGBClassifier2': array([0, 0, 2, ..., 0, 2, 0], dtype=int64),
 'XGBClassifier3': array([0, 0, 2, ..., 1, 2, 0], dtype=int64),
 'XGBClassifier4': array([0, 0, 2, ..., 1, 2, 0], dtype=int64),
 'XGBClassifier5': array([0, 0, 2, ..., 0, 2, 0], dtype=int64),
 'XGBClassifier6': array([0, 0, 2, ..., 0, 2, 0], dtype=int64),
 'XGBClassifier7': array([0, 0, 2, ..., 0, 2, 0], dtype=int64),
 'XGBClassifier8': array([0, 0, 2, ..., 0, 2, 0], dtype=int64),
 'XGBClassifier9': array([0, 0, 2, ..., 0, 2, 0], dtype=int64),
 'XGBClassifier10': array([0, 0, 2, ..., 0, 2, 0], dtype=int64),
 'XGBClassifier11': array([0, 0, 2, ..., 1, 2, 0], dtype=int64),
 'XGBClassifier12': array([0, 0, 2, ..., 1, 2, 0], dtype=int64),
 'XGBClassifier13': array([0, 0, 2, ..., 0, 2, 0], dtype=int64),
 'XGBClassifier14': array([0, 0, 2, ..., 0, 2, 0], dtype=int6

In [10]:
import pandas as pd

import copy

# Snapshot the SMOTE-trained models and their predictions. A shallow dict
# .copy() only copies the mapping: its values would still be the very same
# estimator objects, so a later re-fit (e.g. the Borderline SMOTE run) could
# silently change what this "snapshot" contains. Deep copies keep it
# independent of anything the pipeline does afterwards.
smote_classifiers = copy.deepcopy(pipeline.fitted_classifiers)
smote_predictions = copy.deepcopy(pipeline.predictions)

# Metrics table: transpose the dict-of-dicts frame so that each entry of the
# metrics dict ("GT" and every classifier) becomes one row.
df_smote = pd.DataFrame(data=pipeline.calculate_metrics()).T
df_smote

INFO:utils.ml:Metrics for classifier GT: {'accuracy': 1.0, 'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'kappa': 1.0}
INFO:utils.ml:Metrics for classifier XGBClassifier0: {'accuracy': 0.8740157480314961, 'precision': 0.7811015976581465, 'recall': 0.7750351181919669, 'f1': 0.7779800360863746, 'kappa': 0.7727171043801172}
INFO:utils.ml:Metrics for classifier XGBClassifier1: {'accuracy': 0.8724409448818897, 'precision': 0.7674425879095989, 'recall': 0.7745849398976689, 'f1': 0.770879351927715, 'kappa': 0.7714040639277524}
INFO:utils.ml:Metrics for classifier XGBClassifier2: {'accuracy': 0.8771653543307086, 'precision': 0.7797001888029075, 'recall': 0.7721366858257103, 'f1': 0.7756882901132224, 'kappa': 0.7786002601547518}
INFO:utils.ml:Metrics for classifier XGBClassifier3: {'accuracy': 0.868503937007874, 'precision': 0.7690750531990416, 'recall': 0.7617194441482745, 'f1': 0.7652600605510687, 'kappa': 0.762522729919292}
INFO:utils.ml:Metrics for classifier XGBClassifier4: {'accuracy': 0.87

Unnamed: 0,accuracy,precision,recall,f1,kappa
GT,1.0,1.0,1.0,1.0,1.0
XGBClassifier0,0.874016,0.781102,0.775035,0.77798,0.772717
XGBClassifier1,0.872441,0.767443,0.774585,0.770879,0.771404
XGBClassifier2,0.877165,0.7797,0.772137,0.775688,0.7786
XGBClassifier3,0.868504,0.769075,0.761719,0.76526,0.762523
XGBClassifier4,0.870866,0.775575,0.766427,0.770798,0.766636
XGBClassifier5,0.877953,0.788769,0.775327,0.781571,0.77941
XGBClassifier6,0.870079,0.761605,0.763591,0.762575,0.766659
XGBClassifier7,0.880315,0.783828,0.777335,0.780428,0.784288
XGBClassifier8,0.876378,0.785921,0.768413,0.776294,0.77635


In [11]:
# Select the classifier with the highest kappa. The "GT" row (ground truth
# scored against itself) is removed *before* matching on the maximum, so it
# can never be picked — previously it was only excluded when computing the
# maximum and would have been selected if a classifier also reached its kappa.
candidates = df_smote.drop(index="GT", errors="ignore")
highest = candidates[candidates["kappa"] == candidates["kappa"].max()]
idx = highest.index[0]  # first classifier in case of ties

# Use the SMOTE prediction snapshot rather than pipeline.predictions, so the
# matrix stays tied to this experiment even if the cell is re-run after a
# later experiment has overwritten the pipeline's predictions.
# Rows are ground-truth classes, columns are predicted classes.
confusion_matrix(smote_predictions["GT"], smote_predictions[idx])

array([[439,  32,  27],
       [ 35, 630,  13],
       [ 32,  13,  49]], dtype=int64)

___
# Borderline SMOTE

In [12]:
from imblearn.over_sampling import BorderlineSMOTE
# Borderline SMOTE, always resampling from the backup (un-resampled) data.
smote = BorderlineSMOTE(sampling_strategy='auto', random_state=42)

# np.nan_to_num replaces NaN entries with 0 before resampling.
features_without_nan = np.nan_to_num(pipeline.backup_feature_matrix)
pipeline.feature_matrix, pipeline.labels = smote.fit_resample(
    features_without_nan, pipeline.backup_labels)

# Shapes before (backup) and after resampling: feature matrix first, labels second.
for before, after in (
    (pipeline.backup_feature_matrix, pipeline.feature_matrix),
    (pipeline.backup_labels, pipeline.labels),
):
    print(before.shape)
    print(after.shape)

(5082, 515)
(8139, 515)
(5082,)
(8139,)


In [13]:
from sklearn.base import clone

# Re-fit on the Borderline SMOTE data with fresh, unfitted copies of the CLFS
# estimators (same hyper-parameters). Without this the pipeline would re-fit
# the very objects used in the SMOTE run, so anything still holding those
# objects (e.g. a shallow snapshot of the SMOTE-fitted classifiers) would end
# up pointing at Borderline-trained models. It also makes this cell reset
# pipeline.classifiers the same way the SMOTE fit cell does.
pipeline.fitted_classifiers = {}
pipeline.classifiers = [clone(clf) for clf in CLFS]

pipeline.fit_classifiers()

INFO:utils.ml:Fitting classifiers...
INFO:utils.ml:Fitting classifier: XGBClassifier0
INFO:utils.ml:Top 10 features for XGBClassifier0: [('color_moments_rgb_B_iqr', 0.031187931), ('fft_radial_variance_14', 0.019234203), ('color_moments_lab_L_iqr', 0.011752352), ('fft_radial_variance_36', 0.009796996), ('color_moments_lab_B_var', 0.009456159), ('fft_radial_mean_17', 0.008479155), ('fft_radial_variance_29', 0.0083014555), ('lbp_rad1_bins8_4', 0.007160972), ('fft_radial_mean_129', 0.0067424076), ('color_moments_lab_A_iqr', 0.0066360957)]
INFO:utils.ml:Fitted classifier: XGBClassifier0; Done in 30.38069486618042 seconds
INFO:utils.ml:Fitting classifier: XGBClassifier1
INFO:utils.ml:Top 10 features for XGBClassifier1: [('color_moments_rgb_B_iqr', 0.034773793), ('color_moments_lab_L_iqr', 0.013143197), ('color_moments_lab_B_var', 0.010433527), ('lbp_rad1_bins8_4', 0.00950224), ('fft_radial_mean_112', 0.0075628855), ('lbp_rad4_bins32_8', 0.007290875), ('color_moments_lab_L_std', 0.0069023524)

In [14]:
# Predict on the validation dataset with the classifiers fitted in the
# previous cell; the call is identical to the one used for the SMOTE run.
# The returned dict ("GT" plus one array per classifier) is the cell output.
pipeline.predict_with_classifiers(VAL_PATH, percent)

INFO:utils.ml:Predicting with classifiers on dataset: C:\Users\gimes\Src\repos\CADx-Project\dataset\multiclass\val


Processed 5/53 batches.
Processed 10/53 batches.
Processed 15/53 batches.
Processed 20/53 batches.
Processed 25/53 batches.
Processed 30/53 batches.
Processed 35/53 batches.
Processed 40/53 batches.
Processed 45/53 batches.
Processed 50/53 batches.


INFO:utils.ml:Predictions made with classifier: XGBClassifier0
INFO:utils.ml:Predictions made with classifier: XGBClassifier1
INFO:utils.ml:Predictions made with classifier: XGBClassifier2
INFO:utils.ml:Predictions made with classifier: XGBClassifier3
INFO:utils.ml:Predictions made with classifier: XGBClassifier4
INFO:utils.ml:Predictions made with classifier: XGBClassifier5
INFO:utils.ml:Predictions made with classifier: XGBClassifier6
INFO:utils.ml:Predictions made with classifier: XGBClassifier7
INFO:utils.ml:Predictions made with classifier: XGBClassifier8
INFO:utils.ml:Predictions made with classifier: XGBClassifier9
INFO:utils.ml:Predictions made with classifier: XGBClassifier10
INFO:utils.ml:Predictions made with classifier: XGBClassifier11


Processed 53/53 batches.


INFO:utils.ml:Predictions made with classifier: XGBClassifier12
INFO:utils.ml:Predictions made with classifier: XGBClassifier13
INFO:utils.ml:Predictions made with classifier: XGBClassifier14
INFO:utils.ml:Predictions made with classifier: XGBClassifier15
INFO:utils.ml:Predictions made with classifier: XGBClassifier16


{'GT': array([0, 0, 0, ..., 2, 2, 2]),
 'XGBClassifier0': array([0, 0, 2, ..., 0, 2, 0], dtype=int64),
 'XGBClassifier1': array([0, 0, 2, ..., 0, 2, 0], dtype=int64),
 'XGBClassifier2': array([0, 0, 2, ..., 0, 2, 2], dtype=int64),
 'XGBClassifier3': array([0, 0, 2, ..., 0, 2, 0], dtype=int64),
 'XGBClassifier4': array([0, 0, 2, ..., 0, 2, 0], dtype=int64),
 'XGBClassifier5': array([0, 0, 2, ..., 0, 2, 0], dtype=int64),
 'XGBClassifier6': array([0, 0, 2, ..., 0, 2, 0], dtype=int64),
 'XGBClassifier7': array([0, 0, 0, ..., 0, 2, 0], dtype=int64),
 'XGBClassifier8': array([0, 0, 2, ..., 0, 2, 0], dtype=int64),
 'XGBClassifier9': array([0, 0, 2, ..., 0, 2, 0], dtype=int64),
 'XGBClassifier10': array([0, 0, 0, ..., 0, 2, 0], dtype=int64),
 'XGBClassifier11': array([0, 0, 2, ..., 0, 2, 0], dtype=int64),
 'XGBClassifier12': array([0, 0, 2, ..., 0, 2, 0], dtype=int64),
 'XGBClassifier13': array([0, 0, 2, ..., 0, 2, 0], dtype=int64),
 'XGBClassifier14': array([0, 0, 0, ..., 0, 2, 0], dtype=int6

In [15]:
# Keep handles to the Borderline SMOTE models and predictions
# (shallow copies of the pipeline's dicts).
borderline_smote_classifiers = pipeline.fitted_classifiers.copy()
borderline_smote_predictions = pipeline.predictions.copy()

# Metrics table: request the five metrics explicitly, then transpose so each
# entry of the metrics dict ("GT" and every classifier) becomes one row.
metric_names = ["accuracy", "precision", "recall", "f1", "kappa"]
df_borderline_smote = pd.DataFrame(data=pipeline.calculate_metrics(metric_names)).T
df_borderline_smote

INFO:utils.ml:Metrics for classifier GT: {'accuracy': 1.0, 'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'kappa': 1.0}
INFO:utils.ml:Metrics for classifier XGBClassifier0: {'accuracy': 0.8818897637795275, 'precision': 0.8050624459279551, 'recall': 0.7900034204476224, 'f1': 0.797020746738094, 'kappa': 0.7864828513786147}
INFO:utils.ml:Metrics for classifier XGBClassifier1: {'accuracy': 0.8834645669291339, 'precision': 0.8020880195094886, 'recall': 0.7916975124498496, 'f1': 0.7965459517233514, 'kappa': 0.7899015009590579}
INFO:utils.ml:Metrics for classifier XGBClassifier2: {'accuracy': 0.8850393700787401, 'precision': 0.803494389506584, 'recall': 0.7930362005355257, 'f1': 0.7979271960822357, 'kappa': 0.7926989606997835}
INFO:utils.ml:Metrics for classifier XGBClassifier3: {'accuracy': 0.8771653543307086, 'precision': 0.7897300537572031, 'recall': 0.7750134410736526, 'f1': 0.7817490503228143, 'kappa': 0.7780655452696102}
INFO:utils.ml:Metrics for classifier XGBClassifier4: {'accuracy': 0.8

Unnamed: 0,accuracy,precision,recall,f1,kappa
GT,1.0,1.0,1.0,1.0,1.0
XGBClassifier0,0.88189,0.805062,0.790003,0.797021,0.786483
XGBClassifier1,0.883465,0.802088,0.791698,0.796546,0.789902
XGBClassifier2,0.885039,0.803494,0.793036,0.797927,0.792699
XGBClassifier3,0.877165,0.78973,0.775013,0.781749,0.778066
XGBClassifier4,0.875591,0.78868,0.773497,0.780489,0.775039
XGBClassifier5,0.882677,0.800481,0.787974,0.793828,0.78813
XGBClassifier6,0.882677,0.798908,0.781865,0.789615,0.787885
XGBClassifier7,0.883465,0.803085,0.785589,0.793551,0.789309
XGBClassifier8,0.88189,0.796492,0.784428,0.790063,0.786752


In [18]:
# Select the classifier with the highest kappa. The "GT" row (ground truth
# scored against itself) is removed *before* matching on the maximum, so it
# can never be picked — previously it was only excluded when computing the
# maximum and would have been selected if a classifier also reached its kappa.
candidates = df_borderline_smote.drop(index="GT", errors="ignore")
highest = candidates[candidates["kappa"] == candidates["kappa"].max()]
idx = highest.index[0]  # first classifier in case of ties

# Use the Borderline SMOTE prediction snapshot rather than
# pipeline.predictions, so the matrix stays tied to this experiment even if
# the pipeline's predictions are overwritten later.
# Rows are ground-truth classes, columns are predicted classes.
confusion_matrix(borderline_smote_predictions["GT"], borderline_smote_predictions[idx])

array([[442,  32,  24],
       [ 39, 629,  10],
       [ 29,  12,  53]], dtype=int64)