In [1]:
import numpy as np
from sklearn.metrics import confusion_matrix, recall_score, cohen_kappa_score

from utils.loader import FactoryLoader
from utils.ml import MLPipeline
from utils.preprocessing import PreprocessingFactory
from utils.feature_extraction import *
from utils.utils import *

import os

# Root folder of the multiclass dataset. Set the CADX_DATASET_ROOT environment
# variable to run the notebook on another machine; when it is unset the
# original local checkout location is used, so existing runs are unaffected.
DATASET_ROOT = os.environ.get(
    "CADX_DATASET_ROOT",
    r"C:\Users\gimes\Src\repos\CADx-Project\dataset\multiclass",
)

# Validation and training splits are sibling sub-folders of the dataset root.
VAL_PATH = os.path.join(DATASET_ROOT, "val")
TRAIN_PATH = os.path.join(DATASET_ROOT, "train")

INFO:numexpr.utils:Note: NumExpr detected 16 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
INFO:numexpr.utils:NumExpr defaulting to 8 threads.


In [32]:
# --- Run configuration ------------------------------------------------------
percent = 100     # forwarded to MLPipeline as `percentage`
# NOTE(review): `random` shares its name with the stdlib module; a later
# `import random` in this kernel would overwrite the flag.
random = False    # forwarded to MLPipeline as `shuffle`
batch_size = 24
th = 0.1          # `threshold` shared by the gradient and colour-moment extractors

# --- Preprocessing ----------------------------------------------------------
# Steps are registered on the PreprocessingFactory in call order:
# pad2square (NaN fill), resize to (200, 200), normalize2float.
factory = PreprocessingFactory()
factory.pad2square(fill=np.nan)
factory.resize((200, 200))
factory.normalize2float()

# --- Feature extraction strategy --------------------------------------------
# Extractors are added in a fixed order: gradient, colour moments, LBP,
# Fourier/FFT, GLCM.
strategy = FeatureExtractionStrategy()

strategy.add_extractor(GradientExtractor(threshold=th))

# One colour-moments extractor per colour space.
for color_space in ("rgb", "lab", "hsv"):
    strategy.add_extractor(ColorMomentsExtractor(color_space, threshold=th))

# LBP at radii 1..5, with 8 sampling points per unit of radius (8, 16, ..., 40).
for lbp_radius in range(1, 6):
    strategy.add_extractor(LBPExtractor(radius=lbp_radius, n_points=8 * lbp_radius))

strategy.add_extractor(FourierTransformExtractor())
strategy.add_extractor(FFTExtractor())

strategy.add_extractor(
    GLCMExtractor(
        properties=['contrast', 'dissimilarity', 'homogeneity', 'energy', 'correlation', 'ASM']
    )
)

# --- Pipeline ---------------------------------------------------------------
# Classifiers are attached in a later cell, hence the empty list here.
pipeline = MLPipeline(
    dataset_path=TRAIN_PATH,
    preprocessing_factory=factory,
    feature_strategy=strategy,
    classifiers=[],
    percentage=percent,
    verbose=True,
    shuffle=random,
    batch_size=batch_size,
)

INFO:utils.ml:MLPipeline initialized with dataset path: C:\Users\gimes\Src\repos\CADx-Project\dataset\multiclass\train
INFO:utils.ml:Preprocessing steps


In [33]:
# Drop any feature matrix already stored on the pipeline, then run extraction.
# NOTE(review): presumably the reset forces run_feature_extraction() to
# recompute instead of reusing an earlier result — confirm in utils.ml.
pipeline.feature_matrix = None
pipeline.run_feature_extraction()

INFO:utils.ml:Running feature extraction...


Processed 5/212 batches.
Processed 10/212 batches.
Processed 15/212 batches.
Processed 20/212 batches.
Processed 25/212 batches.
Processed 30/212 batches.
Processed 35/212 batches.
Processed 40/212 batches.
Processed 45/212 batches.
Processed 50/212 batches.
Processed 55/212 batches.
Processed 60/212 batches.
Processed 65/212 batches.
Processed 70/212 batches.
Processed 75/212 batches.
Processed 80/212 batches.
Processed 85/212 batches.
Processed 90/212 batches.
Processed 95/212 batches.
Processed 100/212 batches.
Processed 105/212 batches.
Processed 110/212 batches.
Processed 115/212 batches.
Processed 120/212 batches.
Processed 125/212 batches.
Processed 130/212 batches.
Processed 135/212 batches.
Processed 140/212 batches.
Processed 145/212 batches.
Processed 150/212 batches.
Processed 155/212 batches.
Processed 160/212 batches.
Processed 165/212 batches.
Processed 170/212 batches.
Processed 175/212 batches.
Processed 180/212 batches.
Processed 185/212 batches.
Processed 190/212 bat

INFO:utils.ml:Feature extraction completed. Extracted 5082 features.


Processed 212/212 batches.


In [34]:
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier

# Tree-shape / sampling presets reused across the candidate models.
xgb_presets = {
    "depth7_mcw3": dict(max_depth=7, min_child_weight=3, subsample=0.7, colsample_bytree=0.7),
    "depth5_mcw5_gamma": dict(max_depth=5, min_child_weight=5, gamma=0.2, subsample=0.8, colsample_bytree=0.8),
    "depth5_mcw1_reg": dict(max_depth=5, min_child_weight=1, subsample=0.8, colsample_bytree=0.8, reg_alpha=0.1, reg_lambda=0.1),
    "depth7_mcw1": dict(max_depth=7, min_child_weight=1, subsample=0.8, colsample_bytree=0.8),
    "depth9_mcw1": dict(max_depth=9, min_child_weight=1, subsample=0.8, colsample_bytree=0.8),
}

# (n_estimators, preset) for xgb0 .. xgb16, in that order.
# Rows tagged THIS carry the original author's marker.
# NOTE(review): entries 13-16 repeat entries 9-12 with identical
# hyper-parameters, so those four fits are redundant.
xgb_grid = [
    (400, "depth7_mcw3"),         # xgb0
    (400, "depth5_mcw5_gamma"),   # xgb1
    (400, "depth5_mcw1_reg"),     # xgb2
    (400, "depth7_mcw1"),         # xgb3  THIS!!
    (500, "depth7_mcw1"),         # xgb4
    (750, "depth7_mcw3"),         # xgb5
    (750, "depth5_mcw5_gamma"),   # xgb6  THIS
    (750, "depth5_mcw1_reg"),     # xgb7  THIS
    (1000, "depth7_mcw3"),        # xgb8
    (1000, "depth5_mcw5_gamma"),  # xgb9
    (1000, "depth5_mcw1_reg"),    # xgb10
    (1000, "depth7_mcw1"),        # xgb11
    (1000, "depth9_mcw1"),        # xgb12
    (1000, "depth5_mcw5_gamma"),  # xgb13 (same as xgb9)
    (1000, "depth5_mcw1_reg"),    # xgb14 (same as xgb10)
    (1000, "depth7_mcw1"),        # xgb15 (same as xgb11)
    (1000, "depth9_mcw1"),        # xgb16 (same as xgb12)
]

# Every candidate uses learning_rate=0.1; only tree count and preset vary.
xgb_models = [
    XGBClassifier(learning_rate=0.1, n_estimators=n_trees, **xgb_presets[preset_name])
    for n_trees, preset_name in xgb_grid
]

# Keep the individual names available for any cell that refers to them.
(xgb0, xgb1, xgb2, xgb3, xgb4, xgb5, xgb6, xgb7, xgb8,
 xgb9, xgb10, xgb11, xgb12, xgb13, xgb14, xgb15, xgb16) = xgb_models


# Hand the candidates to the pipeline and clear any previously fitted models.
pipeline.classifiers = list(xgb_models)
pipeline.fitted_classifiers = {}

___
# Balance the data

In [35]:
# Backup: store copies of the extracted feature matrix and labels on the
# pipeline object before the resampling cells overwrite
# pipeline.feature_matrix and pipeline.labels.
pipeline.backup_feature_matrix = pipeline.feature_matrix.copy()
pipeline.backup_labels = pipeline.labels.copy()

In [36]:
from imblearn.over_sampling import SMOTE

# Oversample with SMOTE using a fixed seed.
smote = SMOTE(random_state=42)

# Resample from the untouched backup rather than from pipeline.feature_matrix:
# the result is assigned back to pipeline.feature_matrix / pipeline.labels, so
# reading from those would make a re-run of this cell resample data that has
# already been resampled. np.nan_to_num replaces NaN entries with 0 before the
# matrix is handed to the sampler.
pipeline.feature_matrix, pipeline.labels = smote.fit_resample(
    np.nan_to_num(pipeline.backup_feature_matrix), pipeline.backup_labels)

In [37]:
# Shapes before vs. after resampling, one per line:
# backup features, resampled features, backup labels, resampled labels.
print(
    *(
        array.shape
        for array in (
            pipeline.backup_feature_matrix,
            pipeline.feature_matrix,
            pipeline.backup_labels,
            pipeline.labels,
        )
    ),
    sep="\n",
)

(5082, 519)
(8139, 519)
(5082,)
(8139,)


In [38]:
# Clear the fitted-classifier mapping, then fit every classifier attached to
# the pipeline.
# NOTE(review): presumably fit_classifiers() trains on the current
# pipeline.feature_matrix / pipeline.labels (the SMOTE-resampled data at this
# point) — confirm in utils.ml.
pipeline.fitted_classifiers = {}
pipeline.fit_classifiers()

INFO:utils.ml:Fitting classifiers...
INFO:utils.ml:Fitting classifier: XGBClassifier0
INFO:utils.ml:Top 10 features for XGBClassifier0: [('color_moments_rgb_B_iqr', 0.026797362), ('fft_radial_variance_83', 0.016851636), ('color_moments_lab_L_iqr', 0.0153307505), ('fft_radial_variance_107', 0.013709151), ('color_moments_lab_B_var', 0.009709959), ('lbp_rad1_bins8_4', 0.007945936), ('fft_radial_mean_102', 0.006948025), ('lbp_rad4_bins32_20', 0.0062142746), ('color_moments_hsv_H_median', 0.00620055), ('fft_radial_variance_67', 0.0061397473)]
INFO:utils.ml:Fitted classifier: XGBClassifier0; Done in 24.354575395584106 seconds
INFO:utils.ml:Fitting classifier: XGBClassifier1
INFO:utils.ml:Top 10 features for XGBClassifier1: [('color_moments_rgb_B_iqr', 0.028739294), ('color_moments_lab_L_iqr', 0.018154874), ('color_moments_lab_B_var', 0.011482409), ('lbp_rad4_bins32_20', 0.008865193), ('lbp_rad1_bins8_4', 0.008857697), ('lbp_rad4_bins32_6', 0.008291549), ('color_moments_hsv_H_median', 0.00816

In [39]:
# Predict on the validation folder with the fitted classifiers. The returned
# mapping (a "GT" entry plus one entry per classifier) is displayed as the
# cell output.
# NOTE(review): `percent` is presumably the share of the dataset to load — confirm.
pipeline.predict_with_classifiers(VAL_PATH, percent)

INFO:utils.ml:Predicting with classifiers on dataset: C:\Users\gimes\Src\repos\CADx-Project\dataset\multiclass\val


Processed 5/53 batches.
Processed 10/53 batches.
Processed 15/53 batches.
Processed 20/53 batches.
Processed 25/53 batches.
Processed 30/53 batches.
Processed 35/53 batches.
Processed 40/53 batches.
Processed 45/53 batches.
Processed 50/53 batches.


INFO:utils.ml:Predictions made with classifier: XGBClassifier0
INFO:utils.ml:Predictions made with classifier: XGBClassifier1
INFO:utils.ml:Predictions made with classifier: XGBClassifier2
INFO:utils.ml:Predictions made with classifier: XGBClassifier3
INFO:utils.ml:Predictions made with classifier: XGBClassifier4
INFO:utils.ml:Predictions made with classifier: XGBClassifier5
INFO:utils.ml:Predictions made with classifier: XGBClassifier6
INFO:utils.ml:Predictions made with classifier: XGBClassifier7
INFO:utils.ml:Predictions made with classifier: XGBClassifier8
INFO:utils.ml:Predictions made with classifier: XGBClassifier9
INFO:utils.ml:Predictions made with classifier: XGBClassifier10
INFO:utils.ml:Predictions made with classifier: XGBClassifier11
INFO:utils.ml:Predictions made with classifier: XGBClassifier12


Processed 53/53 batches.


INFO:utils.ml:Predictions made with classifier: XGBClassifier13
INFO:utils.ml:Predictions made with classifier: XGBClassifier14
INFO:utils.ml:Predictions made with classifier: XGBClassifier15
INFO:utils.ml:Predictions made with classifier: XGBClassifier16


{'GT': array([0, 0, 0, ..., 2, 2, 2]),
 'XGBClassifier0': array([0, 0, 2, ..., 0, 2, 0], dtype=int64),
 'XGBClassifier1': array([0, 0, 2, ..., 0, 2, 0], dtype=int64),
 'XGBClassifier2': array([0, 0, 2, ..., 0, 2, 0], dtype=int64),
 'XGBClassifier3': array([0, 0, 2, ..., 1, 2, 2], dtype=int64),
 'XGBClassifier4': array([0, 0, 2, ..., 1, 2, 2], dtype=int64),
 'XGBClassifier5': array([0, 0, 2, ..., 0, 2, 0], dtype=int64),
 'XGBClassifier6': array([0, 0, 2, ..., 0, 2, 0], dtype=int64),
 'XGBClassifier7': array([0, 0, 2, ..., 0, 2, 0], dtype=int64),
 'XGBClassifier8': array([0, 0, 2, ..., 0, 2, 0], dtype=int64),
 'XGBClassifier9': array([0, 0, 2, ..., 0, 2, 0], dtype=int64),
 'XGBClassifier10': array([0, 0, 2, ..., 0, 2, 0], dtype=int64),
 'XGBClassifier11': array([0, 0, 2, ..., 0, 2, 2], dtype=int64),
 'XGBClassifier12': array([0, 0, 2, ..., 1, 2, 0], dtype=int64),
 'XGBClassifier13': array([0, 0, 2, ..., 0, 2, 0], dtype=int64),
 'XGBClassifier14': array([0, 0, 2, ..., 0, 2, 0], dtype=int6

In [40]:
import pandas as pd

# Snapshot the SMOTE run (shallow copies) so later experiments that reuse the
# same pipeline object do not replace these references.
smote_predictions = pipeline.predictions.copy()
smote_classifiers = pipeline.fitted_classifiers.copy()

# Metrics table, transposed so each row is a classifier and each column a metric.
df_smote = pd.DataFrame(data=pipeline.calculate_metrics()).T
df_smote

INFO:utils.ml:Metrics for classifier GT: {'accuracy': 1.0, 'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'kappa': 1.0}
INFO:utils.ml:Metrics for classifier XGBClassifier0: {'accuracy': 0.8740157480314961, 'precision': 0.782604199249378, 'recall': 0.7750351181919669, 'f1': 0.7786806229252421, 'kappa': 0.7726143530475496}
INFO:utils.ml:Metrics for classifier XGBClassifier1: {'accuracy': 0.8748031496062992, 'precision': 0.7767144541702434, 'recall': 0.7735385148199653, 'f1': 0.775043103827698, 'kappa': 0.7747382934861182}
INFO:utils.ml:Metrics for classifier XGBClassifier2: {'accuracy': 0.8771653543307086, 'precision': 0.7910044574865783, 'recall': 0.7805892496112611, 'f1': 0.7854970094681247, 'kappa': 0.7783218160335136}
INFO:utils.ml:Metrics for classifier XGBClassifier3: {'accuracy': 0.8818897637795275, 'precision': 0.7927998711117145, 'recall': 0.7900034204476224, 'f1': 0.7913768865480963, 'kappa': 0.7873285790199922}
INFO:utils.ml:Metrics for classifier XGBClassifier4: {'accuracy': 0.8

Unnamed: 0,accuracy,precision,recall,f1,kappa
GT,1.0,1.0,1.0,1.0,1.0
XGBClassifier0,0.874016,0.782604,0.775035,0.778681,0.772614
XGBClassifier1,0.874803,0.776714,0.773539,0.775043,0.774738
XGBClassifier2,0.877165,0.791004,0.780589,0.785497,0.778322
XGBClassifier3,0.88189,0.7928,0.790003,0.791377,0.787329
XGBClassifier4,0.882677,0.796576,0.790673,0.793526,0.788556
XGBClassifier5,0.874016,0.786193,0.769104,0.776935,0.771858
XGBClassifier6,0.872441,0.774868,0.771886,0.773276,0.77058
XGBClassifier7,0.87874,0.790615,0.78175,0.785969,0.781219
XGBClassifier8,0.877165,0.79026,0.771781,0.78015,0.777595


In [41]:
# Select the classifier with the highest kappa in the SMOTE metrics table.
# The "GT" reference row (kappa 1.0) is removed by label before searching, so
# it can never be picked and the lookup no longer relies on it being row 0.
smote_candidates = df_smote.drop(index="GT", errors="ignore")
highest = smote_candidates[smote_candidates["kappa"] == smote_candidates["kappa"].max()]
idx = highest.index[0]  # first row wins on ties

# Use the snapshot taken for the SMOTE run instead of pipeline.predictions,
# which later experiments overwrite; this keeps the cell valid when re-run.
# Rows of the matrix are ground-truth classes, columns are predicted classes.
confusion_matrix(smote_predictions["GT"], smote_predictions[idx])

array([[436,  36,  26],
       [ 35, 633,  10],
       [ 30,  11,  53]], dtype=int64)

___
# Borderline SMOTE

In [42]:
from imblearn.over_sampling import BorderlineSMOTE
# Borderline SMOTE with a fixed seed. The sampler always starts from the
# backup, so the previous resampling stored on the pipeline is not an input.
smote = BorderlineSMOTE(random_state=42, sampling_strategy='auto')

pipeline.feature_matrix, pipeline.labels = smote.fit_resample(
    np.nan_to_num(pipeline.backup_feature_matrix),
    pipeline.backup_labels,
)

# Shapes before vs. after resampling, one per line:
# backup features, resampled features, backup labels, resampled labels.
print(
    *(
        array.shape
        for array in (
            pipeline.backup_feature_matrix,
            pipeline.feature_matrix,
            pipeline.backup_labels,
            pipeline.labels,
        )
    ),
    sep="\n",
)

(5082, 519)
(8139, 519)
(5082,)
(8139,)


In [43]:
# Clear the models fitted in the previous experiment, then fit every
# classifier attached to the pipeline again.
# NOTE(review): presumably fit_classifiers() trains on the current
# pipeline.feature_matrix / pipeline.labels (the Borderline-SMOTE data at this
# point) — confirm in utils.ml.
pipeline.fitted_classifiers = {}

pipeline.fit_classifiers()

INFO:utils.ml:Fitting classifiers...
INFO:utils.ml:Fitting classifier: XGBClassifier0
INFO:utils.ml:Top 10 features for XGBClassifier0: [('color_moments_rgb_B_iqr', 0.028127212), ('fft_radial_mean_113', 0.017429804), ('color_moments_lab_L_iqr', 0.016140258), ('color_moments_lab_B_var', 0.0091401655), ('lbp_rad1_bins8_4', 0.007685275), ('color_moments_rgb_G_entropy', 0.00721154), ('fft_radial_variance_28', 0.0070289066), ('fft_radial_variance_19', 0.006940012), ('color_moments_lab_A_iqr', 0.00673417), ('fft_radial_variance_10', 0.006557303)]
INFO:utils.ml:Fitted classifier: XGBClassifier0; Done in 42.7605619430542 seconds
INFO:utils.ml:Fitting classifier: XGBClassifier1
INFO:utils.ml:Top 10 features for XGBClassifier1: [('color_moments_rgb_B_iqr', 0.02953613), ('fft_radial_variance_34', 0.02824795), ('fft_radial_mean_134', 0.013825244), ('color_moments_lab_L_iqr', 0.013313653), ('fft_radial_variance_89', 0.011321802), ('color_moments_lab_B_var', 0.010541675), ('lbp_rad4_bins32_8', 0.008

In [44]:
# Predict on the validation folder with the newly fitted classifiers. The
# returned mapping (a "GT" entry plus one entry per classifier) is displayed
# as the cell output.
# NOTE(review): `percent` is presumably the share of the dataset to load — confirm.
pipeline.predict_with_classifiers(VAL_PATH, percent)

INFO:utils.ml:Predicting with classifiers on dataset: C:\Users\gimes\Src\repos\CADx-Project\dataset\multiclass\val


Processed 5/53 batches.
Processed 10/53 batches.
Processed 15/53 batches.
Processed 20/53 batches.
Processed 25/53 batches.
Processed 30/53 batches.
Processed 35/53 batches.
Processed 40/53 batches.
Processed 45/53 batches.
Processed 50/53 batches.


INFO:utils.ml:Predictions made with classifier: XGBClassifier0
INFO:utils.ml:Predictions made with classifier: XGBClassifier1
INFO:utils.ml:Predictions made with classifier: XGBClassifier2
INFO:utils.ml:Predictions made with classifier: XGBClassifier3
INFO:utils.ml:Predictions made with classifier: XGBClassifier4
INFO:utils.ml:Predictions made with classifier: XGBClassifier5
INFO:utils.ml:Predictions made with classifier: XGBClassifier6
INFO:utils.ml:Predictions made with classifier: XGBClassifier7
INFO:utils.ml:Predictions made with classifier: XGBClassifier8
INFO:utils.ml:Predictions made with classifier: XGBClassifier9
INFO:utils.ml:Predictions made with classifier: XGBClassifier10
INFO:utils.ml:Predictions made with classifier: XGBClassifier11


Processed 53/53 batches.


INFO:utils.ml:Predictions made with classifier: XGBClassifier12
INFO:utils.ml:Predictions made with classifier: XGBClassifier13
INFO:utils.ml:Predictions made with classifier: XGBClassifier14
INFO:utils.ml:Predictions made with classifier: XGBClassifier15
INFO:utils.ml:Predictions made with classifier: XGBClassifier16


{'GT': array([0, 0, 0, ..., 2, 2, 2]),
 'XGBClassifier0': array([0, 0, 0, ..., 0, 2, 0], dtype=int64),
 'XGBClassifier1': array([0, 0, 2, ..., 0, 2, 2], dtype=int64),
 'XGBClassifier2': array([0, 0, 2, ..., 0, 2, 0], dtype=int64),
 'XGBClassifier3': array([0, 0, 2, ..., 1, 2, 2], dtype=int64),
 'XGBClassifier4': array([0, 0, 2, ..., 1, 2, 0], dtype=int64),
 'XGBClassifier5': array([0, 0, 0, ..., 0, 2, 0], dtype=int64),
 'XGBClassifier6': array([0, 0, 2, ..., 0, 2, 2], dtype=int64),
 'XGBClassifier7': array([0, 0, 0, ..., 0, 2, 0], dtype=int64),
 'XGBClassifier8': array([0, 0, 0, ..., 0, 2, 0], dtype=int64),
 'XGBClassifier9': array([0, 0, 2, ..., 0, 2, 2], dtype=int64),
 'XGBClassifier10': array([0, 0, 0, ..., 0, 2, 0], dtype=int64),
 'XGBClassifier11': array([0, 0, 0, ..., 1, 2, 0], dtype=int64),
 'XGBClassifier12': array([0, 0, 2, ..., 1, 2, 0], dtype=int64),
 'XGBClassifier13': array([0, 0, 2, ..., 0, 2, 2], dtype=int64),
 'XGBClassifier14': array([0, 0, 0, ..., 0, 2, 0], dtype=int6

In [45]:
# Snapshot the Borderline SMOTE run (shallow copies) so it stays available
# after the pipeline object is reused.
borderline_smote_predictions = pipeline.predictions.copy()
borderline_smote_classifiers = pipeline.fitted_classifiers.copy()

# Metrics table for the explicitly listed metrics, transposed so each row is a
# classifier and each column a metric.
df_borderline_smote = pd.DataFrame(
    data=pipeline.calculate_metrics(["accuracy", "precision", "recall", "f1", "kappa"])
).T
df_borderline_smote

INFO:utils.ml:Metrics for classifier GT: {'accuracy': 1.0, 'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'kappa': 1.0}
INFO:utils.ml:Metrics for classifier XGBClassifier0: {'accuracy': 0.8724409448818897, 'precision': 0.7903603005553631, 'recall': 0.7738741320645666, 'f1': 0.7815106787484595, 'kappa': 0.7690172849707426}
INFO:utils.ml:Metrics for classifier XGBClassifier1: {'accuracy': 0.8811023622047244, 'precision': 0.7984370342740252, 'recall': 0.7963315006085977, 'f1': 0.797321788084972, 'kappa': 0.7862114942118688}
INFO:utils.ml:Metrics for classifier XGBClassifier2: {'accuracy': 0.8724409448818897, 'precision': 0.7826752521876253, 'recall': 0.7745849398976689, 'f1': 0.7783923211474395, 'kappa': 0.7700848855453192}
INFO:utils.ml:Metrics for classifier XGBClassifier3: {'accuracy': 0.8811023622047244, 'precision': 0.8013503981478377, 'recall': 0.7871681289899447, 'f1': 0.7937394440861342, 'kappa': 0.785232719616539}
INFO:utils.ml:Metrics for classifier XGBClassifier4: {'accuracy': 0.8

Unnamed: 0,accuracy,precision,recall,f1,kappa
GT,1.0,1.0,1.0,1.0,1.0
XGBClassifier0,0.872441,0.79036,0.773874,0.781511,0.769017
XGBClassifier1,0.881102,0.798437,0.796332,0.797322,0.786211
XGBClassifier2,0.872441,0.782675,0.774585,0.778392,0.770085
XGBClassifier3,0.881102,0.80135,0.787168,0.793739,0.785233
XGBClassifier4,0.880315,0.800318,0.789376,0.794536,0.784006
XGBClassifier5,0.874016,0.794666,0.778267,0.785851,0.772053
XGBClassifier6,0.88189,0.800661,0.796823,0.798653,0.787489
XGBClassifier7,0.879528,0.794533,0.782775,0.788236,0.782618
XGBClassifier8,0.873228,0.794696,0.774899,0.783897,0.770374


In [21]:
# Select the classifier with the highest kappa in the Borderline SMOTE metrics
# table. The "GT" reference row (kappa 1.0) is removed by label before
# searching, so it can never be picked and the lookup no longer relies on it
# being row 0.
borderline_candidates = df_borderline_smote.drop(index="GT", errors="ignore")
highest = borderline_candidates[
    borderline_candidates["kappa"] == borderline_candidates["kappa"].max()
]
idx = highest.index[0]  # first row wins on ties

# Use the snapshot taken for the Borderline SMOTE run instead of
# pipeline.predictions, which any later experiment would overwrite.
# Rows of the matrix are ground-truth classes, columns are predicted classes.
confusion_matrix(borderline_smote_predictions["GT"], borderline_smote_predictions[idx])

array([[440,  33,  25],
       [ 41, 626,  11],
       [ 26,  13,  55]], dtype=int64)