In [32]:
import numpy as np
from sklearn.metrics import confusion_matrix, recall_score, cohen_kappa_score

from utils.loader import FactoryLoader
from utils.ml import MLPipeline
from utils.preprocessing import PreprocessingFactory
from utils.feature_extraction import *
from utils.utils import *

import os

# Root folder of the multiclass dataset. It can be overridden with the
# CADX_DATASET_ROOT environment variable so the notebook is not tied to one
# machine; the default keeps the location that was originally hard-coded.
DATASET_ROOT = os.environ.get(
    "CADX_DATASET_ROOT",
    r"C:\Users\gimes\Src\repos\CADx-Project\dataset\multiclass",
)

# Plain strings, exactly as before, so FactoryLoader / MLPipeline receive the same type.
VAL_PATH = os.path.join(DATASET_ROOT, "val")
TRAIN_PATH = os.path.join(DATASET_ROOT, "train")

In [33]:
# Run-wide settings; `percent` is reused by the prediction cells further down.
percent = 100
random = False
batch_size = 24
th = 0.1

# Configure the preprocessing factory (pad2square -> resize -> normalize2float).
factory = PreprocessingFactory()
factory.pad2square(fill=np.nan)
factory.resize((200,200))
factory.normalize2float()

# Build the list of feature extractors in the order they are registered:
# gradient, colour moments per colour space, LBP at five radii, the two
# frequency-domain extractors and finally GLCM.
extractors = [GradientExtractor(threshold=th)]
extractors += [ColorMomentsExtractor(space, threshold=th) for space in ("rgb", "lab", "hsv")]
# LBP uses 8 sampling points per unit of radius: (1, 8), (2, 16), ..., (5, 40).
extractors += [LBPExtractor(radius=r, n_points=8 * r) for r in range(1, 6)]
extractors += [
    FourierTransformExtractor(),
    FFTExtractor(),
    GLCMExtractor(properties=['contrast', 'dissimilarity', 'homogeneity', 'energy', 'correlation', 'ASM']),
]

# Register the extractors on the feature extraction strategy.
strategy = FeatureExtractionStrategy()
for extractor in extractors:
    strategy.add_extractor(extractor)

# The pipeline starts without classifiers; they are attached in a later cell.
pipeline = MLPipeline(
    dataset_path=TRAIN_PATH,
    preprocessing_factory=factory,
    feature_strategy=strategy,
    classifiers=[],
    percentage=percent,
    verbose=True,
    shuffle=random,
    batch_size=batch_size,
)

INFO:utils.ml:MLPipeline initialized with dataset path: C:\Users\gimes\Src\repos\CADx-Project\dataset\multiclass\train
INFO:utils.ml:Preprocessing steps


In [34]:
# Clear any previously stored feature matrix, then run feature extraction for
# the dataset the pipeline was built with (TRAIN_PATH).
pipeline.feature_matrix = None
pipeline.run_feature_extraction()

INFO:utils.ml:Running feature extraction...


Processed 5/212 batches.
Processed 10/212 batches.
Processed 15/212 batches.
Processed 20/212 batches.
Processed 25/212 batches.
Processed 30/212 batches.
Processed 35/212 batches.
Processed 40/212 batches.
Processed 45/212 batches.
Processed 50/212 batches.
Processed 55/212 batches.
Processed 60/212 batches.
Processed 65/212 batches.
Processed 70/212 batches.
Processed 75/212 batches.
Processed 80/212 batches.
Processed 85/212 batches.
Processed 90/212 batches.
Processed 95/212 batches.
Processed 100/212 batches.
Processed 105/212 batches.
Processed 110/212 batches.
Processed 115/212 batches.
Processed 120/212 batches.
Processed 125/212 batches.
Processed 130/212 batches.
Processed 135/212 batches.
Processed 140/212 batches.
Processed 145/212 batches.
Processed 150/212 batches.
Processed 155/212 batches.
Processed 160/212 batches.
Processed 165/212 batches.
Processed 170/212 batches.
Processed 175/212 batches.
Processed 180/212 batches.
Processed 185/212 batches.
Processed 190/212 bat

INFO:utils.ml:Feature extraction completed. Extracted 5082 features.


Processed 212/212 batches.


In [35]:
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier

# Random forests: rf0/rf2/rf3 grow every tree on the full sample (bootstrap
# disabled) with 100/150/200 trees; rf1 keeps bootstrapping and also computes
# the out-of-bag score.
rf0 = RandomForestClassifier(n_estimators=100, bootstrap=False, random_state=42)
rf1 = RandomForestClassifier(n_estimators=150, oob_score=True, random_state=42)
rf2 = RandomForestClassifier(n_estimators=150, bootstrap=False, random_state=42)
rf3 = RandomForestClassifier(n_estimators=200, bootstrap=False, random_state=42)

# The XGBoost candidates combine four hyper-parameter profiles with different
# numbers of boosting rounds; the learning rate is 0.1 for all of them.
_XGB_DEEP_MCW3 = dict(max_depth=7, min_child_weight=3, subsample=0.7, colsample_bytree=0.7)
_XGB_GAMMA = dict(max_depth=5, min_child_weight=5, gamma=0.2, subsample=0.8, colsample_bytree=0.8)
_XGB_REGULARIZED = dict(max_depth=5, min_child_weight=1, subsample=0.8, colsample_bytree=0.8,
                        reg_alpha=0.1, reg_lambda=0.1)
_XGB_DEEP_MCW1 = dict(max_depth=7, min_child_weight=1, subsample=0.8, colsample_bytree=0.8)


def _make_xgb(n_estimators, profile):
    """Return an XGBClassifier with learning_rate=0.1, the given rounds and profile."""
    return XGBClassifier(learning_rate=0.1, n_estimators=n_estimators, **profile)


# 400 rounds (defined but not part of the list evaluated below)
xgb4 = _make_xgb(400, _XGB_DEEP_MCW3)  # 2ND
xgb5 = _make_xgb(400, _XGB_GAMMA)  # THIS
xgb6 = _make_xgb(400, _XGB_REGULARIZED)
xgb7 = _make_xgb(400, _XGB_DEEP_MCW1)

# 500 rounds
xgb8 = _make_xgb(500, _XGB_DEEP_MCW1)

# 750 rounds
xgb9 = _make_xgb(750, _XGB_DEEP_MCW3)
xgb10 = _make_xgb(750, _XGB_GAMMA)  # THIS
xgb11 = _make_xgb(750, _XGB_REGULARIZED)
xgb12 = _make_xgb(750, _XGB_DEEP_MCW1)

# 1000 rounds
xgb13 = _make_xgb(1000, _XGB_DEEP_MCW3)
xgb14 = _make_xgb(1000, _XGB_GAMMA)  # THIS
xgb15 = _make_xgb(1000, _XGB_REGULARIZED)
xgb16 = _make_xgb(1000, _XGB_DEEP_MCW1)

# NOTE: the pipeline logs classifiers as "<ClassName><position in this list>",
# so "XGBClassifier4" in the outputs is `xgb8`, "XGBClassifier5" is `xgb9`, and
# so on up to "XGBClassifier12" for `xgb16`.
pipeline.classifiers = [
    rf0, rf1, rf2, rf3,
    # svm1, svm2, svm3, svm6, svm7, svm8,
    xgb8, xgb9, xgb10, xgb11, xgb12, xgb13, xgb14, xgb15, xgb16,
]
# Forget anything fitted earlier so the next fit starts clean.
pipeline.fitted_classifiers = {}

In [36]:
# Fit every classifier attached to the pipeline; per-classifier timing and the
# top-10 feature importances are written to the log.
pipeline.fit_classifiers()

INFO:utils.ml:Fitting classifiers...
INFO:utils.ml:Fitting classifier: RandomForestClassifier0
INFO:utils.ml:Top 10 features for RandomForestClassifier0: [('color_moments_lab_B_var', 0.03678171562927555), ('color_moments_lab_B_std', 0.033195052551086665), ('color_moments_rgb_B_iqr', 0.02367868802470365), ('color_moments_rgb_B_var', 0.018855286614254636), ('color_moments_rgb_R_iqr', 0.018038690955580746), ('color_moments_rgb_B_std', 0.01515474177964628), ('color_moments_rgb_R_skew', 0.014060165472765212), ('color_moments_lab_L_iqr', 0.01315417280578557), ('color_moments_lab_B_iqr', 0.013137011317174953), ('color_moments_rgb_G_iqr', 0.012804093587298669)]
INFO:utils.ml:Fitted classifier: RandomForestClassifier0; Done in 5.984819412231445 seconds
INFO:utils.ml:Fitting classifier: RandomForestClassifier1
INFO:utils.ml:Top 10 features for RandomForestClassifier1: [('color_moments_lab_B_var', 0.036313918522740414), ('color_moments_lab_B_std', 0.0358567546759903), ('color_moments_rgb_R_iqr', 

In [37]:
# Predict on the validation set with every fitted classifier. The returned dict
# (ground truth under "GT" plus one array per classifier) is shown as the cell
# output; `percent` is presumably the share of the dataset to load (TODO confirm).
pipeline.predict_with_classifiers(VAL_PATH, percent)

INFO:utils.ml:Predicting with classifiers on dataset: C:\Users\gimes\Src\repos\CADx-Project\dataset\multiclass\val


Processed 5/53 batches.
Processed 10/53 batches.
Processed 15/53 batches.
Processed 20/53 batches.
Processed 25/53 batches.
Processed 30/53 batches.
Processed 35/53 batches.
Processed 40/53 batches.
Processed 45/53 batches.
Processed 50/53 batches.


INFO:utils.ml:Predictions made with classifier: RandomForestClassifier0
INFO:utils.ml:Predictions made with classifier: RandomForestClassifier1
INFO:utils.ml:Predictions made with classifier: RandomForestClassifier2
INFO:utils.ml:Predictions made with classifier: RandomForestClassifier3
INFO:utils.ml:Predictions made with classifier: XGBClassifier4
INFO:utils.ml:Predictions made with classifier: XGBClassifier5
INFO:utils.ml:Predictions made with classifier: XGBClassifier6
INFO:utils.ml:Predictions made with classifier: XGBClassifier7
INFO:utils.ml:Predictions made with classifier: XGBClassifier8
INFO:utils.ml:Predictions made with classifier: XGBClassifier9
INFO:utils.ml:Predictions made with classifier: XGBClassifier10
INFO:utils.ml:Predictions made with classifier: XGBClassifier11


Processed 53/53 batches.


INFO:utils.ml:Predictions made with classifier: XGBClassifier12


{'GT': array([0, 0, 0, ..., 2, 2, 2]),
 'RandomForestClassifier0': array([0, 0, 1, ..., 1, 0, 0]),
 'RandomForestClassifier1': array([0, 0, 0, ..., 1, 0, 0]),
 'RandomForestClassifier2': array([0, 0, 0, ..., 1, 0, 0]),
 'RandomForestClassifier3': array([0, 0, 0, ..., 1, 0, 0]),
 'XGBClassifier4': array([0, 0, 0, ..., 0, 2, 0], dtype=int64),
 'XGBClassifier5': array([0, 0, 2, ..., 0, 2, 0], dtype=int64),
 'XGBClassifier6': array([0, 0, 2, ..., 0, 2, 0], dtype=int64),
 'XGBClassifier7': array([0, 0, 0, ..., 0, 2, 0], dtype=int64),
 'XGBClassifier8': array([0, 0, 0, ..., 0, 2, 0], dtype=int64),
 'XGBClassifier9': array([0, 0, 2, ..., 0, 2, 0], dtype=int64),
 'XGBClassifier10': array([0, 0, 2, ..., 0, 2, 0], dtype=int64),
 'XGBClassifier11': array([0, 0, 0, ..., 0, 2, 0], dtype=int64),
 'XGBClassifier12': array([0, 0, 0, ..., 0, 2, 0], dtype=int64)}

In [38]:
import pandas as pd

# Baseline metrics (original, imbalanced training set). Transposed so that each
# row is one entry of the predictions ("GT" and every classifier) and each
# column is a metric.
df = pd.DataFrame(pipeline.calculate_metrics()).transpose()
df

INFO:utils.ml:Metrics for classifier GT: {'accuracy': 1.0, 'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'kappa': 1.0}
INFO:utils.ml:Metrics for classifier RandomForestClassifier0: {'accuracy': 0.8433070866141732, 'precision': 0.873349477285119, 'recall': 0.6586515521698921, 'f1': 0.6816697547191483, 'kappa': 0.7066041634741744}
INFO:utils.ml:Metrics for classifier RandomForestClassifier1: {'accuracy': 0.8244094488188977, 'precision': 0.878517161954888, 'recall': 0.617407305743403, 'f1': 0.6228420740140242, 'kappa': 0.6686966856800602}
INFO:utils.ml:Metrics for classifier RandomForestClassifier2: {'accuracy': 0.8433070866141732, 'precision': 0.8714071232726225, 'recall': 0.6521872338409056, 'f1': 0.6719846647800538, 'kappa': 0.7059596885660169}
INFO:utils.ml:Metrics for classifier RandomForestClassifier3: {'accuracy': 0.84251968503937, 'precision': 0.8709376397045473, 'recall': 0.6516955917563431, 'f1': 0.6715043573833199, 'kappa': 0.7044820990613234}
INFO:utils.ml:Metrics for classifier

Unnamed: 0,accuracy,precision,recall,f1,kappa
GT,1.0,1.0,1.0,1.0,1.0
RandomForestClassifier0,0.843307,0.873349,0.658652,0.68167,0.706604
RandomForestClassifier1,0.824409,0.878517,0.617407,0.622842,0.668697
RandomForestClassifier2,0.843307,0.871407,0.652187,0.671985,0.70596
RandomForestClassifier3,0.84252,0.870938,0.651696,0.671504,0.704482
XGBClassifier4,0.881102,0.852539,0.716171,0.745599,0.779306
XGBClassifier5,0.868504,0.834671,0.703474,0.731729,0.755769
XGBClassifier6,0.870079,0.819762,0.707334,0.734269,0.758893
XGBClassifier7,0.880315,0.844942,0.716568,0.744391,0.778498
XGBClassifier8,0.877953,0.8434,0.713672,0.742406,0.773518


In [39]:
# Select the classifier(s) with the highest kappa. The "GT" row is removed by
# label before searching, so a classifier that reaches kappa 1.0 can no longer
# make the ground-truth row itself be picked as the "best classifier".
clf_kappa = df["kappa"].drop(index="GT", errors="ignore")
highest = df.loc[clf_kappa.index[clf_kappa == clf_kappa.max()]]
idx = highest.index[0]  # first one in table order when several classifiers tie

# Rows are the true classes, columns the classes predicted by the best classifier.
confusion_matrix(pipeline.predictions["GT"], pipeline.predictions[idx])

array([[452,  40,   6],
       [ 39, 637,   2],
       [ 44,  20,  30]], dtype=int64)

___
# Balance the data

In [40]:
# Backup
# Keep copies of the extracted training features and labels on the pipeline
# object, so the resampling experiments below can start from the original data.
pipeline.backup_feature_matrix = pipeline.feature_matrix.copy()
pipeline.backup_labels = pipeline.labels.copy()

In [41]:
# Sanity check: each backup must have the same shape as the array it was copied from.
for attr in ("backup_feature_matrix", "feature_matrix", "backup_labels", "labels"):
    print(getattr(pipeline, attr).shape)

(5082, 190)
(5082, 190)
(5082,)
(5082,)


In [42]:
# List the names of the extracted feature columns (shown as the cell output).
pipeline.get_feature_names()

['gradient_magnitude_mean',
 'gradient_magnitude_std',
 'gradient_direction_mean',
 'gradient_direction_std',
 'color_moments_rgb_R_mean',
 'color_moments_rgb_R_std',
 'color_moments_rgb_R_skew',
 'color_moments_rgb_R_kurtosis',
 'color_moments_rgb_R_median',
 'color_moments_rgb_R_var',
 'color_moments_rgb_R_min',
 'color_moments_rgb_R_max',
 'color_moments_rgb_R_iqr',
 'color_moments_rgb_R_entropy',
 'color_moments_rgb_G_mean',
 'color_moments_rgb_G_std',
 'color_moments_rgb_G_skew',
 'color_moments_rgb_G_kurtosis',
 'color_moments_rgb_G_median',
 'color_moments_rgb_G_var',
 'color_moments_rgb_G_min',
 'color_moments_rgb_G_max',
 'color_moments_rgb_G_iqr',
 'color_moments_rgb_G_entropy',
 'color_moments_rgb_B_mean',
 'color_moments_rgb_B_std',
 'color_moments_rgb_B_skew',
 'color_moments_rgb_B_kurtosis',
 'color_moments_rgb_B_median',
 'color_moments_rgb_B_var',
 'color_moments_rgb_B_min',
 'color_moments_rgb_B_max',
 'color_moments_rgb_B_iqr',
 'color_moments_rgb_B_entropy',
 'color_

In [43]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)

# Always resample from the untouched backup rather than from
# pipeline.feature_matrix: the cell then gives the same result however often it
# is re-run, instead of oversampling data that already contains synthetic
# samples. NaNs are replaced via np.nan_to_num first because SMOTE needs finite
# inputs.
pipeline.feature_matrix, pipeline.labels = smote.fit_resample(
    np.nan_to_num(pipeline.backup_feature_matrix), pipeline.backup_labels)

In [44]:
# Drop the previously fitted models and refit every classifier on the
# SMOTE-resampled training data now stored on the pipeline.
pipeline.fitted_classifiers = {}
pipeline.fit_classifiers()

INFO:utils.ml:Fitting classifiers...
INFO:utils.ml:Fitting classifier: RandomForestClassifier0
INFO:utils.ml:Top 10 features for RandomForestClassifier0: [('color_moments_lab_B_var', 0.020103475475452246), ('color_moments_lab_B_std', 0.01887654583079339), ('color_moments_rgb_B_iqr', 0.018601631705106522), ('color_moments_rgb_B_var', 0.016366039371529625), ('color_moments_rgb_G_iqr', 0.015103918777083445), ('color_moments_rgb_B_std', 0.014193034368277863), ('color_moments_rgb_R_iqr', 0.013737786888031896), ('lbp_rad1_bins8_4', 0.01313108092663636), ('color_moments_lab_L_iqr', 0.012708989211170417), ('color_moments_rgb_B_kurtosis', 0.01193028036010276)]
INFO:utils.ml:Fitted classifier: RandomForestClassifier0; Done in 11.163585424423218 seconds
INFO:utils.ml:Fitting classifier: RandomForestClassifier1
INFO:utils.ml:Top 10 features for RandomForestClassifier1: [('color_moments_lab_B_var', 0.021371863466320702), ('color_moments_lab_B_std', 0.0210420187599849), ('color_moments_rgb_B_iqr', 0

In [45]:
# Predict on the validation set with the classifiers refitted after SMOTE; the
# returned predictions dict is shown as the cell output.
pipeline.predict_with_classifiers(VAL_PATH, percent)

INFO:utils.ml:Predicting with classifiers on dataset: C:\Users\gimes\Src\repos\CADx-Project\dataset\multiclass\val


Processed 5/53 batches.
Processed 10/53 batches.
Processed 15/53 batches.
Processed 20/53 batches.
Processed 25/53 batches.
Processed 30/53 batches.
Processed 35/53 batches.
Processed 40/53 batches.
Processed 45/53 batches.
Processed 50/53 batches.


INFO:utils.ml:Predictions made with classifier: RandomForestClassifier0
INFO:utils.ml:Predictions made with classifier: RandomForestClassifier1
INFO:utils.ml:Predictions made with classifier: RandomForestClassifier2
INFO:utils.ml:Predictions made with classifier: RandomForestClassifier3
INFO:utils.ml:Predictions made with classifier: XGBClassifier4
INFO:utils.ml:Predictions made with classifier: XGBClassifier5
INFO:utils.ml:Predictions made with classifier: XGBClassifier6
INFO:utils.ml:Predictions made with classifier: XGBClassifier7
INFO:utils.ml:Predictions made with classifier: XGBClassifier8
INFO:utils.ml:Predictions made with classifier: XGBClassifier9


Processed 53/53 batches.


INFO:utils.ml:Predictions made with classifier: XGBClassifier10
INFO:utils.ml:Predictions made with classifier: XGBClassifier11
INFO:utils.ml:Predictions made with classifier: XGBClassifier12


{'GT': array([0, 0, 0, ..., 2, 2, 2]),
 'RandomForestClassifier0': array([0, 0, 2, ..., 1, 2, 0]),
 'RandomForestClassifier1': array([0, 0, 2, ..., 1, 2, 0]),
 'RandomForestClassifier2': array([0, 0, 2, ..., 1, 2, 0]),
 'RandomForestClassifier3': array([0, 0, 2, ..., 1, 2, 0]),
 'XGBClassifier4': array([0, 0, 2, ..., 0, 2, 0], dtype=int64),
 'XGBClassifier5': array([0, 0, 2, ..., 0, 2, 0], dtype=int64),
 'XGBClassifier6': array([0, 0, 2, ..., 0, 2, 0], dtype=int64),
 'XGBClassifier7': array([0, 0, 2, ..., 0, 2, 0], dtype=int64),
 'XGBClassifier8': array([0, 0, 2, ..., 0, 2, 0], dtype=int64),
 'XGBClassifier9': array([0, 0, 2, ..., 0, 2, 0], dtype=int64),
 'XGBClassifier10': array([0, 0, 2, ..., 0, 2, 0], dtype=int64),
 'XGBClassifier11': array([0, 0, 2, ..., 0, 2, 0], dtype=int64),
 'XGBClassifier12': array([0, 0, 2, ..., 0, 2, 0], dtype=int64)}

In [46]:
import pandas as pd

# Metrics after SMOTE balancing, one row per predictions entry ("GT" and every
# classifier) and one column per metric.
df_smote = pd.DataFrame(pipeline.calculate_metrics()).transpose()
df_smote

INFO:utils.ml:Metrics for classifier GT: {'accuracy': 1.0, 'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'kappa': 1.0}
INFO:utils.ml:Metrics for classifier RandomForestClassifier0: {'accuracy': 0.8448818897637795, 'precision': 0.7580491058898616, 'recall': 0.7397950554493123, 'f1': 0.7479090488553602, 'kappa': 0.7193555913510505}
INFO:utils.ml:Metrics for classifier RandomForestClassifier1: {'accuracy': 0.8362204724409449, 'precision': 0.7374628976282226, 'recall': 0.7245128130673701, 'f1': 0.7302551175950313, 'kappa': 0.7043708704138092}
INFO:utils.ml:Metrics for classifier RandomForestClassifier2: {'accuracy': 0.8401574803149606, 'precision': 0.7422943408366467, 'recall': 0.7269710234901824, 'f1': 0.7338565449725793, 'kappa': 0.7107639216108635}
INFO:utils.ml:Metrics for classifier RandomForestClassifier3: {'accuracy': 0.8440944881889764, 'precision': 0.7503685774195681, 'recall': 0.7328390950357635, 'f1': 0.7406611061791807, 'kappa': 0.717703005509877}
INFO:utils.ml:Metrics for classi

Unnamed: 0,accuracy,precision,recall,f1,kappa
GT,1.0,1.0,1.0,1.0,1.0
RandomForestClassifier0,0.844882,0.758049,0.739795,0.747909,0.719356
RandomForestClassifier1,0.83622,0.737463,0.724513,0.730255,0.704371
RandomForestClassifier2,0.840157,0.742294,0.726971,0.733857,0.710764
RandomForestClassifier3,0.844094,0.750369,0.732839,0.740661,0.717703
XGBClassifier4,0.876378,0.781042,0.770401,0.775446,0.776496
XGBClassifier5,0.872441,0.78632,0.774052,0.779811,0.769506
XGBClassifier6,0.862992,0.765783,0.751636,0.758165,0.752008
XGBClassifier7,0.875591,0.787129,0.773675,0.779924,0.775141
XGBClassifier8,0.876378,0.784422,0.7731,0.778466,0.77636


In [47]:
# Select the classifier(s) with the highest kappa after SMOTE. The "GT" row is
# removed by label before searching, so a classifier that reaches kappa 1.0 can
# no longer make the ground-truth row itself be picked as the "best classifier".
clf_kappa = df_smote["kappa"].drop(index="GT", errors="ignore")
highest = df_smote.loc[clf_kappa.index[clf_kappa == clf_kappa.max()]]
idx = highest.index[0]  # first one in table order when several classifiers tie

# Rows are the true classes, columns the classes predicted by the best classifier.
confusion_matrix(pipeline.predictions["GT"], pipeline.predictions[idx])

array([[431,  41,  26],
       [ 33, 635,  10],
       [ 32,  13,  49]], dtype=int64)

In [48]:
# Rank the SMOTE results by kappa, best first, and show the first eight names.
# The "GT" row (kappa 1.0) sorts to the top of this ranking.
df_sorted = df_smote.sort_values(by="kappa", ascending=False)
df_sorted.index[:8]

Index(['GT', 'XGBClassifier12', 'XGBClassifier4', 'XGBClassifier8',
       'XGBClassifier7', 'XGBClassifier11', 'XGBClassifier9',
       'XGBClassifier5'],
      dtype='object')

___
# Borderline SMOTE

In [49]:
from imblearn.over_sampling import BorderlineSMOTE

# Start again from the backed-up training data (NaNs replaced via
# np.nan_to_num) and balance it with Borderline-SMOTE instead of plain SMOTE.
smote = BorderlineSMOTE(sampling_strategy='auto', random_state=42)
pipeline.feature_matrix, pipeline.labels = smote.fit_resample(
    np.nan_to_num(pipeline.backup_feature_matrix),
    pipeline.backup_labels,
)

# Shapes before (backup) and after resampling, features first and then labels.
for attr in ("backup_feature_matrix", "feature_matrix", "backup_labels", "labels"):
    print(getattr(pipeline, attr).shape)

(5082, 190)
(8139, 190)
(5082,)
(8139,)


In [50]:
# Drop the previously fitted models and refit every classifier on the
# Borderline-SMOTE training data now stored on the pipeline.
pipeline.fitted_classifiers = {}

pipeline.fit_classifiers()

INFO:utils.ml:Fitting classifiers...
INFO:utils.ml:Fitting classifier: RandomForestClassifier0
INFO:utils.ml:Top 10 features for RandomForestClassifier0: [('color_moments_lab_B_var', 0.02280485651507314), ('color_moments_rgb_B_iqr', 0.01948241848058574), ('color_moments_lab_B_std', 0.018483455795756963), ('color_moments_rgb_B_var', 0.016758705757086247), ('color_moments_rgb_R_iqr', 0.013408101003226938), ('color_moments_rgb_G_iqr', 0.012327187749164787), ('color_moments_rgb_B_std', 0.011997393964135157), ('color_moments_lab_L_iqr', 0.01183032350494857), ('color_moments_rgb_B_kurtosis', 0.011212692406473275), ('lbp_rad1_bins8_4', 0.011154352660797251)]
INFO:utils.ml:Fitted classifier: RandomForestClassifier0; Done in 11.47402024269104 seconds
INFO:utils.ml:Fitting classifier: RandomForestClassifier1
INFO:utils.ml:Top 10 features for RandomForestClassifier1: [('color_moments_lab_B_std', 0.020128441003352194), ('color_moments_lab_B_var', 0.019880976228302403), ('color_moments_rgb_B_iqr', 

In [51]:
# Predict on the validation set with the classifiers refitted after
# Borderline-SMOTE; the returned predictions dict is shown as the cell output.
pipeline.predict_with_classifiers(VAL_PATH, percent)

INFO:utils.ml:Predicting with classifiers on dataset: C:\Users\gimes\Src\repos\CADx-Project\dataset\multiclass\val


Processed 5/53 batches.
Processed 10/53 batches.
Processed 15/53 batches.
Processed 20/53 batches.
Processed 25/53 batches.
Processed 30/53 batches.
Processed 35/53 batches.
Processed 40/53 batches.
Processed 45/53 batches.
Processed 50/53 batches.


INFO:utils.ml:Predictions made with classifier: RandomForestClassifier0
INFO:utils.ml:Predictions made with classifier: RandomForestClassifier1
INFO:utils.ml:Predictions made with classifier: RandomForestClassifier2
INFO:utils.ml:Predictions made with classifier: RandomForestClassifier3
INFO:utils.ml:Predictions made with classifier: XGBClassifier4
INFO:utils.ml:Predictions made with classifier: XGBClassifier5
INFO:utils.ml:Predictions made with classifier: XGBClassifier6
INFO:utils.ml:Predictions made with classifier: XGBClassifier7
INFO:utils.ml:Predictions made with classifier: XGBClassifier8
INFO:utils.ml:Predictions made with classifier: XGBClassifier9


Processed 53/53 batches.


INFO:utils.ml:Predictions made with classifier: XGBClassifier10
INFO:utils.ml:Predictions made with classifier: XGBClassifier11
INFO:utils.ml:Predictions made with classifier: XGBClassifier12


{'GT': array([0, 0, 0, ..., 2, 2, 2]),
 'RandomForestClassifier0': array([0, 0, 2, ..., 1, 2, 0]),
 'RandomForestClassifier1': array([0, 0, 2, ..., 1, 2, 0]),
 'RandomForestClassifier2': array([0, 0, 2, ..., 1, 2, 0]),
 'RandomForestClassifier3': array([0, 0, 2, ..., 1, 2, 0]),
 'XGBClassifier4': array([0, 0, 2, ..., 0, 2, 2], dtype=int64),
 'XGBClassifier5': array([0, 0, 2, ..., 0, 2, 0], dtype=int64),
 'XGBClassifier6': array([0, 0, 2, ..., 0, 2, 0], dtype=int64),
 'XGBClassifier7': array([0, 0, 2, ..., 0, 2, 0], dtype=int64),
 'XGBClassifier8': array([0, 0, 2, ..., 0, 2, 2], dtype=int64),
 'XGBClassifier9': array([0, 0, 2, ..., 0, 2, 0], dtype=int64),
 'XGBClassifier10': array([0, 0, 2, ..., 0, 2, 0], dtype=int64),
 'XGBClassifier11': array([0, 0, 2, ..., 0, 2, 0], dtype=int64),
 'XGBClassifier12': array([0, 0, 2, ..., 0, 2, 2], dtype=int64)}

In [52]:
# Metrics after Borderline-SMOTE, requesting the five metrics explicitly; one
# row per predictions entry ("GT" and every classifier), one column per metric.
df_borderline_smote = pd.DataFrame(
    pipeline.calculate_metrics(["accuracy", "precision", "recall", "f1", "kappa"])
).transpose()
df_borderline_smote

INFO:utils.ml:Metrics for classifier GT: {'accuracy': 1.0, 'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'kappa': 1.0}
INFO:utils.ml:Metrics for classifier RandomForestClassifier0: {'accuracy': 0.8496062992125984, 'precision': 0.7657490758860622, 'recall': 0.7350031520042387, 'f1': 0.7474814767120478, 'kappa': 0.7271342629464145}
INFO:utils.ml:Metrics for classifier RandomForestClassifier1: {'accuracy': 0.841732283464567, 'precision': 0.7468524157482639, 'recall': 0.7349517318631206, 'f1': 0.7402115336643464, 'kappa': 0.7146788334547926}
INFO:utils.ml:Metrics for classifier RandomForestClassifier2: {'accuracy': 0.8543307086614174, 'precision': 0.7691407018351097, 'recall': 0.7386638123447157, 'f1': 0.7509721903807641, 'kappa': 0.7358129472457979}
INFO:utils.ml:Metrics for classifier RandomForestClassifier3: {'accuracy': 0.8519685039370078, 'precision': 0.7675434193819887, 'recall': 0.7336013230099838, 'f1': 0.7470597574662615, 'kappa': 0.7311757738435694}
INFO:utils.ml:Metrics for classi

Unnamed: 0,accuracy,precision,recall,f1,kappa
GT,1.0,1.0,1.0,1.0,1.0
RandomForestClassifier0,0.849606,0.765749,0.735003,0.747481,0.727134
RandomForestClassifier1,0.841732,0.746852,0.734952,0.740212,0.714679
RandomForestClassifier2,0.854331,0.769141,0.738664,0.750972,0.735813
RandomForestClassifier3,0.851969,0.767543,0.733601,0.74706,0.731176
XGBClassifier4,0.866929,0.777698,0.760914,0.768606,0.758976
XGBClassifier5,0.864567,0.772209,0.761783,0.766725,0.755241
XGBClassifier6,0.865354,0.763378,0.759931,0.761598,0.757543
XGBClassifier7,0.871654,0.77743,0.761875,0.768957,0.767815
XGBClassifier8,0.870866,0.784483,0.760495,0.771117,0.765444


In [53]:
# Select the classifier(s) with the highest kappa after Borderline-SMOTE. The
# "GT" row is removed by label before searching, so a classifier that reaches
# kappa 1.0 can no longer make the ground-truth row itself be picked as the
# "best classifier".
clf_kappa = df_borderline_smote["kappa"].drop(index="GT", errors="ignore")
highest = df_borderline_smote.loc[clf_kappa.index[clf_kappa == clf_kappa.max()]]
idx = highest.index[0]  # first one in table order when several classifiers tie

# Rows are the true classes, columns the classes predicted by the best classifier.
confusion_matrix(pipeline.predictions["GT"], pipeline.predictions[idx])

array([[437,  38,  23],
       [ 41, 626,  11],
       [ 34,  14,  46]], dtype=int64)

___
# PCA

In [54]:
from sklearn.decomposition import PCA
from imblearn.over_sampling import BorderlineSMOTE

# Rebuild the balanced training set from the untouched backup (NaNs replaced
# via np.nan_to_num) so this cell does not depend on what earlier experiments
# left in pipeline.feature_matrix.
smote = BorderlineSMOTE(sampling_strategy='auto', random_state=42)

pipeline.feature_matrix, pipeline.labels = smote.fit_resample(
    np.nan_to_num(pipeline.backup_feature_matrix), pipeline.backup_labels)

# Reduce the 190 extracted features to 100 principal components. The seed makes
# the projection reproducible when scikit-learn selects its randomized SVD
# solver, matching the seed used by every other stochastic step here.
# NOTE(review): the features are not standardized before PCA, so components are
# dominated by the features with the largest numeric range - consider scaling.
pca = PCA(n_components=100, random_state=42)

# PCA is unsupervised: the labels are not needed to fit it.
pipeline.feature_matrix = pca.fit_transform(pipeline.feature_matrix)

# Shapes before (backup) and after resampling + projection.
print(pipeline.backup_feature_matrix.shape)
print(pipeline.feature_matrix.shape)

print(pipeline.backup_labels.shape)
print(pipeline.labels.shape)

print(pca.get_feature_names_out())

# Refit every classifier on the PCA-projected training data.
# NOTE(review): the "Top 10 features" log lines produced by this fit still show
# the original extractor names although the columns are now PCA components, so
# those importances are mislabelled for this experiment.
pipeline.fitted_classifiers = {}
pipeline.fit_classifiers()

INFO:utils.ml:Fitting classifiers...


(5082, 190)
(8139, 100)
(5082,)
(8139,)
['pca0' 'pca1' 'pca2' 'pca3' 'pca4' 'pca5' 'pca6' 'pca7' 'pca8' 'pca9'
 'pca10' 'pca11' 'pca12' 'pca13' 'pca14' 'pca15' 'pca16' 'pca17' 'pca18'
 'pca19' 'pca20' 'pca21' 'pca22' 'pca23' 'pca24' 'pca25' 'pca26' 'pca27'
 'pca28' 'pca29' 'pca30' 'pca31' 'pca32' 'pca33' 'pca34' 'pca35' 'pca36'
 'pca37' 'pca38' 'pca39' 'pca40' 'pca41' 'pca42' 'pca43' 'pca44' 'pca45'
 'pca46' 'pca47' 'pca48' 'pca49' 'pca50' 'pca51' 'pca52' 'pca53' 'pca54'
 'pca55' 'pca56' 'pca57' 'pca58' 'pca59' 'pca60' 'pca61' 'pca62' 'pca63'
 'pca64' 'pca65' 'pca66' 'pca67' 'pca68' 'pca69' 'pca70' 'pca71' 'pca72'
 'pca73' 'pca74' 'pca75' 'pca76' 'pca77' 'pca78' 'pca79' 'pca80' 'pca81'
 'pca82' 'pca83' 'pca84' 'pca85' 'pca86' 'pca87' 'pca88' 'pca89' 'pca90'
 'pca91' 'pca92' 'pca93' 'pca94' 'pca95' 'pca96' 'pca97' 'pca98' 'pca99']


INFO:utils.ml:Fitting classifier: RandomForestClassifier0
INFO:utils.ml:Top 10 features for RandomForestClassifier0: [('gradient_direction_mean', 0.04955539805118837), ('color_moments_rgb_R_median', 0.030846212098150066), ('color_moments_rgb_R_skew', 0.024109006919812246), ('gradient_magnitude_std', 0.022834336064755933), ('color_moments_lab_B_min', 0.02175719407989049), ('color_moments_lab_L_var', 0.020520016183174943), ('gradient_direction_std', 0.020251705585997506), ('color_moments_rgb_B_max', 0.01814614978390262), ('color_moments_lab_A_mean', 0.016984353311596076), ('color_moments_rgb_G_entropy', 0.015360724080591534)]
INFO:utils.ml:Fitted classifier: RandomForestClassifier0; Done in 10.875952005386353 seconds
INFO:utils.ml:Fitting classifier: RandomForestClassifier1
INFO:utils.ml:Top 10 features for RandomForestClassifier1: [('gradient_direction_mean', 0.04796118427919965), ('color_moments_rgb_R_median', 0.03110844334017325), ('color_moments_rgb_R_skew', 0.022484593770461175), ('

In [55]:
# Manual version of the prediction step for the PCA experiment: the validation
# features have to go through the fitted PCA before the classifiers see them.

# Extract validation features with the pipeline's own preprocessing factory,
# batch size and feature strategy.
new_loader = FactoryLoader(
    path=VAL_PATH,
    factory=pipeline.loader.get_factory(),
    percentage=percent,
    batch_size=pipeline.batch_size,
)
new_feature_matrix, new_labels = pipeline.feature_strategy.run(new_loader.get_loader())

# Replace NaNs (np.nan_to_num), then project onto the PCA basis fitted on the
# training features.
new_feature_matrix = pca.transform(np.nan_to_num(new_feature_matrix))

# Rebuild pipeline.predictions: ground truth under "GT", then one entry per
# fitted classifier.
pipeline.predictions = {"GT": new_labels}
for clf_name, clf in pipeline.fitted_classifiers.items():
    pipeline.predictions[clf_name] = clf.predict(new_feature_matrix)
    if pipeline.verbose:
        logger.info("Predictions made with classifier: %s", clf_name)


Processed 5/53 batches.
Processed 10/53 batches.
Processed 15/53 batches.
Processed 20/53 batches.
Processed 25/53 batches.
Processed 30/53 batches.
Processed 35/53 batches.
Processed 40/53 batches.
Processed 45/53 batches.
Processed 50/53 batches.


INFO:utils.utils:Predictions made with classifier: RandomForestClassifier0
INFO:utils.utils:Predictions made with classifier: RandomForestClassifier1
INFO:utils.utils:Predictions made with classifier: RandomForestClassifier2
INFO:utils.utils:Predictions made with classifier: RandomForestClassifier3
INFO:utils.utils:Predictions made with classifier: XGBClassifier4
INFO:utils.utils:Predictions made with classifier: XGBClassifier5
INFO:utils.utils:Predictions made with classifier: XGBClassifier6


Processed 53/53 batches.


INFO:utils.utils:Predictions made with classifier: XGBClassifier7
INFO:utils.utils:Predictions made with classifier: XGBClassifier8
INFO:utils.utils:Predictions made with classifier: XGBClassifier9
INFO:utils.utils:Predictions made with classifier: XGBClassifier10
INFO:utils.utils:Predictions made with classifier: XGBClassifier11
INFO:utils.utils:Predictions made with classifier: XGBClassifier12


In [56]:
# Metrics for the PCA experiment, computed with avg="macro"; one row per
# predictions entry ("GT" and every classifier), one column per metric.
df_pca = pd.DataFrame(pipeline.calculate_metrics(avg="macro")).transpose()
df_pca

INFO:utils.ml:Metrics for classifier GT: {'accuracy': 1.0, 'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'kappa': 1.0}
INFO:utils.ml:Metrics for classifier RandomForestClassifier0: {'accuracy': 0.8307086614173228, 'precision': 0.7533966345569589, 'recall': 0.674332426675257, 'f1': 0.6944397219018827, 'kappa': 0.6877844300422157}
INFO:utils.ml:Metrics for classifier RandomForestClassifier1: {'accuracy': 0.8173228346456692, 'precision': 0.7537313724850817, 'recall': 0.6801805855191304, 'f1': 0.7016019028618498, 'kappa': 0.6649228949642906}
INFO:utils.ml:Metrics for classifier RandomForestClassifier2: {'accuracy': 0.8362204724409449, 'precision': 0.7685628248218069, 'recall': 0.6810060804316872, 'f1': 0.703246294587908, 'kappa': 0.697560886251019}
INFO:utils.ml:Metrics for classifier RandomForestClassifier3: {'accuracy': 0.831496062992126, 'precision': 0.770024482734652, 'recall': 0.6771677181329346, 'f1': 0.7006046831002783, 'kappa': 0.6882427434822109}
INFO:utils.ml:Metrics for classifier

Unnamed: 0,accuracy,precision,recall,f1,kappa
GT,1.0,1.0,1.0,1.0,1.0
RandomForestClassifier0,0.830709,0.753397,0.674332,0.69444,0.687784
RandomForestClassifier1,0.817323,0.753731,0.680181,0.701602,0.664923
RandomForestClassifier2,0.83622,0.768563,0.681006,0.703246,0.697561
RandomForestClassifier3,0.831496,0.770024,0.677168,0.700605,0.688243
XGBClassifier4,0.854331,0.786113,0.723569,0.744987,0.733076
XGBClassifier5,0.853543,0.786044,0.715691,0.737302,0.731785
XGBClassifier6,0.849606,0.756261,0.723674,0.736388,0.726595
XGBClassifier7,0.852756,0.775239,0.726529,0.743988,0.731574
XGBClassifier8,0.856693,0.788945,0.722345,0.744432,0.737163


In [57]:
# Select the classifier(s) with the highest kappa in the PCA experiment. The
# "GT" row is removed by label before searching, so a classifier that reaches
# kappa 1.0 can no longer make the ground-truth row itself be picked as the
# "best classifier".
clf_kappa = df_pca["kappa"].drop(index="GT", errors="ignore")
highest = df_pca.loc[clf_kappa.index[clf_kappa == clf_kappa.max()]]
idx = highest.index[0]  # first one in table order when several classifiers tie

# Rows are the true classes, columns the classes predicted by the best classifier.
confusion_matrix(pipeline.predictions["GT"], pipeline.predictions[idx])

array([[436,  47,  15],
       [ 57, 616,   5],
       [ 34,  24,  36]], dtype=int64)