In [1]:
import numpy as np
from sklearn.metrics import confusion_matrix, recall_score, cohen_kappa_score

from utils.loader import FactoryLoader
from utils.ml import MLPipeline
from utils.preprocessing import PreprocessingFactory
from utils.feature_extraction import *
from utils.utils import *

import os

# Root folder of the multiclass dataset. The default is the original author's
# local checkout; set the CADX_DATA_ROOT environment variable to run the
# notebook on another machine without editing this cell.
DATA_ROOT = os.environ.get(
    "CADX_DATA_ROOT",
    r"C:\Users\gimes\Src\repos\CADx-Project\dataset\multiclass",
)

# Validation and training splits live in sibling sub-folders of the root.
VAL_PATH = os.path.join(DATA_ROOT, "val")
TRAIN_PATH = os.path.join(DATA_ROOT, "train")

INFO:numexpr.utils:Note: NumExpr detected 16 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
INFO:numexpr.utils:NumExpr defaulting to 8 threads.


In [2]:
# --- Run configuration -------------------------------------------------------
percent = 100      # forwarded as `percentage` to the loader and the pipeline
random = False     # forwarded as `shuffle` to the loader and the pipeline
batch_size = 24    # forwarded as `batch_size` to the loader and the pipeline
th = 0.01          # `threshold` shared by the gradient and colour-moment extractors

# --- Preprocessing: steps are registered on the factory in this order --------
factory = PreprocessingFactory()
factory.pad2square(fill=np.nan)
factory.resize((150,150))
factory.normalize2float()

# Loader over the validation folder, built with the same preprocessing factory.
factory_loader = FactoryLoader(
    path=VAL_PATH,
    batch_size=batch_size,
    factory=factory,
    percentage=percent,
    shuffle=random,
)

# --- Feature extraction strategy ---------------------------------------------
# Extractors are added in a fixed order: gradient, colour moments, LBP,
# Fourier, GLCM.
strategy = FeatureExtractionStrategy()

strategy.add_extractor(GradientExtractor(threshold=th))

# Colour moments in three colour spaces.
for color_space in ("rgb", "lab", "hsv"):
    strategy.add_extractor(ColorMomentsExtractor(color_space, threshold=th))

# Local binary patterns at radii 1..4, with 8 sampling points per unit of radius
# (8, 16, 24, 32).
for radius in (1, 2, 3, 4):
    strategy.add_extractor(LBPExtractor(radius=radius, n_points=8 * radius))

strategy.add_extractor(FourierTransformExtractor())

glcm_properties = ['contrast', 'dissimilarity', 'homogeneity', 'energy', 'correlation']
strategy.add_extractor(GLCMExtractor(properties=glcm_properties))

# --- Pipeline over the training folder; classifiers are attached in a later cell
pipeline = MLPipeline(
    dataset_path=TRAIN_PATH,
    preprocessing_factory=factory,
    feature_strategy=strategy,
    classifiers=[],
    percentage=percent,
    verbose=True,
    shuffle=random,
    batch_size=batch_size,
)

INFO:utils.ml:MLPipeline initialized with dataset path: C:\Users\gimes\Src\repos\CADx-Project\dataset\multiclass\train
INFO:utils.ml:Preprocessing steps


In [3]:
# Discard any feature matrix already stored on the pipeline, then run feature
# extraction for the pipeline's dataset (TRAIN_PATH).
# NOTE(review): the log line "Extracted 5082 features" equals the number of rows
# of the (5082, 189) matrix printed further down, so it presumably counts
# samples rather than features - confirm in utils.ml.
pipeline.feature_matrix = None
pipeline.run_feature_extraction()

INFO:utils.ml:Running feature extraction...


Processed 5/212 batches.
Processed 10/212 batches.
Processed 15/212 batches.
Processed 20/212 batches.
Processed 25/212 batches.
Processed 30/212 batches.
Processed 35/212 batches.
Processed 40/212 batches.
Processed 45/212 batches.
Processed 50/212 batches.
Processed 55/212 batches.
Processed 60/212 batches.
Processed 65/212 batches.
Processed 70/212 batches.
Processed 75/212 batches.
Processed 80/212 batches.
Processed 85/212 batches.
Processed 90/212 batches.
Processed 95/212 batches.
Processed 100/212 batches.
Processed 105/212 batches.
Processed 110/212 batches.
Processed 115/212 batches.
Processed 120/212 batches.
Processed 125/212 batches.
Processed 130/212 batches.
Processed 135/212 batches.
Processed 140/212 batches.
Processed 145/212 batches.
Processed 150/212 batches.
Processed 155/212 batches.
Processed 160/212 batches.
Processed 165/212 batches.
Processed 170/212 batches.
Processed 175/212 batches.
Processed 180/212 batches.
Processed 185/212 batches.
Processed 190/212 bat

INFO:utils.ml:Feature extraction completed. Extracted 5082 features.


Processed 212/212 batches.


In [4]:
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier

# --- Random forests ----------------------------------------------------------
# Every forest is seeded so that re-running the notebook reproduces the metric
# tables. rf1/rf2 use a different seed from rf3/rf5: oob_score only adds an
# out-of-bag estimate, so sharing seed 42 would make rf1/rf3 (and rf2/rf5)
# duplicate models.
rf1 = RandomForestClassifier(n_estimators=100, random_state=0)
rf2 = RandomForestClassifier(n_estimators=150, random_state=0)
# 100 trees: with out-of-bag scoring / with bootstrap disabled
rf3 = RandomForestClassifier(n_estimators=100, oob_score=True, random_state=42)
rf4 = RandomForestClassifier(n_estimators=100, bootstrap=False, random_state=42)
# 150 trees: with out-of-bag scoring / with bootstrap disabled
rf5 = RandomForestClassifier(n_estimators=150, oob_score=True, random_state=42)
rf6 = RandomForestClassifier(n_estimators=150, bootstrap=False, random_state=42)
# 200 trees: with out-of-bag scoring / with bootstrap disabled
rf7 = RandomForestClassifier(n_estimators=200, oob_score=True, random_state=42)
rf8 = RandomForestClassifier(n_estimators=200, bootstrap=False, random_state=42)


# --- XGBoost -----------------------------------------------------------------
# xgb1-3 vary only the number of boosting rounds; xgb4-11 are hand-picked
# combinations of learning rate, depth, child weight, sampling and regularisation.
xgb1 = XGBClassifier(n_estimators=350)
xgb2 = XGBClassifier(n_estimators=450)
xgb3 = XGBClassifier(n_estimators=550)
xgb4 = XGBClassifier(learning_rate=0.05, n_estimators=400, max_depth=3, min_child_weight=4, subsample=0.8, colsample_bytree=0.8)
# NOTE(review): the "2ND" / "THIS" markers below are the original author's; they
# presumably flag the best-performing configurations - confirm before relying on them.
xgb5 = XGBClassifier(learning_rate=0.1, n_estimators=400, max_depth=7, min_child_weight=3, subsample=0.7, colsample_bytree=0.7) # 2ND
xgb6 = XGBClassifier(learning_rate=0.1, n_estimators=400, max_depth=5, min_child_weight=5, gamma=0.2, subsample=0.8, colsample_bytree=0.8) # THIS
xgb7 = XGBClassifier(learning_rate=0.1, n_estimators=400, max_depth=5, min_child_weight=1, subsample=0.8, colsample_bytree=0.8, reg_alpha=0.1, reg_lambda=0.1)
xgb8 = XGBClassifier(learning_rate=0.05, n_estimators=500, max_depth=7, min_child_weight=1, subsample=0.8, colsample_bytree=0.8)
xgb9 = XGBClassifier(learning_rate=0.05, n_estimators=400, max_depth=7, min_child_weight=1, subsample=0.8, colsample_bytree=0.8)
xgb10 = XGBClassifier(learning_rate=0.1, n_estimators=400, max_depth=7, min_child_weight=1, subsample=0.8, colsample_bytree=0.8)
xgb11 = XGBClassifier(learning_rate=0.1, n_estimators=500, max_depth=7, min_child_weight=1, subsample=0.8, colsample_bytree=0.8)


# Attach the estimators to the pipeline. In the logs and metric tables each one
# appears as "<ClassName><position in this list>" (e.g. XGBClassifier8 is xgb1).
pipeline.classifiers = [rf1, rf2, rf3, rf4, rf5, rf6, rf7, rf8,
                        # svm1, svm2, svm3, svm6, svm7, svm8,
                        xgb1, xgb2, xgb3, xgb4, xgb5, xgb6, xgb7, xgb8, xgb9, xgb10, xgb11]
# Forget previously fitted models so the next fit starts from a clean slate.
pipeline.fitted_classifiers = {}

In [5]:
# Fit every estimator attached to the pipeline on the extracted training
# features. Per the log, each fit reports its top-10 feature importances and
# the time it took.
pipeline.fit_classifiers()

INFO:utils.ml:Fitting classifiers...
INFO:utils.ml:Fitting classifier: RandomForestClassifier0
INFO:utils.ml:Top 10 features for RandomForestClassifier0: [('color_moments_lab_B_var', 0.03989228441192286), ('color_moments_lab_B_std', 0.03892579031123583), ('color_moments_rgb_B_var', 0.020948314154270054), ('color_moments_rgb_B_std', 0.017645031661127276), ('color_moments_rgb_B_iqr', 0.017254162127920053), ('color_moments_rgb_B_kurtosis', 0.016971137053938327), ('color_moments_lab_L_iqr', 0.01682056399822013), ('color_moments_rgb_R_iqr', 0.015114091985386706), ('color_moments_rgb_G_var', 0.011442033459388337), ('color_moments_rgb_R_kurtosis', 0.011242740105242921)]
INFO:utils.ml:Fitted classifier: RandomForestClassifier0; Done in 4.296960353851318 seconds
INFO:utils.ml:Fitting classifier: RandomForestClassifier1
INFO:utils.ml:Top 10 features for RandomForestClassifier1: [('color_moments_lab_B_var', 0.035240260679251206), ('color_moments_lab_B_std', 0.031351018404342755), ('color_moments_

In [6]:
# Predict on the validation folder with every fitted classifier. The displayed
# result is a dict holding the ground-truth labels under "GT" plus one array of
# predicted labels per classifier name.
pipeline.predict_with_classifiers(VAL_PATH, percent)

INFO:utils.ml:Predicting with classifiers on dataset: C:\Users\gimes\Src\repos\CADx-Project\dataset\multiclass\val


Processed 5/53 batches.
Processed 10/53 batches.
Processed 15/53 batches.
Processed 20/53 batches.
Processed 25/53 batches.
Processed 30/53 batches.
Processed 35/53 batches.
Processed 40/53 batches.
Processed 45/53 batches.
Processed 50/53 batches.


INFO:utils.ml:Predictions made with classifier: RandomForestClassifier0
INFO:utils.ml:Predictions made with classifier: RandomForestClassifier1
INFO:utils.ml:Predictions made with classifier: RandomForestClassifier2
INFO:utils.ml:Predictions made with classifier: RandomForestClassifier3
INFO:utils.ml:Predictions made with classifier: RandomForestClassifier4
INFO:utils.ml:Predictions made with classifier: RandomForestClassifier5
INFO:utils.ml:Predictions made with classifier: RandomForestClassifier6


Processed 53/53 batches.


INFO:utils.ml:Predictions made with classifier: RandomForestClassifier7
INFO:utils.ml:Predictions made with classifier: XGBClassifier8
INFO:utils.ml:Predictions made with classifier: XGBClassifier9
INFO:utils.ml:Predictions made with classifier: XGBClassifier10
INFO:utils.ml:Predictions made with classifier: XGBClassifier11
INFO:utils.ml:Predictions made with classifier: XGBClassifier12
INFO:utils.ml:Predictions made with classifier: XGBClassifier13
INFO:utils.ml:Predictions made with classifier: XGBClassifier14
INFO:utils.ml:Predictions made with classifier: XGBClassifier15
INFO:utils.ml:Predictions made with classifier: XGBClassifier16
INFO:utils.ml:Predictions made with classifier: XGBClassifier17
INFO:utils.ml:Predictions made with classifier: XGBClassifier18


{'GT': array([0, 0, 0, ..., 2, 2, 2]),
 'RandomForestClassifier0': array([0, 0, 0, ..., 1, 0, 0]),
 'RandomForestClassifier1': array([0, 0, 1, ..., 1, 0, 0]),
 'RandomForestClassifier2': array([0, 0, 0, ..., 1, 0, 0]),
 'RandomForestClassifier3': array([0, 0, 1, ..., 1, 0, 0]),
 'RandomForestClassifier4': array([0, 0, 0, ..., 1, 0, 0]),
 'RandomForestClassifier5': array([0, 0, 0, ..., 1, 0, 0]),
 'RandomForestClassifier6': array([0, 0, 1, ..., 1, 0, 0]),
 'RandomForestClassifier7': array([0, 0, 1, ..., 1, 0, 0]),
 'XGBClassifier8': array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
 'XGBClassifier9': array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
 'XGBClassifier10': array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
 'XGBClassifier11': array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
 'XGBClassifier12': array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
 'XGBClassifier13': array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
 'XGBClassifier14': array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
 'XGBClassifier15': array([0

In [7]:
import pandas as pd

# Baseline (unbalanced training set) metrics, transposed so that each row is a
# classifier and each column a metric.
baseline_metrics = pipeline.calculate_metrics()
df = pd.DataFrame(data=baseline_metrics).transpose()
df

INFO:utils.ml:Metrics for classifier GT: {'accuracy': 1.0, 'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'kappa': 1.0}
INFO:utils.ml:Metrics for classifier RandomForestClassifier0: {'accuracy': 0.8322834645669291, 'precision': 0.8838361179405272, 'recall': 0.6198023752576363, 'f1': 0.622506054356038, 'kappa': 0.6832034964609108}
INFO:utils.ml:Metrics for classifier RandomForestClassifier1: {'accuracy': 0.831496062992126, 'precision': 0.8469070356231693, 'recall': 0.6187776272982471, 'f1': 0.6215956543293997, 'kappa': 0.6814640138018978}
INFO:utils.ml:Metrics for classifier RandomForestClassifier2: {'accuracy': 0.8267716535433071, 'precision': 0.8801137405980555, 'recall': 0.6136618273760327, 'f1': 0.6131270380972068, 'kappa': 0.6733264116882267}
INFO:utils.ml:Metrics for classifier RandomForestClassifier3: {'accuracy': 0.8464566929133859, 'precision': 0.856696112855956, 'recall': 0.6539761002208798, 'f1': 0.6734580986237019, 'kappa': 0.7116432550254299}
INFO:utils.ml:Metrics for classifi

Unnamed: 0,accuracy,precision,recall,f1,kappa
GT,1.0,1.0,1.0,1.0,1.0
RandomForestClassifier0,0.832283,0.883836,0.619802,0.622506,0.683203
RandomForestClassifier1,0.831496,0.846907,0.618778,0.621596,0.681464
RandomForestClassifier2,0.826772,0.880114,0.613662,0.613127,0.673326
RandomForestClassifier3,0.846457,0.856696,0.653976,0.673458,0.711643
RandomForestClassifier4,0.823622,0.878103,0.611518,0.611052,0.667387
RandomForestClassifier5,0.852756,0.879106,0.662208,0.683646,0.723763
RandomForestClassifier6,0.822047,0.8771,0.609823,0.609797,0.663922
RandomForestClassifier7,0.851969,0.879432,0.664415,0.687689,0.722358
XGBClassifier8,0.870866,0.851068,0.706548,0.735645,0.76048


In [None]:
# Select the classifier with the highest Cohen's kappa on the baseline run.
# The "GT" row (ground truth scored against itself, kappa == 1.0) is removed
# before BOTH the max and the equality match; matching against the full frame
# would return "GT" whenever a classifier also reached kappa 1.0.
candidates = df.drop(index="GT")
highest = candidates[candidates["kappa"] == candidates["kappa"].max()]
idx = highest.index[0]  # first classifier wins on ties

# Rows are ground-truth classes, columns are the selected classifier's predictions.
confusion_matrix(pipeline.predictions["GT"], pipeline.predictions[idx])

___
# Balance the data

In [8]:
# Keep untouched copies of the extracted training features and labels on the
# pipeline object, so the balancing experiments below can overwrite
# pipeline.feature_matrix / pipeline.labels and still start from the original data.
pipeline.backup_feature_matrix = pipeline.feature_matrix.copy()
pipeline.backup_labels = pipeline.labels.copy()

In [9]:
# Sanity check: print the shape of each backup next to its live counterpart,
# features first and labels second.
for array in (
    pipeline.backup_feature_matrix,
    pipeline.feature_matrix,
    pipeline.backup_labels,
    pipeline.labels,
):
    print(array.shape)

(5082, 189)
(5082, 189)
(5082,)
(5082,)


In [10]:
# Display the names of the extracted feature columns (gradient statistics
# first, then colour moments per channel, ...).
pipeline.get_feature_names()

['gradient_magnitude_mean',
 'gradient_magnitude_std',
 'gradient_direction_mean',
 'gradient_direction_std',
 'color_moments_rgb_R_mean',
 'color_moments_rgb_R_std',
 'color_moments_rgb_R_skew',
 'color_moments_rgb_R_kurtosis',
 'color_moments_rgb_R_median',
 'color_moments_rgb_R_var',
 'color_moments_rgb_R_min',
 'color_moments_rgb_R_max',
 'color_moments_rgb_R_iqr',
 'color_moments_rgb_R_entropy',
 'color_moments_rgb_G_mean',
 'color_moments_rgb_G_std',
 'color_moments_rgb_G_skew',
 'color_moments_rgb_G_kurtosis',
 'color_moments_rgb_G_median',
 'color_moments_rgb_G_var',
 'color_moments_rgb_G_min',
 'color_moments_rgb_G_max',
 'color_moments_rgb_G_iqr',
 'color_moments_rgb_G_entropy',
 'color_moments_rgb_B_mean',
 'color_moments_rgb_B_std',
 'color_moments_rgb_B_skew',
 'color_moments_rgb_B_kurtosis',
 'color_moments_rgb_B_median',
 'color_moments_rgb_B_var',
 'color_moments_rgb_B_min',
 'color_moments_rgb_B_max',
 'color_moments_rgb_B_iqr',
 'color_moments_rgb_B_entropy',
 'color_

In [11]:
from imblearn.over_sampling import SMOTE

# Oversample the minority classes with SMOTE (seeded for reproducibility).
smote = SMOTE(random_state=42)

# Always resample from the backed-up original matrix, not from
# pipeline.feature_matrix: the cell then gives the same result however many
# times it is run, and never resamples already-synthetic data.
# np.nan_to_num replaces NaN entries, which SMOTE cannot handle, with 0.
pipeline.feature_matrix, pipeline.labels = smote.fit_resample(
    np.nan_to_num(pipeline.backup_feature_matrix), pipeline.backup_labels)

In [12]:
# Drop the models fitted on the unbalanced data, then refit every classifier on
# the SMOTE-resampled feature matrix.
pipeline.fitted_classifiers = {}
pipeline.fit_classifiers()

INFO:utils.ml:Fitting classifiers...
INFO:utils.ml:Fitting classifier: RandomForestClassifier0
INFO:utils.ml:Top 10 features for RandomForestClassifier0: [('color_moments_lab_B_std', 0.022517908709084905), ('color_moments_lab_B_var', 0.02206546208689515), ('color_moments_rgb_B_var', 0.021429789844875667), ('color_moments_lab_L_iqr', 0.01731470636711622), ('color_moments_rgb_B_std', 0.015618958109359366), ('color_moments_rgb_B_iqr', 0.014527995995931664), ('color_moments_rgb_G_std', 0.01323798455843907), ('color_moments_rgb_R_iqr', 0.01311700556451779), ('lbp_rad1_bins8_4', 0.012984705484616294), ('color_moments_rgb_B_kurtosis', 0.012900815412511195)]
INFO:utils.ml:Fitted classifier: RandomForestClassifier0; Done in 8.099308252334595 seconds
INFO:utils.ml:Fitting classifier: RandomForestClassifier1
INFO:utils.ml:Top 10 features for RandomForestClassifier1: [('color_moments_lab_B_std', 0.023580736364420193), ('color_moments_lab_B_var', 0.021394836924772734), ('color_moments_lab_L_iqr', 0

In [13]:
# Predict on the (unresampled) validation folder with the classifiers that were
# refitted on the SMOTE-balanced training data.
pipeline.predict_with_classifiers(VAL_PATH, percent)

INFO:utils.ml:Predicting with classifiers on dataset: C:\Users\gimes\Src\repos\CADx-Project\dataset\multiclass\val


Processed 5/53 batches.
Processed 10/53 batches.
Processed 15/53 batches.
Processed 20/53 batches.
Processed 25/53 batches.
Processed 30/53 batches.
Processed 35/53 batches.
Processed 40/53 batches.
Processed 45/53 batches.
Processed 50/53 batches.


INFO:utils.ml:Predictions made with classifier: RandomForestClassifier0
INFO:utils.ml:Predictions made with classifier: RandomForestClassifier1
INFO:utils.ml:Predictions made with classifier: RandomForestClassifier2
INFO:utils.ml:Predictions made with classifier: RandomForestClassifier3
INFO:utils.ml:Predictions made with classifier: RandomForestClassifier4
INFO:utils.ml:Predictions made with classifier: RandomForestClassifier5


Processed 53/53 batches.


INFO:utils.ml:Predictions made with classifier: RandomForestClassifier6
INFO:utils.ml:Predictions made with classifier: RandomForestClassifier7
INFO:utils.ml:Predictions made with classifier: XGBClassifier8
INFO:utils.ml:Predictions made with classifier: XGBClassifier9
INFO:utils.ml:Predictions made with classifier: XGBClassifier10
INFO:utils.ml:Predictions made with classifier: XGBClassifier11
INFO:utils.ml:Predictions made with classifier: XGBClassifier12
INFO:utils.ml:Predictions made with classifier: XGBClassifier13
INFO:utils.ml:Predictions made with classifier: XGBClassifier14
INFO:utils.ml:Predictions made with classifier: XGBClassifier15
INFO:utils.ml:Predictions made with classifier: XGBClassifier16
INFO:utils.ml:Predictions made with classifier: XGBClassifier17
INFO:utils.ml:Predictions made with classifier: XGBClassifier18


{'GT': array([0, 0, 0, ..., 2, 2, 2]),
 'RandomForestClassifier0': array([0, 0, 2, ..., 1, 0, 0]),
 'RandomForestClassifier1': array([0, 0, 2, ..., 1, 0, 0]),
 'RandomForestClassifier2': array([0, 0, 2, ..., 1, 0, 0]),
 'RandomForestClassifier3': array([0, 0, 2, ..., 1, 0, 0]),
 'RandomForestClassifier4': array([0, 0, 2, ..., 1, 0, 0]),
 'RandomForestClassifier5': array([0, 0, 2, ..., 1, 0, 0]),
 'RandomForestClassifier6': array([0, 0, 2, ..., 1, 0, 0]),
 'RandomForestClassifier7': array([0, 0, 2, ..., 1, 0, 0]),
 'XGBClassifier8': array([0, 0, 2, ..., 0, 2, 0], dtype=int64),
 'XGBClassifier9': array([0, 0, 2, ..., 0, 2, 0], dtype=int64),
 'XGBClassifier10': array([0, 0, 2, ..., 0, 2, 0], dtype=int64),
 'XGBClassifier11': array([0, 0, 2, ..., 0, 0, 0], dtype=int64),
 'XGBClassifier12': array([0, 0, 2, ..., 0, 0, 0], dtype=int64),
 'XGBClassifier13': array([0, 0, 2, ..., 0, 0, 0], dtype=int64),
 'XGBClassifier14': array([0, 0, 2, ..., 0, 0, 0], dtype=int64),
 'XGBClassifier15': array([0

In [14]:
import pandas as pd

# Metrics after SMOTE balancing, transposed so that each row is a classifier
# and each column a metric.
smote_metrics = pipeline.calculate_metrics()
df_smote = pd.DataFrame(data=smote_metrics).transpose()
df_smote

INFO:utils.ml:Metrics for classifier GT: {'accuracy': 1.0, 'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'kappa': 1.0}
INFO:utils.ml:Metrics for classifier RandomForestClassifier0: {'accuracy': 0.8275590551181102, 'precision': 0.718045636666225, 'recall': 0.7187493462206321, 'f1': 0.7182278215097123, 'kappa': 0.6907577563509572}
INFO:utils.ml:Metrics for classifier RandomForestClassifier1: {'accuracy': 0.8330708661417323, 'precision': 0.7317294240799481, 'recall': 0.7180366479411653, 'f1': 0.7235613691741704, 'kappa': 0.699382776700937}
INFO:utils.ml:Metrics for classifier RandomForestClassifier2: {'accuracy': 0.8307086614173228, 'precision': 0.7190875680631134, 'recall': 0.7153178079795489, 'f1': 0.7169501694801811, 'kappa': 0.695615435379731}
INFO:utils.ml:Metrics for classifier RandomForestClassifier3: {'accuracy': 0.8456692913385827, 'precision': 0.7408366367192158, 'recall': 0.7262583252107154, 'f1': 0.7325426255146416, 'kappa': 0.7212304742620872}
INFO:utils.ml:Metrics for classifi

Unnamed: 0,accuracy,precision,recall,f1,kappa
GT,1.0,1.0,1.0,1.0,1.0
RandomForestClassifier0,0.827559,0.718046,0.718749,0.718228,0.690758
RandomForestClassifier1,0.833071,0.731729,0.718037,0.723561,0.699383
RandomForestClassifier2,0.830709,0.719088,0.715318,0.71695,0.695615
RandomForestClassifier3,0.845669,0.740837,0.726258,0.732543,0.72123
RandomForestClassifier4,0.832283,0.728411,0.725287,0.726642,0.698583
RandomForestClassifier5,0.848031,0.75264,0.730965,0.73996,0.725111
RandomForestClassifier6,0.831496,0.725944,0.721918,0.723641,0.697153
RandomForestClassifier7,0.847244,0.746604,0.726709,0.735192,0.723365
XGBClassifier8,0.874803,0.804085,0.767963,0.782897,0.772528


In [15]:
# Select the classifier with the highest Cohen's kappa after SMOTE balancing.
# The "GT" row (ground truth scored against itself, kappa == 1.0) is removed
# before BOTH the max and the equality match; matching against the full frame
# would return "GT" whenever a classifier also reached kappa 1.0.
candidates = df_smote.drop(index="GT")
highest = candidates[candidates["kappa"] == candidates["kappa"].max()]
idx = highest.index[0]  # first classifier wins on ties

# Rows are ground-truth classes, columns are the selected classifier's predictions.
confusion_matrix(pipeline.predictions["GT"], pipeline.predictions[idx])

array([[440,  33,  25],
       [ 42, 628,   8],
       [ 34,  15,  45]], dtype=int64)

___
# Borderline SMOTE

In [16]:
from imblearn.over_sampling import BorderlineSMOTE
# Borderline-SMOTE variant of the balancing step, seeded for reproducibility.
smote = BorderlineSMOTE(sampling_strategy='auto', random_state=42)

# Start from the backed-up original features (NaN replaced via np.nan_to_num)
# and overwrite the pipeline's working matrix and labels with the resampled set.
original_features = np.nan_to_num(pipeline.backup_feature_matrix)
pipeline.feature_matrix, pipeline.labels = smote.fit_resample(
    original_features, pipeline.backup_labels
)

# Before/after shapes: features first, then labels.
for before, after in (
    (pipeline.backup_feature_matrix, pipeline.feature_matrix),
    (pipeline.backup_labels, pipeline.labels),
):
    print(before.shape)
    print(after.shape)

(5082, 189)
(8139, 189)
(5082,)
(8139,)


In [17]:
# Drop the previously fitted models, then refit every classifier on the
# Borderline-SMOTE-resampled feature matrix.
pipeline.fitted_classifiers = {}

pipeline.fit_classifiers()

INFO:utils.ml:Fitting classifiers...
INFO:utils.ml:Fitting classifier: RandomForestClassifier0
INFO:utils.ml:Top 10 features for RandomForestClassifier0: [('color_moments_rgb_B_std', 0.022493374450476437), ('color_moments_lab_B_std', 0.021627789726991704), ('color_moments_lab_B_var', 0.0199290353525708), ('color_moments_lab_L_iqr', 0.018618847444709753), ('color_moments_rgb_B_iqr', 0.015344972422508213), ('color_moments_rgb_B_var', 0.01440355928761019), ('color_moments_rgb_G_iqr', 0.014060164979042736), ('color_moments_rgb_G_var', 0.013005492608675584), ('glcm_energy_dist_1_angle_0', 0.011998496772818614), ('color_moments_rgb_R_kurtosis', 0.01191937710894334)]
INFO:utils.ml:Fitted classifier: RandomForestClassifier0; Done in 11.963889122009277 seconds
INFO:utils.ml:Fitting classifier: RandomForestClassifier1
INFO:utils.ml:Top 10 features for RandomForestClassifier1: [('color_moments_lab_B_var', 0.021249296279635482), ('color_moments_lab_L_iqr', 0.020328343823046727), ('color_moments_la

In [18]:
# Predict on the (unresampled) validation folder with the classifiers that were
# refitted on the Borderline-SMOTE-balanced training data.
pipeline.predict_with_classifiers(VAL_PATH, percent)

INFO:utils.ml:Predicting with classifiers on dataset: C:\Users\gimes\Src\repos\CADx-Project\dataset\multiclass\val


Processed 5/53 batches.
Processed 10/53 batches.
Processed 15/53 batches.
Processed 20/53 batches.
Processed 25/53 batches.
Processed 30/53 batches.
Processed 35/53 batches.
Processed 40/53 batches.
Processed 45/53 batches.
Processed 50/53 batches.


INFO:utils.ml:Predictions made with classifier: RandomForestClassifier0
INFO:utils.ml:Predictions made with classifier: RandomForestClassifier1
INFO:utils.ml:Predictions made with classifier: RandomForestClassifier2
INFO:utils.ml:Predictions made with classifier: RandomForestClassifier3
INFO:utils.ml:Predictions made with classifier: RandomForestClassifier4
INFO:utils.ml:Predictions made with classifier: RandomForestClassifier5
INFO:utils.ml:Predictions made with classifier: RandomForestClassifier6


Processed 53/53 batches.


INFO:utils.ml:Predictions made with classifier: RandomForestClassifier7
INFO:utils.ml:Predictions made with classifier: XGBClassifier8
INFO:utils.ml:Predictions made with classifier: XGBClassifier9
INFO:utils.ml:Predictions made with classifier: XGBClassifier10
INFO:utils.ml:Predictions made with classifier: XGBClassifier11
INFO:utils.ml:Predictions made with classifier: XGBClassifier12
INFO:utils.ml:Predictions made with classifier: XGBClassifier13
INFO:utils.ml:Predictions made with classifier: XGBClassifier14
INFO:utils.ml:Predictions made with classifier: XGBClassifier15
INFO:utils.ml:Predictions made with classifier: XGBClassifier16
INFO:utils.ml:Predictions made with classifier: XGBClassifier17
INFO:utils.ml:Predictions made with classifier: XGBClassifier18


{'GT': array([0, 0, 0, ..., 2, 2, 2]),
 'RandomForestClassifier0': array([0, 0, 2, ..., 0, 0, 0]),
 'RandomForestClassifier1': array([0, 0, 2, ..., 1, 0, 0]),
 'RandomForestClassifier2': array([0, 0, 2, ..., 1, 0, 0]),
 'RandomForestClassifier3': array([0, 0, 2, ..., 1, 0, 0]),
 'RandomForestClassifier4': array([0, 0, 2, ..., 1, 0, 0]),
 'RandomForestClassifier5': array([0, 0, 2, ..., 1, 0, 0]),
 'RandomForestClassifier6': array([0, 0, 2, ..., 1, 0, 0]),
 'RandomForestClassifier7': array([0, 0, 2, ..., 1, 0, 0]),
 'XGBClassifier8': array([2, 0, 0, ..., 0, 0, 0], dtype=int64),
 'XGBClassifier9': array([2, 0, 0, ..., 0, 2, 0], dtype=int64),
 'XGBClassifier10': array([2, 0, 0, ..., 0, 0, 0], dtype=int64),
 'XGBClassifier11': array([0, 0, 2, ..., 0, 0, 0], dtype=int64),
 'XGBClassifier12': array([0, 0, 2, ..., 0, 0, 0], dtype=int64),
 'XGBClassifier13': array([0, 0, 2, ..., 0, 0, 0], dtype=int64),
 'XGBClassifier14': array([0, 0, 2, ..., 0, 2, 0], dtype=int64),
 'XGBClassifier15': array([0

In [19]:
# Metrics after Borderline-SMOTE balancing, with the metric list passed
# explicitly; transposed so that each row is a classifier.
requested_metrics = ["accuracy", "precision", "recall", "f1", "kappa"]
borderline_metrics = pipeline.calculate_metrics(requested_metrics)
df_borderline_smote = pd.DataFrame(data=borderline_metrics).transpose()
df_borderline_smote

INFO:utils.ml:Metrics for classifier GT: {'accuracy': 1.0, 'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'kappa': 1.0}
INFO:utils.ml:Metrics for classifier RandomForestClassifier0: {'accuracy': 0.8362204724409449, 'precision': 0.7448201814653773, 'recall': 0.7320433431460103, 'f1': 0.7374251057008433, 'kappa': 0.7051430306331566}
INFO:utils.ml:Metrics for classifier RandomForestClassifier1: {'accuracy': 0.8322834645669291, 'precision': 0.7353491657082506, 'recall': 0.7256421657256026, 'f1': 0.7298646703907566, 'kappa': 0.6980241125251172}
INFO:utils.ml:Metrics for classifier RandomForestClassifier2: {'accuracy': 0.8354330708661417, 'precision': 0.7319176474355581, 'recall': 0.7277864360221278, 'f1': 0.7295407770706291, 'kappa': 0.7042878978961582}
INFO:utils.ml:Metrics for classifier RandomForestClassifier3: {'accuracy': 0.8456692913385827, 'precision': 0.7487613870790913, 'recall': 0.7205048147148312, 'f1': 0.7316176553392993, 'kappa': 0.720090679285001}
INFO:utils.ml:Metrics for classi

Unnamed: 0,accuracy,precision,recall,f1,kappa
GT,1.0,1.0,1.0,1.0,1.0
RandomForestClassifier0,0.83622,0.74482,0.732043,0.737425,0.705143
RandomForestClassifier1,0.832283,0.735349,0.725642,0.729865,0.698024
RandomForestClassifier2,0.835433,0.731918,0.727786,0.729541,0.704288
RandomForestClassifier3,0.845669,0.748761,0.720505,0.731618,0.720091
RandomForestClassifier4,0.840945,0.745459,0.740924,0.742833,0.714307
RandomForestClassifier5,0.849606,0.761929,0.72584,0.739816,0.726526
RandomForestClassifier6,0.838583,0.74492,0.735684,0.739827,0.709263
RandomForestClassifier7,0.845669,0.754719,0.719972,0.733343,0.719481
XGBClassifier8,0.874803,0.781291,0.759332,0.768636,0.773502


In [20]:
# Select the classifier with the highest Cohen's kappa after Borderline-SMOTE.
# The "GT" row (ground truth scored against itself, kappa == 1.0) is removed
# before BOTH the max and the equality match; matching against the full frame
# would return "GT" whenever a classifier also reached kappa 1.0.
candidates = df_borderline_smote.drop(index="GT")
highest = candidates[candidates["kappa"] == candidates["kappa"].max()]
idx = highest.index[0]  # first classifier wins on ties

# Rows are ground-truth classes, columns are the selected classifier's predictions.
confusion_matrix(pipeline.predictions["GT"], pipeline.predictions[idx])

array([[445,  30,  23],
       [ 44, 623,  11],
       [ 35,  15,  44]], dtype=int64)

___
# PCA

In [21]:
from sklearn.decomposition import PCA
from imblearn.over_sampling import BorderlineSMOTE
# Rebuild the Borderline-SMOTE-balanced training set from the backed-up
# original features (NaN replaced via np.nan_to_num).
smote = BorderlineSMOTE(sampling_strategy='auto', random_state=42)

pipeline.feature_matrix, pipeline.labels = smote.fit_resample(
    np.nan_to_num(pipeline.backup_feature_matrix), pipeline.backup_labels)

# Reduce the 189 features to 100 principal components. random_state is fixed
# because, depending on the scikit-learn version, the default 'auto' solver may
# select the randomized SVD for this shape, which is stochastic.
# NOTE(review): the features are not standardised before PCA, so large-scale
# features may dominate the components - consider scaling first.
pca = PCA(n_components=100, random_state=42)

# The labels argument is accepted but ignored by PCA (unsupervised).
pipeline.feature_matrix = pca.fit_transform(pipeline.feature_matrix, pipeline.labels)

# Before/after shapes: features first, then labels.
print(pipeline.backup_feature_matrix.shape)
print(pipeline.feature_matrix.shape)

print(pipeline.backup_labels.shape)
print(pipeline.labels.shape)

print(pca.get_feature_names_out())

# Refit every classifier on the PCA-projected features.
# NOTE(review): the "Top 10 features" log still prints the original extractor
# names (e.g. 'gradient_magnitude_std') although the columns are now PCA
# components; those names are presumably positional only - do not interpret them.
pipeline.fitted_classifiers = {}
pipeline.fit_classifiers()

INFO:utils.ml:Fitting classifiers...
INFO:utils.ml:Fitting classifier: RandomForestClassifier0


(5082, 189)
(8139, 100)
(5082,)
(8139,)
['pca0' 'pca1' 'pca2' 'pca3' 'pca4' 'pca5' 'pca6' 'pca7' 'pca8' 'pca9'
 'pca10' 'pca11' 'pca12' 'pca13' 'pca14' 'pca15' 'pca16' 'pca17' 'pca18'
 'pca19' 'pca20' 'pca21' 'pca22' 'pca23' 'pca24' 'pca25' 'pca26' 'pca27'
 'pca28' 'pca29' 'pca30' 'pca31' 'pca32' 'pca33' 'pca34' 'pca35' 'pca36'
 'pca37' 'pca38' 'pca39' 'pca40' 'pca41' 'pca42' 'pca43' 'pca44' 'pca45'
 'pca46' 'pca47' 'pca48' 'pca49' 'pca50' 'pca51' 'pca52' 'pca53' 'pca54'
 'pca55' 'pca56' 'pca57' 'pca58' 'pca59' 'pca60' 'pca61' 'pca62' 'pca63'
 'pca64' 'pca65' 'pca66' 'pca67' 'pca68' 'pca69' 'pca70' 'pca71' 'pca72'
 'pca73' 'pca74' 'pca75' 'pca76' 'pca77' 'pca78' 'pca79' 'pca80' 'pca81'
 'pca82' 'pca83' 'pca84' 'pca85' 'pca86' 'pca87' 'pca88' 'pca89' 'pca90'
 'pca91' 'pca92' 'pca93' 'pca94' 'pca95' 'pca96' 'pca97' 'pca98' 'pca99']


INFO:utils.ml:Top 10 features for RandomForestClassifier0: [('gradient_magnitude_std', 0.04170418579194047), ('color_moments_rgb_R_median', 0.03078526332404313), ('color_moments_rgb_R_mean', 0.028379020849434906), ('gradient_direction_mean', 0.028183303417517146), ('color_moments_lab_L_var', 0.025870378483579856), ('color_moments_rgb_R_skew', 0.020136574229036325), ('color_moments_rgb_G_entropy', 0.019541009969340606), ('color_moments_lab_A_mean', 0.017727507043115064), ('gradient_direction_std', 0.01768926130506994), ('color_moments_lab_L_kurtosis', 0.01565686943456985)]
INFO:utils.ml:Fitted classifier: RandomForestClassifier0; Done in 8.598012447357178 seconds
INFO:utils.ml:Fitting classifier: RandomForestClassifier1
INFO:utils.ml:Top 10 features for RandomForestClassifier1: [('gradient_magnitude_std', 0.04385974818488469), ('color_moments_rgb_R_median', 0.033100391477724006), ('gradient_direction_mean', 0.02817613634741616), ('color_moments_rgb_R_mean', 0.02603996110317474), ('color

In [22]:
# Manual prediction on the validation set: the fitted PCA projection has to be
# applied to the validation features before the classifiers see them.

# Load the validation images with the pipeline's own preprocessing factory and
# extract the same features as for training.
new_loader = FactoryLoader(path=VAL_PATH, factory=pipeline.loader.get_factory(),
                           percentage=percent, batch_size=pipeline.batch_size)
new_feature_matrix, new_labels = pipeline.feature_strategy.run(new_loader.get_loader())

new_feature_matrix = np.nan_to_num(new_feature_matrix) # Replace NaN entries, as was done for the training features
new_feature_matrix = pca.transform(new_feature_matrix) # Project onto the principal components fitted on the training data

# Store the ground truth and each classifier's predictions on the pipeline so
# that pipeline.calculate_metrics() can score them.
pipeline.predictions = {"GT": new_labels, }
for clf_name, clf in pipeline.fitted_classifiers.items():

    pipeline.predictions[clf_name] = clf.predict(new_feature_matrix)
    if pipeline.verbose:
        # `logger` is not defined in this notebook; it comes from one of the star imports
        logger.info("Predictions made with classifier: %s", clf_name)


Processed 5/53 batches.
Processed 10/53 batches.
Processed 15/53 batches.
Processed 20/53 batches.
Processed 25/53 batches.
Processed 30/53 batches.
Processed 35/53 batches.
Processed 40/53 batches.
Processed 45/53 batches.
Processed 50/53 batches.


INFO:utils.utils:Predictions made with classifier: RandomForestClassifier0
INFO:utils.utils:Predictions made with classifier: RandomForestClassifier1
INFO:utils.utils:Predictions made with classifier: RandomForestClassifier2
INFO:utils.utils:Predictions made with classifier: RandomForestClassifier3
INFO:utils.utils:Predictions made with classifier: RandomForestClassifier4


Processed 53/53 batches.


INFO:utils.utils:Predictions made with classifier: RandomForestClassifier5
INFO:utils.utils:Predictions made with classifier: RandomForestClassifier6
INFO:utils.utils:Predictions made with classifier: RandomForestClassifier7
INFO:utils.utils:Predictions made with classifier: XGBClassifier8
INFO:utils.utils:Predictions made with classifier: XGBClassifier9
INFO:utils.utils:Predictions made with classifier: XGBClassifier10
INFO:utils.utils:Predictions made with classifier: XGBClassifier11
INFO:utils.utils:Predictions made with classifier: XGBClassifier12
INFO:utils.utils:Predictions made with classifier: XGBClassifier13
INFO:utils.utils:Predictions made with classifier: XGBClassifier14
INFO:utils.utils:Predictions made with classifier: XGBClassifier15
INFO:utils.utils:Predictions made with classifier: XGBClassifier16
INFO:utils.utils:Predictions made with classifier: XGBClassifier17
INFO:utils.utils:Predictions made with classifier: XGBClassifier18


In [23]:
# Metrics for the Borderline-SMOTE + PCA run, requested with avg="macro";
# transposed so that each row is a classifier and each column a metric.
pca_metrics = pipeline.calculate_metrics(avg="macro")
df_pca = pd.DataFrame(data=pca_metrics).transpose()
df_pca

INFO:utils.ml:Metrics for classifier GT: {'accuracy': 1.0, 'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'kappa': 1.0}
INFO:utils.ml:Metrics for classifier RandomForestClassifier0: {'accuracy': 0.8023622047244094, 'precision': 0.6830837716214333, 'recall': 0.6356410390195691, 'f1': 0.6471530680864929, 'kappa': 0.6360857037175722}
INFO:utils.ml:Metrics for classifier RandomForestClassifier1: {'accuracy': 0.8070866141732284, 'precision': 0.686342664995935, 'recall': 0.6337258908224374, 'f1': 0.6443746921173038, 'kappa': 0.6441836636675099}
INFO:utils.ml:Metrics for classifier RandomForestClassifier2: {'accuracy': 0.8086614173228347, 'precision': 0.7018946467475881, 'recall': 0.640284983529171, 'f1': 0.6537751540479665, 'kappa': 0.6470156287173106}
INFO:utils.ml:Metrics for classifier RandomForestClassifier3: {'accuracy': 0.805511811023622, 'precision': 0.7022589889285316, 'recall': 0.632564904695037, 'f1': 0.645377420306688, 'kappa': 0.6403742886360614}
INFO:utils.ml:Metrics for classifier

Unnamed: 0,accuracy,precision,recall,f1,kappa
GT,1.0,1.0,1.0,1.0,1.0
RandomForestClassifier0,0.802362,0.683084,0.635641,0.647153,0.636086
RandomForestClassifier1,0.807087,0.686343,0.633726,0.644375,0.644184
RandomForestClassifier2,0.808661,0.701895,0.640285,0.653775,0.647016
RandomForestClassifier3,0.805512,0.702259,0.632565,0.645377,0.640374
RandomForestClassifier4,0.819685,0.713775,0.651644,0.665753,0.667506
RandomForestClassifier5,0.811811,0.715115,0.633977,0.647204,0.650799
RandomForestClassifier6,0.817323,0.722972,0.6561,0.672192,0.663513
RandomForestClassifier7,0.815748,0.727867,0.636968,0.650857,0.657999
XGBClassifier8,0.855906,0.769706,0.709636,0.728908,0.735489


In [24]:
# Select the classifier with the highest Cohen's kappa for the PCA run.
# The "GT" row (ground truth scored against itself, kappa == 1.0) is removed
# before BOTH the max and the equality match; matching against the full frame
# would return "GT" whenever a classifier also reached kappa 1.0.
candidates = df_pca.drop(index="GT")
highest = candidates[candidates["kappa"] == candidates["kappa"].max()]
idx = highest.index[0]  # first classifier wins on ties

# Rows are ground-truth classes, columns are the selected classifier's predictions.
confusion_matrix(pipeline.predictions["GT"], pipeline.predictions[idx])

array([[441,  46,  11],
       [ 52, 619,   7],
       [ 37,  24,  33]], dtype=int64)