In [1]:
import numpy as np
from sklearn.metrics import confusion_matrix, recall_score, cohen_kappa_score
from sklearn.utils import shuffle

from utils.loader import FactoryLoader
from utils.ml import MLPipeline
from utils.preprocessing import PreprocessingFactory
from utils.feature_extraction import *
from utils.utils import *

import os

# Dataset locations. The defaults are the original absolute paths; set the
# CADX_VAL_PATH / CADX_TRAIN_PATH environment variables to run the notebook
# on another machine without editing this cell.
VAL_PATH = os.environ.get(
    "CADX_VAL_PATH",
    r"C:\Users\gimes\Src\repos\CADx-Project\dataset\multiclass\val",
)
TRAIN_PATH = os.environ.get(
    "CADX_TRAIN_PATH",
    r"C:\Users\gimes\Src\repos\CADx-Project\dataset\multiclass\train",
)

INFO:numexpr.utils:Note: NumExpr detected 16 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
INFO:numexpr.utils:NumExpr defaulting to 8 threads.


In [59]:
# Run configuration shared by the validation loader and the training pipeline.
percent = 100        # forwarded as `percentage`
random = False       # forwarded as `shuffle`
batch_size = 24
th = 0.075           # `threshold` given to the gradient and colour-moment extractors

# Preprocessing factory; the steps are registered in the order written here.
factory = PreprocessingFactory()
factory.gaussian_smoothing(5)
factory.clahe(clip_limit=1.5)
factory.pad2square(fill=np.nan)
factory.resize((150, 150))
factory.hair_removal()
factory.normalize2float()

# Loader over the validation images, built with the same preprocessing factory.
factory_loader = FactoryLoader(
    path=VAL_PATH,
    batch_size=batch_size,
    factory=factory,
    percentage=percent,
    shuffle=random,
)

# Feature-extraction strategy: extractors are added one after another.
strategy = FeatureExtractionStrategy()

# Gradient features.
strategy.add_extractor(GradientExtractor(threshold=th))

# Colour moments, one extractor per colour space.
for color_space in ("rgb", "lab", "hsv"):
    strategy.add_extractor(ColorMomentsExtractor(color_space, threshold=th))

# LBP extractors: radius 1..4 with 8 points per unit of radius (8, 16, 24, 32).
for radius in (1, 2, 3, 4):
    strategy.add_extractor(LBPExtractor(radius=radius, n_points=8 * radius))

# NOTE(review): the recorded output of this cell is
# "NameError: name 'FourierTransformExtractor' is not defined" -- confirm that
# utils.feature_extraction actually exports a class with this name.
strategy.add_extractor(FourierTransformExtractor())

# Laplacian-of-Gaussian extractors with three different constructor arguments.
for log_arg in (0.5, 1, 2):
    strategy.add_extractor(LaplacianOfGaussianExtractor(log_arg))

# GLCM extractor restricted to five texture properties.
glcm_properties = ['contrast', 'dissimilarity', 'homogeneity', 'energy', 'correlation']
strategy.add_extractor(GLCMExtractor(properties=glcm_properties))

# Training pipeline over TRAIN_PATH; classifiers are attached in a later cell.
pipeline = MLPipeline(
    dataset_path=TRAIN_PATH,
    preprocessing_factory=factory,
    feature_strategy=strategy,
    classifiers=[],
    percentage=percent,
    verbose=True,
    shuffle=random,
    batch_size=batch_size,
)

NameError: name 'FourierTransformExtractor' is not defined

In [3]:
# Discard any previously stored feature matrix, then run feature extraction on
# the training pipeline.
# NOTE(review): presumably the reset forces a full recomputation -- confirm in utils.ml.
pipeline.feature_matrix = None
pipeline.run_feature_extraction()

INFO:utils.ml:Running feature extraction...


Processed 5/106 batches.
Processed 10/106 batches.
Processed 15/106 batches.
Processed 20/106 batches.
Processed 25/106 batches.
Processed 30/106 batches.
Processed 35/106 batches.
Processed 40/106 batches.
Processed 45/106 batches.
Processed 50/106 batches.
Processed 55/106 batches.
Processed 60/106 batches.
Processed 65/106 batches.
Processed 70/106 batches.
Processed 75/106 batches.
Processed 80/106 batches.
Processed 85/106 batches.
Processed 90/106 batches.
Processed 95/106 batches.
Processed 100/106 batches.
Processed 105/106 batches.


INFO:utils.ml:Feature extraction completed. Extracted 5082 features.


Processed 106/106 batches.


In [4]:
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier

# --- Random forests -------------------------------------------------------
# rf1/rf2 were the only unseeded forests, which made their scores change from
# run to run. They now use a fixed seed; it deliberately differs from the 42
# used below so they do not simply reproduce rf3/rf5, which share their other
# hyperparameters.
rf1 = RandomForestClassifier(n_estimators=100, random_state=0)
rf2 = RandomForestClassifier(n_estimators=150, random_state=0)
# For 100, 150 and 200 trees: one forest with the out-of-bag score enabled
# and one with bootstrapping disabled.
rf3 = RandomForestClassifier(n_estimators=100, oob_score=True, random_state=42)
rf4 = RandomForestClassifier(n_estimators=100, bootstrap=False, random_state=42)
rf5 = RandomForestClassifier(n_estimators=150, oob_score=True, random_state=42)
rf6 = RandomForestClassifier(n_estimators=150, bootstrap=False, random_state=42)
rf7 = RandomForestClassifier(n_estimators=200, oob_score=True, random_state=42)
rf8 = RandomForestClassifier(n_estimators=200, bootstrap=False, random_state=42)

# --- XGBoost ----------------------------------------------------------------
# xgb1-3 vary only the number of trees; xgb4-11 are hand-picked combinations
# of learning rate, depth, child weight, sampling and regularisation.
xgb1 = XGBClassifier(n_estimators=350)
xgb2 = XGBClassifier(n_estimators=450)
xgb3 = XGBClassifier(n_estimators=550)
xgb4 = XGBClassifier(learning_rate=0.05, n_estimators=400, max_depth=3, min_child_weight=4, subsample=0.8, colsample_bytree=0.8)
xgb5 = XGBClassifier(learning_rate=0.1, n_estimators=400, max_depth=7, min_child_weight=3, subsample=0.7, colsample_bytree=0.7)  # 2ND
xgb6 = XGBClassifier(learning_rate=0.1, n_estimators=400, max_depth=5, min_child_weight=5, gamma=0.2, subsample=0.8, colsample_bytree=0.8)  # THIS
xgb7 = XGBClassifier(learning_rate=0.1, n_estimators=400, max_depth=5, min_child_weight=1, subsample=0.8, colsample_bytree=0.8, reg_alpha=0.1, reg_lambda=0.1)
xgb8 = XGBClassifier(learning_rate=0.05, n_estimators=500, max_depth=7, min_child_weight=1, subsample=0.8, colsample_bytree=0.8)
xgb9 = XGBClassifier(learning_rate=0.05, n_estimators=400, max_depth=7, min_child_weight=1, subsample=0.8, colsample_bytree=0.8)
xgb10 = XGBClassifier(learning_rate=0.1, n_estimators=400, max_depth=7, min_child_weight=1, subsample=0.8, colsample_bytree=0.8)
xgb11 = XGBClassifier(learning_rate=0.1, n_estimators=500, max_depth=7, min_child_weight=1, subsample=0.8, colsample_bytree=0.8)

# Register the classifiers. The list order matters: the logs and the
# prediction dict name each model by class name plus list position
# (RandomForestClassifier0 ... XGBClassifier18).
pipeline.classifiers = [
    rf1, rf2, rf3, rf4, rf5, rf6, rf7, rf8,
    xgb1, xgb2, xgb3, xgb4, xgb5, xgb6, xgb7, xgb8, xgb9, xgb10, xgb11,
]
# Forget any models fitted with a previous classifier list.
pipeline.fitted_classifiers = {}

In [5]:
# Fit every registered classifier on the extracted training features; the log
# reports the top-10 feature importances and the fit time per model.
pipeline.fit_classifiers()

INFO:utils.ml:Fitting classifiers...
INFO:utils.ml:Fitting classifier: RandomForestClassifier0
INFO:utils.ml:Top 10 features for RandomForestClassifier0: [('color_moments_lab_channel_0_skew', 0.037887293603575525), ('color_moments_hsv_channel_0_mean', 0.03173984370883756), ('var_lab_channel_1', 0.022725990229806835), ('var_lab_channel_0', 0.018490798726188803), ('color_moments_lab_channel_1_skew', 0.01831519520538165), ('var_lab_channel_2', 0.016085381930776018), ('color_moments_rgb_channel_0_skew', 0.015846550530500266), ('var_rgb_channel_2', 0.015586900885901766), ('color_moments_rgb_channel_0_std', 0.012670201441078761), ('color_moments_rgb_channel_1_mean', 0.012656744498200338)]
INFO:utils.ml:Fitted classifier: RandomForestClassifier0; Done in 3.800952196121216 seconds
INFO:utils.ml:Fitting classifier: RandomForestClassifier1
INFO:utils.ml:Top 10 features for RandomForestClassifier1: [('color_moments_hsv_channel_0_mean', 0.03586397393702079), ('color_moments_lab_channel_0_skew', 0.

In [6]:
# Predict on the validation set with every fitted classifier. The displayed
# result is a dict of label arrays keyed by classifier name, plus 'GT' for the
# ground-truth labels.
pipeline.predict_with_classifiers(VAL_PATH)

INFO:utils.ml:Predicting with classifiers on dataset: C:\Users\gimes\Src\repos\CADx-Project\dataset\multiclass\val


Processed 5/27 batches.
Processed 10/27 batches.
Processed 15/27 batches.
Processed 20/27 batches.
Processed 25/27 batches.


INFO:utils.ml:Predictions made with classifier: RandomForestClassifier0
INFO:utils.ml:Predictions made with classifier: RandomForestClassifier1
INFO:utils.ml:Predictions made with classifier: RandomForestClassifier2
INFO:utils.ml:Predictions made with classifier: RandomForestClassifier3
INFO:utils.ml:Predictions made with classifier: RandomForestClassifier4
INFO:utils.ml:Predictions made with classifier: RandomForestClassifier5
INFO:utils.ml:Predictions made with classifier: RandomForestClassifier6


Processed 27/27 batches.


INFO:utils.ml:Predictions made with classifier: RandomForestClassifier7
INFO:utils.ml:Predictions made with classifier: XGBClassifier8
INFO:utils.ml:Predictions made with classifier: XGBClassifier9
INFO:utils.ml:Predictions made with classifier: XGBClassifier10
INFO:utils.ml:Predictions made with classifier: XGBClassifier11
INFO:utils.ml:Predictions made with classifier: XGBClassifier12
INFO:utils.ml:Predictions made with classifier: XGBClassifier13
INFO:utils.ml:Predictions made with classifier: XGBClassifier14
INFO:utils.ml:Predictions made with classifier: XGBClassifier15
INFO:utils.ml:Predictions made with classifier: XGBClassifier16
INFO:utils.ml:Predictions made with classifier: XGBClassifier17
INFO:utils.ml:Predictions made with classifier: XGBClassifier18


{'GT': array([0, 0, 0, ..., 2, 2, 2]),
 'RandomForestClassifier0': array([0, 0, 1, ..., 1, 0, 0]),
 'RandomForestClassifier1': array([0, 0, 1, ..., 1, 0, 0]),
 'RandomForestClassifier2': array([0, 0, 1, ..., 1, 0, 0]),
 'RandomForestClassifier3': array([0, 0, 1, ..., 1, 0, 0]),
 'RandomForestClassifier4': array([0, 0, 1, ..., 1, 0, 0]),
 'RandomForestClassifier5': array([0, 0, 1, ..., 1, 0, 0]),
 'RandomForestClassifier6': array([0, 0, 1, ..., 1, 0, 0]),
 'RandomForestClassifier7': array([0, 0, 1, ..., 1, 0, 0]),
 'XGBClassifier8': array([0, 0, 1, ..., 1, 0, 0], dtype=int64),
 'XGBClassifier9': array([0, 0, 1, ..., 1, 0, 0], dtype=int64),
 'XGBClassifier10': array([0, 0, 1, ..., 1, 0, 0], dtype=int64),
 'XGBClassifier11': array([0, 0, 0, ..., 1, 0, 0], dtype=int64),
 'XGBClassifier12': array([0, 0, 2, ..., 1, 0, 0], dtype=int64),
 'XGBClassifier13': array([0, 0, 0, ..., 1, 0, 0], dtype=int64),
 'XGBClassifier14': array([0, 0, 0, ..., 1, 0, 0], dtype=int64),
 'XGBClassifier15': array([0

In [7]:
import pandas as pd

# Baseline metrics (before any class balancing). `df` keeps the frame as built
# from calculate_metrics(); the transposed view is displayed with one row per
# classifier and one column per metric.
# NOTE(review): the recorded output is interleaved with "cannot access local
# variable 'report'" messages emitted during calculate_metrics() -- that looks
# like a bug inside utils.ml and should be checked there.
df = pd.DataFrame(data=pipeline.calculate_metrics())
df.T

INFO:utils.ml:Metrics for classifier GT: {'accuracy': 1.0, 'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'kappa': 1.0}
INFO:utils.ml:Metrics for classifier RandomForestClassifier0: {'accuracy': 0.8062992125984252, 'precision': 0.8675962546310542, 'recall': 0.5866730588959742, 'f1': 0.5690029433406917, 'kappa': 0.6361573972711223}
INFO:utils.ml:Metrics for classifier RandomForestClassifier1: {'accuracy': 0.8118110236220473, 'precision': 0.8710387036897685, 'recall': 0.5933467126524046, 'f1': 0.5791835413481873, 'kappa': 0.6463805058996774}
INFO:utils.ml:Metrics for classifier RandomForestClassifier2: {'accuracy': 0.8023622047244094, 'precision': 0.7540638745458023, 'recall': 0.5833263386817842, 'f1': 0.5661959747567226, 'kappa': 0.6285476571130263}
INFO:utils.ml:Metrics for classifier RandomForestClassifier3: {'accuracy': 0.8196850393700787, 'precision': 0.8757356117049994, 'recall': 0.6137131214873931, 'f1': 0.6147157924425688, 'kappa': 0.6619010639485138}
INFO:utils.ml:Metrics for class

cannot access local variable 'report' where it is not associated with a value
cannot access local variable 'report' where it is not associated with a value
cannot access local variable 'report' where it is not associated with a value
cannot access local variable 'report' where it is not associated with a value
cannot access local variable 'report' where it is not associated with a value
cannot access local variable 'report' where it is not associated with a value
cannot access local variable 'report' where it is not associated with a value
cannot access local variable 'report' where it is not associated with a value
cannot access local variable 'report' where it is not associated with a value
cannot access local variable 'report' where it is not associated with a value
cannot access local variable 'report' where it is not associated with a value
cannot access local variable 'report' where it is not associated with a value
cannot access local variable 'report' where it is not associated

INFO:utils.ml:Metrics for classifier XGBClassifier17: {'accuracy': 0.8338582677165355, 'precision': 0.8080557864765593, 'recall': 0.6383680709154322, 'f1': 0.6497274470760258, 'kappa': 0.6895290044814786}
INFO:utils.ml:Metrics for classifier XGBClassifier18: {'accuracy': 0.8362204724409449, 'precision': 0.8252287232585531, 'recall': 0.6403761030439463, 'f1': 0.6519723039797016, 'kappa': 0.6939276809135824}


cannot access local variable 'report' where it is not associated with a value
cannot access local variable 'report' where it is not associated with a value


Unnamed: 0,accuracy,precision,recall,f1,kappa
GT,1.0,1.0,1.0,1.0,1.0
RandomForestClassifier0,0.806299,0.867596,0.586673,0.569003,0.636157
RandomForestClassifier1,0.811811,0.871039,0.593347,0.579184,0.646381
RandomForestClassifier2,0.802362,0.754064,0.583326,0.566196,0.628548
RandomForestClassifier3,0.819685,0.875736,0.613713,0.614716,0.661901
RandomForestClassifier4,0.807874,0.702233,0.584246,0.563445,0.638737
RandomForestClassifier5,0.81811,0.833393,0.609142,0.607483,0.658591
RandomForestClassifier6,0.806299,0.701058,0.58273,0.56229,0.635316
RandomForestClassifier7,0.81811,0.833343,0.608965,0.607425,0.658448
XGBClassifier8,0.829134,0.781121,0.623023,0.625703,0.679901


___
# Balance the data

In [8]:
# Backup: keep untouched copies of the extracted training features and labels
# so the resampling experiments below can start again from the original data.
# NOTE(review): re-running this cell after a resampling cell would overwrite
# the backup with the resampled arrays.
pipeline.backup_feature_matrix = pipeline.feature_matrix.copy()
pipeline.backup_labels = pipeline.labels.copy()

In [9]:
# Sanity check: print each backup shape followed by the matching live shape,
# first for the feature matrices and then for the label vectors.
for backup_arr, live_arr in (
    (pipeline.backup_feature_matrix, pipeline.feature_matrix),
    (pipeline.backup_labels, pipeline.labels),
):
    print(backup_arr.shape)
    print(live_arr.shape)

(5082, 252)
(5082, 252)
(5082,)
(5082,)


In [10]:
from imblearn.over_sampling import SMOTE

# Balance the training set with SMOTE.
# Resample from the untouched backup rather than from the live matrix, so the
# cell is idempotent: re-running it (or running it after another resampling
# or projection experiment has replaced pipeline.feature_matrix) always
# starts from the original extracted features. This matches how the
# Borderline-SMOTE cell below is written.
smote = SMOTE(random_state=42)
pipeline.feature_matrix, pipeline.labels = smote.fit_resample(
    pipeline.backup_feature_matrix, pipeline.backup_labels)

In [11]:
# Drop the models fitted on the unbalanced data, then refit every classifier
# on the SMOTE-resampled training set.
pipeline.fitted_classifiers = {}
pipeline.fit_classifiers()

INFO:utils.ml:Fitting classifiers...
INFO:utils.ml:Fitting classifier: RandomForestClassifier0
INFO:utils.ml:Top 10 features for RandomForestClassifier0: [('color_moments_hsv_channel_0_mean', 0.025189814877910263), ('color_moments_lab_channel_0_skew', 0.023159984002411368), ('var_lab_channel_2', 0.012882468762287342), ('var_lab_channel_0', 0.012144807106830368), ('var_lab_channel_1', 0.01213369156439068), ('color_moments_rgb_channel_2_mean', 0.011847228762639366), ('color_moments_rgb_channel_2_std', 0.011449515278480935), ('color_moments_lab_channel_2_mean', 0.010657304827664532), ('var_rgb_channel_2', 0.010546591377447638), ('color_moments_rgb_channel_1_mean', 0.009791860019408294)]
INFO:utils.ml:Fitted classifier: RandomForestClassifier0; Done in 7.689535617828369 seconds
INFO:utils.ml:Fitting classifier: RandomForestClassifier1
INFO:utils.ml:Top 10 features for RandomForestClassifier1: [('color_moments_hsv_channel_0_mean', 0.02207587080755871), ('color_moments_lab_channel_0_skew', 0

list index out of range


INFO:utils.ml:Fitted classifier: XGBClassifier9; Done in 8.433682203292847 seconds
INFO:utils.ml:Fitting classifier: XGBClassifier10


list index out of range


INFO:utils.ml:Fitted classifier: XGBClassifier10; Done in 9.733409643173218 seconds
INFO:utils.ml:Fitting classifier: XGBClassifier11


list index out of range


INFO:utils.ml:Top 10 features for XGBClassifier11: [('color_moments_hsv_channel_0_mean', 0.02618271), ('var_lab_channel_0', 0.0140603), ('lbp_rad1_bins64_9', 0.012978315), ('lbp_rad1_bins64_16', 0.012964124), ('color_moments_lab_channel_1_std', 0.01222537), ('var_lab_channel_1', 0.011894069), ('lbp_rad2_bins64_8', 0.011763771), ('color_moments_lab_channel_1_skew', 0.011108612), ('gradient_magnitude_mean', 0.009903481), ('color_moments_lab_channel_0_skew', 0.009742757)]
INFO:utils.ml:Fitted classifier: XGBClassifier11; Done in 3.739762306213379 seconds
INFO:utils.ml:Fitting classifier: XGBClassifier12
INFO:utils.ml:Fitted classifier: XGBClassifier12; Done in 10.224793434143066 seconds
INFO:utils.ml:Fitting classifier: XGBClassifier13


list index out of range


INFO:utils.ml:Top 10 features for XGBClassifier13: [('color_moments_hsv_channel_0_mean', 0.022849651), ('lbp_rad2_bins64_8', 0.021082457), ('var_lab_channel_0', 0.015250812), ('var_lab_channel_1', 0.014459662), ('color_moments_lab_channel_1_std', 0.013492688), ('lbp_rad1_bins64_9', 0.012475105), ('lbp_rad3_bins64_8', 0.011289164), ('var_lab_channel_2', 0.010134945), ('gradient_magnitude_mean', 0.0096933665), ('color_moments_lab_channel_0_skew', 0.009192759)]
INFO:utils.ml:Fitted classifier: XGBClassifier13; Done in 6.862001657485962 seconds
INFO:utils.ml:Fitting classifier: XGBClassifier14
INFO:utils.ml:Fitted classifier: XGBClassifier14; Done in 7.784689664840698 seconds
INFO:utils.ml:Fitting classifier: XGBClassifier15


list index out of range


INFO:utils.ml:Top 10 features for XGBClassifier15: [('color_moments_hsv_channel_0_mean', 0.022531783), ('var_lab_channel_0', 0.018590655), ('var_lab_channel_1', 0.014160898), ('lbp_rad2_bins64_8', 0.014004299), ('lbp_rad1_bins64_9', 0.01244467), ('color_moments_lab_channel_1_std', 0.01232961), ('color_moments_lab_channel_1_skew', 0.0099974945), ('var_lab_channel_2', 0.009568464), ('gradient_magnitude_mean', 0.0095445365), ('lbp_rad3_bins64_8', 0.009075515)]
INFO:utils.ml:Fitted classifier: XGBClassifier15; Done in 30.05023455619812 seconds
INFO:utils.ml:Fitting classifier: XGBClassifier16
INFO:utils.ml:Top 10 features for XGBClassifier16: [('color_moments_hsv_channel_0_mean', 0.023295276), ('var_lab_channel_0', 0.018214742), ('var_lab_channel_1', 0.014081606), ('lbp_rad2_bins64_8', 0.013252051), ('color_moments_lab_channel_1_std', 0.01267974), ('lbp_rad1_bins64_9', 0.012036265), ('color_moments_lab_channel_1_skew', 0.009998684), ('gradient_magnitude_mean', 0.009484538), ('var_lab_chann

In [12]:
# Predict on the validation set again, now with the classifiers refitted on
# the SMOTE-resampled training data.
pipeline.predict_with_classifiers(VAL_PATH)

INFO:utils.ml:Predicting with classifiers on dataset: C:\Users\gimes\Src\repos\CADx-Project\dataset\multiclass\val


Processed 5/27 batches.
Processed 10/27 batches.
Processed 15/27 batches.
Processed 20/27 batches.
Processed 25/27 batches.


INFO:utils.ml:Predictions made with classifier: RandomForestClassifier0
INFO:utils.ml:Predictions made with classifier: RandomForestClassifier1
INFO:utils.ml:Predictions made with classifier: RandomForestClassifier2
INFO:utils.ml:Predictions made with classifier: RandomForestClassifier3
INFO:utils.ml:Predictions made with classifier: RandomForestClassifier4
INFO:utils.ml:Predictions made with classifier: RandomForestClassifier5
INFO:utils.ml:Predictions made with classifier: RandomForestClassifier6


Processed 27/27 batches.


INFO:utils.ml:Predictions made with classifier: RandomForestClassifier7
INFO:utils.ml:Predictions made with classifier: XGBClassifier8
INFO:utils.ml:Predictions made with classifier: XGBClassifier9
INFO:utils.ml:Predictions made with classifier: XGBClassifier10
INFO:utils.ml:Predictions made with classifier: XGBClassifier11
INFO:utils.ml:Predictions made with classifier: XGBClassifier12
INFO:utils.ml:Predictions made with classifier: XGBClassifier13
INFO:utils.ml:Predictions made with classifier: XGBClassifier14
INFO:utils.ml:Predictions made with classifier: XGBClassifier15
INFO:utils.ml:Predictions made with classifier: XGBClassifier16
INFO:utils.ml:Predictions made with classifier: XGBClassifier17
INFO:utils.ml:Predictions made with classifier: XGBClassifier18


{'GT': array([0, 0, 0, ..., 2, 2, 2]),
 'RandomForestClassifier0': array([0, 0, 2, ..., 1, 0, 0]),
 'RandomForestClassifier1': array([0, 0, 2, ..., 1, 0, 0]),
 'RandomForestClassifier2': array([0, 0, 2, ..., 1, 0, 0]),
 'RandomForestClassifier3': array([0, 0, 2, ..., 1, 0, 0]),
 'RandomForestClassifier4': array([0, 0, 2, ..., 1, 0, 0]),
 'RandomForestClassifier5': array([0, 0, 2, ..., 1, 0, 0]),
 'RandomForestClassifier6': array([0, 0, 2, ..., 1, 0, 0]),
 'RandomForestClassifier7': array([0, 0, 2, ..., 1, 0, 0]),
 'XGBClassifier8': array([0, 0, 2, ..., 1, 0, 0], dtype=int64),
 'XGBClassifier9': array([0, 0, 2, ..., 1, 0, 0], dtype=int64),
 'XGBClassifier10': array([0, 0, 2, ..., 1, 0, 0], dtype=int64),
 'XGBClassifier11': array([0, 0, 2, ..., 1, 0, 0], dtype=int64),
 'XGBClassifier12': array([0, 0, 2, ..., 1, 0, 0], dtype=int64),
 'XGBClassifier13': array([0, 0, 2, ..., 1, 0, 0], dtype=int64),
 'XGBClassifier14': array([0, 0, 2, ..., 1, 0, 0], dtype=int64),
 'XGBClassifier15': array([0

In [14]:
import pandas as pd

# Metrics after plain SMOTE, transposed so that each row is a classifier and
# each column is a metric.
df_smote = pd.DataFrame(data=pipeline.calculate_metrics()).T
df_smote

INFO:utils.ml:Metrics for classifier GT: {'accuracy': 1.0, 'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'kappa': 1.0}
INFO:utils.ml:Metrics for classifier RandomForestClassifier0: {'accuracy': 0.7937007874015748, 'precision': 0.6843907861707198, 'recall': 0.6891561727988966, 'f1': 0.685134720190281, 'kappa': 0.6336672905427723}
INFO:utils.ml:Metrics for classifier RandomForestClassifier1: {'accuracy': 0.8039370078740158, 'precision': 0.6806631670640014, 'recall': 0.6793532001602091, 'f1': 0.6776645947539149, 'kappa': 0.6511727953829455}
INFO:utils.ml:Metrics for classifier RandomForestClassifier2: {'accuracy': 0.7992125984251969, 'precision': 0.6702815305686834, 'recall': 0.670294433240399, 'f1': 0.6677909293014243, 'kappa': 0.6432096915639501}
INFO:utils.ml:Metrics for classifier RandomForestClassifier3: {'accuracy': 0.8070866141732284, 'precision': 0.6951363740676993, 'recall': 0.6667582903004726, 'f1': 0.6744524078509272, 'kappa': 0.6518780571853084}
INFO:utils.ml:Metrics for classif

cannot access local variable 'report' where it is not associated with a value
cannot access local variable 'report' where it is not associated with a value
cannot access local variable 'report' where it is not associated with a value
cannot access local variable 'report' where it is not associated with a value
cannot access local variable 'report' where it is not associated with a value
cannot access local variable 'report' where it is not associated with a value
cannot access local variable 'report' where it is not associated with a value
cannot access local variable 'report' where it is not associated with a value
cannot access local variable 'report' where it is not associated with a value
cannot access local variable 'report' where it is not associated with a value
cannot access local variable 'report' where it is not associated with a value
cannot access local variable 'report' where it is not associated with a value
cannot access local variable 'report' where it is not associated

Unnamed: 0,accuracy,precision,recall,f1,kappa
GT,1.0,1.0,1.0,1.0,1.0
RandomForestClassifier0,0.793701,0.684391,0.689156,0.685135,0.633667
RandomForestClassifier1,0.803937,0.680663,0.679353,0.677665,0.651173
RandomForestClassifier2,0.799213,0.670282,0.670294,0.667791,0.64321
RandomForestClassifier3,0.807087,0.695136,0.666758,0.674452,0.651878
RandomForestClassifier4,0.799213,0.670765,0.669939,0.668084,0.642696
RandomForestClassifier5,0.806299,0.680724,0.657281,0.662868,0.650598
RandomForestClassifier6,0.800787,0.670386,0.671278,0.668516,0.646025
RandomForestClassifier7,0.808661,0.686752,0.661633,0.668115,0.654581
XGBClassifier8,0.819685,0.693797,0.684355,0.68803,0.675015


In [15]:
# Confusion matrix for XGBClassifier17: ground-truth labels are passed first
# (y_true) and the model's predictions second (y_pred).
confusion_matrix(pipeline.predictions["GT"],pipeline.predictions["XGBClassifier17"])

array([[433,  43,  22],
       [ 67, 593,  18],
       [ 45,  21,  28]], dtype=int64)

___
# Borderline SMOTE

In [16]:
from imblearn.over_sampling import BorderlineSMOTE

# Rebalance with Borderline-SMOTE, always starting from the backed-up
# (original) features and labels.
smote = BorderlineSMOTE(sampling_strategy='auto', random_state=42)
resampled_features, resampled_labels = smote.fit_resample(
    pipeline.backup_feature_matrix,
    pipeline.backup_labels,
)
pipeline.feature_matrix = resampled_features
pipeline.labels = resampled_labels

# Original vs. resampled shapes: feature matrices first, then label vectors.
for original_arr, resampled_arr in (
    (pipeline.backup_feature_matrix, pipeline.feature_matrix),
    (pipeline.backup_labels, pipeline.labels),
):
    print(original_arr.shape)
    print(resampled_arr.shape)

(5082, 252)
(8139, 252)
(5082,)
(8139,)


In [17]:
# Drop the previously fitted models so nothing from the plain-SMOTE run is reused.
pipeline.fitted_classifiers = {}

# Refit every classifier on the Borderline-SMOTE-resampled training set.
pipeline.fit_classifiers()

INFO:utils.ml:Fitting classifiers...
INFO:utils.ml:Fitting classifier: RandomForestClassifier0
INFO:utils.ml:Top 10 features for RandomForestClassifier0: [('color_moments_lab_channel_0_skew', 0.024642895577035882), ('color_moments_hsv_channel_0_mean', 0.02015596566185286), ('var_lab_channel_1', 0.016598769678566428), ('var_lab_channel_2', 0.013157546850449994), ('var_lab_channel_0', 0.012139962985897187), ('color_moments_rgb_channel_2_mean', 0.011951745213025326), ('color_moments_hsv_channel_1_mean', 0.011314777077500553), ('color_moments_hsv_channel_2_std', 0.01090544948967954), ('color_moments_rgb_channel_2_std', 0.010680693958617331), ('color_moments_lab_channel_2_mean', 0.01065121760686456)]
INFO:utils.ml:Fitted classifier: RandomForestClassifier0; Done in 8.557301044464111 seconds
INFO:utils.ml:Fitting classifier: RandomForestClassifier1
INFO:utils.ml:Top 10 features for RandomForestClassifier1: [('color_moments_lab_channel_0_skew', 0.0246908235587676), ('color_moments_hsv_channel

list index out of range


INFO:utils.ml:Top 10 features for XGBClassifier12: [('color_moments_lab_channel_1_std', 0.014347131), ('var_lab_channel_0', 0.013992742), ('lbp_rad1_bins64_9', 0.013876258), ('color_moments_hsv_channel_0_mean', 0.013406085), ('var_lab_channel_1', 0.013069536), ('color_moments_lab_channel_0_skew', 0.012982804), ('color_moments_hsv_channel_2_mean', 0.010927656), ('lbp_rad3_bins64_8', 0.010203553), ('color_moments_lab_channel_1_skew', 0.010150538), ('var_lab_channel_2', 0.009440798)]
INFO:utils.ml:Fitted classifier: XGBClassifier12; Done in 10.212116479873657 seconds
INFO:utils.ml:Fitting classifier: XGBClassifier13
INFO:utils.ml:Top 10 features for XGBClassifier13: [('color_moments_hsv_channel_0_mean', 0.019046733), ('var_lab_channel_0', 0.016562335), ('color_moments_lab_channel_1_std', 0.013868792), ('var_lab_channel_1', 0.013152718), ('lbp_rad1_bins64_9', 0.012547502), ('color_moments_hsv_channel_2_mean', 0.0120021105), ('lbp_rad3_bins64_47', 0.0109874755), ('lbp_rad2_bins64_8', 0.0105

In [18]:
# Predict on the validation set with the classifiers refitted on the
# Borderline-SMOTE-resampled training data.
pipeline.predict_with_classifiers(VAL_PATH)

INFO:utils.ml:Predicting with classifiers on dataset: C:\Users\gimes\Src\repos\CADx-Project\dataset\multiclass\val


Processed 5/27 batches.
Processed 10/27 batches.
Processed 15/27 batches.
Processed 20/27 batches.
Processed 25/27 batches.


INFO:utils.ml:Predictions made with classifier: RandomForestClassifier0
INFO:utils.ml:Predictions made with classifier: RandomForestClassifier1
INFO:utils.ml:Predictions made with classifier: RandomForestClassifier2
INFO:utils.ml:Predictions made with classifier: RandomForestClassifier3
INFO:utils.ml:Predictions made with classifier: RandomForestClassifier4
INFO:utils.ml:Predictions made with classifier: RandomForestClassifier5
INFO:utils.ml:Predictions made with classifier: RandomForestClassifier6


Processed 27/27 batches.


INFO:utils.ml:Predictions made with classifier: RandomForestClassifier7
INFO:utils.ml:Predictions made with classifier: XGBClassifier8
INFO:utils.ml:Predictions made with classifier: XGBClassifier9
INFO:utils.ml:Predictions made with classifier: XGBClassifier10
INFO:utils.ml:Predictions made with classifier: XGBClassifier11
INFO:utils.ml:Predictions made with classifier: XGBClassifier12
INFO:utils.ml:Predictions made with classifier: XGBClassifier13
INFO:utils.ml:Predictions made with classifier: XGBClassifier14
INFO:utils.ml:Predictions made with classifier: XGBClassifier15
INFO:utils.ml:Predictions made with classifier: XGBClassifier16
INFO:utils.ml:Predictions made with classifier: XGBClassifier17
INFO:utils.ml:Predictions made with classifier: XGBClassifier18


{'GT': array([0, 0, 0, ..., 2, 2, 2]),
 'RandomForestClassifier0': array([0, 0, 2, ..., 1, 0, 0]),
 'RandomForestClassifier1': array([0, 0, 2, ..., 1, 0, 0]),
 'RandomForestClassifier2': array([0, 0, 2, ..., 1, 0, 0]),
 'RandomForestClassifier3': array([0, 0, 2, ..., 1, 0, 0]),
 'RandomForestClassifier4': array([0, 0, 2, ..., 1, 0, 0]),
 'RandomForestClassifier5': array([0, 0, 2, ..., 1, 0, 0]),
 'RandomForestClassifier6': array([0, 0, 2, ..., 1, 0, 0]),
 'RandomForestClassifier7': array([0, 0, 2, ..., 1, 0, 0]),
 'XGBClassifier8': array([0, 0, 2, ..., 1, 0, 0], dtype=int64),
 'XGBClassifier9': array([0, 0, 2, ..., 1, 0, 0], dtype=int64),
 'XGBClassifier10': array([0, 0, 2, ..., 1, 0, 0], dtype=int64),
 'XGBClassifier11': array([0, 0, 2, ..., 1, 0, 0], dtype=int64),
 'XGBClassifier12': array([0, 0, 2, ..., 1, 0, 0], dtype=int64),
 'XGBClassifier13': array([0, 0, 2, ..., 1, 0, 0], dtype=int64),
 'XGBClassifier14': array([0, 0, 2, ..., 1, 0, 0], dtype=int64),
 'XGBClassifier15': array([0

In [19]:
# Metrics after Borderline-SMOTE, with the metric list spelled out explicitly;
# transposed so that each row is a classifier and each column is a metric.
metric_names = ["accuracy", "precision", "recall", "f1", "kappa"]
df_borderline_smote = pd.DataFrame(data=pipeline.calculate_metrics(metric_names)).T
df_borderline_smote

INFO:utils.ml:Metrics for classifier GT: {'accuracy': 1.0, 'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'kappa': 1.0}
INFO:utils.ml:Metrics for classifier RandomForestClassifier0: {'accuracy': 0.7811023622047244, 'precision': 0.6442775734476833, 'recall': 0.636716955060561, 'f1': 0.6366486334638249, 'kappa': 0.6090957807233932}
INFO:utils.ml:Metrics for classifier RandomForestClassifier1: {'accuracy': 0.7913385826771654, 'precision': 0.6562174664608534, 'recall': 0.6429306002015972, 'f1': 0.6456383420908606, 'kappa': 0.624948737607874}
INFO:utils.ml:Metrics for classifier RandomForestClassifier2: {'accuracy': 0.7952755905511811, 'precision': 0.661457077855144, 'recall': 0.649331777622005, 'f1': 0.651318934421736, 'kappa': 0.6330181423738458}
INFO:utils.ml:Metrics for classifier RandomForestClassifier3: {'accuracy': 0.8047244094488188, 'precision': 0.6875052445748512, 'recall': 0.64947797214087, 'f1': 0.6583800263678313, 'kappa': 0.6453696280749796}
INFO:utils.ml:Metrics for classifier R

cannot access local variable 'report' where it is not associated with a value
cannot access local variable 'report' where it is not associated with a value
cannot access local variable 'report' where it is not associated with a value
cannot access local variable 'report' where it is not associated with a value
cannot access local variable 'report' where it is not associated with a value
cannot access local variable 'report' where it is not associated with a value
cannot access local variable 'report' where it is not associated with a value
cannot access local variable 'report' where it is not associated with a value
cannot access local variable 'report' where it is not associated with a value
cannot access local variable 'report' where it is not associated with a value
cannot access local variable 'report' where it is not associated with a value
cannot access local variable 'report' where it is not associated with a value
cannot access local variable 'report' where it is not associated

Unnamed: 0,accuracy,precision,recall,f1,kappa
GT,1.0,1.0,1.0,1.0,1.0
RandomForestClassifier0,0.781102,0.644278,0.636717,0.636649,0.609096
RandomForestClassifier1,0.791339,0.656217,0.642931,0.645638,0.624949
RandomForestClassifier2,0.795276,0.661457,0.649332,0.651319,0.633018
RandomForestClassifier3,0.804724,0.687505,0.649478,0.65838,0.64537
RandomForestClassifier4,0.797638,0.668055,0.653861,0.656815,0.636799
RandomForestClassifier5,0.810236,0.69271,0.651109,0.659919,0.655118
RandomForestClassifier6,0.800787,0.673464,0.659415,0.662161,0.64297
RandomForestClassifier7,0.811811,0.695209,0.655324,0.664082,0.658568
XGBClassifier8,0.827559,0.707258,0.682062,0.690369,0.687546


In [20]:
# Confusion matrix for XGBClassifier13: ground-truth labels are passed first
# (y_true) and the model's predictions second (y_pred).
# NOTE(review): the plain-SMOTE section inspects XGBClassifier17 instead --
# confirm which model is meant to be compared across the two sections.
confusion_matrix(pipeline.predictions["GT"], pipeline.predictions["XGBClassifier13"])

array([[423,  49,  26],
       [ 75, 585,  18],
       [ 47,  21,  26]], dtype=int64)

___
# PCA

In [37]:
from sklearn.decomposition import PCA
from imblearn.over_sampling import BorderlineSMOTE

# Step 1: rebalance from the backed-up (original) features and labels.
smote = BorderlineSMOTE(sampling_strategy='auto', random_state=42)
pipeline.feature_matrix, pipeline.labels = smote.fit_resample(
    pipeline.backup_feature_matrix, pipeline.backup_labels)

# Step 2: project the resampled features onto 100 principal components.
# random_state is fixed so the projection is reproducible even when
# scikit-learn selects a randomized SVD solver. PCA is unsupervised, so the
# labels are no longer passed to fit_transform.
pca = PCA(n_components=100, random_state=42)
pipeline.feature_matrix = pca.fit_transform(pipeline.feature_matrix)

# Shapes before (backup) and after (resampled + projected).
print(pipeline.backup_feature_matrix.shape)
print(pipeline.feature_matrix.shape)

print(pipeline.backup_labels.shape)
print(pipeline.labels.shape)

print(pca.get_feature_names_out())

# NOTE(review): the fit log for this cell still lists the original feature
# names (e.g. 'color_moments_rgb_channel_0_std') as the "top 10 features",
# although the columns are now PCA components -- the pipeline's stored feature
# names presumably need to be replaced with pca.get_feature_names_out().
# NOTE(review): validation features must go through pca.transform() before
# these classifiers can predict on them; confirm that
# predict_with_classifiers() does so, otherwise the column counts will not
# match (100 vs. the 252 extracted features).
pipeline.fitted_classifiers = {}
pipeline.fit_classifiers()

INFO:utils.ml:Fitting classifiers...
INFO:utils.ml:Fitting classifier: RandomForestClassifier0


(5082, 252)
(8139, 100)
(5082,)
(8139,)
['pca0' 'pca1' 'pca2' 'pca3' 'pca4' 'pca5' 'pca6' 'pca7' 'pca8' 'pca9'
 'pca10' 'pca11' 'pca12' 'pca13' 'pca14' 'pca15' 'pca16' 'pca17' 'pca18'
 'pca19' 'pca20' 'pca21' 'pca22' 'pca23' 'pca24' 'pca25' 'pca26' 'pca27'
 'pca28' 'pca29' 'pca30' 'pca31' 'pca32' 'pca33' 'pca34' 'pca35' 'pca36'
 'pca37' 'pca38' 'pca39' 'pca40' 'pca41' 'pca42' 'pca43' 'pca44' 'pca45'
 'pca46' 'pca47' 'pca48' 'pca49' 'pca50' 'pca51' 'pca52' 'pca53' 'pca54'
 'pca55' 'pca56' 'pca57' 'pca58' 'pca59' 'pca60' 'pca61' 'pca62' 'pca63'
 'pca64' 'pca65' 'pca66' 'pca67' 'pca68' 'pca69' 'pca70' 'pca71' 'pca72'
 'pca73' 'pca74' 'pca75' 'pca76' 'pca77' 'pca78' 'pca79' 'pca80' 'pca81'
 'pca82' 'pca83' 'pca84' 'pca85' 'pca86' 'pca87' 'pca88' 'pca89' 'pca90'
 'pca91' 'pca92' 'pca93' 'pca94' 'pca95' 'pca96' 'pca97' 'pca98' 'pca99']


INFO:utils.ml:Top 10 features for RandomForestClassifier0: [('color_moments_rgb_channel_0_std', 0.0422789641803917), ('gradient_direction_mean', 0.038086715790584605), ('color_moments_lab_channel_0_std', 0.025170651229483226), ('gradient_magnitude_mean', 0.023844981344221607), ('var_rgb_channel_1', 0.023303285337066496), ('color_moments_lab_channel_1_std', 0.0188195126494748), ('color_moments_lab_channel_1_mean', 0.017616407381439898), ('var_lab_channel_1', 0.01572854055658206), ('var_rgb_channel_0', 0.015187758461230877), ('gradient_magnitude_std', 0.014912843252187858)]
INFO:utils.ml:Fitted classifier: RandomForestClassifier0; Done in 6.822502374649048 seconds
INFO:utils.ml:Fitting classifier: RandomForestClassifier1
INFO:utils.ml:Top 10 features for RandomForestClassifier1: [('color_moments_rgb_channel_0_std', 0.04171744740903234), ('gradient_direction_mean', 0.03505524852566532), ('color_moments_lab_channel_0_std', 0.026296317499708302), ('var_rgb_channel_1', 0.024728841601333807),

In [38]:
# Load and extract features from the new dataset
# Build a loader over the validation set that reuses the pipeline's own
# preprocessing factory and batch size, then extract features with the
# pipeline's feature strategy.
new_loader = FactoryLoader(path=VAL_PATH, factory=pipeline.loader.get_factory(),
                           percentage=percent, batch_size=pipeline.batch_size)
new_feature_matrix, new_labels = pipeline.feature_strategy.run(new_loader.get_loader())

new_feature_matrix = np.nan_to_num(new_feature_matrix)  # Impute NaNs (replaced with 0.0)
new_feature_matrix = pca.transform(new_feature_matrix)  # Project onto the already-fitted PCA components

# Store predictions on the pipeline, keyed by classifier name; "GT" holds the
# ground-truth labels so the metrics step can compare every entry against it.
pipeline.predictions = {"GT": new_labels}
for clf_name, clf in pipeline.fitted_classifiers.items():

    pipeline.predictions[clf_name] = clf.predict(new_feature_matrix)
    if pipeline.verbose:
        logger.info("Predictions made with classifier: %s", clf_name)


Processed 5/27 batches.
Processed 10/27 batches.
Processed 15/27 batches.
Processed 20/27 batches.
Processed 25/27 batches.


INFO:utils.utils:Predictions made with classifier: RandomForestClassifier0
INFO:utils.utils:Predictions made with classifier: RandomForestClassifier1
INFO:utils.utils:Predictions made with classifier: RandomForestClassifier2
INFO:utils.utils:Predictions made with classifier: RandomForestClassifier3
INFO:utils.utils:Predictions made with classifier: RandomForestClassifier4
INFO:utils.utils:Predictions made with classifier: RandomForestClassifier5
INFO:utils.utils:Predictions made with classifier: RandomForestClassifier6


Processed 27/27 batches.


INFO:utils.utils:Predictions made with classifier: RandomForestClassifier7
INFO:utils.utils:Predictions made with classifier: XGBClassifier8
INFO:utils.utils:Predictions made with classifier: XGBClassifier9
INFO:utils.utils:Predictions made with classifier: XGBClassifier10
INFO:utils.utils:Predictions made with classifier: XGBClassifier11
INFO:utils.utils:Predictions made with classifier: XGBClassifier12
INFO:utils.utils:Predictions made with classifier: XGBClassifier13
INFO:utils.utils:Predictions made with classifier: XGBClassifier14
INFO:utils.utils:Predictions made with classifier: XGBClassifier15
INFO:utils.utils:Predictions made with classifier: XGBClassifier16
INFO:utils.utils:Predictions made with classifier: XGBClassifier17
INFO:utils.utils:Predictions made with classifier: XGBClassifier18


In [40]:
# Macro-averaged metrics for every entry in pipeline.predictions, transposed so
# that each row is one classifier and each column one metric.
# NOTE(review): the recorded run printed "cannot access local variable 'report'
# where it is not associated with a value" repeatedly during this call --
# calculate_metrics appears to swallow an internal error; worth checking.
df_pca = pd.DataFrame(pipeline.calculate_metrics(avg="macro")).T
df_pca

INFO:utils.ml:Metrics for classifier GT: {'accuracy': 1.0, 'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'kappa': 1.0}
INFO:utils.ml:Metrics for classifier RandomForestClassifier0: {'accuracy': 0.8086614173228347, 'precision': 0.694230819219157, 'recall': 0.6584340248082016, 'f1': 0.6695111303283432, 'kappa': 0.6510499821346998}
INFO:utils.ml:Metrics for classifier RandomForestClassifier1: {'accuracy': 0.7921259842519685, 'precision': 0.6680148655105943, 'recall': 0.6389461694137374, 'f1': 0.646736818388215, 'kappa': 0.6224307316699624}
INFO:utils.ml:Metrics for classifier RandomForestClassifier2: {'accuracy': 0.8118110236220473, 'precision': 0.6987406877620277, 'recall': 0.6463722208232919, 'f1': 0.6583145656619592, 'kappa': 0.6548143801374693}
INFO:utils.ml:Metrics for classifier RandomForestClassifier3: {'accuracy': 0.815748031496063, 'precision': 0.7016145326648208, 'recall': 0.6495412390792065, 'f1': 0.6612958116289487, 'kappa': 0.6621050046389784}
INFO:utils.ml:Metrics for classifi

cannot access local variable 'report' where it is not associated with a value
cannot access local variable 'report' where it is not associated with a value
cannot access local variable 'report' where it is not associated with a value
cannot access local variable 'report' where it is not associated with a value
cannot access local variable 'report' where it is not associated with a value
cannot access local variable 'report' where it is not associated with a value
cannot access local variable 'report' where it is not associated with a value
cannot access local variable 'report' where it is not associated with a value
cannot access local variable 'report' where it is not associated with a value
cannot access local variable 'report' where it is not associated with a value
cannot access local variable 'report' where it is not associated with a value
cannot access local variable 'report' where it is not associated with a value
cannot access local variable 'report' where it is not associated

Unnamed: 0,accuracy,precision,recall,f1,kappa
GT,1.0,1.0,1.0,1.0,1.0
RandomForestClassifier0,0.808661,0.694231,0.658434,0.669511,0.65105
RandomForestClassifier1,0.792126,0.668015,0.638946,0.646737,0.622431
RandomForestClassifier2,0.811811,0.698741,0.646372,0.658315,0.654814
RandomForestClassifier3,0.815748,0.701615,0.649541,0.661296,0.662105
RandomForestClassifier4,0.801575,0.680903,0.642858,0.652181,0.638386
RandomForestClassifier5,0.819685,0.707278,0.652355,0.664625,0.66897
RandomForestClassifier6,0.800787,0.678389,0.642544,0.651281,0.637338
RandomForestClassifier7,0.819685,0.709018,0.658641,0.671028,0.67005
XGBClassifier8,0.807087,0.67531,0.657984,0.663641,0.650856


In [58]:
# Pick the classifier with the highest Cohen's kappa. The "GT" row (ground truth
# scored against itself, kappa == 1.0) is removed by label before searching, so
# it can never be selected -- not even when a classifier ties it at 1.0 -- and
# the lookup no longer depends on row position or on float equality.
idx = df_pca.drop(index="GT")["kappa"].idxmax()
highest = df_pca.loc[[idx]]  # one-row frame kept under the original name

confusion_matrix(pipeline.predictions["GT"], pipeline.predictions[idx])

array([[431,  49,  18],
       [ 83, 587,   8],
       [ 47,  24,  23]], dtype=int64)