In [63]:
import numpy as np
from sklearn.metrics import confusion_matrix, recall_score, cohen_kappa_score

from utils.loader import FactoryLoader
from utils.ml import MLPipeline
from utils.preprocessing import PreprocessingFactory
from utils.feature_extraction import *
from utils.utils import *

import os

# Dataset root. Override it with the CADX_DATASET_DIR environment variable so the
# notebook can run on machines other than the original author's; the fallback is
# the original hard-coded location, so existing runs are unaffected.
DATASET_DIR = os.environ.get(
    "CADX_DATASET_DIR",
    r"C:\Users\gimes\Src\repos\CADx-Project\dataset\multiclass",
)
VAL_PATH = os.path.join(DATASET_DIR, "val")      # split used for prediction/metrics
TRAIN_PATH = os.path.join(DATASET_DIR, "train")  # split used for feature extraction/fitting

In [64]:
# --- Run configuration ---
percent = 100     # forwarded as `percentage` to the loader and the pipeline
random = False    # forwarded as `shuffle`
batch_size = 24   # forwarded as `batch_size`
th = 0.05         # `threshold` for the gradient and color-moment extractors

# --- Preprocessing steps, added to the factory in this order ---
factory = PreprocessingFactory()
factory.gaussian_smoothing(5)
factory.clahe(clip_limit=1.5)
factory.pad2square(fill=np.nan)
factory.resize((150,150))
factory.hair_removal()
factory.normalize2float()

# Loader over the validation split, sharing the same preprocessing factory
factory_loader = FactoryLoader(
    path=VAL_PATH,
    batch_size=batch_size,
    factory=factory,
    percentage=percent,
    shuffle=random,
)

# --- Feature extraction strategy; extractors are registered in this order ---
strategy = FeatureExtractionStrategy()

# Gradient features
strategy.add_extractor(GradientExtractor(threshold=th))

# Color moments, one extractor per color space
for color_space in ("rgb", "lab", "hsv"):
    strategy.add_extractor(ColorMomentsExtractor(color_space, threshold=th))

# LBP at radii 1..4 with 8 sampling points per unit of radius (8, 16, 24, 32)
for lbp_radius in (1, 2, 3, 4):
    strategy.add_extractor(LBPExtractor(radius=lbp_radius, n_points=8 * lbp_radius))

# Fourier-transform features
strategy.add_extractor(FourierTransformExtractor())

# GLCM texture properties
strategy.add_extractor(
    GLCMExtractor(
        properties=[
            'contrast',
            'dissimilarity',
            'homogeneity',
            'energy',
            'correlation',
        ]
    )
)

# --- Training pipeline; classifiers are attached in a later cell ---
pipeline = MLPipeline(
    dataset_path=TRAIN_PATH,
    preprocessing_factory=factory,
    feature_strategy=strategy,
    classifiers=[],
    percentage=percent,
    verbose=True,
    shuffle=random,
    batch_size=batch_size,
)

INFO:utils.ml:MLPipeline initialized with dataset path: C:\Users\gimes\Src\repos\CADx-Project\dataset\multiclass\train
INFO:utils.ml:Preprocessing steps


In [65]:
# Clear any stored feature matrix, then extract features from the training set.
# NOTE(review): presumably the reset forces a fresh extraction instead of reusing
# or appending to a previous run's matrix — confirm against utils.ml.
pipeline.feature_matrix = None
pipeline.run_feature_extraction()

INFO:utils.ml:Running feature extraction...


Processed 5/212 batches.
Processed 10/212 batches.
Processed 15/212 batches.
Processed 20/212 batches.
Processed 25/212 batches.
Processed 30/212 batches.
Processed 35/212 batches.
Processed 40/212 batches.
Processed 45/212 batches.
Processed 50/212 batches.
Processed 55/212 batches.
Processed 60/212 batches.
Processed 65/212 batches.
Processed 70/212 batches.
Processed 75/212 batches.
Processed 80/212 batches.
Processed 85/212 batches.
Processed 90/212 batches.
Processed 95/212 batches.
Processed 100/212 batches.
Processed 105/212 batches.
Processed 110/212 batches.
Processed 115/212 batches.
Processed 120/212 batches.
Processed 125/212 batches.
Processed 130/212 batches.
Processed 135/212 batches.
Processed 140/212 batches.
Processed 145/212 batches.
Processed 150/212 batches.
Processed 155/212 batches.
Processed 160/212 batches.
Processed 165/212 batches.
Processed 170/212 batches.
Processed 175/212 batches.
Processed 180/212 batches.
Processed 185/212 batches.
Processed 190/212 bat

INFO:utils.ml:Feature extraction completed. Extracted 5082 features.


Processed 212/212 batches.


In [66]:
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier

# --- Random forests ---
# Every forest is seeded so the metrics table is reproducible across kernel
# restarts (rf1 and rf2 were previously the only unseeded models).
# NOTE(review): oob_score=True only adds an out-of-bag estimate after fitting, so
# with the same seed rf1/rf3 and rf2/rf5 are expected to give identical
# predictions — confirm, and consider dropping the duplicates.
rf1 = RandomForestClassifier(n_estimators=100, random_state=42)
rf2 = RandomForestClassifier(n_estimators=150, random_state=42)
# Random Forest with Out-of-Bag Error (OOB)
rf3 = RandomForestClassifier(n_estimators=100, oob_score=True, random_state=42)
# Random Forest with Bootstrap Disabled
rf4 = RandomForestClassifier(n_estimators=100, bootstrap=False, random_state=42)
# Random Forest with Out-of-Bag Error (OOB)
rf5 = RandomForestClassifier(n_estimators=150, oob_score=True, random_state=42)
# Random Forest with Bootstrap Disabled
rf6 = RandomForestClassifier(n_estimators=150, bootstrap=False, random_state=42)
# Random Forest with Out-of-Bag Error (OOB)
rf7 = RandomForestClassifier(n_estimators=200, oob_score=True, random_state=42)
# Random Forest with Bootstrap Disabled
rf8 = RandomForestClassifier(n_estimators=200, bootstrap=False, random_state=42)


# --- Gradient-boosted trees ---
# xgb1-3 vary only the number of trees; xgb4-11 vary learning rate, depth,
# child weight, sub-sampling and regularisation.
xgb1 = XGBClassifier(n_estimators=350)
xgb2 = XGBClassifier(n_estimators=450)
xgb3 = XGBClassifier(n_estimators=550)
xgb4 = XGBClassifier(learning_rate=0.05, n_estimators=400, max_depth=3, min_child_weight=4, subsample=0.8, colsample_bytree=0.8)
xgb5 = XGBClassifier(learning_rate=0.1, n_estimators=400, max_depth=7, min_child_weight=3, subsample=0.7, colsample_bytree=0.7) # 2ND (author's marker)
xgb6 = XGBClassifier(learning_rate=0.1, n_estimators=400, max_depth=5, min_child_weight=5, gamma=0.2, subsample=0.8, colsample_bytree=0.8) # THIS (author's marker)
xgb7 = XGBClassifier(learning_rate=0.1, n_estimators=400, max_depth=5, min_child_weight=1, subsample=0.8, colsample_bytree=0.8, reg_alpha=0.1, reg_lambda=0.1)
xgb8 = XGBClassifier(learning_rate=0.05, n_estimators=500, max_depth=7, min_child_weight=1, subsample=0.8, colsample_bytree=0.8)
xgb9 = XGBClassifier(learning_rate=0.05, n_estimators=400, max_depth=7, min_child_weight=1, subsample=0.8, colsample_bytree=0.8)
xgb10 = XGBClassifier(learning_rate=0.1, n_estimators=400, max_depth=7, min_child_weight=1, subsample=0.8, colsample_bytree=0.8)
xgb11 = XGBClassifier(learning_rate=0.1, n_estimators=500, max_depth=7, min_child_weight=1, subsample=0.8, colsample_bytree=0.8)


# Register the models on the pipeline. List order matters: the logs name each
# model by class name plus its position here (RandomForestClassifier0 ... XGBClassifier18).
pipeline.classifiers = [rf1, rf2, rf3, rf4, rf5, rf6, rf7, rf8,
                        xgb1, xgb2, xgb3, xgb4, xgb5, xgb6, xgb7, xgb8, xgb9, xgb10, xgb11]
# Discard models fitted in any earlier run of this notebook.
pipeline.fitted_classifiers = {}

In [67]:
# Fit every classifier registered on the pipeline; per the log output, each fit
# reports the model's top-10 feature importances and its elapsed time.
pipeline.fit_classifiers()

INFO:utils.ml:Fitting classifiers...
INFO:utils.ml:Fitting classifier: RandomForestClassifier0
INFO:utils.ml:Top 10 features for RandomForestClassifier0: [('color_moments_lab_B_std', 0.035836623648281996), ('color_moments_lab_B_var', 0.028190839184201014), ('color_moments_rgb_R_iqr', 0.02680915726699322), ('color_moments_rgb_B_std', 0.023399890092729844), ('color_moments_rgb_B_var', 0.022873448718573615), ('color_moments_rgb_R_kurtosis', 0.015037153303222724), ('color_moments_rgb_B_iqr', 0.014629919960877211), ('color_moments_rgb_R_skew', 0.013441741071393424), ('color_moments_lab_L_iqr', 0.013417129191515751), ('color_moments_rgb_B_kurtosis', 0.01340818926157895)]
INFO:utils.ml:Fitted classifier: RandomForestClassifier0; Done in 5.232849836349487 seconds
INFO:utils.ml:Fitting classifier: RandomForestClassifier1
INFO:utils.ml:Top 10 features for RandomForestClassifier1: [('color_moments_lab_B_std', 0.03476400943059128), ('color_moments_rgb_B_std', 0.031623347028903674), ('color_moments

In [68]:
# Predict on the validation set with every fitted classifier. The displayed
# result is a dict holding the ground truth under "GT" plus one prediction
# array per classifier name.
pipeline.predict_with_classifiers(VAL_PATH, percent)

INFO:utils.ml:Predicting with classifiers on dataset: C:\Users\gimes\Src\repos\CADx-Project\dataset\multiclass\val


Processed 5/53 batches.
Processed 10/53 batches.
Processed 15/53 batches.
Processed 20/53 batches.
Processed 25/53 batches.
Processed 30/53 batches.
Processed 35/53 batches.
Processed 40/53 batches.
Processed 45/53 batches.
Processed 50/53 batches.


INFO:utils.ml:Predictions made with classifier: RandomForestClassifier0
INFO:utils.ml:Predictions made with classifier: RandomForestClassifier1
INFO:utils.ml:Predictions made with classifier: RandomForestClassifier2
INFO:utils.ml:Predictions made with classifier: RandomForestClassifier3
INFO:utils.ml:Predictions made with classifier: RandomForestClassifier4
INFO:utils.ml:Predictions made with classifier: RandomForestClassifier5
INFO:utils.ml:Predictions made with classifier: RandomForestClassifier6


Processed 53/53 batches.


INFO:utils.ml:Predictions made with classifier: RandomForestClassifier7
INFO:utils.ml:Predictions made with classifier: XGBClassifier8
INFO:utils.ml:Predictions made with classifier: XGBClassifier9
INFO:utils.ml:Predictions made with classifier: XGBClassifier10
INFO:utils.ml:Predictions made with classifier: XGBClassifier11
INFO:utils.ml:Predictions made with classifier: XGBClassifier12
INFO:utils.ml:Predictions made with classifier: XGBClassifier13
INFO:utils.ml:Predictions made with classifier: XGBClassifier14
INFO:utils.ml:Predictions made with classifier: XGBClassifier15
INFO:utils.ml:Predictions made with classifier: XGBClassifier16
INFO:utils.ml:Predictions made with classifier: XGBClassifier17
INFO:utils.ml:Predictions made with classifier: XGBClassifier18


{'GT': array([0, 0, 0, ..., 2, 2, 2]),
 'RandomForestClassifier0': array([0, 0, 0, ..., 1, 0, 0]),
 'RandomForestClassifier1': array([0, 0, 0, ..., 1, 0, 0]),
 'RandomForestClassifier2': array([0, 0, 0, ..., 1, 0, 0]),
 'RandomForestClassifier3': array([0, 0, 0, ..., 1, 0, 0]),
 'RandomForestClassifier4': array([0, 0, 0, ..., 1, 0, 0]),
 'RandomForestClassifier5': array([0, 0, 0, ..., 1, 0, 0]),
 'RandomForestClassifier6': array([0, 0, 1, ..., 1, 0, 0]),
 'RandomForestClassifier7': array([0, 0, 0, ..., 1, 0, 0]),
 'XGBClassifier8': array([0, 0, 0, ..., 1, 0, 0], dtype=int64),
 'XGBClassifier9': array([0, 0, 0, ..., 1, 0, 0], dtype=int64),
 'XGBClassifier10': array([0, 0, 0, ..., 1, 0, 0], dtype=int64),
 'XGBClassifier11': array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
 'XGBClassifier12': array([0, 0, 0, ..., 1, 0, 0], dtype=int64),
 'XGBClassifier13': array([0, 0, 0, ..., 1, 0, 0], dtype=int64),
 'XGBClassifier14': array([0, 0, 0, ..., 1, 0, 0], dtype=int64),
 'XGBClassifier15': array([0

In [69]:
import pandas as pd

# Baseline (no resampling) validation metrics; displayed transposed so that
# each classifier is a row and each metric a column.
df = pd.DataFrame(pipeline.calculate_metrics())
df.transpose()

INFO:utils.ml:Metrics for classifier GT: {'accuracy': 1.0, 'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'kappa': 1.0}
INFO:utils.ml:Metrics for classifier RandomForestClassifier0: {'accuracy': 0.8181102362204724, 'precision': 0.827538775076741, 'recall': 0.6062655189892817, 'f1': 0.6016283301347637, 'kappa': 0.6585735433774649}
INFO:utils.ml:Metrics for classifier RandomForestClassifier1: {'accuracy': 0.8173228346456692, 'precision': 0.8328090164342988, 'recall': 0.6084729301943858, 'f1': 0.6068594006669649, 'kappa': 0.6571130651745856}
INFO:utils.ml:Metrics for classifier RandomForestClassifier2: {'accuracy': 0.8133858267716535, 'precision': 0.8164144782641892, 'recall': 0.599550401442587, 'f1': 0.5922436728289787, 'kappa': 0.6493200544329283}
INFO:utils.ml:Metrics for classifier RandomForestClassifier3: {'accuracy': 0.831496062992126, 'precision': 0.8596862180836959, 'recall': 0.6367154427034694, 'f1': 0.6494340977372557, 'kappa': 0.6848035850141954}
INFO:utils.ml:Metrics for classifi

cannot access local variable 'report' where it is not associated with a value
cannot access local variable 'report' where it is not associated with a value
cannot access local variable 'report' where it is not associated with a value
cannot access local variable 'report' where it is not associated with a value
cannot access local variable 'report' where it is not associated with a value
cannot access local variable 'report' where it is not associated with a value
cannot access local variable 'report' where it is not associated with a value
cannot access local variable 'report' where it is not associated with a value
cannot access local variable 'report' where it is not associated with a value
cannot access local variable 'report' where it is not associated with a value
cannot access local variable 'report' where it is not associated with a value
cannot access local variable 'report' where it is not associated with a value
cannot access local variable 'report' where it is not associated

Unnamed: 0,accuracy,precision,recall,f1,kappa
GT,1.0,1.0,1.0,1.0,1.0
RandomForestClassifier0,0.81811,0.827539,0.606266,0.601628,0.658574
RandomForestClassifier1,0.817323,0.832809,0.608473,0.606859,0.657113
RandomForestClassifier2,0.813386,0.816414,0.59955,0.592244,0.64932
RandomForestClassifier3,0.831496,0.859686,0.636715,0.649434,0.684804
RandomForestClassifier4,0.818898,0.827806,0.605869,0.601856,0.659266
RandomForestClassifier5,0.824409,0.848716,0.622772,0.628672,0.671154
RandomForestClassifier6,0.814173,0.817003,0.600042,0.592887,0.650507
RandomForestClassifier7,0.831496,0.883159,0.633661,0.644756,0.68431
XGBClassifier8,0.853543,0.826306,0.675628,0.699199,0.727023


___
# Balance the data

In [70]:
# Backup: keep untouched copies of the extracted training features and labels,
# so later resampling cells can overwrite pipeline.feature_matrix / pipeline.labels
# and still start again from the original data.
pipeline.backup_feature_matrix = pipeline.feature_matrix.copy()
pipeline.backup_labels = pipeline.labels.copy()

In [71]:
# Sanity check: backup vs. live shapes, features first and then labels.
for saved, live in (
    (pipeline.backup_feature_matrix, pipeline.feature_matrix),
    (pipeline.backup_labels, pipeline.labels),
):
    print(saved.shape)
    print(live.shape)

(5082, 189)
(5082, 189)
(5082,)
(5082,)


In [72]:
# Show the names of the extracted feature columns (e.g. 'gradient_magnitude_mean',
# 'color_moments_rgb_R_mean', ...).
pipeline.get_feature_names()

['gradient_magnitude_mean',
 'gradient_magnitude_std',
 'gradient_direction_mean',
 'gradient_direction_std',
 'color_moments_rgb_R_mean',
 'color_moments_rgb_R_std',
 'color_moments_rgb_R_skew',
 'color_moments_rgb_R_kurtosis',
 'color_moments_rgb_R_median',
 'color_moments_rgb_R_var',
 'color_moments_rgb_R_min',
 'color_moments_rgb_R_max',
 'color_moments_rgb_R_iqr',
 'color_moments_rgb_R_entropy',
 'color_moments_rgb_G_mean',
 'color_moments_rgb_G_std',
 'color_moments_rgb_G_skew',
 'color_moments_rgb_G_kurtosis',
 'color_moments_rgb_G_median',
 'color_moments_rgb_G_var',
 'color_moments_rgb_G_min',
 'color_moments_rgb_G_max',
 'color_moments_rgb_G_iqr',
 'color_moments_rgb_G_entropy',
 'color_moments_rgb_B_mean',
 'color_moments_rgb_B_std',
 'color_moments_rgb_B_skew',
 'color_moments_rgb_B_kurtosis',
 'color_moments_rgb_B_median',
 'color_moments_rgb_B_var',
 'color_moments_rgb_B_min',
 'color_moments_rgb_B_max',
 'color_moments_rgb_B_iqr',
 'color_moments_rgb_B_entropy',
 'color_

In [73]:
from imblearn.over_sampling import SMOTE

# Oversample the minority classes with SMOTE (seeded for reproducibility).
smote = SMOTE(random_state=42)

# Always resample from the untouched backup rather than from
# pipeline.feature_matrix: the cell then stays idempotent, because re-running it
# no longer oversamples data that was already oversampled. This matches the
# later Borderline-SMOTE and PCA cells. np.nan_to_num replaces NaNs with 0
# before resampling.
pipeline.feature_matrix, pipeline.labels = smote.fit_resample(
    np.nan_to_num(pipeline.backup_feature_matrix), pipeline.backup_labels)

In [74]:
# Discard the previously fitted models, then refit every classifier on the
# SMOTE-resampled training data now stored on the pipeline.
pipeline.fitted_classifiers = {}
pipeline.fit_classifiers()

INFO:utils.ml:Fitting classifiers...
INFO:utils.ml:Fitting classifier: RandomForestClassifier0
INFO:utils.ml:Top 10 features for RandomForestClassifier0: [('color_moments_rgb_B_var', 0.025598212319409112), ('color_moments_lab_B_std', 0.01981796496559757), ('color_moments_rgb_R_iqr', 0.019723423758789282), ('color_moments_lab_B_var', 0.019448206465166474), ('color_moments_rgb_B_std', 0.01870932041167769), ('color_moments_rgb_G_var', 0.013839536982092561), ('color_moments_rgb_G_iqr', 0.013254745886876413), ('color_moments_lab_L_std', 0.012971168282269694), ('color_moments_rgb_R_kurtosis', 0.012922128117509323), ('color_moments_lab_L_iqr', 0.012812086870632953)]
INFO:utils.ml:Fitted classifier: RandomForestClassifier0; Done in 8.149178743362427 seconds
INFO:utils.ml:Fitting classifier: RandomForestClassifier1
INFO:utils.ml:Top 10 features for RandomForestClassifier1: [('color_moments_rgb_B_std', 0.023384471136222856), ('color_moments_rgb_B_var', 0.02118293292232135), ('color_moments_lab_B

In [75]:
# Predict on the validation set with the classifiers refitted after SMOTE; the
# displayed dict again holds "GT" plus one prediction array per classifier.
pipeline.predict_with_classifiers(VAL_PATH, percent)

INFO:utils.ml:Predicting with classifiers on dataset: C:\Users\gimes\Src\repos\CADx-Project\dataset\multiclass\val


Processed 5/53 batches.
Processed 10/53 batches.
Processed 15/53 batches.
Processed 20/53 batches.
Processed 25/53 batches.
Processed 30/53 batches.
Processed 35/53 batches.
Processed 40/53 batches.
Processed 45/53 batches.
Processed 50/53 batches.


INFO:utils.ml:Predictions made with classifier: RandomForestClassifier0
INFO:utils.ml:Predictions made with classifier: RandomForestClassifier1
INFO:utils.ml:Predictions made with classifier: RandomForestClassifier2
INFO:utils.ml:Predictions made with classifier: RandomForestClassifier3
INFO:utils.ml:Predictions made with classifier: RandomForestClassifier4
INFO:utils.ml:Predictions made with classifier: RandomForestClassifier5
INFO:utils.ml:Predictions made with classifier: RandomForestClassifier6


Processed 53/53 batches.


INFO:utils.ml:Predictions made with classifier: RandomForestClassifier7
INFO:utils.ml:Predictions made with classifier: XGBClassifier8
INFO:utils.ml:Predictions made with classifier: XGBClassifier9
INFO:utils.ml:Predictions made with classifier: XGBClassifier10
INFO:utils.ml:Predictions made with classifier: XGBClassifier11
INFO:utils.ml:Predictions made with classifier: XGBClassifier12
INFO:utils.ml:Predictions made with classifier: XGBClassifier13
INFO:utils.ml:Predictions made with classifier: XGBClassifier14
INFO:utils.ml:Predictions made with classifier: XGBClassifier15
INFO:utils.ml:Predictions made with classifier: XGBClassifier16
INFO:utils.ml:Predictions made with classifier: XGBClassifier17
INFO:utils.ml:Predictions made with classifier: XGBClassifier18


{'GT': array([0, 0, 0, ..., 2, 2, 2]),
 'RandomForestClassifier0': array([0, 0, 2, ..., 1, 0, 0]),
 'RandomForestClassifier1': array([0, 0, 2, ..., 1, 2, 0]),
 'RandomForestClassifier2': array([0, 0, 2, ..., 1, 2, 0]),
 'RandomForestClassifier3': array([0, 0, 2, ..., 1, 2, 0]),
 'RandomForestClassifier4': array([0, 0, 2, ..., 1, 2, 0]),
 'RandomForestClassifier5': array([0, 0, 2, ..., 1, 2, 0]),
 'RandomForestClassifier6': array([0, 0, 2, ..., 1, 2, 0]),
 'RandomForestClassifier7': array([0, 0, 2, ..., 1, 0, 0]),
 'XGBClassifier8': array([0, 0, 0, ..., 1, 0, 0], dtype=int64),
 'XGBClassifier9': array([0, 0, 0, ..., 1, 2, 0], dtype=int64),
 'XGBClassifier10': array([0, 0, 0, ..., 1, 2, 0], dtype=int64),
 'XGBClassifier11': array([0, 0, 2, ..., 0, 0, 0], dtype=int64),
 'XGBClassifier12': array([0, 0, 0, ..., 1, 0, 0], dtype=int64),
 'XGBClassifier13': array([0, 0, 0, ..., 1, 0, 0], dtype=int64),
 'XGBClassifier14': array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
 'XGBClassifier15': array([0

In [76]:
import pandas as pd

# Validation metrics after SMOTE, transposed so that each classifier is a row
# and each metric a column.
df_smote = pd.DataFrame(pipeline.calculate_metrics()).transpose()
df_smote

INFO:utils.ml:Metrics for classifier GT: {'accuracy': 1.0, 'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'kappa': 1.0}
INFO:utils.ml:Metrics for classifier RandomForestClassifier0: {'accuracy': 0.805511811023622, 'precision': 0.6899201794134497, 'recall': 0.6959976981925063, 'f1': 0.6921816634192233, 'kappa': 0.6535756724402213}
INFO:utils.ml:Metrics for classifier RandomForestClassifier1: {'accuracy': 0.8251968503937008, 'precision': 0.7313685509296235, 'recall': 0.7258711617952386, 'f1': 0.7274374493930725, 'kappa': 0.687181012062737}
INFO:utils.ml:Metrics for classifier RandomForestClassifier2: {'accuracy': 0.8188976377952756, 'precision': 0.7092743317422663, 'recall': 0.7023673681735035, 'f1': 0.7048667154406604, 'kappa': 0.6749026154702281}
INFO:utils.ml:Metrics for classifier RandomForestClassifier3: {'accuracy': 0.8244094488188977, 'precision': 0.7118330676988464, 'recall': 0.70112156401921, 'f1': 0.7048206261763315, 'kappa': 0.6846087370985919}
INFO:utils.ml:Metrics for classifie

cannot access local variable 'report' where it is not associated with a value
cannot access local variable 'report' where it is not associated with a value
cannot access local variable 'report' where it is not associated with a value
cannot access local variable 'report' where it is not associated with a value
cannot access local variable 'report' where it is not associated with a value
cannot access local variable 'report' where it is not associated with a value
cannot access local variable 'report' where it is not associated with a value
cannot access local variable 'report' where it is not associated with a value
cannot access local variable 'report' where it is not associated with a value
cannot access local variable 'report' where it is not associated with a value
cannot access local variable 'report' where it is not associated with a value
cannot access local variable 'report' where it is not associated with a value
cannot access local variable 'report' where it is not associated

Unnamed: 0,accuracy,precision,recall,f1,kappa
GT,1.0,1.0,1.0,1.0,1.0
RandomForestClassifier0,0.805512,0.68992,0.695998,0.692182,0.653576
RandomForestClassifier1,0.825197,0.731369,0.725871,0.727437,0.687181
RandomForestClassifier2,0.818898,0.709274,0.702367,0.704867,0.674903
RandomForestClassifier3,0.824409,0.711833,0.701122,0.704821,0.684609
RandomForestClassifier4,0.81811,0.704236,0.695411,0.698755,0.672999
RandomForestClassifier5,0.829921,0.725423,0.708328,0.714473,0.693897
RandomForestClassifier6,0.811811,0.697589,0.687891,0.691619,0.661385
RandomForestClassifier7,0.833071,0.731547,0.701665,0.711894,0.698037
XGBClassifier8,0.852756,0.739991,0.723297,0.730154,0.733777


In [77]:
# Select the classifier with the highest Cohen's kappa. The "GT" row (ground
# truth scored against itself, kappa == 1.0) is excluded by label, so it can
# never be picked — even if a model reaches kappa 1.0 or "GT" is not the first
# row. idxmax returns the first label on ties, as the original selection did.
kappa_by_model = df_smote["kappa"].drop(labels="GT", errors="ignore")
idx = kappa_by_model.idxmax()
highest = df_smote.loc[[idx]]  # one-row frame for the winning model

# Rows are true classes, columns are predicted classes.
confusion_matrix(pipeline.predictions["GT"], pipeline.predictions[idx])

array([[438,  36,  24],
       [ 53, 609,  16],
       [ 33,  21,  40]], dtype=int64)

___
# Borderline SMOTE

In [78]:
from imblearn.over_sampling import BorderlineSMOTE

# Borderline-SMOTE oversampler, seeded for reproducibility.
smote = BorderlineSMOTE(sampling_strategy='auto', random_state=42)

# Resample from the untouched backup (NaNs replaced by 0 first) and store the
# balanced data on the pipeline.
pipeline.feature_matrix, pipeline.labels = smote.fit_resample(
    np.nan_to_num(pipeline.backup_feature_matrix),
    pipeline.backup_labels,
)

# Report original vs. resampled shapes: features first, then labels.
for original, resampled in (
    (pipeline.backup_feature_matrix, pipeline.feature_matrix),
    (pipeline.backup_labels, pipeline.labels),
):
    print(original.shape)
    print(resampled.shape)

(5082, 189)
(8139, 189)
(5082,)
(8139,)


In [80]:
# Discard the previously fitted models before refitting.
pipeline.fitted_classifiers = {}

# Refit every classifier on the Borderline-SMOTE-resampled training data.
pipeline.fit_classifiers()

INFO:utils.ml:Fitting classifiers...
INFO:utils.ml:Fitting classifier: RandomForestClassifier0
INFO:utils.ml:Top 10 features for RandomForestClassifier0: [('color_moments_rgb_B_std', 0.023421505302294903), ('color_moments_lab_B_std', 0.021776453230077073), ('color_moments_lab_B_var', 0.018737356667564), ('color_moments_rgb_B_var', 0.018174891854928894), ('color_moments_rgb_B_kurtosis', 0.01601242465270787), ('color_moments_lab_L_std', 0.015115134696839116), ('color_moments_lab_L_var', 0.014840766215370187), ('color_moments_rgb_G_skew', 0.013397361523664135), ('color_moments_lab_L_iqr', 0.01285925338918596), ('color_moments_lab_L_kurtosis', 0.012528294161356243)]
INFO:utils.ml:Fitted classifier: RandomForestClassifier0; Done in 7.734049320220947 seconds
INFO:utils.ml:Fitting classifier: RandomForestClassifier1
INFO:utils.ml:Top 10 features for RandomForestClassifier1: [('color_moments_rgb_B_std', 0.029811777377535424), ('color_moments_rgb_B_var', 0.020997016752816042), ('color_moments_l

In [81]:
# Predict on the validation set with the classifiers refitted after
# Borderline-SMOTE; the displayed dict holds "GT" plus one array per classifier.
pipeline.predict_with_classifiers(VAL_PATH, percent)

INFO:utils.ml:Predicting with classifiers on dataset: C:\Users\gimes\Src\repos\CADx-Project\dataset\multiclass\val


Processed 5/53 batches.
Processed 10/53 batches.
Processed 15/53 batches.
Processed 20/53 batches.
Processed 25/53 batches.
Processed 30/53 batches.
Processed 35/53 batches.
Processed 40/53 batches.
Processed 45/53 batches.
Processed 50/53 batches.


INFO:utils.ml:Predictions made with classifier: RandomForestClassifier0
INFO:utils.ml:Predictions made with classifier: RandomForestClassifier1
INFO:utils.ml:Predictions made with classifier: RandomForestClassifier2
INFO:utils.ml:Predictions made with classifier: RandomForestClassifier3
INFO:utils.ml:Predictions made with classifier: RandomForestClassifier4
INFO:utils.ml:Predictions made with classifier: RandomForestClassifier5
INFO:utils.ml:Predictions made with classifier: RandomForestClassifier6


Processed 53/53 batches.


INFO:utils.ml:Predictions made with classifier: RandomForestClassifier7
INFO:utils.ml:Predictions made with classifier: XGBClassifier8
INFO:utils.ml:Predictions made with classifier: XGBClassifier9
INFO:utils.ml:Predictions made with classifier: XGBClassifier10
INFO:utils.ml:Predictions made with classifier: XGBClassifier11
INFO:utils.ml:Predictions made with classifier: XGBClassifier12
INFO:utils.ml:Predictions made with classifier: XGBClassifier13
INFO:utils.ml:Predictions made with classifier: XGBClassifier14
INFO:utils.ml:Predictions made with classifier: XGBClassifier15
INFO:utils.ml:Predictions made with classifier: XGBClassifier16
INFO:utils.ml:Predictions made with classifier: XGBClassifier17
INFO:utils.ml:Predictions made with classifier: XGBClassifier18


{'GT': array([0, 0, 0, ..., 2, 2, 2]),
 'RandomForestClassifier0': array([0, 0, 2, ..., 1, 0, 0]),
 'RandomForestClassifier1': array([0, 0, 2, ..., 1, 2, 0]),
 'RandomForestClassifier2': array([0, 0, 2, ..., 1, 2, 0]),
 'RandomForestClassifier3': array([0, 0, 2, ..., 1, 0, 0]),
 'RandomForestClassifier4': array([0, 0, 2, ..., 1, 2, 0]),
 'RandomForestClassifier5': array([0, 0, 2, ..., 1, 0, 0]),
 'RandomForestClassifier6': array([0, 0, 2, ..., 1, 2, 0]),
 'RandomForestClassifier7': array([0, 0, 2, ..., 1, 0, 0]),
 'XGBClassifier8': array([0, 0, 0, ..., 1, 2, 0], dtype=int64),
 'XGBClassifier9': array([0, 0, 0, ..., 1, 2, 0], dtype=int64),
 'XGBClassifier10': array([0, 0, 0, ..., 1, 2, 0], dtype=int64),
 'XGBClassifier11': array([0, 0, 2, ..., 0, 0, 0], dtype=int64),
 'XGBClassifier12': array([0, 0, 0, ..., 1, 0, 0], dtype=int64),
 'XGBClassifier13': array([0, 0, 0, ..., 1, 0, 0], dtype=int64),
 'XGBClassifier14': array([0, 0, 0, ..., 1, 0, 0], dtype=int64),
 'XGBClassifier15': array([0

In [82]:
# Validation metrics after Borderline-SMOTE, transposed so that each classifier
# is a row and each metric a column.
df_borderline_smote = pd.DataFrame(
    pipeline.calculate_metrics(["accuracy", "precision", "recall", "f1", "kappa"])
).transpose()
df_borderline_smote

INFO:utils.ml:Metrics for classifier GT: {'accuracy': 1.0, 'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'kappa': 1.0}
INFO:utils.ml:Metrics for classifier RandomForestClassifier0: {'accuracy': 0.8125984251968504, 'precision': 0.6906785485699266, 'recall': 0.6811737000093515, 'f1': 0.6839356076513999, 'kappa': 0.6637146480941679}
INFO:utils.ml:Metrics for classifier RandomForestClassifier1: {'accuracy': 0.8094488188976378, 'precision': 0.6921061329424761, 'recall': 0.6819061849607682, 'f1': 0.684910381765694, 'kappa': 0.6581996930536711}
INFO:utils.ml:Metrics for classifier RandomForestClassifier2: {'accuracy': 0.8078740157480315, 'precision': 0.6875151485274604, 'recall': 0.6837996560395855, 'f1': 0.6842103529513204, 'kappa': 0.6564386161753684}
INFO:utils.ml:Metrics for classifier RandomForestClassifier3: {'accuracy': 0.8204724409448819, 'precision': 0.7099172470689498, 'recall': 0.6776375570694251, 'f1': 0.687644759595656, 'kappa': 0.674509786288854}
INFO:utils.ml:Metrics for classifi

cannot access local variable 'report' where it is not associated with a value
cannot access local variable 'report' where it is not associated with a value
cannot access local variable 'report' where it is not associated with a value
cannot access local variable 'report' where it is not associated with a value
cannot access local variable 'report' where it is not associated with a value
cannot access local variable 'report' where it is not associated with a value
cannot access local variable 'report' where it is not associated with a value
cannot access local variable 'report' where it is not associated with a value
cannot access local variable 'report' where it is not associated with a value
cannot access local variable 'report' where it is not associated with a value
cannot access local variable 'report' where it is not associated with a value
cannot access local variable 'report' where it is not associated with a value
cannot access local variable 'report' where it is not associated

Unnamed: 0,accuracy,precision,recall,f1,kappa
GT,1.0,1.0,1.0,1.0,1.0
RandomForestClassifier0,0.812598,0.690679,0.681174,0.683936,0.663715
RandomForestClassifier1,0.809449,0.692106,0.681906,0.68491,0.6582
RandomForestClassifier2,0.807874,0.687515,0.6838,0.68421,0.656439
RandomForestClassifier3,0.820472,0.709917,0.677638,0.687645,0.67451
RandomForestClassifier4,0.802362,0.676383,0.676593,0.675213,0.647425
RandomForestClassifier5,0.818898,0.710362,0.682408,0.691451,0.672581
RandomForestClassifier6,0.807087,0.67767,0.673789,0.674462,0.654462
RandomForestClassifier7,0.824409,0.717996,0.676686,0.689103,0.68034
XGBClassifier8,0.847244,0.748192,0.721488,0.732217,0.722792


In [83]:
# Select the classifier with the highest Cohen's kappa. The "GT" row (ground
# truth scored against itself, kappa == 1.0) is excluded by label, so it can
# never be picked — even if a model reaches kappa 1.0 or "GT" is not the first
# row. idxmax returns the first label on ties, as the original selection did.
kappa_by_model = df_borderline_smote["kappa"].drop(labels="GT", errors="ignore")
idx = kappa_by_model.idxmax()
highest = df_borderline_smote.loc[[idx]]  # one-row frame for the winning model

# Rows are true classes, columns are predicted classes.
confusion_matrix(pipeline.predictions["GT"], pipeline.predictions[idx])

array([[434,  44,  20],
       [ 57, 610,  11],
       [ 34,  21,  39]], dtype=int64)

___
# PCA

In [84]:
from sklearn.decomposition import PCA
from imblearn.over_sampling import BorderlineSMOTE

# Rebalance from the untouched backup (NaNs replaced by 0 first), so this cell
# does not depend on what an earlier cell left in pipeline.feature_matrix.
smote = BorderlineSMOTE(sampling_strategy='auto', random_state=42)

pipeline.feature_matrix, pipeline.labels = smote.fit_resample(
    np.nan_to_num(pipeline.backup_feature_matrix), pipeline.backup_labels)

# Project the balanced features onto 100 principal components.
# random_state pins the decomposition in case scikit-learn selects its
# randomized SVD solver for this shape, keeping the run reproducible like the
# seeded resampler above. PCA is unsupervised, so the labels are not passed.
# NOTE(review): PCA centres but does not scale its input; if the extracted
# features live on very different scales, consider standardising them first.
pca = PCA(n_components=100, random_state=42)

pipeline.feature_matrix = pca.fit_transform(pipeline.feature_matrix)

print(pipeline.backup_feature_matrix.shape)
print(pipeline.feature_matrix.shape)

print(pipeline.backup_labels.shape)
print(pipeline.labels.shape)

print(pca.get_feature_names_out())

# Refit all classifiers on the PCA-projected data.
# NOTE(review): the "Top 10 features" log printed by this fit still uses the
# original extractor names (e.g. 'gradient_magnitude_std') although the columns
# are now pca0..pca99, so those names are misleading here.
# NOTE(review): validation features must go through the same `pca.transform`
# before prediction — confirm that predict_with_classifiers does this.
pipeline.fitted_classifiers = {}
pipeline.fit_classifiers()

INFO:utils.ml:Fitting classifiers...
INFO:utils.ml:Fitting classifier: RandomForestClassifier0


(5082, 189)
(8139, 100)
(5082,)
(8139,)
['pca0' 'pca1' 'pca2' 'pca3' 'pca4' 'pca5' 'pca6' 'pca7' 'pca8' 'pca9'
 'pca10' 'pca11' 'pca12' 'pca13' 'pca14' 'pca15' 'pca16' 'pca17' 'pca18'
 'pca19' 'pca20' 'pca21' 'pca22' 'pca23' 'pca24' 'pca25' 'pca26' 'pca27'
 'pca28' 'pca29' 'pca30' 'pca31' 'pca32' 'pca33' 'pca34' 'pca35' 'pca36'
 'pca37' 'pca38' 'pca39' 'pca40' 'pca41' 'pca42' 'pca43' 'pca44' 'pca45'
 'pca46' 'pca47' 'pca48' 'pca49' 'pca50' 'pca51' 'pca52' 'pca53' 'pca54'
 'pca55' 'pca56' 'pca57' 'pca58' 'pca59' 'pca60' 'pca61' 'pca62' 'pca63'
 'pca64' 'pca65' 'pca66' 'pca67' 'pca68' 'pca69' 'pca70' 'pca71' 'pca72'
 'pca73' 'pca74' 'pca75' 'pca76' 'pca77' 'pca78' 'pca79' 'pca80' 'pca81'
 'pca82' 'pca83' 'pca84' 'pca85' 'pca86' 'pca87' 'pca88' 'pca89' 'pca90'
 'pca91' 'pca92' 'pca93' 'pca94' 'pca95' 'pca96' 'pca97' 'pca98' 'pca99']


INFO:utils.ml:Top 10 features for RandomForestClassifier0: [('gradient_magnitude_std', 0.05193733288780519), ('gradient_direction_mean', 0.04547206596225948), ('color_moments_rgb_R_var', 0.03499697481519758), ('color_moments_rgb_R_median', 0.024880601172326693), ('gradient_direction_std', 0.02195455477136229), ('color_moments_rgb_R_std', 0.02063196882941278), ('color_moments_rgb_G_iqr', 0.020009975507693185), ('color_moments_rgb_R_skew', 0.01592724709264969), ('color_moments_lab_L_iqr', 0.015840695983661668), ('color_moments_lab_L_skew', 0.015288668979688495)]
INFO:utils.ml:Fitted classifier: RandomForestClassifier0; Done in 7.409027099609375 seconds
INFO:utils.ml:Fitting classifier: RandomForestClassifier1
INFO:utils.ml:Top 10 features for RandomForestClassifier1: [('gradient_magnitude_std', 0.05112300921980831), ('gradient_direction_mean', 0.04181836225529221), ('color_moments_rgb_R_var', 0.0339446742308377), ('color_moments_rgb_R_median', 0.026949734788927534), ('gradient_direction_

In [85]:
# Build a loader over the validation images using the factory exposed by the
# pipeline's own loader and the pipeline's batch size, then run the feature
# extraction strategy on it.
new_loader = FactoryLoader(
    path=VAL_PATH,
    factory=pipeline.loader.get_factory(),
    percentage=percent,
    batch_size=pipeline.batch_size,
)
new_feature_matrix, new_labels = pipeline.feature_strategy.run(new_loader.get_loader())

# Replace NaNs, then project the validation features with the already-fitted
# PCA (transform only; the PCA is not refitted here).
new_feature_matrix = pca.transform(np.nan_to_num(new_feature_matrix))

# Store predictions on the pipeline, keyed by classifier name.
# The "GT" entry holds the labels returned by the feature strategy.
pipeline.predictions = {"GT": new_labels}
for clf_name, clf in pipeline.fitted_classifiers.items():
    pipeline.predictions[clf_name] = clf.predict(new_feature_matrix)
    if pipeline.verbose:
        logger.info("Predictions made with classifier: %s", clf_name)


Processed 5/53 batches.
Processed 10/53 batches.
Processed 15/53 batches.
Processed 20/53 batches.
Processed 25/53 batches.
Processed 30/53 batches.
Processed 35/53 batches.
Processed 40/53 batches.
Processed 45/53 batches.
Processed 50/53 batches.


INFO:utils.utils:Predictions made with classifier: RandomForestClassifier0
INFO:utils.utils:Predictions made with classifier: RandomForestClassifier1
INFO:utils.utils:Predictions made with classifier: RandomForestClassifier2
INFO:utils.utils:Predictions made with classifier: RandomForestClassifier3
INFO:utils.utils:Predictions made with classifier: RandomForestClassifier4
INFO:utils.utils:Predictions made with classifier: RandomForestClassifier5
INFO:utils.utils:Predictions made with classifier: RandomForestClassifier6


Processed 53/53 batches.


INFO:utils.utils:Predictions made with classifier: RandomForestClassifier7
INFO:utils.utils:Predictions made with classifier: XGBClassifier8
INFO:utils.utils:Predictions made with classifier: XGBClassifier9
INFO:utils.utils:Predictions made with classifier: XGBClassifier10
INFO:utils.utils:Predictions made with classifier: XGBClassifier11
INFO:utils.utils:Predictions made with classifier: XGBClassifier12
INFO:utils.utils:Predictions made with classifier: XGBClassifier13
INFO:utils.utils:Predictions made with classifier: XGBClassifier14
INFO:utils.utils:Predictions made with classifier: XGBClassifier15
INFO:utils.utils:Predictions made with classifier: XGBClassifier16
INFO:utils.utils:Predictions made with classifier: XGBClassifier17
INFO:utils.utils:Predictions made with classifier: XGBClassifier18


In [86]:
# Macro-averaged metrics for every entry in pipeline.predictions, transposed
# so that each row is one classifier and each column one metric.
# NOTE(review): this cell's output also contains repeated
# "cannot access local variable 'report'" messages; they appear to come from
# inside calculate_metrics — worth checking in utils.ml.
df_pca = pd.DataFrame(pipeline.calculate_metrics(avg="macro")).transpose()
df_pca

INFO:utils.ml:Metrics for classifier GT: {'accuracy': 1.0, 'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'kappa': 1.0}
INFO:utils.ml:Metrics for classifier RandomForestClassifier0: {'accuracy': 0.8, 'precision': 0.6839895938000075, 'recall': 0.6548365053763034, 'f1': 0.664893747950941, 'kappa': 0.6353446014994076}
INFO:utils.ml:Metrics for classifier RandomForestClassifier1: {'accuracy': 0.8039370078740158, 'precision': 0.6938762836294275, 'recall': 0.65711701384084, 'f1': 0.6692186554459946, 'kappa': 0.6417225595939454}
INFO:utils.ml:Metrics for classifier RandomForestClassifier2: {'accuracy': 0.8062992125984252, 'precision': 0.6888683162293187, 'recall': 0.6557151848465853, 'f1': 0.6667785524460194, 'kappa': 0.6458401349439546}
INFO:utils.ml:Metrics for classifier RandomForestClassifier3: {'accuracy': 0.8078740157480315, 'precision': 0.6894210429366079, 'recall': 0.6394379375280573, 'f1': 0.6513492948275558, 'kappa': 0.6459986931150927}
INFO:utils.ml:Metrics for classifier RandomForest

cannot access local variable 'report' where it is not associated with a value
cannot access local variable 'report' where it is not associated with a value
cannot access local variable 'report' where it is not associated with a value
cannot access local variable 'report' where it is not associated with a value
cannot access local variable 'report' where it is not associated with a value
cannot access local variable 'report' where it is not associated with a value
cannot access local variable 'report' where it is not associated with a value
cannot access local variable 'report' where it is not associated with a value
cannot access local variable 'report' where it is not associated with a value
cannot access local variable 'report' where it is not associated with a value
cannot access local variable 'report' where it is not associated with a value
cannot access local variable 'report' where it is not associated with a value
cannot access local variable 'report' where it is not associated

Unnamed: 0,accuracy,precision,recall,f1,kappa
GT,1.0,1.0,1.0,1.0,1.0
RandomForestClassifier0,0.8,0.68399,0.654837,0.664894,0.635345
RandomForestClassifier1,0.803937,0.693876,0.657117,0.669219,0.641723
RandomForestClassifier2,0.806299,0.688868,0.655715,0.666779,0.64584
RandomForestClassifier3,0.807874,0.689421,0.639438,0.651349,0.645999
RandomForestClassifier4,0.806299,0.686486,0.659303,0.668703,0.647206
RandomForestClassifier5,0.811811,0.699755,0.648716,0.661622,0.654141
RandomForestClassifier6,0.805512,0.675364,0.653058,0.660583,0.646149
RandomForestClassifier7,0.814173,0.701351,0.650368,0.663245,0.658482
XGBClassifier8,0.853543,0.745579,0.711215,0.723881,0.732941


In [87]:
# Select the classifier with the highest kappa for the PCA run.
# The "GT" row (ground truth scored against itself, kappa == 1.0) is removed
# *before* the search so it can never be picked as the "best classifier",
# even if a model happens to reach the same kappa.
classifier_scores = df_pca.drop(index="GT", errors="ignore")
idx = classifier_scores["kappa"].idxmax()   # first label on ties
highest = classifier_scores.loc[[idx]]      # kept as a one-row frame for inspection

# Confusion matrix of the winning classifier: ground truth is passed first,
# its predictions second.
confusion_matrix(pipeline.predictions["GT"], pipeline.predictions[idx])

array([[436,  40,  22],
       [ 52, 615,  11],
       [ 35,  26,  33]], dtype=int64)