In [1]:
import numpy as np
from sklearn.metrics import confusion_matrix, recall_score, cohen_kappa_score

from utils.loader import FactoryLoader
from utils.ml import MLPipeline
from utils.preprocessing import PreprocessingFactory
from utils.feature_extraction import *
from utils.utils import *

import os

# Root folder of the multiclass dataset. The default is the original author's
# local checkout; set the CADX_DATASET_DIR environment variable to run the
# notebook on another machine without editing this cell.
DATASET_DIR = os.environ.get(
    "CADX_DATASET_DIR",
    r"C:\Users\gimes\Src\repos\CADx-Project\dataset\multiclass",
)

# Validation and training splits live in sibling sub-folders of DATASET_DIR.
VAL_PATH = os.path.join(DATASET_DIR, "val")
TRAIN_PATH = os.path.join(DATASET_DIR, "train")

INFO:numexpr.utils:Note: NumExpr detected 16 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
INFO:numexpr.utils:NumExpr defaulting to 8 threads.


In [2]:
# Run configuration; each value is forwarded to MLPipeline / the extractors below.
percent = 100      # passed to MLPipeline as `percentage`
random = False     # passed to MLPipeline as `shuffle`
batch_size = 24    # passed to MLPipeline as `batch_size`
th = 0.1           # `threshold` shared by the gradient and color-moment extractors

# Preprocessing factory: register the pad2square, resize and normalize2float steps.
factory = PreprocessingFactory()
factory.pad2square(fill=np.nan)
factory.resize((200,200))
factory.normalize2float()

# Feature extraction strategy; extractors are registered in the order listed.
strategy = FeatureExtractionStrategy()

# Gradient feature.
strategy.add_extractor(GradientExtractor(threshold=th))

# Color moments, one extractor per color space.
for color_space in ("rgb", "lab", "hsv"):
    strategy.add_extractor(ColorMomentsExtractor(color_space, threshold=th))

# LBP at radii 1..5, always with 8 sampling points per unit of radius.
for lbp_radius in range(1, 6):
    strategy.add_extractor(LBPExtractor(radius=lbp_radius, n_points=8 * lbp_radius))

# Frequency-domain extractors.
strategy.add_extractor(FourierTransformExtractor())
strategy.add_extractor(FFTExtractor())

# GLCM texture properties.
strategy.add_extractor(
    GLCMExtractor(
        properties=['contrast', 'dissimilarity', 'homogeneity', 'energy', 'correlation', 'ASM']
    )
)

# Pipeline over the training set; classifiers are attached in a later cell.
pipeline = MLPipeline(
    dataset_path=TRAIN_PATH,
    preprocessing_factory=factory,
    feature_strategy=strategy,
    classifiers=[],
    percentage=percent,
    verbose=True,
    shuffle=random,
    batch_size=batch_size,
)

INFO:utils.ml:MLPipeline initialized with dataset path: C:\Users\gimes\Src\repos\CADx-Project\dataset\multiclass\train
INFO:utils.ml:Preprocessing steps


In [3]:
# Clear the feature matrix stored on the pipeline, then run feature extraction
# for the dataset the pipeline was constructed with.
# NOTE(review): presumably the reset forces a fresh extraction instead of
# reusing an earlier result — confirm against utils.ml.
pipeline.feature_matrix = None
pipeline.run_feature_extraction()

INFO:utils.ml:Running feature extraction...


Processed 5/212 batches.
Processed 10/212 batches.
Processed 15/212 batches.
Processed 20/212 batches.
Processed 25/212 batches.
Processed 30/212 batches.
Processed 35/212 batches.
Processed 40/212 batches.
Processed 45/212 batches.
Processed 50/212 batches.
Processed 55/212 batches.
Processed 60/212 batches.
Processed 65/212 batches.
Processed 70/212 batches.
Processed 75/212 batches.
Processed 80/212 batches.
Processed 85/212 batches.
Processed 90/212 batches.
Processed 95/212 batches.
Processed 100/212 batches.
Processed 105/212 batches.
Processed 110/212 batches.
Processed 115/212 batches.
Processed 120/212 batches.
Processed 125/212 batches.
Processed 130/212 batches.
Processed 135/212 batches.
Processed 140/212 batches.
Processed 145/212 batches.
Processed 150/212 batches.
Processed 155/212 batches.
Processed 160/212 batches.
Processed 165/212 batches.
Processed 170/212 batches.
Processed 175/212 batches.
Processed 180/212 batches.
Processed 185/212 batches.
Processed 190/212 bat

INFO:utils.ml:Feature extraction completed. Extracted 5082 features.


Processed 212/212 batches.


In [4]:
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier


def _forest(n_trees, **options):
    """Return a RandomForestClassifier with `n_trees` estimators and random_state=42."""
    return RandomForestClassifier(n_estimators=n_trees, random_state=42, **options)


def _booster(n_rounds, depth, child_weight, sample_frac, **extra):
    """Return an XGBClassifier with learning_rate=0.1.

    `sample_frac` is used for both `subsample` and `colsample_bytree`;
    `extra` carries optional regularisation settings (gamma, reg_alpha, reg_lambda).
    """
    return XGBClassifier(
        learning_rate=0.1,
        n_estimators=n_rounds,
        max_depth=depth,
        min_child_weight=child_weight,
        subsample=sample_frac,
        colsample_bytree=sample_frac,
        **extra,
    )


# Random forests: rf1 keeps bootstrapping and records the out-of-bag score,
# the other three disable bootstrapping.
rf0 = _forest(100, bootstrap=False)
rf1 = _forest(150, oob_score=True)
rf2 = _forest(150, bootstrap=False)
rf3 = _forest(200, bootstrap=False)

# XGBoost, 400-500 boosting rounds.
xgb4 = _booster(400, 7, 3, 0.7)
xgb5 = _booster(400, 5, 5, 0.8, gamma=0.2)
xgb6 = _booster(400, 5, 1, 0.8, reg_alpha=0.1, reg_lambda=0.1)
xgb7 = _booster(400, 7, 1, 0.8)  # marked "THIS!!" by the author
xgb8 = _booster(500, 7, 1, 0.8)

# XGBoost, 750 boosting rounds.
xgb9 = _booster(750, 7, 3, 0.7)
xgb10 = _booster(750, 5, 5, 0.8, gamma=0.2)  # marked "THIS" by the author
xgb11 = _booster(750, 5, 1, 0.8, reg_alpha=0.1, reg_lambda=0.1)
xgb12 = _booster(750, 7, 1, 0.8)

# XGBoost, 1000 boosting rounds.
xgb13 = _booster(1000, 7, 3, 0.7)
xgb14 = _booster(1000, 5, 5, 0.8, gamma=0.2)  # marked "THIS" by the author
xgb15 = _booster(1000, 5, 1, 0.8, reg_alpha=0.1, reg_lambda=0.1)
xgb16 = _booster(1000, 7, 1, 0.8)

# Only xgb8..xgb16 are evaluated; xgb4..xgb7 are defined but not registered.
# NOTE(review): the pipeline log names appear to be positional, so the logged
# "XGBClassifier4" would correspond to xgb8 here — confirm in utils.ml.
# (SVM candidates svm1, svm2, svm3, svm6, svm7, svm8 are currently excluded.)
pipeline.classifiers = [
    rf0, rf1, rf2, rf3,
    xgb8, xgb9, xgb10, xgb11, xgb12, xgb13, xgb14, xgb15, xgb16,
]
pipeline.fitted_classifiers = {}

In [5]:
# Fit the classifiers registered on the pipeline.
# NOTE(review): the recorded output prints "'list' object has no attribute 'shape'"
# around every fit, which looks like an exception being caught and printed
# inside utils.ml — investigate rather than ignore.
pipeline.fit_classifiers()

INFO:utils.ml:Fitting classifiers...
INFO:utils.ml:Fitting classifier: RandomForestClassifier0
INFO:utils.ml:Fitted classifier: RandomForestClassifier0; Done in 9.719164848327637 seconds
INFO:utils.ml:Fitting classifier: RandomForestClassifier1


'list' object has no attribute 'shape'


INFO:utils.ml:Fitted classifier: RandomForestClassifier1; Done in 10.235319375991821 seconds
INFO:utils.ml:Fitting classifier: RandomForestClassifier2


'list' object has no attribute 'shape'


INFO:utils.ml:Fitted classifier: RandomForestClassifier2; Done in 14.50141191482544 seconds
INFO:utils.ml:Fitting classifier: RandomForestClassifier3


'list' object has no attribute 'shape'


INFO:utils.ml:Fitted classifier: RandomForestClassifier3; Done in 20.471442699432373 seconds
INFO:utils.ml:Fitting classifier: XGBClassifier4


'list' object has no attribute 'shape'


INFO:utils.ml:Fitted classifier: XGBClassifier4; Done in 28.371830463409424 seconds
INFO:utils.ml:Fitting classifier: XGBClassifier5


'list' object has no attribute 'shape'


INFO:utils.ml:Fitted classifier: XGBClassifier5; Done in 28.825389623641968 seconds
INFO:utils.ml:Fitting classifier: XGBClassifier6


'list' object has no attribute 'shape'


INFO:utils.ml:Fitted classifier: XGBClassifier6; Done in 16.76242995262146 seconds
INFO:utils.ml:Fitting classifier: XGBClassifier7


'list' object has no attribute 'shape'


INFO:utils.ml:Fitted classifier: XGBClassifier7; Done in 27.435635805130005 seconds
INFO:utils.ml:Fitting classifier: XGBClassifier8


'list' object has no attribute 'shape'


INFO:utils.ml:Fitted classifier: XGBClassifier8; Done in 33.596564054489136 seconds
INFO:utils.ml:Fitting classifier: XGBClassifier9


'list' object has no attribute 'shape'


INFO:utils.ml:Fitted classifier: XGBClassifier9; Done in 31.460536241531372 seconds
INFO:utils.ml:Fitting classifier: XGBClassifier10


'list' object has no attribute 'shape'


INFO:utils.ml:Fitted classifier: XGBClassifier10; Done in 17.13995671272278 seconds
INFO:utils.ml:Fitting classifier: XGBClassifier11


'list' object has no attribute 'shape'


INFO:utils.ml:Fitted classifier: XGBClassifier11; Done in 31.850087881088257 seconds
INFO:utils.ml:Fitting classifier: XGBClassifier12


'list' object has no attribute 'shape'


INFO:utils.ml:Fitted classifier: XGBClassifier12; Done in 40.62158131599426 seconds
INFO:utils.ml:Fitting completed in 310.99 seconds.


'list' object has no attribute 'shape'


In [6]:
# Run the fitted classifiers on the validation dataset; the cell output is a
# dict of prediction arrays keyed by "GT" and by classifier name.
# NOTE(review): `percent` is presumably the share of the dataset to load — confirm
# against MLPipeline.predict_with_classifiers.
pipeline.predict_with_classifiers(VAL_PATH, percent)

INFO:utils.ml:Predicting with classifiers on dataset: C:\Users\gimes\Src\repos\CADx-Project\dataset\multiclass\val


Processed 5/53 batches.
Processed 10/53 batches.
Processed 15/53 batches.
Processed 20/53 batches.
Processed 25/53 batches.
Processed 30/53 batches.
Processed 35/53 batches.
Processed 40/53 batches.
Processed 45/53 batches.
Processed 50/53 batches.


INFO:utils.ml:Predictions made with classifier: RandomForestClassifier0
INFO:utils.ml:Predictions made with classifier: RandomForestClassifier1
INFO:utils.ml:Predictions made with classifier: RandomForestClassifier2
INFO:utils.ml:Predictions made with classifier: RandomForestClassifier3
INFO:utils.ml:Predictions made with classifier: XGBClassifier4
INFO:utils.ml:Predictions made with classifier: XGBClassifier5
INFO:utils.ml:Predictions made with classifier: XGBClassifier6
INFO:utils.ml:Predictions made with classifier: XGBClassifier7
INFO:utils.ml:Predictions made with classifier: XGBClassifier8


Processed 53/53 batches.


INFO:utils.ml:Predictions made with classifier: XGBClassifier9
INFO:utils.ml:Predictions made with classifier: XGBClassifier10
INFO:utils.ml:Predictions made with classifier: XGBClassifier11
INFO:utils.ml:Predictions made with classifier: XGBClassifier12


{'GT': array([0, 0, 0, ..., 2, 2, 2]),
 'RandomForestClassifier0': array([0, 0, 1, ..., 1, 0, 0]),
 'RandomForestClassifier1': array([0, 0, 1, ..., 1, 0, 0]),
 'RandomForestClassifier2': array([0, 0, 1, ..., 1, 0, 0]),
 'RandomForestClassifier3': array([0, 0, 1, ..., 1, 0, 0]),
 'XGBClassifier4': array([0, 0, 0, ..., 1, 2, 0], dtype=int64),
 'XGBClassifier5': array([0, 0, 0, ..., 0, 2, 0], dtype=int64),
 'XGBClassifier6': array([0, 0, 0, ..., 1, 2, 0], dtype=int64),
 'XGBClassifier7': array([0, 0, 0, ..., 1, 2, 0], dtype=int64),
 'XGBClassifier8': array([0, 0, 0, ..., 1, 2, 0], dtype=int64),
 'XGBClassifier9': array([0, 0, 0, ..., 0, 2, 0], dtype=int64),
 'XGBClassifier10': array([0, 0, 0, ..., 1, 2, 0], dtype=int64),
 'XGBClassifier11': array([0, 0, 0, ..., 1, 2, 0], dtype=int64),
 'XGBClassifier12': array([0, 0, 0, ..., 1, 2, 0], dtype=int64)}

In [7]:
import pandas as pd

# Baseline metrics table: after the transpose there is one row per entry
# ("GT" plus each classifier) and one column per metric.
df = pd.DataFrame(data=pipeline.calculate_metrics()).T
df

INFO:utils.ml:Metrics for classifier GT: {'accuracy': 1.0, 'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'kappa': 1.0}
INFO:utils.ml:Metrics for classifier RandomForestClassifier0: {'accuracy': 0.8401574803149606, 'precision': 0.826753742204386, 'recall': 0.6538082285837002, 'f1': 0.6730170110108676, 'kappa': 0.7012990323298235}
INFO:utils.ml:Metrics for classifier RandomForestClassifier1: {'accuracy': 0.8228346456692913, 'precision': 0.8771457300869065, 'recall': 0.6193007768222202, 'f1': 0.62716131250614, 'kappa': 0.6661643067433058}
INFO:utils.ml:Metrics for classifier RandomForestClassifier2: {'accuracy': 0.84251968503937, 'precision': 0.8413347719857666, 'recall': 0.6543946450460097, 'f1': 0.6748697276222165, 'kappa': 0.7048364980128756}
INFO:utils.ml:Metrics for classifier RandomForestClassifier3: {'accuracy': 0.8448818897637795, 'precision': 0.8376211622021678, 'recall': 0.6499383588455371, 'f1': 0.6669742489766524, 'kappa': 0.7087467928271083}
INFO:utils.ml:Metrics for classifier

Unnamed: 0,accuracy,precision,recall,f1,kappa
GT,1.0,1.0,1.0,1.0,1.0
RandomForestClassifier0,0.840157,0.826754,0.653808,0.673017,0.701299
RandomForestClassifier1,0.822835,0.877146,0.619301,0.627161,0.666164
RandomForestClassifier2,0.84252,0.841335,0.654395,0.67487,0.704836
RandomForestClassifier3,0.844882,0.837621,0.649938,0.666974,0.708747
XGBClassifier4,0.874016,0.862795,0.710503,0.742119,0.765691
XGBClassifier5,0.874016,0.840259,0.719311,0.74955,0.766453
XGBClassifier6,0.874016,0.849845,0.713379,0.744286,0.765824
XGBClassifier7,0.866929,0.826665,0.693682,0.719099,0.75255
XGBClassifier8,0.876378,0.86848,0.721141,0.754894,0.770453


In [8]:
# Pick the classifier with the highest kappa and show its confusion matrix.
# The "GT" row (ground truth scored against itself, kappa == 1.0) is dropped by
# label before ranking, so it can never be selected — not even when a
# classifier ties with it — and the choice no longer depends on row position.
candidates = df.drop(index="GT")
idx = candidates["kappa"].idxmax()  # first label holding the maximum kappa
highest = candidates.loc[[idx]]     # kept as a one-row frame for inspection

# Rows are ground-truth classes, columns are predicted classes.
confusion_matrix(pipeline.predictions["GT"], pipeline.predictions[idx])

array([[446,  48,   4],
       [ 41, 636,   1],
       [ 45,  18,  31]], dtype=int64)

___
# Balance the data

In [9]:
# Backup: store copies of the current feature matrix and labels on the pipeline
# so the resampling cells below can start again from the unmodified data.
pipeline.backup_feature_matrix = pipeline.feature_matrix.copy()
pipeline.backup_labels = pipeline.labels.copy()

In [11]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)

# Oversample starting from the backed-up training data (NaNs replaced via
# np.nan_to_num). Reading from the backup instead of the current
# pipeline.feature_matrix makes the cell give the same result no matter which
# later cell (BorderlineSMOTE, PCA) last overwrote the pipeline state, and
# matches how the BorderlineSMOTE cells below obtain their input.
pipeline.feature_matrix, pipeline.labels = smote.fit_resample(
    np.nan_to_num(pipeline.backup_feature_matrix), pipeline.backup_labels)

In [10]:
# Shapes before (backup) and after resampling, one per line:
# feature matrices first, then label vectors.
# NOTE(review): the recorded output shows identical shapes because this cell
# (In [10]) was executed before the SMOTE cell (In [11]); re-run in order.
print(
    pipeline.backup_feature_matrix.shape,
    pipeline.feature_matrix.shape,
    pipeline.backup_labels.shape,
    pipeline.labels.shape,
    sep="\n",
)

(5082, 519)
(5082, 519)
(5082,)
(5082,)


In [12]:
# Discard the previously fitted classifiers and fit again, now that the
# pipeline's feature matrix and labels hold the SMOTE-resampled data.
# NOTE(review): presumably fit_classifiers trains on pipeline.feature_matrix /
# pipeline.labels — confirm in utils.ml.
pipeline.fitted_classifiers = {}
pipeline.fit_classifiers()

INFO:utils.ml:Fitting classifiers...
INFO:utils.ml:Fitting classifier: RandomForestClassifier0
INFO:utils.ml:Fitted classifier: RandomForestClassifier0; Done in 18.71985125541687 seconds
INFO:utils.ml:Fitting classifier: RandomForestClassifier1


'list' object has no attribute 'shape'


INFO:utils.ml:Fitted classifier: RandomForestClassifier1; Done in 25.031797170639038 seconds
INFO:utils.ml:Fitting classifier: RandomForestClassifier2


'list' object has no attribute 'shape'


INFO:utils.ml:Fitted classifier: RandomForestClassifier2; Done in 36.87735342979431 seconds
INFO:utils.ml:Fitting classifier: RandomForestClassifier3


'list' object has no attribute 'shape'


INFO:utils.ml:Fitted classifier: RandomForestClassifier3; Done in 36.77544021606445 seconds
INFO:utils.ml:Fitting classifier: XGBClassifier4


'list' object has no attribute 'shape'


INFO:utils.ml:Fitted classifier: XGBClassifier4; Done in 34.01800513267517 seconds
INFO:utils.ml:Fitting classifier: XGBClassifier5


'list' object has no attribute 'shape'


INFO:utils.ml:Fitted classifier: XGBClassifier5; Done in 37.792242765426636 seconds
INFO:utils.ml:Fitting classifier: XGBClassifier6


'list' object has no attribute 'shape'


INFO:utils.ml:Fitted classifier: XGBClassifier6; Done in 22.587767601013184 seconds
INFO:utils.ml:Fitting classifier: XGBClassifier7


'list' object has no attribute 'shape'


INFO:utils.ml:Fitted classifier: XGBClassifier7; Done in 33.2621123790741 seconds
INFO:utils.ml:Fitting classifier: XGBClassifier8


'list' object has no attribute 'shape'


INFO:utils.ml:Fitted classifier: XGBClassifier8; Done in 45.6057813167572 seconds
INFO:utils.ml:Fitting classifier: XGBClassifier9


'list' object has no attribute 'shape'


INFO:utils.ml:Fitted classifier: XGBClassifier9; Done in 44.74539017677307 seconds
INFO:utils.ml:Fitting classifier: XGBClassifier10


'list' object has no attribute 'shape'


INFO:utils.ml:Fitted classifier: XGBClassifier10; Done in 25.86698889732361 seconds
INFO:utils.ml:Fitting classifier: XGBClassifier11


'list' object has no attribute 'shape'


INFO:utils.ml:Fitted classifier: XGBClassifier11; Done in 41.84835958480835 seconds
INFO:utils.ml:Fitting classifier: XGBClassifier12


'list' object has no attribute 'shape'


INFO:utils.ml:Fitted classifier: XGBClassifier12; Done in 51.43301296234131 seconds
INFO:utils.ml:Fitting completed in 454.58 seconds.


'list' object has no attribute 'shape'


In [13]:
# Predict on the validation dataset with the classifiers refitted on the
# SMOTE-resampled training data.
pipeline.predict_with_classifiers(VAL_PATH, percent)

INFO:utils.ml:Predicting with classifiers on dataset: C:\Users\gimes\Src\repos\CADx-Project\dataset\multiclass\val


Processed 5/53 batches.
Processed 10/53 batches.
Processed 15/53 batches.
Processed 20/53 batches.
Processed 25/53 batches.
Processed 30/53 batches.
Processed 35/53 batches.
Processed 40/53 batches.
Processed 45/53 batches.
Processed 50/53 batches.


INFO:utils.ml:Predictions made with classifier: RandomForestClassifier0
INFO:utils.ml:Predictions made with classifier: RandomForestClassifier1
INFO:utils.ml:Predictions made with classifier: RandomForestClassifier2
INFO:utils.ml:Predictions made with classifier: RandomForestClassifier3
INFO:utils.ml:Predictions made with classifier: XGBClassifier4
INFO:utils.ml:Predictions made with classifier: XGBClassifier5
INFO:utils.ml:Predictions made with classifier: XGBClassifier6
INFO:utils.ml:Predictions made with classifier: XGBClassifier7
INFO:utils.ml:Predictions made with classifier: XGBClassifier8
INFO:utils.ml:Predictions made with classifier: XGBClassifier9


Processed 53/53 batches.


INFO:utils.ml:Predictions made with classifier: XGBClassifier10
INFO:utils.ml:Predictions made with classifier: XGBClassifier11
INFO:utils.ml:Predictions made with classifier: XGBClassifier12


{'GT': array([0, 0, 0, ..., 2, 2, 2]),
 'RandomForestClassifier0': array([0, 0, 2, ..., 1, 2, 0]),
 'RandomForestClassifier1': array([0, 0, 2, ..., 1, 2, 0]),
 'RandomForestClassifier2': array([0, 0, 2, ..., 1, 2, 0]),
 'RandomForestClassifier3': array([0, 0, 2, ..., 1, 2, 0]),
 'XGBClassifier4': array([0, 0, 2, ..., 1, 2, 0], dtype=int64),
 'XGBClassifier5': array([0, 0, 2, ..., 0, 2, 0], dtype=int64),
 'XGBClassifier6': array([0, 0, 2, ..., 0, 2, 0], dtype=int64),
 'XGBClassifier7': array([0, 0, 2, ..., 0, 2, 0], dtype=int64),
 'XGBClassifier8': array([0, 0, 2, ..., 1, 2, 0], dtype=int64),
 'XGBClassifier9': array([0, 0, 2, ..., 0, 2, 0], dtype=int64),
 'XGBClassifier10': array([0, 0, 2, ..., 0, 2, 0], dtype=int64),
 'XGBClassifier11': array([0, 0, 2, ..., 0, 2, 0], dtype=int64),
 'XGBClassifier12': array([0, 0, 2, ..., 1, 2, 0], dtype=int64)}

In [14]:
import pandas as pd

# Metrics table for the SMOTE run: one row per entry ("GT" plus each
# classifier), one column per metric.
df_smote = pd.DataFrame(data=pipeline.calculate_metrics()).T
df_smote

INFO:utils.ml:Metrics for classifier GT: {'accuracy': 1.0, 'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'kappa': 1.0}
INFO:utils.ml:Metrics for classifier RandomForestClassifier0: {'accuracy': 0.841732283464567, 'precision': 0.7557113301923907, 'recall': 0.7388946988607161, 'f1': 0.7460230475142774, 'kappa': 0.714636731116636}
INFO:utils.ml:Metrics for classifier RandomForestClassifier1: {'accuracy': 0.8251968503937008, 'precision': 0.7362288583250421, 'recall': 0.7359565671208023, 'f1': 0.7357579621937909, 'kappa': 0.6868196319236477}
INFO:utils.ml:Metrics for classifier RandomForestClassifier2: {'accuracy': 0.841732283464567, 'precision': 0.7522776982815214, 'recall': 0.7333188903231075, 'f1': 0.741155668513055, 'kappa': 0.7144362557555743}
INFO:utils.ml:Metrics for classifier RandomForestClassifier3: {'accuracy': 0.8409448818897638, 'precision': 0.7497403278063656, 'recall': 0.7322941423637185, 'f1': 0.7396212076997917, 'kappa': 0.7130296657568571}
INFO:utils.ml:Metrics for classifie

Unnamed: 0,accuracy,precision,recall,f1,kappa
GT,1.0,1.0,1.0,1.0,1.0
RandomForestClassifier0,0.841732,0.755711,0.738895,0.746023,0.714637
RandomForestClassifier1,0.825197,0.736229,0.735957,0.735758,0.68682
RandomForestClassifier2,0.841732,0.752278,0.733319,0.741156,0.714436
RandomForestClassifier3,0.840945,0.74974,0.732294,0.739621,0.71303
XGBClassifier4,0.87874,0.791175,0.778696,0.784562,0.7808
XGBClassifier5,0.876378,0.787616,0.771823,0.778952,0.776632
XGBClassifier6,0.872441,0.7777,0.771886,0.774579,0.770419
XGBClassifier7,0.885039,0.801092,0.781174,0.790012,0.792094
XGBClassifier8,0.880315,0.792004,0.782911,0.787249,0.783973


In [15]:
import pandas as pd

# NOTE(review): this cell repeats the previous one verbatim; it rebuilds the
# same SMOTE metrics table and can be removed.
df_smote = pd.DataFrame(data=pipeline.calculate_metrics()).T
df_smote

INFO:utils.ml:Metrics for classifier GT: {'accuracy': 1.0, 'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'kappa': 1.0}
INFO:utils.ml:Metrics for classifier RandomForestClassifier0: {'accuracy': 0.841732283464567, 'precision': 0.7557113301923907, 'recall': 0.7388946988607161, 'f1': 0.7460230475142774, 'kappa': 0.714636731116636}
INFO:utils.ml:Metrics for classifier RandomForestClassifier1: {'accuracy': 0.8251968503937008, 'precision': 0.7362288583250421, 'recall': 0.7359565671208023, 'f1': 0.7357579621937909, 'kappa': 0.6868196319236477}
INFO:utils.ml:Metrics for classifier RandomForestClassifier2: {'accuracy': 0.841732283464567, 'precision': 0.7522776982815214, 'recall': 0.7333188903231075, 'f1': 0.741155668513055, 'kappa': 0.7144362557555743}
INFO:utils.ml:Metrics for classifier RandomForestClassifier3: {'accuracy': 0.8409448818897638, 'precision': 0.7497403278063656, 'recall': 0.7322941423637185, 'f1': 0.7396212076997917, 'kappa': 0.7130296657568571}
INFO:utils.ml:Metrics for classifie

Unnamed: 0,accuracy,precision,recall,f1,kappa
GT,1.0,1.0,1.0,1.0,1.0
RandomForestClassifier0,0.841732,0.755711,0.738895,0.746023,0.714637
RandomForestClassifier1,0.825197,0.736229,0.735957,0.735758,0.68682
RandomForestClassifier2,0.841732,0.752278,0.733319,0.741156,0.714436
RandomForestClassifier3,0.840945,0.74974,0.732294,0.739621,0.71303
XGBClassifier4,0.87874,0.791175,0.778696,0.784562,0.7808
XGBClassifier5,0.876378,0.787616,0.771823,0.778952,0.776632
XGBClassifier6,0.872441,0.7777,0.771886,0.774579,0.770419
XGBClassifier7,0.885039,0.801092,0.781174,0.790012,0.792094
XGBClassifier8,0.880315,0.792004,0.782911,0.787249,0.783973


In [16]:
# Pick the SMOTE-trained classifier with the highest kappa and show its
# confusion matrix. The "GT" row (kappa == 1.0) is dropped by label before
# ranking, so it can never be selected on a tie and the choice does not depend
# on row position.
candidates = df_smote.drop(index="GT")
idx = candidates["kappa"].idxmax()  # first label holding the maximum kappa
highest = candidates.loc[[idx]]     # kept as a one-row frame for inspection

# Rows are ground-truth classes, columns are predicted classes.
confusion_matrix(pipeline.predictions["GT"], pipeline.predictions[idx])

array([[444,  31,  23],
       [ 38, 631,   9],
       [ 33,  12,  49]], dtype=int64)

___
# Borderline SMOTE

In [17]:
from imblearn.over_sampling import BorderlineSMOTE

smote = BorderlineSMOTE(sampling_strategy='auto', random_state=42)

# Oversample from the backed-up training data (NaNs replaced via np.nan_to_num)
# and store the result as the pipeline's current features and labels.
pipeline.feature_matrix, pipeline.labels = smote.fit_resample(
    np.nan_to_num(pipeline.backup_feature_matrix),
    pipeline.backup_labels,
)

# Shapes before (backup) and after resampling, one per line:
# feature matrices first, then label vectors.
print(
    pipeline.backup_feature_matrix.shape,
    pipeline.feature_matrix.shape,
    pipeline.backup_labels.shape,
    pipeline.labels.shape,
    sep="\n",
)

(5082, 519)
(8139, 519)
(5082,)
(8139,)


In [18]:
# Discard the previously fitted classifiers and fit again, now that the
# pipeline's feature matrix and labels hold the BorderlineSMOTE-resampled data.
pipeline.fitted_classifiers = {}

pipeline.fit_classifiers()

INFO:utils.ml:Fitting classifiers...
INFO:utils.ml:Fitting classifier: RandomForestClassifier0
INFO:utils.ml:Fitted classifier: RandomForestClassifier0; Done in 19.617548942565918 seconds
INFO:utils.ml:Fitting classifier: RandomForestClassifier1


'list' object has no attribute 'shape'


INFO:utils.ml:Fitted classifier: RandomForestClassifier1; Done in 20.356825828552246 seconds
INFO:utils.ml:Fitting classifier: RandomForestClassifier2


'list' object has no attribute 'shape'


INFO:utils.ml:Fitted classifier: RandomForestClassifier2; Done in 29.118584394454956 seconds
INFO:utils.ml:Fitting classifier: RandomForestClassifier3


'list' object has no attribute 'shape'


INFO:utils.ml:Fitted classifier: RandomForestClassifier3; Done in 37.67693829536438 seconds
INFO:utils.ml:Fitting classifier: XGBClassifier4


'list' object has no attribute 'shape'


INFO:utils.ml:Fitted classifier: XGBClassifier4; Done in 34.55964517593384 seconds
INFO:utils.ml:Fitting classifier: XGBClassifier5


'list' object has no attribute 'shape'


INFO:utils.ml:Fitted classifier: XGBClassifier5; Done in 35.703076124191284 seconds
INFO:utils.ml:Fitting classifier: XGBClassifier6


'list' object has no attribute 'shape'


INFO:utils.ml:Fitted classifier: XGBClassifier6; Done in 19.952351331710815 seconds
INFO:utils.ml:Fitting classifier: XGBClassifier7


'list' object has no attribute 'shape'


INFO:utils.ml:Fitted classifier: XGBClassifier7; Done in 30.00417923927307 seconds
INFO:utils.ml:Fitting classifier: XGBClassifier8


'list' object has no attribute 'shape'


INFO:utils.ml:Fitted classifier: XGBClassifier8; Done in 49.55645823478699 seconds
INFO:utils.ml:Fitting classifier: XGBClassifier9


'list' object has no attribute 'shape'


INFO:utils.ml:Fitted classifier: XGBClassifier9; Done in 44.49885630607605 seconds
INFO:utils.ml:Fitting classifier: XGBClassifier10


'list' object has no attribute 'shape'


INFO:utils.ml:Fitted classifier: XGBClassifier10; Done in 24.57122564315796 seconds
INFO:utils.ml:Fitting classifier: XGBClassifier11


'list' object has no attribute 'shape'


INFO:utils.ml:Fitted classifier: XGBClassifier11; Done in 41.96625638008118 seconds
INFO:utils.ml:Fitting classifier: XGBClassifier12


'list' object has no attribute 'shape'


INFO:utils.ml:Fitted classifier: XGBClassifier12; Done in 50.3948757648468 seconds
INFO:utils.ml:Fitting completed in 437.98 seconds.


'list' object has no attribute 'shape'


In [19]:
# Predict on the validation dataset with the classifiers refitted on the
# BorderlineSMOTE-resampled training data.
pipeline.predict_with_classifiers(VAL_PATH, percent)

INFO:utils.ml:Predicting with classifiers on dataset: C:\Users\gimes\Src\repos\CADx-Project\dataset\multiclass\val


Processed 5/53 batches.
Processed 10/53 batches.
Processed 15/53 batches.
Processed 20/53 batches.
Processed 25/53 batches.
Processed 30/53 batches.
Processed 35/53 batches.
Processed 40/53 batches.
Processed 45/53 batches.
Processed 50/53 batches.


INFO:utils.ml:Predictions made with classifier: RandomForestClassifier0
INFO:utils.ml:Predictions made with classifier: RandomForestClassifier1
INFO:utils.ml:Predictions made with classifier: RandomForestClassifier2
INFO:utils.ml:Predictions made with classifier: RandomForestClassifier3
INFO:utils.ml:Predictions made with classifier: XGBClassifier4
INFO:utils.ml:Predictions made with classifier: XGBClassifier5
INFO:utils.ml:Predictions made with classifier: XGBClassifier6
INFO:utils.ml:Predictions made with classifier: XGBClassifier7
INFO:utils.ml:Predictions made with classifier: XGBClassifier8
INFO:utils.ml:Predictions made with classifier: XGBClassifier9


Processed 53/53 batches.


INFO:utils.ml:Predictions made with classifier: XGBClassifier10
INFO:utils.ml:Predictions made with classifier: XGBClassifier11
INFO:utils.ml:Predictions made with classifier: XGBClassifier12


{'GT': array([0, 0, 0, ..., 2, 2, 2]),
 'RandomForestClassifier0': array([0, 0, 2, ..., 1, 2, 0]),
 'RandomForestClassifier1': array([0, 0, 2, ..., 1, 2, 0]),
 'RandomForestClassifier2': array([2, 0, 2, ..., 1, 2, 0]),
 'RandomForestClassifier3': array([0, 0, 2, ..., 1, 2, 0]),
 'XGBClassifier4': array([0, 0, 2, ..., 0, 2, 0], dtype=int64),
 'XGBClassifier5': array([0, 0, 0, ..., 0, 2, 0], dtype=int64),
 'XGBClassifier6': array([0, 0, 2, ..., 0, 2, 0], dtype=int64),
 'XGBClassifier7': array([0, 0, 0, ..., 0, 2, 0], dtype=int64),
 'XGBClassifier8': array([0, 0, 2, ..., 0, 2, 0], dtype=int64),
 'XGBClassifier9': array([0, 0, 0, ..., 0, 2, 0], dtype=int64),
 'XGBClassifier10': array([0, 0, 2, ..., 0, 2, 0], dtype=int64),
 'XGBClassifier11': array([0, 0, 0, ..., 0, 2, 0], dtype=int64),
 'XGBClassifier12': array([0, 0, 2, ..., 0, 2, 0], dtype=int64)}

In [20]:
# Metrics table for the BorderlineSMOTE run, with the metric list passed
# explicitly: one row per entry ("GT" plus each classifier), one column per metric.
df_borderline_smote = pd.DataFrame(
    data=pipeline.calculate_metrics(["accuracy", "precision", "recall", "f1", "kappa"])
).T
df_borderline_smote

INFO:utils.ml:Metrics for classifier GT: {'accuracy': 1.0, 'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'kappa': 1.0}
INFO:utils.ml:Metrics for classifier RandomForestClassifier0: {'accuracy': 0.8496062992125984, 'precision': 0.7733413473351555, 'recall': 0.7446996294977185, 'f1': 0.756405588301574, 'kappa': 0.7278774960735921}
INFO:utils.ml:Metrics for classifier RandomForestClassifier1: {'accuracy': 0.8275590551181102, 'precision': 0.7405335669910782, 'recall': 0.7316779828786054, 'f1': 0.7354332524492153, 'kappa': 0.6900312498606906}
INFO:utils.ml:Metrics for classifier RandomForestClassifier2: {'accuracy': 0.84251968503937, 'precision': 0.7636128425502746, 'recall': 0.7420853942349451, 'f1': 0.7512525555551118, 'kappa': 0.7155132633540762}
INFO:utils.ml:Metrics for classifier RandomForestClassifier3: {'accuracy': 0.84251968503937, 'precision': 0.763685815714855, 'recall': 0.7422630961932208, 'f1': 0.7513576672496395, 'kappa': 0.7155706056778378}
INFO:utils.ml:Metrics for classifier 

Unnamed: 0,accuracy,precision,recall,f1,kappa
GT,1.0,1.0,1.0,1.0,1.0
RandomForestClassifier0,0.849606,0.773341,0.7447,0.756406,0.727877
RandomForestClassifier1,0.827559,0.740534,0.731678,0.735433,0.690031
RandomForestClassifier2,0.84252,0.763613,0.742085,0.751253,0.715513
RandomForestClassifier3,0.84252,0.763686,0.742263,0.751358,0.715571
XGBClassifier4,0.879528,0.798809,0.782064,0.78982,0.781894
XGBClassifier5,0.879528,0.798127,0.785474,0.791405,0.782399
XGBClassifier6,0.87874,0.790463,0.782816,0.786334,0.781702
XGBClassifier7,0.875591,0.794419,0.773852,0.783059,0.774778
XGBClassifier8,0.87874,0.798066,0.781573,0.789204,0.780557


In [21]:
# Pick the BorderlineSMOTE-trained classifier with the highest kappa and show
# its confusion matrix. The "GT" row (kappa == 1.0) is dropped by label before
# ranking, so it can never be selected on a tie and the choice does not depend
# on row position.
candidates = df_borderline_smote.drop(index="GT")
idx = candidates["kappa"].idxmax()  # first label holding the maximum kappa
highest = candidates.loc[[idx]]     # kept as a one-row frame for inspection

# Rows are ground-truth classes, columns are predicted classes.
confusion_matrix(pipeline.predictions["GT"], pipeline.predictions[idx])

array([[438,  35,  25],
       [ 40, 629,   9],
       [ 30,  13,  51]], dtype=int64)

___
# PCA

In [22]:
from sklearn.decomposition import PCA
from imblearn.over_sampling import BorderlineSMOTE
smote = BorderlineSMOTE(sampling_strategy='auto', random_state=42)

# Rebuild the balanced training set from the backup (NaNs replaced via
# np.nan_to_num) so this cell does not depend on earlier resampling cells.
pipeline.feature_matrix, pipeline.labels = smote.fit_resample(
    np.nan_to_num(pipeline.backup_feature_matrix), pipeline.backup_labels)

# Project the resampled features onto 200 principal components.
# random_state is fixed, like every other estimator in this notebook, so the
# projection is reproducible when scikit-learn selects a randomized SVD solver.
pca = PCA(n_components=200, random_state=42)

# PCA is unsupervised; the labels argument is accepted but ignored by scikit-learn.
pipeline.feature_matrix = pca.fit_transform(pipeline.feature_matrix, pipeline.labels)

# Shapes before (backup) and after resampling + PCA.
print(pipeline.backup_feature_matrix.shape)
print(pipeline.feature_matrix.shape)

print(pipeline.backup_labels.shape)
print(pipeline.labels.shape)

# Names of the generated components ('pca0', 'pca1', ...).
print(pca.get_feature_names_out())

# Refit all classifiers on the reduced feature matrix.
# NOTE(review): the classifiers are now trained on PCA components; confirm that
# prediction on the validation set applies this same fitted `pca` first.
pipeline.fitted_classifiers = {}
pipeline.fit_classifiers()

INFO:utils.ml:Fitting classifiers...
INFO:utils.ml:Fitting classifier: RandomForestClassifier0


(5082, 519)
(8139, 200)
(5082,)
(8139,)
['pca0' 'pca1' 'pca2' 'pca3' 'pca4' 'pca5' 'pca6' 'pca7' 'pca8' 'pca9'
 'pca10' 'pca11' 'pca12' 'pca13' 'pca14' 'pca15' 'pca16' 'pca17' 'pca18'
 'pca19' 'pca20' 'pca21' 'pca22' 'pca23' 'pca24' 'pca25' 'pca26' 'pca27'
 'pca28' 'pca29' 'pca30' 'pca31' 'pca32' 'pca33' 'pca34' 'pca35' 'pca36'
 'pca37' 'pca38' 'pca39' 'pca40' 'pca41' 'pca42' 'pca43' 'pca44' 'pca45'
 'pca46' 'pca47' 'pca48' 'pca49' 'pca50' 'pca51' 'pca52' 'pca53' 'pca54'
 'pca55' 'pca56' 'pca57' 'pca58' 'pca59' 'pca60' 'pca61' 'pca62' 'pca63'
 'pca64' 'pca65' 'pca66' 'pca67' 'pca68' 'pca69' 'pca70' 'pca71' 'pca72'
 'pca73' 'pca74' 'pca75' 'pca76' 'pca77' 'pca78' 'pca79' 'pca80' 'pca81'
 'pca82' 'pca83' 'pca84' 'pca85' 'pca86' 'pca87' 'pca88' 'pca89' 'pca90'
 'pca91' 'pca92' 'pca93' 'pca94' 'pca95' 'pca96' 'pca97' 'pca98' 'pca99'
 'pca100' 'pca101' 'pca102' 'pca103' 'pca104' 'pca105' 'pca106' 'pca107'
 'pca108' 'pca109' 'pca110' 'pca111' 'pca112' 'pca113' 'pca114' 'pca115'
 'pca116' 'pc

INFO:utils.ml:Fitted classifier: RandomForestClassifier0; Done in 13.480910301208496 seconds
INFO:utils.ml:Fitting classifier: RandomForestClassifier1


'list' object has no attribute 'shape'


INFO:utils.ml:Fitted classifier: RandomForestClassifier1; Done in 13.796612739562988 seconds
INFO:utils.ml:Fitting classifier: RandomForestClassifier2


'list' object has no attribute 'shape'


INFO:utils.ml:Fitted classifier: RandomForestClassifier2; Done in 20.23788571357727 seconds
INFO:utils.ml:Fitting classifier: RandomForestClassifier3


'list' object has no attribute 'shape'


INFO:utils.ml:Fitted classifier: RandomForestClassifier3; Done in 27.195294857025146 seconds
INFO:utils.ml:Fitting classifier: XGBClassifier4


'list' object has no attribute 'shape'


INFO:utils.ml:Fitted classifier: XGBClassifier4; Done in 11.358813524246216 seconds
INFO:utils.ml:Fitting classifier: XGBClassifier5


'list' object has no attribute 'shape'


INFO:utils.ml:Fitted classifier: XGBClassifier5; Done in 12.353408813476562 seconds
INFO:utils.ml:Fitting classifier: XGBClassifier6


'list' object has no attribute 'shape'


INFO:utils.ml:Fitted classifier: XGBClassifier6; Done in 7.158859968185425 seconds
INFO:utils.ml:Fitting classifier: XGBClassifier7


'list' object has no attribute 'shape'


INFO:utils.ml:Fitted classifier: XGBClassifier7; Done in 11.075444459915161 seconds
INFO:utils.ml:Fitting classifier: XGBClassifier8


'list' object has no attribute 'shape'


INFO:utils.ml:Fitted classifier: XGBClassifier8; Done in 14.238271236419678 seconds
INFO:utils.ml:Fitting classifier: XGBClassifier9


'list' object has no attribute 'shape'


INFO:utils.ml:Fitted classifier: XGBClassifier9; Done in 14.401986360549927 seconds
INFO:utils.ml:Fitting classifier: XGBClassifier10


'list' object has no attribute 'shape'


INFO:utils.ml:Fitted classifier: XGBClassifier10; Done in 7.853266477584839 seconds
INFO:utils.ml:Fitting classifier: XGBClassifier11


'list' object has no attribute 'shape'


INFO:utils.ml:Fitted classifier: XGBClassifier11; Done in 13.49010181427002 seconds
INFO:utils.ml:Fitting classifier: XGBClassifier12


'list' object has no attribute 'shape'


INFO:utils.ml:Fitted classifier: XGBClassifier12; Done in 16.943157196044922 seconds
INFO:utils.ml:Fitting completed in 183.61 seconds.


'list' object has no attribute 'shape'


In [23]:
# Build a loader over the validation split, reusing the factory returned by
# the pipeline's own loader and the pipeline's batch size, then run the
# feature-extraction strategy on it.
new_loader = FactoryLoader(
    path=VAL_PATH,
    factory=pipeline.loader.get_factory(),
    percentage=percent,
    batch_size=pipeline.batch_size,
)
new_feature_matrix, new_labels = pipeline.feature_strategy.run(new_loader.get_loader())

# Replace NaNs with zeros, then project the validation features with the
# PCA that was fitted on the training data.
new_feature_matrix = pca.transform(np.nan_to_num(new_feature_matrix))

# The ground truth is stored first; every fitted classifier then adds its
# own predictions under its name.
pipeline.predictions = {"GT": new_labels}
for clf_name, clf in pipeline.fitted_classifiers.items():
    pipeline.predictions[clf_name] = clf.predict(new_feature_matrix)
    if not pipeline.verbose:
        continue
    logger.info("Predictions made with classifier: %s", clf_name)


Processed 5/53 batches.
Processed 10/53 batches.
Processed 15/53 batches.
Processed 20/53 batches.
Processed 25/53 batches.
Processed 30/53 batches.
Processed 35/53 batches.
Processed 40/53 batches.
Processed 45/53 batches.
Processed 50/53 batches.


INFO:utils.utils:Predictions made with classifier: RandomForestClassifier0
INFO:utils.utils:Predictions made with classifier: RandomForestClassifier1
INFO:utils.utils:Predictions made with classifier: RandomForestClassifier2
INFO:utils.utils:Predictions made with classifier: RandomForestClassifier3
INFO:utils.utils:Predictions made with classifier: XGBClassifier4
INFO:utils.utils:Predictions made with classifier: XGBClassifier5
INFO:utils.utils:Predictions made with classifier: XGBClassifier6
INFO:utils.utils:Predictions made with classifier: XGBClassifier7
INFO:utils.utils:Predictions made with classifier: XGBClassifier8
INFO:utils.utils:Predictions made with classifier: XGBClassifier9
INFO:utils.utils:Predictions made with classifier: XGBClassifier10


Processed 53/53 batches.


INFO:utils.utils:Predictions made with classifier: XGBClassifier11
INFO:utils.utils:Predictions made with classifier: XGBClassifier12


In [24]:
# Macro-averaged metrics for the PCA experiment. The raw result is transposed
# so that each classifier becomes a row and each metric a column.
pca_metrics = pipeline.calculate_metrics(avg="macro")
df_pca = pd.DataFrame(data=pca_metrics).transpose()
df_pca

INFO:utils.ml:Metrics for classifier GT: {'accuracy': 1.0, 'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'kappa': 1.0}
INFO:utils.ml:Metrics for classifier RandomForestClassifier0: {'accuracy': 0.7881889763779527, 'precision': 0.7925925585930726, 'recall': 0.6115862432974225, 'f1': 0.6348015395668556, 'kappa': 0.5996761129807917}
INFO:utils.ml:Metrics for classifier RandomForestClassifier1: {'accuracy': 0.7771653543307087, 'precision': 0.7690614668809631, 'recall': 0.6020042008238817, 'f1': 0.6224017823506117, 'kappa': 0.5802579578260909}
INFO:utils.ml:Metrics for classifier RandomForestClassifier2: {'accuracy': 0.7889763779527559, 'precision': 0.790245501339155, 'recall': 0.6093788320923184, 'f1': 0.630901956667494, 'kappa': 0.6009754038781683}
INFO:utils.ml:Metrics for classifier RandomForestClassifier3: {'accuracy': 0.794488188976378, 'precision': 0.8245795682466525, 'recall': 0.6162301878070243, 'f1': 0.6406354211359363, 'kappa': 0.6111314459476581}
INFO:utils.ml:Metrics for classifi

Unnamed: 0,accuracy,precision,recall,f1,kappa
GT,1.0,1.0,1.0,1.0,1.0
RandomForestClassifier0,0.788189,0.792593,0.611586,0.634802,0.599676
RandomForestClassifier1,0.777165,0.769061,0.602004,0.622402,0.580258
RandomForestClassifier2,0.788976,0.790246,0.609379,0.630902,0.600975
RandomForestClassifier3,0.794488,0.82458,0.61623,0.640635,0.611131
XGBClassifier4,0.837008,0.783957,0.68132,0.706259,0.697973
XGBClassifier5,0.833071,0.770203,0.672397,0.69508,0.690662
XGBClassifier6,0.832283,0.774151,0.686645,0.711108,0.69021
XGBClassifier7,0.837008,0.78374,0.66928,0.692368,0.696941
XGBClassifier8,0.837795,0.784582,0.681634,0.706751,0.699246


In [27]:
# Pick the classifier with the highest kappa in the PCA results.
# The first row of the metrics table is the ground-truth baseline ("GT",
# kappa == 1.0). It is excluded from BOTH the max and the row filter:
# filtering the full table would select "GT" itself if any classifier ever
# reached a kappa of 1.0, and the matrix below would compare GT with GT.
candidates = df_pca.iloc[1:]
highest = candidates[candidates["kappa"] == candidates["kappa"].max()]
idx = highest.index[0]  # first classifier among ties

# Rows are the true classes, columns are the predicted classes.
confusion_matrix(pipeline.predictions["GT"], pipeline.predictions[idx])

array([[431,  61,   6],
       [ 63, 609,   6],
       [ 36,  31,  27]], dtype=int64)

In [26]:
# NOTE(review): scratch cell that only prints the literal "f"; nothing visible
# in the analysis depends on it -- candidate for removal before sharing.
print("f")

f
