In [26]:
import os

import numpy as np
from pygments.lexer import combined
from sklearn.base import clone
from sklearn.metrics import confusion_matrix, recall_score, cohen_kappa_score

from utils.loader import FactoryLoader
from utils.ml import MLPipeline
from utils.preprocessing import PreprocessingFactory
from utils.feature_extraction import *
from utils.utils import *

# Root folder of the multiclass dataset. Set the CADX_DATASET_ROOT environment
# variable to run the notebook on another machine; the default is the location
# used when the notebook was originally executed.
DATASET_ROOT = os.environ.get(
    "CADX_DATASET_ROOT",
    r"C:\Users\gimes\Src\repos\CADx-Project\dataset\multiclass",
)

# Split folders, kept as plain strings because they are passed on to the project loaders.
VAL_PATH = os.path.join(DATASET_ROOT, "val")
TRAIN_PATH = os.path.join(DATASET_ROOT, "train")
TEST_PATH = os.path.join(DATASET_ROOT, "test")

In [27]:
# Run configuration (all four values are forwarded to the objects built below).
percent = 100      # passed to MLPipeline as `percentage`
random = False     # passed to MLPipeline as `shuffle`
batch_size = 24
th = 0.01          # threshold given to the gradient and colour-moment extractors

# Preprocessing chain: pad to a square (NaN fill), resize to 200x200, convert to float.
factory = PreprocessingFactory()
factory.pad2square(fill=np.nan)
factory.resize((200, 200))
factory.normalize2float()

# Feature extraction strategy; extractors are registered in the order listed here.
strategy = FeatureExtractionStrategy()

# Gradient features
strategy.add_extractor(GradientExtractor(threshold=th))

# Colour moments in three colour spaces
for color_space in ("rgb", "lab", "hsv"):
    strategy.add_extractor(ColorMomentsExtractor(color_space, threshold=th))

# Local binary patterns at radii 1..5 with 8 sampling points per unit of radius
for lbp_radius in range(1, 6):
    strategy.add_extractor(LBPExtractor(radius=lbp_radius, n_points=8 * lbp_radius))

# Frequency-domain features
strategy.add_extractor(FourierTransformExtractor())
strategy.add_extractor(FFTExtractor())

# Grey-level co-occurrence matrix properties
glcm_properties = ['contrast', 'dissimilarity', 'homogeneity', 'energy', 'correlation', 'ASM']
strategy.add_extractor(GLCMExtractor(properties=glcm_properties))

# Pipeline over the training set; classifiers are attached in a later cell.
pipeline = MLPipeline(
    dataset_path=TRAIN_PATH,
    preprocessing_factory=factory,
    feature_strategy=strategy,
    classifiers=[],
    percentage=percent,
    verbose=True,
    shuffle=random,
    batch_size=batch_size,
)

INFO:utils.ml:MLPipeline initialized with dataset path: C:\Users\gimes\Src\repos\CADx-Project\dataset\multiclass\train
INFO:utils.ml:Preprocessing steps


In [28]:
# Drop any feature matrix already stored on the pipeline, then extract features
# for the dataset the pipeline was created with (TRAIN_PATH).
# NOTE(review): presumably the reset forces a fresh extraction rather than reusing
# an earlier result — confirm in utils.ml.
pipeline.feature_matrix = None
pipeline.run_feature_extraction()

INFO:utils.ml:Running feature extraction...


Processed 5/212 batches.
Processed 10/212 batches.
Processed 15/212 batches.
Processed 20/212 batches.
Processed 25/212 batches.
Processed 30/212 batches.
Processed 35/212 batches.
Processed 40/212 batches.
Processed 45/212 batches.
Processed 50/212 batches.
Processed 55/212 batches.
Processed 60/212 batches.
Processed 65/212 batches.
Processed 70/212 batches.
Processed 75/212 batches.
Processed 80/212 batches.
Processed 85/212 batches.
Processed 90/212 batches.
Processed 95/212 batches.
Processed 100/212 batches.
Processed 105/212 batches.
Processed 110/212 batches.
Processed 115/212 batches.
Processed 120/212 batches.
Processed 125/212 batches.
Processed 130/212 batches.
Processed 135/212 batches.
Processed 140/212 batches.
Processed 145/212 batches.
Processed 150/212 batches.
Processed 155/212 batches.
Processed 160/212 batches.
Processed 165/212 batches.
Processed 170/212 batches.
Processed 175/212 batches.
Processed 180/212 batches.
Processed 185/212 batches.
Processed 190/212 bat

INFO:utils.ml:Feature extraction completed. Extracted 5082 features.


Processed 212/212 batches.


In [29]:
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier


def _make_xgb(**overrides):
    """Return an XGBClassifier built from the baseline settings plus `overrides`.

    The baseline holds the values most candidates share; any keyword passed in
    replaces or extends it.
    """
    params = dict(learning_rate=0.1, subsample=0.8, colsample_bytree=0.8, random_state=42)
    params.update(overrides)
    return XGBClassifier(**params)


# Candidate configurations compared on the validation set.
xgb0 = _make_xgb(n_estimators=1200, max_depth=5, min_child_weight=1, reg_alpha=0.1, reg_lambda=0.1)  # this
xgb1 = _make_xgb(n_estimators=750, max_depth=5, min_child_weight=5, gamma=0.2)  # THIS
xgb2 = _make_xgb(n_estimators=750, max_depth=5, min_child_weight=1, reg_alpha=0.1, reg_lambda=0.1)  # THIS
xgb3 = _make_xgb(n_estimators=1000, max_depth=7, min_child_weight=3, subsample=0.7, colsample_bytree=0.7)
xgb4 = _make_xgb(n_estimators=1000, max_depth=5, min_child_weight=5, gamma=0.2)
xgb5 = _make_xgb(n_estimators=1000, max_depth=5, min_child_weight=1, reg_alpha=0.1, reg_lambda=0.1)  # this
xgb6 = _make_xgb(n_estimators=1000, max_depth=7, min_child_weight=1)
xgb7 = _make_xgb(n_estimators=1000, max_depth=9, min_child_weight=1)
# NOTE(review): xgb8 has exactly the same settings as xgb7.
xgb8 = _make_xgb(n_estimators=1000, max_depth=9, min_child_weight=1)

CLFS = [xgb0, xgb1, xgb2, xgb3, xgb4, xgb5, xgb6, xgb7, xgb8]

# Hand the pipeline its own list (same estimator objects) and forget earlier fits.
pipeline.classifiers = list(CLFS)
pipeline.fitted_classifiers = {}

___
# Balance the data

In [30]:
# Keep copies of the training features and labels before any oversampling
# replaces them on the pipeline.
pipeline.backup_feature_matrix, pipeline.backup_labels = (
    pipeline.feature_matrix.copy(),
    pipeline.labels.copy(),
)

In [31]:
from imblearn.over_sampling import SMOTE

# Balance the classes with SMOTE.
# Resample from the backup copies instead of pipeline.feature_matrix: this cell
# overwrites pipeline.feature_matrix / pipeline.labels, so reading from them made
# the result depend on which oversampling cell had run last.
smote = SMOTE(random_state=42)

pipeline.feature_matrix, pipeline.labels = smote.fit_resample(
    np.nan_to_num(pipeline.backup_feature_matrix),  # NaN -> 0 before resampling
    pipeline.backup_labels,
)

In [32]:
# Shapes before and after oversampling: features first, then labels.
shape_pairs = (
    (pipeline.backup_feature_matrix, pipeline.feature_matrix),
    (pipeline.backup_labels, pipeline.labels),
)
for before, after in shape_pairs:
    print(before.shape)
    print(after.shape)

(5082, 519)
(8139, 519)
(5082,)
(8139,)


In [33]:
# Fit every candidate on the SMOTE-balanced training data.
# Use unfitted clones of the candidates: CLFS.copy() is only a shallow copy, so the
# list would hold the very same estimator objects, and fitting them again later on
# differently balanced data would silently replace the models kept from this run.
# NOTE(review): assumes fit_classifiers() fits the given instances in place — confirm in utils.ml.
pipeline.fitted_classifiers = {}
pipeline.classifiers = [clone(clf) for clf in CLFS]
pipeline.fit_classifiers()

INFO:utils.ml:Fitting classifiers...
INFO:utils.ml:Fitting classifier: XGBClassifier0
INFO:utils.ml:Top 10 features for XGBClassifier0: [('fft_radial_variance_78', 0.03414021), ('color_moments_rgb_B_iqr', 0.02673978), ('color_moments_lab_L_iqr', 0.014872222), ('color_moments_lab_B_var', 0.011374275), ('color_moments_rgb_B_std', 0.009145864), ('color_moments_hsv_H_median', 0.008565589), ('fft_radial_variance_53', 0.008496182), ('fft_radial_mean_130', 0.008024439), ('lbp_rad1_bins8_4', 0.007917851), ('fft_radial_variance_114', 0.0074759256)]
INFO:utils.ml:Fitted classifier: XGBClassifier0; Done in 110.83021521568298 seconds
INFO:utils.ml:Fitting classifier: XGBClassifier1
INFO:utils.ml:Top 10 features for XGBClassifier1: [('color_moments_rgb_B_iqr', 0.029413948), ('color_moments_lab_L_iqr', 0.017863724), ('fft_radial_mean_86', 0.017613973), ('fft_radial_variance_78', 0.014967409), ('color_moments_lab_B_var', 0.0109832585), ('fft_radial_variance_35', 0.007976364), ('color_moments_rgb_B_st

In [34]:
# Run every fitted classifier on the validation set. The returned dict (one array
# of predictions per classifier plus the "GT" labels) is displayed as the cell output.
pipeline.predict_with_classifiers(VAL_PATH, percent)

INFO:utils.ml:Predicting with classifiers on dataset: C:\Users\gimes\Src\repos\CADx-Project\dataset\multiclass\val


Processed 5/53 batches.
Processed 10/53 batches.
Processed 15/53 batches.
Processed 20/53 batches.
Processed 25/53 batches.
Processed 30/53 batches.
Processed 35/53 batches.
Processed 40/53 batches.
Processed 45/53 batches.
Processed 50/53 batches.


INFO:utils.ml:Predictions made with classifier: XGBClassifier0
INFO:utils.ml:Predictions made with classifier: XGBClassifier1
INFO:utils.ml:Predictions made with classifier: XGBClassifier2
INFO:utils.ml:Predictions made with classifier: XGBClassifier3
INFO:utils.ml:Predictions made with classifier: XGBClassifier4
INFO:utils.ml:Predictions made with classifier: XGBClassifier5
INFO:utils.ml:Predictions made with classifier: XGBClassifier6
INFO:utils.ml:Predictions made with classifier: XGBClassifier7
INFO:utils.ml:Predictions made with classifier: XGBClassifier8


Processed 53/53 batches.


{'GT': array([0, 0, 0, ..., 2, 2, 2]),
 'XGBClassifier0': array([0, 0, 2, ..., 0, 2, 0], dtype=int64),
 'XGBClassifier1': array([0, 0, 2, ..., 0, 2, 0], dtype=int64),
 'XGBClassifier2': array([0, 0, 2, ..., 0, 2, 0], dtype=int64),
 'XGBClassifier3': array([0, 0, 2, ..., 0, 2, 0], dtype=int64),
 'XGBClassifier4': array([0, 0, 2, ..., 0, 2, 0], dtype=int64),
 'XGBClassifier5': array([0, 0, 2, ..., 0, 2, 0], dtype=int64),
 'XGBClassifier6': array([0, 0, 2, ..., 1, 2, 0], dtype=int64),
 'XGBClassifier7': array([0, 0, 2, ..., 1, 2, 0], dtype=int64),
 'XGBClassifier8': array([0, 0, 2, ..., 1, 2, 0], dtype=int64)}

In [35]:
import pandas as pd

# Keep shallow copies of this run's predictions and fitted models under SMOTE-specific names.
smote_predictions = pipeline.predictions.copy()
smote_classifiers = pipeline.fitted_classifiers.copy()

# Metrics table: one row per entry (including the "GT" reference row), one column per metric.
smote_metrics = pipeline.calculate_metrics()
df_smote = pd.DataFrame(data=smote_metrics).T
df_smote

INFO:utils.ml:Metrics for classifier GT: {'accuracy': 1.0, 'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'kappa': 1.0}
INFO:utils.ml:Metrics for classifier XGBClassifier0: {'accuracy': 0.8842519685039371, 'precision': 0.8074837706210832, 'recall': 0.7893123992864699, 'f1': 0.7976096585149709, 'kappa': 0.7906480725582897}
INFO:utils.ml:Metrics for classifier XGBClassifier1: {'accuracy': 0.8740157480314961, 'precision': 0.7841445940295183, 'recall': 0.7761013299416205, 'f1': 0.77988495683285, 'kappa': 0.7729233437484634}
INFO:utils.ml:Metrics for classifier XGBClassifier2: {'accuracy': 0.8826771653543307, 'precision': 0.8062779225338229, 'recall': 0.788329115117345, 'f1': 0.79650263028054, 'kappa': 0.7878853776207472}
INFO:utils.ml:Metrics for classifier XGBClassifier3: {'accuracy': 0.8771653543307086, 'precision': 0.7901299286285658, 'recall': 0.7744803351988258, 'f1': 0.7817134528975345, 'kappa': 0.7777069663486137}
INFO:utils.ml:Metrics for classifier XGBClassifier4: {'accuracy': 0.8755

Unnamed: 0,accuracy,precision,recall,f1,kappa
GT,1.0,1.0,1.0,1.0,1.0
XGBClassifier0,0.884252,0.807484,0.789312,0.79761,0.790648
XGBClassifier1,0.874016,0.784145,0.776101,0.779885,0.772923
XGBClassifier2,0.882677,0.806278,0.788329,0.796503,0.787885
XGBClassifier3,0.877165,0.79013,0.77448,0.781713,0.777707
XGBClassifier4,0.875591,0.786774,0.780317,0.783357,0.775953
XGBClassifier5,0.881102,0.803272,0.78699,0.794491,0.785049
XGBClassifier6,0.879528,0.79344,0.782064,0.787463,0.782234
XGBClassifier7,0.875591,0.787166,0.776196,0.781399,0.775163
XGBClassifier8,0.875591,0.787166,0.776196,0.781399,0.775163


In [36]:
# Best SMOTE-trained classifier on the validation set (highest Cohen's kappa)

In [37]:
# Name of the classifier with the highest validation kappa. The first row ("GT",
# the ground truth scored against itself) is excluded from the search itself; the
# previous equality filter ran over all rows and could therefore have returned "GT"
# whenever a classifier reached kappa == 1.0.
idx = df_smote.iloc[1:]["kappa"].idxmax()

# Confusion matrix of that classifier: rows are true labels, columns are predictions.
confusion_matrix(pipeline.predictions["GT"], pipeline.predictions[idx])

array([[441,  36,  21],
       [ 38, 630,  10],
       [ 29,  13,  52]], dtype=int64)

___
# Borderline SMOTE

In [38]:
from imblearn.over_sampling import BorderlineSMOTE

# Balance the classes with Borderline-SMOTE, starting again from the backup of the
# original training data. A dedicated variable name keeps the plain SMOTE sampler
# and this one apart.
borderline_smote = BorderlineSMOTE(sampling_strategy='auto', random_state=42)

# Apply the same NaN -> 0 replacement as the plain SMOTE run so both oversamplers
# receive identically prepared inputs.
pipeline.feature_matrix, pipeline.labels = borderline_smote.fit_resample(
    np.nan_to_num(pipeline.backup_feature_matrix),
    pipeline.backup_labels,
)

# Shapes before and after oversampling: features first, then labels.
print(pipeline.backup_feature_matrix.shape)
print(pipeline.feature_matrix.shape)

print(pipeline.backup_labels.shape)
print(pipeline.labels.shape)

(5082, 519)
(8139, 519)
(5082,)
(8139,)


In [39]:
# Fit every candidate on the Borderline-SMOTE-balanced training data.
# Use unfitted clones of the candidates: CLFS.copy() is only a shallow copy, so
# fitting would otherwise reuse the estimator objects that were already fitted in an
# earlier run and overwrite those models in place.
# NOTE(review): assumes fit_classifiers() fits the given instances in place — confirm in utils.ml.
pipeline.fitted_classifiers = {}
pipeline.classifiers = [clone(clf) for clf in CLFS]
pipeline.fit_classifiers()

INFO:utils.ml:Fitting classifiers...
INFO:utils.ml:Fitting classifier: XGBClassifier0
INFO:utils.ml:Top 10 features for XGBClassifier0: [('color_moments_rgb_B_iqr', 0.032523032), ('fft_radial_variance_95', 0.01458469), ('fft_radial_variance_76', 0.012896298), ('fft_radial_mean_115', 0.01176386), ('color_moments_lab_L_iqr', 0.0112523995), ('color_moments_rgb_B_std', 0.010745317), ('lbp_rad4_bins32_8', 0.010277651), ('color_moments_lab_B_var', 0.010192127), ('fft_radial_mean_87', 0.00889076), ('fft_radial_mean_133', 0.008795581)]
INFO:utils.ml:Fitted classifier: XGBClassifier0; Done in 57.35542416572571 seconds
INFO:utils.ml:Fitting classifier: XGBClassifier1
INFO:utils.ml:Top 10 features for XGBClassifier1: [('color_moments_rgb_B_iqr', 0.035296135), ('fft_radial_mean_75', 0.013281713), ('color_moments_lab_L_iqr', 0.011927683), ('color_moments_lab_B_var', 0.010379287), ('lbp_rad4_bins32_8', 0.009416714), ('fft_radial_mean_17', 0.009127008), ('fft_radial_variance_14', 0.009101782), ('fft_

In [40]:
# Run the classifiers fitted on the Borderline-SMOTE data on the validation set.
# The returned dict of predictions (plus the "GT" labels) is displayed as the cell output.
pipeline.predict_with_classifiers(VAL_PATH, percent)

INFO:utils.ml:Predicting with classifiers on dataset: C:\Users\gimes\Src\repos\CADx-Project\dataset\multiclass\val


Processed 5/53 batches.
Processed 10/53 batches.
Processed 15/53 batches.
Processed 20/53 batches.
Processed 25/53 batches.
Processed 30/53 batches.
Processed 35/53 batches.
Processed 40/53 batches.
Processed 45/53 batches.
Processed 50/53 batches.


INFO:utils.ml:Predictions made with classifier: XGBClassifier0
INFO:utils.ml:Predictions made with classifier: XGBClassifier1
INFO:utils.ml:Predictions made with classifier: XGBClassifier2
INFO:utils.ml:Predictions made with classifier: XGBClassifier3
INFO:utils.ml:Predictions made with classifier: XGBClassifier4
INFO:utils.ml:Predictions made with classifier: XGBClassifier5
INFO:utils.ml:Predictions made with classifier: XGBClassifier6
INFO:utils.ml:Predictions made with classifier: XGBClassifier7
INFO:utils.ml:Predictions made with classifier: XGBClassifier8


Processed 53/53 batches.


{'GT': array([0, 0, 0, ..., 2, 2, 2]),
 'XGBClassifier0': array([0, 0, 0, ..., 0, 2, 0], dtype=int64),
 'XGBClassifier1': array([0, 0, 2, ..., 0, 2, 2], dtype=int64),
 'XGBClassifier2': array([0, 0, 0, ..., 0, 2, 0], dtype=int64),
 'XGBClassifier3': array([0, 0, 0, ..., 0, 2, 0], dtype=int64),
 'XGBClassifier4': array([0, 0, 2, ..., 0, 2, 0], dtype=int64),
 'XGBClassifier5': array([0, 0, 0, ..., 0, 2, 0], dtype=int64),
 'XGBClassifier6': array([0, 0, 0, ..., 1, 2, 0], dtype=int64),
 'XGBClassifier7': array([0, 0, 0, ..., 0, 2, 0], dtype=int64),
 'XGBClassifier8': array([0, 0, 0, ..., 0, 2, 0], dtype=int64)}

In [41]:
# Keep shallow copies of this run's predictions and fitted models under Borderline-specific names.
borderline_smote_predictions = pipeline.predictions.copy()
borderline_smote_classifiers = pipeline.fitted_classifiers.copy()

# Metrics table: one row per entry (including the "GT" reference row), one column per metric.
metric_names = ["accuracy", "precision", "recall", "f1", "kappa"]
df_borderline_smote = pd.DataFrame(data=pipeline.calculate_metrics(metric_names)).T
df_borderline_smote

INFO:utils.ml:Metrics for classifier GT: {'accuracy': 1.0, 'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'kappa': 1.0}
INFO:utils.ml:Metrics for classifier XGBClassifier0: {'accuracy': 0.8826771653543307, 'precision': 0.7960991841754254, 'recall': 0.7769997960838522, 'f1': 0.7854260171501551, 'kappa': 0.7879072499114552}
INFO:utils.ml:Metrics for classifier XGBClassifier1: {'accuracy': 0.878740157480315, 'precision': 0.7975821336286453, 'recall': 0.7848046929448794, 'f1': 0.7908011827759959, 'kappa': 0.7809324539866306}
INFO:utils.ml:Metrics for classifier XGBClassifier2: {'accuracy': 0.8811023622047244, 'precision': 0.7965245703121483, 'recall': 0.7787155652043939, 'f1': 0.7866550780653344, 'kappa': 0.785157650390765}
INFO:utils.ml:Metrics for classifier XGBClassifier3: {'accuracy': 0.8748031496062992, 'precision': 0.7849104519067027, 'recall': 0.7733608128616897, 'f1': 0.7787580585556823, 'kappa': 0.7739110378617557}
INFO:utils.ml:Metrics for classifier XGBClassifier4: {'accuracy': 0.8

Unnamed: 0,accuracy,precision,recall,f1,kappa
GT,1.0,1.0,1.0,1.0,1.0
XGBClassifier0,0.882677,0.796099,0.777,0.785426,0.787907
XGBClassifier1,0.87874,0.797582,0.784805,0.790801,0.780932
XGBClassifier2,0.881102,0.796525,0.778716,0.786655,0.785158
XGBClassifier3,0.874803,0.78491,0.773361,0.778758,0.773911
XGBClassifier4,0.877165,0.79673,0.780589,0.788039,0.777842
XGBClassifier5,0.883465,0.801696,0.7836,0.791639,0.789511
XGBClassifier6,0.877165,0.788957,0.768727,0.777831,0.777314
XGBClassifier7,0.876378,0.805867,0.774166,0.787796,0.775422
XGBClassifier8,0.876378,0.805867,0.774166,0.787796,0.775422


In [47]:
# Name of the classifier with the highest validation kappa. The first row ("GT",
# the ground truth scored against itself) is excluded from the search itself; the
# previous equality filter ran over all rows and could therefore have returned "GT"
# whenever a classifier reached kappa == 1.0.
idx = df_borderline_smote.iloc[1:]["kappa"].idxmax()

# Confusion matrix of that classifier: rows are true labels, columns are predictions.
confusion_matrix(pipeline.predictions["GT"], pipeline.predictions[idx])

array([[446,  32,  20],
       [ 40, 626,  12],
       [ 32,  12,  50]], dtype=int64)

# Cross-validation on the combined training and validation sets

In [None]:
# Loader over the validation set, using the pipeline's batch size and the same
# preprocessing factory, without shuffling.
val_loader = FactoryLoader(VAL_PATH, pipeline.batch_size, factory, shuffle=False)

# val_loader.show_images(100)  # optional visual check of the loaded images

# Run the pipeline's feature extraction strategy on the validation batches.
val_feature_matrix, val_labels = pipeline.feature_strategy.run(val_loader.get_loader())

In [49]:

# Stack the original (not oversampled) training data with the validation data.
combined_feature_matrix = np.vstack([pipeline.backup_feature_matrix, val_feature_matrix])
combined_labels = np.hstack([pipeline.backup_labels, val_labels])

print(combined_feature_matrix.shape)
print(combined_labels.shape)

# Balance the combined set. The resampler must be fed the combined arrays built
# above; it previously received pipeline.feature_matrix / pipeline.labels, i.e. the
# already oversampled training data without any validation samples.
# NOTE(review): oversampling before splitting into folds lets synthetic samples leak
# between folds; for cross-validation, resample inside each fold instead.
smote = SMOTE(random_state=42)
new_combined_feature_matrix, new_combined_labels = smote.fit_resample(
    np.nan_to_num(combined_feature_matrix), combined_labels
)

print(new_combined_feature_matrix.shape)
print(new_combined_labels.shape)

(6352, 519)
(6352,)


In [52]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer

# Scorer that evaluates each fold with Cohen's kappa.
kappa_scorer = make_scorer(cohen_kappa_score)

# 10-fold cross-validation of the "XGBClassifier0" model on the combined
# (training + validation) features and labels, without oversampling.
kappa_scores = cross_val_score(
    smote_classifiers["XGBClassifier0"],
    combined_feature_matrix,
    combined_labels,
    scoring=kappa_scorer,
    cv=10,
)

# Per-fold scores and their mean.
print("Cross-validation Kappa scores:", kappa_scores)
print("Average Kappa score:", kappa_scores.mean())

Cross-validation Kappa scores: [0.77124177 0.73338738 0.73136772 0.75889723 0.73866863 0.74554842
 0.74609806 0.73142993 0.79129577 0.77961055]
Average Kappa score: 0.7527545463247926


___
# TEST SET

In [None]:
# Loader over the test set, using the pipeline's batch size and the preprocessing
# factory obtained from the pipeline's own loader, without shuffling.
test_loader = FactoryLoader(TEST_PATH, pipeline.batch_size, pipeline.loader.get_factory(), shuffle=False)

# test_loader.show_images(100)

# Run the pipeline's feature extraction strategy on the test batches; the second
# return value (labels) is discarded.
test_feature_matrix, _ = pipeline.feature_strategy.run(test_loader.get_loader())
test_loader.show_images(40)

In [None]:
# Choose the SMOTE-trained classifier with the highest validation kappa (the first
# row of df_smote is the "GT" reference and is skipped). The previously hard-coded
# key "XGBClassifier14" does not exist among the fitted models (XGBClassifier0..8)
# and raised a KeyError.
best_smote_name = df_smote.iloc[1:]["kappa"].idxmax()
best_smote_clf = smote_classifiers[best_smote_name]
print("Predicting the test set with:", best_smote_name)

# NOTE(review): the SMOTE training features went through np.nan_to_num; confirm
# whether the test features need the same treatment before predicting.
test_predictions = best_smote_clf.predict(test_feature_matrix)

# Number of test samples assigned to each predicted class (the earlier histogram
# call computed this but its result was thrown away).
predicted_classes, predicted_counts = np.unique(test_predictions, return_counts=True)
print(dict(zip(predicted_classes.tolist(), predicted_counts.tolist())))

# One predicted class per row, written without index or header.
result = pd.DataFrame(data=test_predictions.reshape(-1,1), columns=["class"])
result.to_excel("multiclass_results.xlsx", index=False, header=False)

# Feature importances of the chosen model, indexed by feature name.
feature_importances_df = pd.DataFrame(data=best_smote_clf.feature_importances_, index=strategy.get_feature_names())
