In [28]:
import os

import numpy as np
from sklearn.metrics import confusion_matrix, recall_score, cohen_kappa_score

from utils.loader import FactoryLoader
from utils.ml import MLPipeline
from utils.preprocessing import PreprocessingFactory
from utils.feature_extraction import *
from utils.utils import *

# Root folder of the dataset. It can be overridden with the CADX_DATASET_DIR
# environment variable so the notebook is not tied to a single machine; the
# default keeps the original local checkout location.
DATASET_DIR = os.environ.get("CADX_DATASET_DIR", r"C:\Users\gimes\Src\repos\CADx-Project\dataset")

# Validation and training splits of the multiclass dataset (plain strings, as before).
VAL_PATH = os.path.join(DATASET_DIR, "multiclass", "val")
TRAIN_PATH = os.path.join(DATASET_DIR, "multiclass", "train")

In [29]:
# --- Run configuration ---------------------------------------------------
percent = 100     # passed as `percentage` to the loader and the pipeline
random = False    # passed as `shuffle`; NOTE(review): same name as the stdlib `random` module
batch_size = 24
th = 0.1          # threshold shared by the gradient and colour-moment extractors

# --- Preprocessing ---------------------------------------------------------
# Steps are registered in this order: pad to square (fill value NaN),
# resize to 150x150, normalize2float.
factory = PreprocessingFactory()
factory.pad2square(fill=np.nan)
factory.resize((150,150))
factory.normalize2float()

# Loader over the validation split, built with the same preprocessing factory.
factory_loader = FactoryLoader(
    path=VAL_PATH,
    batch_size=batch_size,
    factory=factory,
    percentage=percent,
    shuffle=random,
)

# --- Feature extraction strategy -----------------------------------------
# Extractors are registered in a fixed order: gradient, colour moments,
# LBP, Fourier/FFT, GLCM.
strategy = FeatureExtractionStrategy()

strategy.add_extractor(GradientExtractor(threshold=th))

# Colour moments in three colour spaces.
for color_space in ("rgb", "lab", "hsv"):
    strategy.add_extractor(ColorMomentsExtractor(color_space, threshold=th))

# LBP at radii 1..5 with 8 sampling points per unit of radius (8, 16, 24, 32, 40).
for lbp_radius in range(1, 6):
    strategy.add_extractor(LBPExtractor(radius=lbp_radius, n_points=8 * lbp_radius))

strategy.add_extractor(FourierTransformExtractor())
strategy.add_extractor(FFTExtractor())

glcm_properties = ['contrast', 'dissimilarity', 'homogeneity', 'energy', 'correlation', "ASM"]
strategy.add_extractor(GLCMExtractor(properties=glcm_properties))

# --- Pipeline --------------------------------------------------------------
# Built on the training split with no classifiers yet; they are attached in a later cell.
pipeline = MLPipeline(
    dataset_path=TRAIN_PATH,
    preprocessing_factory=factory,
    feature_strategy=strategy,
    classifiers=[],
    percentage=percent,
    verbose=True,
    shuffle=random,
    batch_size=batch_size,
)

INFO:utils.ml:MLPipeline initialized with dataset path: C:\Users\gimes\Src\repos\CADx-Project\dataset\multiclass\train
INFO:utils.ml:Preprocessing steps


In [30]:
# Clear any previously stored feature matrix before extracting again
# (presumably this forces MLPipeline to recompute instead of reusing it — TODO confirm).
pipeline.feature_matrix = None
# Extract features for the dataset the pipeline was built on (TRAIN_PATH).
pipeline.run_feature_extraction()

INFO:utils.ml:Running feature extraction...


Processed 5/212 batches.
Processed 10/212 batches.
Processed 15/212 batches.
Processed 20/212 batches.
Processed 25/212 batches.
Processed 30/212 batches.
Processed 35/212 batches.
Processed 40/212 batches.
Processed 45/212 batches.
Processed 50/212 batches.
Processed 55/212 batches.
Processed 60/212 batches.
Processed 65/212 batches.
Processed 70/212 batches.
Processed 75/212 batches.
Processed 80/212 batches.
Processed 85/212 batches.
Processed 90/212 batches.
Processed 95/212 batches.
Processed 100/212 batches.
Processed 105/212 batches.
Processed 110/212 batches.
Processed 115/212 batches.
Processed 120/212 batches.
Processed 125/212 batches.
Processed 130/212 batches.
Processed 135/212 batches.
Processed 140/212 batches.
Processed 145/212 batches.
Processed 150/212 batches.
Processed 155/212 batches.
Processed 160/212 batches.
Processed 165/212 batches.
Processed 170/212 batches.
Processed 175/212 batches.
Processed 180/212 batches.
Processed 185/212 batches.
Processed 190/212 bat

INFO:utils.ml:Feature extraction completed. Extracted 5082 features.


Processed 212/212 batches.


In [31]:
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier

# --- Random forests (all seeded with random_state=42) ----------------------
# rf0 / rf2 / rf3: bootstrap disabled, differing only in the number of trees.
# rf1: default bootstrap with the out-of-bag (OOB) score enabled.
_rf_no_bootstrap = dict(bootstrap=False, random_state=42)
rf0 = RandomForestClassifier(n_estimators=100, **_rf_no_bootstrap)
rf1 = RandomForestClassifier(n_estimators=150, oob_score=True, random_state=42)
rf2 = RandomForestClassifier(n_estimators=150, **_rf_no_bootstrap)
rf3 = RandomForestClassifier(n_estimators=200, **_rf_no_bootstrap)

# --- XGBoost ---------------------------------------------------------------
# Four hyper-parameter recipes (learning_rate is 0.1 in all of them), each
# instantiated at several tree counts.
_xgb_a = dict(learning_rate=0.1, max_depth=7, min_child_weight=3, subsample=0.7, colsample_bytree=0.7)
_xgb_b = dict(learning_rate=0.1, max_depth=5, min_child_weight=5, gamma=0.2, subsample=0.8, colsample_bytree=0.8)
_xgb_c = dict(learning_rate=0.1, max_depth=5, min_child_weight=1, subsample=0.8, colsample_bytree=0.8,
              reg_alpha=0.1, reg_lambda=0.1)
_xgb_d = dict(learning_rate=0.1, max_depth=7, min_child_weight=1, subsample=0.8, colsample_bytree=0.8)

# 400 trees. The "2ND" / "THIS" marks are the author's own annotations
# (presumably rankings from earlier runs — TODO confirm).
xgb4 = XGBClassifier(n_estimators=400, **_xgb_a)  # 2ND
xgb5 = XGBClassifier(n_estimators=400, **_xgb_b)  # THIS
xgb6 = XGBClassifier(n_estimators=400, **_xgb_c)
xgb7 = XGBClassifier(n_estimators=400, **_xgb_d)

# 500 trees.
xgb8 = XGBClassifier(n_estimators=500, **_xgb_d)

# 750 trees.
xgb9 = XGBClassifier(n_estimators=750, **_xgb_a)
xgb10 = XGBClassifier(n_estimators=750, **_xgb_b)  # THIS
xgb11 = XGBClassifier(n_estimators=750, **_xgb_c)
xgb12 = XGBClassifier(n_estimators=750, **_xgb_d)

# 1000 trees.
xgb13 = XGBClassifier(n_estimators=1000, **_xgb_a)
xgb14 = XGBClassifier(n_estimators=1000, **_xgb_b)  # THIS
xgb15 = XGBClassifier(n_estimators=1000, **_xgb_c)
xgb16 = XGBClassifier(n_estimators=1000, **_xgb_d)

# Models evaluated in this run: 4 random forests followed by xgb8..xgb16
# (13 in total). xgb4..xgb7 are defined above but not included, and the SVM
# candidates from earlier experiments are not part of this run.
# NOTE(review): the pipeline logs appear to name models by class and list
# position, so "XGBClassifier4" in the output is xgb8, not xgb4 — verify.
pipeline.classifiers = [
    rf0, rf1, rf2, rf3,
    xgb8, xgb9, xgb10, xgb11, xgb12, xgb13, xgb14, xgb15, xgb16,
]
# Forget any models fitted in a previous run of the notebook.
pipeline.fitted_classifiers = {}

In [32]:
# Fit every estimator in pipeline.classifiers on the extracted training features.
# The log reports each model as <ClassName><index> together with its top-10 features and fit time.
pipeline.fit_classifiers()

INFO:utils.ml:Fitting classifiers...
INFO:utils.ml:Fitting classifier: RandomForestClassifier0
INFO:utils.ml:Top 10 features for RandomForestClassifier0: [('color_moments_lab_B_std', 0.04215343644857018), ('color_moments_lab_B_var', 0.03128957847280606), ('color_moments_lab_L_iqr', 0.022582198396617043), ('color_moments_rgb_B_std', 0.020792965896980386), ('color_moments_rgb_B_iqr', 0.019230327362332614), ('color_moments_rgb_B_var', 0.019047079061545246), ('lbp_rad1_bins8_4', 0.011604825198381869), ('color_moments_rgb_B_kurtosis', 0.011532230120517272), ('color_moments_lab_B_skew', 0.011445582192714783), ('color_moments_lab_B_iqr', 0.011086083782091081)]
INFO:utils.ml:Fitted classifier: RandomForestClassifier0; Done in 5.609627962112427 seconds
INFO:utils.ml:Fitting classifier: RandomForestClassifier1
INFO:utils.ml:Top 10 features for RandomForestClassifier1: [('color_moments_lab_B_std', 0.04051510272458046), ('color_moments_lab_B_var', 0.030963890623933097), ('color_moments_lab_L_iqr',

In [33]:
# Predict on the validation split with every fitted classifier. The returned dict
# holds one label array per classifier plus a "GT" entry (presumably the ground-truth labels).
pipeline.predict_with_classifiers(VAL_PATH, percent)

INFO:utils.ml:Predicting with classifiers on dataset: C:\Users\gimes\Src\repos\CADx-Project\dataset\multiclass\val


Processed 5/53 batches.
Processed 10/53 batches.
Processed 15/53 batches.
Processed 20/53 batches.
Processed 25/53 batches.
Processed 30/53 batches.
Processed 35/53 batches.
Processed 40/53 batches.
Processed 45/53 batches.
Processed 50/53 batches.


INFO:utils.ml:Predictions made with classifier: RandomForestClassifier0
INFO:utils.ml:Predictions made with classifier: RandomForestClassifier1
INFO:utils.ml:Predictions made with classifier: RandomForestClassifier2
INFO:utils.ml:Predictions made with classifier: RandomForestClassifier3
INFO:utils.ml:Predictions made with classifier: XGBClassifier4
INFO:utils.ml:Predictions made with classifier: XGBClassifier5
INFO:utils.ml:Predictions made with classifier: XGBClassifier6
INFO:utils.ml:Predictions made with classifier: XGBClassifier7
INFO:utils.ml:Predictions made with classifier: XGBClassifier8
INFO:utils.ml:Predictions made with classifier: XGBClassifier9
INFO:utils.ml:Predictions made with classifier: XGBClassifier10
INFO:utils.ml:Predictions made with classifier: XGBClassifier11
INFO:utils.ml:Predictions made with classifier: XGBClassifier12


Processed 53/53 batches.


{'GT': array([0, 0, 0, ..., 2, 2, 2]),
 'RandomForestClassifier0': array([0, 0, 0, ..., 1, 0, 0]),
 'RandomForestClassifier1': array([0, 0, 0, ..., 1, 0, 0]),
 'RandomForestClassifier2': array([0, 0, 0, ..., 1, 0, 0]),
 'RandomForestClassifier3': array([0, 0, 0, ..., 1, 0, 0]),
 'XGBClassifier4': array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
 'XGBClassifier5': array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
 'XGBClassifier6': array([0, 0, 0, ..., 0, 2, 0], dtype=int64),
 'XGBClassifier7': array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
 'XGBClassifier8': array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
 'XGBClassifier9': array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
 'XGBClassifier10': array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
 'XGBClassifier11': array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
 'XGBClassifier12': array([0, 0, 0, ..., 0, 0, 0], dtype=int64)}

In [34]:
import pandas as pd

# Build the metrics table and transpose it so that each row is one classifier
# (plus the "GT" reference row) and each column is one metric.
df = pd.DataFrame(data=pipeline.calculate_metrics()).T
df

INFO:utils.ml:Metrics for classifier GT: {'accuracy': 1.0, 'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'kappa': 1.0}
INFO:utils.ml:Metrics for classifier RandomForestClassifier0: {'accuracy': 0.841732283464567, 'precision': 0.8432633294001132, 'recall': 0.6389861208469099, 'f1': 0.6503783001865329, 'kappa': 0.7022729287283822}
INFO:utils.ml:Metrics for classifier RandomForestClassifier1: {'accuracy': 0.8291338582677166, 'precision': 0.8816627816627817, 'recall': 0.6210679660838799, 'f1': 0.6262076017069599, 'kappa': 0.677814201006341}
INFO:utils.ml:Metrics for classifier RandomForestClassifier2: {'accuracy': 0.8401574803149606, 'precision': 0.8889376946880453, 'recall': 0.635126081429843, 'f1': 0.6449280574643127, 'kappa': 0.6990113759479957}
INFO:utils.ml:Metrics for classifier RandomForestClassifier3: {'accuracy': 0.8448818897637795, 'precision': 0.8921406016444869, 'recall': 0.6472393055558707, 'f1': 0.6639505174339944, 'kappa': 0.7080752733840818}
INFO:utils.ml:Metrics for classifi

Unnamed: 0,accuracy,precision,recall,f1,kappa
GT,1.0,1.0,1.0,1.0,1.0
RandomForestClassifier0,0.841732,0.843263,0.638986,0.650378,0.702273
RandomForestClassifier1,0.829134,0.881663,0.621068,0.626208,0.677814
RandomForestClassifier2,0.840157,0.888938,0.635126,0.644928,0.699011
RandomForestClassifier3,0.844882,0.892141,0.647239,0.663951,0.708075
XGBClassifier4,0.867717,0.851836,0.694529,0.722224,0.753619
XGBClassifier5,0.871654,0.821496,0.703096,0.728705,0.761878
XGBClassifier6,0.871654,0.830095,0.698054,0.722879,0.761682
XGBClassifier7,0.870079,0.815953,0.690962,0.712794,0.758795
XGBClassifier8,0.869291,0.838142,0.698745,0.725617,0.757144


In [35]:
# Pick the classifier with the highest Cohen's kappa.
# The "GT" reference row (kappa 1.0) is dropped by label so it can never be
# selected, and idxmax() replaces the previous float-equality filter, which ran
# over the whole frame and would have returned "GT" if any model reached 1.0.
kappa_by_classifier = df["kappa"].drop(labels="GT", errors="ignore").astype(float)
idx = kappa_by_classifier.idxmax()
highest = df.loc[[idx]]  # one-row frame with the winning classifier's metrics

# Confusion matrix of the winner: rows are ground-truth classes, columns are predictions.
confusion_matrix(pipeline.predictions["GT"], pipeline.predictions[idx])

array([[449,  41,   8],
       [ 42, 633,   3],
       [ 47,  21,  26]], dtype=int64)

___
# Balance the data

In [36]:
# Backup
pipeline.backup_feature_matrix = pipeline.feature_matrix.copy()
pipeline.backup_labels = pipeline.labels.copy()

In [37]:
# Sanity check: print the shapes of the backup and live arrays
# (features first, then labels) so they can be compared.
for _array in (
    pipeline.backup_feature_matrix,
    pipeline.feature_matrix,
    pipeline.backup_labels,
    pipeline.labels,
):
    print(_array.shape)

(5082, 189)
(5082, 189)
(5082,)
(5082,)


In [38]:
# Show the names of the extracted features (presumably in feature-matrix column order — TODO confirm).
pipeline.get_feature_names()

['gradient_magnitude_mean',
 'gradient_magnitude_std',
 'gradient_direction_mean',
 'gradient_direction_std',
 'color_moments_rgb_R_mean',
 'color_moments_rgb_R_std',
 'color_moments_rgb_R_skew',
 'color_moments_rgb_R_kurtosis',
 'color_moments_rgb_R_median',
 'color_moments_rgb_R_var',
 'color_moments_rgb_R_min',
 'color_moments_rgb_R_max',
 'color_moments_rgb_R_iqr',
 'color_moments_rgb_R_entropy',
 'color_moments_rgb_G_mean',
 'color_moments_rgb_G_std',
 'color_moments_rgb_G_skew',
 'color_moments_rgb_G_kurtosis',
 'color_moments_rgb_G_median',
 'color_moments_rgb_G_var',
 'color_moments_rgb_G_min',
 'color_moments_rgb_G_max',
 'color_moments_rgb_G_iqr',
 'color_moments_rgb_G_entropy',
 'color_moments_rgb_B_mean',
 'color_moments_rgb_B_std',
 'color_moments_rgb_B_skew',
 'color_moments_rgb_B_kurtosis',
 'color_moments_rgb_B_median',
 'color_moments_rgb_B_var',
 'color_moments_rgb_B_min',
 'color_moments_rgb_B_max',
 'color_moments_rgb_B_iqr',
 'color_moments_rgb_B_entropy',
 'color_

In [39]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)

# Oversample the minority classes of the training set with SMOTE.
# The input is always the untouched backup (not pipeline.feature_matrix), so
# re-running this cell cannot oversample already-resampled data; this matches
# the Borderline-SMOTE cell further down. NaNs are replaced with 0 first via
# np.nan_to_num before the matrix is handed to fit_resample.
pipeline.feature_matrix, pipeline.labels = smote.fit_resample(
    np.nan_to_num(pipeline.backup_feature_matrix), pipeline.backup_labels)

In [40]:
# Drop the models fitted on the original data, then refit the same
# classifier list on the SMOTE-resampled training set.
pipeline.fitted_classifiers = {}
pipeline.fit_classifiers()

INFO:utils.ml:Fitting classifiers...
INFO:utils.ml:Fitting classifier: RandomForestClassifier0
INFO:utils.ml:Top 10 features for RandomForestClassifier0: [('color_moments_lab_B_std', 0.02785575828196386), ('color_moments_rgb_B_std', 0.020073434168764264), ('color_moments_lab_L_iqr', 0.018552789115056992), ('color_moments_lab_B_var', 0.018013427787625942), ('color_moments_rgb_B_iqr', 0.01795349239382182), ('color_moments_rgb_B_var', 0.015061474715758466), ('gradient_direction_std', 0.015023963244203098), ('lbp_rad1_bins8_4', 0.012304962013360001), ('color_moments_rgb_G_iqr', 0.009811632383418017), ('color_moments_rgb_R_kurtosis', 0.009700275364815615)]
INFO:utils.ml:Fitted classifier: RandomForestClassifier0; Done in 11.045897960662842 seconds
INFO:utils.ml:Fitting classifier: RandomForestClassifier1
INFO:utils.ml:Top 10 features for RandomForestClassifier1: [('color_moments_lab_B_std', 0.026315139896826562), ('color_moments_lab_L_iqr', 0.022162327827716485), ('color_moments_lab_B_var',

In [41]:
# Predict on the (unresampled) validation split with the classifiers trained on SMOTE data.
pipeline.predict_with_classifiers(VAL_PATH, percent)

INFO:utils.ml:Predicting with classifiers on dataset: C:\Users\gimes\Src\repos\CADx-Project\dataset\multiclass\val


Processed 5/53 batches.
Processed 10/53 batches.
Processed 15/53 batches.
Processed 20/53 batches.
Processed 25/53 batches.
Processed 30/53 batches.
Processed 35/53 batches.
Processed 40/53 batches.
Processed 45/53 batches.
Processed 50/53 batches.


INFO:utils.ml:Predictions made with classifier: RandomForestClassifier0
INFO:utils.ml:Predictions made with classifier: RandomForestClassifier1
INFO:utils.ml:Predictions made with classifier: RandomForestClassifier2
INFO:utils.ml:Predictions made with classifier: RandomForestClassifier3
INFO:utils.ml:Predictions made with classifier: XGBClassifier4
INFO:utils.ml:Predictions made with classifier: XGBClassifier5
INFO:utils.ml:Predictions made with classifier: XGBClassifier6
INFO:utils.ml:Predictions made with classifier: XGBClassifier7
INFO:utils.ml:Predictions made with classifier: XGBClassifier8
INFO:utils.ml:Predictions made with classifier: XGBClassifier9
INFO:utils.ml:Predictions made with classifier: XGBClassifier10
INFO:utils.ml:Predictions made with classifier: XGBClassifier11


Processed 53/53 batches.


INFO:utils.ml:Predictions made with classifier: XGBClassifier12


{'GT': array([0, 0, 0, ..., 2, 2, 2]),
 'RandomForestClassifier0': array([0, 0, 2, ..., 1, 0, 0]),
 'RandomForestClassifier1': array([0, 0, 2, ..., 1, 0, 0]),
 'RandomForestClassifier2': array([0, 0, 2, ..., 1, 0, 0]),
 'RandomForestClassifier3': array([0, 0, 2, ..., 1, 0, 0]),
 'XGBClassifier4': array([0, 0, 2, ..., 0, 0, 0], dtype=int64),
 'XGBClassifier5': array([0, 0, 2, ..., 0, 0, 0], dtype=int64),
 'XGBClassifier6': array([0, 0, 2, ..., 0, 0, 0], dtype=int64),
 'XGBClassifier7': array([0, 0, 2, ..., 0, 0, 0], dtype=int64),
 'XGBClassifier8': array([0, 0, 2, ..., 0, 0, 0], dtype=int64),
 'XGBClassifier9': array([0, 0, 2, ..., 0, 0, 0], dtype=int64),
 'XGBClassifier10': array([0, 0, 2, ..., 0, 0, 0], dtype=int64),
 'XGBClassifier11': array([0, 0, 2, ..., 0, 0, 0], dtype=int64),
 'XGBClassifier12': array([0, 0, 2, ..., 0, 0, 0], dtype=int64)}

In [42]:
import pandas as pd

# Metrics of the SMOTE-trained models, transposed to one row per classifier
# and one column per metric.
df_smote = pd.DataFrame(data=pipeline.calculate_metrics()).T
df_smote

INFO:utils.ml:Metrics for classifier GT: {'accuracy': 1.0, 'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'kappa': 1.0}
INFO:utils.ml:Metrics for classifier RandomForestClassifier0: {'accuracy': 0.8362204724409449, 'precision': 0.7376897970767703, 'recall': 0.722524567610806, 'f1': 0.7290675080441099, 'kappa': 0.7042225765420377}
INFO:utils.ml:Metrics for classifier RandomForestClassifier1: {'accuracy': 0.8275590551181102, 'precision': 0.7202776587643637, 'recall': 0.7218038034268499, 'f1': 0.7209435872528968, 'kappa': 0.6906490663748115}
INFO:utils.ml:Metrics for classifier RandomForestClassifier2: {'accuracy': 0.8354330708661417, 'precision': 0.7363151613625997, 'recall': 0.7161017130720837, 'f1': 0.7244461254762924, 'kappa': 0.7023219404531731}
INFO:utils.ml:Metrics for classifier RandomForestClassifier3: {'accuracy': 0.8385826771653543, 'precision': 0.742108379670829, 'recall': 0.7187790892434358, 'f1': 0.7281125522131936, 'kappa': 0.7079903093384777}
INFO:utils.ml:Metrics for classif

Unnamed: 0,accuracy,precision,recall,f1,kappa
GT,1.0,1.0,1.0,1.0,1.0
RandomForestClassifier0,0.83622,0.73769,0.722525,0.729068,0.704223
RandomForestClassifier1,0.827559,0.720278,0.721804,0.720944,0.690649
RandomForestClassifier2,0.835433,0.736315,0.716102,0.724446,0.702322
RandomForestClassifier3,0.838583,0.742108,0.718779,0.728113,0.70799
XGBClassifier4,0.874803,0.778322,0.770306,0.774109,0.774161
XGBClassifier5,0.866142,0.775351,0.755024,0.764029,0.757465
XGBClassifier6,0.862992,0.76807,0.758989,0.76324,0.752843
XGBClassifier7,0.873228,0.785685,0.769678,0.776932,0.770849
XGBClassifier8,0.874803,0.780013,0.767074,0.77309,0.773626


In [43]:
# Pick the SMOTE-trained classifier with the highest Cohen's kappa.
# The "GT" reference row (kappa 1.0) is dropped by label so it can never be
# selected, and idxmax() replaces the previous float-equality filter, which ran
# over the whole frame and would have returned "GT" if any model reached 1.0.
kappa_by_classifier = df_smote["kappa"].drop(labels="GT", errors="ignore").astype(float)
idx = kappa_by_classifier.idxmax()
highest = df_smote.loc[[idx]]  # one-row frame with the winning classifier's metrics

# Confusion matrix of the winner: rows are ground-truth classes, columns are predictions.
confusion_matrix(pipeline.predictions["GT"], pipeline.predictions[idx])

array([[438,  36,  24],
       [ 38, 627,  13],
       [ 32,  15,  47]], dtype=int64)

___
# Borderline SMOTE

In [44]:
from imblearn.over_sampling import BorderlineSMOTE

smote = BorderlineSMOTE(sampling_strategy='auto', random_state=42)

# Resample from the backup copies (NaNs replaced via np.nan_to_num), then
# install the resampled arrays as the pipeline's training data.
backup_features_no_nan = np.nan_to_num(pipeline.backup_feature_matrix)
resampled_features, resampled_labels = smote.fit_resample(
    backup_features_no_nan, pipeline.backup_labels)
pipeline.feature_matrix = resampled_features
pipeline.labels = resampled_labels

# Shapes before/after resampling: features first, then labels.
for _array in (
    pipeline.backup_feature_matrix,
    pipeline.feature_matrix,
    pipeline.backup_labels,
    pipeline.labels,
):
    print(_array.shape)

(5082, 189)
(8139, 189)
(5082,)
(8139,)


In [45]:
# Drop the previously fitted models so nothing trained on earlier data is reused.
pipeline.fitted_classifiers = {}

# Refit the same classifier list on the Borderline-SMOTE-resampled training set.
pipeline.fit_classifiers()

INFO:utils.ml:Fitting classifiers...
INFO:utils.ml:Fitting classifier: RandomForestClassifier0
INFO:utils.ml:Top 10 features for RandomForestClassifier0: [('color_moments_lab_B_std', 0.0268864453409031), ('color_moments_lab_L_iqr', 0.02044096228212243), ('color_moments_lab_B_var', 0.019790453619906542), ('color_moments_rgb_B_std', 0.017477040576347896), ('color_moments_rgb_B_iqr', 0.015372329569749401), ('color_moments_rgb_B_var', 0.013501314181172504), ('lbp_rad1_bins8_4', 0.011843153025938344), ('color_moments_rgb_R_kurtosis', 0.011026341290058854), ('color_moments_rgb_B_kurtosis', 0.01083527029073624), ('color_moments_rgb_G_skew', 0.010716065380903974)]
INFO:utils.ml:Fitted classifier: RandomForestClassifier0; Done in 10.602392673492432 seconds
INFO:utils.ml:Fitting classifier: RandomForestClassifier1
INFO:utils.ml:Top 10 features for RandomForestClassifier1: [('color_moments_lab_B_std', 0.025447869555592216), ('color_moments_lab_L_iqr', 0.0215959513748903), ('color_moments_lab_B_va

In [46]:
# Predict on the (unresampled) validation split with the classifiers trained on Borderline-SMOTE data.
pipeline.predict_with_classifiers(VAL_PATH, percent)

INFO:utils.ml:Predicting with classifiers on dataset: C:\Users\gimes\Src\repos\CADx-Project\dataset\multiclass\val


Processed 5/53 batches.
Processed 10/53 batches.
Processed 15/53 batches.
Processed 20/53 batches.
Processed 25/53 batches.
Processed 30/53 batches.
Processed 35/53 batches.
Processed 40/53 batches.
Processed 45/53 batches.
Processed 50/53 batches.


INFO:utils.ml:Predictions made with classifier: RandomForestClassifier0
INFO:utils.ml:Predictions made with classifier: RandomForestClassifier1
INFO:utils.ml:Predictions made with classifier: RandomForestClassifier2
INFO:utils.ml:Predictions made with classifier: RandomForestClassifier3
INFO:utils.ml:Predictions made with classifier: XGBClassifier4
INFO:utils.ml:Predictions made with classifier: XGBClassifier5
INFO:utils.ml:Predictions made with classifier: XGBClassifier6
INFO:utils.ml:Predictions made with classifier: XGBClassifier7
INFO:utils.ml:Predictions made with classifier: XGBClassifier8
INFO:utils.ml:Predictions made with classifier: XGBClassifier9
INFO:utils.ml:Predictions made with classifier: XGBClassifier10


Processed 53/53 batches.


INFO:utils.ml:Predictions made with classifier: XGBClassifier11
INFO:utils.ml:Predictions made with classifier: XGBClassifier12


{'GT': array([0, 0, 0, ..., 2, 2, 2]),
 'RandomForestClassifier0': array([0, 0, 2, ..., 1, 0, 0]),
 'RandomForestClassifier1': array([0, 0, 2, ..., 1, 0, 0]),
 'RandomForestClassifier2': array([0, 0, 2, ..., 1, 0, 0]),
 'RandomForestClassifier3': array([0, 0, 2, ..., 1, 0, 0]),
 'XGBClassifier4': array([0, 0, 2, ..., 0, 0, 0], dtype=int64),
 'XGBClassifier5': array([0, 0, 2, ..., 0, 0, 0], dtype=int64),
 'XGBClassifier6': array([0, 0, 2, ..., 0, 0, 0], dtype=int64),
 'XGBClassifier7': array([0, 0, 2, ..., 0, 0, 0], dtype=int64),
 'XGBClassifier8': array([0, 0, 2, ..., 0, 0, 0], dtype=int64),
 'XGBClassifier9': array([0, 0, 2, ..., 0, 0, 0], dtype=int64),
 'XGBClassifier10': array([0, 0, 2, ..., 0, 0, 0], dtype=int64),
 'XGBClassifier11': array([0, 0, 2, ..., 0, 0, 0], dtype=int64),
 'XGBClassifier12': array([0, 0, 2, ..., 0, 0, 0], dtype=int64)}

In [47]:
# Metrics requested explicitly by name for the Borderline-SMOTE models;
# the result is transposed to one row per classifier, one column per metric.
metric_names = ["accuracy", "precision", "recall", "f1", "kappa"]
df_borderline_smote = pd.DataFrame(data=pipeline.calculate_metrics(metric_names)).T
df_borderline_smote

INFO:utils.ml:Metrics for classifier GT: {'accuracy': 1.0, 'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'kappa': 1.0}
INFO:utils.ml:Metrics for classifier RandomForestClassifier0: {'accuracy': 0.8433070866141732, 'precision': 0.7493291233171293, 'recall': 0.7247833989570283, 'f1': 0.734778349453585, 'kappa': 0.7162505052319577}
INFO:utils.ml:Metrics for classifier RandomForestClassifier1: {'accuracy': 0.831496062992126, 'precision': 0.724771322921228, 'recall': 0.710055939568227, 'f1': 0.716208445676218, 'kappa': 0.6957981688343668}
INFO:utils.ml:Metrics for classifier RandomForestClassifier2: {'accuracy': 0.8440944881889764, 'precision': 0.7537905406616977, 'recall': 0.7250973390833151, 'f1': 0.7366623342686385, 'kappa': 0.7171912114014252}
INFO:utils.ml:Metrics for classifier RandomForestClassifier3: {'accuracy': 0.8448818897637795, 'precision': 0.7562745903582903, 'recall': 0.728288034457544, 'f1': 0.7397858551704705, 'kappa': 0.7185764935614207}
INFO:utils.ml:Metrics for classifier 

Unnamed: 0,accuracy,precision,recall,f1,kappa
GT,1.0,1.0,1.0,1.0,1.0
RandomForestClassifier0,0.843307,0.749329,0.724783,0.734778,0.716251
RandomForestClassifier1,0.831496,0.724771,0.710056,0.716208,0.695798
RandomForestClassifier2,0.844094,0.753791,0.725097,0.736662,0.717191
RandomForestClassifier3,0.844882,0.756275,0.728288,0.739786,0.718576
XGBClassifier4,0.874803,0.778256,0.767252,0.772395,0.773865
XGBClassifier5,0.873228,0.786372,0.763036,0.773337,0.769968
XGBClassifier6,0.867717,0.7766,0.762116,0.768785,0.760752
XGBClassifier7,0.879528,0.79184,0.774856,0.78236,0.782444
XGBClassifier8,0.877953,0.782246,0.769574,0.775452,0.77941


In [48]:
# Pick the Borderline-SMOTE-trained classifier with the highest Cohen's kappa.
# The "GT" reference row (kappa 1.0) is dropped by label so it can never be
# selected, and idxmax() replaces the previous float-equality filter, which ran
# over the whole frame and would have returned "GT" if any model reached 1.0.
kappa_by_classifier = df_borderline_smote["kappa"].drop(labels="GT", errors="ignore").astype(float)
idx = kappa_by_classifier.idxmax()
highest = df_borderline_smote.loc[[idx]]  # one-row frame with the winning classifier's metrics

# Confusion matrix of the winner: rows are ground-truth classes, columns are predictions.
confusion_matrix(pipeline.predictions["GT"], pipeline.predictions[idx])

array([[445,  29,  24],
       [ 44, 624,  10],
       [ 30,  16,  48]], dtype=int64)