In [1]:
import pandas as pd
from joblib import load
import numpy as np

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, matthews_corrcoef, confusion_matrix
from sklearn.model_selection import cross_val_score, cross_val_predict

## Cargo datos

In [2]:
val_data = pd.read_csv("data_separated/val_data.csv")
test_data = pd.read_csv("data_separated/test_data.csv")

In [3]:
y_val = pd.read_csv("data_separated/y_val.csv")
y_test = pd.read_csv("data_separated/y_test.csv")

In [4]:
mut_columns = [col for col in val_data.columns if "_mut" in col]

### Pequeño parche
Por alguna razon los dtypes de test y train son distintos. En train las columnas '_mut' son object y en test las columnas '_mut' son imt

In [5]:
print(test_data.dtypes)

age_at_diagnosis          float64
type_of_breast_surgery     object
cancer_type                object
cancer_type_detailed       object
cellularity                object
                           ...   
hras_mut                    int64
prps2_mut                   int64
smarcb1_mut                 int64
stmn2_mut                   int64
siah1_mut                   int64
Length: 689, dtype: object


In [6]:
for col in mut_columns:
    test_data[col] = test_data[col].astype('object')

print("Adjusted test dataframe data types:")
print(test_data.dtypes)

Adjusted test dataframe data types:
age_at_diagnosis          float64
type_of_breast_surgery     object
cancer_type                object
cancer_type_detailed       object
cellularity                object
                           ...   
hras_mut                   object
prps2_mut                  object
smarcb1_mut                object
stmn2_mut                  object
siah1_mut                  object
Length: 689, dtype: object


In [7]:
for col in mut_columns:
    val_data[col] = val_data[col].astype('object')

print("Adjusted test dataframe data types:")
print(val_data.dtypes)

Adjusted test dataframe data types:
age_at_diagnosis          float64
type_of_breast_surgery     object
cancer_type                object
cancer_type_detailed       object
cellularity                object
                           ...   
hras_mut                   object
prps2_mut                  object
smarcb1_mut                object
stmn2_mut                  object
siah1_mut                  object
Length: 689, dtype: object


## Definition de la clase modelo y prediccion

In [8]:
class Models():
    def __init__(self, scaler_prefix: str, algorithm: str):
        self.algorithm = algorithm
        self.scaler = scaler_prefix
        self.scaler_instance = load(f"results/{scaler_prefix}_scaler.joblib")
        self.model_instance = load(f"results/{algorithm}_model_{scaler_prefix}.joblib")
    
    def predict(self):
        self.predictions = self.model_instance.predict(self.data)
    
    def scale(self, data):
        self.data = self.scaler_instance.transform(data)
    
    def cross_validate(self, X, y, cv):
        return pd.DataFrame(
            [[self.scaler, 
             self.algorithm, 
             cross_val_score(self.model_instance, X, y, cv=cv).mean()
            ]], columns=["scaler", "algorithm", "cv"]
        )

    def cv_predict(self, X, y, cv):
        self.predictions = cross_val_predict(self.model_instance, X, y, cv=cv)
        return self.get_metrics(y)

    
    def get_metrics(self, y_true):
        acc_value = accuracy_score(y_pred=self.predictions, y_true=y_true) 
        recall_value = recall_score(y_pred=self.predictions, y_true=y_true)
        precision_value = precision_score(y_pred=self.predictions, y_true=y_true) 
        f1_value = f1_score(y_pred=self.predictions, y_true=y_true)
        mcc_value = matthews_corrcoef(y_pred=self.predictions, y_true=y_true)
        cm = confusion_matrix(y_pred=self.predictions, y_true=y_true)
    
        return pd.DataFrame(
            [[self.scaler, self.algorithm, acc_value, recall_value, precision_value, f1_value, mcc_value, cm]],
            columns=["scaler", "algorithm", "acc", "recall", "precision", "f1", "mcc", "cm"]
        )

In [9]:
scaler_names = ["std", "minmax", "robust"]
algorithm_names = ["ada", "dt", "knn", "rf", "sdgc", "svm"]

In [10]:
models = []
scalers = []

for scaler in scaler_names:
    for algorithm in algorithm_names:
        models.append(Models(scaler, algorithm))
# std ada
# std dt
# ...
# minmax ada
# minmax dt

In [11]:
for model in models:
    model.scale(val_data)
    model.predict()
    



In [12]:
metrics_val = pd.DataFrame(columns=["scaler", "algorithm","acc", "recall", "precision", "f1", "mcc", "cm"])
for model in models:
    metrics_val = pd.concat([metrics_val, model.get_metrics(y_val)])
metrics_val.reset_index(drop=True)

  metrics_val = pd.concat([metrics_val, model.get_metrics(y_val)])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0,scaler,algorithm,acc,recall,precision,f1,mcc,cm
0,std,ada,0.680203,0.653061,0.688172,0.670157,0.36069,"[[70, 29], [34, 64]]"
1,std,dt,0.609137,0.591837,0.610526,0.601036,0.218234,"[[62, 37], [40, 58]]"
2,std,knn,0.492386,0.918367,0.494505,0.642857,-0.020597,"[[7, 92], [8, 90]]"
3,std,rf,0.639594,0.438776,0.728814,0.547771,0.302548,"[[83, 16], [55, 43]]"
4,std,sdgc,0.502538,0.0,0.0,0.0,0.0,"[[99, 0], [98, 0]]"
5,std,svm,0.507614,0.010204,1.0,0.020202,0.071792,"[[99, 0], [97, 1]]"
6,minmax,ada,0.680203,0.653061,0.688172,0.670157,0.36069,"[[70, 29], [34, 64]]"
7,minmax,dt,0.624365,0.591837,0.630435,0.610526,0.248942,"[[65, 34], [40, 58]]"
8,minmax,knn,0.558376,0.530612,0.55914,0.544503,0.116651,"[[58, 41], [46, 52]]"
9,minmax,rf,0.619289,0.428571,0.688525,0.528302,0.255922,"[[80, 19], [56, 42]]"


In [13]:
for model in models:
    model.scale(test_data)
    model.predict()



In [14]:
metrics_test = pd.DataFrame(columns=["scaler", "algorithm","acc", "recall", "precision", "f1", "mcc", "cm"])
for model in models:
    metrics_test = pd.concat([metrics_test, model.get_metrics(y_test)])
metrics_test.reset_index(drop=True)

  metrics_test = pd.concat([metrics_test, model.get_metrics(y_test)])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0,scaler,algorithm,acc,recall,precision,f1,mcc,cm
0,std,ada,0.645455,0.555556,0.568182,0.561798,0.264198,"[[46, 19], [20, 25]]"
1,std,dt,0.536364,0.311111,0.411765,0.35443,0.003637,"[[45, 20], [31, 14]]"
2,std,knn,0.427273,0.933333,0.411765,0.571429,0.019418,"[[5, 60], [3, 42]]"
3,std,rf,0.654545,0.377778,0.62963,0.472222,0.255834,"[[55, 10], [28, 17]]"
4,std,sdgc,0.590909,0.0,0.0,0.0,0.0,"[[65, 0], [45, 0]]"
5,std,svm,0.590909,0.0,0.0,0.0,0.0,"[[65, 0], [45, 0]]"
6,minmax,ada,0.645455,0.555556,0.568182,0.561798,0.264198,"[[46, 19], [20, 25]]"
7,minmax,dt,0.536364,0.311111,0.411765,0.35443,0.003637,"[[45, 20], [31, 14]]"
8,minmax,knn,0.545455,0.444444,0.444444,0.444444,0.059829,"[[40, 25], [25, 20]]"
9,minmax,rf,0.636364,0.311111,0.608696,0.411765,0.20874,"[[56, 9], [31, 14]]"


In [16]:
metrics_val.sort_values(by=["f1","mcc"], ascending=False)

Unnamed: 0,scaler,algorithm,acc,recall,precision,f1,mcc,cm
0,std,ada,0.680203,0.653061,0.688172,0.670157,0.36069,"[[70, 29], [34, 64]]"
0,minmax,ada,0.680203,0.653061,0.688172,0.670157,0.36069,"[[70, 29], [34, 64]]"
0,robust,ada,0.680203,0.653061,0.688172,0.670157,0.36069,"[[70, 29], [34, 64]]"
0,std,knn,0.492386,0.918367,0.494505,0.642857,-0.020597,"[[7, 92], [8, 90]]"
0,minmax,sdgc,0.64467,0.581633,0.662791,0.619565,0.291053,"[[70, 29], [41, 57]]"
0,robust,sdgc,0.624365,0.612245,0.625,0.618557,0.248685,"[[63, 36], [38, 60]]"
0,robust,svm,0.664975,0.530612,0.722222,0.611765,0.341167,"[[79, 20], [46, 52]]"
0,minmax,dt,0.624365,0.591837,0.630435,0.610526,0.248942,"[[65, 34], [40, 58]]"
0,std,dt,0.609137,0.591837,0.610526,0.601036,0.218234,"[[62, 37], [40, 58]]"
0,robust,knn,0.583756,0.591837,0.58,0.585859,0.167612,"[[57, 42], [40, 58]]"


In [17]:
metrics = pd.merge(metrics_val, metrics_test, on=["scaler", "algorithm"], how="inner", suffixes=["_val","_test"])
metrics = metrics.sort_values(by=["f1_val", "f1_test", "mcc_val", "mcc_test"], ascending=False)
metrics[metrics["f1_val"] != 1.0]

Unnamed: 0,scaler,algorithm,acc_val,recall_val,precision_val,f1_val,mcc_val,cm_val,acc_test,recall_test,precision_test,f1_test,mcc_test,cm_test
0,std,ada,0.680203,0.653061,0.688172,0.670157,0.36069,"[[70, 29], [34, 64]]",0.645455,0.555556,0.568182,0.561798,0.264198,"[[46, 19], [20, 25]]"
6,minmax,ada,0.680203,0.653061,0.688172,0.670157,0.36069,"[[70, 29], [34, 64]]",0.645455,0.555556,0.568182,0.561798,0.264198,"[[46, 19], [20, 25]]"
12,robust,ada,0.680203,0.653061,0.688172,0.670157,0.36069,"[[70, 29], [34, 64]]",0.645455,0.555556,0.568182,0.561798,0.264198,"[[46, 19], [20, 25]]"
2,std,knn,0.492386,0.918367,0.494505,0.642857,-0.020597,"[[7, 92], [8, 90]]",0.427273,0.933333,0.411765,0.571429,0.019418,"[[5, 60], [3, 42]]"
10,minmax,sdgc,0.64467,0.581633,0.662791,0.619565,0.291053,"[[70, 29], [41, 57]]",0.590909,0.488889,0.5,0.494382,0.15097,"[[43, 22], [23, 22]]"
16,robust,sdgc,0.624365,0.612245,0.625,0.618557,0.248685,"[[63, 36], [38, 60]]",0.590909,0.555556,0.5,0.526316,0.16879,"[[40, 25], [20, 25]]"
17,robust,svm,0.664975,0.530612,0.722222,0.611765,0.341167,"[[79, 20], [46, 52]]",0.663636,0.444444,0.625,0.519481,0.281273,"[[53, 12], [25, 20]]"
7,minmax,dt,0.624365,0.591837,0.630435,0.610526,0.248942,"[[65, 34], [40, 58]]",0.536364,0.311111,0.411765,0.35443,0.003637,"[[45, 20], [31, 14]]"
1,std,dt,0.609137,0.591837,0.610526,0.601036,0.218234,"[[62, 37], [40, 58]]",0.536364,0.311111,0.411765,0.35443,0.003637,"[[45, 20], [31, 14]]"
14,robust,knn,0.583756,0.591837,0.58,0.585859,0.167612,"[[57, 42], [40, 58]]",0.572727,0.488889,0.478261,0.483516,0.119271,"[[41, 24], [23, 22]]"


## Cross Validation

In [18]:
train_y = pd.read_csv("data_separated/y_train.csv")

In [19]:
results_cv = pd.DataFrame(columns=["scaler", "algorithm", "cv"])
for model in models:
    train_X = pd.read_csv(f"data_separated/train_data_{model.scaler}.csv")
    results_cv = pd.concat([results_cv, model.cross_validate(train_X.values, train_y.values, 5)])
results_cv.sort_values(by=["cv"], ascending=False)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  results_cv = pd.concat([results_cv, model.cross_validate(train_X.values, train_y.values, 5)])
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colum

Unnamed: 0,scaler,algorithm,cv
0,robust,svm,0.66242
0,robust,rf,0.658599
0,std,rf,0.650955
0,std,svm,0.643312
0,minmax,svm,0.642038
0,minmax,rf,0.640764
0,std,sdgc,0.638217
0,std,ada,0.634395
0,robust,ada,0.634395
0,minmax,ada,0.634395


In [20]:
results_cv = pd.DataFrame(columns=["scaler", "algorithm","acc", "recall", "precision", "f1", "mcc", "cm"])
for model in models:
    train_X = pd.read_csv(f"data_separated/train_data_{model.scaler}.csv")
    results_cv = pd.concat([results_cv, model.cv_predict(train_X.values, train_y.values, 5)])
results_cv.sort_values(by=["f1", "mcc"], ascending=False)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  results_cv = pd.concat([results_cv, model.cv_predict(train_X.values, train_y.values, 5)])
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or

Unnamed: 0,scaler,algorithm,acc,recall,precision,f1,mcc,cm
0,std,knn,0.463694,0.897361,0.442197,0.592449,0.042934,"[[58, 386], [35, 306]]"
0,std,ada,0.634395,0.58651,0.578035,0.582242,0.257265,"[[298, 146], [141, 200]]"
0,minmax,ada,0.634395,0.58651,0.578035,0.582242,0.257265,"[[298, 146], [141, 200]]"
0,robust,ada,0.634395,0.58651,0.578035,0.582242,0.257265,"[[298, 146], [141, 200]]"
0,std,sdgc,0.635669,0.565982,0.583082,0.574405,0.256129,"[[306, 138], [148, 193]]"
0,robust,knn,0.610191,0.589443,0.547684,0.567797,0.214158,"[[278, 166], [140, 201]]"
0,minmax,knn,0.608917,0.577713,0.547222,0.562054,0.209494,"[[281, 163], [144, 197]]"
0,robust,sdgc,0.615287,0.56305,0.556522,0.559767,0.21817,"[[291, 153], [149, 192]]"
0,robust,svm,0.66242,0.475073,0.653226,0.550085,0.300019,"[[358, 86], [179, 162]]"
0,std,rf,0.667516,0.422287,0.692308,0.52459,0.312405,"[[380, 64], [197, 144]]"


### Antiguo codigo

In [76]:
standard_scaler = load("results/std_scaler.joblib")
minmax_scaler = load("results/minmax_scaler.joblib")
robust_scaler = load("results/robust_scaler.joblib")
knn_model = load("results/knn_model_std.joblib")
rf_model = load("results/rf_model_std.joblib")
svm_model = load("results/svm_model_std.joblib")
dt_model = load("results/dt_model_std.joblib")
sgdc_model = load("results/sdgc_model_std.joblib")
ada_model = load("results/ada_model_std.joblib")

In [77]:
val_data = pd.read_csv("data_separated/val_data.csv")
test_data = pd.read_csv("data_separated/test_data.csv")

y_val = pd.read_csv("data_separated/y_val.csv")
y_test = pd.read_csv("data_separated/y_test.csv")

In [33]:
predict_val_knn = knn_model.predict(X=val_data_scaler)
predict_test_knn = knn_model.predict(X=test_data_scaler)



In [31]:
predict_val_rf = rf_model.predict(X=val_data_scaler)
predict_test_rf = rf_model.predict(X=test_data_scaler)



In [36]:
predict_val_svm = svm_model.predict(X=val_data_scaler)
predict_test_svm = svm_model.predict(X=test_data_scaler)



In [37]:
predict_val_dt = dt_model.predict(X=val_data_scaler)
predict_test_dt = dt_model.predict(X=test_data_scaler)



In [38]:
predict_val_sgdc = sgdc_model.predict(X=val_data_scaler)
predict_test_sgdc = sgdc_model.predict(X=test_data_scaler)



In [87]:
predict_val_ada = ada_model.predict(X=val_data_scaler)
predict_test_ada = ada_model.predict(X=test_data_scaler)



In [46]:
get_metrics(predict_val_knn, y_val)

[0.6285714285714286,
 np.float64(0.6555555555555556),
 np.float64(0.6344086021505376),
 np.float64(0.644808743169399),
 np.float64(0.25595737271495583),
 array([[51, 34],
        [31, 59]])]

In [49]:
get_metrics(predict_test_knn, y_test)

[0.6804123711340206,
 np.float64(0.7567567567567568),
 np.float64(0.56),
 np.float64(0.6436781609195402),
 np.float64(0.37914686876298165),
 array([[38, 22],
        [ 9, 28]])]

In [48]:
get_metrics(predict_val_rf, y_val)

[0.7942857142857143,
 np.float64(0.8111111111111111),
 np.float64(0.7934782608695652),
 np.float64(0.8021978021978022),
 np.float64(0.5881200948904004),
 array([[66, 19],
        [17, 73]])]

In [50]:
get_metrics(predict_test_rf, y_test)

[0.7525773195876289,
 np.float64(0.8378378378378378),
 np.float64(0.6326530612244898),
 np.float64(0.7209302325581395),
 np.float64(0.522527534988099),
 array([[42, 18],
        [ 6, 31]])]

In [81]:
get_metrics(predict_val_knn, y_val)

[0.6285714285714286,
 np.float64(0.6555555555555556),
 np.float64(0.6344086021505376),
 np.float64(0.644808743169399),
 np.float64(0.25595737271495583),
 array([[51, 34],
        [31, 59]])]

In [51]:
get_metrics(predict_test_knn, y_test)

[0.6804123711340206,
 np.float64(0.7567567567567568),
 np.float64(0.56),
 np.float64(0.6436781609195402),
 np.float64(0.37914686876298165),
 array([[38, 22],
        [ 9, 28]])]

In [52]:
get_metrics(predict_val_dt, y_val)

[0.7371428571428571,
 np.float64(0.7333333333333333),
 np.float64(0.75),
 np.float64(0.7415730337078652),
 np.float64(0.4743238310341898),
 array([[63, 22],
        [24, 66]])]

In [83]:
get_metrics(predict_test_dt, y_test)

[0.7422680412371134,
 np.float64(0.7027027027027027),
 np.float64(0.65),
 np.float64(0.6753246753246753),
 np.float64(0.4631522820327474),
 array([[46, 14],
        [11, 26]])]

In [84]:
get_metrics(predict_val_sgdc, y_val)

[0.7771428571428571,
 np.float64(0.7555555555555555),
 np.float64(0.8),
 np.float64(0.7771428571428571),
 np.float64(0.5555555555555556),
 array([[68, 17],
        [22, 68]])]

In [85]:
get_metrics(predict_test_sgdc, y_test)

[0.7010309278350515,
 np.float64(0.6216216216216216),
 np.float64(0.6052631578947368),
 np.float64(0.6133333333333333),
 np.float64(0.36979382702249364),
 array([[45, 15],
        [14, 23]])]

In [89]:
get_metrics(predict_val_ada, y_val)

[0.8228571428571428,
 np.float64(0.8222222222222222),
 np.float64(0.8314606741573034),
 np.float64(0.8268156424581006),
 np.float64(0.645582876206969),
 array([[70, 15],
        [16, 74]])]

In [88]:
get_metrics(predict_test_ada, y_test)

[0.7938144329896907,
 np.float64(0.7567567567567568),
 np.float64(0.717948717948718),
 np.float64(0.7368421052631579),
 np.float64(0.5680749201914252),
 array([[49, 11],
        [ 9, 28]])]