In [1]:
import pandas as pd
from joblib import load
import numpy as np

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, matthews_corrcoef, confusion_matrix
from sklearn.model_selection import cross_val_score, cross_val_predict

## Cargo datos

In [2]:
val_data = pd.read_csv("data_separated/val_data.csv")
test_data = pd.read_csv("data_separated/test_data.csv")

In [3]:
y_val = pd.read_csv("data_separated/y_val.csv")
y_test = pd.read_csv("data_separated/y_test.csv")

In [4]:
mut_columns = [col for col in val_data.columns if "_mut" in col]

### Pequeño parche
Por alguna razon los dtypes de test y train son distintos. En train las columnas '_mut' son object y en test las columnas '_mut' son imt

In [5]:
print(test_data.dtypes)

age_at_diagnosis          float64
type_of_breast_surgery     object
cancer_type                object
cancer_type_detailed       object
cellularity                object
                           ...   
hras_mut                    int64
prps2_mut                   int64
smarcb1_mut                 int64
stmn2_mut                   int64
siah1_mut                   int64
Length: 692, dtype: object


In [6]:
for col in mut_columns:
    test_data[col] = test_data[col].astype('object')

print("Adjusted test dataframe data types:")
print(test_data.dtypes)

Adjusted test dataframe data types:
age_at_diagnosis          float64
type_of_breast_surgery     object
cancer_type                object
cancer_type_detailed       object
cellularity                object
                           ...   
hras_mut                   object
prps2_mut                  object
smarcb1_mut                object
stmn2_mut                  object
siah1_mut                  object
Length: 692, dtype: object


In [7]:
for col in mut_columns:
    val_data[col] = val_data[col].astype('object')

print("Adjusted test dataframe data types:")
print(val_data.dtypes)

Adjusted test dataframe data types:
age_at_diagnosis          float64
type_of_breast_surgery     object
cancer_type                object
cancer_type_detailed       object
cellularity                object
                           ...   
hras_mut                   object
prps2_mut                  object
smarcb1_mut                object
stmn2_mut                  object
siah1_mut                  object
Length: 692, dtype: object


## Definition de la clase modelo y prediccion

In [8]:
class Models():
    def __init__(self, scaler_prefix: str, algorithm: str):
        self.algorithm = algorithm
        self.scaler = scaler_prefix
        self.scaler_instance = load(f"results/{scaler_prefix}_scaler.joblib")
        self.model_instance = load(f"results/{algorithm}_model_{scaler_prefix}.joblib")
    
    def predict(self):
        self.predictions = self.model_instance.predict(self.data)
    
    def scale(self, data):
        self.data = self.scaler_instance.transform(data)
    
    def cross_validate(self, X, y, cv):
        return pd.DataFrame(
            [[self.scaler, 
             self.algorithm, 
             cross_val_score(self.model_instance, X, y, cv=cv).mean()
            ]], columns=["scaler", "algorithm", "cv"]
        )

    def cv_predict(self, X, y, cv):
        self.predictions = cross_val_predict(self.model_instance, X, y, cv=cv)
        return self.get_metrics(y)

    
    def get_metrics(self, y_true):
        acc_value = accuracy_score(y_pred=self.predictions, y_true=y_true) 
        recall_value = recall_score(y_pred=self.predictions, y_true=y_true)
        precision_value = precision_score(y_pred=self.predictions, y_true=y_true) 
        f1_value = f1_score(y_pred=self.predictions, y_true=y_true)
        mcc_value = matthews_corrcoef(y_pred=self.predictions, y_true=y_true)
        cm = confusion_matrix(y_pred=self.predictions, y_true=y_true)
    
        return pd.DataFrame(
            [[self.scaler, self.algorithm, acc_value, recall_value, precision_value, f1_value, mcc_value, cm]],
            columns=["scaler", "algorithm", "acc", "recall", "precision", "f1", "mcc", "cm"]
        )

In [9]:
scaler_names = ["std", "minmax", "robust"]
algorithm_names = ["ada", "dt", "knn", "rf", "sdgc", "svm"]

In [10]:
models = []
scalers = []

for scaler in scaler_names:
    for algorithm in algorithm_names:
        models.append(Models(scaler, algorithm))
# std ada
# std dt
# ...
# minmax ada
# minmax dt

In [27]:
for model in models:
    model.scale(val_data)
    model.predict()
    



In [28]:
metrics_val = pd.DataFrame(columns=["scaler", "algorithm","acc", "recall", "precision", "f1", "mcc", "cm"])
for model in models:
    metrics_val = pd.concat([metrics_val, model.get_metrics(y_val)])
metrics_val.reset_index(drop=True)

  metrics_val = pd.concat([metrics_val, model.get_metrics(y_val)])


Unnamed: 0,scaler,algorithm,acc,recall,precision,f1,mcc,cm
0,std,ada,1.0,1.0,1.0,1.0,1.0,"[[99, 0], [0, 98]]"
1,std,dt,1.0,1.0,1.0,1.0,1.0,"[[99, 0], [0, 98]]"
2,std,knn,0.502538,0.938776,0.5,0.652482,0.019098,"[[7, 92], [6, 92]]"
3,std,rf,0.989848,0.979592,1.0,0.989691,0.979895,"[[99, 0], [2, 96]]"
4,std,sdgc,0.725888,0.44898,1.0,0.619718,0.538996,"[[99, 0], [54, 44]]"
5,std,svm,0.51269,0.020408,1.0,0.04,0.101789,"[[99, 0], [96, 2]]"
6,minmax,ada,1.0,1.0,1.0,1.0,1.0,"[[99, 0], [0, 98]]"
7,minmax,dt,1.0,1.0,1.0,1.0,1.0,"[[99, 0], [0, 98]]"
8,minmax,knn,0.583756,0.561224,0.585106,0.572917,0.167458,"[[60, 39], [43, 55]]"
9,minmax,rf,0.979695,0.959184,1.0,0.979167,0.960174,"[[99, 0], [4, 94]]"


In [17]:
for model in models:
    model.scale(test_data)
    model.predict()



In [25]:
metrics_test = pd.DataFrame(columns=["scaler", "algorithm","acc", "recall", "precision", "f1", "mcc", "cm"])
for model in models:
    metrics_test = pd.concat([metrics_test, model.get_metrics(y_test)])
metrics_test.reset_index(drop=True)

  metrics_test = pd.concat([metrics_test, model.get_metrics(y_test)])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0,scaler,algorithm,acc,recall,precision,f1,mcc,cm
0,std,ada,1.0,1.0,1.0,1.0,1.0,"[[65, 0], [0, 45]]"
1,std,dt,1.0,1.0,1.0,1.0,1.0,"[[65, 0], [0, 45]]"
2,std,knn,0.454545,0.933333,0.424242,0.583333,0.09245,"[[8, 57], [3, 42]]"
3,std,rf,0.990909,0.977778,1.0,0.988764,0.981307,"[[65, 0], [1, 44]]"
4,std,sdgc,0.609091,0.044444,1.0,0.085106,0.163551,"[[65, 0], [43, 2]]"
5,std,svm,0.590909,0.0,0.0,0.0,0.0,"[[65, 0], [45, 0]]"
6,minmax,ada,1.0,1.0,1.0,1.0,1.0,"[[65, 0], [0, 45]]"
7,minmax,dt,1.0,1.0,1.0,1.0,1.0,"[[65, 0], [0, 45]]"
8,minmax,knn,0.581818,0.488889,0.488889,0.488889,0.135043,"[[42, 23], [23, 22]]"
9,minmax,rf,0.972727,0.933333,1.0,0.965517,0.944541,"[[65, 0], [3, 42]]"


In [16]:
metrics_val.sort_values(by=["f1","mcc"], ascending=False).

Unnamed: 0,scaler,algorithm,acc,recall,precision,f1,mcc,cm
0,std,ada,1.0,1.0,1.0,1.0,1.0,"[[99, 0], [0, 98]]"
0,std,dt,1.0,1.0,1.0,1.0,1.0,"[[99, 0], [0, 98]]"
0,minmax,ada,1.0,1.0,1.0,1.0,1.0,"[[99, 0], [0, 98]]"
0,minmax,dt,1.0,1.0,1.0,1.0,1.0,"[[99, 0], [0, 98]]"
0,robust,ada,1.0,1.0,1.0,1.0,1.0,"[[99, 0], [0, 98]]"
0,robust,dt,1.0,1.0,1.0,1.0,1.0,"[[99, 0], [0, 98]]"
0,std,rf,0.989848,0.979592,1.0,0.989691,0.979895,"[[99, 0], [2, 96]]"
0,minmax,rf,0.979695,0.959184,1.0,0.979167,0.960174,"[[99, 0], [4, 94]]"
0,robust,rf,0.974619,0.959184,0.989474,0.974093,0.94967,"[[98, 1], [4, 94]]"
0,minmax,sdgc,0.939086,0.989796,0.898148,0.941748,0.882789,"[[88, 11], [1, 97]]"


In [37]:
metrics = pd.merge(metrics_val, metrics_test, on=["scaler", "algorithm"], how="inner", suffixes=["_val","_test"])
metrics = metrics.sort_values(by=["f1_val", "f1_test", "mcc_val", "mcc_test"], ascending=False)
metrics[metrics["f1_val"] != 1.0]

Unnamed: 0,scaler,algorithm,acc_val,recall_val,precision_val,f1_val,mcc_val,cm_val,acc_test,recall_test,precision_test,f1_test,mcc_test,cm_test
3,std,rf,0.989848,0.979592,1.0,0.989691,0.979895,"[[99, 0], [2, 96]]",0.990909,0.977778,1.0,0.988764,0.981307,"[[65, 0], [1, 44]]"
9,minmax,rf,0.979695,0.959184,1.0,0.979167,0.960174,"[[99, 0], [4, 94]]",0.972727,0.933333,1.0,0.965517,0.944541,"[[65, 0], [3, 42]]"
15,robust,rf,0.974619,0.959184,0.989474,0.974093,0.94967,"[[98, 1], [4, 94]]",0.981818,0.955556,1.0,0.977273,0.962825,"[[65, 0], [2, 43]]"
10,minmax,sdgc,0.939086,0.989796,0.898148,0.941748,0.882789,"[[88, 11], [1, 97]]",0.909091,1.0,0.818182,0.9,0.83205,"[[55, 10], [0, 45]]"
16,robust,sdgc,0.93401,0.908163,0.956989,0.931937,0.869104,"[[95, 4], [9, 89]]",0.918182,0.866667,0.928571,0.896552,0.830365,"[[62, 3], [6, 39]]"
11,minmax,svm,0.827411,0.734694,0.9,0.808989,0.665725,"[[91, 8], [26, 72]]",0.854545,0.688889,0.939394,0.794872,0.706099,"[[63, 2], [14, 31]]"
17,robust,svm,0.827411,0.72449,0.910256,0.806818,0.668411,"[[92, 7], [27, 71]]",0.818182,0.622222,0.903226,0.736842,0.629568,"[[62, 3], [17, 28]]"
2,std,knn,0.502538,0.938776,0.5,0.652482,0.019098,"[[7, 92], [6, 92]]",0.454545,0.933333,0.424242,0.583333,0.09245,"[[8, 57], [3, 42]]"
14,robust,knn,0.614213,0.632653,0.607843,0.62,0.228754,"[[59, 40], [36, 62]]",0.572727,0.488889,0.478261,0.483516,0.119271,"[[41, 24], [23, 22]]"
4,std,sdgc,0.725888,0.44898,1.0,0.619718,0.538996,"[[99, 0], [54, 44]]",0.609091,0.044444,1.0,0.085106,0.163551,"[[65, 0], [43, 2]]"


## Cross Validation

In [35]:
train_y = pd.read_csv("data_separated/y_train.csv")

In [38]:
results_cv = pd.DataFrame(columns=["scaler", "algorithm", "cv"])
for model in models:
    train_X = pd.read_csv(f"data_separated/train_data_{model.scaler}.csv")
    results_cv = pd.concat([results_cv, model.cross_validate(train_X.values, train_y.values, 5)])
results_cv.sort_values(by=["cv"], ascending=False)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  results_cv = pd.concat([results_cv, model.cross_validate(train_X.values, train_y.values, 5)])
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colum

Unnamed: 0,scaler,algorithm,cv
0,std,ada,1.0
0,std,dt,1.0
0,minmax,dt,1.0
0,minmax,ada,1.0
0,robust,ada,1.0
0,robust,dt,1.0
0,robust,rf,0.975796
0,minmax,rf,0.974522
0,std,rf,0.957962
0,robust,sdgc,0.93121


In [41]:
results_cv = pd.DataFrame(columns=["scaler", "algorithm","acc", "recall", "precision", "f1", "mcc", "cm"])
for model in models:
    train_X = pd.read_csv(f"data_separated/train_data_{model.scaler}.csv")
    results_cv = pd.concat([results_cv, model.cv_predict(train_X.values, train_y.values, 5)])
results_cv.sort_values(by=["f1", "mcc"], ascending=False)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  results_cv = pd.concat([results_cv, model.cv_predict(train_X.values, train_y.values, 5)])
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or

Unnamed: 0,scaler,algorithm,acc,recall,precision,f1,mcc,cm
0,std,ada,1.0,1.0,1.0,1.0,1.0,"[[444, 0], [0, 341]]"
0,std,dt,1.0,1.0,1.0,1.0,1.0,"[[444, 0], [0, 341]]"
0,minmax,ada,1.0,1.0,1.0,1.0,1.0,"[[444, 0], [0, 341]]"
0,minmax,dt,1.0,1.0,1.0,1.0,1.0,"[[444, 0], [0, 341]]"
0,robust,ada,1.0,1.0,1.0,1.0,1.0,"[[444, 0], [0, 341]]"
0,robust,dt,1.0,1.0,1.0,1.0,1.0,"[[444, 0], [0, 341]]"
0,minmax,rf,0.965605,0.926686,0.993711,0.959029,0.931136,"[[442, 2], [25, 316]]"
0,std,rf,0.963057,0.923754,0.990566,0.955994,0.925901,"[[441, 3], [26, 315]]"
0,robust,rf,0.957962,0.909091,0.99359,0.949464,0.916243,"[[442, 2], [31, 310]]"
0,robust,sdgc,0.940127,0.88563,0.974194,0.927803,0.879765,"[[436, 8], [39, 302]]"


### Antiguo codigo

In [76]:
standard_scaler = load("results/std_scaler.joblib")
minmax_scaler = load("results/minmax_scaler.joblib")
robust_scaler = load("results/robust_scaler.joblib")
knn_model = load("results/knn_model_std.joblib")
rf_model = load("results/rf_model_std.joblib")
svm_model = load("results/svm_model_std.joblib")
dt_model = load("results/dt_model_std.joblib")
sgdc_model = load("results/sdgc_model_std.joblib")
ada_model = load("results/ada_model_std.joblib")

In [77]:
val_data = pd.read_csv("data_separated/val_data.csv")
test_data = pd.read_csv("data_separated/test_data.csv")

y_val = pd.read_csv("data_separated/y_val.csv")
y_test = pd.read_csv("data_separated/y_test.csv")

In [33]:
predict_val_knn = knn_model.predict(X=val_data_scaler)
predict_test_knn = knn_model.predict(X=test_data_scaler)



In [31]:
predict_val_rf = rf_model.predict(X=val_data_scaler)
predict_test_rf = rf_model.predict(X=test_data_scaler)



In [36]:
predict_val_svm = svm_model.predict(X=val_data_scaler)
predict_test_svm = svm_model.predict(X=test_data_scaler)



In [37]:
predict_val_dt = dt_model.predict(X=val_data_scaler)
predict_test_dt = dt_model.predict(X=test_data_scaler)



In [38]:
predict_val_sgdc = sgdc_model.predict(X=val_data_scaler)
predict_test_sgdc = sgdc_model.predict(X=test_data_scaler)



In [87]:
predict_val_ada = ada_model.predict(X=val_data_scaler)
predict_test_ada = ada_model.predict(X=test_data_scaler)



In [46]:
get_metrics(predict_val_knn, y_val)

[0.6285714285714286,
 np.float64(0.6555555555555556),
 np.float64(0.6344086021505376),
 np.float64(0.644808743169399),
 np.float64(0.25595737271495583),
 array([[51, 34],
        [31, 59]])]

In [49]:
get_metrics(predict_test_knn, y_test)

[0.6804123711340206,
 np.float64(0.7567567567567568),
 np.float64(0.56),
 np.float64(0.6436781609195402),
 np.float64(0.37914686876298165),
 array([[38, 22],
        [ 9, 28]])]

In [48]:
get_metrics(predict_val_rf, y_val)

[0.7942857142857143,
 np.float64(0.8111111111111111),
 np.float64(0.7934782608695652),
 np.float64(0.8021978021978022),
 np.float64(0.5881200948904004),
 array([[66, 19],
        [17, 73]])]

In [50]:
get_metrics(predict_test_rf, y_test)

[0.7525773195876289,
 np.float64(0.8378378378378378),
 np.float64(0.6326530612244898),
 np.float64(0.7209302325581395),
 np.float64(0.522527534988099),
 array([[42, 18],
        [ 6, 31]])]

In [81]:
get_metrics(predict_val_knn, y_val)

[0.6285714285714286,
 np.float64(0.6555555555555556),
 np.float64(0.6344086021505376),
 np.float64(0.644808743169399),
 np.float64(0.25595737271495583),
 array([[51, 34],
        [31, 59]])]

In [51]:
get_metrics(predict_test_knn, y_test)

[0.6804123711340206,
 np.float64(0.7567567567567568),
 np.float64(0.56),
 np.float64(0.6436781609195402),
 np.float64(0.37914686876298165),
 array([[38, 22],
        [ 9, 28]])]

In [52]:
get_metrics(predict_val_dt, y_val)

[0.7371428571428571,
 np.float64(0.7333333333333333),
 np.float64(0.75),
 np.float64(0.7415730337078652),
 np.float64(0.4743238310341898),
 array([[63, 22],
        [24, 66]])]

In [83]:
get_metrics(predict_test_dt, y_test)

[0.7422680412371134,
 np.float64(0.7027027027027027),
 np.float64(0.65),
 np.float64(0.6753246753246753),
 np.float64(0.4631522820327474),
 array([[46, 14],
        [11, 26]])]

In [84]:
get_metrics(predict_val_sgdc, y_val)

[0.7771428571428571,
 np.float64(0.7555555555555555),
 np.float64(0.8),
 np.float64(0.7771428571428571),
 np.float64(0.5555555555555556),
 array([[68, 17],
        [22, 68]])]

In [85]:
get_metrics(predict_test_sgdc, y_test)

[0.7010309278350515,
 np.float64(0.6216216216216216),
 np.float64(0.6052631578947368),
 np.float64(0.6133333333333333),
 np.float64(0.36979382702249364),
 array([[45, 15],
        [14, 23]])]

In [89]:
get_metrics(predict_val_ada, y_val)

[0.8228571428571428,
 np.float64(0.8222222222222222),
 np.float64(0.8314606741573034),
 np.float64(0.8268156424581006),
 np.float64(0.645582876206969),
 array([[70, 15],
        [16, 74]])]

In [88]:
get_metrics(predict_test_ada, y_test)

[0.7938144329896907,
 np.float64(0.7567567567567568),
 np.float64(0.717948717948718),
 np.float64(0.7368421052631579),
 np.float64(0.5680749201914252),
 array([[49, 11],
        [ 9, 28]])]