In [5]:
#!pip install tensorflow
#!pip install sdv

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import sdv


from sdv.tabular import GaussianCopula, CTGAN, CopulaGAN, TVAE
from sdv.sampling import Condition
from sdv.evaluation import evaluate

from numpy import random
from matplotlib.pyplot import figure
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, KFold, PredefinedSplit
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn import metrics
from tensorflow import keras
from tensorflow.keras import layers, losses
from tensorflow.keras.models import Model, Sequential, load_model, save_model
from tensorflow.keras.layers import Dense, Input, Conv1D, Activation, Reshape, Flatten, Dropout, MaxPooling1D
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, roc_curve

from keras import backend as K 

import joblib
from joblib import dump, load


#################################################### UTILS ##########################################

def add_metas(taxo,meta):
    new_taxo = taxo
    condition = []
    for i in range(len(taxo)):
        condition.append(pd.DataFrame(meta["condition"][meta["sample"] ==
                                                        taxo["sample"][i]]).iloc[0,0])
    return new_taxo, condition

def drop_nas(taxo,meta):
    taxo["condition"] = meta
    taxo = taxo.dropna()
    meta = taxo["condition"]
    taxo = taxo.iloc[:,:-1]
    return taxo, meta

def format_data(taxo_train, taxo_val, meta_train, meta_val, expand=0):
    x_train = taxo_train.to_numpy(); y_train = pd.DataFrame(meta_train).to_numpy()
    x_val = taxo_val.to_numpy(); y_val = pd.DataFrame(meta_val).to_numpy()
    if expand == 1:
        x_train = np.expand_dims(x_train, axis=2)
        x_val = np.expand_dims(x_val, axis=2)
            
    x_train = x_train.astype('float32'); y_train = y_train.astype('float32')
    x_val = x_val.astype('float32'); y_val = y_val.astype('float32')

    return x_train, x_val, y_train, y_val
    

#################################################### RAWDATA ##########################################

def rawdata(taxo_train, taxo_val, meta_train, meta_val):
    x_train, x_val, y_train, y_val = format_data(taxo_train, taxo_val, meta_train, meta_val)
    y_train = y_train.ravel(); y_val = y_val.ravel()
    return x_train, x_val, y_train, y_val

#################################################### ENCODED ##########################################

class Autoencoder(Model):
    def __init__(self, latent_dim, n_features): #diseño autoencoder
        super(Autoencoder, self).__init__()
        self.latent_dim = latent_dim  #dimensión de bottleneck deseada 
        self.n_features = n_features 
        self.encoder = tf.keras.Sequential([ #diseño encoder
          layers.Input(shape=(n_features,1)), #capa de entrada
          layers.Conv1D(3, 3, activation="relu"), #convolución 64 filtros de 3 
          layers.Flatten(), #capa plana
          layers.Dense(latent_dim, activation="tanh"), #+densa para dimension deseada
        ])
        self.decoder = tf.keras.Sequential([
          layers.Reshape((latent_dim, 1)), #cambio de forma para la convolución
          layers.Conv1DTranspose(3, 3, activation="relu"), #convolución T 32 f de 3 
          layers.Flatten(), #capa de salida plana
          layers.Dense(n_features, activation="tanh"), #densa para dimensión entrada
          layers.Reshape((n_features,1)), #cambio de forma para dimensión == entrada
        ])
    def call(self, x): #función llamada
        encoded = self.encoder(x) #encoder
        decoded = self.decoder(encoded) #decoder
        return decoded #resultado autoencoder
    
def autoencoder_training(x_train, x_val, latent_dim, lr, ID_model, disease):
    autoencoder = Autoencoder(latent_dim, x_train.shape[1])
    opt = keras.optimizers.RMSprop(learning_rate=lr)
    autoencoder.compile(optimizer=opt, loss='mean_squared_error',
                        metrics=['accuracy'])
    callbacks = EarlyStopping(monitor='val_loss', patience=100, verbose=0)
    checkpoint_path = cp_path("autoencoder", "encoded", ID_model, disease)
    mc = ModelCheckpoint(filepath=checkpoint_path, monitor="val_loss", mode='min', 
                         save_best_only=True, save_weights_only=True, verbose=0)
    autoencoder.fit(x_train, x_train, validation_data=(x_val, x_val), 
                    epochs=300, callbacks = [callbacks, mc], batch_size=32, 
                    verbose=0, shuffle=True)
    autoencoder.load_weights(checkpoint_path)
    return autoencoder

def encode_norm_meta_data(autoencoder, x_train, x_val):
    x_train = pd.DataFrame(autoencoder.encoder(x_train).numpy())
    scaler = MinMaxScaler(); scaler.fit(x_train); x_train = pd.DataFrame(scaler.transform(x_train))  
    x_val = pd.DataFrame(autoencoder.encoder(x_val).numpy())
    scaler = MinMaxScaler(); scaler.fit(x_val); x_val = pd.DataFrame(scaler.transform(x_val))        
    return x_train, x_val

def save_autoencoder_encoded(autoencoder, ID_model, disease, x_train, x_val, y_train, y_val):
    path = "repro/models/altres_models/autoencoders/" + ID_model + "_" + "autoencoder" + "_" + disease
    DirExists = os.path.exists(path)
    if not DirExists:
        os.makedirs(path)
    tf.saved_model.save(autoencoder, path)
    path_vars = "repro/models/altres_models/autoencoders/" + ID_model + "_" + "encoded" + "_" + disease + "_"
    pd.DataFrame(x_train).to_csv(path_vars + "x_train.csv", index=False)
    pd.DataFrame(y_train).to_csv(path_vars + "y_train.csv", index=False)  
    pd.DataFrame(x_val).to_csv(path_vars + "x_val.csv", index=False); 
    pd.DataFrame(y_val).to_csv(path_vars + "y_val.csv", index=False)
        
def load_autoencoder_encoded(ID_model, autoencoder_path, disease):
    autoencoder = tf.saved_model.load(autoencoder_path)
    path_vars = "repro/models/altres_models/autoencoders/" + ID_model + "_" + "encoded" + "_" + disease + "_"
    x_train = pd.read_csv(path_vars + "x_train.csv", encoding='unicode_escape'); x_train.to_numpy().astype('float32')
    y_train = pd.read_csv(path_vars + "y_train.csv", encoding='unicode_escape'); y_train.to_numpy().astype('float32')
    x_val = pd.read_csv(path_vars + "x_val.csv", encoding='unicode_escape'); x_val.to_numpy().astype('float32')
    y_val = pd.read_csv(path_vars + "y_val.csv", encoding='unicode_escape'); y_val.to_numpy().astype('float32')
    return x_train, x_val, y_train, y_val

def encoded(bench, ID_model, disease, taxo_train=None, taxo_val=None, meta_train=None, meta_val=None, autoencoder_path=None):
    if autoencoder_path == None:
        x_train, x_val, y_train, y_val = format_data(taxo_train, taxo_val, meta_train, meta_val, expand=1)
        autoencoder = autoencoder_training(x_train, x_val, latent_dim=100, lr=1e-3, ID_model=ID_model, disease=disease)
        x_train, x_val = encode_norm_meta_data(autoencoder, x_train, x_val)
        save_autoencoder_encoded(autoencoder, ID_model, disease, x_train, x_val, y_train, y_val)
        print("Model i metadades autoencoder guardats correctament.")
    else:
        x_train, x_val, y_train, y_val = load_autoencoder_encoded(ID_model, autoencoder_path, disease)
        print("Carregant model i metadades autoencoder...")
    y_train = y_train.ravel(); y_val = y_val.ravel()
    return x_train, x_val, y_train, y_val


#################################################### SYNTHETIC ##########################################

def synthetizer_training(x_train, y_train):
    path = "repro/models/altres_models/synthetizers/"
    x_train["condition"] = y_train.astype(int)
    x_train.to_csv(path + "real_data.csv", index=False)
    x_train = pd.read_csv(path + "real_data.csv", encoding='unicode_escape')
    field_types = {"condition": {"type": "boolean"}}
    field_transformers = {"condition": "boolean"}  
    synthetizer = TVAE(batch_size=50, epochs=500, l2scale=1e-3, embedding_dim=50,
                       field_types=field_types, field_transformers=field_transformers)
    synthetizer.fit(x_train)
    return synthetizer

def save_synthetizer(synthetizer, ID_model, disease):
    path = "repro/models/altres_models/synthetizers/" + ID_model + "_synthetizer_" + disease + ".h5"
    synthetizer.save(path)
    
def synthetize(synthetizer, num_rows):
    if (num_rows % 2) == 0:
        num_rows = num_rows
    else:
        num_rows = num_rows+1
    
    condition = Condition({'condition': np.repeat([0,1],[int(num_rows/2),int(num_rows/2)],axis=0)}, 
                          num_rows=num_rows) 
 
    syn_data = synthetizer.sample_conditions(conditions=[condition])
    syn_data_y = pd.DataFrame(syn_data.iloc[:,-1]); 
    syn_data_x = syn_data.iloc[:,:-1]
    return syn_data_x, syn_data_y
    

def synthetic(ID_model, disease, autoencoder_path, num_rows, synthetizer_path=None):
    x_train, x_val, y_train, y_val = load_autoencoder_encoded(ID_model, autoencoder_path, disease)
    if synthetizer_path == None:
        synthetizer = synthetizer_training(x_train, y_train)
        save_synthetizer(synthetizer, ID_model, disease)
        print("Model i metadades synthetizer guardats correctament.")
    else:
        synthetizer = TVAE.load(synthetizer_path)
        print("Carregant model i metadades synthetizer...")
    syn_data_x, syn_data_y = synthetize(synthetizer, num_rows)
    syn_data_y = syn_data_y.to_numpy().ravel(); y_val = y_val.to_numpy().ravel()
    return syn_data_x, x_val, syn_data_y, y_val
          
        
#################################################### SYNREAL ##########################################

def concat_synreal(syn_data_x, x_train, syn_data_y, y_train):
    synreal_data_x = pd.concat([syn_data_x.reset_index(drop=True), x_train.reset_index(drop=True)], axis=0)
    synreal_data_y = np.concatenate((syn_data_y, y_train), axis=0)
    return synreal_data_x, synreal_data_y

def synreal(ID_model, disease, autoencoder_path, synthetizer_path):
    x_train, x_val, y_train, y_val = load_autoencoder_encoded(ID_model, autoencoder_path, disease)
    synthetizer = TVAE.load(synthetizer_path)
    print("Carregant model i metadades synthetizer...")
    syn_data_x, syn_data_y = synthetize(synthetizer, len(x_train))
    synreal_data_x, synreal_data_y = concat_synreal(syn_data_x, x_train, syn_data_y, y_train)
    synreal_data_y = synreal_data_y.ravel(); y_val = y_val.to_numpy().ravel()
    return synreal_data_x, x_val, synreal_data_y, y_val


#################################################### PREPARE DATA ######################################

def prepare_data(bench, ID_model, disease, taxo_train=None, taxo_val=None, meta_train=None, meta_val=None, 
                 autoencoder_path=None, synthetizer_path=None):
    
    taxo_train = taxo_train.iloc[:,1:]; taxo_val = taxo_val.iloc[:,1:]

    if bench == "rawdata":
        x_train, x_val, y_train, y_val = rawdata(taxo_train, taxo_val, meta_train, meta_val)
    elif bench == "encoded":
        if autoencoder_path==None:
            x_train, x_val, y_train, y_val = encoded(taxo_train=taxo_train, taxo_val=taxo_val, 
                                                     meta_train=meta_train, meta_val=meta_val,
                                                     autoencoder_path=None, 
                                                     bench=bench, ID_model=ID_model, disease=disease)
        else:
            x_train, x_val, y_train, y_val = encoded(taxo_train=None, taxo_val=None, 
                                                     meta_train=None, meta_val=None, 
                                                     autoencoder_path=autoencoder_path,
                                                     bench=bench, ID_model=ID_model, disease=disease)
    elif bench == "synthetic":
        if synthetizer_path==None:
            x_train, x_val, y_train, y_val = synthetic(ID_model=ID_model, disease=disease,
                                                       autoencoder_path=autoencoder_path, 
                                                       synthetizer_path=None, num_rows=5000)
        else:
            x_train, x_val, y_train, y_val = synthetic(ID_model=ID_model, disease=disease, 
                                                       autoencoder_path=autoencoder_path,
                                                       synthetizer_path=synthetizer_path, num_rows=5000)
    else:
        x_train, x_val, y_train, y_val = synreal(ID_model, disease, autoencoder_path, synthetizer_path)
    return x_train, x_val, y_train, y_val

#################################################### BENCHMARKING ######################################

def get_taxo_meta_fold(x_train, x_val, y_train, y_val):
   
    # put -1 here, so they will be in training set
    val_fold = []
    for i in range(len(x_train)):
        val_fold.append(-1)

    # for all greater indices, assign 0, so they will be put in test set
    for i in range(len(x_val)):
        val_fold.append(0)
        
    taxo = np.concatenate((x_train, x_val), axis=0)
    meta = np.concatenate((y_train, y_val), axis=0)
    return taxo, meta, val_fold

def KNN_bm(taxo, meta, scoring, val_fold):
    param_grid = dict(n_neighbors=list(range(30,121)), weights = ['uniform'], p=[1,2])
    grid = GridSearchCV(estimator=KNeighborsClassifier(), param_grid=param_grid, scoring=scoring,
                    cv=PredefinedSplit(test_fold=val_fold))
    grid_result = grid.fit(taxo, meta)
    return grid_result.best_estimator_, grid_result.best_score_, grid_result.best_params_

def DT_bm(taxo, meta, scoring, val_fold):
    param_grid = dict(criterion=['entropy'], splitter = ['random'], 
                  min_samples_split=[0.8,0.9], 
                  min_samples_leaf=[0.1,0.2],
                  max_depth=list(range(1,6)),    
                  max_features=['sqrt'])
    grid = GridSearchCV(estimator=DecisionTreeClassifier(), param_grid=param_grid, scoring=scoring,
                    cv=PredefinedSplit(test_fold=val_fold))
    grid_result = grid.fit(taxo, meta)
    return grid_result.best_estimator_, grid_result.best_score_, grid_result.best_params_

def RF_bm(taxo, meta, scoring, val_fold):
    param_grid = dict(n_estimators=[500], criterion=['entropy'], 
                  min_samples_split=[0.25,0.5,0.75], 
                  min_samples_leaf=[0.1,0.2],
                  max_depth=list(range(1,11)), 
                  max_features=['sqrt'], 
                  bootstrap=[True])
    grid = GridSearchCV(estimator=RandomForestClassifier(), param_grid=param_grid, scoring=scoring, 
                    cv=PredefinedSplit(test_fold=val_fold))
    grid_result = grid.fit(taxo,meta)
    return grid_result.best_estimator_, grid_result.best_score_, grid_result.best_params_

def SVM_bm(taxo, meta, scoring, val_fold, bench):
    param_grid = dict(C=[0.1,1,5,10], kernel=['linear','rbf','sigmoid'], 
                      gamma=[0.1, 0.5], shrinking=[True], 
                      tol=[1e-3], probability=[True])
    grid = GridSearchCV(estimator=SVC(), param_grid=param_grid, scoring=scoring,
                    cv=PredefinedSplit(test_fold=val_fold))
    grid_result = grid.fit(taxo,meta)
    return grid_result.best_estimator_, grid_result.best_score_, grid_result.best_params_

def extract_n(x_train):
    n_features = x_train.shape[1]
    n1 = int(round(n_features/2,0))
    n2 = int(round(n1/2,0))
    n3 = int(round(n2/2,0))
    return n_features, n1, n2, n3

def get_batch_size(bench):
    if bench == "synthetic":
        batch_size = 500
    elif bench == "synreal":
        batch_size = 64
    else:
        batch_size = 32
    return batch_size
        
def get_lr(type_model, bench):
    if bench == "rawdata":
        if type_model == "mlp":
            lr = 1e-5
        else:
            lr = 1e-4
    elif bench == "encoded":
        if type_model == "mlp":
            lr = 1e-4
        else:
            lr = 1e-3
    else:
        lr = 1e-3
    return lr
        
def cp_path(type_model, bench, ID_model, disease):
    path = "repro/models/altres_models/training_checkpoints/" + ID_model + "_" + bench + "_" + type_model + "_" + disease
    DirExists = os.path.exists(path)
    if not DirExists:
        os.makedirs(path)
    path = path + "/cp.ckpt"
    return path

def MLP(layer, n_features, lr):
    model = Sequential()
    for i, nodes in enumerate(layer):
        if i==0:
            model.add(Dense(nodes, input_dim=n_features))
            model.add(Activation("relu"))
            model.add(Dropout(0.5))
        else:
            model.add(Dense(nodes))
            model.add(Activation("relu"))
            model.add(Dropout(0.5))
        model.add(Dense(1, activation="sigmoid"))
        model.compile(optimizer=keras.optimizers.Adam(learning_rate=lr), 
                      loss='binary_crossentropy', metrics=['accuracy'])
    return model
   
def MLP_bm(x_train, x_val, y_train, y_val, k, scoring, bench, ID_model, disease):
    model = KerasClassifier(build_fn=MLP, verbose=0)
    n_features, n1, n2, n3 = extract_n(x_train)
    #lr = get_lr("mlp", bench)
    lr = 1e-3
    layer = [[n1], [n2], [n3], [n1,n2], [n2,n3], [n1,n3], [n1, n2, n3]]
    #batch_size = get_batch_size(bench)
    batch_size = 32
    callbacks = EarlyStopping(monitor='val_loss', patience=100, verbose=0)
    checkpoint_path = cp_path("mlp", bench, ID_model, disease)
    param_grid = dict(layer=layer, n_features=[n_features], lr=[lr],  batch_size=[batch_size], epochs=[1000])
    grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring,
                    cv=KFold(k, shuffle=True, random_state=42))
    grid_result = grid.fit(x_train, y_train, validation_data=(x_val, y_val), callbacks = [callbacks])
    return grid_result.best_estimator_, grid_result.best_score_, grid_result.best_params_, checkpoint_path

def CNN(filters, n_features, lr):
    model = Sequential()
    model.add(Input((n_features,1)))
    model.add(Conv1D(filters, 3))
    model.add(Activation("relu"))
    model.add(MaxPooling1D(2))
    model.add(Dropout(0.5))
    model.add(Conv1D(filters, 3))
    model.add(Activation("relu"))
    model.add(MaxPooling1D(2))
    model.add(Dropout(0.5))
    model.add(Flatten())
    model.add(Dense(1))
    model.add(Activation("sigmoid"))
    model.compile(optimizer=keras.optimizers.Adam(learning_rate=lr), 
                  loss='binary_crossentropy', metrics=['accuracy'])
    return model

def CNN_bm(x_train, x_val, y_train, y_val, k, scoring, bench, ID_model, disease):
    model = KerasClassifier(build_fn=CNN, verbose=0)
    filters = [16,32,64]
    n_features, n1, n2, n3 = extract_n(x_train)
    #lr = get_lr("cnn", bench)
    lr = 1e-3
    #batch_size = get_batch_size(bench)
    batch_size = 32
    callbacks = EarlyStopping(monitor='val_loss', patience=100, verbose=0)
    checkpoint_path = cp_path("cnn", bench, ID_model, disease)
    param_grid = dict(filters=filters, n_features=[n_features], lr=[lr], 
                      batch_size = [batch_size], epochs=[1000])
    grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring,
                    cv=KFold(k, shuffle=True, random_state=42))
    grid_result = grid.fit(x_train, y_train, validation_data=(x_val, y_val), callbacks = [callbacks])
    return grid_result.best_estimator_, grid_result.best_score_, grid_result.best_params_, checkpoint_path

def update_best(estimator1, score1, params1, estimator2, score2, params2):
    if score1>=score2:
        return estimator1, score1, params1
    else:
        return estimator2, score2, params2
    
def restore_NN_best_estimator(best_estimator, checkpoint_path, x_train, y_train, x_val, y_val, bench):
    batch_size = get_batch_size(bench)
    callbacks = EarlyStopping(monitor='val_loss', patience=100, verbose=0)
    mc = ModelCheckpoint(filepath=checkpoint_path, monitor="val_loss", mode='min', 
                         save_best_only=True, save_weights_only=True, verbose=0)
    best_estimator.fit(x_train, y_train, validation_data=(x_val, y_val), 
                          callbacks=[callbacks, mc], epochs=1000, verbose=0, batch_size=batch_size)
    best_estimator.load_weights(checkpoint_path)
    return best_estimator
    
def benchmarking(x_train, x_val, y_train, y_val, bench, ID_model, disease, k_NN, scoring):
    taxo, meta, val_fold = get_taxo_meta_fold(x_train, x_val, y_train, y_val)

    best_estimator, best_score, best_params = KNN_bm(taxo, meta, scoring, val_fold)
    print("KNN: " + str(round(best_score,4)) + ". Paràmetres=", best_params)
    
    #best_estimator_DT, best_score_DT, best_params_DT = DT_bm(taxo, meta, scoring, val_fold)
    #best_estimator, best_score, best_params = update_best(best_estimator, best_score, best_params,
    #                                                      best_estimator_DT, best_score_DT, best_params_DT)
    #print("DT: " + str(round(best_score_DT,4)) + ". Paràmetres=", best_params_DT)
    
    best_estimator_RF, best_score_RF, best_params_RF = RF_bm(taxo, meta, scoring, val_fold)
    best_estimator, best_score, best_params = update_best(best_estimator, best_score, best_params,
                                                          best_estimator_RF, best_score_RF, best_params_RF)
    print("RF: " + str(round(best_score_RF,4)) + ". Paràmetres=", best_params_RF)
   
    best_estimator_SVM, best_score_SVM, best_params_SVM = SVM_bm(taxo, meta, scoring, val_fold, bench)
    best_estimator, best_score, best_params = update_best(best_estimator, best_score, best_params,
                                                          best_estimator_SVM, best_score_SVM, best_params_SVM)
    print("SVM: " + str(round(best_score_SVM,4)) + ". Paràmetres=", best_params_SVM)
    
    best_estimator_MLP, best_score_MLP, best_params_MLP, checkpointpath_MLP = MLP_bm(x_train, x_val, y_train, y_val, 
                                                                                     k_NN, scoring, bench, ID_model, disease)
    best_estimator, best_score, best_params = update_best(best_estimator, best_score, best_params,
                                                          best_estimator_MLP, best_score_MLP, best_params_MLP)
    print("MLP: " + str(round(best_score_MLP,4)) + ". Paràmetres=", best_params_MLP)
   
    best_estimator_CNN, best_score_CNN, best_params_CNN, checkpointpath_CNN = CNN_bm(x_train, x_val, y_train, y_val, 
                                                                                     k_NN, scoring, bench, ID_model, disease)
    best_estimator, best_score, best_params = update_best(best_estimator, best_score, best_params,
                                                          best_estimator_CNN, best_score_CNN, best_params_CNN)
    print("CNN: " + str(round(best_score_CNN,4)) + ". Paràmetres=", best_params_CNN)
    
    if (str(best_estimator) == str(best_estimator_MLP)) | (str(best_estimator) == str(best_estimator_CNN)):
        if str(best_estimator) == str(best_estimator_MLP):
            checkpoint_path = checkpointpath_MLP
            best_estimator = MLP(layer=best_params["layer"], n_features=best_params["n_features"], 
                                lr=best_params["lr"])
        else:
            checkpoint_path = checkpointpath_CNN
            best_estimator = CNN(filters=best_params["filters"], n_features=best_params["n_features"], 
                                 lr=best_params["lr"])
        best_estimator = restore_NN_best_estimator(best_estimator, checkpoint_path, 
                                                   x_train, y_train, x_val, y_val, bench)
    
    print("Millor model " + ID_model + "_" + bench + "_" + disease + ": " + str(best_estimator) + "[", best_params, "] AUC: " + str(round(best_score,4)))
    
    return best_estimator, best_score, best_params

def save_best_model(best_estimator, best_score, best_params, bench, ID_model, disease):
    #Save model
    model_path = "repro/models/altres_models/"
    if (str(best_estimator).find("keras") != -1):
        model_path = model_path + ID_model + "_" + bench + "_" + disease + ".h5"
        best_estimator.save(model_path)
    else:
        model_path = model_path + ID_model + "_" + bench + "_" + disease + ".joblib"
        joblib.dump(best_estimator, model_path)
        
    #Save model metadata
    params_path = "repro/models/altres_models/params/" + ID_model + "_" + bench + "_" + disease + "_params.csv"
    best_params["score"] = best_score
    best_params["model_type"] = type(best_estimator)
    pd.DataFrame(best_params.items()).to_csv(params_path, index=False, header=False)
    print("Model i paràmetres guardats correctament.")
    
#################################################### MAIN ###############################################    
    
def main(bench, ID_model, disease, k_NN, scoring, taxo_train=None, taxo_val=None, meta_train=None, meta_val=None, 
         autoencoder_path=None, synthetizer_path=None):
        
    x_train, x_val, y_train, y_val = prepare_data(taxo_train=taxo_train, taxo_val=taxo_val, 
                                                  meta_train=meta_train, meta_val=meta_val,
                                                  autoencoder_path=autoencoder_path, 
                                                  synthetizer_path=synthetizer_path, 
                                                  bench=bench, ID_model=ID_model, disease=disease)
    
    best_estimator, best_score, best_params = benchmarking(x_train, x_val, y_train, y_val, 
                                                           bench, ID_model, disease, k_NN, scoring)
    
    save_best_model(best_estimator, best_score, best_params, bench, ID_model, disease)
    

# Benchmarking i selecció millors models

In [2]:
# DM2

## Train
taxoS_DM2 = pd.read_csv("repro/train/taxoS_train_DM2.csv", encoding='unicode_escape')
taxoS1_DM2 = pd.read_csv("repro/train/taxoS1_train_DM2.csv", encoding='unicode_escape')
meta_DM2 = pd.read_csv("repro/metadades_bin_DM2.csv", encoding='unicode_escape')
taxoS_DM2, meta_DM2 = add_metas(taxoS_DM2,meta_DM2)
meta_DM2 = pd.read_csv("repro/metadades_bin_DM2.csv", encoding='unicode_escape')
taxoS1_DM2, meta_DM2 = add_metas(taxoS1_DM2,meta_DM2)

## Val
taxoS_val_DM2 = pd.read_csv("repro/train/taxoS_val_DM2.csv", encoding='unicode_escape')
taxoS1_val_DM2 = pd.read_csv("repro/train/taxoS1_val_DM2.csv", encoding='unicode_escape')
meta_val_DM2 = pd.read_csv("repro/metadades_bin_DM2.csv", encoding='unicode_escape')
taxoS_val_DM2, meta_val_DM2 = add_metas(taxoS_val_DM2, meta_val_DM2)
meta_val_DM2 = pd.read_csv("repro/metadades_bin_DM2.csv", encoding='unicode_escape')
taxoS1_val_DM2, meta_val_DM2 = add_metas(taxoS1_val_DM2, meta_val_DM2)

# PD

## Train
taxoS_PD = pd.read_csv("repro/train/taxoS_train_PD.csv", encoding='unicode_escape')
taxoS1_PD = pd.read_csv("repro/train/taxoS1_train_PD.csv", encoding='unicode_escape')
meta_PD = pd.read_csv("repro/metadades_bin_PD.csv", encoding='unicode_escape')
taxoS_PD, meta_PD = add_metas(taxoS_PD,meta_PD)
meta_PD = pd.read_csv("repro/metadades_bin_PD.csv", encoding='unicode_escape')
taxoS1_PD, meta_PD = add_metas(taxoS1_PD,meta_PD)

## Val
taxoS_val_PD = pd.read_csv("repro/train/taxoS_val_PD.csv", encoding='unicode_escape')
taxoS1_val_PD = pd.read_csv("repro/train/taxoS1_val_PD.csv", encoding='unicode_escape')
meta_val_PD = pd.read_csv("repro/metadades_bin_PD.csv", encoding='unicode_escape')
taxoS_val_PD, meta_val_PD = add_metas(taxoS_val_PD, meta_val_PD)
meta_val_PD = pd.read_csv("repro/metadades_bin_PD.csv", encoding='unicode_escape')
taxoS1_val_PD, meta_val_PD = add_metas(taxoS1_val_PD, meta_val_PD)

## DM2

### Models S

In [3]:
os.environ['PYTHONHASHSEED'] = '0' 
np.random.seed(42) 
random.seed(42) 
tf.random.set_seed(42)

main(taxo_train=taxoS_DM2, taxo_val=taxoS_val_DM2, meta_train=meta_DM2, meta_val=meta_val_DM2, 
     k_NN=3, scoring="accuracy", bench="rawdata", ID_model="a", disease="DM2")

KNN: 0.6897. Paràmetres= {'n_neighbors': 30, 'p': 2, 'weights': 'uniform'}
RF: 0.6897. Paràmetres= {'bootstrap': True, 'criterion': 'entropy', 'max_depth': 1, 'max_features': 'sqrt', 'min_samples_leaf': 0.1, 'min_samples_split': 0.5, 'n_estimators': 500}
SVM: 0.7241. Paràmetres= {'C': 5, 'gamma': 0.1, 'kernel': 'rbf', 'probability': True, 'shrinking': True, 'tol': 0.001}


  model = KerasClassifier(build_fn=MLP, verbose=0)


MLP: 0.6921. Paràmetres= {'batch_size': 32, 'epochs': 1000, 'layer': [834], 'lr': 0.001, 'n_features': 1669}


  model = KerasClassifier(build_fn=CNN, verbose=0)


CNN: 0.669. Paràmetres= {'batch_size': 32, 'epochs': 1000, 'filters': 32, 'lr': 0.001, 'n_features': 1669}
Millor model a_rawdata_DM2: SVC(C=5, gamma=0.1, probability=True)[ {'C': 5, 'gamma': 0.1, 'kernel': 'rbf', 'probability': True, 'shrinking': True, 'tol': 0.001} ] AUC: 0.7241
Model i paràmetres guardats correctament.


In [6]:
os.environ['PYTHONHASHSEED'] = '0' 
np.random.seed(42) 
random.seed(42) 
tf.random.set_seed(42)

main(taxo_train=taxoS_DM2, taxo_val=taxoS_val_DM2, meta_train=meta_DM2, meta_val=meta_val_DM2, 
     k_NN=3, scoring="accuracy", bench="encoded", ID_model="a", disease="DM2")

INFO:tensorflow:Assets written to: repro/models/altres_models/autoencoders/a_autoencoder_DM2\assets
Model i metadades autoencoder guardats correctament.
KNN: 0.7586. Paràmetres= {'n_neighbors': 99, 'p': 2, 'weights': 'uniform'}
RF: 0.6897. Paràmetres= {'bootstrap': True, 'criterion': 'entropy', 'max_depth': 1, 'max_features': 'sqrt', 'min_samples_leaf': 0.1, 'min_samples_split': 0.25, 'n_estimators': 500}
SVM: 0.7241. Paràmetres= {'C': 0.1, 'gamma': 0.1, 'kernel': 'rbf', 'probability': True, 'shrinking': True, 'tol': 0.001}


  model = KerasClassifier(build_fn=MLP, verbose=0)


MLP: 0.6394. Paràmetres= {'batch_size': 32, 'epochs': 1000, 'layer': [12], 'lr': 0.001, 'n_features': 100}


  model = KerasClassifier(build_fn=CNN, verbose=0)


CNN: 0.632. Paràmetres= {'batch_size': 32, 'epochs': 1000, 'filters': 64, 'lr': 0.001, 'n_features': 100}
Millor model a_encoded_DM2: KNeighborsClassifier(n_neighbors=99)[ {'n_neighbors': 99, 'p': 2, 'weights': 'uniform'} ] AUC: 0.7586
Model i paràmetres guardats correctament.


In [8]:
os.environ['PYTHONHASHSEED'] = '0' 
np.random.seed(42) 
random.seed(42) 
tf.random.set_seed(42)

main(taxo_train=taxoS_DM2, taxo_val=taxoS_val_DM2, meta_train=meta_DM2, meta_val=meta_val_DM2, 
     k_NN=3, scoring="accuracy", bench="synthetic", ID_model="a", disease="DM2",
     autoencoder_path="repro/models/altres_models/autoencoders/a_autoencoder_DM2")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[column_name] = data[column_name].to_numpy().flatten()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[column_name] = data[column_name].to_numpy().flatten()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[column_name] = data[column_name].to_numpy().flatten()
A value is trying to be set 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[column_name] = data[column_name].to_numpy().flatten()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[column_name] = data[column_name].to_numpy().flatten()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[column_name] = data[column_name].to_numpy().flatten()
A value is trying to be set 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[column_name] = data[column_name].to_numpy().flatten()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[column_name] = data[column_name].to_numpy().flatten()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[column_name] = data[column_name].to_numpy().flatten()
A value is trying to be set 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[column_name] = data[column_name].to_numpy().flatten()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[column_name] = data[column_name].to_numpy().flatten()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[column_name] = data[column_name].to_numpy().flatten()
A value is trying to be set 

Model i metadades synthetizer guardats correctament.


Sampling conditions: 100%|████████████████████████████████████████████████████████| 5000/5000 [00:22<00:00, 219.42it/s]


KNN: 0.7241. Paràmetres= {'n_neighbors': 39, 'p': 1, 'weights': 'uniform'}
RF: 0.6897. Paràmetres= {'bootstrap': True, 'criterion': 'entropy', 'max_depth': 1, 'max_features': 'sqrt', 'min_samples_leaf': 0.1, 'min_samples_split': 0.5, 'n_estimators': 500}
SVM: 0.6897. Paràmetres= {'C': 0.1, 'gamma': 0.1, 'kernel': 'rbf', 'probability': True, 'shrinking': True, 'tol': 0.001}


  model = KerasClassifier(build_fn=MLP, verbose=0)


MLP: 0.7898. Paràmetres= {'batch_size': 32, 'epochs': 1000, 'layer': [50, 25], 'lr': 0.001, 'n_features': 100}


  model = KerasClassifier(build_fn=CNN, verbose=0)


CNN: 0.8094. Paràmetres= {'batch_size': 32, 'epochs': 1000, 'filters': 64, 'lr': 0.001, 'n_features': 100}
Millor model a_synthetic_DM2: <keras.engine.sequential.Sequential object at 0x000001B7F29DCC70>[ {'batch_size': 32, 'epochs': 1000, 'filters': 64, 'lr': 0.001, 'n_features': 100} ] AUC: 0.8094
Model i paràmetres guardats correctament.


In [9]:
os.environ['PYTHONHASHSEED'] = '0' 
np.random.seed(42) 
random.seed(42) 
tf.random.set_seed(42)

main(taxo_train=taxoS_DM2, taxo_val=taxoS_val_DM2, meta_train=meta_DM2, meta_val=meta_val_DM2, 
     k_NN=3, scoring="accuracy", bench="synreal", ID_model="a", disease="DM2",
     autoencoder_path="repro/models/altres_models/autoencoders/a_autoencoder_DM2",
     synthetizer_path="repro/models/altres_models/synthetizers/a_synthetizer_DM2.h5")

Carregant model i metadades synthetizer...


Sampling conditions: 100%|███████████████████████████████████████████████████████████| 134/134 [00:01<00:00, 71.13it/s]


KNN: 0.7241. Paràmetres= {'n_neighbors': 32, 'p': 2, 'weights': 'uniform'}
RF: 0.7241. Paràmetres= {'bootstrap': True, 'criterion': 'entropy', 'max_depth': 4, 'max_features': 'sqrt', 'min_samples_leaf': 0.1, 'min_samples_split': 0.25, 'n_estimators': 500}
SVM: 0.6897. Paràmetres= {'C': 1, 'gamma': 0.1, 'kernel': 'rbf', 'probability': True, 'shrinking': True, 'tol': 0.001}


  model = KerasClassifier(build_fn=MLP, verbose=0)


MLP: 0.6816. Paràmetres= {'batch_size': 32, 'epochs': 1000, 'layer': [12], 'lr': 0.001, 'n_features': 100}


  model = KerasClassifier(build_fn=CNN, verbose=0)


CNN: 0.6929. Paràmetres= {'batch_size': 32, 'epochs': 1000, 'filters': 64, 'lr': 0.001, 'n_features': 100}
Millor model a_synreal_DM2: KNeighborsClassifier(n_neighbors=32)[ {'n_neighbors': 32, 'p': 2, 'weights': 'uniform'} ] AUC: 0.7241
Model i paràmetres guardats correctament.


### Models S1

In [10]:
os.environ['PYTHONHASHSEED'] = '0' 
np.random.seed(42) 
random.seed(42) 
tf.random.set_seed(42)

main(taxo_train=taxoS1_DM2, taxo_val=taxoS1_val_DM2, meta_train=meta_DM2, meta_val=meta_val_DM2, 
     k_NN=3, scoring="accuracy", bench="rawdata", ID_model="b", disease="DM2")

KNN: 0.6897. Paràmetres= {'n_neighbors': 30, 'p': 1, 'weights': 'uniform'}
RF: 0.7586. Paràmetres= {'bootstrap': True, 'criterion': 'entropy', 'max_depth': 4, 'max_features': 'sqrt', 'min_samples_leaf': 0.1, 'min_samples_split': 0.25, 'n_estimators': 500}
SVM: 0.7241. Paràmetres= {'C': 5, 'gamma': 0.5, 'kernel': 'sigmoid', 'probability': True, 'shrinking': True, 'tol': 0.001}


  model = KerasClassifier(build_fn=MLP, verbose=0)


MLP: 0.6766. Paràmetres= {'batch_size': 32, 'epochs': 1000, 'layer': [130, 65], 'lr': 0.001, 'n_features': 520}


  model = KerasClassifier(build_fn=CNN, verbose=0)


CNN: 0.6535. Paràmetres= {'batch_size': 32, 'epochs': 1000, 'filters': 16, 'lr': 0.001, 'n_features': 520}
Millor model b_rawdata_DM2: RandomForestClassifier(criterion='entropy', max_depth=4, max_features='sqrt',
                       min_samples_leaf=0.1, min_samples_split=0.25,
                       n_estimators=500)[ {'bootstrap': True, 'criterion': 'entropy', 'max_depth': 4, 'max_features': 'sqrt', 'min_samples_leaf': 0.1, 'min_samples_split': 0.25, 'n_estimators': 500} ] AUC: 0.7586
Model i paràmetres guardats correctament.


In [11]:
os.environ['PYTHONHASHSEED'] = '0' 
np.random.seed(42) 
random.seed(42) 
tf.random.set_seed(42)

main(taxo_train=taxoS1_DM2, taxo_val=taxoS1_val_DM2, meta_train=meta_DM2, meta_val=meta_val_DM2, 
     k_NN=3, scoring="accuracy", bench="encoded", ID_model="b", disease="DM2")

INFO:tensorflow:Assets written to: repro/models/altres_models/autoencoders/b_autoencoder_DM2\assets
Model i metadades autoencoder guardats correctament.
KNN: 0.6897. Paràmetres= {'n_neighbors': 88, 'p': 2, 'weights': 'uniform'}
RF: 0.6207. Paràmetres= {'bootstrap': True, 'criterion': 'entropy', 'max_depth': 1, 'max_features': 'sqrt', 'min_samples_leaf': 0.1, 'min_samples_split': 0.25, 'n_estimators': 500}
SVM: 0.6552. Paràmetres= {'C': 10, 'gamma': 0.1, 'kernel': 'sigmoid', 'probability': True, 'shrinking': True, 'tol': 0.001}


  model = KerasClassifier(build_fn=MLP, verbose=0)


MLP: 0.6545. Paràmetres= {'batch_size': 32, 'epochs': 1000, 'layer': [12], 'lr': 0.001, 'n_features': 100}


  model = KerasClassifier(build_fn=CNN, verbose=0)


CNN: 0.6389. Paràmetres= {'batch_size': 32, 'epochs': 1000, 'filters': 16, 'lr': 0.001, 'n_features': 100}
Millor model b_encoded_DM2: KNeighborsClassifier(n_neighbors=88)[ {'n_neighbors': 88, 'p': 2, 'weights': 'uniform'} ] AUC: 0.6897
Model i paràmetres guardats correctament.


In [12]:
os.environ['PYTHONHASHSEED'] = '0' 
np.random.seed(42) 
random.seed(42) 
tf.random.set_seed(42)

main(taxo_train=taxoS1_DM2, taxo_val=taxoS1_val_DM2, meta_train=meta_DM2, meta_val=meta_val_DM2, 
     k_NN=3, scoring="accuracy", bench="synthetic", ID_model="b", disease="DM2",
     autoencoder_path="repro/models/altres_models/autoencoders/b_autoencoder_DM2")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[column_name] = data[column_name].to_numpy().flatten()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[column_name] = data[column_name].to_numpy().flatten()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[column_name] = data[column_name].to_numpy().flatten()
A value is trying to be set 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[column_name] = data[column_name].to_numpy().flatten()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[column_name] = data[column_name].to_numpy().flatten()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[column_name] = data[column_name].to_numpy().flatten()
A value is trying to be set 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[column_name] = data[column_name].to_numpy().flatten()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[column_name] = data[column_name].to_numpy().flatten()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[column_name] = data[column_name].to_numpy().flatten()
A value is trying to be set 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[column_name] = data[column_name].to_numpy().flatten()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[column_name] = data[column_name].to_numpy().flatten()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[column_name] = data[column_name].to_numpy().flatten()
A value is trying to be set 

Model i metadades synthetizer guardats correctament.


Sampling conditions: 100%|████████████████████████████████████████████████████████| 5000/5000 [00:21<00:00, 235.67it/s]


KNN: 0.6897. Paràmetres= {'n_neighbors': 40, 'p': 2, 'weights': 'uniform'}
RF: 0.6897. Paràmetres= {'bootstrap': True, 'criterion': 'entropy', 'max_depth': 2, 'max_features': 'sqrt', 'min_samples_leaf': 0.1, 'min_samples_split': 0.25, 'n_estimators': 500}
SVM: 0.6552. Paràmetres= {'C': 1, 'gamma': 0.1, 'kernel': 'rbf', 'probability': True, 'shrinking': True, 'tol': 0.001}


  model = KerasClassifier(build_fn=MLP, verbose=0)


MLP: 0.8172. Paràmetres= {'batch_size': 32, 'epochs': 1000, 'layer': [50], 'lr': 0.001, 'n_features': 100}


  model = KerasClassifier(build_fn=CNN, verbose=0)


CNN: 0.8232. Paràmetres= {'batch_size': 32, 'epochs': 1000, 'filters': 64, 'lr': 0.001, 'n_features': 100}
Millor model b_synthetic_DM2: <keras.engine.sequential.Sequential object at 0x000001B7FC6F6820>[ {'batch_size': 32, 'epochs': 1000, 'filters': 64, 'lr': 0.001, 'n_features': 100} ] AUC: 0.8232
Model i paràmetres guardats correctament.


In [14]:
os.environ['PYTHONHASHSEED'] = '0' 
np.random.seed(42) 
random.seed(42) 
tf.random.set_seed(42)

main(taxo_train=taxoS1_DM2, taxo_val=taxoS1_val_DM2, meta_train=meta_DM2, meta_val=meta_val_DM2, 
     k_NN=3, scoring="accuracy", bench="synreal", ID_model="b", disease="DM2", 
     autoencoder_path="repro/models/altres_models/autoencoders/b_autoencoder_DM2",
     synthetizer_path="repro/models/altres_models/synthetizers/b_synthetizer_DM2.h5")

Carregant model i metadades synthetizer...


Sampling conditions: 100%|███████████████████████████████████████████████████████████| 134/134 [00:02<00:00, 59.84it/s]


KNN: 0.6552. Paràmetres= {'n_neighbors': 70, 'p': 2, 'weights': 'uniform'}
RF: 0.6552. Paràmetres= {'bootstrap': True, 'criterion': 'entropy', 'max_depth': 1, 'max_features': 'sqrt', 'min_samples_leaf': 0.2, 'min_samples_split': 0.5, 'n_estimators': 500}
SVM: 0.6552. Paràmetres= {'C': 0.1, 'gamma': 0.1, 'kernel': 'rbf', 'probability': True, 'shrinking': True, 'tol': 0.001}


  model = KerasClassifier(build_fn=MLP, verbose=0)


MLP: 0.6966. Paràmetres= {'batch_size': 32, 'epochs': 1000, 'layer': [50, 25], 'lr': 0.001, 'n_features': 100}


  model = KerasClassifier(build_fn=CNN, verbose=0)


CNN: 0.7041. Paràmetres= {'batch_size': 32, 'epochs': 1000, 'filters': 16, 'lr': 0.001, 'n_features': 100}
Millor model b_synreal_DM2: <keras.engine.sequential.Sequential object at 0x000001B7D1928220>[ {'batch_size': 32, 'epochs': 1000, 'filters': 16, 'lr': 0.001, 'n_features': 100} ] AUC: 0.7041
Model i paràmetres guardats correctament.


## PD

### Models S

In [15]:
os.environ['PYTHONHASHSEED'] = '0' 
np.random.seed(42) 
random.seed(42) 
tf.random.set_seed(42)

main(taxo_train=taxoS_PD, taxo_val=taxoS_val_PD, meta_train=meta_PD, meta_val=meta_val_PD, 
     k_NN=3, scoring="accuracy", bench="rawdata", ID_model="a", disease="PD")

Traceback (most recent call last):
  File "C:\Users\46375547A\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 761, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\46375547A\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 216, in __call__
    return self._score(
  File "C:\Users\46375547A\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 258, in _score
    y_pred = method_caller(estimator, "predict", X)
  File "C:\Users\46375547A\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 68, in _cached_call
    return getattr(estimator, method)(*args, **kwargs)
  File "C:\Users\46375547A\Anaconda3\lib\site-packages\sklearn\neighbors\_classification.py", line 214, in predict
    neigh_dist, neigh_ind = self.kneighbors(X)
  File "C:\Users\46375547A\Anaconda3\lib\site-packages\sklearn\neighbors\_base.py", line 727, in kneighbors
    raise ValueError(
ValueError: Expected n_neighbors <= n_samples,  but n_sample

KNN: 0.5238. Paràmetres= {'n_neighbors': 94, 'p': 1, 'weights': 'uniform'}
RF: 0.5238. Paràmetres= {'bootstrap': True, 'criterion': 'entropy', 'max_depth': 1, 'max_features': 'sqrt', 'min_samples_leaf': 0.1, 'min_samples_split': 0.75, 'n_estimators': 500}
SVM: 0.5714. Paràmetres= {'C': 5, 'gamma': 0.5, 'kernel': 'sigmoid', 'probability': True, 'shrinking': True, 'tol': 0.001}


  model = KerasClassifier(build_fn=MLP, verbose=0)


MLP: 0.7567. Paràmetres= {'batch_size': 32, 'epochs': 1000, 'layer': [818, 409], 'lr': 0.001, 'n_features': 1637}


  model = KerasClassifier(build_fn=CNN, verbose=0)


CNN: 0.6717. Paràmetres= {'batch_size': 32, 'epochs': 1000, 'filters': 64, 'lr': 0.001, 'n_features': 1637}
Millor model a_rawdata_PD: <keras.engine.sequential.Sequential object at 0x000001B781080940>[ {'batch_size': 32, 'epochs': 1000, 'layer': [818, 409], 'lr': 0.001, 'n_features': 1637} ] AUC: 0.7567
Model i paràmetres guardats correctament.


In [16]:
os.environ['PYTHONHASHSEED'] = '0' 
np.random.seed(42) 
random.seed(42) 
tf.random.set_seed(42)

main(taxo_train=taxoS_PD, taxo_val=taxoS_val_PD, meta_train=meta_PD, meta_val=meta_val_PD, 
     k_NN=3, scoring="accuracy", bench="encoded", ID_model="a", disease="PD")

INFO:tensorflow:Assets written to: repro/models/altres_models/autoencoders/a_autoencoder_PD\assets
Model i metadades autoencoder guardats correctament.


Traceback (most recent call last):
  File "C:\Users\46375547A\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 761, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\46375547A\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 216, in __call__
    return self._score(
  File "C:\Users\46375547A\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 258, in _score
    y_pred = method_caller(estimator, "predict", X)
  File "C:\Users\46375547A\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 68, in _cached_call
    return getattr(estimator, method)(*args, **kwargs)
  File "C:\Users\46375547A\Anaconda3\lib\site-packages\sklearn\neighbors\_classification.py", line 214, in predict
    neigh_dist, neigh_ind = self.kneighbors(X)
  File "C:\Users\46375547A\Anaconda3\lib\site-packages\sklearn\neighbors\_base.py", line 727, in kneighbors
    raise ValueError(
ValueError: Expected n_neighbors <= n_samples,  but n_sample

KNN: 0.5238. Paràmetres= {'n_neighbors': 86, 'p': 1, 'weights': 'uniform'}
RF: 0.5238. Paràmetres= {'bootstrap': True, 'criterion': 'entropy', 'max_depth': 1, 'max_features': 'sqrt', 'min_samples_leaf': 0.1, 'min_samples_split': 0.75, 'n_estimators': 500}
SVM: 0.5714. Paràmetres= {'C': 5, 'gamma': 0.1, 'kernel': 'linear', 'probability': True, 'shrinking': True, 'tol': 0.001}


  model = KerasClassifier(build_fn=MLP, verbose=0)


MLP: 0.746. Paràmetres= {'batch_size': 32, 'epochs': 1000, 'layer': [12], 'lr': 0.001, 'n_features': 100}


  model = KerasClassifier(build_fn=CNN, verbose=0)


CNN: 0.6492. Paràmetres= {'batch_size': 32, 'epochs': 1000, 'filters': 64, 'lr': 0.001, 'n_features': 100}
Millor model a_encoded_PD: <keras.engine.sequential.Sequential object at 0x000001B7F0731640>[ {'batch_size': 32, 'epochs': 1000, 'layer': [12], 'lr': 0.001, 'n_features': 100} ] AUC: 0.746
Model i paràmetres guardats correctament.


In [17]:
os.environ['PYTHONHASHSEED'] = '0' 
np.random.seed(42) 
random.seed(42) 
tf.random.set_seed(42)

main(taxo_train=taxoS_PD, taxo_val=taxoS_val_PD, meta_train=meta_PD, meta_val=meta_val_PD, 
     k_NN=3, scoring="accuracy", bench="synthetic", ID_model="a", disease="PD",
     autoencoder_path="repro/models/altres_models/autoencoders/a_autoencoder_PD")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[column_name] = data[column_name].to_numpy().flatten()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[column_name] = data[column_name].to_numpy().flatten()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[column_name] = data[column_name].to_numpy().flatten()
A value is trying to be set 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[column_name] = data[column_name].to_numpy().flatten()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[column_name] = data[column_name].to_numpy().flatten()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[column_name] = data[column_name].to_numpy().flatten()
A value is trying to be set 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[column_name] = data[column_name].to_numpy().flatten()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[column_name] = data[column_name].to_numpy().flatten()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[column_name] = data[column_name].to_numpy().flatten()
A value is trying to be set 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[column_name] = data[column_name].to_numpy().flatten()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[column_name] = data[column_name].to_numpy().flatten()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[column_name] = data[column_name].to_numpy().flatten()
A value is trying to be set 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[column_name] = data[column_name].to_numpy().flatten()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[column_name] = data[column_name].to_numpy().flatten()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[column_name] = data[column_name].to_numpy().flatten()
A value is trying to be set 

Model i metadades synthetizer guardats correctament.


Sampling conditions: 100%|████████████████████████████████████████████████████████| 5000/5000 [00:23<00:00, 215.46it/s]


KNN: 0.4286. Paràmetres= {'n_neighbors': 30, 'p': 1, 'weights': 'uniform'}
RF: 0.5238. Paràmetres= {'bootstrap': True, 'criterion': 'entropy', 'max_depth': 1, 'max_features': 'sqrt', 'min_samples_leaf': 0.1, 'min_samples_split': 0.75, 'n_estimators': 500}
SVM: 0.5238. Paràmetres= {'C': 10, 'gamma': 0.1, 'kernel': 'linear', 'probability': True, 'shrinking': True, 'tol': 0.001}


  model = KerasClassifier(build_fn=MLP, verbose=0)


MLP: 0.843. Paràmetres= {'batch_size': 32, 'epochs': 1000, 'layer': [50, 25], 'lr': 0.001, 'n_features': 100}


  model = KerasClassifier(build_fn=CNN, verbose=0)


CNN: 0.8614. Paràmetres= {'batch_size': 32, 'epochs': 1000, 'filters': 64, 'lr': 0.001, 'n_features': 100}
Millor model a_synthetic_PD: <keras.engine.sequential.Sequential object at 0x000001B7FC71BB80>[ {'batch_size': 32, 'epochs': 1000, 'filters': 64, 'lr': 0.001, 'n_features': 100} ] AUC: 0.8614
Model i paràmetres guardats correctament.


In [18]:
os.environ['PYTHONHASHSEED'] = '0' 
np.random.seed(42) 
random.seed(42) 
tf.random.set_seed(42)

main(taxo_train=taxoS_PD, taxo_val=taxoS_val_PD, meta_train=meta_PD, meta_val=meta_val_PD, 
     k_NN=3, scoring="accuracy", bench="synreal", ID_model="a", disease="PD", 
     autoencoder_path="repro/models/altres_models/autoencoders/a_autoencoder_PD",
     synthetizer_path="repro/models/altres_models/synthetizers/a_synthetizer_PD.h5")

Carregant model i metadades synthetizer...


Sampling conditions: 100%|█████████████████████████████████████████████████████████████| 94/94 [00:02<00:00, 36.93it/s]


KNN: 0.381. Paràmetres= {'n_neighbors': 30, 'p': 1, 'weights': 'uniform'}
RF: 0.5238. Paràmetres= {'bootstrap': True, 'criterion': 'entropy', 'max_depth': 1, 'max_features': 'sqrt', 'min_samples_leaf': 0.1, 'min_samples_split': 0.75, 'n_estimators': 500}
SVM: 0.5714. Paràmetres= {'C': 0.1, 'gamma': 0.5, 'kernel': 'rbf', 'probability': True, 'shrinking': True, 'tol': 0.001}


  model = KerasClassifier(build_fn=MLP, verbose=0)


MLP: 0.7547. Paràmetres= {'batch_size': 32, 'epochs': 1000, 'layer': [25], 'lr': 0.001, 'n_features': 100}


  model = KerasClassifier(build_fn=CNN, verbose=0)


CNN: 0.7228. Paràmetres= {'batch_size': 32, 'epochs': 1000, 'filters': 64, 'lr': 0.001, 'n_features': 100}
Millor model a_synreal_PD: <keras.engine.sequential.Sequential object at 0x000001B78146FDC0>[ {'batch_size': 32, 'epochs': 1000, 'layer': [25], 'lr': 0.001, 'n_features': 100} ] AUC: 0.7547
Model i paràmetres guardats correctament.


### Models S1

In [19]:
os.environ['PYTHONHASHSEED'] = '0' 
np.random.seed(42) 
random.seed(42) 
tf.random.set_seed(42)

main(taxo_train=taxoS1_PD, taxo_val=taxoS1_val_PD, meta_train=meta_PD, meta_val=meta_val_PD, 
     k_NN=3, scoring="accuracy", bench="rawdata", ID_model="b", disease="PD")

Traceback (most recent call last):
  File "C:\Users\46375547A\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 761, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\46375547A\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 216, in __call__
    return self._score(
  File "C:\Users\46375547A\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 258, in _score
    y_pred = method_caller(estimator, "predict", X)
  File "C:\Users\46375547A\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 68, in _cached_call
    return getattr(estimator, method)(*args, **kwargs)
  File "C:\Users\46375547A\Anaconda3\lib\site-packages\sklearn\neighbors\_classification.py", line 214, in predict
    neigh_dist, neigh_ind = self.kneighbors(X)
  File "C:\Users\46375547A\Anaconda3\lib\site-packages\sklearn\neighbors\_base.py", line 727, in kneighbors
    raise ValueError(
ValueError: Expected n_neighbors <= n_samples,  but n_sample

Traceback (most recent call last):
  File "C:\Users\46375547A\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 761, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\46375547A\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 216, in __call__
    return self._score(
  File "C:\Users\46375547A\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 258, in _score
    y_pred = method_caller(estimator, "predict", X)
  File "C:\Users\46375547A\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 68, in _cached_call
    return getattr(estimator, method)(*args, **kwargs)
  File "C:\Users\46375547A\Anaconda3\lib\site-packages\sklearn\neighbors\_classification.py", line 214, in predict
    neigh_dist, neigh_ind = self.kneighbors(X)
  File "C:\Users\46375547A\Anaconda3\lib\site-packages\sklearn\neighbors\_base.py", line 727, in kneighbors
    raise ValueError(
ValueError: Expected n_neighbors <= n_samples,  but n_sample

KNN: 0.619. Paràmetres= {'n_neighbors': 76, 'p': 1, 'weights': 'uniform'}
RF: 0.5238. Paràmetres= {'bootstrap': True, 'criterion': 'entropy', 'max_depth': 1, 'max_features': 'sqrt', 'min_samples_leaf': 0.1, 'min_samples_split': 0.75, 'n_estimators': 500}
SVM: 0.5714. Paràmetres= {'C': 10, 'gamma': 0.5, 'kernel': 'rbf', 'probability': True, 'shrinking': True, 'tol': 0.001}


  model = KerasClassifier(build_fn=MLP, verbose=0)


MLP: 0.5749. Paràmetres= {'batch_size': 32, 'epochs': 1000, 'layer': [58], 'lr': 0.001, 'n_features': 460}


  model = KerasClassifier(build_fn=CNN, verbose=0)


CNN: 0.6072. Paràmetres= {'batch_size': 32, 'epochs': 1000, 'filters': 64, 'lr': 0.001, 'n_features': 460}
Millor model b_rawdata_PD: KNeighborsClassifier(n_neighbors=76, p=1)[ {'n_neighbors': 76, 'p': 1, 'weights': 'uniform'} ] AUC: 0.619
Model i paràmetres guardats correctament.


In [20]:
os.environ['PYTHONHASHSEED'] = '0' 
np.random.seed(42) 
random.seed(42) 
tf.random.set_seed(42)

main(taxo_train=taxoS1_PD, taxo_val=taxoS1_val_PD, meta_train=meta_PD, meta_val=meta_val_PD, 
     k_NN=3, scoring="accuracy", bench="encoded", ID_model="b", disease="PD")

INFO:tensorflow:Assets written to: repro/models/altres_models/autoencoders/b_autoencoder_PD\assets
Model i metadades autoencoder guardats correctament.


Traceback (most recent call last):
  File "C:\Users\46375547A\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 761, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\46375547A\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 216, in __call__
    return self._score(
  File "C:\Users\46375547A\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 258, in _score
    y_pred = method_caller(estimator, "predict", X)
  File "C:\Users\46375547A\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 68, in _cached_call
    return getattr(estimator, method)(*args, **kwargs)
  File "C:\Users\46375547A\Anaconda3\lib\site-packages\sklearn\neighbors\_classification.py", line 214, in predict
    neigh_dist, neigh_ind = self.kneighbors(X)
  File "C:\Users\46375547A\Anaconda3\lib\site-packages\sklearn\neighbors\_base.py", line 727, in kneighbors
    raise ValueError(
ValueError: Expected n_neighbors <= n_samples,  but n_sample

KNN: 0.5714. Paràmetres= {'n_neighbors': 30, 'p': 1, 'weights': 'uniform'}
RF: 0.5238. Paràmetres= {'bootstrap': True, 'criterion': 'entropy', 'max_depth': 1, 'max_features': 'sqrt', 'min_samples_leaf': 0.1, 'min_samples_split': 0.75, 'n_estimators': 500}
SVM: 0.5714. Paràmetres= {'C': 0.1, 'gamma': 0.5, 'kernel': 'rbf', 'probability': True, 'shrinking': True, 'tol': 0.001}


  model = KerasClassifier(build_fn=MLP, verbose=0)


MLP: 0.4798. Paràmetres= {'batch_size': 32, 'epochs': 1000, 'layer': [12], 'lr': 0.001, 'n_features': 100}


  model = KerasClassifier(build_fn=CNN, verbose=0)


CNN: 0.4892. Paràmetres= {'batch_size': 32, 'epochs': 1000, 'filters': 64, 'lr': 0.001, 'n_features': 100}
Millor model b_encoded_PD: KNeighborsClassifier(n_neighbors=30, p=1)[ {'n_neighbors': 30, 'p': 1, 'weights': 'uniform'} ] AUC: 0.5714
Model i paràmetres guardats correctament.


In [21]:
os.environ['PYTHONHASHSEED'] = '0' 
np.random.seed(42) 
random.seed(42) 
tf.random.set_seed(42)

main(taxo_train=taxoS1_PD, taxo_val=taxoS1_val_PD, meta_train=meta_PD, meta_val=meta_val_PD, 
     k_NN=3, scoring="accuracy", bench="synthetic", ID_model="b", disease="PD", 
     autoencoder_path="repro/models/altres_models/autoencoders/b_autoencoder_PD")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[column_name] = data[column_name].to_numpy().flatten()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[column_name] = data[column_name].to_numpy().flatten()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[column_name] = data[column_name].to_numpy().flatten()
A value is trying to be set 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[column_name] = data[column_name].to_numpy().flatten()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[column_name] = data[column_name].to_numpy().flatten()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[column_name] = data[column_name].to_numpy().flatten()
A value is trying to be set 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[column_name] = data[column_name].to_numpy().flatten()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[column_name] = data[column_name].to_numpy().flatten()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[column_name] = data[column_name].to_numpy().flatten()
A value is trying to be set 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[column_name] = data[column_name].to_numpy().flatten()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[column_name] = data[column_name].to_numpy().flatten()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[column_name] = data[column_name].to_numpy().flatten()
A value is trying to be set 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[column_name] = data[column_name].to_numpy().flatten()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[column_name] = data[column_name].to_numpy().flatten()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[column_name] = data[column_name].to_numpy().flatten()
A value is trying to be set 

Model i metadades synthetizer guardats correctament.


Sampling conditions: 100%|████████████████████████████████████████████████████████| 5000/5000 [00:24<00:00, 205.20it/s]


KNN: 0.4762. Paràmetres= {'n_neighbors': 36, 'p': 2, 'weights': 'uniform'}
RF: 0.619. Paràmetres= {'bootstrap': True, 'criterion': 'entropy', 'max_depth': 1, 'max_features': 'sqrt', 'min_samples_leaf': 0.2, 'min_samples_split': 0.5, 'n_estimators': 500}
SVM: 0.5238. Paràmetres= {'C': 0.1, 'gamma': 0.1, 'kernel': 'rbf', 'probability': True, 'shrinking': True, 'tol': 0.001}


  model = KerasClassifier(build_fn=MLP, verbose=0)


MLP: 0.7456. Paràmetres= {'batch_size': 32, 'epochs': 1000, 'layer': [50], 'lr': 0.001, 'n_features': 100}


  model = KerasClassifier(build_fn=CNN, verbose=0)


CNN: 0.7804. Paràmetres= {'batch_size': 32, 'epochs': 1000, 'filters': 64, 'lr': 0.001, 'n_features': 100}
Millor model b_synthetic_PD: <keras.engine.sequential.Sequential object at 0x000001B7F06D5D90>[ {'batch_size': 32, 'epochs': 1000, 'filters': 64, 'lr': 0.001, 'n_features': 100} ] AUC: 0.7804
Model i paràmetres guardats correctament.


In [22]:
os.environ['PYTHONHASHSEED'] = '0' 
np.random.seed(42) 
random.seed(42) 
tf.random.set_seed(42)

main(taxo_train=taxoS1_PD, taxo_val=taxoS1_val_PD, meta_train=meta_PD, meta_val=meta_val_PD, 
     k_NN=3, scoring="accuracy", bench="synreal", ID_model="b", disease="PD", 
     autoencoder_path="repro/models/altres_models/autoencoders/b_autoencoder_PD",
     synthetizer_path="repro/models/altres_models/synthetizers/b_synthetizer_PD.h5")

Carregant model i metadades synthetizer...


Sampling conditions: 100%|█████████████████████████████████████████████████████████████| 94/94 [00:01<00:00, 53.16it/s]


KNN: 0.5714. Paràmetres= {'n_neighbors': 82, 'p': 1, 'weights': 'uniform'}
RF: 0.5714. Paràmetres= {'bootstrap': True, 'criterion': 'entropy', 'max_depth': 2, 'max_features': 'sqrt', 'min_samples_leaf': 0.1, 'min_samples_split': 0.5, 'n_estimators': 500}
SVM: 0.6667. Paràmetres= {'C': 0.1, 'gamma': 0.5, 'kernel': 'rbf', 'probability': True, 'shrinking': True, 'tol': 0.001}


  model = KerasClassifier(build_fn=MLP, verbose=0)


MLP: 0.6221. Paràmetres= {'batch_size': 32, 'epochs': 1000, 'layer': [12], 'lr': 0.001, 'n_features': 100}


  model = KerasClassifier(build_fn=CNN, verbose=0)


CNN: 0.5957. Paràmetres= {'batch_size': 32, 'epochs': 1000, 'filters': 64, 'lr': 0.001, 'n_features': 100}
Millor model b_synreal_PD: SVC(C=0.1, gamma=0.5, probability=True)[ {'C': 0.1, 'gamma': 0.5, 'kernel': 'rbf', 'probability': True, 'shrinking': True, 'tol': 0.001} ] AUC: 0.6667
Model i paràmetres guardats correctament.
