In [25]:
#!pip install tensorflow
#!pip install sdv

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import sdv
import joblib
from joblib import dump, load

from sdv.tabular import GaussianCopula, CTGAN, CopulaGAN, TVAE
from sdv.sampling import Condition
from sdv.evaluation import evaluate

from numpy import random
from matplotlib.pyplot import figure
from sklearn.preprocessing import MinMaxScaler, RobustScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn import metrics
from tensorflow import keras
from tensorflow.keras import layers, losses
from tensorflow.keras.models import Model, Sequential, save_model, load_model
from tensorflow.keras.layers import Dense, Input, Conv1D, Activation, Reshape, Flatten, Dropout, MaxPooling1D
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, roc_curve

def which_taxo(file):
    """Load the test taxonomy table that matches a saved model filename.

    The first character of the filename selects the taxonomy variant
    ("a" -> "S", "b" -> "S1") and the substring "DM2" or "PD" selects the
    cohort. "DM2" is tested before "PD", as in the original branch order.

    Parameters
    ----------
    file : str
        Model filename, e.g. ``"a_encoded_DM2.joblib"``.

    Returns
    -------
    taxo : pandas.DataFrame
        Contents of ``repro/test/taxo<variant>_test_<cohort>.csv``.
    nom : str
        Display name ``"<cohort> <variant>"`` (e.g. ``"DM2 S"``); it is also
        printed.

    Raises
    ------
    ValueError
        If the filename does not start with "a"/"b" or contains neither
        "DM2" nor "PD". Previously this case failed with an
        ``UnboundLocalError`` (or ``IndexError`` for an empty name).
    """
    variants = {"a": "S", "b": "S1"}
    # file[:1] (rather than file[0]) so an empty name reaches the ValueError below.
    variant = variants.get(file[:1])
    cohort = next((name for name in ("DM2", "PD") if name in file), None)
    if variant is None or cohort is None:
        raise ValueError(
            "Cannot determine the test taxonomy for model file " + repr(file)
            + ": expected a name starting with 'a' or 'b' and containing 'DM2' or 'PD'."
        )

    taxo = pd.read_csv("repro/test/taxo" + variant + "_test_" + cohort + ".csv")
    nom = cohort + " " + variant
    print(nom)
    return taxo, nom

def filt_y_rows(taxo, meta):
    """Return the condition of each taxonomy row, looked up in the metadata.

    For every position ``i`` in ``range(len(taxo))`` the value
    ``taxo["sample"][i]`` is matched against ``meta["sample"]`` and the first
    matching ``meta["condition"]`` value is taken.

    Parameters
    ----------
    taxo : pandas.DataFrame
        Table with a ``"sample"`` column. Rows are accessed by the labels
        ``0 .. len(taxo) - 1``.
    meta : pandas.DataFrame
        Table with ``"sample"`` and ``"condition"`` columns.

    Returns
    -------
    list
        One condition value per row of ``taxo``, in row order.

    Raises
    ------
    IndexError
        If a sample of ``taxo`` has no matching row in ``meta``.
    """
    def _first_condition(row_label):
        sample_id = taxo["sample"][row_label]
        matching = meta["condition"][meta["sample"] == sample_id]
        # Only the first match is used when a sample appears more than once.
        return matching.iloc[0]

    return [_first_condition(row_label) for row_label in range(len(taxo))]

def drop_nas(taxo, meta):
    """Drop rows with missing values from features and labels together.

    The labels are attached as a ``"condition"`` column so that a single
    ``dropna`` removes a row when either a feature or its label is missing.
    The work is done on a copy: the caller's ``taxo`` is not modified
    (the previous implementation added the column to it in place).

    Parameters
    ----------
    taxo : pandas.DataFrame
        Feature table. Its first column is discarded from the result
        (presumably the sample identifier -- confirm against the CSV layout).
    meta : list or pandas.Series
        One label per row of ``taxo``.

    Returns
    -------
    taxo : pandas.DataFrame
        Complete rows only, without the first column and without the
        ``"condition"`` column.
    meta : pandas.Series
        The ``"condition"`` values of the rows that were kept.
    """
    complete_rows = taxo.assign(condition=meta).dropna()
    meta = complete_rows["condition"]
    # 1:-1 strips the leading column and the trailing "condition" column.
    taxo = complete_rows.iloc[:, 1:-1]
    return taxo, meta

def get_roc_curves(y_test, y_prob, pos_label, title=None, out_name=None):
    """Plot a ROC curve and save it as a PNG under ``repro/metrics/``.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_prob : array-like
        Scores passed to ``roc_curve`` as the positive-class score.
    pos_label : object
        Label treated as the positive class by ``roc_curve``.
    title : str, optional
        Text appended to the plot title. Defaults to the notebook-level
        variable ``nomi``, which is what this function always used before.
    out_name : str, optional
        Prefix of the output file name. Defaults to the notebook-level
        variable ``file``, which is what this function always used before.

    Returns
    -------
    None
        The figure is written to
        ``repro/metrics/<out_name>_roc_curve.png`` and then closed.
    """
    # Backward-compatible fallbacks: existing callers rely on these globals.
    if title is None:
        title = nomi
    if out_name is None:
        out_name = file

    fpr, tpr, _thresholds = roc_curve(y_test, y_prob, pos_label=pos_label)
    fig, ax = plt.subplots()
    try:
        ax.set_title("Corba ROC" + " " + title)
        # roc_curve returns the false-positive rate, i.e. 1 - specificity,
        # so the x axis must not be labelled as plain specificity.
        ax.set_xlabel("1 - Especificitat")
        ax.set_ylabel("Sensibilitat")
        ax.plot(fpr, tpr)
        ax.plot([0, 1], [0, 1], '-')  # chance diagonal for reference
        fig.savefig("repro/metrics/" + out_name + '_roc_curve.png')
    finally:
        # Close this specific figure even if plotting or saving fails, so
        # figures do not pile up while looping over many models.
        plt.close(fig)

def get_test_metrics(file, model, x_test, y_test, labels):
    """Evaluate one model on the test set and save its ROC curve.

    Parameters
    ----------
    file : str
        Model filename. When it contains ".h5" the output of
        ``model.predict`` is used directly as the score; otherwise the score
        is column 1 of ``model.predict_proba``.
    model : object
        Fitted model exposing ``predict`` (and ``predict_proba`` for
        non-".h5" files).
    x_test : array-like
        Test features.
    y_test : array-like
        True labels, expressed with the values in ``labels``.
    labels : list
        Two class names; predictions equal to 0 are mapped to ``labels[0]``
        and predictions equal to 1 to ``labels[1]``, which is also used as
        the positive label for the ROC curve.

    Returns
    -------
    tuple
        ``(accuracy, sensitivity, specificity, AUC)``.
    """
    raw_pred = model.predict(x_test)
    uses_predict_proba = ".h5" not in file
    y_prob = model.predict_proba(x_test)[:, 1] if uses_predict_proba else raw_pred

    # Round the raw predictions to 0/1 and translate them to class names.
    rounded = pd.DataFrame(np.around(raw_pred, 0).astype(int))
    y_pred = rounded.replace([0, 1], labels)

    # With two labels the matrix is [[tn, fp], [fn, tp]].
    (tn, fp), (fn, tp) = confusion_matrix(y_test, y_pred, labels=labels)
    acc = accuracy_score(y_test, y_pred)
    sens = tp / (tp + fn)
    espe = tn / (tn + fp)
    AUC = roc_auc_score(y_test, y_prob)

    get_roc_curves(y_test, y_prob, labels[1])
    return acc, sens, espe, AUC
    
def encode_data(x_test, autoencoder):
    """Encode the test features with an autoencoder and min-max scale them.

    Parameters
    ----------
    x_test : pandas.DataFrame
        Test features. They are converted to a float32 array with one extra
        trailing axis before being passed to the encoder.
    autoencoder : object
        Loaded model exposing a callable ``encoder`` whose result has a
        ``numpy()`` method.

    Returns
    -------
    pandas.DataFrame
        The encoded features after ``MinMaxScaler`` scaling.
    """
    encoder_input = np.expand_dims(x_test.to_numpy(), axis=2).astype('float32')
    latent = pd.DataFrame(autoencoder.encoder(encoder_input).numpy())
    # NOTE(review): the scaler is fitted on the very data it transforms, not
    # on training data -- confirm this matches how the models were trained.
    scaled = MinMaxScaler().fit(latent).transform(latent)
    return pd.DataFrame(scaled)

In [26]:
# Evaluate every saved model in repro/models/altres_models/ on its matching
# test set and write accuracy, sensitivity, specificity and AUC to a CSV.
MODELS_DIR = "repro/models/altres_models/"
METRICS_DIR = "repro/metrics"
# Directory entries that are not models and must not be evaluated.
SKIPPED_ENTRIES = ("autoencoders", "synthetizers", "params", "checkpoints")
# Filename token -> label suffix, tested in this order.
DATA_KIND_LABELS = (
    ("rawdata", "(Rawdata)"),
    ("encoded", "(Encoded)"),
    ("synthetic", "(Synthetic)"),
    ("synreal", "(Syn+Real)"),
)

# exist_ok avoids the separate exists() check (and its race).
os.makedirs(METRICS_DIR, exist_ok=True)

nom = []
nom_model = []
N = []
acc = []
sens = []
espe = []
AUC = []

# sorted(): os.listdir returns entries in arbitrary order; sorting keeps the
# row order of the results table reproducible between runs.
# NOTE: the loop variables must stay named `file` and `nomi` -- get_roc_curves
# reads these notebook-level names when it titles and saves each plot.
for file in sorted(os.listdir(MODELS_DIR)):
    # The original test chained `==` and bitwise `&`; because `&` binds
    # tighter than `==` it only worked by accident. Spell the intent out.
    if any(token in file for token in SKIPPED_ENTRIES):
        continue

    print(file)
    x_test, nomi = which_taxo(file)

    if "DM2" in file:
        cohort = "DM2"
    elif "PD" in file:
        cohort = "PD"
    else:
        raise ValueError("Model file " + repr(file) + " names neither 'DM2' nor 'PD'.")
    y = pd.read_csv("repro/metadades_" + cohort + ".csv")
    labels = ["Control", cohort]
    autoencoder_filename = MODELS_DIR + "autoencoders/" + file[0] + "_autoencoder_" + cohort

    # Resolve the label before any heavy work. Previously an unrecognised
    # name silently reused `nomi2` from the previous iteration (or raised
    # NameError on the first one).
    nomi2 = next((label for token, label in DATA_KIND_LABELS if token in file), None)
    if nomi2 is None:
        raise ValueError(
            "Model file " + repr(file)
            + " has no data-kind token (rawdata, encoded, synthetic, synreal)."
        )

    y_test = filt_y_rows(x_test, y)
    x_test, y_test = drop_nas(x_test, y_test)

    # Every model except the raw-data ones is fed autoencoder-encoded features.
    if "rawdata" not in file:
        autoencoder = tf.saved_model.load(autoencoder_filename)
        x_test = encode_data(x_test, autoencoder)

    nomi = nomi + " " + nomi2

    if ".joblib" in file:
        # joblib files are pickles: only load models produced by this project.
        model = joblib.load(MODELS_DIR + file)
    else:
        model = load_model(MODELS_DIR + file)

    N.append(len(x_test))
    nom.append(nomi)
    nom_model.append(str(model)[0:10])
    acci, sensi, espei, AUCi = get_test_metrics(file, model, x_test, y_test, labels)
    acc.append(acci)
    sens.append(sensi)
    espe.append(espei)
    AUC.append(AUCi)

results = pd.DataFrame({
    "Nom": nom,
    "n (test)": N,
    "Model": nom_model,
    "Exactitud": acc,
    "Sensibilitat": sens,
    "Especificitat": espe,
    "AUC": AUC,
})

results.to_csv(METRICS_DIR + "/metriques_models_alternatius.csv", index=False)

print("metrics done")

a_encoded_DM2.joblib
DM2 S
a_encoded_PD.h5
PD S
a_rawdata_DM2.joblib
DM2 S




a_rawdata_PD.h5
PD S
a_synreal_DM2.joblib
DM2 S
a_synreal_PD.h5
PD S
a_synthetic_DM2.h5
DM2 S
a_synthetic_PD.h5
PD S
b_encoded_DM2.joblib
DM2 S1
b_encoded_PD.joblib
PD S1
b_rawdata_DM2.joblib
DM2 S1




b_rawdata_PD.joblib
PD S1
b_synreal_DM2.h5




DM2 S1
b_synreal_PD.joblib
PD S1
b_synthetic_DM2.h5
DM2 S1
b_synthetic_PD.h5
PD S1
metrics done


In [27]:
# Metrics of the models evaluated on the raw (non-encoded) test features.
results.loc[lambda table: table["Nom"].str.contains('Rawdata')]

Unnamed: 0,Nom,n (test),Model,Exactitud,Sensibilitat,Especificitat,AUC
2,DM2 S (Rawdata),30,"SVC(C=5, g",0.566667,0.533333,0.6,0.64
3,PD S (Rawdata),21,<keras.eng,0.52381,1.0,0.0,0.663636
10,DM2 S1 (Rawdata),30,RandomFore,0.633333,0.8,0.466667,0.817778
11,PD S1 (Rawdata),21,KNeighbors,0.428571,0.090909,0.8,0.509091


In [28]:
# Metrics of the models whose name is tagged "(Encoded)".
results.loc[lambda table: table["Nom"].str.contains('Encoded')]

Unnamed: 0,Nom,n (test),Model,Exactitud,Sensibilitat,Especificitat,AUC
0,DM2 S (Encoded),30,KNeighbors,0.566667,0.8,0.333333,0.673333
1,PD S (Encoded),21,<keras.eng,0.714286,0.636364,0.8,0.718182
8,DM2 S1 (Encoded),30,KNeighbors,0.633333,0.8,0.466667,0.651111
9,PD S1 (Encoded),21,KNeighbors,0.619048,0.363636,0.9,0.559091


In [29]:
# Metrics of the models whose name is tagged "(Synthetic)".
results.loc[lambda table: table["Nom"].str.contains('Synthetic')]

Unnamed: 0,Nom,n (test),Model,Exactitud,Sensibilitat,Especificitat,AUC
6,DM2 S (Synthetic),30,<keras.eng,0.6,0.6,0.6,0.657778
7,PD S (Synthetic),21,<keras.eng,0.47619,0.0,1.0,0.518182
14,DM2 S1 (Synthetic),30,<keras.eng,0.5,0.533333,0.466667,0.626667
15,PD S1 (Synthetic),21,<keras.eng,0.380952,0.454545,0.3,0.463636


In [30]:
# Metrics of the models whose name is tagged "(Syn+Real)"; 'Real' matches only those rows.
results.loc[lambda table: table["Nom"].str.contains('Real')]

Unnamed: 0,Nom,n (test),Model,Exactitud,Sensibilitat,Especificitat,AUC
4,DM2 S (Syn+Real),30,KNeighbors,0.6,0.533333,0.666667,0.631111
5,PD S (Syn+Real),21,<keras.eng,0.47619,0.181818,0.8,0.627273
12,DM2 S1 (Syn+Real),30,<keras.eng,0.633333,0.666667,0.6,0.648889
13,PD S1 (Syn+Real),21,"SVC(C=0.1,",0.47619,0.0,1.0,0.281818
