# Reading, cleaning, scaling, enconding and spliting

## Functions

In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split, RepeatedKFold, cross_val_score, GridSearchCV
from xgboost import XGBRegressor
from sklearn.model_selection import 
import re 

def fullRead(pathToTable, sep, anthro = False):

  df_renamed = pd.read_csv(pathToTable, sep = sep, encoding = "latin_1")
  
  df_name = re.sub("_ord.csv","",(re.sub("data/", "" ,pathToTable)))
  # reading and merging    
  
  if anthro == True:
    df_anthro = pd.read_csv("data/chronicAnthropometricCardiovascularData.csv", sep=";", decimal=",")
    df_renamed = df_renamed.merge(df_anthro)

    # separating by time moment and renaming

    df_renamed["Weight"] = ""
    df_renamed["BMI"] = ""
    df_renamed["Fat"] = ""
    df_renamed["CVRI"] = ""
    df_renamed["Bpmin"] = ""
    df_renamed["Bpmax"] = ""
    df_renamed["Frec"] = ""

    for i in range(len(df_renamed)):

        if df_renamed.loc[i]["Time"] == "Initial":
            df_renamed.loc[i,"Weight"] = df_renamed.loc[i]["Peso inicial"]
            df_renamed.loc[i,"BMI"] = df_renamed.loc[i]["IMC Inicial"]
            df_renamed.loc[i,"Fat"] = df_renamed.loc[i]["Grasa inicial"]
            df_renamed.loc[i,"CVRI"] = df_renamed.loc[i]["IRCV inicial"] 
            df_renamed.loc[i,"Bpmin"] = df_renamed.loc[i]["Bpmin inicial"] 
            df_renamed.loc[i,"Bpmax"] = df_renamed.loc[i]["Bpmax inicial"] 
            df_renamed.loc[i,"Frec"] = df_renamed.loc[i]["Frec inicial"] 
                
        if df_renamed.loc[i]["Time"] == "Final":
        
            df_renamed.loc[i,"Weight"] = df_renamed.loc[i]["Peso final"]
            df_renamed.loc[i,"BMI"] = df_renamed.loc[i]["IMC Final"]
            df_renamed.loc[i,"Fat"] = df_renamed.loc[i]["Grasa final"]
            df_renamed.loc[i,"CVRI"] = df_renamed.loc[i]["IRCV Final"] 
            df_renamed.loc[i,"Bpmin"] = df_renamed.loc[i]["Bpmin final"] 
            df_renamed.loc[i,"Bpmax"] = df_renamed.loc[i]["Bpmax final"] 
            df_renamed.loc[i,"Frec"] = df_renamed.loc[i]["Frec final"] 
        
    df_renamed.drop(columns = ["Peso inicial", "Peso final", "Delta Peso", "Talla", "IMC Inicial", "IMC Final", "Delta IMC", "Grasa inicial", "Grasa final", "Delta Grasa", "IRCV Final", "IRCV inicial", "Bpmin final", "Bpmin inicial", "Bpmax final", "Bpmax inicial", "Frec final", "Frec inicial",], inplace=True )
  
  df_renamed.drop(columns = ["Unnamed: 0", "grouping"], inplace=True )
  df_renamed.fillna(0, inplace=True)
  return (df_renamed, df_name)

def scaling(df_read):
   
   scaler = preprocessing.MinMaxScaler()
   numCols = df_read.select_dtypes(include=np.number).drop("numVol",1).columns
   df_read[numCols] = scaler.fit_transform(df_read[numCols])
   return df_read

def encodingSplitting(df):
  df = pd.get_dummies(df, columns = ["Sweetener", "Sex", "Time"], drop_first=False)
  X_met, y_met = df[df["Time_Initial"] == 1].drop(["numVol", "Time_Initial", "Time_Final"], axis=1), df[df["Time_Final"] == 1].drop(['Sweetener_SA', 'Sweetener_ST','Sweetener_SU', 'Sex_MAN', 'Sex_WOMAN', 'Time_Final', 'Time_Initial','numVol', 'Weight','BMI', 'Fat', 'CVRI', 'Bpmin', 'Bpmax', 'Frec'], axis = 1)
  X_metTrain, X_metTest, y_metTrain, y_metTest = train_test_split(X_met, y_met, test_size=0.2, random_state=42)

  X_full, y_full = df[df["Time_Initial"] == 1].drop(["numVol", "Time_Initial", "Time_Final"], axis=1), df[df["Time_Final"] == 1].drop(['numVol','Sweetener_SA', 'Sweetener_ST','Sweetener_SU','Time_Final', 'Time_Initial'], axis = 1)
  X_fullTrain, X_fullTest, y_fullTrain, y_fullTest = train_test_split(X_full, y_full, test_size=0.3, random_state=42)

  return(X_met, y_met, X_metTrain, X_metTest, y_metTrain, y_metTest, X_full, y_full, X_fullTrain, X_fullTest, y_fullTrain, y_fullTest)


#df_PF = pd.get_dummies(scaling(fullRead("data/plasmFlav_ord.csv",  sep = ",", anthro= True)), columns = ["Sweetener", "Sex", "Time"], drop_first=False)
#df_PA = scaling(fullRead("data/plasmAnt_ord.csv",  sep = ",", anthro= True))
#df_UF = scaling(fullRead("data/urineFlav_ord.csv",  sep = ",", anthro= True))
#df_UA = scaling(fullRead("data/urineAnt_ord.csv",  sep = ",", anthro= True))
#X_test.to_csv("X_met_test_urineAnt.csv", index=False)
#X_fulltest.to_csv("X_full_test_urineAnt.csv",index=False)

def XGBReg (df, df_name, met):

    X_met, y_met, X_metTrain, X_metTest, y_metTrain, y_metTest, X_full, y_full, X_fullTrain, X_fullTest, y_fullTrain, y_fullTest = encodingSplitting(df)

    if (met):
        
        xgbReg = XGBRegressor()

        param_grid = {'max_depth'        : [None, 1, 3, 5, 10, 20],
                    'subsample'        : [0.5, 1],
                    'learning_rate'    : [0.001, 0.01, 0.1],
                    'booster'          : ['gbtree', 'gblinear', 'dart']
                    }


        grid_search = GridSearchCV(estimator = xgbReg, param_grid = param_grid, cv= 3, n_jobs=-1,
                                verbose=2)

        grid_search.fit(X_metTrain, y_metTrain)
        best_grid = grid_search.best_estimator_


        cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
        n_scores = cross_val_score(best_grid, X_metTest, y_metTest,  scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
        n_scores = np.absolute(n_scores)

        print("Only metabolic model " + df_name +' MAE: %.3f (%.3f)' % (np.mean(n_scores), np.std(n_scores)))
        MAE = (np.mean(n_scores), np.std(n_scores))
        return(grid_search, MAE)
    
    else:
        xgbReg = XGBRegressor()

        param_grid = {'max_depth'        : [None, 1, 3, 5, 10, 20],
                    'subsample'        : [0.5, 1],
                    'learning_rate'    : [0.001, 0.01, 0.1],
                    'booster'          : ['gbtree', 'gblinear', 'dart']
                    }


        grid_search = GridSearchCV(estimator = xgbReg, param_grid = param_grid, cv= 3, n_jobs=-1,
                                verbose=2)

        grid_search.fit(X_fullTrain, y_fullTrain)
        best_grid = grid_search.best_estimator_

        cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
        n_scores = cross_val_score(best_grid, X_fullTest, y_fullTest,  scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
        n_scores = np.absolute(n_scores)

        print("Full model "+ df_name + ' MAE: %.3f (%.3f)' % (np.mean(n_scores), np.std(n_scores)))
        MAE = (np.mean(n_scores), np.std(n_scores))
        return(grid_search, MAE)        
    
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import GridSearchCV
# import pickle

# define model

def randomForestReg(df, df_name, met = True):

    param_grid = {
    'bootstrap': [True],
    'max_depth': [80, 90, 100, 110],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]
}
    
    if (met):
        
        X_met, y_met, X_metTrain, X_metTest, y_metTrain, y_metTest, X_full, y_full, X_fullTrain, X_fullTest, y_fullTrain, y_fullTest = encodingSplitting(df)

        model = RandomForestRegressor()

        grid_search = GridSearchCV(estimator = model, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)

        grid_search.fit(X_metTrain, y_metTrain)
        best_grid = grid_search.best_estimator_
        # define the evaluation procedure
        cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
        # evaluate the model and collect the scores
        n_scores = cross_val_score(best_grid, X_metTest, y_metTest, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
        # force the scores to be positive
        n_scores = np.absolute(n_scores)
        # summarize performance

        #filename = 'rf_met_plasmAnt.pkl'
        #with open(filename, 'wb') as file:
        #    pickle.dump(grid_search.best_estimator_, file)

        print('Only Metabolic model ' + df_name + ': MAE: %.3f (%.3f)' % (np.mean(n_scores), np.std(n_scores)))
        MAE = (np.mean(n_scores), np.std(n_scores))
        return(grid_search, MAE)

    else:
            
        X_met, y_met, X_metTrain, X_metTest, y_metTrain, y_metTest, X_full, y_full, X_fullTrain, X_fullTest, y_fullTrain, y_fullTest = encodingSplitting(df)

        model = RandomForestRegressor()

        grid_search = GridSearchCV(estimator = model, param_grid = param_grid, 
                        cv = 3, n_jobs = -1, verbose = 2)

        grid_search.fit(X_fullTrain, y_fullTrain)
        best_grid = grid_search.best_estimator_
        # define the evaluation procedure
        cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
        # evaluate the model and collect the scores
        n_scores = cross_val_score(best_grid, X_fullTest, y_fullTest, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
        # force the scores to be positive
        n_scores = np.absolute(n_scores)
        # summarize performance

        #filename = 'rf_met_plasmAnt.pkl'
        #with open(filename, 'wb') as file:
        #    pickle.dump(grid_search.best_estimator_, file)

        print('Full model ' + df_name + ': MAE: %.3f (%.3f)' % (np.mean(n_scores), np.std(n_scores)))
        MAE = (np.mean(n_scores), np.std(n_scores))
        return(grid_search, MAE)

# mlp for multi-output regression

import numpy as np
import tensorflow as tf
# import pickle
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from scikeras.wrappers import KerasRegressor
from sklearn.model_selection import cross_val_score, RepeatedKFold, GridSearchCV

tf.get_logger().setLevel('ERROR')
# fix random seed for reproducibility

def MLPReg (df, df_name, met):
    
    if (met):
        seed = 7
        tf.random.set_seed(seed)

        X_met, y_met, X_metTrain, X_metTest, y_metTrain, y_metTest, X_full, y_full, X_fullTrain, X_fullTest, y_fullTrain, y_fullTest = encodingSplitting(df)


        epochs = [10, 50, 100]
        batch_size = [10, 20, 40, 60, 80, 100]

        param_grid = dict(batch_size=batch_size, epochs=epochs)

        # get the model
        def get_model(n_inputs, n_outputs):
            model_nn = Sequential()
            model_nn.add(Dense(64, input_shape=(n_inputs,),activation="relu"))
            model_nn.add(Dropout(0.5))
            model_nn.add(Dense(32, activation="relu"))
            model_nn.add(Dropout(0.5))
            model_nn.add(Dense(n_outputs, activation='linear'))
            model_nn.compile(loss='mae', optimizer=tf.keras.optimizers.Adam())

            return model_nn
        
        # evaluate a model using repeated k-fold cross-validation
        def evaluate_model(X, y):
            results = list()
            n_inputs, n_outputs = X.shape[1], y.shape[1]
            # define evaluation procedure
            cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
            # define modeld
            model_nn = KerasRegressor(model = get_model(n_inputs, n_outputs), optimizer=tf.keras.optimizers.Adam(), verbose=0)
            # fit model
            grid = GridSearchCV(estimator=model_nn, param_grid=param_grid, n_jobs=-1, cv=3, verbose=0)
            grid_result = grid.fit(X_train, y_train) # evaluate model on test set
            # summarize results
            best_grid = grid_result.best_estimator_
            # define the evaluation procedure
            cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
            # evaluate the model and collect the scores
            n_scores = cross_val_score(best_grid, X_test, y_test, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
            # force the scores to be positive
            n_scores = np.absolute(n_scores)

            # store the model
            # filename = 'mlp_met_plasmAnt.pkl'
            # with open(filename, 'wb') as file:
            #    pickle.dump(grid_result.best_estimator_, file)

            # summarize performance
            print('Only Metabolic model ' + df_name +'MAE: %.3f (%.3f)' % (np.mean(n_scores), np.std(n_scores)))
            MAE = (np.mean(n_scores), np.std(n_scores))
            return(best_grid, MAE)
            # evaluate model
            
        modelMLP, MAE = evaluate_model(X_met, y_met)
    
    else:
        seed = 7
        tf.random.set_seed(seed)

        X_met, y_met, X_metTrain, X_metTest, y_metTrain, y_metTest, X_full, y_full, X_fullTrain, X_fullTest, y_fullTrain, y_fullTest = encodingSplitting(df)


        epochs = [10, 50, 100]
        batch_size = [10, 20, 40, 60, 80, 100]

        param_grid = dict(batch_size=batch_size, epochs=epochs)

        # get the model
        def get_model(n_inputs, n_outputs):
            model_nn = Sequential()
            model_nn.add(Dense(64, input_shape=(n_inputs,),activation="relu"))
            model_nn.add(Dropout(0.5))
            model_nn.add(Dense(32, activation="relu"))
            model_nn.add(Dropout(0.5))
            model_nn.add(Dense(n_outputs, activation='linear'))
            model_nn.compile(loss='mae', optimizer=tf.keras.optimizers.Adam())

            return model_nn
        
        # evaluate a model using repeated k-fold cross-validation
        def evaluate_model(X, y):
            results = list()
            n_inputs, n_outputs = X.shape[1], y.shape[1]
            # define evaluation procedure
            cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
            # define modeld
            model_nn = KerasRegressor(model = get_model(n_inputs, n_outputs), optimizer=tf.keras.optimizers.Adam(), verbose=0)
            # fit model
            grid = GridSearchCV(estimator=model_nn, param_grid=param_grid, n_jobs=-1, cv=3, verbose=0)
            grid_result = grid.fit(X_train, y_train) # evaluate model on test set
            # summarize results
            best_grid = grid_result.best_estimator_
            # define the evaluation procedure
            cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
            # evaluate the model and collect the scores
            n_scores = cross_val_score(best_grid, X_test, y_test, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
            # force the scores to be positive
            n_scores = np.absolute(n_scores)

            # store the model
            # filename = 'mlp_met_plasmAnt.pkl'
            # with open(filename, 'wb') as file:
            #    pickle.dump(grid_result.best_estimator_, file)

            # summarize performance
            print('Full model ' + df_name + 'MAE: %.3f (%.3f)' % (np.mean(n_scores), np.std(n_scores)))
            MAE = (np.mean(n_scores), np.std(n_scores))
            return(best_grid, MAE)
            # evaluate model
            
        modelMLP, MAE = evaluate_model(X_full, y_full)    
    return (modelMLP, MAE)



# Running models

In [2]:
paths = ["plasmFlav_ord.csv", "plasmAnt_ord.csv", "urineFlav_ord.csv", "urineAnt_ord.csv"]
paths = ["data/" + s for s in paths]

resultsModelsRF = {}
resultsScoresRF = {}

for path in paths:
    df, df_name = fullRead(path, sep = ",", anthro = True)
    resultsModelsRF["modelRFMet{0}".format(df_name)], resultsScoresRF["modelRFMet{0}".format(df_name)] = randomForestReg(scaling(df), df_name, met = True)
    resultsModelsRF["modelRFFull{0}".format(df_name)], resultsScoresRF["modelRFFull{0}".format(df_name)] = randomForestReg(scaling(df), df_name, met = False)

#modelRFMet = randomForestReg(scaling(fullRead("data/plasmFlav_ord.csv",  sep = ",", anthro= True)[0]), met = True)
#modelRFFull = randomForestReg(scaling(fullRead("data/plasmFlav_ord.csv",  sep = ",", anthro= True)), met = False)

resultsModelsMLP = {}
resultsScoresMLP = {}

for path in paths:
    df, df_name = fullRead(path, sep = ",", anthro = True)
    resultsModelsMLP["modelMLPMet{0}".format(df_name)], resultsScoresMLP["modelMLPMet{0}".format(df_name)] = MLPReg(scaling(df), df_name, met = True)
    resultsModelsMLP["modelMLPFull{0}".format(df_name)], resultsScoresMLP["modelMLPFull{0}".format(df_name)] = MLPReg(scaling(df), df_name, met = False)

resultsModelsXGB = {}
resultsScoresXGB = {}

for path in paths:
    df, df_name = fullRead(path, sep = ",", anthro = True)
    resultsModelsXGB["modelXGBMet{0}".format(df_name)], resultsScoresXGB["modelXGBMet{0}".format(df_name)] = XGBReg(scaling(df), df_name, met = True)
    resultsModelsXGB["modelXGBFull{0}".format(df_name)], resultsScoresXGB["modelXGBFull{0}".format(df_name)] = XGBReg(scaling(df), df_name, met = False)



  numCols = df_read.select_dtypes(include=np.number).drop("numVol",1).columns


Fitting 3 folds for each of 288 candidates, totalling 864 fits
Only Metabolic model plasmFlav: MAE: 0.053 (0.020)
Fitting 3 folds for each of 288 candidates, totalling 864 fits


  numCols = df_read.select_dtypes(include=np.number).drop("numVol",1).columns


Full model plasmFlav: MAE: 0.107 (0.010)


  numCols = df_read.select_dtypes(include=np.number).drop("numVol",1).columns


Fitting 3 folds for each of 288 candidates, totalling 864 fits
Only Metabolic model plasmAnt: MAE: 0.103 (0.017)
Fitting 3 folds for each of 288 candidates, totalling 864 fits


  numCols = df_read.select_dtypes(include=np.number).drop("numVol",1).columns


Full model plasmAnt: MAE: 0.123 (0.011)


  numCols = df_read.select_dtypes(include=np.number).drop("numVol",1).columns


Fitting 3 folds for each of 288 candidates, totalling 864 fits
Only Metabolic model urineFlav: MAE: 0.058 (0.019)
Fitting 3 folds for each of 288 candidates, totalling 864 fits


  numCols = df_read.select_dtypes(include=np.number).drop("numVol",1).columns


Full model urineFlav: MAE: 0.092 (0.012)


  numCols = df_read.select_dtypes(include=np.number).drop("numVol",1).columns


Fitting 3 folds for each of 288 candidates, totalling 864 fits
Only Metabolic model urineAnt: MAE: 0.076 (0.014)
Fitting 3 folds for each of 288 candidates, totalling 864 fits


  numCols = df_read.select_dtypes(include=np.number).drop("numVol",1).columns


Full model urineAnt: MAE: 0.095 (0.008)


  numCols = df_read.select_dtypes(include=np.number).drop("numVol",1).columns


Only Metabolic model plasmFlavMAE: 0.053 (0.021)


  numCols = df_read.select_dtypes(include=np.number).drop("numVol",1).columns


Full model plasmFlavMAE: 0.091 (0.012)
Only Metabolic model plasmAntMAE: 0.096 (0.019)
Full model plasmAntMAE: 0.107 (0.017)
Only Metabolic model urineFlavMAE: 0.052 (0.021)
Full model urineFlavMAE: 0.078 (0.013)
Only Metabolic model urineAntMAE: 0.071 (0.019)
Full model urineAntMAE: 0.086 (0.013)
Fitting 3 folds for each of 108 candidates, totalling 324 fits
Only metabolic model plasmFlav MAE: 0.066 (0.020)
Fitting 3 folds for each of 108 candidates, totalling 324 fits
Full model plasmFlav MAE: 0.104 (0.009)
Fitting 3 folds for each of 108 candidates, totalling 324 fits
Only metabolic model plasmAnt MAE: 0.109 (0.018)
Fitting 3 folds for each of 108 candidates, totalling 324 fits
Full model plasmAnt MAE: 0.121 (0.015)
Fitting 3 folds for each of 108 candidates, totalling 324 fits
Only metabolic model urineFlav MAE: 0.071 (0.026)
Fitting 3 folds for each of 108 candidates, totalling 324 fits
Full model urineFlav MAE: 0.078 (0.015)
Fitting 3 folds for each of 108 candidates, totalling 3

# Save models and scores

In [3]:
import pickle

with open('resultsScoresXGB.pkl', 'wb') as fp:
    pickle.dump(resultsScoresXGB, fp)
    print('dictionary saved successfully to file')

with open('resultsModelsXGB.pkl', 'wb') as fp:
    pickle.dump(resultsModelsXGB, fp)
    print('dictionary saved successfully to file')

with open('resultsModelsRF.pkl', 'wb') as fp:
    pickle.dump(resultsModelsRF, fp)
    print('dictionary saved successfully to file')

with open('resultsScoresRF.pkl', 'wb') as fp:
    pickle.dump(resultsScoresRF, fp)
    print('dictionary saved successfully to file')

with open('resultsModelsMLP.pkl', 'wb') as fp:
    pickle.dump(resultsModelsMLP, fp)
    print('dictionary saved successfully to file')

with open('resultsScoresMLP.pkl', 'wb') as fp:
    pickle.dump(resultsScoresMLP, fp)
    print('dictionary saved successfully to file')    

dictionary saved successfully to file
dictionary saved successfully to file
dictionary saved successfully to file
dictionary saved successfully to file
dictionary saved successfully to file
dictionary saved successfully to file


# Load models and scores

In [2]:
import pickle

names = ["resultsScores", "resultsModels"]
models = ["RF", "XGB", "MLP"]

for i in [name+model for name in names for model in models]:
    with open (i+".pkl", 'rb') as f:
        i = pickle.load(f)

ValueError: node array from the pickle has an incompatible dtype:
- expected: [('left_child', '<i8'), ('right_child', '<i8'), ('feature', '<i8'), ('threshold', '<f8'), ('impurity', '<f8'), ('n_node_samples', '<i8'), ('weighted_n_node_samples', '<f8')]
- got     : {'names':['left_child','right_child','feature','threshold','impurity','n_node_samples','weighted_n_node_samples','missing_go_to_left'], 'formats':['<i8','<i8','<i8','<f8','<f8','<i8','<f8','u1'], 'offsets':[0,8,16,24,32,40,48,56], 'itemsize':64}

# Feature Selection experiments

SelectFromModel method

In [85]:
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import mean_absolute_error
import numpy as np
from sklearn.ensemble import RandomForestRegressor

def randomForestRegCV(X_train, y_train):
    model = RandomForestRegressor()
    param_grid = {
    'bootstrap': [True],
    'max_depth': [80, 90, 100, 110],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]
    }
    grid_search = GridSearchCV(estimator = model, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)

    grid_search.fit(X_train, y_train)
    best_grid = grid_search.best_estimator_
    return(best_grid)
    
def featureSelectionCustom(df, df_name, modelDict, MLmethod, met = True):

    X_met, y_met, X_metTrain, X_metTest, y_metTrain, y_metTest, X_full, y_full, X_fullTrain, X_fullTest, y_fullTrain, y_fullTest = encodingSplitting(df)

    if (met):
        X_train, y_train, X_test, y_test = X_metTrain, y_metTrain, X_metTest, y_metTest
        model_name = "model" +  MLmethod + "Met"+ df_name 
    else: 
        X_train, y_train, X_test, y_test = X_fullTrain, y_fullTrain, X_fullTest, y_fullTest
        model_name = "model" +  MLmethod + "Full" + df_name
    
    # grid_search_testing = resultsModelsRF["modelRFFullplasmAnt"] # model PF_met
    

    grid_search_testing = modelDict[model_name]

    model = grid_search_testing.best_estimator_
    print("##### " + model_name + " #####")
    thresholds = np.sort(model.feature_importances_)
    results = pd.DataFrame()

    for thresh in thresholds:
        # select features using threshold
        selection = SelectFromModel(model, threshold=thresh, prefit=True)
        # feature_idx = selection.get_support()
        
        # print(X_train.columns[feature_idx])
        # print("threshold" + str(thresh))
        select_X_train = selection.transform(X_train)
        # train model
        selection_model = RandomForestRegressor() #randomForestRegCV(select_X_train, y_fullTrain)
        selection_model.fit(select_X_train, y_train)
        # eval model
        select_X_test = selection.transform(X_test)
        y_pred = selection_model.predict(select_X_test)
        predictions = [value for value in y_pred]
        mae = mean_absolute_error(y_test, predictions)
        result1 = pd.DataFrame({"thresh": thresh, "n":select_X_train.shape[1], "MAE":mae}, index = [30 - select_X_train.shape[1]])
        results = pd.concat([results, result1])
        # print("Thresh=%.3f, n=%d, MAE: %.4f" % (thresh, select_X_train.shape[1], accuracy))

    return (results)    

paths = ["plasmFlav_ord.csv", "plasmAnt_ord.csv", "urineFlav_ord.csv", "urineAnt_ord.csv"]
paths = ["data/" + s for s in paths]

resultsFeatureSelectionRFMet = {}
resultsFeatureSelectionRFFull = {}
#resultsScoresXGB = {}

for path in paths:
     
    df, df_name = fullRead(path, sep = ",", anthro = True)
   
    print ("--------")
    print ("starting df " + df_name)
    print ("--------")
   
    resultsFeatureSelectionRFMet["featureSelectionRFMet{0}".format(df_name)]= featureSelectionCustom(scaling(df), df_name, modelDict =  resultsModelsRF, MLmethod= "RF", met = True)
    resultsFeatureSelectionRFFull["featureSelectionRFMet{0}".format(df_name)]= featureSelectionCustom(scaling(df), df_name, modelDict =  resultsModelsRF, MLmethod= "RF", met = False)
    
    print ("--------")
    print ("ended df")
    print ("--------")
    
    #resultsModelsXGB["modelXGBFull{0}".format(df_name)], resultsScoresXGB["modelXGBFull{0}".format(df_name)] = XGBReg(scaling(df), df_name, met = False)

--------
starting df plasmFlav
--------
##### modelRFMetplasmFlav #####
##### modelRFFullplasmFlav #####
--------
ended df
--------
--------
starting df plasmAnt
--------
##### modelRFMetplasmAnt #####
##### modelRFFullplasmAnt #####
--------
ended df
--------
--------
starting df urineFlav
--------
##### modelRFMeturineFlav #####
##### modelRFFullurineFlav #####
--------
ended df
--------
--------
starting df urineAnt
--------
##### modelRFMeturineAnt #####
##### modelRFFullurineAnt #####
--------
ended df
--------


In [87]:
resultsFeatureSelectionRFMet

{'featureSelectionRFMetplasmFlav':       thresh   n       MAE
 13  0.006681  17  0.059640
 14  0.009412  16  0.059257
 15  0.010632  15  0.057757
 16  0.012401  14  0.056980
 17  0.021008  13  0.058249
 18  0.021569  12  0.057027
 19  0.052921  11  0.056726
 20  0.055310  10  0.058157
 21  0.056185   9  0.058601
 22  0.073166   8  0.057695
 23  0.075995   7  0.056417
 24  0.086433   6  0.062026
 25  0.087071   5  0.058898
 26  0.093950   4  0.060619
 27  0.098331   3  0.062081
 28  0.101352   2  0.064049
 29  0.137583   1  0.068315,
 'featureSelectionRFMetplasmAnt':        thresh   n       MAE
 -1   0.000000  31  0.111678
  0   0.007219  30  0.113253
  1   0.008098  29  0.115496
  2   0.010562  28  0.115267
  3   0.014207  27  0.112872
  4   0.015238  26  0.113640
  5   0.020667  25  0.114611
  6   0.021405  24  0.112578
  7   0.022977  23  0.115560
  8   0.024787  22  0.113786
  9   0.026222  21  0.113813
  10  0.027375  20  0.113637
  11  0.027488  19  0.114491
  12  0.029276  18  0.

In [79]:
thresholdSelected = results.sort_values(by= "MAE", ascending = True).head(n=1)["thresh"]

# Si hay que reentrenar:



# select features using threshold
selection = SelectFromModel(model, threshold=thresholdSelected, prefit=True)
feature_idx = selection.get_support()
print(X_train.columns[feature_idx])
select_X_train = selection.transform(X_train)
# train model
selection_model = RandomForestRegressor()
selection_model.fit(select_X_train, y_train)


Index(['HE.G', 'N.G', 'Weight', 'BMI', 'Fat', 'Bpmin', 'Frec'], dtype='object')


In [1]:
import eli5 
from eli5.sklearn import PermutationImportance
perm_base = PermutationImportance(modelMLPMet, random_state = 1).fit(X, y)
eli5.show_weights(perm_base, feature_names = X.columns.tolist())

ImportError: cannot import name 'if_delegate_has_method' from 'sklearn.utils.metaestimators' (c:\Users\dres2\anaconda3\Lib\site-packages\sklearn\utils\metaestimators.py)

In [22]:
import eli5 
from eli5.sklearn import PermutationImportance
perm_base = PermutationImportance(uwu.best_estimator_, random_state = 1).fit(X_fulltrain, y_fulltrain)
eli5.show_weights(perm_base, feature_names = X_fulltrain.columns.tolist())

Weight,Feature
0.0905  ± 0.0166,Sex
0.0336  ± 0.0154,Sweetener
0.0169  ± 0.0053,Weight
0.0153  ± 0.0060,BMI
0.0080  ± 0.0031,Fat
0.0050  ± 0.0025,CVRI
0.0026  ± 0.0016,Bpmin
0.0026  ± 0.0009,Bpmax
0.0024  ± 0.0019,VA.GS
0.0022  ± 0.0016,CA


In [None]:
# Full parameters gridsearch
# DONT EXECUTE
'''
from numpy import mean
from numpy import std
from sklearn.datasets import make_regression
from sklearn.model_selection import RepeatedKFold
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.constraints import MaxNorm
from sklearn.model_selection import GridSearchCV

from scikeras.wrappers import KerasRegressor
import tensorflow as tf


# fix random seed for reproducibility
seed = 7
tf.random.set_seed(seed)

init_mode = ['uniform', 'lecun_uniform', 'normal', 'zero', 'glorot_normal', 'glorot_uniform', 'he_normal', 'he_uniform']
learn_rate = [0.001, 0.01, 0.1, 0.2, 0.3]
momentum = [0.0, 0.2, 0.4, 0.6, 0.8, 0.9]
batch_size = [10, 20, 40, 60, 80, 100]
epochs = [10, 50, 100]
dropout_rate = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
optimizer = ['SGD', 'RMSprop', 'Adagrad', 'Adadelta', 'Adam', 'Adamax', 'Nadam']
activation = ['softmax', 'softplus', 'softsign', 'relu', 'tanh', 'sigmoid', 'hard_sigmoid', 'linear']
weight_constraint = [1.0, 2.0, 3.0, 4.0, 5.0]

param_grid = dict(batch_size=batch_size, epochs=epochs, optimizer=optimizer, optimizer__learning_rate=learn_rate, optimizer__momentum=momentum,
                  model__init_mode=init_mode, model__activation=activation, model__dropout_rate=dropout_rate, model__weight_constraint=weight_constraint)

# get the model
def get_model(n_inputs, n_outputs, dropout_rate, weight_constraint, activation="relu", init_mode='uniform'):
    model_nn = Sequential()
    model_nn.add(Dense(64, input_shape=(X.shape[1],),activation=activation, kernel_initializer=init_mode, kernel_constraint=MaxNorm(weight_constraint)))
    model_nn.add(Dropout(dropout_rate))
    model_nn.add(Dense(32, activation=activation, kernel_initializer=init_mode))
    model_nn.add(Dropout(dropout_rate))
    model_nn.add(Dense(y.shape[1], activation='linear', kernel_initializer=init_mode))
    return model_nn
 
# evaluate a model using repeated k-fold cross-validation
def evaluate_model(X, y):
 results = list()
 n_inputs, n_outputs = X.shape[1], y.shape[1]
 # define evaluation procedure
 cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# define modeld
 model_nn = KerasRegressor(model = get_model) #(n_inputs, n_outputs, activation="relu", init_mode='uniform'))
 # fit model
 grid = GridSearchCV(estimator=model_nn, param_grid=param_grid, n_jobs=-1, cv=3)
 grid_result = grid.fit(X_train, y_train) # evaluate model on test set
 mae = model_nn.evaluate(X_test, y_test)
 # store result
 print('>%.3f' % mae)
 results.append(mae)
 return results
 
# evaluate model
results = evaluate_model(X, y)
# summarize performance
print('MAE: %.3f (%.3f)' % (mean(results), std(results)))
'''