# Reading and scaling

In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing

def fullRead(pathToTable, sep, anthro = False):

  df_renamed = pd.read_csv(pathToTable, sep = sep, encoding = "latin_1")


  # reading and merging    
  
  if anthro == True:
    df_anthro = pd.read_csv("data/chronicAnthropometricCardiovascularData.csv", sep=";", decimal=",")
    df_renamed = df_renamed.merge(df_anthro)

    # separating by time moment and renaming

    df_renamed["Weight"] = ""
    df_renamed["BMI"] = ""
    df_renamed["Fat"] = ""
    df_renamed["CVRI"] = ""
    df_renamed["Bpmin"] = ""
    df_renamed["Bpmax"] = ""
    df_renamed["Frec"] = ""

    for i in range(len(df_renamed)):

        if df_renamed.loc[i]["Time"] == "Initial":
            df_renamed.loc[i,"Weight"] = df_renamed.loc[i]["Peso inicial"]
            df_renamed.loc[i,"BMI"] = df_renamed.loc[i]["IMC Inicial"]
            df_renamed.loc[i,"Fat"] = df_renamed.loc[i]["Grasa inicial"]
            df_renamed.loc[i,"CVRI"] = df_renamed.loc[i]["IRCV inicial"] 
            df_renamed.loc[i,"Bpmin"] = df_renamed.loc[i]["Bpmin inicial"] 
            df_renamed.loc[i,"Bpmax"] = df_renamed.loc[i]["Bpmax inicial"] 
            df_renamed.loc[i,"Frec"] = df_renamed.loc[i]["Frec inicial"] 
                
        if df_renamed.loc[i]["Time"] == "Final":
        
            df_renamed.loc[i,"Weight"] = df_renamed.loc[i]["Peso final"]
            df_renamed.loc[i,"BMI"] = df_renamed.loc[i]["IMC Final"]
            df_renamed.loc[i,"Fat"] = df_renamed.loc[i]["Grasa final"]
            df_renamed.loc[i,"CVRI"] = df_renamed.loc[i]["IRCV Final"] 
            df_renamed.loc[i,"Bpmin"] = df_renamed.loc[i]["Bpmin final"] 
            df_renamed.loc[i,"Bpmax"] = df_renamed.loc[i]["Bpmax final"] 
            df_renamed.loc[i,"Frec"] = df_renamed.loc[i]["Frec final"] 
        
    df_renamed.drop(columns = ["Peso inicial", "Peso final", "Delta Peso", "Talla", "IMC Inicial", "IMC Final", "Delta IMC", "Grasa inicial", "Grasa final", "Delta Grasa", "IRCV Final", "IRCV inicial", "Bpmin final", "Bpmin inicial", "Bpmax final", "Bpmax inicial", "Frec final", "Frec inicial",], inplace=True )
  
  df_renamed.drop(columns = ["Unnamed: 0", "grouping"], inplace=True )
  df_renamed.fillna(0, inplace=True)
  return df_renamed

def scaling(df_read):
   
   scaler = preprocessing.MinMaxScaler()
   numCols = df_read.select_dtypes(include=np.number).drop("numVol",1).columns
   df_read[numCols] = scaler.fit_transform(df_read[numCols])
   return df_read

df = scaling(fullRead("data/plasmAnt_ord.csv",  sep = ",", anthro= True))

  numCols = df_read.select_dtypes(include=np.number).drop("numVol",1).columns


# Split and encoding

In [2]:
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
 
enc = OrdinalEncoder()
enc.fit(df[["Sweetener", "Sex", "Time"]])
df[["Sweetener", "Sex", "Time"]] = enc.transform(df[["Sweetener", "Sex",  "Time"]])

# Initial = 1, Final = 0

X, y = df[df["Time"] == 1].drop(["numVol", "Time"], axis=1), df[df["Time"] == 0].drop(['Time','numVol', 'Sweetener', 'Time', 'Sex', 'Weight','BMI', 'Fat', 'CVRI', 'Bpmin', 'Bpmax', 'Frec'], axis = 1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_full, y_full = df[df["Time"] == 1].drop(["numVol", "Time"], axis=1), df[df["Time"] == 0].drop(['Time','numVol', 'Sweetener', 'Time'], axis = 1)
X_fulltrain, X_fulltest, y_fulltrain, y_fulltest = train_test_split(X_full, y_full, test_size=0.3, random_state=42)

#X_test.to_csv("X_met_test_urineAnt.csv", index=False)
#X_fulltest.to_csv("X_full_test_urineAnt.csv",index=False)

# XGB MET

In [23]:
from xgboost import XGBRegressor
from sklearn.model_selection import RepeatedKFold, cross_val_score, GridSearchCV

xgbReg = XGBRegressor()
#xgbRegFit = xgbReg.fit(X_train, y_train)

param_grid = {'max_depth'        : [None, 1, 3, 5, 10, 20],
              'subsample'        : [0.5, 1],
              'learning_rate'    : [0.001, 0.01, 0.1],
              'booster'          : ['gbtree']
             }


grid_search = GridSearchCV(estimator = xgbReg, param_grid = param_grid, cv= 10, n_jobs=-1,
                           verbose=2)

grid_search.fit(X_train, y_train)
best_grid = grid_search.best_estimator_


cv = RepeatedKFold(n_splits=20, n_repeats=10, random_state=1)
n_scores = cross_val_score(best_grid, X_test, y_test,  scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
n_scores = np.absolute(n_scores)

print('MAE: %.3f (%.3f)' % (np.mean(n_scores), np.std(n_scores)))


Fitting 10 folds for each of 36 candidates, totalling 360 fits
MAE: 0.113 (0.035)


# XGB FULL

In [7]:
from xgboost import XGBRegressor
from sklearn.model_selection import RepeatedKFold, cross_val_score, GridSearchCV

xgbReg = XGBRegressor()
#xgbRegFit = xgbReg.fit(X_train, y_train)

param_grid = {'max_depth'        : [None, 1, 3, 5, 10, 20],
              'subsample'        : [0.5, 1],
              'learning_rate'    : [0.001, 0.01, 0.1],
              'booster'          : ['gbtree']
             }


grid_search = GridSearchCV(estimator = xgbReg, param_grid = param_grid, cv= 10, n_jobs=-1,
                           verbose=2)

grid_search.fit(X_fulltrain, y_fulltrain)
best_grid = grid_search.best_estimator_


cv = RepeatedKFold(n_splits=20, n_repeats=10, random_state=1)
n_scores = cross_val_score(best_grid, X_fulltest, y_fulltest,  scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
n_scores = np.absolute(n_scores)

print('MAE: %.3f (%.3f)' % (np.mean(n_scores), np.std(n_scores)))


Fitting 10 folds for each of 36 candidates, totalling 360 fits
MAE: 0.110 (0.023)


# More complex MLP

In [4]:
import numpy as np
from sklearn.model_selection import RepeatedKFold, GridSearchCV, cross_val_score
from scikeras.wrappers import KerasRegressor

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
import tensorflow as tf

tf.get_logger().setLevel('ERROR')

# fix random seed for reproducibility
seed = 7
tf.random.set_seed(seed)

epochs = [10, 50, 100]
batch_size = [10, 20, 40, 60, 80, 100]

param_grid = dict(batch_size=batch_size, epochs=epochs)


def get_model(n_inputs, n_outputs):
    model_nn = Sequential()
    model_nn.add(Dense(64, input_shape=(X.shape[1],),activation="relu"))
    model_nn.add(Dropout(0.5))
    model_nn.add(Dense(32, activation="relu"))
    model_nn.add(Dropout(0.5))
    model_nn.add(Dense(32, activation = "relu"))
    model_nn.add(Dropout(0.7))
    model_nn.add(Dense(16, activation = "relu"))
    model_nn.add(Dropout(0.9))
    model_nn.add(Dense(y.shape[1], activation='linear'))
    model_nn.compile(loss='mae', optimizer=tf.keras.optimizers.Adam())

    return model_nn

def evaluate_model(X, y):
 results = list()
 n_inputs, n_outputs = X.shape[1], y.shape[1]
 # define evaluation procedure
 cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# define modeld
 model_nn = KerasRegressor(model = get_model(n_inputs, n_outputs), optimizer=tf.keras.optimizers.Adam(), verbose=0)
 # fit model
 grid = GridSearchCV(estimator=model_nn, param_grid=param_grid, n_jobs=-1, cv=3, verbose=0)
 grid_result = grid.fit(X_train, y_train) # evaluate model on test set
 # summarize results
 best_grid = grid_result.best_estimator_
# define the evaluation procedure
 cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# evaluate the model and collect the scores
 n_scores = cross_val_score(best_grid, X, y, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
# force the scores to be positive
 n_scores = np.absolute(n_scores)
 print('MAE: %.3f (%.3f)' % (np.mean(n_scores), np.std(n_scores)))

evaluate_model(X, y)


MAE: 0.117 (0.013)
