# Model selection

## Libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np



from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,r2_score,mean_squared_error,mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, GridSearchCV

from xgboost import XGBRegressor

  from pandas import MultiIndex, Int64Index


## Load data

In [4]:
# Read the two processed CSV files into data_m and data_f
data_m, data_f = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../../data/processed/data_m_v1.csv',
        '../../data/processed/data_f_v1.csv',
    )
)


In [5]:
# Display the frame loaded from data_m_v1.csv to inspect its columns and values
data_m

Unnamed: 0,edad,peso,talla,per_brazo_rel,per_brazo_ten,per_antebrazo,per_torax,per_cintura,per_cadera,per_muslo_max,per_muslo_medial,per_pantorrilla,masa_piel,masa_adiposa,masa_muscular,masa_residual,masa_osea
0,-1.036390,-1.129647,-1.041523,-0.245085,-0.364953,-0.791100,-0.661568,-1.015423,-1.182268,-1.117874,-1.535150,-1.224345,3.585589,17.189072,32.433673,7.148282,7.474786
1,-0.425749,-1.106688,0.645564,-1.516202,-2.004565,-1.668563,-2.015216,-1.005378,-0.498441,-1.117874,-1.422207,-0.332794,3.764521,24.290302,25.310796,6.384089,9.142289
2,0.033609,1.495342,-1.424952,3.133412,2.974995,2.718753,1.299323,1.355101,1.642233,2.023883,2.033875,1.681451,4.229962,39.435706,41.820398,11.560786,10.609290
3,-0.378883,0.576979,-0.581408,0.357024,0.788847,0.757365,0.413759,0.551533,0.675956,0.750198,0.633371,1.483329,4.106494,28.820770,38.656728,10.132910,10.732621
4,0.297577,-0.326079,0.338821,0.055969,0.667394,0.602518,-0.092277,-0.000919,-0.394381,-0.948049,-0.518656,-0.762059,3.965215,17.866439,37.797663,9.785320,8.850928
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148,-1.340331,-2.178113,-0.734780,-1.783806,-1.609843,-1.823410,-2.154376,-2.150461,-2.371532,-2.285419,-2.619411,-2.776305,3.280333,12.413889,23.242262,5.378595,7.311850
149,-0.919913,-1.091382,-1.041523,-0.010932,-0.243501,-0.326561,-0.408550,-0.854709,-1.122805,-0.948049,-0.902665,-1.158304,3.596988,16.477752,34.365242,7.527648,7.474786
150,0.968523,0.148409,-0.734780,-0.077832,0.120857,0.137979,0.666777,0.551533,-0.290320,-0.098926,0.249362,-0.002590,3.977372,16.785899,41.176544,10.735525,9.292465
151,1.273154,1.495342,-1.194895,1.795393,1.760468,1.531597,1.742105,2.048177,1.478709,1.068619,0.994792,1.549370,4.258235,29.831738,44.134553,13.106154,11.448103


## Features and targets

In [6]:
# The five mass columns are the regression targets; every other column is a feature.
y_columns = ['masa_piel', 'masa_adiposa', 'masa_muscular', 'masa_residual',
       'masa_osea']
x_columns = data_m.columns.drop(y_columns)

# Feature / target arrays for data_m
X_m = data_m[x_columns].to_numpy()
y_m = data_m[y_columns].to_numpy()

# Feature / target arrays for data_f (same column selection as data_m)
X_f = data_f[x_columns].to_numpy()
y_f = data_f[y_columns].to_numpy()


In [7]:
# Show the feature columns that remain after dropping the five target columns
x_columns

Index(['edad', 'peso', 'talla', 'per_brazo_rel', 'per_brazo_ten',
       'per_antebrazo', 'per_torax', 'per_cintura', 'per_cadera',
       'per_muslo_max', 'per_muslo_medial', 'per_pantorrilla'],
      dtype='object')

## Split data

In [8]:
# Hold out 25% of the rows of each dataset as a test set,
# using the same fixed seed for both splits.
split_options = {'test_size': 0.25, 'random_state': 42}

x_train_m, x_test_m, y_train_m, y_test_m = train_test_split(X_m, y_m, **split_options)
x_train_f, x_test_f, y_train_f, y_test_f = train_test_split(X_f, y_f, **split_options)

## Model

### Masc

In [10]:
# Fit one XGBRegressor per target column on the `_m` training split and
# report R2 and MAE on the matching test split, followed by their averages.
R2s = []
MAEs = []
for index, target in enumerate(y_columns):
    # A fresh estimator per target keeps the fits independent of each other.
    model = XGBRegressor()
    model.fit(x_train_m, y_train_m[:, index])

    y_pred = model.predict(x_test_m)
    R2 = r2_score(y_test_m[:, index], y_pred)
    MAE = mean_absolute_error(y_test_m[:, index], y_pred)
    R2s.append(R2)
    MAEs.append(MAE)

    print(f'{target}  R2 = {round(R2,3)} and MAE = {round(MAE,3)}')
    print('-'*33)
print('='*33)
# Unweighted mean of the per-target scores
print(f'R2 prom = {np.mean(R2s)}')
print(f'MAE prom = {np.mean(MAEs)}')

masa_piel  R2 = 0.973 and MAE = 0.04
---------------------------------
masa_adiposa  R2 = 0.547 and MAE = 3.194
---------------------------------
masa_muscular  R2 = 0.927 and MAE = 1.253
---------------------------------
masa_residual  R2 = 0.844 and MAE = 0.398
---------------------------------
masa_osea  R2 = 0.835 and MAE = 0.364
---------------------------------
R2 prom = 0.8252097853835245
MAE prom = 1.049855384538565


### Fem

In [11]:
# Fit one XGBRegressor per target column on the `_f` training split and
# report R2 and MAE on the matching test split, followed by their averages.
R2s = []
MAEs = []
for index, target in enumerate(y_columns):
    # A fresh estimator per target keeps the fits independent of each other.
    model = XGBRegressor()
    model.fit(x_train_f, y_train_f[:, index])

    y_pred = model.predict(x_test_f)
    R2 = r2_score(y_test_f[:, index], y_pred)
    MAE = mean_absolute_error(y_test_f[:, index], y_pred)
    R2s.append(R2)
    MAEs.append(MAE)

    print(f'{target}  R2 = {round(R2,3)} and MAE = {round(MAE,3)}')
    print('-'*33)
print('='*33)
# Unweighted mean of the per-target scores
print(f'R2 prom = {np.mean(R2s)}')
print(f'MAE prom = {np.mean(MAEs)}')

masa_piel  R2 = 0.965 and MAE = 0.047
---------------------------------
masa_adiposa  R2 = 0.689 and MAE = 3.07
---------------------------------
masa_muscular  R2 = 0.543 and MAE = 1.93
---------------------------------
masa_residual  R2 = 0.935 and MAE = 0.389
---------------------------------
masa_osea  R2 = 0.821 and MAE = 0.341
---------------------------------
R2 prom = 0.7905751115991425
MAE prom = 1.155373048986771


## Using CV

### Masc

In [15]:
# 3-fold cross-validation of a default XGBRegressor on the `_m` training split,
# one target column at a time; prints the fold scores, their mean and their std.
for index, target in enumerate(y_columns):
    model = XGBRegressor()
    scores = cross_val_score(
        estimator=model,
        X=x_train_m,
        y=y_train_m[:, index],
        cv=3,
    )
    print(target, scores, scores.mean(), scores.std(), sep='\n')

masa_piel
[0.94386078 0.95285136 0.97309913]
0.9566037569448529
0.012227857268131697
masa_adiposa
[0.54952109 0.54813011 0.69437062]
0.597340605680102
0.06861293195462247
masa_muscular
[0.8880033  0.83353388 0.86162774]
0.8610549754550467
0.022240732790178616
masa_residual
[0.96907328 0.89535385 0.93899065]
0.934472591373201
0.030264921030690624
masa_osea
[0.86650104 0.92250793 0.85699722]
0.8820020649760644
0.028903570526312337


### Fem

In [16]:
# 3-fold cross-validation of a default XGBRegressor on the `_f` training split,
# one target column at a time; prints the fold scores, their mean and their std.
for index, target in enumerate(y_columns):
    model = XGBRegressor()
    scores = cross_val_score(
        estimator=model,
        X=x_train_f,
        y=y_train_f[:, index],
        cv=3,
    )
    print(target, scores, scores.mean(), scores.std(), sep='\n')

masa_piel
[0.96611843 0.95367223 0.97435102]
0.9647138938972996
0.008500299384797006
masa_adiposa
[0.74599442 0.75743354 0.67140937]
0.7249457751163803
0.038142919878538385
masa_muscular
[0.85072837 0.6871931  0.69925301]
0.7457248257276324
0.07441177422693279
masa_residual
[0.91676944 0.75287138 0.88393444]
0.8511917552178719
0.07080351375323186
masa_osea
[0.81480245 0.75717231 0.83665104]
0.8028752672825511
0.03352521840374243


## Using Grid Search CV

In [58]:
# Candidate values of the `alpha` hyper-parameter explored by the grid searches
grid_param = dict(alpha=[0.01, 0.03, 0.1, 0.3, 1, 3, 10])

### Masc

In [None]:
# Grid-search `grid_param` with 7-fold CV (scored by R2) on the `_m` training split,
# one target column at a time, printing the best score and parameters found.
model = XGBRegressor()
for index, target in enumerate(y_columns):
    gd_sr = GridSearchCV(
        model,
        grid_param,
        scoring='r2',
        n_jobs=-1,  # use every available core
        cv=7,
    )
    gd_sr.fit(x_train_m, y_train_m[:, index])
    best_parameters = gd_sr.best_params_
    print(target, f'Best R2: {gd_sr.best_score_}', best_parameters, '-'*33, sep='\n')


### Fem

In [None]:
# Grid-search `grid_param` with 7-fold CV (scored by R2) on the `_f` training split,
# one target column at a time, printing the best score and parameters found.
# The estimator is XGBRegressor, which is the only regressor this notebook imports
# and the one used by every other cell.
model = XGBRegressor()
for index, target in enumerate(y_columns):
    gd_sr = GridSearchCV(estimator=model,
                        param_grid=grid_param,
                        scoring='r2',
                        cv=7,
                        n_jobs=-1)
    gd_sr.fit(x_train_f, y_train_f[:, index])
    best_parameters = gd_sr.best_params_
    print(target)
    print(f'Best R2: {gd_sr.best_score_}')
    print(best_parameters)
    print('-'*33)
