# Model selection

## Libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np



from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,r2_score,mean_squared_error,mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, GridSearchCV

from sklearn.linear_model import LinearRegression, Ridge
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor

  from pandas import MultiIndex, Int64Index


## Load data

In [2]:
# Read the two processed datasets. The `_m` / `_f` suffixes presumably stand for
# the male / female subsets (the sections below are labelled "Masc" and "Fem").
data_m, data_f = (
    pd.read_csv('../../data/processed/data_m_v2.csv'),
    pd.read_csv('../../data/processed/data_f_v2.csv'),
)


### Split by sex

### Normalization

In [3]:
# With normalization it is harder to interpret the results.
# The variables' scales are not very different, so I will leave the data without normalization.



In [4]:
# Bare last expression: renders the `data_m` DataFrame as this cell's output.
data_m

Unnamed: 0,edad,peso,talla,per_brazo_rel,per_brazo_ten,per_antebrazo,per_torax,per_cintura,per_cadera,per_muslo_max,...,talla_2,per_brazo_rel_2,per_brazo_ten_2,per_antebrazo_2,per_torax_2,per_cintura_2,per_cadera_2,per_muslo_max_2,per_muslo_medial_2,per_pantorrilla_2
0,-1.036390,-1.129647,-1.041523,-0.245085,-0.364953,-0.791100,-0.661568,-1.015423,-1.182268,-1.117874,...,1.084770,0.060067,0.133191,0.625839,0.437672,1.031083e+00,1.397758,1.249642,2.356687,1.499021
1,-0.425749,-1.106688,0.645564,-1.516202,-2.004565,-1.668563,-2.015216,-1.005378,-0.498441,-1.117874,...,0.416753,2.298869,4.018279,2.784104,4.061094,1.010785e+00,0.248444,1.249642,2.022671,0.110752
2,0.033609,1.495342,-1.424952,3.133412,2.974995,2.718753,1.299323,1.355101,1.642233,2.023883,...,2.030488,9.818270,8.850597,7.391618,1.688240,1.836298e+00,2.696930,4.096102,4.136647,2.827279
3,-0.378883,0.576979,-0.581408,0.357024,0.788847,0.757365,0.413759,0.551533,0.675956,0.750198,...,0.338036,0.127466,0.622280,0.573601,0.171197,3.041890e-01,0.456917,0.562797,0.401159,2.200264
4,0.297577,-0.326079,0.338821,0.055969,0.667394,0.602518,-0.092277,-0.000919,-0.394381,-0.948049,...,0.114800,0.003133,0.445415,0.363028,0.008515,8.447684e-07,0.155536,0.898797,0.269004,0.580735
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148,-1.340331,-2.178113,-0.734780,-1.783806,-1.609843,-1.823410,-2.154376,-2.150461,-2.371532,-2.285419,...,0.539902,3.181964,2.591596,3.324824,4.641334,4.624484e+00,5.624162,5.223138,6.861315,7.707868
149,-0.919913,-1.091382,-1.041523,-0.010932,-0.243501,-0.326561,-0.408550,-0.854709,-1.122805,-0.948049,...,1.084770,0.000119,0.059293,0.106642,0.166913,7.305279e-01,1.260691,0.898797,0.814804,1.341669
150,0.968523,0.148409,-0.734780,-0.077832,0.120857,0.137979,0.666777,0.551533,-0.290320,-0.098926,...,0.539902,0.006058,0.014606,0.019038,0.444592,3.041890e-01,0.084286,0.009786,0.062182,0.000007
151,1.273154,1.495342,-1.194895,1.795393,1.760468,1.531597,1.742105,2.048177,1.478709,1.068619,...,1.427773,3.223437,3.099249,2.345789,3.034929,4.195030e+00,2.186581,1.141947,0.989610,2.400546


## Features and targets

In [5]:
# Targets: the five `masa_*` columns. Every other column of `data_m` is a feature.
y_columns = ['masa_piel', 'masa_adiposa', 'masa_muscular', 'masa_residual',
       'masa_osea']
x_columns = data_m.columns.drop(y_columns)

# Convert to plain NumPy arrays for scikit-learn. The feature list is derived
# from `data_m` and reused for `data_f`, so both frames must share these columns.
X_m, y_m = data_m[x_columns].values, data_m[y_columns].values
X_f, y_f = data_f[x_columns].values, data_f[y_columns].values


In [6]:
# Bare last expression: shows the feature column names selected above.
x_columns

Index(['edad', 'peso', 'talla', 'per_brazo_rel', 'per_brazo_ten',
       'per_antebrazo', 'per_torax', 'per_cintura', 'per_cadera',
       'per_muslo_max', 'per_muslo_medial', 'per_pantorrilla', 'edad_2',
       'peso_2', 'talla_2', 'per_brazo_rel_2', 'per_brazo_ten_2',
       'per_antebrazo_2', 'per_torax_2', 'per_cintura_2', 'per_cadera_2',
       'per_muslo_max_2', 'per_muslo_medial_2', 'per_pantorrilla_2'],
      dtype='object')

## Split data

In [7]:
# Hold out 30% of each dataset as a test set; the fixed random_state makes the
# split reproducible. Both datasets are split with the same settings.
split_options = dict(test_size=0.3, random_state=5)

x_train_m, x_test_m, y_train_m, y_test_m = train_test_split(X_m, y_m, **split_options)
x_train_f, x_test_f, y_train_f, y_test_f = train_test_split(X_f, y_f, **split_options)

## Model

### Masc

In [8]:
# Hold-out evaluation on the `_m` (Masc) split: fit one Ridge model (alpha=0.1)
# per target column on the training set and score it on the test set.
R2s = []   # test-set R2, one entry per target
MAEs = []  # test-set mean absolute error, one entry per target
for index, target in enumerate(y_columns):
    # Fresh estimator for every target (same pattern as the Fem cell), so no
    # fitted state is carried from one target to the next.
    model = Ridge(alpha=0.1)
    model.fit(x_train_m, y_train_m[:, index])

    y_pred = model.predict(x_test_m)
    R2 = r2_score(y_test_m[:, index], y_pred)
    MAE = mean_absolute_error(y_test_m[:, index], y_pred)
    R2s.append(R2)
    MAEs.append(MAE)

    print(f'{target}  R2 = {round(R2,3)} and MAE = {round(MAE,3)}')
    print('-'*33)
print('='*33)
# Unweighted averages across the target columns.
print(f'R2 prom = {np.mean(R2s)}')
print(f'MAE prom = {np.mean(MAEs)}')

masa_piel  R2 = 1.0 and MAE = 0.01
---------------------------------
masa_adiposa  R2 = 0.787 and MAE = 0.359
---------------------------------
masa_muscular  R2 = 0.947 and MAE = 0.19
---------------------------------
masa_residual  R2 = 0.959 and MAE = 0.152
---------------------------------
masa_osea  R2 = 0.866 and MAE = 0.286
---------------------------------
R2 prom = 0.9118780325720286
MAE prom = 0.19916762999331633


### Fem

In [9]:
# Hold-out evaluation on the `_f` (Fem) split: fit one Ridge model (alpha=0.1)
# per target column on the training set and score it on the test set.
R2s = []   # test-set R2, one entry per target
MAEs = []  # test-set mean absolute error, one entry per target
for index, target in enumerate(y_columns):
    model = Ridge(alpha=0.1)
    model.fit(x_train_f, y_train_f[:, index])

    y_pred = model.predict(x_test_f)
    R2 = r2_score(y_test_f[:, index], y_pred)
    MAE = mean_absolute_error(y_test_f[:, index], y_pred)
    R2s.append(R2)
    MAEs.append(MAE)

    print(f'{target}  R2 = {round(R2,3)} and MAE = {round(MAE,3)}')
    print('-'*33)
print('='*33)
# Unweighted averages across the target columns.
print(f'R2 prom = {np.mean(R2s)}')
print(f'MAE prom = {np.mean(MAEs)}')

masa_piel  R2 = 1.0 and MAE = 0.011
---------------------------------
masa_adiposa  R2 = 0.594 and MAE = 0.375
---------------------------------
masa_muscular  R2 = 0.726 and MAE = 0.392
---------------------------------
masa_residual  R2 = 0.938 and MAE = 0.184
---------------------------------
masa_osea  R2 = 0.859 and MAE = 0.298
---------------------------------
R2 prom = 0.8232855193920485
MAE prom = 0.2521105204908633


## Using CV

### masc

In [10]:
# Cross-validate Ridge(alpha=0.1) on the `_m` training split, one target column
# at a time, and print the per-fold scores with their mean and standard deviation.
# NOTE(review): this cell uses cv=6 while the fem cell uses cv=5 - confirm that
# the different fold counts are intentional.
for target, y_target in zip(y_columns, y_train_m.T):
    model = Ridge(alpha=0.1)
    scores = cross_val_score(model, x_train_m, y_target, cv=6)
    print(target, scores, scores.mean(), scores.std(), sep='\n')

masa_piel
[0.99985351 0.99962682 0.99925724 0.99934804 0.99972548 0.99992294]
0.9996223366100111
0.00024596843602790156
masa_adiposa
[ 0.58010509  0.75394963 -0.28679919  0.71722719  0.85377587  0.75358894]
0.5619745904888488
0.3880651734492353
masa_muscular
[0.86987199 0.90891747 0.87842746 0.86380966 0.94113251 0.91973736]
0.8969827400677683
0.02825230766249197
masa_residual
[0.77134164 0.84335122 0.81722857 0.93827593 0.94399342 0.94937669]
0.8772612453460149
0.06993971900547315
masa_osea
[0.50638928 0.71520195 0.24082488 0.74943776 0.69946657 0.73446557]
0.6076310016153799
0.18302916811076958


### fem

In [11]:
# Cross-validate Ridge(alpha=0.1) on the `_f` training split, one target column
# at a time, and print the per-fold scores with their mean and standard deviation.
# NOTE(review): this cell uses cv=5 while the masc cell uses cv=6 - confirm that
# the different fold counts are intentional.
for target, y_target in zip(y_columns, y_train_f.T):
    model = Ridge(alpha=0.1)
    scores = cross_val_score(model, x_train_f, y_target, cv=5)
    print(target, scores, scores.mean(), scores.std(), sep='\n')

masa_piel
[0.99967571 0.99978811 0.99951162 0.999661   0.99976133]
0.999679554937311
9.698899962338387e-05
masa_adiposa
[0.75578971 0.33867816 0.49819732 0.69668054 0.63997132]
0.58586340921351
0.1502457578491544
masa_muscular
[0.73594356 0.39076151 0.58077597 0.83908218 0.46021069]
0.6013547838911081
0.16692700521018203
masa_residual
[0.83856871 0.76851121 0.88832167 0.92476745 0.77956644]
0.8399470960002808
0.06047379707087269
masa_osea
[0.63504684 0.81327582 0.77566329 0.63202655 0.85828081]
0.7428586610928093
0.0930205758387678


## Using Grid Search CV

In [12]:
# Candidate regularization strengths for Ridge, on a roughly logarithmic grid.
grid_param = dict(alpha=[0.01, 0.03, 0.1, 0.3, 1, 3, 10])

### masc

In [13]:
# Grid-search the Ridge `alpha` on the `_m` training split, separately for each
# target column, using 7-fold CV scored by R2 and all available cores.
model = Ridge()
search_options = dict(param_grid=grid_param, scoring='r2', cv=7, n_jobs=-1)
for target, y_target in zip(y_columns, y_train_m.T):
    gd_sr = GridSearchCV(estimator=model, **search_options)
    gd_sr.fit(x_train_m, y_target)
    best_parameters = gd_sr.best_params_
    # best_score_ is the mean cross-validated R2 of the winning alpha.
    print(target, f'Best R2: {gd_sr.best_score_}', best_parameters, '-'*33, sep='\n')



masa_piel
Best R2: 0.9998061991207355
{'alpha': 0.01}
---------------------------------
masa_adiposa
Best R2: 0.6560832386775266
{'alpha': 0.01}
---------------------------------
masa_muscular
Best R2: 0.8960811366932722
{'alpha': 1}
---------------------------------
masa_residual
Best R2: 0.8849803428809038
{'alpha': 10}
---------------------------------
masa_osea
Best R2: 0.6953534265539779
{'alpha': 10}
---------------------------------


### fem

In [14]:
# Grid-search the Ridge `alpha` on the `_f` training split, separately for each
# target column, using 7-fold CV scored by R2 and all available cores.
model = Ridge()
search_options = dict(param_grid=grid_param, scoring='r2', cv=7, n_jobs=-1)
for target, y_target in zip(y_columns, y_train_f.T):
    gd_sr = GridSearchCV(estimator=model, **search_options)
    gd_sr.fit(x_train_f, y_target)
    best_parameters = gd_sr.best_params_
    # best_score_ is the mean cross-validated R2 of the winning alpha.
    print(target, f'Best R2: {gd_sr.best_score_}', best_parameters, '-'*33, sep='\n')



masa_piel
Best R2: 0.9997842835232278
{'alpha': 0.01}
---------------------------------
masa_adiposa
Best R2: 0.6701436872875889
{'alpha': 10}
---------------------------------
masa_muscular
Best R2: 0.7139120795980965
{'alpha': 3}
---------------------------------
masa_residual
Best R2: 0.8461106892387547
{'alpha': 3}
---------------------------------
masa_osea
Best R2: 0.7891098399572625
{'alpha': 10}
---------------------------------
