# Model selection

## Libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np



from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,r2_score,mean_squared_error,mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, GridSearchCV

from sklearn.linear_model import LinearRegression, Ridge
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor

  from pandas import MultiIndex, Int64Index


## Load data

In [47]:
# Read the processed datasets, one CSV per sex (m / f).
data_m, data_f = (
    pd.read_csv('../../data/processed/data_m_v1.csv'),
    pd.read_csv('../../data/processed/data_f_v1.csv'),
)


### Split by sex

### Normalization

In [48]:
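# With normalization it is harder to analyze the results.
# The dimensions are not very different, so I will leave the data without normalization.
# NOTE(review): the values displayed for data_m below look already standardized
# (e.g. negative edad/peso) — confirm whether scaling was applied upstream.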



In [49]:
# Display the data_m frame; pandas elides the middle rows in the rendered output.
data_m

Unnamed: 0,edad,peso,talla,per_brazo_rel,per_brazo_ten,per_antebrazo,per_torax,per_cintura,per_cadera,per_muslo_max,per_muslo_medial,per_pantorrilla,masa_piel,masa_adiposa,masa_muscular,masa_residual,masa_osea
0,-1.036390,-1.129647,-1.041523,-0.245085,-0.364953,-0.791100,-0.661568,-1.015423,-1.182268,-1.117874,-1.535150,-1.224345,-1.341535,-0.806759,-0.788917,-1.233627,-1.523340
1,-0.425749,-1.106688,0.645564,-1.516202,-2.004565,-1.668563,-2.015216,-1.005378,-0.498441,-1.117874,-1.422207,-0.332794,-0.773592,0.171437,-1.938302,-1.616691,-0.329323
2,0.033609,1.495342,-1.424952,3.133412,2.974995,2.718753,1.299323,1.355101,1.642233,2.023883,2.033875,1.681451,0.703758,2.257721,0.725774,0.978207,0.721124
3,-0.378883,0.576979,-0.581408,0.357024,0.788847,0.757365,0.413759,0.551533,0.675956,0.750198,0.633371,1.483329,0.311860,0.795510,0.215267,0.262463,0.809435
4,0.297577,-0.326079,0.338821,0.055969,0.667394,0.602518,-0.092277,-0.000919,-0.394381,-0.948049,-0.518656,-0.762059,-0.136571,-0.713452,0.076644,0.088228,-0.537953
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148,-1.340331,-2.178113,-0.734780,-1.783806,-1.609843,-1.823410,-2.154376,-2.150461,-2.371532,-2.285419,-2.619411,-2.776305,-2.310445,-1.464542,-2.272092,-2.120709,-1.640011
149,-0.919913,-1.091382,-1.041523,-0.010932,-0.243501,-0.326561,-0.408550,-0.854709,-1.122805,-0.948049,-0.902665,-1.158304,-1.305354,-0.904744,-0.477229,-1.043464,-1.523340
150,0.968523,0.148409,-0.734780,-0.077832,0.120857,0.137979,0.666777,0.551533,-0.290320,-0.098926,0.249362,-0.002590,-0.097984,-0.862296,0.621878,0.564533,-0.221790
151,1.273154,1.495342,-1.194895,1.795393,1.760468,1.531597,1.742105,2.048177,1.478709,1.068619,0.994792,1.549370,0.793497,0.934772,1.099198,1.752846,1.321756


## Features and targets

In [50]:
# Target columns: listed once and reused below to derive the predictors.
y_columns = ['masa_piel', 'masa_adiposa', 'masa_muscular', 'masa_residual',
       'masa_osea']
# Predictor columns: every column of data_m that is not a target.
x_columns = data_m.columns.drop(y_columns)

# Plain NumPy arrays per dataset; the same column selection is applied to
# both frames, so column order matches between the *_m and *_f arrays.
X_m, y_m = data_m[x_columns].values, data_m[y_columns].values
X_f, y_f = data_f[x_columns].values, data_f[y_columns].values


In [51]:
x_columns

Index(['edad', 'peso', 'talla', 'per_brazo_rel', 'per_brazo_ten',
       'per_antebrazo', 'per_torax', 'per_cintura', 'per_cadera',
       'per_muslo_max', 'per_muslo_medial', 'per_pantorrilla'],
      dtype='object')

## Split data

In [52]:
# Create train/test
x_train_m, x_test_m, y_train_m, y_test_m = train_test_split(    
    X_m, y_m, test_size=0.3, random_state=5)

x_train_f, x_test_f, y_train_f, y_test_f = train_test_split(    
    X_f, y_f, test_size=0.3, random_state=5)

## Model

### Masc

In [53]:
# Fit one Ridge(alpha=0.1) model per target on the male training split and
# report R2 and MAE on the held-out male test split.
R2s = []
MAEs = []
for index, target in enumerate(y_columns):
    # Column `index` of the y arrays holds the values for `target`
    # (the arrays were built by selecting `y_columns` in this order).
    y_true = y_test_m[:, index]

    # A fresh estimator per target, as in the other evaluation cells.
    model = Ridge(alpha=0.1)
    model.fit(x_train_m, y_train_m[:, index])
    y_pred = model.predict(x_test_m)

    R2 = r2_score(y_true, y_pred)
    MAE = mean_absolute_error(y_true, y_pred)
    R2s.append(R2)
    MAEs.append(MAE)

    print(f'{target}  R2 = {round(R2,3)} and MAE = {round(MAE,3)}')
    print('-'*33)
print('='*33)
# Summary lines: mean of each metric over all targets.
print(f'R2 prom = {np.mean(R2s)}')
print(f'MAE prom = {np.mean(MAEs)}')

masa_piel  R2 = 0.998 and MAE = 0.029
---------------------------------
masa_adiposa  R2 = 0.823 and MAE = 0.315
---------------------------------
masa_muscular  R2 = 0.95 and MAE = 0.174
---------------------------------
masa_residual  R2 = 0.952 and MAE = 0.168
---------------------------------
masa_osea  R2 = 0.859 and MAE = 0.291
---------------------------------
R2 prom = 0.9161341140336777
MAE prom = 0.19538245904718948


### Fem

In [54]:
# Fit one Ridge(alpha=0.1) model per target on the female training split and
# report R2 and MAE on the held-out female test split.
R2s = []
MAEs = []
for index, target in enumerate(y_columns):
    # Column `index` of the y arrays holds the values for `target`
    # (the arrays were built by selecting `y_columns` in this order).
    y_true = y_test_f[:, index]

    model = Ridge(alpha=0.1)
    model.fit(x_train_f, y_train_f[:, index])
    y_pred = model.predict(x_test_f)

    R2 = r2_score(y_true, y_pred)
    MAE = mean_absolute_error(y_true, y_pred)
    R2s.append(R2)
    MAEs.append(MAE)

    print(f'{target}  R2 = {round(R2,3)} and MAE = {round(MAE,3)}')
    print('-'*33)
print('='*33)
# Summary lines: mean of each metric over all targets.
print(f'R2 prom = {np.mean(R2s)}')
print(f'MAE prom = {np.mean(MAEs)}')

masa_piel  R2 = 0.998 and MAE = 0.034
---------------------------------
masa_adiposa  R2 = 0.674 and MAE = 0.377
---------------------------------
masa_muscular  R2 = 0.792 and MAE = 0.402
---------------------------------
masa_residual  R2 = 0.942 and MAE = 0.194
---------------------------------
masa_osea  R2 = 0.85 and MAE = 0.332
---------------------------------
R2 prom = 0.851045099409754
MAE prom = 0.2678466350769661


## Using CV

### masc

In [55]:
# 6-fold cross-validation on the male training split, one Ridge model per target.
# No `scoring` is passed, so folds are scored with the estimator's default
# scorer (R2 for scikit-learn regressors).
# NOTE(review): the female cell uses cv=5 — confirm the difference is intended.
for index, target in enumerate(y_columns):
    model = Ridge(alpha=0.1)
    scores = cross_val_score(
        estimator=model,
        X=x_train_m,
        y=y_train_m[:, index],
        cv=6,
    )
    # One item per line: target name, per-fold scores, their mean, their std.
    print(target, scores, scores.mean(), scores.std(), sep='\n')

masa_piel
[0.99629997 0.99541645 0.99496763 0.99819209 0.9988079  0.99758744]
0.9968785801210331
0.0014185929058428016
masa_adiposa
[0.60670421 0.56874955 0.10931481 0.83158332 0.78215981 0.65290381]
0.5919025826583684
0.23488008557541998
masa_muscular
[0.88104082 0.89366411 0.89872568 0.89592358 0.89553079 0.90696072]
0.8953076178252289
0.007682770611045094
masa_residual
[0.93964329 0.88431802 0.88863627 0.94990569 0.95415399 0.94850711]
0.9275273970314428
0.029371973912303687
masa_osea
[0.72938788 0.76948504 0.81661105 0.74865065 0.78558853 0.72129712]
0.761836714255037
0.03289663565739481


### fem

In [56]:
# 5-fold cross-validation on the female training split, one Ridge model per target.
# No `scoring` is passed, so folds are scored with the estimator's default
# scorer (R2 for scikit-learn regressors).
# NOTE(review): the male cell uses cv=6 — confirm the difference is intended.
for index, target in enumerate(y_columns):
    model = Ridge(alpha=0.1)
    scores = cross_val_score(
        estimator=model,
        X=x_train_f,
        y=y_train_f[:, index],
        cv=5,
    )
    # One item per line: target name, per-fold scores, their mean, their std.
    print(target, scores, scores.mean(), scores.std(), sep='\n')

masa_piel
[0.99284964 0.99671809 0.99694399 0.99439481 0.99740759]
0.9956628230317633
0.00175048751532094
masa_adiposa
[0.87467629 0.68852123 0.57221961 0.62619348 0.81736273]
0.7157946704705068
0.11396968955079372
masa_muscular
[0.7457049  0.7039414  0.60441273 0.76150894 0.7165702 ]
0.7064276357385525
0.05494132817770922
masa_residual
[0.84633315 0.89796574 0.93861672 0.91514873 0.77266534]
0.8741459386705477
0.059121833917858455
masa_osea
[0.6544064  0.78058401 0.76279998 0.73714541 0.78077478]
0.7431421162460161
0.04714770789460813


## Using Grid Search CV

In [58]:
# Candidate regularization strengths for the Ridge grid search
# (roughly log-spaced from 0.01 to 10).
grid_param = dict(alpha=[0.01, 0.03, 0.1, 0.3, 1, 3, 10])

### masc

In [63]:
# Per-target search over `grid_param` on the male training split:
# 7-fold cross-validation, scored by R2, using all available cores.
model = Ridge()
for index, target in enumerate(y_columns):
    # fit() returns the fitted search object, so it can be chained.
    gd_sr = GridSearchCV(
        model,
        grid_param,
        scoring='r2',
        cv=7,
        n_jobs=-1,
    ).fit(x_train_m, y_train_m[:, index])
    best_parameters = gd_sr.best_params_
    # One item per line: target, best mean CV score, winning params, separator.
    print(target, f'Best R2: {gd_sr.best_score_}', best_parameters, '-'*33, sep='\n')


masa_piel
Best R2: 0.9971838459182357
{'alpha': 0.01}
---------------------------------
masa_adiposa
Best R2: 0.6514242533346094
{'alpha': 1}
---------------------------------
masa_muscular
Best R2: 0.8973014650050797
{'alpha': 1}
---------------------------------
masa_residual
Best R2: 0.9221676330142631
{'alpha': 1}
---------------------------------
masa_osea
Best R2: 0.7564179261542314
{'alpha': 3}
---------------------------------


### fem

In [64]:
model = Ridge()
for index,target in enumerate(y_columns):
    gd_sr = GridSearchCV(estimator=model,
                        param_grid=grid_param,
                        scoring='r2',
                        cv=7,
                        n_jobs=-1)
    gd_sr.fit(x_train_f,y_train_f[:,index])
    best_parameters = gd_sr.best_params_
    print(target)
    print(f'Best R2: {gd_sr.best_score_}')
    print(best_parameters)
    print('-'*33)


masa_piel
Best R2: 0.9957366621008733
{'alpha': 0.03}
---------------------------------
masa_adiposa
Best R2: 0.6461485354926971
{'alpha': 10}
---------------------------------
masa_muscular
Best R2: 0.7076203647939164
{'alpha': 3}
---------------------------------
masa_residual
Best R2: 0.8624139733586291
{'alpha': 3}
---------------------------------
masa_osea
Best R2: 0.7601745842644624
{'alpha': 3}
---------------------------------
