# Obesity Modelado

**Variable objetivo:** Obesity: 
- **0** -> Riesgo nulo o bajo
- **1** -> Riesgo medio
- **2** -> Riesgo alto

Se trata de un problema de clasificación supervisada. 

**Tareas:**

1. Cargar los datos
2. Selección y separación del dataset
3. Transformación de los datos (si necesario)
4. Entrenar y evaluar diferentes modelos de clasificación
5. Escoger el mejor modelo
6. Ajustar y mejorar el modelo escogido
7. Contruir el pipeline definitivo


In [1]:
# Librerías
from pathlib import Path

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import altair as alt # importar altair con el alias alt

# Dividir el dataframe
from sklearn.model_selection import train_test_split

# Transformar columnas
#from sklearn.impute import SimpleImputer
#from sklearn.preprocessing import OneHotEncoder, Binarizer, StandardScaler, RobustScaler
#from sklearn.compose import ColumnTransformer

# Clasificación supervisada
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

# Métricas
from sklearn.metrics import accuracy_score, classification_report

# Armando pipelines
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import FeatureUnion, Pipeline


# Funciones propias
import data_common_functions as dcf
import char_common_functions as ccf

### Cargar los datos

In [2]:
# Leemos el archivo
CURRENT_DIR = Path.cwd()

BASE_DIR = Path(CURRENT_DIR).parent

df_obesity = pd.read_parquet(f"{BASE_DIR}/data/out/obesity_labeled.parquet", engine='fastparquet')

df_obesity

Unnamed: 0,age,age_range,gender,height,weight,waist_circum_preferred,hip_circum,gender_bin,bmi,rcc,...,obesity_cc,obesity_cc_txt,obesity_rcc,obesity_rcc_txt,obesity_ict,obesity_ict_txt,risk_factors,cluster_kmeans,cluster,obesity
1,47.0,46-55,male,187.1980,156.630,138.5062,142.1892,1,44.696399,0.974098,...,1,1-Alto,1,1-Medio,3,3-Obesidad,4,5,1,2
2,50.0,46-55,male,180.5940,98.064,98.3996,109.9058,1,30.067891,0.895309,...,1,1-Alto,0,0-Bajo,2,2-Sobrepeso,3,1,4,2
3,28.0,26-35,male,182.7022,89.211,97.0026,107.0102,1,26.725809,0.906480,...,1,1-Alto,0,0-Bajo,2,2-Sobrepeso,3,1,4,2
4,52.0,46-55,male,185.9026,113.273,105.9942,111.0996,1,32.775964,0.954047,...,1,1-Alto,1,1-Medio,2,2-Sobrepeso,4,1,1,2
5,50.0,46-55,male,183.1086,108.279,104.0892,112.3950,1,32.294365,0.926102,...,1,1-Alto,0,0-Bajo,2,2-Sobrepeso,3,1,4,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4460,35.0,26-35,female,152.8064,49.259,61.5950,93.0910,0,21.096116,0.661664,...,0,0-Bajo,0,0-Bajo,0,0-Delgado,0,2,0,0
4461,40.0,36-45,female,168.6052,67.646,73.8124,100.4062,0,23.795794,0.735138,...,0,0-Bajo,0,0-Bajo,1,1-Normal,0,2,5,0
4462,23.0,17-25,female,168.5036,73.775,78.0034,109.7026,0,25.983093,0.711044,...,0,0-Bajo,0,0-Bajo,1,1-Normal,1,0,2,0
4463,24.0,17-25,male,170.5102,65.149,75.7936,97.7900,1,22.408203,0.775065,...,0,0-Bajo,0,0-Bajo,1,1-Normal,0,0,5,0


## Separación del dataset

1. Selección de columnas
2. Separación de la variable objetivo
3. División del dataset

In [3]:
def select_split_dataset(df, cols_to_leave, var_target, train_proportion, test_proportion):
    
    cols_to_leave.append(var_target)
    
    df = ccf.select_data(df, [], cols_to_leave)
    
    obesity_risk = df[var_target].copy()
    obesity_data = df.drop([var_target], axis=1)

    original_count = len(df)
    training_size = int(original_count * train_proportion)
    test_size = int((1 - train_proportion) * test_proportion * training_size)
    
    train_x, rest_x, train_y, rest_y = train_test_split(obesity_data, 
                                                        obesity_risk, 
                                                        train_size=training_size,
                                                        stratify=obesity_risk)
    test_x, validate_x, test_y, validate_y = train_test_split(rest_x, 
                                                              rest_y, 
                                                              train_size=test_size,
                                                              stratify=rest_y)

    return (train_x, train_y), (validate_x, validate_y), (test_x, test_y)
    

In [4]:
cols_to_leave = ['age',
                 'gender_bin',
                 'height',
                 'weight',
                 'waist_circum_preferred',
                 'hip_circum']

training_data, validate_data, test_data = select_split_dataset(df_obesity, cols_to_leave, 'obesity', 
                                                        train_proportion=0.6, test_proportion=0.5)

train_x=training_data[0]
train_y=training_data[1]
validate_x=validate_data[0]
validate_y=validate_data[1]
test_x=test_data[0]
test_y=test_data[1]



In [5]:
test_x.shape

(531, 6)

## Escalado y transformación de datos

In [6]:
cols_encode = []
cols_scale = ['age', 'height', 'weight', 'waist_circum_preferred', 'hip_circum']
cols_pass = ['gender_bin']


X_train_scaled = dcf.transform_df(train_x, cols_encode, cols_scale, cols_pass)
X_validate_scaled = dcf.transform_df(validate_x, cols_encode, cols_scale, cols_pass)
X_test_scaled = dcf.transform_df(test_x, cols_encode, cols_scale, cols_pass)

[[ 1.877361    0.47463483  0.23859154  0.76498719 -0.16538131  1.        ]
 [-0.91831112 -1.10115381 -1.35058312 -1.1988033  -1.05928572  0.        ]
 [ 1.48799162  0.28046182 -0.36868795 -0.34490011 -1.12693254  1.        ]
 ...
 [-1.39334176  0.300377    0.01696027 -0.37479606  0.00615169  0.        ]
 [-0.06948588 -0.81736249 -1.15322341 -1.396864   -0.48911967  0.        ]
 [-0.61460301 -0.75014876 -1.09193158 -1.34641458 -0.57367819  0.        ]]
[[ 0.48314113 -0.73729861  1.02385353  0.96965873  1.93448883  0.        ]
 [-0.99658722 -0.00350482 -0.09660188 -0.48514829 -0.18286817  1.        ]
 [-1.1523481  -1.08118877 -1.35554055 -1.03588256 -1.16638176  0.        ]
 ...
 [-0.99658722  0.16844026 -0.89099218 -1.39316895 -0.22363557  0.        ]
 [-0.91870678 -0.26989861 -0.78901815 -1.30039292 -0.37396537  0.        ]
 [-0.77073394 -0.37645612 -1.17400159 -1.2076169  -1.85687965  1.        ]]
[[ 0.23916043 -0.31204746 -0.45090894 -0.27167669  0.17393608  0.        ]
 [ 0.15812273

In [7]:
X_test_scaled.shape

(531, 6)

## Inicialización de los algoritmos

In [8]:
log_reg = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000, random_state=42)
dtree = DecisionTreeClassifier(random_state=42)
svm = SVC(kernel='linear', C=1.0, random_state=42)
rf = RandomForestClassifier(n_estimators=100, random_state=42)

## Evaluación de diferentes modelos

1. Entrenar y evaluar los diferentes modelos
2. Comparar resultados y escoger el mejor

### Entrenamiento

In [9]:
log_reg.fit(X_train_scaled, train_y)
dtree.fit(X_train_scaled, train_y)
svm.fit(X_train_scaled, train_y)
rf.fit(X_train_scaled, train_y)

### Validaciones

In [10]:
y_pred_log_reg = log_reg.predict(X_test_scaled)
y_pred_dtree = dtree.predict(X_test_scaled)
y_pred_svm = svm.predict(X_test_scaled)
y_pred_rf = rf.predict(X_test_scaled)

## Evaluación del rendimiento

In [11]:
# Calcular la precisión de cada clasificador
accuracy_log_reg = accuracy_score(test_y, y_pred_log_reg)
accuracy_dtree = accuracy_score(test_y, y_pred_dtree)
accuracy_svm = accuracy_score(test_y, y_pred_svm)
accuracy_rf = accuracy_score(test_y, y_pred_rf)

# Imprimir los resultados
print("Exactitud de Regresión Logística Multinomial:", accuracy_log_reg)
print("Exactitud de Árboles de Decisión:", accuracy_dtree)
print("Exactitud de SVM:", accuracy_svm)
print("Exactitud de Random Forest:", accuracy_rf)

# Informe detallado de la clasificación
print("\nInforme de clasificación para Regresión Logística Multinomial:\n", classification_report(test_y, y_pred_log_reg))
print("Informe de clasificación para Árboles de Decisión:\n", classification_report(test_y, y_pred_dtree))
print("Informe de clasificación para SVM:\n", classification_report(test_y, y_pred_svm))
print("Informe de clasificación para Random Forest:\n", classification_report(test_y, y_pred_rf))


Exactitud de Regresión Logística Multinomial: 0.943502824858757
Exactitud de Árboles de Decisión: 0.935969868173258
Exactitud de SVM: 0.9510357815442562
Exactitud de Random Forest: 0.9416195856873822

Informe de clasificación para Regresión Logística Multinomial:
               precision    recall  f1-score   support

           0       0.96      0.99      0.98       319
           1       0.84      0.41      0.55        39
           2       0.92      0.98      0.95       173

    accuracy                           0.94       531
   macro avg       0.91      0.79      0.83       531
weighted avg       0.94      0.94      0.94       531

Informe de clasificación para Árboles de Decisión:
               precision    recall  f1-score   support

           0       0.97      0.97      0.97       319
           1       0.59      0.59      0.59        39
           2       0.95      0.95      0.95       173

    accuracy                           0.94       531
   macro avg       0.84      0

In [12]:
def test_models(df, params):
    
    var_target = params['var_target']
    cols_leave = params['cols_leave']
    cols_encode = params['cols_encode']
    cols_scale = params['cols_scale']
    cols_pass = params['cols_pass']
    train_proportion = params['train_proportion']
    test_proportion = params['test_proportion']
    
    # Divide el dataframe
    training_data, validate_data, test_data = select_split_dataset(df, cols_leave, var_target, 
                                                                   train_proportion=train_proportion, 
                                                                   test_proportion=test_proportion)

    train_x=training_data[0]
    train_y=training_data[1]
    validate_x=validate_data[0]
    validate_y=validate_data[1]
    test_x=test_data[0]
    test_y=test_data[1]
    
    # Transforma los datos 
    X_train_scaled = dcf.transform_df(train_x, cols_encode, cols_scale, cols_pass)
    X_validate_scaled = dcf.transform_df(validate_x, cols_encode, cols_scale, cols_pass)
    X_test_scaled = dcf.transform_df(test_x, cols_encode, cols_scale, cols_pass)
    
    # Crea los modelos
    log_reg = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000, random_state=42)
    dtree = DecisionTreeClassifier(random_state=42)
    svm = SVC(kernel='linear', C=1.0, random_state=42)
    rf = RandomForestClassifier(n_estimators=100, random_state=42)
    
    # Entrena los modelos
    log_reg.fit(X_train_scaled, train_y)
    dtree.fit(X_train_scaled, train_y)
    svm.fit(X_train_scaled, train_y)
    rf.fit(X_train_scaled, train_y)
    
    # Prueba los modelos entrenados
    y_pred_log_reg = log_reg.predict(X_test_scaled)
    y_pred_dtree = dtree.predict(X_test_scaled)
    y_pred_svm = svm.predict(X_test_scaled)
    y_pred_rf = rf.predict(X_test_scaled)
    
    # Muestra y compara las métricas
    
    # Calcular la precisión de cada clasificador
    accuracy_log_reg = accuracy_score(test_y, y_pred_log_reg)
    accuracy_dtree = accuracy_score(test_y, y_pred_dtree)
    accuracy_svm = accuracy_score(test_y, y_pred_svm)
    accuracy_rf = accuracy_score(test_y, y_pred_rf)

    # Imprimir los resultados
    print("Exactitud de Regresión Logística Multinomial:", accuracy_log_reg)
    print("Exactitud de Árboles de Decisión:", accuracy_dtree)
    print("Exactitud de SVM:", accuracy_svm)
    print("Exactitud de Random Forest:", accuracy_rf)

    # Informe detallado de la clasificación
    print("\nInforme de clasificación para Regresión Logística Multinomial:\n", classification_report(test_y, y_pred_log_reg))
    print("Informe de clasificación para Árboles de Decisión:\n", classification_report(test_y, y_pred_dtree))
    print("Informe de clasificación para SVM:\n", classification_report(test_y, y_pred_svm))
    print("Informe de clasificación para Random Forest:\n", classification_report(test_y, y_pred_rf))


In [13]:
def test_dtree(df, params):
    
    var_target = params['var_target']
    cols_leave = params['cols_leave']
    cols_encode = params['cols_encode']
    cols_scale = params['cols_scale']
    cols_pass = params['cols_pass']
    train_proportion = params['train_proportion']
    test_proportion = params['test_proportion']
    
    # Divide el dataframe
    training_data, validate_data, test_data = select_split_dataset(df, cols_leave, var_target, 
                                                                   train_proportion=train_proportion, 
                                                                   test_proportion=test_proportion)

    train_x=training_data[0]
    train_y=training_data[1]
    validate_x=validate_data[0]
    validate_y=validate_data[1]
    test_x=test_data[0]
    test_y=test_data[1]
    
    # Transforma los datos 
    X_train_scaled = dcf.transform_df(train_x, cols_encode, cols_scale, cols_pass)
    X_validate_scaled = dcf.transform_df(validate_x, cols_encode, cols_scale, cols_pass)
    X_test_scaled = dcf.transform_df(test_x, cols_encode, cols_scale, cols_pass)
    
    # Crea los modelos
    dtree = DecisionTreeClassifier(random_state=42)
    
    # Entrena los modelos
    dtree.fit(X_train_scaled, train_y)
    
    # Prueba con los datos de test
    print("\nPrueba con test:\n")
    
    # Prueba los modelos entrenados
    y_pred_dtree = dtree.predict(X_test_scaled)
    
    # Muestra y compara las métricas
    
    # Calcular la precisión de cada clasificador
    accuracy_dtree = accuracy_score(test_y, y_pred_dtree)

    # Imprimir los resultados
    print("Exactitud de Árboles de Decisión:", accuracy_dtree)


    # Informe detallado de la clasificación
    print("\nInforme de clasificación para Árboles de Decisión:\n", classification_report(test_y, y_pred_dtree))

    
    # Prueba con los datos de test
    print("\nPrueba con validate:\n")
    
    # Prueba los modelos entrenados
    y_pred_validate_dtree = dtree.predict(X_validate_scaled)
    
    # Muestra y compara las métricas
    
    # Calcular la precisión de cada clasificador
    accuracy_validate_dtree = accuracy_score(validate_y, y_pred_validate_dtree)

    # Imprimir los resultados
    print("Exactitud de Árboles de Decisión:", accuracy_validate_dtree)


    # Informe detallado de la clasificación
    print("\nInforme de clasificación para Árboles de Decisión:\n", classification_report(validate_y, y_pred_validate_dtree))


In [14]:
def test_rf(df, params):
    
    var_target = params['var_target']
    cols_leave = params['cols_leave']
    cols_encode = params['cols_encode']
    cols_scale = params['cols_scale']
    cols_pass = params['cols_pass']
    train_proportion = params['train_proportion']
    test_proportion = params['test_proportion']
    
    # Divide el dataframe
    training_data, validate_data, test_data = select_split_dataset(df, cols_leave, var_target, 
                                                                   train_proportion=train_proportion, 
                                                                   test_proportion=test_proportion)

    train_x=training_data[0]
    train_y=training_data[1]
    validate_x=validate_data[0]
    validate_y=validate_data[1]
    test_x=test_data[0]
    test_y=test_data[1]
    
    # Transforma los datos 
    X_train_scaled = dcf.transform_df(train_x, cols_encode, cols_scale, cols_pass)
    X_validate_scaled = dcf.transform_df(validate_x, cols_encode, cols_scale, cols_pass)
    X_test_scaled = dcf.transform_df(test_x, cols_encode, cols_scale, cols_pass)
    
    # Crea los modelos
    rf = RandomForestClassifier(n_estimators=100, random_state=42)
    
    # Entrena los modelos
    rf.fit(X_train_scaled, train_y)
    
    # Prueba con los datos de test
    print("\nPrueba con test:\n")
    
    # Prueba los modelos entrenados
    y_pred_rf = rf.predict(X_test_scaled)
    
    # Muestra y compara las métricas
    
    # Calcular la precisión de cada clasificador
    accuracy_rf = accuracy_score(test_y, y_pred_rf)

    # Imprimir los resultados
    print("Exactitud de Random Forest:", accuracy_rf)


    # Informe detallado de la clasificación
    print("\nInforme de clasificación para Random Forest:\n", classification_report(test_y, y_pred_rf))

    
    # Prueba con los datos de test
    print("\nPrueba con validate:\n")
    
    # Prueba los modelos entrenados
    y_pred_validate_rf = rf.predict(X_validate_scaled)
    
    # Muestra y compara las métricas
    
    # Calcular la precisión de cada clasificador
    accuracy_validate_rf = accuracy_score(validate_y, y_pred_validate_rf)

    # Imprimir los resultados
    print("Exactitud de Random Forest:", accuracy_validate_rf)


    # Informe detallado de la clasificación
    print("\nInforme de clasificación para Random Forest:\n", classification_report(validate_y, y_pred_validate_rf))


## Pruebas con diferentes columnas

In [15]:
params = {
    'var_target': 'obesity',
    'cols_leave': ['age', 'gender_bin', 'height', 'weight', 'waist_circum_preferred', 'hip_circum'],
    'cols_encode': [],
    'cols_scale': ['age', 'height', 'weight', 'waist_circum_preferred', 'hip_circum'],
    'cols_pass': ['gender_bin'],
    'train_proportion': 0.6,
    'test_proportion': 0.5
}

test_models(df_obesity, params)

[[ 1.26146167  0.2970269  -0.35429816 -0.36906137 -0.78199774  1.        ]
 [-1.55880914  2.35019414 -0.17036094 -0.25842487 -0.7335974   1.        ]
 [-1.01042315  0.4237963   0.27025564  0.02097918 -0.08745292  1.        ]
 ...
 [-1.16710486  1.96988592  1.88160443  1.62427091  1.29437665  1.        ]
 [ 0.94809825  1.69646171  0.75299898  1.22485437  0.1037284   1.        ]
 [-0.93208229  0.14042939 -0.35429816 -1.16601925 -0.87637839  1.        ]]
[[-0.861474    0.83830792 -0.23833526  0.12718332 -0.72545514  1.        ]
 [-0.81470071 -0.59153984 -1.02316316 -1.63782136 -0.48543849  0.        ]
 [ 1.13418626 -0.57253854 -1.36449869 -1.67403635 -0.99326319  0.        ]
 ...
 [ 0.58849791 -0.47040656  0.75617017  1.37183564  0.8763402   0.        ]
 [ 0.04280956 -0.06900411 -0.37803902 -0.32455114 -0.72545514  0.        ]
 [-0.34696784 -0.1711361  -0.60868431 -0.87158823 -0.31616358  0.        ]]
[[-0.83225763  1.62369389 -0.35190682 -0.81046057 -0.72904864  1.        ]
 [-1.16112614

In [16]:
params = {
    'var_target': 'obesity',
    'cols_leave': ['age', 'gender_bin', 'waist_circum_preferred', 'bmi', 'rcc', 'ict'],
    'cols_encode': [],
    'cols_scale': ['age', 'waist_circum_preferred', 'bmi', 'rcc', 'ict'],
    'cols_pass': ['gender_bin'],
    'train_proportion': 0.6,
    'test_proportion': 0.5
}

test_models(df_obesity, params)

[[-1.07360631 -1.49460371 -1.46360403 -1.47360876 -1.73373212  0.        ]
 [-0.29338879  3.12346534  4.03623679  0.37532201  3.52845993  0.        ]
 [ 1.65715501  2.36333652  1.82635328  2.684791    2.28497753  1.        ]
 ...
 [ 0.64287223  0.70938789  0.68449211  1.08718422  0.54965512  1.        ]
 [-1.19844112 -0.44226457 -0.49867819 -0.96051807 -0.49789535  0.        ]
 [-0.05932354 -1.00567663 -0.05298486 -1.79192858 -0.88843238  0.        ]]
[[-0.93776266 -1.76235888 -1.64953943 -1.32293988 -1.6296156   0.        ]
 [-1.44582661 -0.74850798 -0.79548074 -0.49166039 -0.50762692  0.        ]
 [-0.46878056 -0.55806085 -0.68655216  0.05172457 -0.62951506  1.        ]
 ...
 [ 1.2508205   3.34610531  3.10534479  1.42214085  3.75270252  0.        ]
 [ 0.15652892  0.75079638 -0.02114728  0.73602404 -0.04936217  1.        ]
 [-0.6563734  -0.43296323  0.28387097 -1.38176368 -0.20066433  0.        ]]
[[-0.17793938 -0.67459649  0.68223814 -0.98310812 -0.52952247  1.        ]
 [ 1.88801896

In [17]:
params = {
    'var_target': 'obesity',
    'cols_leave': ['age', 'gender_bin', 'waist_circum_preferred', 'bmi', 'rcc', 'ict'],
    'cols_encode': [],
    'cols_scale': ['age', 'waist_circum_preferred'],
    'cols_pass': ['gender_bin', 'bmi', 'rcc', 'ict'],
    'train_proportion': 0.6,
    'test_proportion': 0.5
}

test_models(df_obesity, params)

[[-0.13520942 -0.3588124   0.         23.06561419  0.78144381  0.46923879]
 [ 1.04315894  0.78320088  0.         34.94410359  0.7766508   0.5765212 ]
 [ 1.31811156  0.37154493  1.         26.51310579  0.87760675  0.53576841]
 ...
 [-1.156462    0.66368786  1.         29.03934798  0.84941285  0.53056235]
 [-0.21376731  0.14769515  1.         24.17737482  0.90396825  0.5135257 ]
 [-1.29001042 -0.28482815  1.         22.90143946  0.80206237  0.47370766]]
[[-4.35238114e-01 -9.15946769e-02  0.00000000e+00  2.65365381e+01
   7.77381801e-01  4.82382427e-01]
 [-1.05978377e+00 -9.74456932e-01  0.00000000e+00  2.06831596e+01
   7.80399778e-01  4.44567452e-01]
 [ 8.91921395e-01 -8.92107524e-01  0.00000000e+00  2.40254798e+01
   6.74385633e-01  4.17556693e-01]
 ...
 [ 2.06294449e+00  3.29727874e-01  1.00000000e+00  2.64035568e+01
   8.63715063e-01  5.07705728e-01]
 [-1.93226674e-01 -5.66540098e-01  1.00000000e+00  2.24007841e+01
   8.82919708e-01  4.68328945e-01]
 [-1.21592018e+00 -1.69056792e-02 

In [18]:
params = {
    'var_target': 'obesity',
    'cols_leave': ['age', 'gender_bin', 'obesity_cc', 'obesity_bmi', 'obesity_rcc', 'obesity_ict'],
    'cols_encode': [],
    'cols_scale': [],
    'cols_pass': ['age', 'gender_bin', 'obesity_cc', 'obesity_bmi', 'obesity_rcc', 'obesity_ict'],
    'train_proportion': 0.6,
    'test_proportion': 0.5
}

test_models(df_obesity, params)

[[26.   1.   0.   2.   0.   1. ]
 [32.   1.   1.   2.   0.   1. ]
 [45.   0.   0.   1.   0.   1. ]
 ...
 [30.   0.   0.   0.   0.   0. ]
 [37.   1.   0.   2.   0.   1. ]
 [19.6  1.   0.   1.   0.   1. ]]
[[54.   0.   0.   1.   0.   1. ]
 [22.   0.   0.   1.   0.   2. ]
 [19.   0.   0.   0.   0.   0. ]
 ...
 [36.9  1.   0.   1.   0.   1. ]
 [41.   0.   1.   2.   2.   3. ]
 [44.4  0.   0.   1.   0.   1. ]]
[[33.  0.  1.  1.  0.  2.]
 [59.  0.  1.  3.  2.  3.]
 [27.  1.  0.  1.  0.  1.]
 ...
 [30.  0.  1.  3.  1.  3.]
 [40.  1.  0.  1.  0.  1.]
 [62.  1.  1.  3.  1.  2.]]
Exactitud de Regresión Logística Multinomial: 0.9830508474576272
Exactitud de Árboles de Decisión: 0.9943502824858758
Exactitud de SVM: 0.967984934086629
Exactitud de Random Forest: 0.9943502824858758

Informe de clasificación para Regresión Logística Multinomial:
               precision    recall  f1-score   support

           0       1.00      0.99      1.00       319
           1       0.92      0.85      0.88      

In [19]:
params = {
    'var_target': 'obesity',
    'cols_leave': ['age', 'gender_bin', 'obesity_cc', 'obesity_bmi', 'obesity_rcc', 'obesity_ict',
                   'height', 'weight', 'waist_circum_preferred', 'hip_circum'],
    'cols_encode': [],
    'cols_scale': ['height', 'weight', 'waist_circum_preferred', 'hip_circum'],
    'cols_pass': ['age', 'gender_bin', 'obesity_cc', 'obesity_bmi', 'obesity_rcc', 'obesity_ict'],
    'train_proportion': 0.6,
    'test_proportion': 0.5
}

test_dtree(df_obesity, params)

[[ 0.46113514  0.48773778  0.62323284 ...  2.          0.
   2.        ]
 [ 2.37195704  1.32138211  1.01065551 ...  2.          0.
   1.        ]
 [-0.7648131  -1.1116677  -0.67767173 ...  1.          1.
   1.        ]
 ...
 [-0.53570945 -0.22894238  0.31557366 ...  2.          0.
   2.        ]
 [-0.25786035 -0.87969286 -0.86568567 ...  1.          0.
   1.        ]
 [ 1.52622337  0.90285066  0.03830057 ...  2.          0.
   1.        ]]
[[-0.76994712 -0.20175296  0.0305571  ...  2.          1.
   2.        ]
 [-0.70076014 -1.07934834 -1.37631267 ...  1.          0.
   0.        ]
 [ 0.57178619 -0.47622409  0.15297809 ...  1.          0.
   1.        ]
 ...
 [ 0.45565089 -0.42026149 -1.06345903 ...  1.          0.
   0.        ]
 [ 0.3592833  -0.21447173 -0.5504568  ...  1.          0.
   1.        ]
 [-0.71064399  0.74630443  1.2100736  ...  3.          1.
   3.        ]]
[[-0.73899813 -0.74627035 -0.6639736  ...  1.          0.
   1.        ]
 [ 1.36238663 -0.08670317 -0.6493807  .

In [20]:
params = {
    'var_target': 'obesity',
    'cols_leave': ['age', 'gender_bin', 'height', 'weight', 'waist_circum_preferred', 'hip_circum'],
    'cols_encode': [],
    'cols_scale': ['age', 'height', 'weight', 'waist_circum_preferred', 'hip_circum'],
    'cols_pass': ['gender_bin'],
    'train_proportion': 0.6,
    'test_proportion': 0.5
}

test_dtree(df_obesity, params)

[[-0.68244118  0.10526029 -0.40536881 -1.01953751 -0.10490371  0.        ]
 [ 1.42190249 -0.71440101  1.07680007  1.46494207  0.46285515  1.        ]
 [-0.21480925  0.60300023  2.12101639  2.3478563   1.54630764  0.        ]
 ...
 [ 1.11014787 -1.01651182 -0.36930212 -0.45954963  0.52979615  0.        ]
 [ 0.4944325   1.28398771  1.2985979   0.72202481  0.50004459  1.        ]
 [-0.60450252  0.85558468 -1.03175138 -0.67421165 -1.23546283  1.        ]]
[[ 0.10372676  0.97232556  0.69154148  0.50937629 -0.01863961  1.        ]
 [ 0.02546026 -1.73924464  0.94002772  2.07925372  1.12171162  0.        ]
 [ 1.7473232  -0.76666893  0.57778887  0.972161    1.86432446  0.        ]
 ...
 [-0.75720471 -0.06816068 -1.21697441 -1.47940108 -0.97019098  0.        ]
 [ 0.18199326 -0.77637043 -0.54709797 -0.84307211 -0.16716218  0.        ]
 [-0.96852425 -0.87096009 -0.90478671 -0.60769024 -0.55231392  0.        ]]
[[ 0.95364503  1.57733709  0.71711003  1.2227311   0.0902318   1.        ]
 [ 0.71494158

In [21]:
params = {
    'var_target': 'obesity',
    'cols_leave': ['age', 'gender_bin', 'obesity_cc', 'obesity_bmi', 'obesity_rcc', 'obesity_ict'],
    'cols_encode': [],
    'cols_scale': [],
    'cols_pass': ['gender_bin', 'age', 'obesity_cc', 'obesity_bmi', 'obesity_rcc', 'obesity_ict'],
    'train_proportion': 0.6,
    'test_proportion': 0.5
}

test_dtree(df_obesity, params)

[[ 0.  60.   0.   2.   0.   1. ]
 [ 1.  44.7  0.   2.   0.   2. ]
 [ 1.  25.   0.   1.   0.   0. ]
 ...
 [ 0.  37.   0.   1.   0.   0. ]
 [ 0.  31.   0.   2.   0.   1. ]
 [ 0.  21.5  0.   1.   0.   1. ]]
[[ 1. 34.  0.  2.  0.  1.]
 [ 1. 33.  0.  1.  0.  1.]
 [ 1. 65.  1.  3.  1.  2.]
 ...
 [ 0. 34.  0.  1.  0.  1.]
 [ 1. 50.  1.  1.  1.  1.]
 [ 0. 62.  0.  1.  0.  1.]]
[[ 0.  54.7  1.   3.   2.   3. ]
 [ 1.  48.   0.   2.   0.   1. ]
 [ 1.  46.7  0.   2.   0.   1. ]
 ...
 [ 1.  59.6  1.   2.   1.   2. ]
 [ 1.  41.   0.   1.   0.   1. ]
 [ 1.  18.   0.   1.   0.   1. ]]

Prueba con test:

Exactitud de Árboles de Decisión: 0.9981167608286252

Informe de clasificación para Árboles de Decisión:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       319
           1       1.00      0.97      0.99        39
           2       0.99      1.00      1.00       173

    accuracy                           1.00       531
   macro avg       1.00    

In [22]:
params = {
    'var_target': 'obesity',
    'cols_leave': ['age', 'gender_bin', 'obesity_cc', 'obesity_bmi', 'obesity_rcc', 'obesity_ict'],
    'cols_encode': [],
    'cols_scale': [],
    'cols_pass': ['age', 'gender_bin', 'obesity_cc', 'obesity_bmi', 'obesity_rcc', 'obesity_ict'],
    'train_proportion': 0.6,
    'test_proportion': 0.5
}

test_rf(df_obesity, params)

[[42.  0.  0.  1.  0.  1.]
 [20.  1.  1.  2.  0.  1.]
 [21.  0.  0.  1.  0.  1.]
 ...
 [30.  1.  0.  3.  0.  2.]
 [41.  1.  1.  2.  0.  2.]
 [37.  1.  0.  1.  0.  1.]]
[[26.4  1.   0.   1.   0.   1. ]
 [44.2  1.   1.   2.   1.   2. ]
 [25.   0.   0.   1.   0.   0. ]
 ...
 [18.   0.   0.   1.   0.   1. ]
 [24.   0.   0.   1.   0.   0. ]
 [37.   1.   0.   2.   0.   1. ]]
[[50.  1.  0.  3.  0.  2.]
 [36.  0.  1.  1.  1.  2.]
 [22.  0.  0.  1.  0.  1.]
 ...
 [33.  0.  0.  1.  0.  1.]
 [30.  0.  1.  2.  1.  2.]
 [18.  1.  0.  1.  0.  1.]]

Prueba con test:

Exactitud de Random Forest: 0.9981167608286252

Informe de clasificación para Random Forest:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       319
           1       1.00      0.97      0.99        39
           2       0.99      1.00      1.00       173

    accuracy                           1.00       531
   macro avg       1.00      0.99      0.99       531
weighted avg       1.

### Conclusión

Usaré los árboles de decisión.

Ahora debo construir el pipeline para poder exportar el modelo:

A partir de los datos: edad, género, peso, altura, circunferencia de cintura y de cadera:

1. Calcular los indicadores
2. Seleccionar los datos que necesitamos
3. Transformar los datos
4. Hacer la predicción
5. Mostrar los resultados


### Todavía no tengo muy claro si se guarda todo el proceso, incluso el entrenamiento del modelo. Eso debo revisarlo

Creo que haré un nuevo notebook donde construiré el pipeline final. Este quedaría para el estudio de cómo seleccioné el modelo. Ver los notebooks de Antonio para hacer el último notebook


## Main


In [23]:
def save_model():
    
    from joblib import dump
    import pickle
    
    CURRENT_DIR = Path.cwd()

    BASE_DIR = Path(CURRENT_DIR).parent

    df_obesity = pd.read_parquet(f"{BASE_DIR}/data/out/obesity_labeled.parquet", engine='fastparquet')
    
    cols_to_leave = ['age', 'gender_bin', 'obesity_cc', 'obesity_bmi', 'obesity_rcc', 'obesity_ict', 'obesity']
      
    df_train = ccf.select_data(df_obesity, [], cols_to_leave)
    
    Y_train = df_train['obesity'].copy()
    obesity_data = df_train.drop(['obesity'], axis=1)
    
    cols_encode = []
    cols_scale = []
    cols_pass = ['age', 'gender_bin', 'obesity_cc', 'obesity_bmi', 'obesity_rcc', 'obesity_ict']
    
    # Transformo los datos 
    X_train_scaled = dcf.transform_df(obesity_data, cols_encode, cols_scale, cols_pass)

    # Creo el modelo
    model = DecisionTreeClassifier(random_state=42)
    
    # Entreno los modelos
    model.fit(X_train_scaled, Y_train)
    
    dump(model, 'dtree_obesity.joblib')

    # save the model to a file
    with open(f"{BASE_DIR}/models/dtree_obesity.pkl", "wb") as f:
        pickle.dump(model, f)
    

In [24]:
save_model()

[[47.  1.  1.  3.  1.  3.]
 [50.  1.  1.  3.  0.  2.]
 [28.  1.  1.  2.  0.  2.]
 ...
 [23.  0.  0.  2.  0.  1.]
 [24.  1.  0.  1.  0.  1.]
 [22.  0.  0.  1.  0.  0.]]


### Pipeline para predicción para una persona

**Entrada:** Un JSON con la siguiente información para una persona: Edad, género, peso, estatura, contorno de cintura y contorno de cadera

**Salida:** Indicadores calculados y variable **obesity**: 

**Pasos:**

1. Calcular las nuevas variables
2. Seleccionar sólo las columnas que nos interesan
3. Hacer la predicción con el modelo ya entrenado
4. Devolver el resultado

In [25]:
person = [{
    'age': 49,
    'gender': 'female',
    'height': 154.01,
    'weight': 51.20,
    'waist_circum_preferred': 85.0,
    'hip_circum': 96
}]

json_object = json.dumps(person) 
df_person = pd.read_json(json_object, orient='records')
df_person

Unnamed: 0,age,gender,height,weight,waist_circum_preferred,hip_circum
0,49,female,154.01,51.2,85,96


In [26]:
from custom_functions import custom_calculus
import pickle

CURRENT_DIR = Path.cwd()
BASE_DIR = Path(CURRENT_DIR).parent

df_person_transformed = custom_calculus(df_person)

cols_to_leave = ['age', 'gender_bin', 'obesity_cc', 'obesity_bmi', 'obesity_rcc', 'obesity_ict']
      
df_person_transformed = ccf.select_data(df_person_transformed, [], cols_to_leave)

with open(f"{BASE_DIR}/models/dtree_obesity.pkl", "rb") as f:
    trained_model = pickle.load(f)

df_person_transformed['obesity'] = trained_model.predict(df_person_transformed) 

df_person_transformed.columns = ['age', 'gender', 'cc', 'bmi', 'rcc', 'ict', 'obesity']

df_person_transformed

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4430 entries, 1 to 4464
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   age                     4430 non-null   float64
 1   age_range               4430 non-null   object 
 2   gender                  4430 non-null   object 
 3   height                  4430 non-null   float64
 4   weight                  4430 non-null   float64
 5   waist_circum_preferred  4430 non-null   float64
 6   hip_circum              4430 non-null   float64
 7   gender_bin              4430 non-null   int64  
 8   bmi                     4430 non-null   float64
 9   rcc                     4430 non-null   float64
 10  ict                     4430 non-null   float64
 11  obesity_bmi             4430 non-null   int64  
 12  obesity_bmi_txt         4430 non-null   object 
 13  obesity_cc              4430 non-null   int64  
 14  obesity_cc_txt          4430 non-null   



Unnamed: 0,age,gender,cc,bmi,rcc,ict,obesity
0,49,0,1,1,2,2,2


In [27]:
js = df_person_transformed.to_json(orient = 'records')

print(js)

[{"age":49,"gender":0,"cc":1,"bmi":1,"rcc":2,"ict":2,"obesity":2}]


In [28]:
diccionario = {'age': 0, 'gender': 'female', 'weight': 0.0, 'height': 0.0, 'waist_circum_preferred': 0.0, 'hip_circum': 0.0}

df_dic = pd.DataFrame.from_dict([diccionario])

df_dic

Unnamed: 0,age,gender,weight,height,waist_circum_preferred,hip_circum
0,0,female,0.0,0.0,0.0,0.0
