In [1]:
import pandas as pd
import numpy as np
import sklearn
import matplotlib
print("OK ✅", pd.__version__, sklearn.__version__)

OK ✅ 2.3.3 1.6.1


# Laboratorio 1 — AlpesHearth

## 1. Carga de datos

In [4]:
import pandas as pd

TRAIN_PATH = "Datos Lab 1.csv"
TEST_PATH  = "Datos Test Lab 1.csv"       
TARGET_COL = "CVD Risk Score"    

df = pd.read_csv(TRAIN_PATH)
df_test = pd.read_csv(TEST_PATH, sep=";")

print("Train:", df.shape)
print("Test:", df_test.shape)
df.head()
df.columns


Train: (1639, 24)
Test: (194, 22)


Index(['Patient ID', 'Date of Service', 'Sex', 'Age', 'Weight (kg)',
       'Height (m)', 'BMI', 'Abdominal Circumference (cm)',
       'Blood Pressure (mmHg)', 'Total Cholesterol (mg/dL)', 'HDL (mg/dL)',
       'Fasting Blood Sugar (mg/dL)', 'Smoking Status', 'Diabetes Status',
       'Physical Activity Level', 'Family History of CVD', 'Height (cm)',
       'Waist-to-Height Ratio', 'Systolic BP', 'Diastolic BP',
       'Blood Pressure Category', 'Estimated LDL (mg/dL)', 'CVD Risk Score',
       'CVD Risk Level'],
      dtype='object')

## 2. Exploración y perfilamiento

In [9]:
# Información general
df.info()

# Conteo de valores nulos
df.isna().sum().sort_values(ascending=False)

cols_to_drop = [
    "Patient ID",
    "Date of Service",
    "Blood Pressure (mmHg)",
    "CVD Risk Level"
]

df = df.drop(columns=cols_to_drop, errors="ignore")
df_test = df_test.drop(columns=cols_to_drop, errors="ignore")

df.columns

df.isna().sum().sort_values(ascending=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1639 entries, 0 to 1638
Data columns (total 20 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Sex                           1639 non-null   object 
 1   Age                           1571 non-null   float64
 2   Weight (kg)                   1566 non-null   float64
 3   Height (m)                    1578 non-null   float64
 4   BMI                           1586 non-null   float64
 5   Abdominal Circumference (cm)  1578 non-null   float64
 6   Total Cholesterol (mg/dL)     1571 non-null   float64
 7   HDL (mg/dL)                   1557 non-null   float64
 8   Fasting Blood Sugar (mg/dL)   1585 non-null   float64
 9   Smoking Status                1639 non-null   object 
 10  Diabetes Status               1639 non-null   object 
 11  Physical Activity Level       1639 non-null   object 
 12  Family History of CVD         1639 non-null   object 
 13  Hei

Diastolic BP                    85
HDL (mg/dL)                     82
Waist-to-Height Ratio           76
Weight (kg)                     73
Total Cholesterol (mg/dL)       68
Age                             68
Height (cm)                     68
Systolic BP                     61
Height (m)                      61
Abdominal Circumference (cm)    61
Estimated LDL (mg/dL)           57
Fasting Blood Sugar (mg/dL)     54
BMI                             53
CVD Risk Score                  29
Blood Pressure Category          0
Sex                              0
Family History of CVD            0
Physical Activity Level          0
Smoking Status                   0
Diabetes Status                  0
dtype: int64

## 3. Preparación de datos

In [18]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# 1) Eliminar filas donde el target esté vacío (NO se imputa el target)
print("Filas antes:", df.shape)
df = df.dropna(subset=[TARGET_COL]).copy()
print("Filas después:", df.shape)

# 2) Separar X/y
X = df.drop(columns=[TARGET_COL])
y = df[TARGET_COL]

# 3) Split obligatorio
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

print("X_train, X_test:", X_train.shape, X_test.shape)
print("Nulos en y_train:", y_train.isna().sum(), "| Nulos en y_test:", y_test.isna().sum())

num_cols = X_train.select_dtypes(include=["number"]).columns.tolist()
cat_cols = X_train.select_dtypes(include=["object"]).columns.tolist()

print("Numéricas:", num_cols)
print("Categóricas:", cat_cols)

numeric_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

categorical_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_pipeline, num_cols),
        ("cat", categorical_pipeline, cat_cols)
    ],
    remainder="drop"
)


Filas antes: (1610, 20)
Filas después: (1610, 20)
X_train, X_test: (1207, 19) (403, 19)
Nulos en y_train: 0 | Nulos en y_test: 0
Numéricas: ['Age', 'Weight (kg)', 'Height (m)', 'BMI', 'Abdominal Circumference (cm)', 'Total Cholesterol (mg/dL)', 'HDL (mg/dL)', 'Fasting Blood Sugar (mg/dL)', 'Height (cm)', 'Waist-to-Height Ratio', 'Systolic BP', 'Diastolic BP', 'Estimated LDL (mg/dL)']
Categóricas: ['Sex', 'Smoking Status', 'Diabetes Status', 'Physical Activity Level', 'Family History of CVD', 'Blood Pressure Category']


## 4. Modelos (2 pipelines)

In [24]:
model_1 = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", LinearRegression())
])

model_1.fit(X_train, y_train)
y_pred_1 = model_1.predict(X_test)


mse_1 = mean_squared_error(y_test, y_pred_1)
rmse_1 = np.sqrt(mse_1)
mae_1 = mean_absolute_error(y_test, y_pred_1)
r2_1 = r2_score(y_test, y_pred_1)

print("Modelo 1 - LinearRegression")
print("RMSE:", rmse_1)
print("MAE:", mae_1)
print("R2:", r2_1)

model_2 = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", Ridge(alpha=1.0, random_state=42))
])

model_2.fit(X_train, y_train)
y_pred_2 = model_2.predict(X_test)

mse_2 = mean_squared_error(y_test, y_pred_2)
rmse_2 = np.sqrt(mse_2)
mae_2 = mean_absolute_error(y_test, y_pred_2)
r2_2 = r2_score(y_test, y_pred_2)

print("\nModelo 2 - Ridge")
print("RMSE:", rmse_2)
print("MAE:", mae_2)
print("R2:", r2_2)



Modelo 1 - LinearRegression
RMSE: 10.606979818829508
MAE: 4.042407984744389
R2: 0.003877810056488573

Modelo 2 - Ridge
RMSE: 10.605436657324653
MAE: 4.037629663357092
R2: 0.0041676315938936614


## 5. Tabla comparativa (RMSE/MAE/R2)

In [25]:
import pandas as pd

results = pd.DataFrame({
    "Modelo": ["LinearRegression", "Ridge"],
    "RMSE": [rmse_1, rmse_2],
    "MAE": [mae_1, mae_2],
    "R2": [r2_1, r2_2]
})

results


Unnamed: 0,Modelo,RMSE,MAE,R2
0,LinearRegression,10.60698,4.042408,0.003878
1,Ridge,10.605437,4.03763,0.004168


## 6. Interpretación e importancia de variables

In [26]:
# Obtener nombres de las variables después del preprocesamiento
feature_names = model_2.named_steps["preprocessor"].get_feature_names_out()


In [27]:
coefficients = model_2.named_steps["regressor"].coef_

coef_df = pd.DataFrame({
    "Variable": feature_names,
    "Coeficiente": coefficients
})

coef_df = coef_df.sort_values(by="Coeficiente", key=abs, ascending=False)
coef_df.head(10)


Unnamed: 0,Variable,Coeficiente
27,cat__Blood Pressure Category_Normal,-2.097324
3,num__BMI,1.869249
17,cat__Diabetes Status_N,-1.564817
18,cat__Diabetes Status_Y,1.564817
9,num__Waist-to-Height Ratio,1.377157
4,num__Abdominal Circumference (cm),-1.018046
21,cat__Physical Activity Level_Moderate,1.009008
1,num__Weight (kg),-0.906116
20,cat__Physical Activity Level_Low,-0.900785
24,cat__Blood Pressure Category_Elevated,0.763184


## 7. Respuestas “Análisis de resultados”

## 8. Predicciones en test no etiquetado

## 9. Uso de herramientas de IA generativa