In [1]:
#Importación de librerías y preparación
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.model_selection import StratifiedKFold
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

RANDOM_STATE = 42

# Rutas
DATA_PATH = Path("../data/stroke.csv")
OUTPUTS = Path("../outputs")
TABLAS = OUTPUTS / "tablas"
FIGURAS = OUTPUTS / "figuras"
TABLAS.mkdir(parents=True, exist_ok=True)
FIGURAS.mkdir(parents=True, exist_ok=True)

pd.set_option("display.max_columns", None)

In [2]:
#Carga de datos
df = pd.read_csv(DATA_PATH)

# Comprobaciones mínimas
assert "stroke" in df.columns, "No encuentro la columna 'stroke'. Revisa el CSV."
assert set(df["stroke"].unique()) <= {0,1}, "La variable 'stroke' debe ser binaria 0/1."

n_rows, n_cols = df.shape
print(f"Dimensiones: {n_rows} filas x {n_cols} columnas")

# Visión general
display(df.head())
display(df.isna().sum().sort_values(ascending=False))
print("Duplicados:", df.duplicated().sum())
print("Balance de clases:\n", df["stroke"].value_counts(normalize=True).rename("pct")*100)

# Separar X e y
y = df["stroke"].astype(int)
X = df.drop(columns=["stroke"])

Dimensiones: 5110 filas x 12 columnas


Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


bmi                  201
id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
smoking_status         0
stroke                 0
dtype: int64

Duplicados: 0
Balance de clases:
 stroke
0    95.127202
1     4.872798
Name: pct, dtype: float64


In [3]:
#Numéricas y categóricas
num_cols = ["age", "avg_glucose_level", "bmi", "hypertension", "heart_disease"]
cat_cols = ["gender", "ever_married", "work_type", "Residence_type", "smoking_status"]

# Verificar existencia
missing_num = [c for c in num_cols if c not in X.columns]
missing_cat = [c for c in cat_cols if c not in X.columns]
assert not missing_num, f"Faltan numéricas en X: {missing_num}"
assert not missing_cat, f"Faltan categóricas en X: {missing_cat}"

X[num_cols].head(), X[cat_cols].head()

(    age  avg_glucose_level   bmi  hypertension  heart_disease
 0  67.0             228.69  36.6             0              1
 1  61.0             202.21   NaN             0              0
 2  80.0             105.92  32.5             0              1
 3  49.0             171.23  34.4             0              0
 4  79.0             174.12  24.0             1              0,
    gender ever_married      work_type Residence_type   smoking_status
 0    Male          Yes        Private          Urban  formerly smoked
 1  Female          Yes  Self-employed          Rural     never smoked
 2    Male          Yes        Private          Rural     never smoked
 3  Female          Yes        Private          Urban           smokes
 4  Female          Yes  Self-employed          Rural     never smoked)

In [4]:
#Valores faltantes bmi

#valores faltantes en bmi
print("Valores faltantes en 'bmi':", df['bmi'].isna().sum())

# Imputación
median_bmi = df['bmi'].median()
df['bmi'] = df['bmi'].fillna(median_bmi)

# Verificación
print("Valores faltantes tras imputación:", df['bmi'].isna().sum())
print("Mediana usada para imputación:", median_bmi)

Valores faltantes en 'bmi': 201
Valores faltantes tras imputación: 0
Mediana usada para imputación: 28.1


In [6]:
#Manejo de var categóricas
from sklearn.preprocessing import OneHotEncoder

# Inicializar codificador
ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

# Ajuste y transformación de var cat
X_cat = ohe.fit_transform(X[cat_cols])

# Obtener nombres de columnas nuevas
ohe_cols = ohe.get_feature_names_out(cat_cols)

# Crear DataFrame con variables codificadas
X_cat_df = pd.DataFrame(X_cat, columns=ohe_cols, index=X.index)

# Concatenar con las numéricas
X_num_df = X[num_cols].copy()
X_prepared = pd.concat([X_num_df, X_cat_df], axis=1)

print("Dimensiones de X_prepared:", X_prepared.shape)
X_prepared.head()

Dimensiones de X_prepared: (5110, 21)


Unnamed: 0,age,avg_glucose_level,bmi,hypertension,heart_disease,gender_Female,gender_Male,gender_Other,ever_married_No,ever_married_Yes,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,67.0,228.69,36.6,0,1,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
1,61.0,202.21,,0,0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,80.0,105.92,32.5,0,1,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,49.0,171.23,34.4,0,0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
4,79.0,174.12,24.0,1,0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


In [7]:
# Escalado var num
from sklearn.preprocessing import StandardScaler

# dataset preparado
X_scaled = X_prepared.copy()

# Definir las columnas numéricas que se deben escalar
num_cols_to_scale = ["age", "avg_glucose_level", "bmi"]

# Aplicar escalado estándar (media=0, DE=1)
scaler = StandardScaler()
X_scaled[num_cols_to_scale] = scaler.fit_transform(X_scaled[num_cols_to_scale])

print("Medias tras escalado:", X_scaled[num_cols_to_scale].mean().round(3).to_dict())
print("Desviaciones estándar:", X_scaled[num_cols_to_scale].std().round(3).to_dict())

X_scaled.head()


Medias tras escalado: {'age': 0.0, 'avg_glucose_level': 0.0, 'bmi': -0.0}
Desviaciones estándar: {'age': 1.0, 'avg_glucose_level': 1.0, 'bmi': 1.0}


Unnamed: 0,age,avg_glucose_level,bmi,hypertension,heart_disease,gender_Female,gender_Male,gender_Other,ever_married_No,ever_married_Yes,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,1.051434,2.706375,0.981345,0,1,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
1,0.78607,2.121559,,0,0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,1.62639,-0.005028,0.459269,0,1,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,0.255342,1.437358,0.701207,0,0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
4,1.582163,1.501184,-0.623083,1,0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


In [8]:
#Trabajar con desbalance de clases
from collections import Counter
from imblearn.over_sampling import RandomOverSampler

# Distribución original
print("Distribución original:", Counter(y))

#oversampler
ros = RandomOverSampler(random_state=42)

# Aplicar al dataset escalado
X_resampled, y_resampled = ros.fit_resample(X_scaled, y)

#Nueva distribución
print("Distribución tras oversampling:", Counter(y_resampled))
print("Dimensiones originales:", X_scaled.shape)
print("Dimensiones balanceadas:", X_resampled.shape)

Distribución original: Counter({0: 4861, 1: 249})
Distribución tras oversampling: Counter({1: 4861, 0: 4861})
Dimensiones originales: (5110, 21)
Dimensiones balanceadas: (9722, 21)
