### Preprocesamiento

In [70]:
import kagglehub
import pandas as pd
import numpy as np
import os

from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, PowerTransformer, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, r2_score, confusion_matrix, classification_report

from scipy.stats import randint
import joblib
from pathlib import Path

In [52]:
# Resolve the dataset through kagglehub instead of a user-specific absolute
# path, so the notebook can run on another machine. The handle is pinned to
# version 1, the version the previous hard-coded cache path pointed at.
# dataset_download returns the local directory that holds the dataset files.
path = kagglehub.dataset_download(
    "mdsultanulislamovi/sleep-disorder-diagnosis-dataset/versions/1"
)
file_path = "Sleep_health_and_lifestyle_dataset.csv"
full_path = os.path.join(path, file_path)
df = pd.read_csv(full_path)

In [53]:
# Display the loaded frame to inspect its columns and a sample of rows.
df

Unnamed: 0,Person ID,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Blood Pressure,Heart Rate,Daily Steps,Sleep Disorder
0,1,Male,27,Software Engineer,6.1,6,42,6,Overweight,126/83,77,4200,
1,2,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,
2,3,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,
3,4,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea
4,5,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea
...,...,...,...,...,...,...,...,...,...,...,...,...,...
369,370,Female,59,Nurse,8.1,9,75,3,Overweight,140/95,68,7000,Sleep Apnea
370,371,Female,59,Nurse,8.0,9,75,3,Overweight,140/95,68,7000,Sleep Apnea
371,372,Female,59,Nurse,8.1,9,75,3,Overweight,140/95,68,7000,Sleep Apnea
372,373,Female,59,Nurse,8.1,9,75,3,Overweight,140/95,68,7000,Sleep Apnea


In [54]:
# Work on an independent (deep) copy so the raw frame `df` is left untouched.
df_copy = df.copy(deep=True)

##### Gestionar columnas categóricas

In [55]:
# Encode Gender as a binary flag (Male -> 0, Female -> 1).
df_copy["Gender"] = df_copy["Gender"].map({"Male": 0, "Female": 1})

# Encode BMI as an ordinal rank following the explicit category order below.
# The encoder reads from df_copy (not the raw df) so that every step of this
# cell operates on the same frame.
# NOTE(review): "Normal" and "Normal Weight" look like two spellings of the
# same category yet receive different ranks (0 and 1) — confirm whether they
# should be merged before encoding.
ord_encoder = OrdinalEncoder(categories=[["Normal", "Normal Weight", "Overweight", "Obese"]])
df_copy["BMI Encoded"] = ord_encoder.fit_transform(df_copy[["BMI Category"]]).ravel()

# One-hot encode Occupation; drop="first" omits the first category's column.
oh_encoder = OneHotEncoder(handle_unknown="ignore", drop="first", sparse_output=False)
occ_encoded = oh_encoder.fit_transform(df_copy[["Occupation"]])

# Reuse df_copy's index so the concat below aligns row by row even when the
# frame does not carry a default RangeIndex.
df_occ = pd.DataFrame(
    occ_encoded,
    columns=oh_encoder.get_feature_names_out(["Occupation"]),
    index=df_copy.index,
)
df_copy = pd.concat([df_copy.drop(columns=["Occupation"]), df_occ], axis=1)

# Split "systolic/diastolic" strings into two helper columns (still strings here).
df_copy[["Systolic", "Diastolic"]] = df_copy["Blood Pressure"].str.split("/", expand=True)
def calc_pressure(row):
    """Classify one row's blood pressure reading into an ordinal category.

    Parameters
    ----------
    row : mapping-like
        Must provide "Systolic" and "Diastolic" entries, each convertible
        with ``int()`` (e.g. the strings produced by splitting "126/83").

    Returns
    -------
    int or None
        0 -> systolic < 120 and diastolic < 80
        1 -> systolic 120-129 and diastolic < 80
        2 -> systolic 130-139 or diastolic 80-89
        3 -> systolic >= 140 or diastolic >= 90
        4 -> systolic >= 180 or diastolic >= 120
        None when either value is missing.

    Raises
    ------
    ValueError
        If a non-missing value cannot be converted with ``int()``.
    """
    systolic = row["Systolic"]
    diastolic = row["Diastolic"]

    # The missing-value check has to run before int(): int(NaN) and int(None)
    # raise, so checking afterwards could never return None.
    if pd.isna(systolic) or pd.isna(diastolic):
        return None

    s = int(systolic)
    d = int(diastolic)

    # Test the most severe category first so that the higher category wins
    # when systolic and diastolic fall into different ranges (e.g. 135/90).
    if s >= 180 or d >= 120:
        return 4

    if s >= 140 or d >= 90:
        return 3

    if s >= 130 or d >= 80:
        return 2

    # From here on d < 80 and s < 130.
    if s >= 120:
        return 1

    return 0

# Derive the ordinal blood-pressure category for every row.
df_copy["BP Numeric"] = df_copy.apply(calc_pressure, axis=1)

# Encode the target: Insomnia -> 1, Sleep Apnea -> 2, anything else
# (including missing values) -> 0.
# map() leaves unmatched entries as NaN, which turns the column into floats;
# cast back to int after filling so the class labels are integers.
df_copy["Sleep Disorder"] = (
    df_copy["Sleep Disorder"]
    .map({"Insomnia": 1, "Sleep Apnea": 2})
    .fillna(0)
    .astype(int)
)

##### Eliminar columnas innecesarias

In [56]:
# Remove the identifier plus the raw / helper columns that now have an
# encoded counterpart ("BMI Encoded", "BP Numeric").
df_copy = df_copy.drop(
    labels=["Person ID", "BMI Category", "Blood Pressure", "Systolic", "Diastolic"],
    axis="columns",
)

##### Normalizar o estandarizar
Skewness (asimetría):
 - Alrededor de 0 --> Distribución simétrica
 - Mayor que 0 --> Asimetría positiva (cola hacia derecha)
 - Menor que 0 --> Asimetría negativa (cola hacia izquierda)

Kurtosis: En Pandas devuelve exceso (kurtosis - 3):
 - Alrededor de 0 --> Pico normal
 - Mayor que 0 --> Pico más alto, colas más pesadas (verticales)
 - Menor que 0 --> Pico más bajo, colas más ligeras (horizontales)

Distribuciones normales --> Estandarizar (StandardScaler)  
Distribuciones no gaussianas --> Normalizar (MinMaxScaler)  
Distribuciones binarias --> Dejar como están

In [57]:
# Numeric columns whose distribution shape is inspected here and which are
# rescaled in the following cell.
to_normalize = [
    "Age",
    "Sleep Duration",
    "Quality of Sleep",
    "Physical Activity Level",
    "Stress Level",
    "Heart Rate",
    "Daily Steps",
]

# Print skewness and (excess) kurtosis per column.
for col in to_normalize:
    column_values = df_copy[col]
    skew = column_values.skew()
    kurt = column_values.kurtosis()
    print(f"{col.capitalize()}:\n\tSimetría {skew}\n\tKurtosis:{kurt}")

Age:
	Simetría 0.2572221422742844
	Kurtosis:-0.9097795476259583
Sleep duration:
	Simetría 0.037554389846484834
	Kurtosis:-1.286506239045075
Quality of sleep:
	Simetría -0.20744763173836073
	Kurtosis:-0.7482755418548042
Physical activity level:
	Simetría 0.07448690272010197
	Kurtosis:-1.2660677718961595
Stress level:
	Simetría 0.15432958161578225
	Kurtosis:-1.3273065644578361
Heart rate:
	Simetría 1.2248235470533522
	Kurtosis:2.28645466720358
Daily steps:
	Simetría 0.17827733092832274
	Kurtosis:-0.3940306018221511


Todas a excepción de **_Heart Rate_** tienen una distribución aproximadamente simétrica (|_skew_| < 0.5).  
En el caso de **_Heart Rate_**, la asimetría (≈ 1.22) y la kurtosis (≈ 2.29) indican la necesidad de una transformación. Se usa Box-Cox porque todos sus valores son mayores que 0.  

In [58]:
# Aplicar transformación Box-Cox
pt = PowerTransformer(method='box-cox')
df_copy[["Heart Rate"]] = pt.fit_transform(df_copy[["Heart Rate"]])

# Estandarizar columnas
scaler = StandardScaler()
df_copy[to_normalize] = scaler.fit_transform(df_copy[to_normalize])

In [59]:
# Display the preprocessed frame to check the encoded and rescaled columns.
df_copy

Unnamed: 0,Gender,Age,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,Heart Rate,Daily Steps,Sleep Disorder,BMI Encoded,...,Occupation_Engineer,Occupation_Lawyer,Occupation_Manager,Occupation_Nurse,Occupation_Sales Representative,Occupation_Salesperson,Occupation_Scientist,Occupation_Software Engineer,Occupation_Teacher,BP Numeric
0,0,-1.753096,-1.298887,-1.098280,-0.825418,0.347021,1.545762,-1.619584,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2
1,0,-1.637643,-1.173036,-1.098280,0.039844,1.475592,1.229266,1.970077,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
2,0,-1.637643,-1.173036,-1.098280,0.039844,1.475592,1.229266,1.970077,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
3,0,-1.637643,-1.550588,-2.771424,-1.402260,1.475592,2.396916,-2.362273,2.0,3.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,3
4,0,-1.637643,-1.550588,-2.771424,-1.402260,1.475592,2.396916,-2.362273,2.0,3.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
369,1,1.941401,1.218127,1.411435,0.760896,-1.345836,-0.450317,0.113356,2.0,2.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,3
370,1,1.941401,1.092276,1.411435,0.760896,-1.345836,-0.450317,0.113356,2.0,2.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,3
371,1,1.941401,1.218127,1.411435,0.760896,-1.345836,-0.450317,0.113356,2.0,2.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,3
372,1,1.941401,1.218127,1.411435,0.760896,-1.345836,-0.450317,0.113356,2.0,2.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,3


##### Separación training-test

In [60]:
# Features y target
X = df_copy.drop(columns=["Sleep Disorder"])
y = df_copy["Sleep Disorder"]

# División train-test, stratify mantiene las proporciones de clases del target
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, stratify=y)

# Comprobar matches
print(f"Forma de X_train: {X_train.shape}\tForma de y_train: {y_train.shape}")
print(f"Forma de X_test: {X_test.shape}\tForma de y_test: {y_test.shape}")

Forma de X_train: (299, 20)	Forma de y_train: (299,)
Forma de X_test: (75, 20)	Forma de y_test: (75,)


##### Selección de modelos

In [61]:
# Validación cruzada
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Definición de modelos
models = {
    "LogisticRegression": LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=500),
    "RandomForest": RandomForestClassifier(n_estimators=100, random_state=42),
    "XGBoost": XGBClassifier(eval_metric="mlogloss", learning_rate= 0.05),
    "SVM": SVC(kernel="rbf", probability=True)
}

# Evaluación
results = {}
for name, model in models.items():
    scores = cross_val_score(model, X, y, cv=cv, scoring='f1_macro')
    results[name] = {
        "mean_f1": np.mean(scores),
        "std_f1": np.std(scores)
    }

for name, scores in results.items():
    print(f"Modelo: {name}")
    print(f"Media: {scores["mean_f1"]} // DT: {scores["std_f1"]}\n")



Modelo: LogisticRegression
Media: 0.8688726205264427 // DT: 0.047497188444418866

Modelo: RandomForest
Media: 0.8865585754635303 // DT: 0.042501290820022686

Modelo: XGBoost
Media: 0.8789935855314324 // DT: 0.03862056148107422

Modelo: SVM
Media: 0.8713691784825123 // DT: 0.04847743315247311



##### Refinar el mejor modelo: RandomForest

In [62]:
# Baseline: the random forest from the comparison above, fitted on the
# training split and used to predict the held-out test split.
rf_base = models["RandomForest"]
base_pred = rf_base.fit(X_train, y_train).predict(X_test)

def evaluate_rf(y_pred, y_true=None):
    """Print accuracy, macro-F1 and R2 for a set of predictions.

    Parameters
    ----------
    y_pred : array-like
        Predicted class labels.
    y_true : array-like, optional
        Reference labels to compare against. When omitted, the notebook-level
        ``y_test`` is used, which is the original behaviour.

    Returns
    -------
    None
        The three metrics are printed, not returned.
    """
    if y_true is None:
        # Default to the held-out test labels defined by the split cell.
        y_true = y_test

    print("Accuracy:", accuracy_score(y_true, y_pred))
    print("F1:", f1_score(y_true, y_pred, average='macro'))
    # NOTE(review): R2 is a regression metric; the class codes 0/1/2 are
    # nominal labels, so this number is hard to interpret — confirm it is wanted.
    print("R2:", r2_score(y_true, y_pred))

# Report the baseline random forest's metrics on the test split.
evaluate_rf(base_pred)

Accuracy: 0.9466666666666667
F1: 0.9335669123273328
R2: 0.9192680301399354


In [65]:
# Ajustar hiperparámetros
params = {
    "n_estimators": randint(100, 900),
    "max_depth": randint(5, 100),
    "max_features": ["sqrt", "log2"],
    "min_samples_split": randint(2, 50),
    "min_samples_leaf": randint(1, 10),
    "bootstrap": [True, False]
}

search = RandomizedSearchCV(
    rf_base, params, n_iter=60, scoring="f1_macro",
    cv=cv, n_jobs=-1, random_state=42, verbose=1
)
search.fit(X_train, y_train)

best_rf = search.best_estimator_

Fitting 5 folds for each of 60 candidates, totalling 300 fits


In [66]:
# Evaluar el mejor RF
best_rf.fit(X_train, y_train)
best_pred = best_rf.predict(X_test)

evaluate_rf(best_pred)

Accuracy: 0.9066666666666666
F1: 0.8704379266177019
R2: 0.8587190527448869


Como **rf_base** obtuvo mejores métricas que **best_rf** en el conjunto de test, nos quedamos con **rf_base**.

In [71]:
# Persist the baseline model in the current working directory.
current_dir = Path.cwd()
model_path = current_dir / "rf_sleep_predictor.pkl"

# NOTE(review): only the classifier is stored; the fitted encoders, the
# PowerTransformer and the scaler are not, so new data would have to be
# preprocessed identically before calling predict — confirm this is intended.
if not model_path.exists():
    # Write to the very path that was checked above, rather than to a
    # separate string literal that merely happens to resolve to the same file.
    joblib.dump(rf_base, model_path)
else:
    # An existing file is kept as is (it is not overwritten by this run).
    print("El modelo ya existe y está guardado")

El modelo ya existe y está guardado
