In [None]:
# ============================================
# 1. Imports e instalaci√≥n
# ============================================
!pip install openpyxl imbalanced-learn -q

import pandas as pd
import numpy as np

from google.colab import files
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    accuracy_score,
    roc_auc_score
)
from imblearn.over_sampling import RandomOverSampler


In [None]:
# ============================================
# 2. Subir el archivo dataset.xlsx
# ============================================
print("üëâ Sube tu archivo dataset.xlsx")
uploaded = files.upload()

filename = next(iter(uploaded))
print(f"Archivo recibido: {filename}")


üëâ Sube tu archivo dataset.xlsx


Saving dataset (1).xlsx to dataset (1).xlsx
Archivo recibido: dataset (1).xlsx


In [None]:
# ============================================
# 3. Cargar el dataset
# ============================================
df = pd.read_excel(filename)

print("Primeras filas:")
display(df.head())

print("\nInfo:")
print(df.info())

# Eliminar columnas basura de texto si existen (por ejemplo 'Unnamed: 11' o 'Unnamed: 17')
for col in df.columns:
    if "Unnamed" in str(col):
        df = df.drop(columns=[col])

print("\nColumnas despu√©s de limpiar 'Unnamed':")
print(df.columns)

print("\nNulos por columna:")
print(df.isna().sum())


Primeras filas:


Unnamed: 0,GPS_longitude,GPS_latitude,Heart Rate (bpm),SpO2 Level (%),Systolic Blood Pressure (mmHg),Diastolic Blood Pressure (mmHg),Body Temperature (¬∞C),Heart Rate Alert,SpO2 Level Alert,Blood Pressure Alert,Temperature Alert,Unnamed: 11
0,-100.378637,25.770431,86,99,102,80,36.828,1,0,1,0,"Heart Rate Alert: Low=0, Normal=1, High=2"
1,-100.262124,25.756445,60,95,121,67,36.188,1,0,1,0,"SpO2 Level Alert: Normal=0, Low=1"
2,-100.343235,25.532561,83,100,121,78,36.682,1,0,1,0,"Low=0, Normal=1, High=2, Hypertensive Crisis=3"
3,-100.562062,25.699691,79,91,133,94,36.871,1,0,2,0,"Normal=0, Abnormal=1"
4,-100.344226,25.701936,87,93,127,59,36.482,1,0,0,0,



Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 12 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   GPS_longitude                    5000 non-null   float64
 1   GPS_latitude                     5000 non-null   float64
 2   Heart Rate (bpm)                 5000 non-null   int64  
 3   SpO2 Level (%)                   5000 non-null   int64  
 4   Systolic Blood Pressure (mmHg)   5000 non-null   int64  
 5   Diastolic Blood Pressure (mmHg)  5000 non-null   int64  
 6   Body Temperature (¬∞C)            5000 non-null   float64
 7   Heart Rate Alert                 5000 non-null   int64  
 8   SpO2 Level Alert                 5000 non-null   int64  
 9   Blood Pressure Alert             5000 non-null   int64  
 10  Temperature Alert                5000 non-null   int64  
 11  Unnamed: 11                      4 non-null      object 
dtypes: float64(3

In [None]:
# ============================================
# 4. Crear etiqueta binaria: Tiene_Problema
#    1 si alguna alerta > 0, 0 si todo est√° normal
# ============================================

# Intentamos detectar las columnas de alerta que existan en tu dataset
posibles_alertas = [
    'Heart Rate Alert',
    'SpO2 Level Alert',
    'Blood Pressure Alert',
    'Temperature Alert'
]

alert_cols = [c for c in posibles_alertas if c in df.columns]

if len(alert_cols) == 0:
    raise ValueError("No se encontraron columnas de alerta en el dataset.")

print("Columnas de alerta detectadas:", alert_cols)

df['Tiene_Problema'] = (df[alert_cols] > 0).any(axis=1).astype(int)

print("\nDistribuci√≥n de la etiqueta Tiene_Problema:")
print(df['Tiene_Problema'].value_counts())


Columnas de alerta detectadas: ['Heart Rate Alert', 'SpO2 Level Alert', 'Blood Pressure Alert', 'Temperature Alert']

Distribuci√≥n de la etiqueta Tiene_Problema:
Tiene_Problema
1    4981
0      19
Name: count, dtype: int64


In [None]:
# ============================================
# 5. Definir X y y
#    Usamos SOLO las m√©tricas, NO las columnas de alerta
# ============================================

# Tomamos solo columnas num√©ricas
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()

# Quitamos las columnas de alerta y la etiqueta
feature_cols = [c for c in numeric_cols if c not in alert_cols + ['Tiene_Problema']]

print("Columnas usadas como features:")
print(feature_cols)

X = df[feature_cols]
y = df['Tiene_Problema']


Columnas usadas como features:
['GPS_longitude', 'GPS_latitude', 'Heart Rate (bpm)', 'SpO2 Level (%)', 'Systolic Blood Pressure (mmHg)', 'Diastolic Blood Pressure (mmHg)', 'Body Temperature (¬∞C)']


In [None]:
# ============================================
# 6. Train/Test Split
# ============================================
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print("Tama√±os:")
print("X_train:", X_train.shape, "X_test:", X_test.shape)

print("\nDistribuci√≥n en train:")
print(y_train.value_counts())
print("\nDistribuci√≥n en test:")
print(y_test.value_counts())


Tama√±os:
X_train: (4000, 7) X_test: (1000, 7)

Distribuci√≥n en train:
Tiene_Problema
1    3985
0      15
Name: count, dtype: int64

Distribuci√≥n en test:
Tiene_Problema
1    996
0      4
Name: count, dtype: int64


In [None]:
# ============================================
# 7. Rebalancear el conjunto de entrenamiento
# ============================================
ros = RandomOverSampler(random_state=42)
X_train_res, y_train_res = ros.fit_resample(X_train, y_train)

print("\nDistribuci√≥n despu√©s de oversampling (train rebalanceado):")
print(y_train_res.value_counts())



Distribuci√≥n despu√©s de oversampling (train rebalanceado):
Tiene_Problema
1    3985
0    3985
Name: count, dtype: int64


In [None]:
# ============================================
# 8. Entrenar modelo (RandomForest)
# ============================================
clf = RandomForestClassifier(
    n_estimators=300,
    random_state=42,
    n_jobs=-1
)

clf.fit(X_train_res, y_train_res)


In [None]:
# ============================================
# 9. Evaluaci√≥n
# ============================================
y_pred = clf.predict(X_test)
y_proba = clf.predict_proba(X_test)[:, 1]

acc = accuracy_score(y_test, y_pred)
roc = roc_auc_score(y_test, y_proba)

print(f"\n‚úÖ Accuracy: {acc:.4f}")
print(f"‚úÖ ROC-AUC: {roc:.4f}\n")

print("üìã Classification report:")
print(classification_report(y_test, y_pred, zero_division=0))

print("üî¢ Matriz de confusi√≥n:")
print(confusion_matrix(y_test, y_pred))



‚úÖ Accuracy: 0.9960
‚úÖ ROC-AUC: 0.9992

üìã Classification report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         4
           1       1.00      1.00      1.00       996

    accuracy                           1.00      1000
   macro avg       0.50      0.50      0.50      1000
weighted avg       0.99      1.00      0.99      1000

üî¢ Matriz de confusi√≥n:
[[  0   4]
 [  0 996]]


In [None]:
def evaluar_paciente(
    GPS_longitude,
    GPS_latitude,
    HeartRate,
    SpO2,
    SystolicBP,
    DiastolicBP,
    Temperature,
    umbral=0.5
):
    datos = pd.DataFrame([{
        'GPS_longitude': GPS_longitude,
        'GPS_latitude': GPS_latitude,
        'Heart Rate (bpm)': HeartRate,
        'SpO2 Level (%)': SpO2,
        'Systolic Blood Pressure (mmHg)': SystolicBP,
        'Diastolic Blood Pressure (mmHg)': DiastolicBP,
        'Body Temperature (¬∞C)': Temperature,
    }])

    proba = clf.predict_proba(datos)[0][1]
    pred = int(proba >= umbral)

    etiqueta = "‚ö†Ô∏è Posible problema" if pred == 1 else "‚úÖ Sin problema aparente"

    print("Resultado:", etiqueta)
    print(f"Probabilidad de problema: {proba:.4f} (umbral={umbral})")

    return pred, proba


In [None]:
evaluar_paciente(
    GPS_longitude=-100.30,
    GPS_latitude=25.70,
    HeartRate=75,
    SpO2=98,
    SystolicBP=120,
    DiastolicBP=80,
    Temperature=36.7,
    umbral=0.6
)


Resultado: ‚ö†Ô∏è Posible problema
Probabilidad de problema: 1.0000 (umbral=0.6)


(1, np.float64(1.0))

In [None]:
# ============================================
# 10. Guardar el modelo entrenado
# ============================================
import joblib

joblib.dump(clf, "modelo_salud_randomforest.pkl")
print("Modelo guardado como 'modelo_salud_randomforest.pkl'")


Modelo guardado como 'modelo_salud_randomforest.pkl'
