Modelo predictivo para la readmisión de pacientes

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
from config import CLEAN_PATH

# Outcome column the models are trained to predict.
target = 'readmission_within_30_days'

# Predictor columns used as model inputs.
features = [
    'age',
    'gender',
    'specialty',
    'visit_type',
    'visit_duration_minutes',
    'cost_usd',
]

# Load the cleaned dataset from the location exported by the config module.
df = pd.read_csv(CLEAN_PATH)

# Working copy restricted to predictors + outcome; .copy() detaches it from
# df so later column assignments on df_model leave df untouched.
df_model = df[[*features, target]].copy()



In [None]:
# Encode categorical variables as integer codes.
# One encoder per column is kept so the code -> label mapping stays
# available through each encoder's `classes_`.
le_gender = LabelEncoder()
le_specialty = LabelEncoder()
le_visit_type = LabelEncoder()

# Fit on the untouched columns of `df` rather than on `df_model` itself:
# `df_model` is overwritten below, so fitting on it would make a re-run of
# this cell fit the encoders on already-encoded integers and lose the
# original category labels.
# NOTE(review): integer codes impose an arbitrary order on nominal
# categories; consider one-hot encoding for the linear model.
for column, encoder in (
    ('gender', le_gender),
    ('specialty', le_specialty),
    ('visit_type', le_visit_type),
):
    df_model[column] = encoder.fit_transform(df[column])

# Define X and y
X = df_model[features]
y = df_model[target]

# Split data into train and test sets (70/30).
# stratify=y keeps the class proportions the same in both splits;
# random_state fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Class-balance check for the target. Any class weighting has to be passed
# to the estimators that are actually fitted: constructing an estimator here
# without assigning it has no effect on later cells.
print(y.value_counts(normalize=True))

readmission_within_30_days
0    0.849438
1    0.150562
Name: proportion, dtype: float64


In [3]:
# Fit models
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42)
}

results = {}

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]
    
    results[name] = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
        'ROC-AUC': roc_auc_score(y_test, y_prob)
    }

results_df = pd.DataFrame(results).T
results_df.sort_values(by="ROC-AUC", ascending=False)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0,Accuracy,Precision,Recall,ROC-AUC
Logistic Regression,0.84944,0.0,0.0,0.502729
Decision Tree,0.728707,0.149194,0.170519,0.499081
Random Forest,0.84716,0.172414,0.003985,0.493889
