In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from imblearn.over_sampling import SMOTE
from joblib import dump

In [3]:
# Read the CSV at ../results/3_standardization_final.csv into a DataFrame
# (presumably the output of an earlier standardization step — confirm) and
# preview its first five rows as the cell's output.
df_data = pd.read_csv(filepath_or_buffer="../results/3_standardization_final.csv")
df_data.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,is_isolated,is_lof,Outcome
0,0.639947,0.865108,-0.033518,0.6655021,-4.634358e-18,0.166292,0.468492,1.425995,1,1,1
1,-0.844885,-1.206162,-0.529859,-0.01746338,-4.634358e-18,-0.852531,-0.365061,-0.190672,1,1,0
2,1.23388,2.015813,-0.695306,3.501613e-17,-4.634358e-18,-1.332833,0.604397,-0.105584,1,1,1
3,-0.844885,-1.074652,-0.529859,-0.7004289,-0.7243887,-0.634212,-0.920763,-1.041549,1,1,0
4,-1.141852,0.503458,-2.680669,0.6655021,0.1465506,1.54898,5.484909,-0.020496,-1,-1,1


In [4]:
# Separate the target ("Outcome") from the feature matrix.
#
# The preview of df_data shows an "Unnamed: 0" column holding 0, 1, 2, ...;
# that looks like a row index written out by an earlier to_csv call (TODO:
# confirm against the step that produced the CSV). A row number is not a
# measurement, so it must not be handed to SMOTE or to the classifier as a
# feature. Any such index-like column is dropped together with the target;
# if none is present the list is empty and only "Outcome" is removed.
index_like_cols = [col for col in df_data.columns if str(col).startswith("Unnamed:")]
X = df_data.drop(columns=["Outcome", *index_like_cols])
y = df_data["Outcome"]

# Hold out 20% of the rows for testing. stratify=y keeps the class ratio of
# "Outcome" the same in both splits; random_state pins the shuffle so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [5]:
# Oversample with SMOTE on the training split only; the test split is not
# touched by this cell. random_state fixes the resampling result.
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

# Report the per-class row counts before and after resampling.
counts_before = y_train.value_counts()
counts_after = pd.Series(y_train_balanced).value_counts()
print(f"Original dataset shape: {counts_before}")
print(f"Balanced dataset shape: {counts_after}")

Original dataset shape: Outcome
0    400
1    214
Name: count, dtype: int64
Balanced dataset shape: Outcome
0    400
1    400
Name: count, dtype: int64


In [6]:
# Build a random forest with a fixed seed and fit it on the SMOTE-balanced
# training data. fit() returns the estimator itself, so the trailing `clf`
# displays the fitted model as the cell's output.
clf = RandomForestClassifier(random_state=42).fit(X_train_balanced, y_train_balanced)
clf

In [7]:
# 5. Evaluate the model on the test set
y_pred = clf.predict(X_test)

# Compute each evaluation metric and print it, in the order:
# accuracy, per-class classification report, confusion matrix.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy:.4f}")

report = classification_report(y_true=y_test, y_pred=y_pred)
print("Classification Report:\n", report)

conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 0.7338
Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.78      0.79       100
           1       0.61      0.65      0.63        54

    accuracy                           0.73       154
   macro avg       0.71      0.71      0.71       154
weighted avg       0.74      0.73      0.74       154

Confusion Matrix:
 [[78 22]
 [19 35]]


In [8]:
# Persist the fitted classifier to disk with joblib. The call returns the list
# of file names it wrote, which is what the cell displays.
dump(value=clf, filename="../results/random_forest_model.joblib")

['../results/random_forest_model.joblib']