# Random Forest

In [None]:
import pandas as pd
df = pd.read_csv('https://raw.githubusercontent.com/dt-sc-Josue-Diaz/Proyecto-DS-II/refs/heads/main/05_models_evaluation/data_para_entrenamiento.csv')
df = df.drop('Unnamed: 0',axis = 'columns')

X = df.drop(columns=['moroso'])
y = df['moroso']

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score



X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

random_forest = RandomForestClassifier()
random_forest.fit(X_train, y_train)
y_pred_train_rf = random_forest.predict(X_train)
y_pred_test_rf = random_forest.predict(X_test)

print("Training")
print(classification_report(y_train, y_pred_train_rf))
print("---------------------------------------------------------")
print("Test")
print(classification_report(y_test, y_pred_test_rf))

Training
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    102176
           1       1.00      0.99      0.99      7381

    accuracy                           1.00    109557
   macro avg       1.00      1.00      1.00    109557
weighted avg       1.00      1.00      1.00    109557

---------------------------------------------------------
Test
              precision    recall  f1-score   support

           0       0.94      0.99      0.97     34053
           1       0.52      0.19      0.27      2466

    accuracy                           0.93     36519
   macro avg       0.73      0.59      0.62     36519
weighted avg       0.91      0.93      0.92     36519



## Optimizado

In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from imblearn.under_sampling import RandomUnderSampler

rus = RandomUnderSampler(random_state=42)
X_res, y_res = rus.fit_resample(X_train, y_train)

random_forest = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    class_weight="balanced",
    random_state=42
)

random_forest.fit(X_res, y_res)

y_pred_train_rf = random_forest.predict(X_train)
y_pred_test_rf = random_forest.predict(X_test)

print("Training")
print(classification_report(y_train, y_pred_train_rf))
print("---------------------------------------------------------")
print("Test")
print(classification_report(y_test, y_pred_test_rf))

Training
              precision    recall  f1-score   support

           0       0.98      0.78      0.87    102176
           1       0.22      0.82      0.34      7381

    accuracy                           0.79    109557
   macro avg       0.60      0.80      0.61    109557
weighted avg       0.93      0.79      0.84    109557

---------------------------------------------------------
Test
              precision    recall  f1-score   support

           0       0.98      0.78      0.87     34053
           1       0.20      0.78      0.32      2466

    accuracy                           0.78     36519
   macro avg       0.59      0.78      0.60     36519
weighted avg       0.93      0.78      0.83     36519



In [6]:
random_forest_sample = df.sample(n=20000, random_state=42)

X_rf_train, X_rf_test, y_rf_train, y_rf_test = train_test_split(X, y, test_size=0.3, random_state=42)

random_forest.fit(X_res, y_res)

y_pred_train_rf = random_forest.predict(X_train)
y_pred_test_rf = random_forest.predict(X_test)

print("Training")
print(classification_report(y_train, y_pred_train_rf))
print("---------------------------------------------------------")
print("Test")
print(classification_report(y_test, y_pred_test_rf))

Training
              precision    recall  f1-score   support

           0       0.98      0.78      0.87    102176
           1       0.22      0.82      0.34      7381

    accuracy                           0.79    109557
   macro avg       0.60      0.80      0.61    109557
weighted avg       0.93      0.79      0.84    109557

---------------------------------------------------------
Test
              precision    recall  f1-score   support

           0       0.98      0.78      0.87     34053
           1       0.20      0.78      0.32      2466

    accuracy                           0.78     36519
   macro avg       0.59      0.78      0.60     36519
weighted avg       0.93      0.78      0.83     36519



# Intentos de mejora con chatgpt

Dado el modelo de xgboost, que fue el que mejor encontramos; obtuvimos de chatgpt el siguiente modelo. 

In [9]:
import pandas as pd
df = pd.read_csv('https://raw.githubusercontent.com/dt-sc-Josue-Diaz/preentrega/refs/heads/main/data_credit_training.csv')
df.columns

df['MonthlyIncome'] = df['MonthlyIncome'].fillna(df['MonthlyIncome'].median())
df['NumberOfDependents'] = df['NumberOfDependents'].fillna(0)  # Si 0 tiene sentido
df['DebtRatio'] = df['DebtRatio'].clip(upper=1)

X = df.drop(columns=['SeriousDlqin2yrs'])
y = df['SeriousDlqin2yrs']


###------------------------------------------------------------------------


from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV
from sklearn.preprocessing import StandardScaler
from joblib import dump
import numpy as np

# Dividir los datos en entrenamiento y prueba
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.30, random_state=42)
for train_index, test_index in sss.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

# Aplicar SMOTE para balancear clases
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Escalar los datos
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_resampled)
X_test_scaled = scaler.transform(X_test)

# Parámetros para GridSearchCV
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'class_weight': [None, 'balanced']
}

# GridSearchCV con RandomForestClassifier
rf_model = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=3, scoring='roc_auc', n_jobs=-1, verbose=1)
grid_search.fit(X_train_scaled, y_train_resampled)

# Mejor modelo
best_model = grid_search.best_estimator_
print(f"Mejores parámetros: {grid_search.best_params_}")

# Guardar modelo
model_filename = 'rf_best_model.joblib'
dump(best_model, model_filename)
print(f"Modelo guardado como {model_filename}")

# Predicciones y evaluación
y_pred_train = best_model.predict(X_train_scaled)
y_pred_test = best_model.predict(X_test_scaled)

# Obtener las probabilidades para ROC-AUC
y_pred_train_proba = best_model.predict_proba(X_train_scaled)[:, 1]
y_pred_test_proba = best_model.predict_proba(X_test_scaled)[:, 1]

# Evaluar en conjunto de entrenamiento y prueba
roc_auc_train = roc_auc_score(y_train_resampled, y_pred_train_proba)
roc_auc_test = roc_auc_score(y_test, y_pred_test_proba)

print(f"ROC-AUC (Entrenamiento): {roc_auc_train:.2f}")
print(f"ROC-AUC (Prueba): {roc_auc_test:.2f}")

print("Métricas de clasificación - Entrenamiento")
print(classification_report(y_train_resampled, y_pred_train))
print("---------------------------------------------------------")

print("Métricas de clasificación - Prueba")
print(classification_report(y_test, y_pred_test))
print("---------------------------------------------------------")

# Retornar el modelo y métricas
modelo, metricas  = best_model, {
    'roc_auc_train': roc_auc_train,
    'roc_auc_test': roc_auc_test,
    'classification_report_train': classification_report(y_train_resampled, y_pred_train, output_dict=True),
    'classification_report_test': classification_report(y_test, y_pred_test, output_dict=True)
}


Fitting 3 folds for each of 48 candidates, totalling 144 fits
Mejores parámetros: {'class_weight': None, 'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Modelo guardado como rf_best_model.joblib
ROC-AUC (Entrenamiento): 0.97
ROC-AUC (Prueba): 0.83
Métricas de clasificación - Entrenamiento
              precision    recall  f1-score   support

           0       0.91      0.91      0.91     97982
           1       0.91      0.91      0.91     97982

    accuracy                           0.91    195964
   macro avg       0.91      0.91      0.91    195964
weighted avg       0.91      0.91      0.91    195964

---------------------------------------------------------
Métricas de clasificación - Prueba
              precision    recall  f1-score   support

           0       0.97      0.87      0.92     41992
           1       0.25      0.58      0.35      3008

    accuracy                           0.85     45000
   macro avg       0.61      0.73  

In [10]:
df = pd.read_csv('https://raw.githubusercontent.com/dt-sc-Josue-Diaz/preentrega/refs/heads/main/data_credit_training.csv')
df.columns

df['MonthlyIncome'] = df['MonthlyIncome'].fillna(df['MonthlyIncome'].median())
df['NumberOfDependents'] = df['NumberOfDependents'].fillna(0)  # Si 0 tiene sentido
df['DebtRatio'] = df['DebtRatio'].clip(upper=1)

X = df.drop(columns=['SeriousDlqin2yrs'])
y = df['SeriousDlqin2yrs']

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from imblearn.under_sampling import RandomUnderSampler

rus = RandomUnderSampler(random_state=42)
X_res, y_res = rus.fit_resample(X_train, y_train)

random_forest = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    class_weight="balanced",
    random_state=42
)

random_forest.fit(X_res, y_res)

y_pred_train_rf = random_forest.predict(X_train)
y_pred_test_rf = random_forest.predict(X_test)

print("Training")
print(classification_report(y_train, y_pred_train_rf))
print("---------------------------------------------------------")
print("Test")
print(classification_report(y_test, y_pred_test_rf))

Training
              precision    recall  f1-score   support

           0       0.98      0.78      0.87     97982
           1       0.21      0.83      0.34      7018

    accuracy                           0.78    105000
   macro avg       0.60      0.80      0.60    105000
weighted avg       0.93      0.78      0.84    105000

---------------------------------------------------------
Test
              precision    recall  f1-score   support

           0       0.98      0.78      0.87     41992
           1       0.20      0.79      0.32      3008

    accuracy                           0.78     45000
   macro avg       0.59      0.79      0.60     45000
weighted avg       0.93      0.78      0.83     45000

