# Random Forest Classifier

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
from sklearn.model_selection import GridSearchCV
import os

# Project root used to resolve the relative data/output paths below.
# Override it with the AFI_PROJECT_ROOT environment variable; the default is
# the original author's local checkout, which only exists on that machine.
PROJECT_ROOT = os.environ.get(
    "AFI_PROJECT_ROOT",
    "D:/Algorithmic-Fairness-Interpretability/afi_final_project",
)
if os.path.isdir(PROJECT_ROOT):
    os.chdir(PROJECT_ROOT)
else:
    # Do not crash on machines without that directory: keep the current
    # working directory (e.g. when the notebook is launched from the project
    # root) and tell the reader which directory is being used.
    print(f"Project root {PROJECT_ROOT!r} not found; staying in {os.getcwd()!r}")

In [2]:
# Load the dataset from the Excel workbook (path is relative to the working directory).
dataset_path = "data/dataproject2024.xlsx"
df = pd.read_excel(dataset_path)

In [3]:
# Target vector: the "Default (y)" column.
y = df["Default (y)"]

# Feature matrix: every remaining column except the four listed here, which
# are excluded so they cannot be used as model inputs.
excluded_columns = ["Default (y)", "Pred_default (y_hat)", "ID", "PD"]
X = df.drop(columns=excluded_columns)

In [4]:
# Hold out 30% of the rows for testing; the fixed seed keeps the partition
# identical across runs.
split_options = {"test_size": 0.3, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [5]:
# Hyper-parameter grid for the random forest search
# (3 * 4 * 3 * 3 * 2 = 216 candidate combinations).
param_grid_rf = dict(
    n_estimators=[100, 200, 300],  # number of trees in the forest
    max_depth=[None, 10, 20, 30],  # maximum tree depth (None = unlimited)
    min_samples_split=[2, 5, 10],  # minimum samples needed to split a node
    min_samples_leaf=[1, 2, 4],  # minimum samples required at a leaf node
    bootstrap=[True, False],  # whether bootstrap samples are used
)

In [6]:
# Base estimator; n_estimators is also part of param_grid_rf, so the value
# given here is replaced by each candidate during the search.
rf_model = RandomForestClassifier(random_state=42, n_estimators=100)

# Exhaustive search over param_grid_rf with 5-fold cross-validation,
# ranking candidates by ROC AUC and using all available cores.
rf_search_options = dict(cv=5, scoring="roc_auc", n_jobs=-1, verbose=2)
grid_search_rf = GridSearchCV(rf_model, param_grid_rf, **rf_search_options)

grid_search_rf.fit(X_train, y_train)

Fitting 5 folds for each of 216 candidates, totalling 1080 fits


  _data = np.array(data, dtype=dtype, copy=copy,


In [7]:
# Keep the best estimator found by the search, then report its parameters.
best_rf_model = grid_search_rf.best_estimator_
print(f"Best parameters for Random Forest: {grid_search_rf.best_params_}")

Best parameters for Random Forest: {'bootstrap': True, 'max_depth': 20, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 200}


In [8]:
# Per-class probabilities on the test set; the second column is kept as the
# predicted probability used for AUC and for the exported scores.
rf_class_probabilities = best_rf_model.predict_proba(X_test)
y_pred_proba = rf_class_probabilities[:, 1]

# Hard class labels on the same test set.
y_pred = best_rf_model.predict(X_test)

In [9]:
# Test-set metrics for the random forest: AUC uses the predicted
# probabilities, accuracy uses the hard labels.
roc_auc = roc_auc_score(y_true=y_test, y_score=y_pred_proba)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

In [10]:
# Per-class precision / recall / F1 on the test set.
rf_classification_report = classification_report(y_test, y_pred)

# Report the headline metrics followed by the per-class breakdown.
print(f"Accuracy: {accuracy:.4f}")
print(f"AUC-ROC: {roc_auc:.4f}")
print("\nClassification Report:\n", rf_classification_report)

Accuracy: 0.8183
AUC-ROC: 0.7700

Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.97      0.90      1788
           1       0.63      0.20      0.30       441

    accuracy                           0.82      2229
   macro avg       0.73      0.58      0.60      2229
weighted avg       0.79      0.82      0.78      2229



The classification report shows that the Random Forest misses most true defaults: recall for the default class is only 0.20, meaning roughly 80% of the actual defaulters in the test set are predicted as non-defaults.

In [12]:
# Importance scores of the tuned forest, one value per column of X.
feature_importances = best_rf_model.feature_importances_

# Pair each feature name with its score, then rank from most to least important.
importance_table = pd.DataFrame(
    {"Feature": X.columns, "Importance": feature_importances}
)
importance_df = importance_table.sort_values("Importance", ascending=False)

print("\nFeature Importances:\n", importance_df)


Feature Importances:
             Feature  Importance
3    Funding amount    0.187192
6   Monthly payment    0.171444
2         Car price    0.170134
1               Age    0.149165
0        Job tenure    0.125264
5     Loan duration    0.075591
9         Homeowner    0.039290
10            Group    0.031945
8           Married    0.027838
7      Credit event    0.014988
4      Down payment    0.007149


In [13]:
# Test features plus the random forest's predicted probability and the true
# label; assign() works on a copy, so X_test itself is left untouched.
result_df = X_test.assign(
    Predicted_PD=y_pred_proba,
    True_Label=y_test.values,
)

print(result_df.head())

# Persist the scored test set (the DataFrame index is not written).
rf_output_path = "predictions_output_random_forest.csv"
result_df.to_csv(rf_output_path, index=False)

      Job tenure  Age  Car price  Funding amount  Down payment  Loan duration  \
625            2   22       8900            8900             0             60   
2796           0   55      10400            9400             0             72   
101            1   40      15990           14990             0             60   
4767           5   43      19999           23233             0             72   
2018           1   26      11800            4298             1             24   

      Monthly payment  Credit event  Married  Homeowner  Group  Predicted_PD  \
625          0.084828             0        0          0      0      0.354683   
2796         0.083889             0        0          0      0      0.254694   
101          0.127142             0        1          0      0      0.200084   
4767         0.153289             0        0          0      0      0.337645   
2018         0.115528             0        0          0      0      0.166287   

      True_Label  
625          

# Gradient Boosting Classifier

In [14]:
# Hyper-parameter grid for the gradient boosting search
# (3 * 3 * 3 * 2 * 3 = 162 candidate combinations).
param_grid_gb = dict(
    n_estimators=[100, 200, 300],  # number of boosting stages
    learning_rate=[0.01, 0.05, 0.1],  # shrinkage applied to each stage
    max_depth=[3, 5, 7],  # maximum depth of each tree
    subsample=[0.8, 1.0],  # fraction of samples used per stage
    min_samples_split=[2, 5, 10],  # minimum samples needed to split a node
)

In [15]:
# Base gradient boosting estimator with a fixed seed for reproducibility.
gb_model = GradientBoostingClassifier(random_state=42)

# Exhaustive search over param_grid_gb with 5-fold cross-validation,
# ranking candidates by ROC AUC and using all available cores.
gb_search_options = dict(cv=5, scoring="roc_auc", n_jobs=-1, verbose=2)
grid_search_gb = GridSearchCV(gb_model, param_grid_gb, **gb_search_options)
grid_search_gb.fit(X_train, y_train)

Fitting 5 folds for each of 162 candidates, totalling 810 fits


In [16]:
# Keep the best estimator found by the search, then report its parameters.
best_gb_model = grid_search_gb.best_estimator_
print(f"Best parameters for Gradient Boosting: {grid_search_gb.best_params_}")

Best parameters for Gradient Boosting: {'learning_rate': 0.05, 'max_depth': 3, 'min_samples_split': 2, 'n_estimators': 200, 'subsample': 0.8}


In [17]:
# Per-class probabilities on the test set; the second column is kept as the
# predicted probability used for AUC and for the exported scores.
gb_class_probabilities = best_gb_model.predict_proba(X_test)
y_pred_proba_gb = gb_class_probabilities[:, 1]

# Hard class labels on the same test set.
y_pred_gb = best_gb_model.predict(X_test)

# Headline test-set metrics for the tuned gradient boosting model.
print(f"Optimized Gradient Boosting Accuracy: {accuracy_score(y_test, y_pred_gb):.4f}")
print(f"Optimized Gradient Boosting AUC-ROC: {roc_auc_score(y_test, y_pred_proba_gb):.4f}")

Optimized Gradient Boosting Accuracy: 0.8174
Optimized Gradient Boosting AUC-ROC: 0.7759


In [21]:
# Test features plus the gradient boosting model's predicted probability and
# the true label; assign() works on a copy, so X_test itself is left untouched.
result_df = X_test.assign(
    Predicted_PD=y_pred_proba_gb,
    True_Label=y_test.values,
)

print(result_df.head())

# Persist the scored test set (the DataFrame index is not written).
gb_output_path = "predictions_output_gradient_boosting.csv"
result_df.to_csv(gb_output_path, index=False)

      Job tenure  Age  Car price  Funding amount  Down payment  Loan duration  \
625            2   22       8900            8900             0             60   
2796           0   55      10400            9400             0             72   
101            1   40      15990           14990             0             60   
4767           5   43      19999           23233             0             72   
2018           1   26      11800            4298             1             24   

      Monthly payment  Credit event  Married  Homeowner  Group  Predicted_PD  \
625          0.084828             0        0          0      0      0.268061   
2796         0.083889             0        0          0      0      0.350588   
101          0.127142             0        1          0      0      0.204690   
4767         0.153289             0        0          0      0      0.391570   
2018         0.115528             0        0          0      0      0.109665   

      True_Label  
625          

1. Accuracy:
- Random Forest: 81.83%
- Gradient Boosting: 81.74%

Both models achieve similar accuracy, but accuracy is not very informative on imbalanced data: 1,788 of the 2,229 test cases are non-defaults, so always predicting "no default" would already score about 80.2%.

2. AUC-ROC:
- Random Forest: 0.7700
- Gradient Boosting: 0.7759

Both models distinguish defaults from non-defaults about equally well; Gradient Boosting is marginally better (a difference of about 0.006 in AUC-ROC on this test split).
