# 02 | Random Forest & XGBoost
*Evaluate ensemble tree-based classifiers to improve ROC-AUC and PR-AUC over the baseline logistic regression.*

In [None]:
import sqlite3
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc

## Load Engineered Dataset  
We’ll pull from our `credit_risk_engineered` table in the SQLite DB.

In [None]:
# Read the whole engineered-feature table into a DataFrame.
# NOTE(review): the path is relative to the kernel's working directory, and
# sqlite3.connect() creates an empty database when the file is missing, so a
# wrong working directory surfaces as a "no such table" error on the query
# rather than a missing-file error.
conn = sqlite3.connect("../data/loanvet.db")
try:
    df = pd.read_sql_query("SELECT * FROM credit_risk_engineered", conn)
finally:
    # Close the connection even when the query raises, so a failed run does
    # not leave an open handle on the database file in the live kernel.
    conn.close()
df.head()

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,...,TotalDelinquencies_log,HighUtilizationFlag,IncomePerCreditLine,AgeGroup_MidAge,AgeGroup_Senior,DependentsGroup_Small,DependentsGroup_Large,Util_x_Late,IncomePerDependent,CreditLines_x_Delinquencies
0,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,...,1.098612,0,651.428571,1,0,1,0,0.0,3040.0,26
1,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,...,0.0,1,520.0,1,0,1,0,0.0,1300.0,0
2,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,...,1.098612,0,1014.0,1,0,0,0,0.350539,3042.0,4
3,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,...,0.0,0,550.0,1,0,0,0,0.0,3300.0,0
4,0,0.907239,49,1,0.024926,50000.0,7,0,1,0,...,0.693147,1,6250.0,1,0,0,0,0.0,50000.0,7


## Drop Redundant Columns

In [None]:
# Columns removed before modelling (the section heading calls them redundant).
# NOTE(review): presumably each of these is superseded by an engineered
# feature that stays in the table (e.g. TotalDelinquencies_log, AgeGroup_*,
# DependentsGroup_*) — confirm against the feature-engineering step.
drop_cols = [
    "RevolvingUtilizationOfUnsecuredLines",
    "DebtRatio",
    "NumberOfTime30-59DaysPastDueNotWorse",
    "NumberOfTimes90DaysLate",
    "NumberOfTime60-89DaysPastDueNotWorse",
    'NumberOfTime30-59DaysPastDueNotWorse_log',
    'NumberOfTimes90DaysLate_log',
    'NumberOfTime60-89DaysPastDueNotWorse_log',
    "age",
    "NumberOfDependents",
    "TotalDelinquencies",
    "MonthlyIncome"
]

# errors="ignore" (below) stops the cell from failing on absent columns, but it
# also hides misspelled names and schema drift. Report anything that was
# requested yet is not in the frame so a silent no-op is visible to the reader.
missing_cols = [col for col in drop_cols if col not in df.columns]
if missing_cols:
    print(f"drop_cols entries not present in df (nothing dropped for these): {missing_cols}")

df = df.drop(columns=drop_cols, errors="ignore")

## Train/Test Split
Stratify on the target so that the train and test sets keep the same class proportions as the full (imbalanced) dataset.

In [None]:
# The label is the SeriousDlqin2yrs column; every other remaining column is
# used as a predictor.
y = df["SeriousDlqin2yrs"]
X = df.drop("SeriousDlqin2yrs", axis=1)

# Hold out 20% of the rows for evaluation. Stratifying on y keeps the class
# proportions the same in both partitions; the fixed seed makes the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    stratify=y,
    test_size=0.2,
)

## Random Forest Classifier

In [None]:
# 200-tree forest with a fixed seed. class_weight="balanced" re-weights the
# classes inversely to their frequency in y_train, which compensates for the
# imbalanced target. fit() returns the estimator itself, so it can be chained.
rf = RandomForestClassifier(
    class_weight="balanced",
    random_state=42,
    n_estimators=200,
).fit(X_train, y_train)

# predict_proba yields one column per class; column 1 is the second class in
# rf.classes_, i.e. the positive class when the label is coded 0/1.
rf_y_pred_prob = rf.predict_proba(X_test)[:, 1]

## XGBoost

In [None]:
# Shallow gradient-boosted trees: 100 boosting rounds, depth 3, learning rate
# 0.1, with log-loss as the evaluation metric.
# random_state is pinned to the same seed as the train/test split and the
# random forest so every stochastic step in the notebook is seeded explicitly.
# NOTE(review): with no row/column subsampling configured this should not
# change the fitted model — confirm by re-running.
# NOTE(review): unlike the random forest (class_weight='balanced'), no class
# re-weighting such as scale_pos_weight is applied here, so the two models are
# not trained under the same treatment of the class imbalance.
xgb = XGBClassifier(
    n_estimators=100,
    max_depth=3,
    learning_rate=0.1,
    eval_metric='logloss',
    random_state=42,
)
xgb.fit(X_train, y_train)

# Column 1 of predict_proba is the score for the second class, i.e. the
# positive class when the label is coded 0/1.
xgb_y_pred_prob = xgb.predict_proba(X_test)[:,1]

## Performance Metrics  
Calculate ROC‑AUC and PR‑AUC.

In [None]:
def calc_metrics(y_true, y_scores):
    """Compute ROC-AUC and PR-AUC for one set of predicted scores.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels, passed unchanged to the scikit-learn metrics.
    y_scores : array-like
        Predicted scores for the positive class (here, the second column of
        ``predict_proba``).

    Returns
    -------
    tuple
        ``(roc_auc, pr_auc)``. ``pr_auc`` is the area under the
        precision-recall curve obtained by integrating precision over recall
        with ``sklearn.metrics.auc`` (trapezoidal rule).
    """
    area_under_roc = roc_auc_score(y_true, y_scores)

    # precision_recall_curve returns (precision, recall, thresholds); the
    # thresholds are not needed for the area.
    pr_curve = precision_recall_curve(y_true, y_scores)
    prec, rec = pr_curve[0], pr_curve[1]
    area_under_pr = auc(rec, prec)

    return area_under_roc, area_under_pr

# Score both models against the same held-out labels (random forest first,
# then XGBoost) so the two pairs of numbers are directly comparable.
(rf_roc_auc, rf_pr_auc), (xgb_roc_auc, xgb_pr_auc) = (
    calc_metrics(y_test, model_scores)
    for model_scores in (rf_y_pred_prob, xgb_y_pred_prob)
)

# One line per model, four decimal places.
print(
    f"RF ROC-AUC: {rf_roc_auc:.4f}, PR-AUC: {rf_pr_auc:.4f}",
    f"XGB ROC-AUC: {xgb_roc_auc:.4f}, PR-AUC: {xgb_pr_auc:.4f}",
    sep="\n",
)

RF ROC-AUC: 0.8372, PR-AUC: 0.3410
XGB ROC-AUC: 0.8626, PR-AUC: 0.4035


## Conclusions

### Random Forest Classifier
- ROC‑AUC: 0.8372
- PR‑AUC: 0.3410

### XGBoost
- ROC‑AUC: 0.8626
- PR‑AUC: 0.4035

XGBoost scores higher than the Random Forest on both metrics (ROC‑AUC 0.8626 vs. 0.8372; PR‑AUC 0.4035 vs. 0.3410).