# PoC - Modelling

## Libraries

In [1]:
import pandas as pd
import numpy as np
import warnings

# Machine Learning Preprocessing & Modeling
from sklearn.model_selection import train_test_split
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, confusion_matrix, classification_report,
                             roc_auc_score)
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Deep Learning
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from torchmetrics.classification import MulticlassAccuracy, MulticlassAUROC


"""
import tensorflow as tf
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, Input
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.metrics import AUC
"""


'\nimport tensorflow as tf\nfrom tensorflow.keras.utils import to_categorical\nfrom tensorflow.keras.models import Sequential\nfrom tensorflow.keras.layers import Dense, Dropout, BatchNormalization, Input\nfrom tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint\nfrom tensorflow.keras.metrics import AUC\n'

## Load data

In [2]:
path = "feature_engineered_data.csv"
df = pd.read_csv(path)

pd.set_option('display.max_columns', None)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 268293 entries, 0 to 268292
Data columns (total 64 columns):
 #   Column                               Non-Null Count   Dtype  
---  ------                               --------------   -----  
 0   loan_amnt                            268293 non-null  float64
 1   funded_amnt                          268293 non-null  float64
 2   funded_amnt_inv                      268293 non-null  float64
 3   int_rate                             268293 non-null  float64
 4   installment                          268293 non-null  float64
 5   sub_grade                            268293 non-null  float64
 6   emp_length                           268293 non-null  float64
 7   annual_inc                           268293 non-null  float64
 8   loan_status                          268293 non-null  float64
 9   dti                                  268293 non-null  float64
 10  delinq_2yrs                          268293 non-null  float64
 11  inq_last_6mth

In [4]:
df["loan_status"].value_counts()

loan_status
0.0    209526
1.0     58767
Name: count, dtype: int64

## Train-Test Split

In [5]:
y = df["loan_status"].astype(int)
X = df.drop(columns=["loan_status"])

In [6]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size = 0.3,
    random_state = 35,
    stratify = y,
    shuffle=True
)

print("\nData split Train 70% Test 30%")
print(f"x_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
print(f"x_test shape: {X_test.shape}, y_test shape: {y_test.shape}")


Data split Train 70% Test 30%
x_train shape: (187805, 63), y_train shape: (187805,)
x_test shape: (80488, 63), y_test shape: (80488,)


## XGBoost

In [7]:
print("\n--- Training XGBoost ---")
xgb_classifier = XGBClassifier(
    objective='multi:softmax', 
    num_class=2,       
    n_estimators=500,          
    learning_rate=0.1,         
    max_depth=3,               
    subsample=0.8,             
    colsample_bytree=0.8,      
    gamma=0,                   
    reg_lambda=1,              
    use_label_encoder=False,   
    eval_metric='mlogloss',    # Evaluation metric for multi-class
    random_state=35,
    n_jobs=-1                  # Use all available cores
)

# Entrenamiento de XGBoost
xgb_classifier.fit(X_train, y_train)
print("XGBoost training complete.")



--- Training XGBoost ---


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost training complete.


In [8]:
print("\n--- Evaluating XGBoost (Train Set) ---")
y_pred_train_xgb = xgb_classifier.predict(X_train)
cm_train_xgb = confusion_matrix(y_train, y_pred_train_xgb)
f1_train_xgb = f1_score(y_train, y_pred_train_xgb, average='weighted')
acc_train_xgb = accuracy_score(y_train, y_pred_train_xgb)
print(f"Train Accuracy: {acc_train_xgb:.4f}")
print(f"Train F1 Score (Weighted): {f1_train_xgb:.4f}")
# print("Train Classification Report:") # Optional: uncomment for full report
print(classification_report(y_train, y_pred_train_xgb, target_names=["Paga", "No Paga"]))
#plot_plotly_confusion_matrix(cm_train_xgb, class_labels, title='XGBoost Confusion Matrix (Train)')



--- Evaluating XGBoost (Train Set) ---
Train Accuracy: 0.9993
Train F1 Score (Weighted): 0.9993
              precision    recall  f1-score   support

        Paga       1.00      1.00      1.00    146668
     No Paga       1.00      1.00      1.00     41137

    accuracy                           1.00    187805
   macro avg       1.00      1.00      1.00    187805
weighted avg       1.00      1.00      1.00    187805



In [9]:
print("\n--- Evaluating XGBoost (Test Set) ---")
y_pred_test_xgb = xgb_classifier.predict(X_test)
cm_test_xgb = confusion_matrix(y_test, y_pred_test_xgb)
f1_test_xgb = f1_score(y_test, y_pred_test_xgb, average='weighted')
acc_test_xgb = accuracy_score(y_test, y_pred_test_xgb)
print(f"Test Accuracy: {acc_test_xgb:.4f}")
print(f"Test F1 Score (Weighted): {f1_test_xgb:.4f}")
print("Test Classification Report:")
# Use zero_division=0 to avoid warnings
print(classification_report(y_test, y_pred_test_xgb, target_names=["Paga", "No Paga"], zero_division=0))



--- Evaluating XGBoost (Test Set) ---
Test Accuracy: 0.9987
Test F1 Score (Weighted): 0.9987
Test Classification Report:
              precision    recall  f1-score   support

        Paga       1.00      1.00      1.00     62858
     No Paga       1.00      0.99      1.00     17630

    accuracy                           1.00     80488
   macro avg       1.00      1.00      1.00     80488
weighted avg       1.00      1.00      1.00     80488



In [10]:
xgb_classifier.save_model("xgboost_model.model")

"""
PARA CARGAR EL MODELO:
* loaded_model = xgb.XGBClassifier()
* loaded_model.load_model("xgboost_model.model)
"""

  self.get_booster().save_model(fname)


'\nPARA CARGAR EL MODELO:\n* loaded_model = xgb.XGBClassifier()\n* loaded_model.load_model("xgboost_model.model)\n'

## Neural Network

In [11]:
class MLP(nn.Module):
    def __init__(self, input_dim, n_classes):
        super(MLP, self).__init__()
        
        self.model = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.BatchNorm1d(128),
            nn.Dropout(0.3),
            
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.BatchNorm1d(64),
            nn.Dropout(0.3),
            
            nn.Linear(64, n_classes)
        )
    
    def forward(self, x):
        return self.model(x)


In [12]:
# Train Dataset Translation for PyTorch
boolean_columns = X_train.select_dtypes(include="bool").columns

X_train[boolean_columns] = X_train[boolean_columns].astype(int)
X_train_tensor = torch.tensor(X_train.to_numpy(), dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.to_numpy(), dtype=torch.long)
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

X_test[boolean_columns] = X_test[boolean_columns].astype(int)
X_test_tensor = torch.tensor(X_test.to_numpy(), dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.to_numpy(), dtype=torch.long)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
test_loader   = DataLoader(test_dataset, batch_size=32)


In [13]:
# Setup
input_dim = df.shape[1] - 1
n_classes = 2

# Definición del modelo
model = MLP(input_dim, n_classes)

# Función de pérdida
criterion = nn.CrossEntropyLoss()

# Optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Metrics
accuracy = MulticlassAccuracy(num_classes=len(set(y_train)))
auc = MulticlassAUROC(num_classes=len(set(y_train)), average='macro')


In [14]:
EPOCHS = 10

for epoch in range(EPOCHS):
    model.train()
    running_loss = 0.0
    accuracy.reset()
    auc.reset()

    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch, y_batch
        
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        accuracy.update(outputs, y_batch)
        auc.update(outputs, y_batch)

    print(f"Epoch [{epoch+1}/{EPOCHS}]")
    print(f"Train Loss: {running_loss / len(train_loader):.4f} | Accuracy: {accuracy.compute():.4f} | AUC: {auc.compute():.4f}")


Epoch [1/10]
Train Loss: 0.0452 | Accuracy: 0.9769 | AUC: 0.9969
Epoch [2/10]
Train Loss: 0.0207 | Accuracy: 0.9903 | AUC: 0.9990
Epoch [3/10]
Train Loss: 0.0157 | Accuracy: 0.9926 | AUC: 0.9993
Epoch [4/10]
Train Loss: 0.0147 | Accuracy: 0.9931 | AUC: 0.9994
Epoch [5/10]
Train Loss: 0.0133 | Accuracy: 0.9940 | AUC: 0.9994
Epoch [6/10]
Train Loss: 0.0121 | Accuracy: 0.9945 | AUC: 0.9995
Epoch [7/10]
Train Loss: 0.0118 | Accuracy: 0.9946 | AUC: 0.9995
Epoch [8/10]
Train Loss: 0.0109 | Accuracy: 0.9950 | AUC: 0.9996
Epoch [9/10]
Train Loss: 0.0108 | Accuracy: 0.9953 | AUC: 0.9996
Epoch [10/10]
Train Loss: 0.0107 | Accuracy: 0.9954 | AUC: 0.9996


In [15]:
model.eval()
accuracy.reset()
auc.reset()

with torch.no_grad():
    for X_batch, y_batch in test_loader:
        X_batch, y_batch = X_batch, y_batch
        outputs = model(X_batch)
        accuracy.update(outputs, y_batch)
        auc.update(outputs, y_batch)

print(f"Test Accuracy: {accuracy.compute():.4f} | Test AUC: {auc.compute():.4f}")


Test Accuracy: 0.9964 | Test AUC: 1.0000


In [16]:
torch.save(model.state_dict(), "mlp_model.pt")