# PoC - Modelling

## Libraries

In [23]:
import pandas as pd
import numpy as np
import warnings

# Machine Learning Preprocessing & Modeling
from sklearn.model_selection import train_test_split
from sklearn.metrics import (accuracy_score, f1_score, confusion_matrix, classification_report,)
from xgboost import XGBClassifier

# Deep Learning
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from torchmetrics.classification import MulticlassAccuracy, MulticlassAUROC


"""
import tensorflow as tf
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, Input
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.metrics import AUC
"""


'\nimport tensorflow as tf\nfrom tensorflow.keras.utils import to_categorical\nfrom tensorflow.keras.models import Sequential\nfrom tensorflow.keras.layers import Dense, Dropout, BatchNormalization, Input\nfrom tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint\nfrom tensorflow.keras.metrics import AUC\n'

## Load data

In [24]:
path = "feature_engineered_data.csv"
df = pd.read_csv(path)

pd.set_option('display.max_columns', None)

In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 268293 entries, 0 to 268292
Data columns (total 56 columns):
 #   Column                               Non-Null Count   Dtype  
---  ------                               --------------   -----  
 0   loan_amnt                            268293 non-null  float64
 1   funded_amnt                          268293 non-null  float64
 2   funded_amnt_inv                      268293 non-null  float64
 3   int_rate                             268293 non-null  float64
 4   installment                          268293 non-null  float64
 5   sub_grade                            268293 non-null  float64
 6   emp_length                           268293 non-null  float64
 7   annual_inc                           268293 non-null  float64
 8   loan_status                          268293 non-null  float64
 9   dti                                  268293 non-null  float64
 10  delinq_2yrs                          268293 non-null  float64
 11  inq_last_6mth

In [26]:
df["loan_status"].value_counts()

loan_status
0.0    209526
1.0     58767
Name: count, dtype: int64

## Train-Test Split

In [27]:
y = df["loan_status"].astype(int)
X = df.drop(columns=["loan_status"])

In [28]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size = 0.3,
    random_state = 35,
    stratify = y,
    shuffle=True
)

print("\nData split Train 70% Test 30%")
print(f"x_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
print(f"x_test shape: {X_test.shape}, y_test shape: {y_test.shape}")


Data split Train 70% Test 30%
x_train shape: (187805, 55), y_train shape: (187805,)
x_test shape: (80488, 55), y_test shape: (80488,)


In [29]:
y_train.value_counts()

loan_status
0    146668
1     41137
Name: count, dtype: int64

## XGBoost

In [30]:
print("\n--- Training XGBoost ---")
xgb_classifier = XGBClassifier(
    objective='multi:softmax', 
    num_class=2,       
    n_estimators=500,          
    learning_rate=0.1,         
    max_depth=3,               
    subsample=0.8,             
    colsample_bytree=0.8,      
    gamma=0,                   
    reg_lambda=1,              
    use_label_encoder=False,   
    eval_metric='mlogloss',    # Evaluation metric for multi-class
    random_state=35,
    n_jobs=-1                  # Use all available cores
)

# Entrenamiento de XGBoost
xgb_classifier.fit(X_train, y_train)
print("XGBoost training complete.")



--- Training XGBoost ---


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost training complete.


In [31]:
print("\n--- Evaluating XGBoost (Train Set) ---")
y_pred_train_xgb = xgb_classifier.predict(X_train)
cm_train_xgb = confusion_matrix(y_train, y_pred_train_xgb)
f1_train_xgb = f1_score(y_train, y_pred_train_xgb, average='weighted')
acc_train_xgb = accuracy_score(y_train, y_pred_train_xgb)
print(f"Train Accuracy: {acc_train_xgb:.4f}")
print(f"Train F1 Score (Weighted): {f1_train_xgb:.4f}")
# print("Train Classification Report:") # Optional: uncomment for full report
print(classification_report(y_train, y_pred_train_xgb, target_names=["Paga", "No Paga"]))
#plot_plotly_confusion_matrix(cm_train_xgb, class_labels, title='XGBoost Confusion Matrix (Train)')



--- Evaluating XGBoost (Train Set) ---
Train Accuracy: 0.8322
Train F1 Score (Weighted): 0.7928
              precision    recall  f1-score   support

        Paga       0.83      0.99      0.90    146668
     No Paga       0.92      0.26      0.40     41137

    accuracy                           0.83    187805
   macro avg       0.87      0.63      0.65    187805
weighted avg       0.85      0.83      0.79    187805



In [32]:
print("\n--- Evaluating XGBoost (Test Set) ---")
y_pred_test_xgb = xgb_classifier.predict(X_test)
cm_test_xgb = confusion_matrix(y_test, y_pred_test_xgb)
f1_test_xgb = f1_score(y_test, y_pred_test_xgb, average='weighted')
acc_test_xgb = accuracy_score(y_test, y_pred_test_xgb)
print(f"Test Accuracy: {acc_test_xgb:.4f}")
print(f"Test F1 Score (Weighted): {f1_test_xgb:.4f}")
print("Test Classification Report:")
# Use zero_division=0 to avoid warnings
print(classification_report(y_test, y_pred_test_xgb, target_names=["Paga", "No Paga"], zero_division=0))



--- Evaluating XGBoost (Test Set) ---
Test Accuracy: 0.8293
Test F1 Score (Weighted): 0.7889
Test Classification Report:
              precision    recall  f1-score   support

        Paga       0.82      0.99      0.90     62858
     No Paga       0.90      0.25      0.39     17630

    accuracy                           0.83     80488
   macro avg       0.86      0.62      0.65     80488
weighted avg       0.84      0.83      0.79     80488



In [33]:
xgb_classifier.save_model("xgboost_model.model")

"""
PARA CARGAR EL MODELO:
* loaded_model = xgb.XGBClassifier()
* loaded_model.load_model("xgboost_model.model)
"""

  self.get_booster().save_model(fname)


'\nPARA CARGAR EL MODELO:\n* loaded_model = xgb.XGBClassifier()\n* loaded_model.load_model("xgboost_model.model)\n'

## Neural Network

In [34]:
class MLP(nn.Module):
    def __init__(self, input_dim, n_classes):
        super(MLP, self).__init__()
        
        self.model = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.BatchNorm1d(128),
            nn.Dropout(0.3),
            
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.BatchNorm1d(64),
            nn.Dropout(0.3),
            
            nn.Linear(64, n_classes)
        )
    
    def forward(self, x):
        return self.model(x)


In [35]:
# Train Dataset Translation for PyTorch
boolean_columns = X_train.select_dtypes(include="bool").columns

X_train[boolean_columns] = X_train[boolean_columns].astype(int)
X_train_tensor = torch.tensor(X_train.to_numpy(), dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.to_numpy(), dtype=torch.long)
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

X_test[boolean_columns] = X_test[boolean_columns].astype(int)
X_test_tensor = torch.tensor(X_test.to_numpy(), dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.to_numpy(), dtype=torch.long)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
test_loader   = DataLoader(test_dataset, batch_size=32)


In [36]:
# Setup
input_dim = df.shape[1] - 1
n_classes = 2

# Definición del modelo
model = MLP(input_dim, n_classes)

# Función de pérdida
criterion = nn.CrossEntropyLoss()

# Optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Metrics
accuracy = MulticlassAccuracy(num_classes=len(set(y_train)))
auc = MulticlassAUROC(num_classes=len(set(y_train)), average='macro')


In [37]:
EPOCHS = 10

all_preds_train = []
all_labels_train = []

for epoch in range(EPOCHS):
    model.train()
    running_loss = 0.0
    accuracy.reset()
    auc.reset()

    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch, y_batch
        
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()

        preds = torch.argmax(outputs, dim=1)
        all_preds_train.extend(preds.cpu().numpy())
        all_labels_train.extend(y_batch.cpu().numpy())

        running_loss += loss.item()
        accuracy.update(outputs, y_batch)
        auc.update(outputs, y_batch)

    print(f"Epoch [{epoch+1}/{EPOCHS}]")
    print(f"Train Loss: {running_loss / len(train_loader):.4f} | Accuracy: {accuracy.compute():.4f} | AUC: {auc.compute():.4f}")


Epoch [1/10]
Train Loss: 0.4293 | Accuracy: 0.6188 | AUC: 0.7435
Epoch [2/10]
Train Loss: 0.4218 | Accuracy: 0.6172 | AUC: 0.7539
Epoch [3/10]
Train Loss: 0.4201 | Accuracy: 0.6185 | AUC: 0.7561
Epoch [4/10]
Train Loss: 0.4193 | Accuracy: 0.6174 | AUC: 0.7576
Epoch [5/10]
Train Loss: 0.4187 | Accuracy: 0.6181 | AUC: 0.7588
Epoch [6/10]
Train Loss: 0.4178 | Accuracy: 0.6188 | AUC: 0.7601
Epoch [7/10]
Train Loss: 0.4166 | Accuracy: 0.6183 | AUC: 0.7618
Epoch [8/10]
Train Loss: 0.4164 | Accuracy: 0.6176 | AUC: 0.7629
Epoch [9/10]
Train Loss: 0.4165 | Accuracy: 0.6163 | AUC: 0.7623
Epoch [10/10]
Train Loss: 0.4159 | Accuracy: 0.6183 | AUC: 0.7629


In [44]:
print("\n--- Evaluating MLP (Train Set) ---")
print(classification_report(all_labels_train, all_preds_train))


--- Evaluating MLP (Train Set) ---
              precision    recall  f1-score   support

           0       0.82      0.99      0.90   1466680
           1       0.88      0.24      0.38    411370

    accuracy                           0.83   1878050
   macro avg       0.85      0.62      0.64   1878050
weighted avg       0.84      0.83      0.79   1878050



In [39]:
model.eval()
accuracy.reset()
auc.reset()

all_labels_test = []
all_preds_test = []

with torch.no_grad():
    for X_batch, y_batch in test_loader:
        X_batch, y_batch = X_batch, y_batch
        outputs = model(X_batch)

        preds = torch.argmax(outputs, dim=1)
        all_preds_test.extend(preds.cpu().numpy())
        all_labels_test.extend(y_batch.cpu().numpy())

        accuracy.update(outputs, y_batch)
        auc.update(outputs, y_batch)

In [40]:
print("\n--- Evaluating MLP (Test Set) ---")
print(classification_report(all_labels_test, all_preds_test))


--- Evaluating MLP (Test Set) ---
              precision    recall  f1-score   support

           0       0.82      1.00      0.90     62858
           1       0.95      0.23      0.37     17630

    accuracy                           0.83     80488
   macro avg       0.89      0.61      0.64     80488
weighted avg       0.85      0.83      0.78     80488



In [41]:
# One input
input_vector = X_train.iloc[0].to_numpy()
output_vector = y_train.iloc[0]
input_tensor = torch.tensor(input_vector, dtype=torch.float32).unsqueeze(0)

with torch.no_grad():
    output = model(input_tensor)
    predicted_class = torch.argmax(output, dim=1)

In [42]:
X_train_cols = X_train.columns

for j in range(len(X_train_cols)):
    print(f"{X_train_cols[j]}: {input_vector[j]}")


loan_amnt: -0.3968609865470852
funded_amnt: -0.4031890660592255
funded_amnt_inv: -0.4045977011494253
int_rate: 0.8428324697754747
installment: -0.29260115230624
sub_grade: 3.8
emp_length: 5.0
annual_inc: -0.5238095238095238
dti: -0.1899293286219081
delinq_2yrs: 0.0
inq_last_6mths: 1.0
open_acc: -0.5714285714285714
pub_rec: 1.0
revol_bal: -0.5820433436532507
revol_util: 0.824
total_acc: 0.0625
out_prncp: 0.0
out_prncp_inv: 0.0
collections_12_mths_ex_med: 0.0
acc_now_delinq: 0.0
tot_coll_amt: 0.0
tot_cur_bal: -0.5872224711119279
total_rev_hi_lim: -1.7582573017713174
term_ 36 months: 1.0
term_ 60 months: 0.0
home_ownership_MORTGAGE: 0.0
home_ownership_OWN: 0.0
home_ownership_RENT: 1.0
verification_status_Not Verified: 0.0
verification_status_Source Verified: 0.0
verification_status_Verified: 1.0
initial_list_status_f: 1.0
initial_list_status_w: 0.0
purpose_car: 0.0
purpose_credit_card: 0.0
purpose_debt_consolidation: 1.0
purpose_educational: 0.0
purpose_home_improvement: 0.0
purpose_house

In [43]:
torch.save(model.state_dict(), "mlp_model.pt")