#### Library Imports

In [None]:
!pip install skorch torch scikit-learn
import xgboost as xgb
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from sklearn.metrics import (
    confusion_matrix,
    classification_report,
    ConfusionMatrixDisplay,
    roc_auc_score,
    roc_curve,
    precision_score,
    recall_score,
    f1_score,
    precision_recall_curve,
    accuracy_score,
    log_loss,
    PrecisionRecallDisplay,
    make_scorer,
    RocCurveDisplay
)
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold, RandomizedSearchCV
from sklearn.metrics import make_scorer, average_precision_score
from sklearn.linear_model import LogisticRegression
import seaborn as sns
import imblearn
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from xgboost import cv
from xgboost import XGBClassifier
import scipy as stats
from skorch.callbacks import EarlyStopping, Checkpoint, EpochScoring
from skorch.helper import predefined_split

#### Import Data

In [None]:
import gdown  # called below but missing from the Library Imports cell


def _fetch_csv(file_id, filename):
    """Download a Google Drive file by id to *filename* and load it as a DataFrame.

    Raises:
        RuntimeError: if gdown.download returns None instead of an output
            path (treated here as a failed download), so a stale local copy
            of *filename* is never read by mistake.
    """
    url = f'https://drive.google.com/uc?id={file_id}'
    if gdown.download(url, filename, quiet=False) is None:
        raise RuntimeError(f'Download of {filename} (Drive id {file_id}) failed')
    return pd.read_csv(filename)


# One Drive file per snapshot; the ids and local file names are kept as
# module-level names so later cells can still refer to them.
file_id_1 = '18c5DynpKSiey55WdTBkNE7Iwb7l_HL-k'
df1 = _fetch_csv(file_id_1, 'data2011.csv')

file_id_2 = '1bJsC9bUmrMHXlKIv82Gkl-Qxldy9D-KQ'
df2 = _fetch_csv(file_id_2, 'data2102.csv')

file_id_3 = '1BU41bihK6rCTVWmyUFr4gEmYwIclKeMD'
df3 = _fetch_csv(file_id_3, 'data2105.csv')

file_id_4 = '1VUA3AgnL7ouqCY3vrui7G6qr5RbbJwDQ'
df4 = _fetch_csv(file_id_4, 'data2108.csv')

file_id_5 = '1GSL8AOlv9fWylFU-HAKbIbOCxuN1b754'
df5 = _fetch_csv(file_id_5, 'data2111.csv')

#### Data Processing

In [None]:
## Rename Columns
def rename(df):
    """Return a new frame with the raw field codes replaced by readable names.

    Columns of *df* that are not listed in the mapping keep their name, and
    *df* itself is left unmodified (``DataFrame.rename`` returns a new frame).
    """
    # raw field code -> readable column name
    code_to_name = dict(
        # loan-level fields
        RREL16='primary_income',
        RREL13='employment_status',
        RREL27='loan_purpose',
        RREL25='original_term',
        RREL30='current_balance',
        RREL29='original_balance',
        RREL43='current_interest_rate',
        RREL42='interest_type',
        RREL69='account_status',
        RREL39='payment_due',
        RREL67='arrears_balance',
        RREL68='days_in_arrears',
        RREL71='default_amount',
        # collateral-level fields
        RREC6='collateral_region',
        RREC7='occupancy_type',
        RREC9='property_type',
        RREC16='original_ltv',
        RREC17='original_valuation',
        RREC12='current_ltv',
        RREC13='current_valuation',
        # derived fields
        age='age',
        PrepaymentFee='prepayment_fee',
        PrepaymentHistory='prepayment_history',
        # `_t_1` variants of the fields above
        RREL30_t_1='past_balance',
        RREL39_t_1='past_payment_due',
        RREL43_t_1='past_interest_rate',
        RREC12_t_1='past_ltv',
        RREC13_t_1='past_valuation',
        incentive='incentive',
        target='target',
    )
    return df.rename(columns=code_to_name)

In [None]:
## Embed Categorical columns
def embed(df):
    """Cast the categorical feature columns of *df* to pandas ``category`` dtype.

    The conversion is written back into *df* (the frame is modified in place)
    and the same frame object is returned. A missing column raises KeyError.
    """
    categorical_columns = (
        'employment_status',
        'loan_purpose',
        'collateral_region',
        'occupancy_type',
        'property_type',
        'interest_type',
        'account_status',
        'prepayment_fee',
        'prepayment_history',
    )
    for column in categorical_columns:
        df[column] = df[column].astype('category')
    return df

In [None]:
# Give every snapshot the readable column names produced by rename().
df1, df2, df3, df4, df5 = (
    rename(frame) for frame in (df1, df2, df3, df4, df5)
)

In [None]:
# Drop single employment PNNR observation
# .copy() makes df5 an independent frame rather than a boolean-filtered slice,
# so the in-place column assignments done later by embed() do not trigger
# pandas' SettingWithCopyWarning.
df5 = df5[df5['employment_status'] != 'PNNR'].copy()

In [None]:
# Cast the categorical columns of every snapshot to 'category' dtype.
# embed() mutates and returns its argument, so each name keeps pointing at
# the same frame it did before.
df1, df2, df3, df4, df5 = (
    embed(frame) for frame in (df1, df2, df3, df4, df5)
)

In [None]:
# Split every snapshot into a feature matrix (X*) and its label column (y*).
# 'prepayment_fee' is removed from the features together with 'target'.
X1, X2, X3, X4, X5 = (
    frame.drop(columns=['target', 'prepayment_fee'])
    for frame in (df1, df2, df3, df4, df5)
)
y1, y2, y3, y4, y5 = (
    frame['target'] for frame in (df1, df2, df3, df4, df5)
)

#### SHAP-Reduction Data Processing

In [None]:
# Function to maintain SHAP variables
def drop(datasets):
    """Restrict every frame in *datasets* to the SHAP-selected feature columns.

    Returns a list with one reduced frame per input frame, in the same order;
    the columns of each result follow the order of the selection list below.
    A frame lacking any of the selected columns raises KeyError.
    """
    shap_columns = [
        'collateral_region',
        'primary_income',
        'current_interest_rate',
        'original_valuation',
        'original_ltv',
        'past_interest_rate',
        'age',
        'payment_due',
        'past_ltv',
        'current_ltv',
        'original_balance',
        'current_balance',
        'current_valuation',
        'past_valuation',
        'past_payment_due',
        'past_balance',
        'property_type',
        'occupancy_type',
        'original_term',
        'loan_purpose',
    ]
    return [frame[shap_columns] for frame in datasets]

# Build the SHAP-reduced feature matrix for each of the five snapshots.
(
    X1_shap,
    X2_shap,
    X3_shap,
    X4_shap,
    X5_shap,
) = drop([X1, X2, X3, X4, X5])

In [None]:
# Create train and validation data: snapshots 1-2 form the training set,
# snapshots 3-4 the validation set.
X_shap_train = pd.concat([X1_shap, X2_shap], ignore_index=True)
y_shap_train = pd.concat([y1, y2], ignore_index=True)
X_shap_val = pd.concat([X3_shap, X4_shap], ignore_index=True)
y_shap_val = pd.concat([y3, y4], ignore_index=True)


def _align_categories(train, val):
    """Give categorical columns of *train* and *val* one shared category set.

    Every snapshot was cast to 'category' on its own, so the category sets can
    differ between snapshots. pd.concat then falls back to the dtype of the
    underlying values (e.g. object) for such columns, and even when the dtype
    survives, train and validation may map the same label to different integer
    codes. Both frames are modified in place; columns whose dtypes already
    agree end up unchanged.
    """
    for col in train.columns:
        dtypes = (train[col].dtype, val[col].dtype)
        if not any(isinstance(dt, pd.CategoricalDtype) or dt == object for dt in dtypes):
            continue  # numeric feature: nothing to align
        # Let pandas derive the category set from the union of both frames.
        shared = pd.concat(
            [train[col].astype(object), val[col].astype(object)],
            ignore_index=True,
        ).astype('category').dtype
        train[col] = train[col].astype(shared)
        val[col] = val[col].astype(shared)


_align_categories(X_shap_train, X_shap_val)

### SHAP-reduced XGBoost Model

In [None]:
# Hyperparameter search space. Every entry is a list, so RandomizedSearchCV
# draws `n_iter` combinations from this grid rather than trying all of them.
param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.7, 0.8, 0.9],
    'gamma': [0, 0.1, 0.2],
    'min_child_weight': [1, 5, 10],
    'reg_alpha': [0, 0.1, 1],
    'reg_lambda': [0, 1, 10],
    'scale_pos_weight': [50, 100],
    'n_estimators': [500, 1000, 3000]
}

# Base estimator for the search. No early stopping here: the search calls
# fit() without an eval_set. `use_label_encoder` is no longer passed; it is a
# deprecated XGBoost option that only produces an "unused parameter" warning
# on current releases.
xgb_clf = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric=['aucpr', 'logloss'],
    enable_categorical=True,
    verbosity=0,
    random_state=42
)

# 5-Fold Stratified Cross Validation
# NOTE: this rebinds `cv`, shadowing the `from xgboost import cv` import.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Custom scoring for imbalanced data; zero_division=0 scores a fold with no
# predicted positives as 0 instead of emitting a warning.
scoring = {
    'precision': make_scorer(precision_score, zero_division=0),
    'recall': make_scorer(recall_score, zero_division=0),
    'f1': make_scorer(f1_score, zero_division=0),
    'aucpr': 'average_precision'
}

# RandomizedSearchCV: 30 sampled candidates x 5 folds, all four metrics are
# recorded and the candidate with the best mean 'aucpr' is refit on the full
# training data.
grid = RandomizedSearchCV(
    estimator=xgb_clf,
    param_distributions=param_grid,
    n_iter=30,  # Reduced for faster execution
    scoring=scoring,
    refit='aucpr',
    cv=cv,
    n_jobs=-1,
    verbose=1,
    random_state=42,
    return_train_score=True
)

# Fit without early stopping in grid search
grid.fit(X_shap_train, y_shap_train)

# Train the final model with the best hyperparameters found by the search,
# this time with early stopping against the validation frames.
best_params = grid.best_params_.copy()

# `n_estimators` from best_params acts as an upper bound: training stops once
# the monitored validation metric has not improved for 50 rounds. With several
# entries in eval_metric, XGBoost monitors the last one (here 'logloss').
# `use_label_encoder` is no longer passed; it is a deprecated option that only
# produces an "unused parameter" warning on current XGBoost releases.
final_model = xgb.XGBClassifier(
    **best_params,
    objective='binary:logistic',
    eval_metric=['aucpr', 'logloss'],
    early_stopping_rounds=50,
    enable_categorical=True,
    verbosity=1,
    random_state=42
)

final_model.fit(
    X_shap_train, y_shap_train,
    eval_set=[(X_shap_val, y_shap_val)],
    verbose=True
)
# Best model
print("Best Parameters:", grid.best_params_)

# Evaluate on the validation frames with an F1-optimal threshold.
# NOTE(review): X_shap_val / y_shap_val are also the eval_set used for early
# stopping, and the threshold below is tuned on them as well, so the metrics
# printed here are optimistic; confirm them on data the model has not seen.
y_proba = final_model.predict_proba(X_shap_val)[:, 1]

# Find optimal threshold
precision, recall, thresholds = precision_recall_curve(y_shap_val, y_proba)
# precision/recall carry one extra trailing point (precision=1, recall=0) that
# has no matching threshold; drop it so f1_scores[i] corresponds to
# thresholds[i] and the argmax can never index past the end of `thresholds`.
f1_scores = 2 * (precision[:-1] * recall[:-1]) / (precision[:-1] + recall[:-1] + 1e-9)
optimal_idx = np.argmax(f1_scores)
optimal_threshold = thresholds[optimal_idx]
y_pred = (y_proba >= optimal_threshold).astype(int)

# Metrics
print(f"\nOptimal Threshold: {optimal_threshold:.4f}")
print(f"Confusion Matrix:\n{confusion_matrix(y_shap_val, y_pred)}")
print(f"Precision: {precision_score(y_shap_val, y_pred, zero_division=0):.4f}")
print(f"Recall:    {recall_score(y_shap_val, y_pred, zero_division=0):.4f}")
print(f"F1 Score:  {f1_score(y_shap_val, y_pred, zero_division=0):.4f}")
print(f"ROC AUC:   {roc_auc_score(y_shap_val, y_proba):.4f}")
print(f"PR AUC:    {average_precision_score(y_shap_val, y_proba):.4f}")

# Precision-Recall Curve
# from_estimator() opens its own figure unless an Axes is passed, which left
# the plt.figure(figsize=...) call as an empty extra figure and ignored the
# requested size. Create the axes once and draw onto them.
fig_pr, ax_pr = plt.subplots(figsize=(8, 6))
disp = PrecisionRecallDisplay.from_estimator(final_model, X_shap_val, y_shap_val, ax=ax_pr)
ax_pr.set_title('SHAP-Reduced Precision-Recall Curve')
ax_pr.grid(True)
fig_pr.savefig("shap_pr.png", dpi=300, bbox_inches='tight')
#files.download("shap_pr.png")
plt.show()

# ROC Curve
# from_estimator() opens its own figure unless an Axes is passed, which left
# the plt.figure(figsize=...) call as an empty extra figure and ignored the
# requested size. Create the axes once and draw onto them.
fig_roc, ax_roc = plt.subplots(figsize=(8, 6))
RocCurveDisplay.from_estimator(final_model, X_shap_val, y_shap_val, ax=ax_roc)
ax_roc.set_title('SHAP-Reduced ROC Curve', fontsize=12, pad=20)
# Diagonal reference line: the ROC curve of a random classifier.
ax_roc.plot([0, 1], [0, 1], 'k--', label='Random Classifier')
ax_roc.legend(loc='lower right', fontsize=10)
ax_roc.grid(True, alpha=0.3)
plt.show()

# Learning curves: per-iteration metrics recorded on the first (and only)
# eval_set entry, which XGBoost stores under the key 'validation_0'.
results = final_model.evals_result()
plt.figure(figsize=(10, 4))
for _metric, _label in (('logloss', 'Test Log Loss'), ('aucpr', 'Test AUC-PR')):
    plt.plot(results['validation_0'][_metric], label=_label)
plt.title('Learning Curves')
plt.xlabel('Iterations')
plt.ylabel('Metric Value')
plt.grid(True)
plt.legend()
plt.show()