# Phase 1: Test & Train on 1 Window (D2011)

### Library and Data Imports

In [None]:
# --- Core Libraries for Model Training and Evaluation ---
!pip install skorch torch scikit-learn

import gdown
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Scikit-Learn Core Components
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold, learning_curve, train_test_split, GridSearchCV
from sklearn.metrics import (make_scorer, precision_score, recall_score, f1_score,
                             roc_auc_score, average_precision_score, precision_recall_curve,
                             confusion_matrix, PrecisionRecallDisplay, ConfusionMatrixDisplay)
from sklearn.preprocessing import StandardScaler

# Scikit-Learn Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# XGBoost Specific
import xgboost as xgb
import shap  # Only for XGBoost SHAP analysis

# PyTorch & skorch
import torch
import torch.nn as nn
import torch.optim as optim
from skorch import NeuralNetClassifier
from skorch.callbacks import EarlyStopping, Checkpoint, EpochScoring

In [None]:
# Download the CSV from Google Drive and load it into `df`.
file_id_1 = '18c5DynpKSiey55WdTBkNE7Iwb7l_HL-k'
data_path = 'data2011.csv'
downloaded_path = gdown.download(f'https://drive.google.com/uc?id={file_id_1}', data_path, quiet=False)

# NOTE(review): some gdown releases return None on a failed download instead of raising
# (confirm for the installed version). Without this check the next line would either fail
# with an unrelated-looking error or silently read a stale copy of the file.
if downloaded_path is None:
    raise RuntimeError(f"Download of Google Drive file {file_id_1} to {data_path} failed.")

df = pd.read_csv(data_path)

### Data Processing

In [None]:
## Rename Columns
def rename(df):
    """Return a new DataFrame whose coded column labels are replaced by readable names.

    Columns that do not appear in the mapping below keep their current label.
    """
    # (source label, readable label) pairs, grouped by the prefix of the source label.
    label_pairs = [
        # RREL* columns
        ('RREL16', 'primary_income'),
        ('RREL13', 'employment_status'),
        ('RREL27', 'loan_purpose'),
        ('RREL25', 'original_term'),
        ('RREL30', 'current_balance'),
        ('RREL29', 'original_balance'),
        ('RREL43', 'current_interest_rate'),
        ('RREL42', 'interest_type'),
        ('RREL69', 'account_status'),
        ('RREL39', 'payment_due'),
        ('RREL67', 'arrears_balance'),
        ('RREL68', 'days_in_arrears'),
        ('RREL71', 'default_amount'),
        # RREC* columns
        ('RREC6', 'collateral_region'),
        ('RREC7', 'occupancy_type'),
        ('RREC9', 'property_type'),
        ('RREC16', 'original_ltv'),
        ('RREC17', 'original_valuation'),
        ('RREC12', 'current_ltv'),
        ('RREC13', 'current_valuation'),
        # Already-named columns
        ('age', 'age'),
        ('PrepaymentFee', 'prepayment_fee'),
        ('PrepaymentHistory', 'prepayment_history'),
        # *_t_1 columns
        ('RREL30_t_1', 'past_balance'),
        ('RREL39_t_1', 'past_payment_due'),
        ('RREL43_t_1', 'past_interest_rate'),
        ('RREC12_t_1', 'past_ltv'),
        ('RREC13_t_1', 'past_valuation'),
        # Identity entries kept from the original mapping
        ('incentive', 'incentive'),
        ('target', 'target'),
    ]
    readable_labels = dict(label_pairs)
    return df.rename(columns=readable_labels)
# Rebind `df` to the frame with readable column names.
df = rename(df)

In [None]:
## Cast categorical columns to the pandas `category` dtype
def embed(df):
    """Return a copy of ``df`` with the categorical feature columns cast to ``category`` dtype.

    The input frame is left untouched, so re-running the calling cell on the same
    object always gives the same result. All other columns keep their dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains every column listed in ``categorical_columns`` below.

    Returns
    -------
    pandas.DataFrame
        New frame with the listed columns stored as ``category``.

    Raises
    ------
    KeyError
        If any of the listed columns is missing from ``df``.
    """
    categorical_columns = [
        'employment_status',
        'loan_purpose',
        'collateral_region',
        'occupancy_type',
        'property_type',
        'interest_type',
        'account_status',
        'prepayment_fee',
        'prepayment_history',
    ]
    # A single astype call with a column->dtype mapping builds a new frame instead of
    # assigning column by column into the caller's object.
    return df.astype({column: 'category' for column in categorical_columns})
# Rebind `df` to the frame returned by embed().
df = embed(df)

In [None]:
# Define train and test sets (80/20).
X = df.drop('target', axis=1)
Y = df['target']
# Stratify on the target so both partitions keep the same positive rate; with a rare
# positive class an unstratified split can leave the test set with a distorted share.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, stratify=Y, random_state=42
)

In [None]:
# One-hot encode the train partition; its columns define the reference layout.
X_train_encoded = pd.get_dummies(X_train, drop_first=True)

# Encode the test partition the same way, then force it onto the train layout:
# columns missing from the test encoding are added and filled with 0.
X_test_encoded = (
    pd.get_dummies(X_test, drop_first=True)
    .reindex(columns=X_train_encoded.columns, fill_value=0)
)

In [None]:
# Standardize the encoded features: statistics are learned from the train partition
# only and then applied unchanged to both partitions.
scaler = StandardScaler().fit(X_train_encoded)
X_train_scaled = scaler.transform(X_train_encoded)
X_test_scaled = scaler.transform(X_test_encoded)

### CV XGBoost + Randomized Search

In [None]:
# --- Phase 1: XGBoost Model Training and Hyperparameter Optimization ---
# Steps in this cell:
#   1. randomized hyperparameter search with stratified 5-fold CV on the training set;
#   2. refit of the best configuration with early stopping monitored on a validation
#      split carved out of the training set (the test set takes no part in fitting);
#   3. SHAP feature attribution, test-set metrics and diagnostic plots.
# scale_pos_weight is part of the search space to counter the class imbalance.

# Candidate values sampled by the randomized search.
param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.7, 0.8, 0.9],
    'gamma': [0, 0.1, 0.2],
    'min_child_weight': [1, 5, 10],
    'reg_alpha': [0, 0.1, 1],
    'reg_lambda': [0, 1, 10],
    'scale_pos_weight': [50, 100],
    'n_estimators': [500, 1000]
}

# Base classifier used inside the search. enable_categorical lets XGBoost consume the
# pandas `category` columns of X_train directly (no one-hot encoding).
xgb_clf = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric=['aucpr', 'logloss'],
    enable_categorical=True,
    use_label_encoder=False,
    verbosity=0,
    random_state=42
)

# Stratified 5-fold cross-validator.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Metrics recorded for every candidate; 'aucpr' is the one used to pick the winner.
scoring = {
    'precision': make_scorer(precision_score, zero_division=0),
    'recall': make_scorer(recall_score, zero_division=0),
    'f1': make_scorer(f1_score, zero_division=0),
    'aucpr': 'average_precision'
}

# Randomized search: 30 sampled configurations x 5 folds.
grid = RandomizedSearchCV(
    estimator=xgb_clf,
    param_distributions=param_grid,
    n_iter=30,
    scoring=scoring,
    refit='aucpr',
    cv=cv,
    n_jobs=-1,
    verbose=1,
    random_state=42,
    return_train_score=True
)

# Execute the randomized hyperparameter search.
print("Initiating randomized hyperparameter search for XGBoost...")
grid.fit(X_train, y_train)
print("Hyperparameter search complete.")

# Rebuild the winning configuration with early stopping enabled.
best_params = grid.best_params_.copy()
final_model = xgb.XGBClassifier(
    **best_params,
    objective='binary:logistic',
    eval_metric=['aucpr', 'logloss'],
    early_stopping_rounds=50,
    enable_categorical=True,
    use_label_encoder=False,
    verbosity=1,
    random_state=42
)

# Early stopping needs a monitoring set. Monitoring on the test set would let the test
# labels choose the number of trees and make every test metric below optimistic, so a
# stratified validation split is taken from the training data instead.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, stratify=y_train, random_state=42
)
final_model.fit(X_fit, y_fit, eval_set=[(X_val, y_val)], verbose=True)

# Output best hyperparameters.
print("Best Parameters:", grid.best_params_)

# --- Model Interpretation: SHAP Analysis ---
explainer = shap.TreeExplainer(final_model)
shap_values = explainer(X_test).values
shap.summary_plot(shap_values, X_test, plot_type="bar", max_display=20)

# --- Model Evaluation ---
y_proba = final_model.predict_proba(X_test)[:, 1]
precision, recall, thresholds = precision_recall_curve(y_test, y_proba)
f1_scores = 2 * (precision * recall) / (precision + recall + 1e-9)
# precision/recall have one more entry than thresholds (the final recall=0 point has no
# threshold), so the search is restricted to positions that map to a real threshold.
optimal_idx = np.argmax(f1_scores[:-1])
optimal_threshold = thresholds[optimal_idx]
# NOTE(review): the threshold is tuned on the same test set the metrics below are
# computed on, so precision/recall/F1 are best-case values, not unbiased estimates.
y_pred = (y_proba >= optimal_threshold).astype(int)

# Print performance metrics.
print(f"\nOptimal Threshold: {optimal_threshold:.4f}")
print(f"Confusion Matrix:\n{confusion_matrix(y_test, y_pred)}")
print(f"Precision: {precision_score(y_test, y_pred, zero_division=0):.4f}")
print(f"Recall: {recall_score(y_test, y_pred, zero_division=0):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred, zero_division=0):.4f}")
print(f"ROC AUC: {roc_auc_score(y_test, y_proba):.4f}")
print(f"PR AUC: {average_precision_score(y_test, y_proba):.4f}")

# --- Visualization ---
# Precision-Recall Curve. The axes are passed explicitly: without `ax`, from_estimator
# opens its own figure and the pre-sized one is rendered empty.
fig, ax = plt.subplots(figsize=(8, 6))
PrecisionRecallDisplay.from_estimator(final_model, X_test, y_test, ax=ax)
ax.set_title('Precision-Recall Curve')
ax.grid(True)
plt.show()

# Learning Curves: per-iteration metrics recorded on the early-stopping validation split.
results = final_model.evals_result()
plt.figure(figsize=(10, 4))
plt.plot(results['validation_0']['logloss'], label='Validation Log Loss')
plt.plot(results['validation_0']['aucpr'], label='Validation AUC-PR')
plt.xlabel('Iterations')
plt.ylabel('Metric Value')
plt.title('XGBoost Learning Curves')
plt.legend()
plt.grid(True)
plt.show()

### CV LogReg + Randomized Search

In [None]:
# --- Phase 1: Logistic Regression Model Training and Hyperparameter Optimization ---
# Logistic regression baseline: randomized hyperparameter search with stratified 5-fold
# CV on the one-hot encoded training set, followed by test-set metrics and plots.

# Search space. It is split into two sub-spaces so that only valid combinations are
# sampled: liblinear does not support the elasticnet penalty, and l1_ratio is only
# meaningful for elasticnet. A single flat dict would produce candidates whose fit
# fails (scored as NaN) or that emit "l1_ratio is ignored" warnings.
lr_shared_space = {
    'C': np.logspace(-4, 4, 20),
    'max_iter': [100, 200, 500],
    'class_weight': [None, 'balanced', {0:1, 1:50}, {0:1, 1:100}],
}
param_grid = [
    {**lr_shared_space, 'solver': ['liblinear', 'saga'], 'penalty': ['l1', 'l2']},
    {**lr_shared_space, 'solver': ['saga'], 'penalty': ['elasticnet'],
     'l1_ratio': [0, 0.25, 0.5, 0.75, 1]},
]

# Initialize the base classifier.
lr_clf = LogisticRegression(random_state=42, n_jobs=-1)

# Stratified 5-fold cross-validator.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Metrics recorded for every candidate; 'aucpr' is the one used to pick the winner.
scoring = {
    'precision': make_scorer(precision_score, zero_division=0),
    'recall': make_scorer(recall_score, zero_division=0),
    'f1': make_scorer(f1_score, zero_division=0),
    'aucpr': 'average_precision'
}

# Randomized search: 30 sampled configurations x 5 folds, refit on the best 'aucpr'.
grid = RandomizedSearchCV(
    estimator=lr_clf,
    param_distributions=param_grid,
    n_iter=30,
    scoring=scoring,
    refit='aucpr',
    cv=cv,
    n_jobs=-1,
    verbose=1,
    random_state=42,
    return_train_score=True
)

# Execute the randomized hyperparameter search.
# NOTE(review): the unscaled one-hot matrix is used here although X_train_scaled exists;
# confirm this is intended, since saga is sensitive to feature scale.
print("Initiating randomized hyperparameter search for Logistic Regression...")
grid.fit(X_train_encoded, y_train)
print("Hyperparameter search complete.")

# Store the best estimator (already refit on the full training set by the search).
best_model = grid.best_estimator_
print("Best Parameters:", grid.best_params_)

# --- Model Evaluation ---
y_proba = best_model.predict_proba(X_test_encoded)[:, 1]
precision, recall, thresholds = precision_recall_curve(y_test, y_proba)
f1_scores = 2 * (precision * recall) / (precision + recall + 1e-9)
# precision/recall have one more entry than thresholds (the final recall=0 point has no
# threshold), so the search is restricted to positions that map to a real threshold.
optimal_idx = np.argmax(f1_scores[:-1])
optimal_threshold = thresholds[optimal_idx]
# NOTE(review): the threshold is tuned on the same test set the metrics below are
# computed on, so precision/recall/F1 are best-case values, not unbiased estimates.
y_pred = (y_proba >= optimal_threshold).astype(int)

# Print performance metrics.
print(f"\nOptimal Threshold: {optimal_threshold:.4f}")
print(f"Confusion Matrix:\n{confusion_matrix(y_test, y_pred)}")
print(f"Precision: {precision_score(y_test, y_pred, zero_division=0):.4f}")
print(f"Recall: {recall_score(y_test, y_pred, zero_division=0):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred, zero_division=0):.4f}")
print(f"ROC AUC: {roc_auc_score(y_test, y_proba):.4f}")
print(f"PR AUC: {average_precision_score(y_test, y_proba):.4f}")

# --- Visualization ---
# Precision-Recall Curve. The axes are passed explicitly: without `ax`, from_estimator
# opens its own figure and the pre-sized one is rendered empty.
fig, ax = plt.subplots(figsize=(8, 6))
PrecisionRecallDisplay.from_estimator(best_model, X_test_encoded, y_test, ax=ax)
ax.set_title('Precision-Recall Curve')
ax.grid(True)
plt.show()

# Learning Curves: AUC-PR of the best configuration as the training set grows.
train_sizes, train_scores, test_scores = learning_curve(
    best_model,
    X_train_encoded,
    y_train,
    cv=cv,
    scoring='average_precision',
    n_jobs=-1,
    train_sizes=np.linspace(0.1, 1.0, 10),
    verbose=1
)

plt.figure(figsize=(10, 4))
plt.plot(train_sizes, np.mean(train_scores, axis=1), 'o-', label='Training AUC-PR')
plt.plot(train_sizes, np.mean(test_scores, axis=1), 'o-', label='Validation AUC-PR')
plt.xlabel('Number of Training Examples')
plt.ylabel('AUC-PR Score')
plt.title('Logistic Regression Learning Curves')
plt.legend()
plt.grid(True)
plt.show()

### CV RF + Randomized Search

In [None]:
# --- Phase 1: Random Forest Model Training and Hyperparameter Optimization ---
# Random Forest model: randomized hyperparameter search with stratified 5-fold CV on
# the one-hot encoded training set, followed by test-set metrics and plots.

# Search space. It is split into two sub-spaces because max_samples may only be set
# when bootstrap=True; the forest raises a ValueError for bootstrap=False combined with
# a non-None max_samples, which would waste search iterations on failed fits.
rf_shared_space = {
    'n_estimators': [100, 200, 500],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', None],
    'class_weight': ['balanced', 'balanced_subsample', None, {0:1, 1:50}, {0:1, 1:100}],
    'ccp_alpha': [0.0, 0.01, 0.1]
}
param_grid = [
    {**rf_shared_space, 'bootstrap': [True], 'max_samples': [None, 0.7, 0.8]},
    {**rf_shared_space, 'bootstrap': [False], 'max_samples': [None]},
]

# Initialize the base classifier.
rf_clf = RandomForestClassifier(random_state=42, n_jobs=-1)

# Stratified 5-fold cross-validator.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Metrics recorded for every candidate; 'aucpr' is the one used to pick the winner.
scoring = {
    'precision': make_scorer(precision_score, zero_division=0),
    'recall': make_scorer(recall_score, zero_division=0),
    'f1': make_scorer(f1_score, zero_division=0),
    'aucpr': 'average_precision'
}

# Randomized search: 30 sampled configurations x 5 folds, refit on the best 'aucpr'.
grid = RandomizedSearchCV(
    estimator=rf_clf,
    param_distributions=param_grid,
    n_iter=30,
    scoring=scoring,
    refit='aucpr',
    cv=cv,
    n_jobs=-1,
    verbose=1,
    random_state=42,
    return_train_score=True
)

# Execute the randomized hyperparameter search.
print("Initiating randomized hyperparameter search for Random Forest...")
grid.fit(X_train_encoded, y_train)
print("Hyperparameter search complete.")

# Store the best estimator (already refit on the full training set by the search).
best_model = grid.best_estimator_
print("Best Parameters:", grid.best_params_)

# --- Model Evaluation ---
y_proba = best_model.predict_proba(X_test_encoded)[:, 1]
precision, recall, thresholds = precision_recall_curve(y_test, y_proba)
f1_scores = 2 * (precision * recall) / (precision + recall + 1e-9)
# precision/recall have one more entry than thresholds (the final recall=0 point has no
# threshold), so the search is restricted to positions that map to a real threshold.
optimal_idx = np.argmax(f1_scores[:-1])
optimal_threshold = thresholds[optimal_idx]
# NOTE(review): the threshold is tuned on the same test set the metrics below are
# computed on, so precision/recall/F1 are best-case values, not unbiased estimates.
y_pred = (y_proba >= optimal_threshold).astype(int)

# Print performance metrics.
print(f"\nOptimal Threshold: {optimal_threshold:.4f}")
print(f"Confusion Matrix:\n{confusion_matrix(y_test, y_pred)}")
print(f"Precision: {precision_score(y_test, y_pred, zero_division=0):.4f}")
print(f"Recall: {recall_score(y_test, y_pred, zero_division=0):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred, zero_division=0):.4f}")
print(f"ROC AUC: {roc_auc_score(y_test, y_proba):.4f}")
print(f"PR AUC: {average_precision_score(y_test, y_proba):.4f}")

# --- Visualization ---
# Precision-Recall Curve. The axes are passed explicitly: without `ax`, from_estimator
# opens its own figure and the pre-sized one is rendered empty.
fig, ax = plt.subplots(figsize=(8, 6))
PrecisionRecallDisplay.from_estimator(best_model, X_test_encoded, y_test, ax=ax)
ax.set_title('Precision-Recall Curve')
ax.grid(True)
plt.show()

# Learning Curves: AUC-PR of the best configuration as the training set grows.
train_sizes, train_scores, test_scores = learning_curve(
    best_model,
    X_train_encoded,
    y_train,
    cv=cv,
    scoring='average_precision',
    n_jobs=-1,
    train_sizes=np.linspace(0.1, 1.0, 10),
    verbose=1
)

plt.figure(figsize=(10, 4))
plt.plot(train_sizes, np.mean(train_scores, axis=1), 'o-', label='Training AUC-PR')
plt.plot(train_sizes, np.mean(test_scores, axis=1), 'o-', label='Validation AUC-PR')
plt.xlabel('Number of Training Examples')
plt.ylabel('AUC-PR Score')
plt.title('Random Forest Learning Curves')
plt.legend()
plt.grid(True)
plt.show()

### CV NN + Randomized Search

In [None]:
# --- Phase 1: Neural Network Model Training and Hyperparameter Optimization ---
# This script implements a Deep Neural Network (DNN) for predicting mortgage prepayment,
# utilizing skorch for sklearn compatibility and randomized hyperparameter search.

# Define the neural network architecture.
class NeuralNet(nn.Module):
    """Feed-forward network with two hidden layers and a single un-activated output.

    Each hidden stage is Linear -> BatchNorm1d -> ReLU -> Dropout; one Dropout module
    is shared by both stages. The last layer maps to one value per row and applies no
    activation, so the output has shape (batch, 1).

    Parameters
    ----------
    input_size : int
        Number of input features.
    hidden_size1 : int, default 64
        Width of the first hidden layer.
    hidden_size2 : int, default 30
        Width of the second hidden layer.
    dropout_rate : float, default 0.2
        Drop probability of the shared Dropout module.
    """

    def __init__(self, input_size, hidden_size1=64, hidden_size2=30, dropout_rate=0.2):
        super().__init__()
        # Linear layers: input -> hidden 1 -> hidden 2 -> single output.
        self.fc1 = nn.Linear(input_size, hidden_size1)
        self.fc2 = nn.Linear(hidden_size1, hidden_size2)
        self.fc3 = nn.Linear(hidden_size2, 1)
        # Regularisation / normalisation modules.
        self.dropout = nn.Dropout(dropout_rate)
        self.bn1 = nn.BatchNorm1d(hidden_size1)
        self.bn2 = nn.BatchNorm1d(hidden_size2)

    def forward(self, x):
        """Run both hidden stages and return the raw output of the final linear layer."""
        hidden = x
        for linear, batch_norm in ((self.fc1, self.bn1), (self.fc2, self.bn2)):
            hidden = self.dropout(torch.relu(batch_norm(linear(hidden))))
        return self.fc3(hidden)

# Prepare data tensors: float32 features, targets as float32 column vectors.
X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).view(-1, 1)
X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).view(-1, 1)

# Class weight for the positive class: number of negatives divided by number of
# positives, the ratio BCEWithLogitsLoss expects for `pos_weight`.
# (Dividing the total row count by the positives would overstate the weight by one.)
pos_weight = torch.tensor(
    [(len(y_train) - y_train.sum()) / y_train.sum()], dtype=torch.float32
)

# Candidate values sampled by the randomized search.
param_grid = {
    'module__hidden_size1': [32, 64, 128],
    'module__hidden_size2': [16, 30, 64],
    'module__dropout_rate': [0.1, 0.2, 0.3],
    'optimizer__lr': [0.001, 0.01, 0.0001],
    'optimizer__weight_decay': [0, 0.001, 0.01],
    'batch_size': [32, 64, 128],
    'max_epochs': [50, 100, 150]
}

# Initialize callbacks.
early_stopping = EarlyStopping(patience=10, threshold=0.001, threshold_mode='rel')
# load_best restores the weights of the best validation-loss epoch when training ends;
# without it the checkpoint is written but the model keeps the weights of the last
# epoch, which is `patience` epochs past the best one when early stopping triggers.
checkpoint = Checkpoint(monitor='valid_loss_best', load_best=True)
# Per-epoch validation scores read by the learning-curve plot at the end of this cell.
# The history keys are taken from `name`; without these callbacks the keys do not exist.
valid_roc_auc = EpochScoring(scoring='roc_auc', lower_is_better=False, name='valid_roc_auc')
valid_average_precision = EpochScoring(
    scoring='average_precision', lower_is_better=False, name='valid_average_precision'
)

# Initialize the skorch classifier.
# NOTE(review): skorch also ships NeuralNetBinaryClassifier for single-output models
# trained with BCEWithLogitsLoss — consider switching; it would require 1-D float targets.
net = NeuralNetClassifier(
    module=NeuralNet,
    module__input_size=X_train_tensor.shape[1],
    criterion=nn.BCEWithLogitsLoss,
    criterion__pos_weight=pos_weight,
    optimizer=optim.Adam,
    callbacks=[
        ('valid_roc_auc', valid_roc_auc),
        ('valid_average_precision', valid_average_precision),
        early_stopping,
        checkpoint,
    ],
    device='cuda' if torch.cuda.is_available() else 'cpu',
    verbose=1
)

# Stratified 5-fold cross-validator.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Metrics recorded for every candidate; 'aucpr' is the one used to pick the winner.
scoring = {
    'precision': make_scorer(precision_score, zero_division=0),
    'recall': make_scorer(recall_score, zero_division=0),
    'f1': make_scorer(f1_score, zero_division=0),
    'aucpr': 'average_precision'
}

# Randomized search: 20 sampled configurations x 5 folds, run sequentially (n_jobs=1).
nn_grid = RandomizedSearchCV(
    estimator=net,
    param_distributions=param_grid,
    n_iter=20,
    scoring=scoring,
    refit='aucpr',
    cv=cv,
    n_jobs=1,
    verbose=2,
    random_state=42,
    return_train_score=True
)

# Execute the randomized hyperparameter search.
print("Initiating randomized hyperparameter search for Neural Network...")
nn_grid.fit(X_train_tensor, y_train_tensor)
print("Hyperparameter search complete.")

# Store the best estimator (already refit on the full training set by the search).
best_model = nn_grid.best_estimator_
print("Best Parameters:", nn_grid.best_params_)

# --- Model Evaluation ---
# The module emits (n, 1) outputs, so the positive-class column may keep a trailing
# axis; flatten it so every metric below receives a 1-D score vector.
y_proba = np.asarray(best_model.predict_proba(X_test_tensor))[:, 1].reshape(-1)
precision, recall, thresholds = precision_recall_curve(y_test, y_proba)
f1_scores = 2 * (precision * recall) / (precision + recall + 1e-9)
# precision/recall have one more entry than thresholds (the final recall=0 point has no
# threshold), so the search is restricted to positions that map to a real threshold.
optimal_idx = np.argmax(f1_scores[:-1])
optimal_threshold = thresholds[optimal_idx]
# NOTE(review): the threshold is tuned on the same test set the metrics below are
# computed on, so precision/recall/F1 are best-case values, not unbiased estimates.
y_pred = (y_proba >= optimal_threshold).astype(int)

# Print performance metrics.
print(f"\nOptimal Threshold: {optimal_threshold:.4f}")
print(f"Confusion Matrix:\n{confusion_matrix(y_test, y_pred)}")
print(f"Precision: {precision_score(y_test, y_pred, zero_division=0):.4f}")
print(f"Recall: {recall_score(y_test, y_pred, zero_division=0):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred, zero_division=0):.4f}")
print(f"ROC AUC: {roc_auc_score(y_test, y_proba):.4f}")
print(f"PR AUC: {average_precision_score(y_test, y_proba):.4f}")

# --- Visualization ---
# Precision-Recall Curve. The axes are passed explicitly: without `ax`, from_estimator
# opens its own figure and the pre-sized one is rendered empty.
fig, ax = plt.subplots(figsize=(8, 6))
PrecisionRecallDisplay.from_estimator(best_model, X_test_tensor, y_test, ax=ax)
ax.set_title('Precision-Recall Curve')
ax.grid(True)
plt.show()

# Learning Curves: per-epoch losses and validation scores of the refit best model.
try:
    history = best_model.history_
    plt.figure(figsize=(12, 4))

    plt.subplot(1, 2, 1)
    plt.plot(history[:, 'train_loss'], label='Train Loss')
    plt.plot(history[:, 'valid_loss'], label='Validation Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()
    plt.grid(True)

    plt.subplot(1, 2, 2)
    plt.plot(history[:, 'valid_roc_auc'], label='Validation ROC AUC')
    plt.plot(history[:, 'valid_average_precision'], label='Validation PR AUC')
    plt.xlabel('Epoch')
    plt.ylabel('Score')
    plt.legend()
    plt.grid(True)

    plt.suptitle('Neural Network Learning Curves')
    plt.tight_layout()
    plt.show()
except (AttributeError, KeyError):
    # AttributeError: the estimator exposes no history; KeyError: a plotted column was
    # never recorded. Either way the plot is skipped instead of aborting the cell.
    print("Could not retrieve training history for plotting.")