In [20]:
import os
import sys
import warnings
from typing import Dict, List, Tuple

# Silence warnings before the heavier third-party imports below run.
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.exceptions import NotFittedError
from sklearn.metrics import (
    brier_score_loss,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
)
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.utils.validation import check_is_fitted

# modules
from src.config import OUTPUT_DIR, PROCESSED_DIR

# vis config
# Shared figure-size constants for the plotting cells.
FIGSIZE_SMALL = (8, 5)
FIGSIZE = (10, 6)

# Order matters: reset matplotlib to its stock style first, then apply the
# seaborn palette on top of it (the reverse order would undo the palette).
plt.style.use("default")
sns.set_palette("husl")

In [16]:
print("=" * 80)
print("LOADING PIPELINE A DATA")
print("=" * 80)

# load processed data
X_train_A = np.load(PROCESSED_DIR / 'X_train_A.npy')
X_test_A = np.load(PROCESSED_DIR / 'X_test_A.npy')
y_train = np.load(PROCESSED_DIR / 'y_train.npy')
y_test = np.load(PROCESSED_DIR / 'y_test.npy')

# load feature names
# The names may be separated either by real newlines or by the literal
# two-character sequence backslash + "n"; normalise the latter to a real
# newline so both layouts parse to one name per entry. Blank entries (e.g.
# from a trailing separator) are dropped.
with open(PROCESSED_DIR / 'feature_names_A.txt', 'r') as f:
    feature_names_A = [
        name.strip()
        for name in f.read().replace('\\n', '\n').splitlines()
        if name.strip()
    ]

# validation
# Fail early if the arrays and the feature-name list are out of sync.
assert X_train_A.shape[0] == y_train.shape[0], "X_train_A / y_train row mismatch"
assert X_test_A.shape[0] == y_test.shape[0], "X_test_A / y_test row mismatch"
assert X_train_A.shape[1] == X_test_A.shape[1] == len(feature_names_A), (
    "feature count does not match the number of columns in Pipeline A data"
)

print("\nShapes:")
print(f"  X_train_A: {X_train_A.shape}")
print(f"  X_test_A:  {X_test_A.shape}")
print(f"  y_train:   {y_train.shape}")
print(f"  y_test:    {y_test.shape}")

print(f"\nFeatures ({len(feature_names_A)}):")
print(f"  {feature_names_A}")

# check for missing values
print("\nData Quality:")
print(f"  X_train_A missing: {np.isnan(X_train_A).sum()}")
print(f"  X_test_A missing:  {np.isnan(X_test_A).sum()}")

# class distribution
unique, counts = np.unique(y_train, return_counts=True)
print("\nClass Distribution (Training):")
for label, count in zip(unique, counts):
    print(f"  Class {label}: {count:,} ({count/len(y_train)*100:.2f}%)")

print("\nPipeline A data loaded successfully")

LOADING PIPELINE A DATA

Shapes:
  X_train_A: (20000, 8)
  X_test_A:  (5000, 8)
  y_train:   (20000,)
  y_test:    (5000,)

Features (8):
  ['Age', 'Income', 'CreditScore', 'LoanAmount', 'EmploymentYears', 'NumDependents', 'DebtToIncome', 'EducationLevel']

Data Quality:
  X_train_A missing: 0
  X_test_A missing:  0

Class Distribution (Training):
  Class 0: 13,137 (65.69%)
  Class 1: 6,863 (34.31%)

Pipeline A data loaded successfully


In [17]:
print("=" * 80)
print("LOADING PIPELINE B DATA")
print("=" * 80)

# load processed data
X_train_B = np.load(PROCESSED_DIR / 'X_train_B.npy')
X_test_B = np.load(PROCESSED_DIR / 'X_test_B.npy')

# load feature names
# The names may be separated either by real newlines or by the literal
# two-character sequence backslash + "n"; normalise the latter to a real
# newline so both layouts parse to one name per entry. Blank entries (e.g.
# from a trailing separator) are dropped.
with open(PROCESSED_DIR / 'feature_names_B.txt', 'r') as f:
    feature_names_B = [
        name.strip()
        for name in f.read().replace('\\n', '\n').splitlines()
        if name.strip()
    ]

# validation
# Fail early if the matrices and the feature-name list are out of sync.
assert X_train_B.shape[1] == X_test_B.shape[1] == len(feature_names_B), (
    "feature count does not match the number of columns in Pipeline B data"
)

print("\nShapes:")
print(f"  X_train_B: {X_train_B.shape}")
print(f"  X_test_B:  {X_test_B.shape}")

print(f"\nFeatures ({len(feature_names_B)}):")
for i, feat in enumerate(feature_names_B):
    print(f"  {i+1:2d}. {feat}")

# check for missing values
print("\nData Quality:")
print(f"  X_train_B missing: {np.isnan(X_train_B).sum()}")
print(f"  X_test_B missing:  {np.isnan(X_test_B).sum()}")

print("\nPipeline B data loaded successfully")

LOADING PIPELINE B DATA

Shapes:
  X_train_B: (20000, 14)
  X_test_B:  (5000, 14)

Features (14):
   1. Age
   2. Income
   3. CreditScore
   4. LoanAmount
   5. EmploymentYears
   6. NumDependents
   7. DebtToIncome
   8. EducationLevel
   9. FavoriteColor_Green
  10. FavoriteColor_Red
  11. FavoriteColor_Yellow
  12. Hobby_Reading
  13. Hobby_Sports
  14. Hobby_Traveling

Data Quality:
  X_train_B missing: 0
  X_test_B missing:  0

Pipeline B data loaded successfully


In [18]:
banner = "=" * 80
print(banner, "CROSS-VALIDATION SETUP", banner, sep="\n")

# create stratified k-fold
N_SPLITS = 5
RANDOM_STATE = 42

cv = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)

# verify stratification
print("\nCross-Validation Configuration:")
print("  Strategy: Stratified K-Fold")
print(f"  Splits: {N_SPLITS}")
print("  Shuffle: True")
print(f"  Random State: {RANDOM_STATE}")

# check fold class distribution
# For every validation fold, record the share of samples labelled 1 so it
# can be compared with the share in the full training labels.
print("\nVerifying stratification:")
fold_distributions = []
for fold_number, (fit_idx, holdout_idx) in enumerate(cv.split(X_train_A, y_train), start=1):
    holdout_labels = y_train[holdout_idx]
    positive_share = (holdout_labels == 1).sum() / len(holdout_labels)
    fold_distributions.append(positive_share)
    print(f"  Fold {fold_number}: {positive_share*100:.2f}% default rate")

overall_share = (y_train == 1).sum() / len(y_train)

print("\nStratification Quality:")
print(f"  Mean default rate: {np.mean(fold_distributions)*100:.2f}%")
print(f"  Std default rate:  {np.std(fold_distributions)*100:.3f}%")
print(f"  Target (from train): {overall_share*100:.2f}%")

print("\nCross-validation setup complete")

CROSS-VALIDATION SETUP

Cross-Validation Configuration:
  Strategy: Stratified K-Fold
  Splits: 5
  Shuffle: True
  Random State: 42

Verifying stratification:
  Fold 1: 34.30% default rate
  Fold 2: 34.30% default rate
  Fold 3: 34.33% default rate
  Fold 4: 34.33% default rate
  Fold 5: 34.33% default rate

Stratification Quality:
  Mean default rate: 34.31%
  Std default rate:  0.012%
  Target (from train): 34.31%

Cross-validation setup complete


In [21]:
banner = "=" * 80
print(banner, "EVALUATION METRICS SETUP", banner, sep="\n")

# define scoring metrics for cross-validation
# Each key is the result label and each value the scorer name handed to the
# `scoring` argument; here the two are deliberately identical.
SCORING_METRICS = {
    metric: metric
    for metric in ('roc_auc', 'f1', 'precision', 'recall', 'accuracy')
}

# Human-readable summary of which metric plays which role.
print("\nMetrics for evaluation:")
for summary_line in (
    "  Primary:   ROC-AUC (threshold-independent, handles imbalance)",
    "  Secondary: F1, Precision, Recall (threshold-dependent)",
    "  Additional: Accuracy, Brier Score (calibration)",
):
    print(summary_line)

# helper function for consistent evaluation
def evaluate_model(
    model,
    X_train, y_train,
    X_test, y_test,
    cv,
    model_name: str,
    pipeline_name: str
) -> Dict:
    """
    Comprehensive model evaluation with CV and test set metrics.

    Runs cross-validation on the training data with the metrics in
    ``SCORING_METRICS``, makes sure ``model`` itself is fitted, and then
    scores it on the held-out test set.

    Args:
        model: Estimator exposing ``fit``, ``predict`` and ``predict_proba``.
            If it is not fitted yet it is fitted in place on
            ``(X_train, y_train)``; an already fitted model is used as-is.
        X_train, y_train: Training features and labels, used for
            cross-validation and (if needed) for fitting ``model``.
        X_test, y_test: Held-out features and labels used for the test
            metrics.
        cv: Cross-validation splitter passed through to ``cross_validate``.
        model_name: Label stored under ``'model_name'`` in the result.
        pipeline_name: Label stored under ``'pipeline'`` in the result.

    Returns:
        Dictionary with keys ``'model_name'``, ``'pipeline'``,
        ``'cv_scores'`` (raw ``cross_validate`` output), ``'test_metrics'``
        (roc_auc, f1, precision, recall, accuracy, brier), ``'y_pred'``,
        ``'y_pred_proba'`` (second column of ``predict_proba``) and
        ``'model'`` (the fitted estimator).
    """

    # CV: cross_validate trains clones of `model` on each fold, so the
    # estimator object passed in is not fitted by this call.
    cv_results = cross_validate(
        model, X_train, y_train,
        cv=cv,
        scoring=SCORING_METRICS,
        return_train_score=False,
        n_jobs=-1
    )

    # Because CV only fits clones, `model` may still be unfitted here and
    # predict() would raise NotFittedError. Fit it on the full training set
    # in that case; leave a model the caller already fitted untouched.
    try:
        check_is_fitted(model)
    except NotFittedError:
        model.fit(X_train, y_train)

    # test set predictions
    y_pred = model.predict(X_test)
    # second predict_proba column, used as the score for ROC-AUC and Brier
    y_pred_proba = model.predict_proba(X_test)[:, 1]

    # test set metrics
    test_metrics = {
        'roc_auc': roc_auc_score(y_test, y_pred_proba),
        'f1': f1_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred),
        'recall': recall_score(y_test, y_pred),
        'accuracy': (y_pred == y_test).mean(),
        'brier': brier_score_loss(y_test, y_pred_proba)
    }

    results = {
        'model_name': model_name,
        'pipeline': pipeline_name,
        'cv_scores': cv_results,
        'test_metrics': test_metrics,
        'y_pred': y_pred,
        'y_pred_proba': y_pred_proba,
        'model': model
    }

    return results


EVALUATION METRICS SETUP

Metrics for evaluation:
  Primary:   ROC-AUC (threshold-independent, handles imbalance)
  Secondary: F1, Precision, Recall (threshold-dependent)
  Additional: Accuracy, Brier Score (calibration)
