## 3.1 Model Preparation

In [7]:
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix, classification_report, roc_curve
)
from imblearn.over_sampling import SMOTE
import time

import pandas as pd
import numpy as np
from typing import Tuple

# set random seed for reproducibility
SEED = 2025
np.random.seed(SEED)

def prepare_modelling_data(df: pd.DataFrame, target: str = 'target') -> Tuple:
    """
    prepare data for model training with proper encoding and splitting
    
    args:
        df: preprocessed dataframe
        target: target variable name
        
    returns:
        tuple of (X_train, X_val, X_test, y_train, y_val, y_test, feature_names, encoders)
    """
    df_model_prep = df.copy()
    
    # separate features and target
    if target not in df_model_prep.columns:
        raise ValueError(f"target column '{target}' not found in dataframe")
    
    y = df_model_prep[target]
    X = df_model_prep.drop(columns=[target])
    
    # encode categorical variables
    categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
    encoders = {}
    
    for col in categorical_cols:
        le = LabelEncoder()
        X[col] = le.fit_transform(X[col].astype(str))
        encoders[col] = le
    
    # convert datetime columns to numeric
    datetime_cols = X.select_dtypes(include=['datetime64']).columns.tolist()
    for col in datetime_cols:
        X[f'{col}_year'] = X[col].dt.year
        X[f'{col}_month'] = X[col].dt.month
        X = X.drop(columns=[col])
    
    feature_names = X.columns.tolist()
    
    # split data: 70% train, 15% validation, 15% test
    X_train, X_temp, y_train, y_temp = train_test_split(
        X, y, test_size=0.30, stratify=y, random_state=SEED
    )
    X_val, X_test, y_val, y_test = train_test_split(
        X_temp, y_temp, test_size=0.50, stratify=y_temp, random_state=SEED
    )
    
    print(f"train set: {len(X_train):,} samples")
    print(f"validation set: {len(X_val):,} samples")
    print(f"test set: {len(X_test):,} samples")
    print(f"features: {len(feature_names)}")
    
    return X_train, X_val, X_test, y_train, y_val, y_test, feature_names, encoders

# load data
df_model_processed = pd.read_csv("/Users/chenjing/Desktop/credit-risk-prediction/data/processed/accepted_loans_model_ready.csv")


# prepare data
X_train, X_val, X_test, y_train, y_val, y_test, feature_names, encoders = prepare_modelling_data(
    df_model_processed
)

train set: 211,610 samples
validation set: 45,345 samples
test set: 45,346 samples
features: 26


## 3.2 Model 1: Logistic Regression

In [8]:
def train_logistic_regression(X_train, y_train, X_val, y_val, use_smote: bool = True):
    """
    train and evaluate logistic regression model
    
    args:
        X_train: training features
        y_train: training target
        X_val: validation features
        y_val: validation target
        use_smote: whether to apply SMOTE for class balance
        
    returns:
        trained model, scaler, predictions, metrics
    """
    print("=" * 70)
    print("LOGISTIC REGRESSION")
    print("=" * 70)
    
    # handle class imbalance with SMOTE if requested
    if use_smote:
        print("\napplying SMOTE for class balance...")
        smote = SMOTE(random_state=SEED)
        X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)
        print(f"balanced training set: {len(X_train_balanced):,} samples")
    else:
        X_train_balanced = X_train
        y_train_balanced = y_train
    
    # scale features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_balanced)
    X_val_scaled = scaler.transform(X_val)
    
    # train model
    print("\ntraining logistic regression...")
    start_time = time.time()
    
    model = LogisticRegression(
        max_iter=1000,
        random_state=SEED,
        class_weight='balanced',
        solver='lbfgs',
        n_jobs=-1
    )
    model.fit(X_train_scaled, y_train_balanced)
    
    training_time = time.time() - start_time
    print(f"training completed in {training_time:.2f} seconds")
    
    # predictions
    y_train_pred = model.predict(X_train_scaled)
    y_train_pred_proba = model.predict_proba(X_train_scaled)[:, 1]
    
    y_val_pred = model.predict(X_val_scaled)
    y_val_pred_proba = model.predict_proba(X_val_scaled)[:, 1]
    
    # evaluation
    metrics = {
        'train': {
            'accuracy': accuracy_score(y_train_balanced, y_train_pred),
            'precision': precision_score(y_train_balanced, y_train_pred),
            'recall': recall_score(y_train_balanced, y_train_pred),
            'f1': f1_score(y_train_balanced, y_train_pred),
            'roc_auc': roc_auc_score(y_train_balanced, y_train_pred_proba)
        },
        'validation': {
            'accuracy': accuracy_score(y_val, y_val_pred),
            'precision': precision_score(y_val, y_val_pred),
            'recall': recall_score(y_val, y_val_pred),
            'f1': f1_score(y_val, y_val_pred),
            'roc_auc': roc_auc_score(y_val, y_val_pred_proba)
        }
    }
    
    # print results
    print("\n" + "-" * 70)
    print("TRAINING SET PERFORMANCE")
    print("-" * 70)
    for metric, value in metrics['train'].items():
        print(f"{metric:.<20} {value:.4f}")
    
    print("\n" + "-" * 70)
    print("VALIDATION SET PERFORMANCE")
    print("-" * 70)
    for metric, value in metrics['validation'].items():
        print(f"{metric:.<20} {value:.4f}")
    
    # confusion matrix
    print("\n" + "-" * 70)
    print("CONFUSION MATRIX (Validation)")
    print("-" * 70)
    cm = confusion_matrix(y_val, y_val_pred)
    print(f"true negatives:  {cm[0, 0]:>8,}")
    print(f"false positives: {cm[0, 1]:>8,}")
    print(f"false negatives: {cm[1, 0]:>8,}")
    print(f"true positives:  {cm[1, 1]:>8,}")
    
    # visualise confusion matrix
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
    
    # confusion matrix heatmap
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=axes[0])
    axes[0].set_title('confusion matrix')
    axes[0].set_xlabel('predicted')
    axes[0].set_ylabel('actual')
    axes[0].set_xticklabels(['non-default', 'default'])
    axes[0].set_yticklabels(['non-default', 'default'])
    
    # roc curve
    fpr, tpr, _ = roc_curve(y_val, y_val_pred_proba)
    axes[1].plot(fpr, tpr, label=f"auc = {metrics['validation']['roc_auc']:.4f}", linewidth=2)
    axes[1].plot([0, 1], [0, 1], 'k--', label='random classifier')
    axes[1].set_xlabel('false positive rate')
    axes[1].set_ylabel('true positive rate')
    axes[1].set_title('roc curve')
    axes[1].legend()
    axes[1].grid(True, alpha=0.3)
    
    plt.tight_layout()
    plt.show()
    
    return model, scaler, y_val_pred_proba, metrics

# train logistic regression
lr_model, lr_scaler, lr_pred_proba, lr_metrics = train_logistic_regression(
    X_train, y_train, X_val, y_val, use_smote=True
)

LOGISTIC REGRESSION

applying SMOTE for class balance...


ValueError: Input X contains infinity or a value too large for dtype('float64').

## 3.3 Model 2: Random Forest

In [9]:
def train_random_forest(X_train, y_train, X_val, y_val, use_smote: bool = False):
    """
    train and evaluate random forest classifier
    
    args:
        X_train: training features
        y_train: training target
        X_val: validation features
        y_val: validation target
        use_smote: whether to apply SMOTE (not typically needed for RF)
        
    returns:
        trained model, predictions, metrics
    """
    print("=" * 70)
    print("RANDOM FOREST")
    print("=" * 70)
    
    # handle class imbalance with SMOTE if requested
    if use_smote:
        print("\napplying SMOTE for class balance...")
        smote = SMOTE(random_state=SEED)
        X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)
        print(f"balanced training set: {len(X_train_balanced):,} samples")
    else:
        X_train_balanced = X_train
        y_train_balanced = y_train
    
    # train model
    print("\ntraining random forest...")
    start_time = time.time()
    
    model = RandomForestClassifier(
        n_estimators=100,
        max_depth=10,
        min_samples_split=20,
        min_samples_leaf=10,
        class_weight='balanced',
        random_state=SEED,
        n_jobs=-1,
        verbose=0
    )
    model.fit(X_train_balanced, y_train_balanced)
    
    training_time = time.time() - start_time
    print(f"training completed in {training_time:.2f} seconds")
    
    # predictions
    y_train_pred = model.predict(X_train_balanced)
    y_train_pred_proba = model.predict_proba(X_train_balanced)[:, 1]
    
    y_val_pred = model.predict(X_val)
    y_val_pred_proba = model.predict_proba(X_val)[:, 1]
    
    # evaluation
    metrics = {
        'train': {
            'accuracy': accuracy_score(y_train_balanced, y_train_pred),
            'precision': precision_score(y_train_balanced, y_train_pred),
            'recall': recall_score(y_train_balanced, y_train_pred),
            'f1': f1_score(y_train_balanced, y_train_pred),
            'roc_auc': roc_auc_score(y_train_balanced, y_train_pred_proba)
        },
        'validation': {
            'accuracy': accuracy_score(y_val, y_val_pred),
            'precision': precision_score(y_val, y_val_pred),
            'recall': recall_score(y_val, y_val_pred),
            'f1': f1_score(y_val, y_val_pred),
            'roc_auc': roc_auc_score(y_val, y_val_pred_proba)
        }
    }
    
    # print results
    print("\n" + "-" * 70)
    print("TRAINING SET PERFORMANCE")
    print("-" * 70)
    for metric, value in metrics['train'].items():
        print(f"{metric:.<20} {value:.4f}")
    
    print("\n" + "-" * 70)
    print("VALIDATION SET PERFORMANCE")
    print("-" * 70)
    for metric, value in metrics['validation'].items():
        print(f"{metric:.<20} {value:.4f}")
    
    # confusion matrix
    print("\n" + "-" * 70)
    print("CONFUSION MATRIX (Validation)")
    print("-" * 70)
    cm = confusion_matrix(y_val, y_val_pred)
    print(f"true negatives:  {cm[0, 0]:>8,}")
    print(f"false positives: {cm[0, 1]:>8,}")
    print(f"false negatives: {cm[1, 0]:>8,}")
    print(f"true positives:  {cm[1, 1]:>8,}")
    
    # visualise confusion matrix and roc curve
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
    
    # confusion matrix heatmap
    sns.heatmap(cm, annot=True, fmt='d', cmap='Greens', ax=axes[0])
    axes[0].set_title('confusion matrix')
    axes[0].set_xlabel('predicted')
    axes[0].set_ylabel('actual')
    axes[0].set_xticklabels(['non-default', 'default'])
    axes[0].set_yticklabels(['non-default', 'default'])
    
    # roc curve
    fpr, tpr, _ = roc_curve(y_val, y_val_pred_proba)
    axes[1].plot(fpr, tpr, label=f"auc = {metrics['validation']['roc_auc']:.4f}", linewidth=2)
    axes[1].plot([0, 1], [0, 1], 'k--', label='random classifier')
    axes[1].set_xlabel('false positive rate')
    axes[1].set_ylabel('true positive rate')
    axes[1].set_title('roc curve')
    axes[1].legend()
    axes[1].grid(True, alpha=0.3)
    
    plt.tight_layout()
    plt.show()
    
    return model, y_val_pred_proba, metrics

# train random forest
rf_model, rf_pred_proba, rf_metrics = train_random_forest(
    X_train, y_train, X_val, y_val, use_smote=False
)

RANDOM FOREST

training random forest...


ValueError: Input X contains infinity or a value too large for dtype('float32').