<a href="https://colab.research.google.com/github/deburky/boosting-scorecards/blob/main/other_notebooks/xbooster-chi-square-validation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# <span style="font-family: Arial, sans-serif; color:#97f788">xbooster</span>
## <span style="font-family: Arial, sans-serif; color:navy">Validation with χ²</span>

<span style="font-family: Arial, sans-serif; color:navy">Repo: <a href="https://github.com/xRiskLab/xBooster" title="GitHub link">https://github.com/xRiskLab/xBooster</a></span>

In [1]:
%%capture
!pip install xbooster

In [2]:
from importlib.metadata import version

# Report which xbooster release is installed in this kernel.
installed_version = version("xbooster")
print(installed_version)

0.2.2


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Blended credit dataset, read straight from the xBooster repository
url = "https://github.com/xRiskLab/xBooster/raw/main/examples/data/credit_data.parquet"
dataset = pd.read_parquet(url)

# Binary outcome column and the model inputs
target = "is_bad"
features = [
    "external_risk_estimate",
    "revolving_utilization_of_unsecured_lines",
    "account_never_delinq_percent",
    "net_fraction_revolving_burden",
    "num_total_cc_accounts",
    "average_months_in_file",
]

X = dataset[features]
y = dataset[target]

# Split the row labels (not the frames) 70/30, stratified on the target;
# later cells select rows with X.loc[...] / y.loc[...].
ix_train, ix_test = train_test_split(
    X.index,
    test_size=0.3,
    stratify=y,
    random_state=62,
)

## Model training

In [4]:
import numpy as np
import xgboost as xgb
from sklearn.metrics import roc_auc_score

# Hyper-parameters of the booster (max_depth=1, i.e. one split per tree)
best_params = {
    "n_estimators": 100,
    "learning_rate": 0.55,
    "max_depth": 1,
    "min_child_weight": 10,
    "grow_policy": "lossguide",
    "early_stopping_rounds": 5,
}

# Instantiate the classifier with a fixed seed
xgb_model = xgb.XGBClassifier(random_state=62, **best_params)

# Evaluation sets monitored during fitting: train first, test second.
# NOTE(review): the test fold is part of eval_set while early stopping is
# enabled; if XGBoost stops on the last eval set, the test Gini reported below
# is not a fully independent estimate - confirm and consider a separate
# validation fold.
evalset = [(X.loc[ix], y.loc[ix]) for ix in (ix_train, ix_test)]

# Fit on the training rows
xgb_model.fit(X.loc[ix_train], y.loc[ix_train], eval_set=evalset, verbose=False)

# Predicted probability of the positive class (column 1) on both samples
predictions_trn = xgb_model.predict_proba(X.loc[ix_train])[:, 1]
predictions_tst = xgb_model.predict_proba(X.loc[ix_test])[:, 1]

# Gini = 2 * AUC - 1
gini_trn = 2 * roc_auc_score(y.loc[ix_train], predictions_trn) - 1  # type: ignore
gini_tst = 2 * roc_auc_score(y.loc[ix_test], predictions_tst) - 1  # type: ignore

print(
    f"Train Gini score: {gini_trn:.2%}\n"
    f"Test Gini score: {gini_tst:.2%}"
)

Train Gini score: 89.84%
Test Gini score: 89.11%


In [5]:
# Import necessary modules
from xbooster.constructor import XGBScorecardConstructor

# Wrap the fitted booster together with the training sample
scorecard_constructor = XGBScorecardConstructor(
    xgb_model,
    X.loc[ix_train],
    y.loc[ix_train],
)

# Build the scorecard table, then convert it to a points scale
xgb_scorecard = scorecard_constructor.construct_scorecard()
xgb_scorecard_with_points = scorecard_constructor.create_points(
    pdo=50,
    target_points=600,
    target_odds=50,
)

# Score the test rows. The scores are negated before computing the AUC
# (presumably because a higher score means lower risk - verify against xbooster).
credit_scores = scorecard_constructor.predict_score(X.loc[ix_test])
gini = 2 * roc_auc_score(y.loc[ix_test], -credit_scores) - 1  # type: ignore

print(f"Test Gini score: {gini:.2%}")

Test Gini score: 89.09%


In [6]:
# Columns that identify each scorecard row plus its event / non-event counts
cols_to_select = [
    "Tree",
    "Node",
    "Feature",
    "Sign",
    "Split",
    "Events",
    "NonEvents",
]

# Independent copy of the training scorecard restricted to those columns
xgb_train_summary = xgb_scorecard_with_points.loc[:, cols_to_select].copy()

## Model validation

In [7]:
# Add noise to X_test
def add_noise(data, noise_level=0.1, random_state=None):
    """Return a copy of ``data`` with Gaussian noise subtracted from every column.

    Parameters
    ----------
    data : pandas.DataFrame
        Numeric frame to perturb. It is copied, never modified.
    noise_level : float, default 0.1
        Standard deviation of the zero-mean normal noise, applied identically
        to every column.
    random_state : int or None, default None
        Seed for a dedicated ``numpy.random.RandomState``. With ``None`` the
        global ``numpy.random`` state is used (the original behaviour), so the
        result is only reproducible if the caller seeded it beforehand.

    Returns
    -------
    pandas.DataFrame
        Frame with the same index and columns as ``data``.
    """
    # A seeded RandomState yields the same stream as np.random.seed(seed)
    # followed by the module-level functions, so both paths agree.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    noisy_data = data.copy()
    # Draw column by column so the sequence of random draws is stable.
    for column in noisy_data.columns:
        noise = rng.normal(0, noise_level, size=noisy_data[column].shape)
        noisy_data[column] = noisy_data[column] - noise
    return noisy_data

# Slightly perturb the target values
def perturb_target(target, perturbation_rate=0.1, random_state=None):
    """Return a copy of a 0/1 ``target`` with a random share of labels flipped.

    Parameters
    ----------
    target : pandas.Series
        Labels to perturb. Flipping is done as ``1 - value``. The series is
        copied, never modified.
    perturbation_rate : float, default 0.1
        Fraction of entries to flip; ``int(len(target) * perturbation_rate)``
        index labels are drawn without replacement.
    random_state : int or None, default None
        Seed for a dedicated ``numpy.random.RandomState``. With ``None`` the
        global ``numpy.random`` state is used (the original behaviour), so the
        result is only reproducible if the caller seeded it beforehand.

    Returns
    -------
    pandas.Series
        Series with the same index as ``target``.
    """
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    perturbed_target = target.copy()
    n_perturb = int(len(target) * perturbation_rate)
    # Labels (not positions) are sampled, hence the .loc-based assignment below.
    perturb_indices = rng.choice(target.index, n_perturb, replace=False)
    perturbed_target.loc[perturb_indices] = 1 - perturbed_target.loc[perturb_indices]
    return perturbed_target

# Seed NumPy's global RNG: add_noise and perturb_target draw from it, so without
# a seed the noisy sample (and every validation figure below) changes on each run.
np.random.seed(62)

# Standard deviation of the Gaussian noise applied to every feature
noise_level = 2.5

# Test-fold features and target side by side
dataset_for_validation = pd.concat([X.loc[ix_test], y.loc[ix_test]], axis=1)

# Perturb features and flip 1% of the labels; both pieces get a fresh positional
# index so they line up row by row when concatenated.
X_test_noisy = add_noise(dataset_for_validation[features], noise_level).reset_index(drop=True)
y_test_noisy = perturb_target(dataset_for_validation[target], 0.01).reset_index(drop=True)

# Combining the noisy X_test with y_test
dataset_for_validation_noisy = pd.concat([X_test_noisy, y_test_noisy], axis=1)

In [8]:
# Import necessary modules
from xbooster.constructor import XGBScorecardConstructor

# Wrap the same fitted booster, this time with the noisy validation sample
scorecard_constructor_val = XGBScorecardConstructor(
    xgb_model,
    dataset_for_validation_noisy[features],
    dataset_for_validation_noisy[target],
)

# Build the scorecard table on the validation sample and convert it to points
xgb_scorecard_val = scorecard_constructor_val.construct_scorecard()
xgb_scorecard_with_points_val = scorecard_constructor_val.create_points(
    pdo=50,
    target_points=600,
    target_odds=50,
)

# Score the noisy rows and compute the Gini on the perturbed labels.
# NOTE(review): the printed label says "Test" although this is the noisy
# validation sample, and credit_scores / gini overwrite the earlier values.
credit_scores = scorecard_constructor_val.predict_score(
    dataset_for_validation_noisy[features]
)
gini = 2 * roc_auc_score(dataset_for_validation_noisy[target], -credit_scores) - 1  # type: ignore

print(f"Test Gini score: {gini:.2%}")

Test Gini score: 62.20%


In [9]:
# Columns that identify each scorecard row plus its event / non-event counts
cols_to_select = [
    "Tree",
    "Node",
    "Feature",
    "Sign",
    "Split",
    "Events",
    "NonEvents",
]

# Independent copy of the validation scorecard restricted to those columns
xgb_val_summary = xgb_scorecard_with_points_val.loc[:, cols_to_select].copy()

## χ² test

In [12]:
import numpy as np
import pandas as pd
from scipy import stats

def calculate_chi_square_statistic(df, lambda_=2):
    """Run a chi-square test of independence on each tree's leaf counts.

    For every distinct value of ``df['Tree']`` the rows of that tree form a
    contingency table with one row per scorecard row and the two columns
    ``Events`` and ``NonEvents``. The Pearson chi-square statistic is computed
    from that table.

    For trees with more than two distinct ``Node`` values the statistic is
    divided by ``1 + lambda_ * (n_leaves - 1) * (depth - 1)``, with
    ``depth = int(log2(n_leaves))``, before the p-value is taken. The returned
    ``'Chi-Square'`` is always the unpenalised statistic.

    Cells whose expected count is zero (an empty leaf, or an outcome that never
    occurs in the tree) contribute nothing to the statistic instead of turning
    it into NaN. Degrees of freedom are counted over populated rows and columns
    only. A table with no degrees of freedom left gets a p-value of 1.0,
    because there is nothing to test.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``'Tree'``, ``'Node'``, ``'Events'`` and
        ``'NonEvents'``.
    lambda_ : float, default 2
        Weight of the complexity penalty.

    Returns
    -------
    dict
        Keys ``'Tree'``, ``'Chi-Square'``, ``'P-Value'`` and ``'N_Nodes'``, each
        holding a list with one entry per tree, in order of first appearance.
    """
    tree_chi_sq = {'Tree': [], 'Chi-Square': [], 'P-Value': [], 'N_Nodes': []}

    for tree in df['Tree'].unique():
        tree_df = df[df['Tree'] == tree]

        # Observed counts: one row per scorecard row, columns = (events, non-events)
        contingency_table = tree_df[['Events', 'NonEvents']].values.astype(float)

        # Row sums and column sums
        row_sums = contingency_table.sum(axis=1, keepdims=True)
        col_sums = contingency_table.sum(axis=0, keepdims=True)
        total = contingency_table.sum()

        if total > 0:
            # Expected frequencies under independence
            expected = row_sums @ col_sums / total
            # Skip zero-expected cells: their observed count is zero as well,
            # and dividing by zero would poison the whole statistic with NaN.
            populated = expected > 0
            chi_square = float(
                (
                    (contingency_table[populated] - expected[populated]) ** 2
                    / expected[populated]
                ).sum()
            )
        else:
            chi_square = 0.0

        # Penalty for complexity
        n_leaves = len(tree_df['Node'].unique())
        depth = int(np.log2(n_leaves))
        complexity = (n_leaves - 1) * (depth - 1)

        if n_leaves > 2:
            chi_square_adj = chi_square / (1 + lambda_ * complexity)
        else:
            chi_square_adj = chi_square

        # Degrees of freedom, counted over populated rows / columns only
        n_rows = int((row_sums > 0).sum())
        n_cols = int((col_sums > 0).sum())
        degrees_of_freedom = max(n_rows - 1, 0) * max(n_cols - 1, 0)

        if degrees_of_freedom > 0:
            p_value = float(stats.chi2.sf(chi_square_adj, df=degrees_of_freedom))
        else:
            # Degenerate table: no association can be tested
            p_value = 1.0

        # Store the results
        tree_chi_sq['Tree'].append(tree)
        tree_chi_sq['Chi-Square'].append(chi_square)
        tree_chi_sq['P-Value'].append(p_value)
        tree_chi_sq['N_Nodes'].append(len(tree_df))

    return tree_chi_sq

def analyze_chi_square_statistics(summary_df, dataset_name):
    """Print chi-square summary figures for one scorecard summary table.

    Three lines are printed, each prefixed with ``dataset_name``:

    * the mean of the per-tree chi-square statistics,
    * the chi-square survival probability of that mean, using the per-tree
      degrees of freedom summed over all trees,
    * the share of trees whose p-value is above 0.05.

    NOTE(review): the *mean* statistic is evaluated against the *summed*
    degrees of freedom; confirm this pairing is intended.

    Parameters
    ----------
    summary_df : pandas.DataFrame
        Table with ``'Tree'``, ``'Node'``, ``'Events'`` and ``'NonEvents'``
        columns.
    dataset_name : str
        Label used as prefix of every printed line.

    Returns
    -------
    None
    """
    # Per-tree statistics, one row per tree
    per_tree = pd.DataFrame(calculate_chi_square_statistic(summary_df))

    # Accumulate (rows - 1) * (cols - 1) of every tree's Events/NonEvents table
    total_degrees_of_freedom = 0
    for tree_id in summary_df['Tree'].unique():
        counts = summary_df.loc[summary_df['Tree'] == tree_id, ['Events', 'NonEvents']]
        n_rows, n_cols = counts.shape
        total_degrees_of_freedom += (n_rows - 1) * (n_cols - 1)

    # Mean statistic across trees
    avg_chi_square = per_tree['Chi-Square'].mean()
    print(f"{dataset_name} - Average Chi-Square: {avg_chi_square:.2f}")

    # Survival probability of the mean statistic
    p_value_avg_chi_square = stats.chi2.sf(avg_chi_square, df=total_degrees_of_freedom)
    print(f"{dataset_name} - p-value of the average Chi-Square: {p_value_avg_chi_square:.5f}")

    # Count the trees (not nodes) that are not significant at the 5% level
    not_significant = per_tree['P-Value'] > 0.05
    number_non_significant = not_significant.sum()
    print(f"{dataset_name} - Percent of non-significant trees: {number_non_significant / len(per_tree):.2%}\n")

In [13]:
# Chi-square summary for the scorecard built on the training sample
analyze_chi_square_statistics(
    summary_df=xgb_train_summary,
    dataset_name="Training Data",
)

# Same summary for the scorecard built on the noisy validation sample
analyze_chi_square_statistics(
    summary_df=xgb_val_summary,
    dataset_name="Validation Data",
)

Training Data - Average Chi-Square: 280.06
Training Data - p-value of the average Chi-Square: 0.00000
Training Data - Percent of non-significant trees: 12.94%

Validation Data - Average Chi-Square: 55.48
Validation Data - p-value of the average Chi-Square: 0.99452
Validation Data - Percent of non-significant trees: 37.65%

