# <center> <span style="font-family: Virgil GS, sans-serif; color:#97f788">XGBoost to Logistic Regression</span> </center>
## <center> <span style="font-family: Virgil GS, sans-serif; color:navy">Credit Scoring Approach</span> </center>

 <span style="font-family: Virgil GS, sans-serif; color:navy">Author: <a href="https://github.com/deburky" title="GitHub link">https://github.com/deburky</a></span>

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Fetch blended credit data.
# The Google Drive share link is rewritten into a "uc?id=<file id>" address
# before being handed to pandas.
share_link = (
    "https://drive.google.com/file/d/1Is8UZnPRExI-SLJMle8GRbFGpha5IvYZ/view?usp=sharing"
)
file_id = share_link.split("/")[-2]  # second-to-last "/"-separated segment of the link
url = "https://drive.google.com/uc?id=" + file_id
dataset = pd.read_csv(url, index_col=False)

# Columns used as model inputs.
features = [
    "external_risk_estimate",
    "revolving_utilization_of_unsecured_lines",
    "account_never_delinq_percent",
    "net_fraction_revolving_burden",
    "num_total_cc_accounts",
    "average_months_in_file",
    # "num_historical_failed_to_pay"  (left out of X; listed under categorical_features)
]
# Declared here but not added to `features`, so it is not part of X below.
categorical_features = ["num_historical_failed_to_pay"]
target = "is_bad"

# Design matrix and label vector.
X = dataset[features]
y = dataset[target]

# Split on index labels (not on the frames themselves) so later cells can
# select either partition with X.loc[...] / y.loc[...].
ix_train, ix_test = train_test_split(
    X.index,
    test_size=0.3,
    stratify=y,
    random_state=62,
)

In [2]:
import numpy as np
import xgboost as xgb
from sklearn.metrics import roc_auc_score

# Hyperparameters passed straight through to XGBClassifier.
best_params = {
    "n_estimators": 100,
    "learning_rate": 0.1,
    "max_depth": 2,
    "min_child_weight": 10,
    "grow_policy": "lossguide",
    "early_stopping_rounds": 5,
}

# Experimental: balanced sample weights.
# Both groups (y == 0 and the rest) are currently assigned 1.0, so the fit is
# effectively unweighted; edit the two assignments below to re-weight them.
mask = y.loc[ix_train] == 0
sample_weights = np.ones(len(y.loc[ix_train]))
sample_weights[mask] = 1.0
sample_weights[~mask] = 1.0

# Create an XGBoost model
xgb_model = xgb.XGBClassifier(random_state=62, **best_params)

# Evaluation sets: the training split first, the test split last.
# NOTE(review): with early_stopping_rounds set, XGBoost is documented to monitor
# the last eval_set entry -- here the test split -- so the test Gini reported
# below may be optimistic. Confirm whether a separate validation split is wanted.
evalset = [
    (X.loc[ix_train], y.loc[ix_train]),
    (X.loc[ix_test], y.loc[ix_test]),
]

# Fit the XGBoost model
xgb_model.fit(
    X.loc[ix_train],
    y.loc[ix_train],
    eval_set=evalset,
    sample_weight=sample_weights,
    verbose=False,
)

# Probability of the positive class (column 1) on each split
predictions_trn = xgb_model.predict_proba(X.loc[ix_train])[:, 1]
predictions_tst = xgb_model.predict_proba(X.loc[ix_test])[:, 1]

# Gini = 2 * AUC - 1
gini_trn = 2 * roc_auc_score(y.loc[ix_train], predictions_trn) - 1  # type: ignore
gini_tst = 2 * roc_auc_score(y.loc[ix_test], predictions_tst) - 1  # type: ignore

print(
    f"Train Gini score: {gini_trn:.2%}\n"
    f"Test Gini score: {gini_tst:.2%}"
)

Train Gini score: 90.44%
Test Gini score: 89.14%


In [3]:
from xbooster.constructor import XGBScorecardConstructor

# Wrap the fitted booster together with the training data it was fitted on.
scorecard_constructor = XGBScorecardConstructor(
    xgb_model,
    X.loc[ix_train],
    y.loc[ix_train],
)

# Build the scorecard table first; the points step below is run on the same
# constructor afterwards.
xgb_scorecard = scorecard_constructor.construct_scorecard()

# Scaling arguments for the points conversion (score_type='WOE' left disabled).
points_scaling = dict(pdo=50, target_points=600, target_odds=50)
xgb_scorecard_with_points = scorecard_constructor.create_points(**points_scaling)

# Score the test split with the scorecard.
credit_scores = scorecard_constructor.predict_score(X.loc[ix_test])

# Scores are negated before ranking against is_bad -- presumably because a
# higher score means lower risk (TODO confirm against xbooster docs).
gini = 2 * roc_auc_score(y.loc[ix_test], -credit_scores) - 1  # type: ignore

print(f"Test Gini score: {gini:.2%}")

Test Gini score: 89.13%


In [4]:
# Ask the constructor for the training rows' leaves in 'leaf_index' form.
dataframe_with_trees = scorecard_constructor.get_leafs(
    X.loc[ix_train],
    output_type="leaf_index",
)

# Show only the columns at positions 1 through 10.
dataframe_with_trees.iloc[:, 1:11]

Unnamed: 0,tree_1,tree_2,tree_3,tree_4,tree_5,tree_6,tree_7,tree_8,tree_9,tree_10
0,4.0,3.0,4.0,3.0,3.0,4.0,4.0,3.0,4.0,3.0
1,6.0,6.0,5.0,6.0,5.0,5.0,6.0,6.0,5.0,5.0
2,6.0,6.0,5.0,6.0,5.0,5.0,6.0,6.0,5.0,5.0
3,3.0,4.0,3.0,4.0,4.0,3.0,3.0,4.0,3.0,4.0
4,3.0,4.0,3.0,4.0,4.0,3.0,3.0,4.0,3.0,4.0
...,...,...,...,...,...,...,...,...,...,...
6995,4.0,3.0,4.0,3.0,3.0,4.0,3.0,3.0,4.0,3.0
6996,6.0,6.0,5.0,6.0,5.0,5.0,6.0,6.0,5.0,5.0
6997,6.0,6.0,5.0,6.0,5.0,5.0,6.0,6.0,5.0,5.0
6998,6.0,6.0,5.0,6.0,5.0,5.0,6.0,6.0,5.0,5.0


In [5]:
# Small illustration: first six rows, single column at position 2,
# turned into 0/1 indicator columns (first level dropped as the reference).
df = dataframe_with_trees.iloc[:6, 2:3].copy()
tree_2_dummies = pd.get_dummies(df["tree_2"], prefix="tree_2", drop_first=True)
tree_2_dummies.astype(int)

Unnamed: 0,tree_2_4.0,tree_2_6.0
0,0,0
1,0,1
2,0,1
3,1,0
4,1,0
5,0,1


In [6]:
# Training-row leaf values for the columns at positions 1..10, divided by the
# constructor's learning_rate.
(
    scorecard_constructor.get_leafs(X.loc[ix_train])
    .iloc[:, 1:11]
    .div(scorecard_constructor.learning_rate)
)

Unnamed: 0,tree_1,tree_2,tree_3,tree_4,tree_5,tree_6,tree_7,tree_8,tree_9,tree_10
0,-0.278543,-0.500020,-0.365842,-0.350632,-0.460732,-0.231152,-0.641821,-0.384222,-0.363448,-0.436815
1,-1.154928,-1.133223,-1.124693,-1.096582,-1.087804,-1.074142,-1.052737,-1.039665,-1.044419,-1.033982
2,-1.154928,-1.133223,-1.124693,-1.096582,-1.087804,-1.074142,-1.052737,-1.039665,-1.044419,-1.033982
3,1.865259,1.409518,1.305674,1.290506,0.957481,1.232955,0.658707,0.828023,0.762642,0.646442
4,1.865259,1.409518,1.305674,1.290506,0.957481,1.232955,0.658707,0.828023,0.762642,0.646442
...,...,...,...,...,...,...,...,...,...,...
6995,-0.278543,-0.500020,-0.365842,-0.350632,-0.460732,-0.231152,0.658707,-0.384222,-0.363448,-0.436815
6996,-1.154928,-1.133223,-1.124693,-1.096582,-1.087804,-1.074142,-1.052737,-1.039665,-1.044419,-1.033982
6997,-1.154928,-1.133223,-1.124693,-1.096582,-1.087804,-1.074142,-1.052737,-1.039665,-1.044419,-1.033982
6998,-1.154928,-1.133223,-1.124693,-1.096582,-1.087804,-1.074142,-1.052737,-1.039665,-1.044419,-1.033982


In [7]:
from sklearn.linear_model import LogisticRegression

# Leaf output for the training rows using get_leafs' default output type.
# Note: this rebinds dataframe_with_trees, which an earlier cell assigned from
# get_leafs(..., output_type='leaf_index').
dataframe_with_trees = scorecard_constructor.get_leafs(X.loc[ix_train])
feature_trees = dataframe_with_trees.columns

# Logistic regression with one coefficient per tree column plus an intercept.
xgboost_logreg = LogisticRegression(fit_intercept=True)
xgboost_logreg.fit(dataframe_with_trees, y.loc[ix_train])

print(xgboost_logreg.intercept_, xgboost_logreg.coef_)

# Test performance: same leaf transformation on the test split, then Gini = 2 * AUC - 1.
dataframe_with_trees_test = scorecard_constructor.get_leafs(X.loc[ix_test])
preds = xgboost_logreg.predict_proba(dataframe_with_trees_test)[:, 1]
gini = 2 * roc_auc_score(y.loc[ix_test], preds) - 1
print(f"Test Gini score: {gini:.2%}")

[-1.62352526] [[1.2020327  0.97575971 1.15797107 0.94565509 0.95508742 0.90668016
  1.28900426 1.20586514 0.81415158 0.74754902 0.75442047 1.15835286
  1.00462902 1.07186301 1.32491479 0.86561505 0.86455394 1.61863739
  1.00303684 0.94733892 1.0009415  1.07450323 0.8456734  1.26297626
  1.4588525  1.05754771 1.03020566 1.29043286 1.06841534 1.37002718
  1.25652411 1.16154753 1.10961945 1.1119615  1.32266487 1.3082921
  1.14207318 1.50009438 1.2873827  1.02255989 1.20146479 1.19012363
  1.01301803 1.12589038 0.71898659 1.07271603 1.20889274 1.10049925
  0.99414106 1.24884242 1.1818015  1.26451315 1.13617215 0.79867256
  0.79247341 0.97902319 1.36307041 0.88100753 0.99550972 1.00467602
  0.69304552 0.69596907 0.82996096 0.94739084 1.37195452 0.82122443
  0.89276536 1.01786259 1.20976927 0.60941342 0.68983199 0.82426148
  0.81495502 1.01811936 1.07077166 0.69975434 0.61008468 0.77539232
  0.58559521 0.96066049 0.91404242 0.73222261 0.79444167 0.59809685
  0.83246552 0.93542351 0.80768764 

Test Gini score: 89.15%


In [11]:
from scipy.special import expit as sigmoid

# Put the fitted intercept on the probability scale and print it next to the
# scorecard constructor's base_score for comparison.
intercept_as_probability = sigmoid(xgboost_logreg.intercept_.item())
print(intercept_as_probability, scorecard_constructor.base_score)

0.1647192673571 0.16798161


In [9]:
# Preview the scorecard rows with Tree < 4, with the constructor's base score
# attached as an extra constant column.
df_to_show = xgb_scorecard_with_points.query("Tree < 4").assign(
    BaseScore=scorecard_constructor.base_score
)

# Columns to display, in presentation order.
preview_columns = [
    "Tree",
    "Node",
    "Feature",
    "Sign",
    "Split",
    "BaseScore",
    "EventRate",
    "XAddEvidence",
]
df_to_show[preview_columns]

Unnamed: 0,Tree,Node,Feature,Sign,Split,BaseScore,EventRate,XAddEvidence
0,0,3,revolving_utilization_of_unsecured_lines,<,0.609306,0.167982,0.122041,-0.032747
1,0,4,revolving_utilization_of_unsecured_lines,>=,0.609306,0.167982,0.486249,0.225939
2,0,5,revolving_utilization_of_unsecured_lines,<,0.697415,0.167982,0.00174,-0.118699
3,0,6,revolving_utilization_of_unsecured_lines,>=,0.697415,0.167982,0.026991,-0.099913
4,1,3,external_risk_estimate,<,69.0,0.167982,0.454153,0.186526
5,1,4,external_risk_estimate,>=,69.0,0.167982,0.134031,-0.027854
6,1,5,external_risk_estimate,<,61.0,0.167982,0.03038,-0.092684
7,1,6,external_risk_estimate,>=,61.0,0.167982,0.002919,-0.115493
8,2,3,revolving_utilization_of_unsecured_lines,<,0.384021,0.167982,0.097919,-0.050002
9,2,4,revolving_utilization_of_unsecured_lines,>=,0.384021,0.167982,0.429759,0.140952
