# <span style="font-family: Arial, sans-serif; color:#01afff">Logistic Regression</span>
## <span style="font-family: Arial, sans-serif; color:navy">Comparing models with raw and WOE inputs</span>

Author: https://www.github.com/deburky

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Fetch blended credit data.
# The Google Drive share link is rewritten into the "uc?id=<file id>" form;
# the file id is the second-to-last path segment of the share link.
url = "https://drive.google.com/file/d/1Is8UZnPRExI-SLJMle8GRbFGpha5IvYZ/view?usp=sharing"
file_id = url.split("/")[-2]
url = f"https://drive.google.com/uc?id={file_id}"
dataset = pd.read_csv(url, index_col=False)

# Target column and the model input columns.
target = "is_bad"
features = [
    "revolving_utilization_of_unsecured_lines",
    "account_never_delinq_percent",
    "net_fraction_revolving_burden",
    "external_risk_estimate",
    "num_total_cc_accounts",
    "average_months_in_file",
]

X = dataset[features]
y = dataset[target]

# Split the row labels (not the frames) 70/30, stratified on the target, so
# later cells can slice X and y with the same train/test label sets.
ix_train, ix_test = train_test_split(
    X.index,
    test_size=0.3,
    stratify=y,
    random_state=62,
)

## Raw inputs

Here we use raw numerical data to fit a logistic regression model.

In [2]:
from fisher_scoring import FisherScoringLogisticRegression
from sklearn.metrics import log_loss, roc_auc_score
from scipy.special import expit as sigmoid

# Slice the train/test rows once and reuse them for fitting and scoring.
X_train, y_train = X.loc[ix_train, :], y[ix_train]
X_test, y_test = X.loc[ix_test, :], y[ix_test]

# Fit on the raw (unbinned) features.
model = FisherScoringLogisticRegression(
    use_bias=True,
    information="expected",
    max_iter=5,
    verbose=True,
)
model.fit(X_train, y_train)

# Transposed so the fitted weights print as a single row.
model_weights = model.beta.T
print(f"Coefficients: {model_weights}")

# The first weight is treated as the bias (intercept) term; the sigmoid maps
# it to the predicted probability when every feature equals zero.
p_of_bias = sigmoid(model_weights[:, 0]).flatten().item()
print(f"Probability of bias: {p_of_bias:.2%}")

# Hold-out discrimination: Gini = 2 * AUC - 1.
predictions = model.predict_proba(X_test)[:, 1]
gini = 2 * roc_auc_score(y_test, predictions) - 1
print(f"Gini (test): {gini:.2%}")

# Hold-out calibration.
log_loss_score = log_loss(y_test, predictions)
print(f"Log loss (test): {log_loss_score:.2f}")

# Display the summary of the model
model.display_summary(style="deep_sky_blue1")

Starting Fisher Scoring Iterations...
Iteration: 1, Log Loss: 0.6931
Iteration: 2, Log Loss: 0.2799
Iteration: 3, Log Loss: 0.2231
Iteration: 4, Log Loss: 0.2096
Iteration: 5, Log Loss: 0.2080
Maximum iterations reached without convergence.
Coefficients: [[ 5.8040147   2.87241899 -0.09790921  0.02463083 -0.01566946  0.03816645
  -0.00896689]]
Probability of bias: 99.70%
Gini (test): 79.74%
Log loss (test): 0.21


In [3]:
from sklearn.linear_model import LogisticRegression
from scipy.special import expit as sigmoid

# Unpenalised scikit-learn logistic regression on the same raw features and
# the same train/test rows, as a reference fit.
sk_model = LogisticRegression(penalty=None, solver="newton-cg")
sk_model.fit(X.loc[ix_train, :], y[ix_train])

# Hold-out discrimination: Gini = 2 * AUC - 1.
sk_predictions = sk_model.predict_proba(X.loc[ix_test, :])[:, 1]
sk_auc = roc_auc_score(y[ix_test], sk_predictions)
sk_gini = 2 * sk_auc - 1
print(f"Gini (test): {sk_gini:.2%}")

# scikit-learn keeps the intercept separate from the feature coefficients.
print(f"Intercept: {sk_model.intercept_}")
print(f"Coefficients: {sk_model.coef_}")

# Sigmoid of the intercept: predicted probability when every feature is zero.
p_of_bias = sigmoid(sk_model.intercept_).flatten().item()
print(f"Probability of bias: {p_of_bias:.2%}")

Gini (test): 79.73%
Intercept: [5.78775471]
Coefficients: [[ 2.87751276 -0.09778669  0.02464418 -0.01564931  0.03816656 -0.00897061]]
Probability of bias: 99.69%


## WOE inputs

Weight of Evidence (WOE) converts numerical ranges to a set of discrete categories each containing a log likelihood ratio. 

The value of WOE tells us how strongly the data (evidence) support the hypothesis (e.g., default or no default).

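In the traditional credit scoring methodology, the coefficients are expected to be negative because the WOE of each feature bin is the log of the ratio of the non-event to the event distribution:

$\mathrm{WOE}_i = \ln\left(\dfrac{P(X=x_i|Y=0)}{P(X=x_i|Y=1)}\right)$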

### WOE via conditional probabilities (OptBinning)

We use the [**OptBinning**](https://gnpalencia.org/optbinning/index.html) library to preprocess the inputs.

In [4]:
from optbinning import BinningProcess
from fisher_scoring import FisherScoringLogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import roc_auc_score
from scipy.special import expit as sigmoid

# Pipeline: the OptBinning binning process transforms the features, then the
# Fisher scoring logistic regression is fitted on its output.
binning_process = BinningProcess(
    variable_names=features,
    categorical_variables=[],
)
model = FisherScoringLogisticRegression(
    use_bias=True,
    information="expected",
    verbose=True,
)
woe_logistic_model = make_pipeline(binning_process, model)
woe_logistic_model.fit(X.loc[ix_train, :], y[ix_train])

# The last pipeline step is the fitted logistic regression.
fitted_model = woe_logistic_model[-1]
model_weights = fitted_model.beta.T
print(f"Coefficients: {model_weights}")

# Hold-out discrimination: Gini = 2 * AUC - 1.
predictions = woe_logistic_model.predict_proba(X.loc[ix_test, :])[:, 1]
auc = roc_auc_score(y[ix_test], predictions)
gini = 2 * auc - 1
print(f"Gini (test): {gini:.2%}")

# Hold-out calibration.
log_loss_score = log_loss(y[ix_test], predictions)
print(f"Log loss (test): {log_loss_score:.2f}")

# The first weight is treated as the bias (intercept) term.
p_of_bias = sigmoid(model_weights[:, 0]).flatten().item()
print(f"Probability of bias: {p_of_bias:.2%}")

# Display the summary of the model
fitted_model.display_summary(style="light_slate_blue")

(CVXPY) Sep 28 08:18:48 PM: Encountered unexpected exception importing solver GLOP:
RuntimeError('Unrecognized new version of ortools (9.10.4067). Expected < 9.10.0. Please open a feature request on cvxpy to enable support for this version.')
(CVXPY) Sep 28 08:18:48 PM: Encountered unexpected exception importing solver PDLP:
RuntimeError('Unrecognized new version of ortools (9.10.4067). Expected < 9.10.0. Please open a feature request on cvxpy to enable support for this version.')
Starting Fisher Scoring Iterations...
Iteration: 1, Log Loss: 0.6931
Iteration: 2, Log Loss: 0.2724
Iteration: 3, Log Loss: 0.2002
Iteration: 4, Log Loss: 0.1740
Iteration: 5, Log Loss: 0.1665
Iteration: 6, Log Loss: 0.1654
Iteration: 7, Log Loss: 0.1653
Iteration: 8, Log Loss: 0.1653
Iteration: 9, Log Loss: 0.1653
Iteration: 10, Log Loss: 0.1653
Convergence reached after 10 iterations.
Coefficients: [[-2.25705117 -1.01248996 -1.02589516 -0.39071063 -0.77812967 -1.25793343
  -0.45646565]]
Gini (test): 88.30%


### WOE via Bayes factor (custom)

This approach is based on the Turing-Good Bayes factor. It differs from the conventional WOE calculation with conditional probabilities in that we use a target encoder to derive WOE from each bin's event probability relative to the average event rate and its complement.

We use a custom scikit-learn implementation to show this approach in action.

In [13]:
import numpy as np
from sklearn.preprocessing import (
    KBinsDiscretizer,
    TargetEncoder,
    FunctionTransformer
)
from sklearn.pipeline import make_pipeline
from scipy.special import logit

# Log-odds of the average event rate on the training rows; convert_to_woe
# measures each encoded probability against this reference level.
train_event_rate = np.mean(y.loc[ix_train])
base_log_odds = np.log(train_event_rate / (1 - train_event_rate))

def convert_to_woe(X: pd.DataFrame) -> pd.DataFrame:
    """Turn encoded event probabilities into negated weight-of-evidence values.

    Every entry of ``X`` is treated as a probability, mapped to log-odds with
    ``logit`` and subtracted from the module-level ``base_log_odds``.

    Probabilities are clipped to ``[eps, 1 - eps]`` before the transform so
    that values of exactly 0 or 1 yield large finite numbers. Shifting by
    ``+ eps`` alone protected only the lower bound: an input of 1.0 became
    ``1 + eps``, for which ``logit`` returns NaN.

    Parameters
    ----------
    X : pd.DataFrame
        Probabilities in [0, 1], one column per feature.

    Returns
    -------
    pd.DataFrame
        ``base_log_odds - logit(p)`` with the same columns and index as ``X``.
    """
    eps = 1e-8
    # Keep probabilities strictly inside (0, 1) so logit stays finite.
    X_log_odds = logit(X.clip(lower=eps, upper=1 - eps))
    X_woe = base_log_odds - X_log_odds  # negate WOE for scoring
    return pd.DataFrame(X_woe, columns=X.columns, index=X.index)

# Bayes-factor WOE encoder, in three stages:
#   1. KBinsDiscretizer: cut each numeric feature into 10 ordinal k-means bins.
#   2. TargetEncoder: replace each bin code with a smoothed target mean.
#   3. convert_to_woe: turn those probabilities into negated WOE values.
# The first two stages emit pandas output so that convert_to_woe receives a
# DataFrame with feature names (it reads X.columns and X.index).
# TargetEncoder shuffles its cross-fitting folds by default, so random_state
# is pinned (same seed as the train/test split) to make the encoded training
# features, and therefore the fitted model, reproducible across runs.
bayes_factor_encoder = make_pipeline(
    KBinsDiscretizer(
        n_bins=10, encode="ordinal", strategy="kmeans"
    ).set_output(transform="pandas"),
    TargetEncoder(smooth=1.0, cv=5, random_state=62).set_output(transform="pandas"),
    FunctionTransformer(
        convert_to_woe, validate=False, feature_names_out="one-to-one"
    ),
)

In [14]:
from optbinning import BinningProcess
from fisher_scoring import FisherScoringLogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import roc_auc_score
from scipy.special import expit as sigmoid

# Chain the Bayes-factor WOE encoder with the Fisher scoring logistic model.
model = FisherScoringLogisticRegression(
    use_bias=True,
    information="expected",
    max_iter=7,
    verbose=True,
)
woe_logistic_model = make_pipeline(bayes_factor_encoder, model)
woe_logistic_model.fit(X.loc[ix_train, :], y[ix_train])

# The last pipeline step is the fitted logistic regression.
fitted_model = woe_logistic_model[-1]
model_weights = fitted_model.beta.T
print(f"Coefficients: {model_weights}")

# Hold-out discrimination: Gini = 2 * AUC - 1.
predictions = woe_logistic_model.predict_proba(X.loc[ix_test, :])[:, 1]
auc = roc_auc_score(y[ix_test], predictions)
gini = 2 * auc - 1
print(f"Gini (test): {gini:.2%}")

# Hold-out calibration.
log_loss_score = log_loss(y[ix_test], predictions)
print(f"Log loss (test): {log_loss_score:.2f}")

# The first weight is treated as the bias (intercept) term.
p_of_bias = sigmoid(model_weights[:, 0]).flatten().item()
print(f"Probability of bias: {p_of_bias:.2%}")

# Display the summary of the model
fitted_model.display_summary(style="deep_pink2")

Starting Fisher Scoring Iterations...
Iteration: 1, Log Loss: 0.6931
Iteration: 2, Log Loss: 0.2732
Iteration: 3, Log Loss: 0.2034
Iteration: 4, Log Loss: 0.1798
Iteration: 5, Log Loss: 0.1742
Iteration: 6, Log Loss: 0.1736
Iteration: 7, Log Loss: 0.1736
Maximum iterations reached without convergence.
Coefficients: [[-2.19433421 -1.02561293 -1.00022244 -0.3158842  -0.8153639  -0.81020418
  -0.63188525]]
Gini (test): 88.30%
Log loss (test): 0.17
Probability of bias: 10.03%
