# <center> <span style="font-family: Virgil GS, sans-serif; color:#97f788">WOE Naive Bayes Experiment</span> </center>
## <center> <span style="font-family: Virgil GS, sans-serif; color:navy">From event rates to WOE</span> </center>

 <span style="font-family: Virgil GS, sans-serif; color:navy">Author: <a href="https://github.com/deburky" title="GitHub link">https://github.com/deburky</a></span>

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Fetch the blended credit data from Google Drive.
# The file id is the second-to-last path segment of the share link; it is
# appended to the direct-download endpoint so pandas can read the CSV.
share_link = "https://drive.google.com/file/d/1Is8UZnPRExI-SLJMle8GRbFGpha5IvYZ/view?usp=sharing"
drive_file_id = share_link.split("/")[-2]
url = "https://drive.google.com/uc?id=" + drive_file_id
dataset = pd.read_csv(url, index_col=False)

# Columns used as model inputs
features = [
    "external_risk_estimate",
    "revolving_utilization_of_unsecured_lines",
    "account_never_delinq_percent",
    "net_fraction_revolving_burden",
    "num_total_cc_accounts",
    "average_months_in_file",
]

# Feature frame and the "is_bad" target column
X = dataset[features]
y = dataset["is_bad"]

# Split the row labels (not the data) 70/30, stratified on the target, so that
# both X and y can later be sliced with .loc[ix_train] / .loc[ix_test].
ix_train, ix_test = train_test_split(
    X.index,
    test_size=0.3,
    stratify=y,
    random_state=62,
)

<span style="font-family: Virgil GS, sans-serif; color: navy; font-size: 25px;">Scikit-Learn pipeline</span>

In [152]:
import numpy as np
from sklearn.preprocessing import (
    KBinsDiscretizer,
    TargetEncoder, # type: ignore
    FunctionTransformer
)
from sklearn.pipeline import make_pipeline
from scipy.special import logit

# Log-odds of the target on the training rows: ln(p / (1 - p)), where p is the
# mean of y over ix_train.
train_bad_rate = np.mean(y.loc[ix_train])
base_log_odds = np.log(train_bad_rate / (1 - train_bad_rate))

def convert_to_woe(X: np.ndarray, log_odds_offset=None) -> np.ndarray:
    """Convert event rates into Weight-of-Evidence (WOE) scores.

    Each value is mapped to log-odds with ``logit`` and a baseline log-odds is
    subtracted from it: ``WOE = logit(rate) - baseline``.

    Parameters
    ----------
    X : array-like
        Event rates (for example the average default rate per bin). A 0-D or
        1-D input is treated as a single feature.
    log_odds_offset : float, optional
        Baseline log-odds to subtract. When omitted, the notebook-level
        ``base_log_odds`` is looked up at call time.

    Returns
    -------
    np.ndarray
        WOE scores as an array with at least two dimensions: shape ``(n, 1)``
        for 0-D/1-D input, otherwise the same shape as ``X``.
    """
    if log_odds_offset is None:
        # Fall back to the notebook-level baseline
        log_odds_offset = base_log_odds

    # Rates -> log-odds, then subtract the baseline log-odds from them
    woe = logit(np.asarray(X)) - log_odds_offset

    # Promote only 0-D/1-D input to a column. Reshaping an (n, k) array to
    # (-1, 1) would stack every feature into one column and change the row count.
    if woe.ndim < 2:
        woe = woe.reshape(-1, 1)
    return woe

# Training rows of a single feature, shaped (n_samples, 1) as the transformers expect
feature_name = 'revolving_utilization_of_unsecured_lines'
training_feature = X.loc[ix_train, feature_name].to_numpy().reshape(-1, 1)

# Step 1: cut the feature into 10 ordinal bins using k-means bin edges
kmeans_binner = KBinsDiscretizer(n_bins=10, encode="ordinal", strategy="kmeans")
# Step 2: replace each bin id by a target-based encoding; smooth is set very
# small (presumably so the encoding stays close to the raw per-bin mean of y)
bin_rate_encoder = TargetEncoder(smooth=0.0001, cv=2)
# Step 3: turn the encoded rates into WOE scores
woe_step = FunctionTransformer(convert_to_woe, validate=False)

target_encoder_pipeline = make_pipeline(kmeans_binner, bin_rate_encoder, woe_step)

target_encoder_pipeline.fit(training_feature, y.loc[ix_train])



<span style="font-family: Virgil GS, sans-serif; color: navy; font-size: 25px;">Binning table for WOE</span>

In [217]:
from scipy.special import logit

# Replay the fitted pipeline one step at a time so every intermediate result
# (bin id, encoded rate, WOE) is available for the summary table.
fitted_binner = target_encoder_pipeline[0]
fitted_encoder = target_encoder_pipeline[1]
fitted_woe_step = target_encoder_pipeline[2]

binning_output = fitted_binner.transform(training_feature)
# Encoded rates are rounded to 6 decimals before being converted to WOE
encoded_output = fitted_encoder.transform(binning_output).round(6)
woe_output = fitted_woe_step.transform(encoded_output)

# One row per training observation: bin id, encoded rate, WOE score and label
per_row_columns = {
    'binned_feature': binning_output.ravel(),
    'target_encoded_feature': encoded_output.ravel(),
    'woe_feature': woe_output.ravel(),
    'label': y.loc[ix_train].to_numpy(),
}
new_df = pd.DataFrame(per_row_columns)

# Per-bin label counts, bin totals and the observed event rate (6 decimals)
bin_counts = (
    new_df.groupby("binned_feature")["label"]
    .value_counts()
    .unstack()
    .fillna(0)
    .rename(columns={0: "NonEvents", 1: "Events"})
    .assign(
        Total=lambda counts: counts.sum(axis=1),
        EventRate=lambda counts: (counts["Events"] / counts["Total"]).round(6),
    )
    .reset_index(drop=False)
)

# Mean target-encoded value per bin (column is named "mean")
bin_counts_te = (
    new_df.groupby("binned_feature")["target_encoded_feature"]
    .mean()
    .to_frame("mean")
    .reset_index()
)

# WOE produced by the pipeline, one value per bin (max over the bin's rows)
woe_df = (
    new_df.groupby("binned_feature")["woe_feature"]
    .max()
    .to_frame("BackwardsWOE")
    .reset_index()
)

merged_summary = pd.merge(bin_counts, bin_counts_te, how="left", on="binned_feature")

# WOE calculation based on EventRate only:
#   WOE_bin = logit(event rate in the bin) - logit(event rate in the sample)
# The sample event rate is Events / Total. The previous version stored the
# non-event rate under this name and added its logit, which gives the same
# number only because logit(1 - p) == -logit(p).
event_rate_sample = merged_summary["Events"].sum() / merged_summary["Total"].sum()
merged_summary["EventRateWOE"] = logit(merged_summary["EventRate"]) - logit(event_rate_sample)

merged_summary = merged_summary.merge(woe_df, on="binned_feature", how="left")

# Display the final summary
display(merged_summary)

# Check that the event-rate WOE matches the WOE coming out of the pipeline.
# assert_almost_equal raises AssertionError on mismatch and returns None on
# success, so its result is not printed.
np.testing.assert_almost_equal(
    merged_summary["EventRateWOE"].values,
    merged_summary["BackwardsWOE"].values,
    decimal=6,
)

Unnamed: 0,binned_feature,NonEvents,Events,Total,EventRate,mean,EventRateWOE,BackwardsWOE
0,0.0,2377,63,2440,0.02582,0.02582,-1.433222,-1.433222
1,1.0,896,40,936,0.042735,0.042735,-0.911837,-0.911837
2,2.0,623,33,656,0.050305,0.050305,-0.740812,-0.740812
3,3.0,461,27,488,0.055328,0.055328,-0.640334,-0.640334
4,4.0,360,39,399,0.097744,0.097744,-0.025322,-0.025322
5,5.0,268,36,304,0.118421,0.118421,0.189756,0.189756
6,6.0,235,50,285,0.175439,0.175439,0.649665,0.649665
7,7.0,222,63,285,0.221053,0.221053,0.937684,0.937684
8,8.0,221,68,289,0.235294,0.235294,1.018569,1.018569
9,9.0,637,281,918,0.3061,0.3061,1.378809,1.378809


None


<span style="font-family: Virgil GS, sans-serif; color: navy; font-size: 25px;">WOE formula (bad-to-good)</span>

In [218]:
# Calculation for Bin 0
#   WOE_i = ln(event_rate_i / non_event_rate_i)
#           - ln(event_rate_sample / non_event_rate_sample)
is_bin_0 = merged_summary['binned_feature'] == 0
event_rate_i = merged_summary.loc[is_bin_0, 'EventRate'].values
non_event_rate_i = 1 - event_rate_i

# Sample-wide rates. The event rate is Events / Total; previously the two
# names held each other's values and the log-odds term was added instead of
# subtracted, which produced the same number but misdescribed the formula.
event_rate_sample = merged_summary['Events'].sum() / merged_summary['Total'].sum()
non_event_rate_sample = 1 - event_rate_sample

WOE_i = np.log(event_rate_i / non_event_rate_i) - np.log(event_rate_sample / non_event_rate_sample)
print(WOE_i)

[-1.43322213]


In [272]:
# Cross-check: WOE computed directly from counts as
#   ln( (Events_i / total Events) / (NonEvents_i / total NonEvents) )
# Small deviations from the other two columns are expected because those are
# derived from rates rounded to 6 decimals.
total_non_events = merged_summary['NonEvents'].sum()
total_events = merged_summary['Events'].sum()

# Despite the "Cum" prefix these columns hold the overall totals on every row
merged_summary['CumNonEvents'] = total_non_events
merged_summary['CumEvents'] = total_events

event_share = (merged_summary['Events'] / merged_summary['CumEvents']).astype(np.float64)
non_event_share = (merged_summary['NonEvents'] / merged_summary['CumNonEvents']).astype(np.float64)
merged_summary['RealWOE'] = np.log(event_share / non_event_share)

woe_columns = ['EventRateWOE', 'BackwardsWOE', 'RealWOE']
merged_summary[woe_columns]

Unnamed: 0,EventRateWOE,BackwardsWOE,RealWOE
0,-1.433222,-1.433222,-1.433235
1,-0.911837,-0.911837,-0.911836
2,-0.740812,-0.740812,-0.740814
3,-0.640334,-0.640334,-0.640337
4,-0.025322,-0.025322,-0.025318
5,0.189756,0.189756,0.189757
6,0.649665,0.649665,0.649662
7,0.937684,0.937684,0.937682
8,1.018569,1.018569,1.01857
9,1.378809,1.378809,1.37881
