## <center><span style="font-family: ClearSans, sans-serif; color:#00BBD7">Enhancing Explainability in Credit Scoring</span></center>
## <span style="font-family: ClearSans, sans-serif; color:navyblue">Dataset</span>


<span style="font-family: ClearSans, sans-serif; color:navyblue">Author: <a href="https://github.com/deburky" title="GitHub link">https://github.com/deburky</a></span>

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

### <span style="font-family: ClearSans, sans-serif; color:#00BBD7">Dataset A: FICO xML Challenge</span>

In [2]:
dataset_A = pd.read_csv("data/dataset_A.csv")

In [3]:
# processing target variables
dataset_A["RiskPerformance"].replace({"Good": 0, "Bad": 1}, inplace=True)
dataset_A.rename(columns={"RiskPerformance": "is_bad"}, inplace=True)

# processing special codes
special_codes = [-9, -8, -7]

In [4]:
feats_dataset_A = [
    "ExternalRiskEstimate",
    "MSinceOldestTradeOpen",
    "MSinceMostRecentTradeOpen",
    "AverageMInFile",
    "NumSatisfactoryTrades",
    "NumTrades60Ever2DerogPubRec",
    "NumTrades90Ever2DerogPubRec",
    "PercentTradesNeverDelq",
    "MSinceMostRecentDelq",
    "NumTradesOpeninLast12M",
    "MSinceMostRecentInqexcl7days",
    "NumInqLast6M",
    "NumInqLast6Mexcl7days",
    "NetFractionRevolvingBurden",
    "NetFractionInstallBurden",
    "NumBank2NatlTradesWHighUtilization",
]

In [5]:
dataset_A = dataset_A[feats_dataset_A + ["is_bad"]].copy()

In [6]:
len(dataset_A)

10459

### <span style="font-family: ClearSans, sans-serif; color:#00BBD7">Dataset B: Lending Club</span>

In [7]:
dataset_B = pd.read_csv("data/dataset_B.csv")

In [8]:
dataset_B.rename(columns={"delinq_2y": "is_bad"}, inplace=True)
dataset_B["is_bad"] = np.where(dataset_B["is_bad"] > 0, 1, 0)

In [9]:
feats_dataset_B = [
    "annual_income",
    "debt_to_income",
    "emp_length",
    "loan_amount",
    "total_credit_limit",
    "total_credit_utilized",
    "current_installment_accounts",
    "paid_total",
    "num_mort_accounts",
    "account_never_delinq_percent",
    "balance",
    "num_historical_failed_to_pay",
    "num_total_cc_accounts",
    "num_open_cc_accounts",
    "num_cc_carrying_balance",
]

In [10]:
dataset_B = dataset_B[feats_dataset_B + ["is_bad"]].copy()
dataset_B["debt_to_income"] = np.where(
    dataset_B["debt_to_income"].isna(), 1, dataset_B["debt_to_income"]
)
dataset_B["emp_length"] = np.where(
    dataset_B["emp_length"].isna(), 0, dataset_B["debt_to_income"]
)

In [11]:
len(dataset_B)

10000

### <span style="font-family: ClearSans, sans-serif; color:#00BBD7">Dataset C: Give Me Some Credit</span>

In [12]:
dataset_C = pd.read_csv("data/dataset_C.csv")

In [13]:
dataset_C.rename(columns={"SeriousDlqin2yrs": "is_bad"}, inplace=True)
dataset_C["MonthlyIncome"] = np.where(
    dataset_C["MonthlyIncome"].isna(), 0, dataset_C["MonthlyIncome"]
)
dataset_C["NumberOfDependents"] = np.where(
    dataset_C["NumberOfDependents"].isna(), 0, dataset_C["NumberOfDependents"]
)
dataset_C["RevolvingUtilizationOfUnsecuredLines"] = np.clip(
    dataset_C["RevolvingUtilizationOfUnsecuredLines"], 0, 1
)

dataset_C["DebtRatio"] = np.clip(dataset_C["DebtRatio"], 0, 1)

In [14]:
feats_dataset_C = [
    "age",
    "DebtRatio",
    "MonthlyIncome",
    "NumberOfOpenCreditLinesAndLoans",
    "NumberOfDependents",
    "NumberRealEstateLoansOrLines",
    "RevolvingUtilizationOfUnsecuredLines",
]

In [15]:
dataset_C = dataset_C[feats_dataset_C + ["is_bad"]].copy()

In [16]:
len(dataset_C)

150000

### <span style="font-family: ClearSans, sans-serif; color:#00BBD7">Blending</span>

In [17]:
desired_sample_size_bad = 1000

# Sample 'is_bad' labels from each dataset
bad_A = (
    dataset_A[dataset_A["is_bad"] == 1]["is_bad"]
    .sample(n=desired_sample_size_bad, replace=True, random_state=42)
    .reset_index(drop=True)
)

bad_B = (
    dataset_B[dataset_B["is_bad"] == 1]["is_bad"]
    .sample(n=desired_sample_size_bad, replace=True, random_state=42)
    .reset_index(drop=True)
)

bad_C = (
    dataset_C[dataset_C["is_bad"] == 1]["is_bad"]
    .sample(n=desired_sample_size_bad, replace=True, random_state=42)
    .reset_index(drop=True)
)

# Concatenate the "bad" rows from all datasets
bad_D = (
    pd.concat([bad_A, bad_B, bad_C], axis=0)
    .sample(n=desired_sample_size_bad, replace=True, random_state=42)
    .reset_index(drop=True)
)

feat_A = (
    dataset_A[dataset_A.is_bad == 1]
    .sample(n=len(bad_A), replace=True, random_state=42)
    .drop(columns="is_bad")
    .reset_index(drop=True)
)
feat_B = (
    dataset_B[dataset_B.is_bad == 1]
    .sample(n=len(bad_B), replace=True, random_state=42)
    .drop(columns="is_bad")
    .reset_index(drop=True)
)
feat_C = (
    dataset_C[dataset_C.is_bad == 1]
    .sample(n=len(bad_C), replace=True, random_state=42)
    .drop(columns="is_bad")
    .reset_index(drop=True)
)

feat_D = pd.concat([feat_A, feat_B, feat_C], axis=1)

sample_bads = pd.concat([feat_D, bad_D], axis=1)

In [18]:
desired_sample_size_goods = 9000

# Sample 'is_bad' labels from each dataset
good_A = (
    dataset_A[dataset_A["is_bad"] == 0]["is_bad"]
    .sample(n=desired_sample_size_goods, replace=True, random_state=42)
    .reset_index(drop=True)
)

good_B = (
    dataset_B[dataset_B["is_bad"] == 0]["is_bad"]
    .sample(n=desired_sample_size_goods, replace=True, random_state=42)
    .reset_index(drop=True)
)

good_C = (
    dataset_C[dataset_C["is_bad"] == 0]["is_bad"]
    .sample(n=desired_sample_size_goods, replace=True, random_state=42)
    .reset_index(drop=True)
)

# Concatenate the "bad" rows from all datasets
good_D = (
    pd.concat([good_A, good_B, good_C], axis=0)
    .sample(n=desired_sample_size_goods, replace=True, random_state=42)
    .reset_index(drop=True)
)

feat_A = (
    dataset_A[dataset_A.is_bad == 0]
    .sample(n=len(good_A), replace=True, random_state=42)
    .drop(columns="is_bad")
    .reset_index(drop=True)
)
feat_B = (
    dataset_B[dataset_B.is_bad == 0]
    .sample(n=len(good_B), replace=True, random_state=42)
    .drop(columns="is_bad")
    .reset_index(drop=True)
)
feat_C = (
    dataset_C[dataset_C.is_bad == 0]
    .sample(n=len(good_C), replace=True, random_state=42)
    .drop(columns="is_bad")
    .reset_index(drop=True)
)

feat_D = pd.concat([feat_A, feat_B, feat_C], axis=1)

sample_goods = pd.concat([feat_D, good_D], axis=1)

In [19]:
dataset_D = pd.concat([sample_bads, sample_goods], axis=0)

In [20]:
dataset_D["is_bad"].mean()

0.1

### <span style="font-family: ClearSans, sans-serif; color:#00BBD7">Renaming and monotonic constraints</span>

In [21]:
data_dict = {
    "ExternalRiskEstimate": {
        "standardized_attribute_name": "external_risk_estimate",
        "causal_knowledge": -1,
    },
    "MSinceOldestTradeOpen": {
        "standardized_attribute_name": "months_since_oldest_trade_open",
        "causal_knowledge": -1,
    },
    "MSinceMostRecentTradeOpen": {
        "standardized_attribute_name": "months_since_most_recent_trade_open",
        "causal_knowledge": -1,
    },
    "AverageMInFile": {
        "standardized_attribute_name": "average_months_in_file",
        "causal_knowledge": -1,
    },
    "NumSatisfactoryTrades": {
        "standardized_attribute_name": "num_satisfactory_trades",
        "causal_knowledge": -1,
    },
    "NumTrades60Ever2DerogPubRec": {
        "standardized_attribute_name": "num_trades_60_ever_2_derog_pub_rec",
        "causal_knowledge": 1,
    },
    "NumTrades90Ever2DerogPubRec": {
        "standardized_attribute_name": "num_trades_90_ever_2_derog_pub_rec",
        "causal_knowledge": 1,
    },
    "PercentTradesNeverDelq": {
        "standardized_attribute_name": "percent_trades_never_delq",
        "causal_knowledge": -1,
    },
    "MSinceMostRecentDelq": {
        "standardized_attribute_name": "months_since_most_recent_delq",
        "causal_knowledge": -1,
    },
    "NumTradesOpeninLast12M": {
        "standardized_attribute_name": "num_trades_open_in_last_12m",
        "causal_knowledge": 1,
    },
    "MSinceMostRecentInqexcl7days": {
        "standardized_attribute_name": "months_since_most_recent_inqexcl7days",
        "causal_knowledge": -1,
    },
    "NumInqLast6M": {
        "standardized_attribute_name": "num_inq_last_6m",
        "causal_knowledge": 1,
    },
    "NumInqLast6Mexcl7days": {
        "standardized_attribute_name": "num_inq_last_6m_excl7days",
        "causal_knowledge": 1,
    },
    "NetFractionRevolvingBurden": {
        "standardized_attribute_name": "net_fraction_revolving_burden",
        "causal_knowledge": 1,
    },
    "NetFractionInstallBurden": {
        "standardized_attribute_name": "net_fraction_install_burden",
        "causal_knowledge": 1,
    },
    "NumBank2NatlTradesWHighUtilization": {
        "standardized_attribute_name": "num_bank_2_natl_trades_w_high_utilization",
        "causal_knowledge": 1,
    },
    "emp_length": {"standardized_attribute_name": "emp_length", "causal_knowledge": -1},
    "annual_income": {
        "standardized_attribute_name": "annual_income",
        "causal_knowledge": 1,
    },
    "debt_to_income": {
        "standardized_attribute_name": "debt_to_income",
        "causal_knowledge": 1,
    },
    "total_credit_limit": {
        "standardized_attribute_name": "total_credit_limit",
        "causal_knowledge": 1,
    },
    "total_credit_utilized": {
        "standardized_attribute_name": "total_credit_utilized",
        "causal_knowledge": 1,
    },
    "num_historical_failed_to_pay": {
        "standardized_attribute_name": "num_historical_failed_to_pay",
        "causal_knowledge": 1,
    },
    "current_installment_accounts": {
        "standardized_attribute_name": "current_installment_accounts",
        "causal_knowledge": 1,
    },
    "num_total_cc_accounts": {
        "standardized_attribute_name": "num_total_cc_accounts",
        "causal_knowledge": 1,
    },
    "num_open_cc_accounts": {
        "standardized_attribute_name": "num_open_cc_accounts",
        "causal_knowledge": 1,
    },
    "num_cc_carrying_balance": {
        "standardized_attribute_name": "num_cc_carrying_balance",
        "causal_knowledge": 1,
    },
    "num_mort_accounts": {
        "standardized_attribute_name": "num_mort_accounts",
        "causal_knowledge": -1,
    },
    "account_never_delinq_percent": {
        "standardized_attribute_name": "account_never_delinq_percent",
        "causal_knowledge": 1,
    },
    "loan_amount": {
        "standardized_attribute_name": "loan_amount",
        "causal_knowledge": 1,
    },
    "balance": {"standardized_attribute_name": "balance", "causal_knowledge": 1},
    "paid_total": {"standardized_attribute_name": "paid_total", "causal_knowledge": -1},
    "age": {"standardized_attribute_name": "age", "causal_knowledge": -1},
    "DebtRatio": {"standardized_attribute_name": "debt_ratio", "causal_knowledge": -1},
    "MonthlyIncome": {
        "standardized_attribute_name": "monthly_income",
        "causal_knowledge": -1,
    },
    "NumberOfOpenCreditLinesAndLoans": {
        "standardized_attribute_name": "number_of_open_credit_lines_and_loans",
        "causal_knowledge": 1,
    },
    "NumberRealEstateLoansOrLines": {
        "standardized_attribute_name": "number_real_estate_loans_or_lines",
        "causal_knowledge": 1,
    },
    "NumberOfDependents": {
        "standardized_attribute_name": "number_of_dependents",
        "causal_knowledge": 1,
    },
    "RevolvingUtilizationOfUnsecuredLines": {
        "standardized_attribute_name": "revolving_utilization_of_unsecured_lines",
        "causal_knowledge": 1,
    },
}

In [22]:
# standardizing names of columns
column_mapping = {
    col: data_dict[col]["standardized_attribute_name"] if col in data_dict else col
    for col in dataset_D.columns
}

dataset_D.rename(columns=column_mapping, inplace=True)

In [23]:
feats_connected = [
    "percent_trades_never_delq",
    "account_never_delinq_percent",
    "total_credit_utilized",
    "revolving_utilization_of_unsecured_lines",
    "annual_income",
    "monthly_income",
    "debt_ratio",
    "debt_to_income",
    "number_of_open_credit_lines_and_loans",
    "num_total_cc_accounts",
    "num_open_cc_accounts",
]

### <span style="font-family: ClearSans, sans-serif; color:#00BBD7">Removal of redundant features</span>

In [24]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

In [25]:
X = dataset_D.copy()
y = X.pop("is_bad")

ix_train, ix_test = train_test_split(X.index, stratify=y, random_state=62)

log_reg = LogisticRegression()

for feat in feats_connected:
    log_reg.fit(X[feat].loc[ix_train].values.reshape(-1, 1), y.loc[ix_train])
    pred = log_reg.predict_proba(X[feat].loc[ix_test].values.reshape(-1, 1))[:, 1]
    gini_score = roc_auc_score(y.loc[ix_test], pred) * 2 - 1
    print(f"{feat}: {gini_score:.2%}")

percent_trades_never_delq: 29.84%
account_never_delinq_percent: 72.51%
total_credit_utilized: 8.31%
revolving_utilization_of_unsecured_lines: 54.60%
annual_income: -7.80%
monthly_income: 5.74%
debt_ratio: 9.06%
debt_to_income: 10.42%
number_of_open_credit_lines_and_loans: 2.93%
num_total_cc_accounts: 12.83%
num_open_cc_accounts: 1.00%


In [26]:
dataset_D.drop(
    columns=[
        "total_credit_utilized",
        "number_of_open_credit_lines_and_loans",
        "num_open_cc_accounts",
        "annual_income",
        "debt_ratio",
    ],
    inplace=True,
)

In [32]:
dataset_D.to_parquet("blended_dataset.parquet", engine="pyarrow")

In [27]:
monotonic_constraints_woe = {}
monotonic_constraints_tree = {}

for col, attributes in data_dict.items():
    standardized_name = attributes["standardized_attribute_name"]
    if standardized_name in dataset_D.columns:
        # Check the value of 'causal_knowledge' to determine the monotonic constraint
        if attributes["causal_knowledge"] == -1:
            monotonic_constraints_woe[standardized_name] = {
                "monotonic_trend": "descending"
            }
            monotonic_constraints_tree[standardized_name] = -1
        elif attributes["causal_knowledge"] == 1:
            monotonic_constraints_woe[standardized_name] = {
                "monotonic_trend": "ascending"
            }
            monotonic_constraints_tree[standardized_name] = 1
monotonic_constraints_xgb = [value for key, value in monotonic_constraints_tree.items()]