In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import ks_2samp
from IPython.display import display
import os
import sys
import pickle

import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
from sklearn.model_selection import train_test_split, cross_val_score, learning_curve
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score, log_loss, roc_curve, precision_recall_curve, confusion_matrix, classification_report, average_precision_score
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE

# binning
try:
    from optbinning import OptimalBinning
except:
    ! pip install optbinning
    from optbinning import OptimalBinning

# silence warnings
import warnings
warnings.filterwarnings('ignore')


from Stored_Functions_and_Params.impute_using_bounds import ImputeUsingBounds
from Stored_Functions_and_Params.data_imputation import DataImputation
from Stored_Functions_and_Params.woe_transformer import WoETransformer
from Stored_Functions_and_Params.model_scorer import ModelScorer

(CVXPY) Apr 18 12:56:44 PM: Encountered unexpected exception importing solver GLOP:
RuntimeError('Unrecognized new version of ortools (9.11.4210). Expected < 9.10.0. Please open a feature request on cvxpy to enable support for this version.')
(CVXPY) Apr 18 12:56:44 PM: Encountered unexpected exception importing solver PDLP:
RuntimeError('Unrecognized new version of ortools (9.11.4210). Expected < 9.10.0. Please open a feature request on cvxpy to enable support for this version.')


In [2]:
# Path of the raw test CSV that this notebook reads and scores.
# NOTE(review): absolute mapped-drive path — the notebook only runs where M: is available.

file_path = r'M:/Risk Management/DW/Scorecard/Capacity Model/01_Input_Files/df_test_raw.csv'

In [3]:
# Read the raw test file into a DataFrame.
# low_memory=False has pandas infer each column's dtype from the whole file
# rather than chunk by chunk, which avoids mixed-dtype columns.

aa_df = pd.read_csv(file_path, low_memory=False)

In [4]:
# Columns to keep from the raw file; aa_df is subset to exactly this list,
# so any column not named here is dropped before imputation and scoring.

cols_to_keep = [
    'BIGACCOUNTID',
    'APPLICATIONDATE',
    'SUM_OF_COMBINED_INCOME',
    'PAYMENT',
    'INT_N_EMPS',
    'PTI',
    'LTV',
    'FLTADVANCE',
    'TOTAL_INCOME',
    'FLTDOWNCASH',
    'VEHICLEYEAR',
    'BIGMILEAGE_ODOMETER',
    'DTI',
    'NUMOFDEBTS',
    'BITTARGET24MONTHS',
    'INQUIRYBANKING12MONTH',
    'INQUIRYAUTO12MONTH',
    'INQUIRYCOLLECTIONS12MONTH',
    'ADDRCHANGECOUNT06MONTH',
    'ADDRCHANGECOUNT12MONTH',
    'ADDRCHANGECOUNT24MONTH',
    'ADDRCHANGECOUNT60MONTH',
    'ADDRCURRENTLENGTHOFRES',
    'ADDRCURRENTTIMENEWEST',
    'ADDRCURRENTTIMEOLDEST',
    'ADDRINPUTTIMENEWEST',
    'ADDRINPUTTIMEOLDEST',
    'ADDRINPUTLENGTHOFRES',
    'ADDRPREVIOUSLENGTHOFRES',
    'CONFIRMATIONINPUTADDRESS',
    'ADDRINPUTSUBJECTCOUNT',
    'EVICTIONCOUNT',
    'ADDRSTABILITYINDEX',
    'ADDRONFILECOUNT',
    'ADDRINPUTMATCHINDEX',
    'ADDRINPUTOWNERSHIPINDEX',
    'ASSETPROP'
]


In [5]:
# Subset df to cols to keep.
# The explicit .copy() hands the imputation steps below an independent frame,
# so pandas does not treat later writes as writes to a slice of the raw data.

aa_df = aa_df[cols_to_keep].copy()

In [6]:
# Run the project's ImputeUsingBounds helper on TOTAL_INCOME.
# NOTE(review): the tuple is presumably (lower bound, upper bound, replacement value) —
# confirm against the ImputeUsingBounds implementation.

bounds = {"TOTAL_INCOME": (2150, 15000, 2150)}
imputer = ImputeUsingBounds(bounds)
aa_df = imputer.process(aa_df)


Processing column: TOTAL_INCOME


In [7]:
# Run the project's DataImputation step over the whole frame.
# Per the log printed below, it replaces special values (-1) with NaN and then
# imputes the NaNs column by column (e.g. 'constant' or 'max' strategies).
# NOTE(review): `imputer` is rebound here from ImputeUsingBounds to DataImputation.
imputer = DataImputation()
aa_df = imputer.process(aa_df)

Replaced special values [-1] in column 'INQUIRYBANKING12MONTH' with NaN.
Replaced special values [-1] in column 'INQUIRYAUTO12MONTH' with NaN.
Replaced special values [-1] in column 'ADDRCHANGECOUNT24MONTH' with NaN.
Replaced special values [-1] in column 'ADDRCHANGECOUNT60MONTH' with NaN.
Replaced special values [-1] in column 'ADDRINPUTLENGTHOFRES' with NaN.
Replaced special values [-1] in column 'ADDRPREVIOUSLENGTHOFRES' with NaN.
Replaced special values [-1] in column 'CONFIRMATIONINPUTADDRESS' with NaN.
Replaced special values [-1] in column 'ADDRINPUTMATCHINDEX' with NaN.
Replaced special values [-1] in column 'ADDRINPUTOWNERSHIPINDEX' with NaN.
Replaced special values [-1] in column 'ASSETPROP' with NaN.
Imputed 85 NaNs in column 'INQUIRYBANKING12MONTH' using strategy 'constant'.
Imputed 85 NaNs in column 'INQUIRYAUTO12MONTH' using strategy 'constant'.
Imputed 85 NaNs in column 'ADDRCHANGECOUNT24MONTH' using strategy 'max' with value 4.0.
Imputed 85 NaNs in column 'ADDRCHANGECOU

In [8]:
 # Features of the reduced model: the same names, in the same order, as the keys of model_coeffs.
 # NOTE(review): this cell starts with a stray leading space; IPython appears to tolerate it,
 # but verify before exporting the notebook to a plain .py script.
 list_vars_reduced = [
     'TOTAL_INCOME',
     'ADDRCHANGECOUNT24MONTH',
     'ADDRCHANGECOUNT60MONTH',
     'ADDRINPUTLENGTHOFRES',
     'ADDRPREVIOUSLENGTHOFRES',
     'CONFIRMATIONINPUTADDRESS',
     'ADDRINPUTMATCHINDEX',
     'ADDRINPUTOWNERSHIPINDEX',
     'ASSETPROP'
]

In [9]:
# Load the saved binning models (looked up by feature name later on: binning_models[var]).
# NOTE(review): pickle.load can execute arbitrary code from the file — only load
# pickles from a trusted, access-controlled location.
with open(r'M:/Risk Management/DW/Scorecard/Capacity Model/Stored_Functions_and_Params/binning_models.pkl', 'rb') as f:
    binning_models = pickle.load(f)


In [10]:
# Model coefficients keyed by feature name; each one is multiplied by the feature's
# WoE value to get that feature's contribution to the score.
# The intercept entry is commented out, so anything summed from this dict excludes it.
model_coeffs = {# 'Intercept': -0.000533,
    'TOTAL_INCOME': -0.817063,
    'ADDRCHANGECOUNT24MONTH': -0.223352,
    'ADDRCHANGECOUNT60MONTH': -0.390745,
    'ADDRINPUTLENGTHOFRES': -0.435667,
    'ADDRPREVIOUSLENGTHOFRES': -0.452777,
    'CONFIRMATIONINPUTADDRESS': -0.553535,
    'ADDRINPUTMATCHINDEX': 0.052233,
    'ADDRINPUTOWNERSHIPINDEX': -0.236553,
    'ASSETPROP': -0.511746
               }

In [11]:
# def get_bin_contributions(var, binning_model, coef):
#     table = binning_model.binning_table.build()
#     table = table[['Bin', 'WoE']].copy()
#     table['Feature'] = var
#     table['Feature_Coefficient'] = coef
#     table['Bin_Contribution'] = table['WoE'] * coef
#     table['Abs_Bin_Contribution'] = table['Bin_Contribution'].abs()
#     return table


In [12]:
# def get_bin_contributions(var, binning_model, coef):
#     table = binning_model.binning_table.build()
#     table = table[['Bin', 'WoE']].copy()
    
#     # Force WoE to be numeric
#     table['WoE'] = pd.to_numeric(table['WoE'], errors='coerce')  # Will turn invalid values into NaN

#     table['Feature'] = var
#     table['Feature_Coefficient'] = coef
#     table['Bin_Contribution'] = table['WoE'] * coef
#     table['Abs_Bin_Contribution'] = table['Bin_Contribution'].abs()
    
#     return table


In [13]:
# def get_bin_contributions(var, binning_model, coef):
#     table = binning_model.binning_table.build()
#     table = table[['Bin', 'WoE']].copy()

#     # Ensure WoE is numeric
#     table['WoE'] = pd.to_numeric(table['WoE'], errors='coerce')
#     table['Feature'] = var
#     table['Feature_Coefficient'] = coef
#     table['Bin_Level_Coefficient'] = table['WoE'] * coef
#     table['Abs_Bin_Coeff'] = table['Bin_Level_Coefficient'].abs()

#     # Calculate distance from max
#     max_contribution = table['Abs_Bin_Coeff'].max()
#     table['Distance_From_Max'] = max_contribution - table['Abs_Bin_Coeff']

#     return table


In [14]:
def get_bin_contributions(var, binning_model, coef):
    """Return the per-bin WoE table of one feature, extended with coefficient columns.

    Parameters
    ----------
    var : str
        Feature name, written to the ``Feature`` column.
    binning_model : object
        Fitted binning model exposing ``binning_table.build()``, which must
        return a DataFrame with ``Bin`` and ``WoE`` columns.
    coef : float
        Model coefficient of the feature.

    Returns
    -------
    pandas.DataFrame
        Columns ``Bin``, ``WoE``, ``Feature``, ``Feature_Coefficient``,
        ``Bin_Level_Coefficient`` (WoE * coef), ``Abs_Bin_Coeff`` and
        ``Distance_From_Best``; the index of the built table is kept.
    """
    bins = binning_model.binning_table.build()[['Bin', 'WoE']]

    # Non-numeric WoE entries become NaN instead of raising.
    woe = pd.to_numeric(bins['WoE'], errors='coerce')
    bin_level = woe * coef

    # The reference bin depends on the coefficient's sign: the smallest product
    # for a negative coefficient, the largest otherwise (NaNs are skipped).
    best_value = bin_level.min() if coef < 0 else bin_level.max()

    return bins.assign(
        WoE=woe,
        Feature=var,
        Feature_Coefficient=coef,
        Bin_Level_Coefficient=bin_level,
        Abs_Bin_Coeff=bin_level.abs(),
        Distance_From_Best=(bin_level - best_value).abs(),
    )


In [15]:
# Build one bin-contribution table per modelled feature, then stack them with a
# single concat (concatenating inside the loop re-copies the growing frame each pass).
bin_tables = []
for var, coef in model_coeffs.items():
    bin_tables.append(get_bin_contributions(var, binning_models[var], coef))

# pd.concat raises on an empty list, so fall back to an empty frame when there are no coefficients.
bin_contributions_df = pd.concat(bin_tables, ignore_index=True) if bin_tables else pd.DataFrame()


In [16]:
def score_and_track(df, binning_models, model_coefficients, top_n=5):
    """Score every row of ``df`` and record each feature's contribution.

    The score of a row is the sum over the modelled features of
    ``WoE(feature value) * coefficient``. No intercept is added here.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to score; must contain a column for every key of
        ``model_coefficients``.
    binning_models : dict
        Feature name -> fitted binning model exposing
        ``transform(values, metric="woe")``.
    model_coefficients : dict
        Feature name -> model coefficient. Iteration order fixes both the
        summation order and the order of each row's contribution list.
    top_n : int, default 5
        Number of features listed in ``Top_Adverse_Reasons`` for each row.

    Returns
    -------
    scores_df : pandas.DataFrame
        One row per input row (default 0..n-1 index) with columns ``Score``
        and ``Top_Adverse_Reasons`` (list of formatted strings).
    contributions : list of list of dict
        Per row, one dict per feature with keys ``Feature``, ``WOE``,
        ``Coef``, ``Contribution`` and ``Abs_Contribution``.

    Notes
    -----
    NOTE(review): reasons are ranked by *absolute* contribution, so a feature
    can appear under ``Top_Adverse_Reasons`` regardless of the sign of its
    contribution — confirm this is the intended definition of "adverse".
    """
    n_rows = len(df)
    if n_rows == 0:
        # Nothing to transform; return correctly-shaped empty results.
        return pd.DataFrame(columns=['Score', 'Top_Adverse_Reasons']), []

    # Transform each feature column once (vectorised) instead of once per row.
    per_feature = []
    total_scores = np.zeros(n_rows)
    for var, coef in model_coefficients.items():
        woe_values = np.asarray(
            binning_models[var].transform(df[var].to_numpy(), metric="woe"),
            dtype=float,
        )
        contrib_values = woe_values * coef
        # Accumulate in coefficient order, matching a row-by-row running sum.
        total_scores += contrib_values
        per_feature.append((var, coef, woe_values, contrib_values))

    contributions = []
    scores = []
    for pos in range(n_rows):
        row_reasons = []
        for var, coef, woe_values, contrib_values in per_feature:
            contrib = contrib_values[pos]
            row_reasons.append({
                'Feature': var,
                'WOE': woe_values[pos],
                'Coef': coef,
                'Contribution': contrib,
                'Abs_Contribution': abs(contrib)
            })

        # Stable sort: ties keep the coefficient-dict order.
        sorted_reasons = sorted(row_reasons, key=lambda x: x['Abs_Contribution'], reverse=True)
        top_reasons = [f"{r['Feature']} (impact: {r['Contribution']:.3f})" for r in sorted_reasons[:top_n]]

        scores.append({
            'Score': total_scores[pos],
            'Top_Adverse_Reasons': top_reasons
        })
        contributions.append(row_reasons)

    scores_df = pd.DataFrame(scores)
    return scores_df, contributions


In [17]:
# Score every row; contribs_per_row holds the per-feature breakdown for each row.
scores_df, contribs_per_row = score_and_track(aa_df, binning_models, model_coeffs)

# Merge scores + top reasons back to the original data.
# aa_df's index is reset so the column-wise concat lines up with scores_df's default 0..n-1 index.
df_scored = pd.concat([aa_df.reset_index(drop=True), scores_df], axis=1)


In [18]:
# Show every stored binning table next to its model coefficient and the
# resulting per-bin contribution (WoE * coefficient).
display_columns = [
    'Bin', 'Count', 'Event rate', 'WoE',
    'Feature_Coefficient', 'BinLevel_Coefficient', 'Abs_Bin_Contribution',
]

for var, model in binning_models.items():
    if var in model_coeffs:
        coef = model_coeffs[var]
        table = model.binning_table.build()

        # Coerce WoE to numbers; entries that cannot be parsed become NaN.
        woe = pd.to_numeric(table['WoE'], errors='coerce')
        bin_level = woe * coef

        table['WoE'] = woe
        table['Feature'] = var
        table['Feature_Coefficient'] = coef
        table['BinLevel_Coefficient'] = bin_level
        table['Abs_Bin_Contribution'] = bin_level.abs()

        print(f"\n=== Expanded Binning Table for {var} ===")
        display(table[display_columns])
    else:
        # Binning models without a coefficient are not part of the model.
        print(f"Skipping {var}: No coefficient found.")



=== Expanded Binning Table for TOTAL_INCOME ===


Unnamed: 0,Bin,Count,Event rate,WoE,Feature_Coefficient,BinLevel_Coefficient,Abs_Bin_Contribution
0,"(-inf, 2183.93)",2493,0.259126,-0.289455,-0.817063,0.236503,0.236503
1,"[2183.93, 3468.97)",9189,0.22244,-0.088469,-0.817063,0.072285,0.072285
2,"[3468.97, 3773.05)",2134,0.218369,-0.064778,-0.817063,0.052928,0.052928
3,"[3773.05, 4988.94)",7302,0.195015,0.077773,-0.817063,-0.063546,0.063546
4,"[4988.94, 5620.52)",2408,0.19103,0.103359,-0.817063,-0.08445,0.08445
5,"[5620.52, 7400.54)",3228,0.188662,0.118756,-0.817063,-0.097031,0.097031
6,"[7400.54, inf)",1485,0.142088,0.458085,-0.817063,-0.374285,0.374285
7,Special,0,0.0,0.0,-0.817063,-0.0,0.0
8,Missing,0,0.0,0.0,-0.817063,-0.0,0.0
Totals,,28239,0.207514,,-0.817063,,



=== Expanded Binning Table for ADDRCHANGECOUNT24MONTH ===


Unnamed: 0,Bin,Count,Event rate,WoE,Feature_Coefficient,BinLevel_Coefficient,Abs_Bin_Contribution
0,"(-inf, 0.50)",18015,0.191785,0.098482,-0.223352,-0.021996,0.021996
1,"[0.50, 1.50)",8158,0.231429,-0.139715,-0.223352,0.031206,0.031206
2,"[1.50, inf)",2066,0.250242,-0.242651,-0.223352,0.054197,0.054197
3,Special,0,0.0,0.0,-0.223352,-0.0,0.0
4,Missing,0,0.0,0.0,-0.223352,-0.0,0.0
Totals,,28239,0.207514,,-0.223352,,



=== Expanded Binning Table for ADDRCHANGECOUNT60MONTH ===


Unnamed: 0,Bin,Count,Event rate,WoE,Feature_Coefficient,BinLevel_Coefficient,Abs_Bin_Contribution
0,"(-inf, 0.50)",8688,0.175645,0.206166,-0.390745,-0.080558,0.080558
1,"[0.50, 1.50)",9300,0.213441,-0.035665,-0.390745,0.013936,0.013936
2,"[1.50, 2.50)",5810,0.214114,-0.039668,-0.390745,0.0155,0.0155
3,"[2.50, 3.50)",2748,0.246725,-0.223817,-0.390745,0.087455,0.087455
4,"[3.50, inf)",1693,0.252215,-0.25314,-0.390745,0.098913,0.098913
5,Special,0,0.0,0.0,-0.390745,-0.0,0.0
6,Missing,0,0.0,0.0,-0.390745,-0.0,0.0
Totals,,28239,0.207514,,-0.390745,,



=== Expanded Binning Table for ADDRINPUTLENGTHOFRES ===


Unnamed: 0,Bin,Count,Event rate,WoE,Feature_Coefficient,BinLevel_Coefficient,Abs_Bin_Contribution
0,"(-inf, 9.50)",11661,0.241746,-0.196843,-0.435667,0.085758,0.085758
1,"[9.50, 14.50)",1687,0.212211,-0.028324,-0.435667,0.01234,0.01234
2,"[14.50, 25.50)",3054,0.212181,-0.028143,-0.435667,0.012261,0.012261
3,"[25.50, 41.50)",2855,0.193695,0.086202,-0.435667,-0.037555,0.037555
4,"[41.50, 60.50)",2276,0.178822,0.184372,-0.435667,-0.080325,0.080325
5,"[60.50, 212.50)",5292,0.1678,0.261323,-0.435667,-0.11385,0.11385
6,"[212.50, inf)",1414,0.132249,0.541245,-0.435667,-0.235803,0.235803
7,Special,0,0.0,0.0,-0.435667,-0.0,0.0
8,Missing,0,0.0,0.0,-0.435667,-0.0,0.0
Totals,,28239,0.207514,,-0.435667,,



=== Expanded Binning Table for ADDRPREVIOUSLENGTHOFRES ===


Unnamed: 0,Bin,Count,Event rate,WoE,Feature_Coefficient,BinLevel_Coefficient,Abs_Bin_Contribution
0,"(-inf, 10.50)",3802,0.230142,-0.132464,-0.452777,0.059977,0.059977
1,"[10.50, 23.50)",4225,0.228402,-0.122619,-0.452777,0.055519,0.055519
2,"[23.50, 38.50)",4040,0.22203,-0.086096,-0.452777,0.038982,0.038982
3,"[38.50, 45.50)",1583,0.20657,0.005754,-0.452777,-0.002605,0.002605
4,"[45.50, 68.50)",5035,0.199801,0.047563,-0.452777,-0.021535,0.021535
5,"[68.50, 144.50)",6282,0.197549,0.061714,-0.452777,-0.027942,0.027942
6,"[144.50, 210.50)",1859,0.175901,0.204396,-0.452777,-0.092546,0.092546
7,"[210.50, inf)",1413,0.157113,0.339898,-0.452777,-0.153898,0.153898
8,Special,0,0.0,0.0,-0.452777,-0.0,0.0
9,Missing,0,0.0,0.0,-0.452777,-0.0,0.0



=== Expanded Binning Table for CONFIRMATIONINPUTADDRESS ===


Unnamed: 0,Bin,Count,Event rate,WoE,Feature_Coefficient,BinLevel_Coefficient,Abs_Bin_Contribution
0,"(-inf, 0.89)",6368,0.248273,-0.232127,-0.553535,0.12849,0.12849
1,"[0.89, inf)",21871,0.195647,0.073752,-0.553535,-0.040824,0.040824
2,Special,0,0.0,0.0,-0.553535,-0.0,0.0
3,Missing,0,0.0,0.0,-0.553535,-0.0,0.0
Totals,,28239,0.207514,,-0.553535,,



=== Expanded Binning Table for ADDRINPUTMATCHINDEX ===


Unnamed: 0,Bin,Count,Event rate,WoE,Feature_Coefficient,BinLevel_Coefficient,Abs_Bin_Contribution
0,"(-inf, 0.50)",5969,0.244262,-0.21052,0.052233,-0.010996,0.010996
1,"[0.50, 1.68)",5862,0.214261,-0.040546,0.052233,-0.002118,0.002118
2,"[1.68, inf)",16408,0.191736,0.098798,0.052233,0.00516,0.00516
3,Special,0,0.0,0.0,0.052233,0.0,0.0
4,Missing,0,0.0,0.0,0.052233,0.0,0.0
Totals,,28239,0.207514,,0.052233,,



=== Expanded Binning Table for ADDRINPUTOWNERSHIPINDEX ===


Unnamed: 0,Bin,Count,Event rate,WoE,Feature_Coefficient,BinLevel_Coefficient,Abs_Bin_Contribution
0,"(-inf, 0.50)",12453,0.2128,-0.031845,-0.236553,0.007533,0.007533
1,"[0.50, 1.55)",7068,0.225523,-0.10621,-0.236553,0.025124,0.025124
2,"[1.55, 2.50)",4405,0.215891,-0.050199,-0.236553,0.011875,0.011875
3,"[2.50, 3.50)",1740,0.193678,0.086312,-0.236553,-0.020417,0.020417
4,"[3.50, inf)",2573,0.127478,0.583474,-0.236553,-0.138022,0.138022
5,Special,0,0.0,0.0,-0.236553,-0.0,0.0
6,Missing,0,0.0,0.0,-0.236553,-0.0,0.0
Totals,,28239,0.207514,,-0.236553,,



=== Expanded Binning Table for ASSETPROP ===


Unnamed: 0,Bin,Count,Event rate,WoE,Feature_Coefficient,BinLevel_Coefficient,Abs_Bin_Contribution
0,"(-inf, 0.59)",23027,0.220089,-0.074828,-0.511746,0.038293,0.038293
1,"[0.59, inf)",5212,0.151957,0.37936,-0.511746,-0.194136,0.194136
2,Special,0,0.0,0.0,-0.511746,-0.0,0.0
3,Missing,0,0.0,0.0,-0.511746,-0.0,0.0
Totals,,28239,0.207514,,-0.511746,,


In [19]:
# Destination workbook for the scored data and the binning tables.
# NOTE(review): absolute mapped-drive path — adjust when running outside the M: environment.
output_path = r'M:/Risk Management/DW/Scorecard/Capacity Model/02_Output_Files/scored_and_binning_tables.xlsx'

In [20]:
# Export everything to one workbook: the scored rows, the stacked bin tables,
# then one sheet per modelled feature.
with pd.ExcelWriter(output_path, engine='xlsxwriter') as writer:
    # Sheet 1: scored data; Sheet 2: combined binning tables with contributions
    for frame, title in ((df_scored, 'Scored Data'), (bin_contributions_df, 'All Binning Tables')):
        frame.to_excel(writer, sheet_name=title, index=False)

    # Sheets 3+: each feature's binning table (best effort: a failing feature
    # is reported and skipped so the rest of the workbook is still written)
    for var in model_coeffs:
        try:
            bin_table = get_bin_contributions(var, binning_models[var], model_coeffs[var])
            # Excel sheet names are limited to 31 characters
            bin_table.to_excel(writer, sheet_name=var[:31], index=False)
        except Exception as e:
            print(f"Skipped {var} due to error: {e}")