### Logistic regression ###

First, we replace all values in the raw data with their corresponding WoE values. We then perform logistic regression on each variable to obtain the coefficients needed for a credit score card.

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, ConfusionMatrixDisplay
import pickle

In [2]:
predictions_df = pd.read_csv('../../data/cs-training.csv')

In [3]:
# The original data.
predictions_df.head()

Unnamed: 0.1,Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,1,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
1,2,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
2,3,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0
3,4,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0
4,5,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0


In [4]:
predictions_df['SeriousDlqin2yrs'].value_counts()

0    139974
1     10026
Name: SeriousDlqin2yrs, dtype: int64

In [5]:
# Natural log of the ratio of label-0 rows to label-1 rows, using the counts shown above.
np.log(139974/10026)

2.636274972235245

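Each WoE value measures a bin's log-odds relative to the overall log-odds computed above. A logistic regression of the qualified flag on a single WoE-encoded variable should therefore give a coefficient of about 1 and an intercept of about 2.64.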

In [6]:
woe_df = pd.read_csv('../../data/report_woe.csv')

In [7]:
woe_df_nan = woe_df[woe_df['QuantileRange'].isna()]

In [8]:
# Table of corresponding WoE for NaN values.
woe_df_nan

Unnamed: 0.1,Unnamed: 0,Variable,QuantileRange,EventsCount,EventsSum,% of Qualified,DelinquentCount,% of Delinquent,WoE,IV
20,3,MonthlyIncome,,29735,1669,0.166467,28066,0.200509,0.186059,0.006334
38,2,NumberOfDependents,,3924,179,0.017854,3745,0.026755,0.404516,0.003601


In [9]:
woe_df_not_nan = woe_df[woe_df['QuantileRange'].notna()]

In [10]:
# Table of corresponding WoE for non-NaN values.
woe_df_not_nan

Unnamed: 0.1,Unnamed: 0,Variable,QuantileRange,EventsCount,EventsSum,% of Qualified,DelinquentCount,% of Delinquent,WoE,IV
0,0,RevolvingUtilizationOfUnsecuredLines,"(-100.0, 0.5]",108712,2987,0.297925,105725,0.755319,0.930297,0.425512
1,1,RevolvingUtilizationOfUnsecuredLines,"(0.5, 1.0]",37967,5802,0.578695,32165,0.229793,-0.923599,0.322246
2,2,RevolvingUtilizationOfUnsecuredLines,"(1.0, 1.5]",2721,1081,0.10782,1640,0.011716,-2.219465,0.213298
3,3,RevolvingUtilizationOfUnsecuredLines,"(1.5, 60000.0]",600,156,0.01556,444,0.003172,-1.590306,0.0197
4,0,age,"(-100, 18]",1,0,5e-05,1,7e-06,-1.943128,8.3e-05
5,1,age,"(18, 26]",4220,485,0.048374,3735,0.026684,-0.594921,0.012904
6,2,age,"(26, 35]",17265,1906,0.190106,15359,0.109728,-0.54958,0.044174
7,3,age,"(35, 45]",29819,2628,0.262118,27191,0.194258,-0.299612,0.020332
8,4,age,"(45, 65]",70096,4317,0.43058,65779,0.469937,0.087465,0.003442
9,5,age,"(65, 120]",28599,690,0.068821,27909,0.199387,1.063738,0.138888


In [11]:
# 'Unnamed: 0' only repeats a 1-based row counter (see the preview above), so drop it.
# errors='ignore' keeps this cell safe to re-run: without it, a second execution
# raises KeyError because the column has already been removed.
predictions_df = predictions_df.drop(columns='Unnamed: 0', errors='ignore')

In [12]:
predictions_df.head()

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
1,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
2,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0
3,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0
4,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0


In [13]:
# We will create a separate Dataframe for the WoE values.
predictions_df_woe = predictions_df.copy()

##### WoE for non-NaN values #####

In [14]:
# List of all variables that have at least one non-NaN QuantileRange row in the WoE table.
var_list = list(woe_df_not_nan['Variable'].unique())

In [15]:
def make_map(ref_df, variable):
    """Collect the (range bounds, WoE) pairs for one variable.

    Parameters
    ----------
    ref_df : DataFrame
        WoE table with 'Variable', 'QuantileRange' and 'WoE' columns.
    variable : str
        Value of the 'Variable' column to select.

    Returns
    -------
    list of tuple
        One ``([lower, upper], woe)`` tuple per row of ``ref_df`` that belongs
        to ``variable`` and has a non-null 'QuantileRange', in row order.
        The bounds are parsed as floats from strings such as "(0.5, 1.0]".
    """
    is_variable = ref_df['Variable'] == variable
    has_range = ref_df['QuantileRange'].notna()
    binned_rows = ref_df[is_variable & has_range]
    return [
        # Strip the opening and closing bracket characters, then split the two bounds.
        ([float(bound) for bound in interval[1:-1].split(', ')], woe)
        for interval, woe in zip(binned_rows['QuantileRange'], binned_rows['WoE'])
    ]

In [16]:
# Build the range-to-WoE map of every variable with make_map and keep them in one
# dictionary keyed by variable name.
maps = {var: make_map(woe_df, var) for var in var_list}

In [17]:
predictions_df_woe.head()

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
1,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
2,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0
3,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0
4,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0


In [18]:
def woe_replace(in_num, tuples):
    """Return the WoE of the first range that contains ``in_num``.

    Parameters
    ----------
    in_num : number
        Raw value to look up.
    tuples : list of tuple
        ``([lower, upper], woe)`` entries, checked in order.

    Returns
    -------
    The ``woe`` of the first entry with ``lower <= in_num <= upper`` (both
    ends inclusive), or None when no entry matches. NaN never matches because
    every comparison with it is false.
    """
    for entry in tuples:
        bounds = entry[0]
        if not (bounds[0] <= in_num <= bounds[1]):
            continue
        return entry[1]
    return None

In [19]:
# Apply the woe_replace function to each column of the original data.
# The input is read from the untouched predictions_df rather than from
# predictions_df_woe, so re-running this cell re-encodes the raw values instead of
# trying to bin values that were already replaced by their WoE.
# Values that match no range (including NaN) get no WoE here and stay missing.
for var in var_list:
    predictions_df_woe[var] = predictions_df[var].apply(woe_replace, args=(maps[var],))

In [20]:
predictions_df_woe.head()

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,1,-0.923599,-0.299612,-1.616726,-0.574709,0.12045,0.035635,0.176674,-0.59298,0.288208,0.031633
1,0,-0.923599,-0.299612,0.257826,0.144841,-0.274868,-0.004249,0.176674,-0.23597,0.288208,0.031633
2,0,-0.923599,-0.299612,0.257826,0.144841,-0.274868,-0.004249,0.176674,-0.23597,0.288208,0.031633
3,0,0.930297,-0.54958,0.257826,0.144841,-0.274868,-0.004249,0.176674,-0.23597,0.288208,0.031633
4,0,-0.923599,0.087465,0.257826,0.144841,0.460433,-0.004249,0.176674,0.256641,0.288208,0.031633


In [21]:
##### WoE for NaN values #####

In [22]:
# List of all variables that have NaNs
var_list_null = list(woe_df_nan['Variable'].unique())

In [23]:
# For each variable, replace all NaN values with the appropriate WoE.
# The filled column is assigned back to the frame: calling fillna(inplace=True) on a
# column selection is chained assignment, which warns on recent pandas versions and
# leaves predictions_df_woe unchanged when Copy-on-Write is enabled.
for var in var_list_null:
    # WoE of the first NaN-range row for this variable.
    nan_woe = woe_df_nan.loc[woe_df_nan['Variable'] == var, 'WoE'].iloc[0]
    predictions_df_woe[var] = predictions_df_woe[var].fillna(nan_woe)

In [24]:
predictions_df_woe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150000 entries, 0 to 149999
Data columns (total 11 columns):
 #   Column                                Non-Null Count   Dtype  
---  ------                                --------------   -----  
 0   SeriousDlqin2yrs                      150000 non-null  int64  
 1   RevolvingUtilizationOfUnsecuredLines  150000 non-null  float64
 2   age                                   150000 non-null  float64
 3   NumberOfTime30-59DaysPastDueNotWorse  150000 non-null  float64
 4   DebtRatio                             150000 non-null  float64
 5   MonthlyIncome                         150000 non-null  float64
 6   NumberOfOpenCreditLinesAndLoans       150000 non-null  float64
 7   NumberOfTimes90DaysLate               150000 non-null  float64
 8   NumberRealEstateLoansOrLines          150000 non-null  float64
 9   NumberOfTime60-89DaysPastDueNotWorse  150000 non-null  float64
 10  NumberOfDependents                    150000 non-null  float64
dtype

In [25]:
predictions_df_woe.head(10)

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,1,-0.923599,-0.299612,-1.616726,-0.574709,0.12045,0.035635,0.176674,-0.59298,0.288208,0.031633
1,0,-0.923599,-0.299612,0.257826,0.144841,-0.274868,-0.004249,0.176674,-0.23597,0.288208,0.031633
2,0,-0.923599,-0.299612,0.257826,0.144841,-0.274868,-0.004249,0.176674,-0.23597,0.288208,0.031633
3,0,0.930297,-0.54958,0.257826,0.144841,-0.274868,-0.004249,0.176674,-0.23597,0.288208,0.031633
4,0,-0.923599,0.087465,0.257826,0.144841,0.460433,-0.004249,0.176674,0.256641,0.288208,0.031633
5,0,0.930297,1.063738,0.257826,-0.067661,-0.274868,-0.004249,0.176674,0.256641,0.288208,0.031633
6,0,0.930297,0.087465,0.257826,0.193834,0.186059,-0.004249,0.176674,0.154523,0.288208,0.031633
7,0,-0.923599,-0.299612,0.257826,0.144841,-0.274868,-0.004249,0.176674,-0.23597,0.288208,0.031633
8,0,0.930297,-0.54958,0.257826,0.193834,0.186059,-0.004249,0.176674,-0.23597,0.288208,0.404516
9,0,0.930297,0.087465,0.257826,-0.067661,0.460433,-0.004249,0.176674,-0.59298,0.288208,0.031633


##### Logistic Coefficients #####

In [26]:
# Flip the target: Qualified is 1 where SeriousDlqin2yrs is 0, and 0 where it is 1.
predictions_df_woe['Qualified'] = 1-predictions_df_woe['SeriousDlqin2yrs']

In [27]:
logreg = LogisticRegression()

In [28]:
coefs = []
intercepts = []

In [29]:
# Fit a one-feature logistic regression per variable and record its slope and intercept.
for var in var_list:
    single_feature = predictions_df_woe[[var]]
    fitted = logreg.fit(single_feature, predictions_df_woe['Qualified'])
    coefs.append(fitted.coef_[0][0])
    intercepts.append(fitted.intercept_[0])

In [30]:
coef_df = pd.DataFrame(zip(var_list, coefs, intercepts), columns=['Variable', 'Coeffient', 'Intercept'])

In [31]:
coef_df

Unnamed: 0,Variable,Coeffient,Intercept
0,RevolvingUtilizationOfUnsecuredLines,0.999887,2.636229
1,age,0.998985,2.63622
2,NumberOfTime30-59DaysPastDueNotWorse,0.999847,2.636251
3,DebtRatio,0.99849,2.636235
4,MonthlyIncome,0.992028,2.63348
5,NumberOfOpenCreditLinesAndLoans,0.778155,2.636247
6,NumberOfTimes90DaysLate,1.000363,2.636178
7,NumberRealEstateLoansOrLines,0.998288,2.63623
8,NumberOfTime60-89DaysPastDueNotWorse,0.999875,2.636252
9,NumberOfDependents,0.994296,2.636232


These are excellent results: every coefficient is close to 1 and every intercept is close to 2.64. The one exception is NumberOfOpenCreditLinesAndLoans, which has a low IV and won't be used.

### Logistic Regression on the full WoE dataset ###

In [32]:
logreg = LogisticRegression()

In [33]:
predictions_df_woe.columns

Index(['SeriousDlqin2yrs', 'RevolvingUtilizationOfUnsecuredLines', 'age',
       'NumberOfTime30-59DaysPastDueNotWorse', 'DebtRatio', 'MonthlyIncome',
       'NumberOfOpenCreditLinesAndLoans', 'NumberOfTimes90DaysLate',
       'NumberRealEstateLoansOrLines', 'NumberOfTime60-89DaysPastDueNotWorse',
       'NumberOfDependents', 'Qualified'],
      dtype='object')

In [34]:
# Features: every column except the two target encodings and NumberOfOpenCreditLinesAndLoans.
X = predictions_df_woe.drop(columns=['SeriousDlqin2yrs', 'Qualified', 'NumberOfOpenCreditLinesAndLoans'])
# Target: the Qualified flag (1 - SeriousDlqin2yrs).
y = predictions_df_woe['Qualified']

NumberOfOpenCreditLinesAndLoans is dropped from the feature set because it has a very low IV of 0.000375.

In [35]:
logreg.fit(X, y);

In [36]:
logreg.coef_[0]

array([0.68755762, 0.4622951 , 0.51029588, 0.83966152, 0.19272313,
       0.50989794, 0.71626716, 0.50016957, 0.43973118])

In [37]:
scorecard_coefs = pd.DataFrame(zip(X.columns, logreg.coef_[0]), columns=['variable', 'coefficient'])

In [38]:
# Add the fitted intercept as the last row of the scorecard table.
# DataFrame.append is deprecated and was removed in pandas 2.0, so pd.concat is used.
# Any 'Intercept' row already present is dropped first, so re-running this cell
# does not stack duplicate intercept rows.
intercept_row = pd.DataFrame([['Intercept', logreg.intercept_[0]]], columns=['variable', 'coefficient'])
scorecard_coefs = pd.concat(
    [scorecard_coefs[scorecard_coefs['variable'] != 'Intercept'], intercept_row]
)

  scorecard_coefs = scorecard_coefs.append(pd.DataFrame([['Intercept', logreg.intercept_[0]]], columns=['variable', 'coefficient']))


In [39]:
scorecard_coefs

Unnamed: 0,variable,coefficient
0,RevolvingUtilizationOfUnsecuredLines,0.687558
1,age,0.462295
2,NumberOfTime30-59DaysPastDueNotWorse,0.510296
3,DebtRatio,0.839662
4,MonthlyIncome,0.192723
5,NumberOfTimes90DaysLate,0.509898
6,NumberRealEstateLoansOrLines,0.716267
7,NumberOfTime60-89DaysPastDueNotWorse,0.50017
8,NumberOfDependents,0.439731
0,Intercept,2.603919


In [40]:
predictions_df_woe.to_csv('../../data/woe_data.csv', index=False)

In [41]:
scorecard_coefs.to_csv('../../data/scorecard_coefs.csv', index=False)

In [42]:
with open('../../data/logreg.pkl', 'wb') as f:
    # Pickle the logistic regression model using the highest protocol available.
    pickle.dump(logreg, f, pickle.HIGHEST_PROTOCOL)