### Logistic regression ###

First, we replace all values in the raw data with their corresponding WoE values. We then perform logistic regression on each variable to obtain the coefficients needed for a credit score card.

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression

In [2]:
predictions_df = pd.read_csv('../../data/cs-training.csv')

In [3]:
# The original data.
predictions_df.head()

Unnamed: 0.1,Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,1,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
1,2,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
2,3,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0
3,4,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0
4,5,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0


In [4]:
predictions_df['SeriousDlqin2yrs'].value_counts()

0    139974
1     10026
Name: SeriousDlqin2yrs, dtype: int64

In [5]:
np.log(139974/10026)

2.636274972235245

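When we fit a logistic regression on a single WoE-encoded variable, we expect the coefficient to be about 1 and the intercept to be about 2.64 in magnitude. Because the WoE values here are positive for low-risk ranges while the model predicts the log-odds of `SeriousDlqin2yrs = 1`, both come out negative (about -1 and -2.64).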

In [6]:
woe_df = pd.read_csv('../../data/report_woe.csv')

In [7]:
woe_df_nan = woe_df[woe_df['QuantileRange'].isna()]

In [8]:
# Table of corresponding WoE for NaN values.
woe_df_nan

Unnamed: 0.1,Unnamed: 0,Variable,QuantileRange,EventsCount,EventsSum,% of Qualified,DelinquentCount,% of Delinquent,WoE,IV
4,4,RevolvingUtilizationOfUnsecuredLines,,22602,2481,0.247457,20121,0.143748,-0.543173,0.056332
10,5,age,,9536,764,0.076202,8772,0.062669,-0.195523,0.002646
13,2,NumberOfTime30-59DaysPastDueNotWorse,,20631,3628,0.361859,17003,0.121473,-1.091567,0.262398
18,4,DebtRatio,,4134,340,0.033912,3794,0.027105,-0.224044,0.001525
22,3,MonthlyIncome,,35592,1962,0.195691,33630,0.240259,0.205179,0.009144
30,4,NumberOfTimes90DaysLate,,139,88,0.008777,51,0.000364,-3.181786,0.026768
33,2,NumberRealEstateLoansOrLines,,114826,7842,0.782166,106984,0.764313,-0.02309,0.000412
37,3,NumberOfTime60-89DaysPastDueNotWorse,,1118,561,0.055955,557,0.003979,-2.643431,0.137393
40,2,NumberOfDependents,,23446,1763,0.175843,21683,0.154907,-0.126763,0.002654


In [9]:
woe_df_not_nan = woe_df[woe_df['QuantileRange'].notna()]

In [10]:
# Table of corresponding WoE for non-NaN values.
woe_df_not_nan

Unnamed: 0.1,Unnamed: 0,Variable,QuantileRange,EventsCount,EventsSum,% of Qualified,DelinquentCount,% of Delinquent,WoE,IV
0,0,RevolvingUtilizationOfUnsecuredLines,"(0.0, 0.49]",97178,2620,0.261321,94558,0.67554,0.949764,0.393411
1,1,RevolvingUtilizationOfUnsecuredLines,"(0.5, 0.99]",26905,3689,0.367943,23216,0.165859,-0.796789,0.161018
2,2,RevolvingUtilizationOfUnsecuredLines,"(1.0, 1.49]",2715,1080,0.10772,1635,0.011681,-2.221593,0.21336
3,3,RevolvingUtilizationOfUnsecuredLines,"(1.5, 60000.0]",600,156,0.01556,444,0.003172,-1.590306,0.0197
5,0,age,"(18, 25]",3027,338,0.033712,2689,0.019211,-0.562396,0.008156
6,1,age,"(26, 34]",15019,1665,0.166068,13354,0.095403,-0.554284,0.039168
7,2,age,"(35, 44]",26317,2344,0.233792,23973,0.171268,-0.311206,0.019458
8,3,age,"(45, 64]",67502,4225,0.421404,63277,0.452063,0.070228,0.002153
9,4,age,"(65, 120]",28599,690,0.068821,27909,0.199387,1.063738,0.138888
11,0,NumberOfTime30-59DaysPastDueNotWorse,"(-0.098, 0.99]",126018,5041,0.502793,120977,0.864282,0.541721,0.195826


In [11]:
# Drop the 'Unnamed: 0' column. errors='ignore' keeps this cell safe to re-run:
# the frame is reassigned to itself, so on a second run the column is already gone
# and a plain drop would raise KeyError.
predictions_df = predictions_df.drop(columns='Unnamed: 0', errors='ignore')

In [12]:
predictions_df.head()

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
1,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
2,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0
3,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0
4,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0


In [13]:
# We will create a separate Dataframe for the WoE values.
predictions_df_woe = predictions_df.copy()

##### WoE for non-NaN values #####

In [14]:
# List of all variables in the data.
var_list = list(woe_df_not_nan['Variable'].unique())

In [15]:
def make_map(ref_df, variable):
    """Build the range-to-WoE lookup for one variable.

    Parameters
    ----------
    ref_df : pandas.DataFrame
        WoE report with at least the columns 'Variable', 'QuantileRange' and 'WoE'.
    variable : str
        Value to match against the 'Variable' column.

    Returns
    -------
    list of tuple
        One ``([lower, upper], woe)`` entry per row of ``variable`` whose
        'QuantileRange' is not NaN, in row order. Empty if nothing matches.

    Notes
    -----
    Only the two numbers of a label such as ``"(0.0, 0.49]"`` are kept; the
    bracket characters (and therefore any open/closed distinction) are discarded.
    """
    is_binned_row = (ref_df['Variable'] == variable) & ref_df['QuantileRange'].notna()
    binned = ref_df.loc[is_binned_row, ['QuantileRange', 'WoE']]

    tuple_list = []
    for range_label, woe in zip(binned['QuantileRange'], binned['WoE']):
        # Drop the first and last character (the brackets) and split on a bare comma;
        # float() ignores surrounding whitespace, so "(26,34]" and "(26, 34]" both parse.
        range_bounds = [float(num) for num in range_label[1:-1].split(',')]
        tuple_list.append((range_bounds, woe))
    return tuple_list

In [16]:
# Use the make_map function to extract the range-to-WoE map for each variable and store all the maps in a dictionary.
# Keys are the variable names in var_list; each value is the list returned by make_map.
# The full woe_df is passed in; make_map itself skips rows whose QuantileRange is NaN.
maps = dict()
for var in var_list:
    maps[var] = (make_map(woe_df, var))

In [17]:
predictions_df_woe.head()

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
1,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
2,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0
3,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0
4,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0


In [18]:
def woe_replace(in_num, tuples):
    """Return the WoE of the first range that contains ``in_num``.

    ``tuples`` is a sequence of ``([lower, upper], woe)`` pairs. Both bounds are
    treated as inclusive and the pairs are checked in order. When no range
    contains ``in_num`` (including when it is NaN) the result is None.
    """
    matching_woe = (woe for bounds, woe in tuples if bounds[0] <= in_num <= bounds[1])
    return next(matching_woe, None)

In [19]:
# Apply the woe_replace function to each column of the original data.
# woe_replace returns None when a value lies in none of the variable's ranges; those
# entries appear as NaN in the preview that follows.
for var in var_list:
    predictions_df_woe[var] = predictions_df_woe[var].apply(woe_replace, args=(maps[var],))

In [20]:
predictions_df_woe.head()

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,1,-0.796789,0.070228,-2.251409,-0.575353,0.111096,0.035635,0.176674,-0.59298,0.288208,-0.352272
1,0,-0.796789,-0.311206,0.541721,0.175061,-0.31117,-0.004249,0.176674,,0.288208,0.078749
2,0,-0.796789,-0.311206,,0.175061,-0.31117,-0.004249,0.176674,,0.288208,0.078749
3,0,0.949764,-0.554284,0.541721,0.175061,-0.31117,-0.004249,0.176674,,0.288208,0.078749
4,0,-0.796789,0.070228,,0.175061,0.460433,-0.004249,0.176674,0.188639,0.288208,0.078749


In [21]:
##### WoE for NaN values #####

In [22]:
# List of all variables that have NaNs
var_list_null = list(woe_df_nan['Variable'].unique())

In [23]:
# For each variable, replace all NaN values with the WoE of that variable's NaN row.
for var in var_list_null:
    # First (and expected only) WoE value listed for this variable in woe_df_nan.
    nan_woe = woe_df_nan.loc[woe_df_nan['Variable'] == var, 'WoE'].iloc[0]
    # Assign the filled column back rather than calling fillna(inplace=True) on
    # predictions_df_woe[var]: that chained in-place call acts on a temporary
    # object and leaves the frame unchanged under pandas Copy-on-Write.
    predictions_df_woe[var] = predictions_df_woe[var].fillna(nan_woe)

In [24]:
predictions_df_woe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150000 entries, 0 to 149999
Data columns (total 11 columns):
 #   Column                                Non-Null Count   Dtype  
---  ------                                --------------   -----  
 0   SeriousDlqin2yrs                      150000 non-null  int64  
 1   RevolvingUtilizationOfUnsecuredLines  150000 non-null  float64
 2   age                                   150000 non-null  float64
 3   NumberOfTime30-59DaysPastDueNotWorse  150000 non-null  float64
 4   DebtRatio                             150000 non-null  float64
 5   MonthlyIncome                         150000 non-null  float64
 6   NumberOfOpenCreditLinesAndLoans       150000 non-null  float64
 7   NumberOfTimes90DaysLate               150000 non-null  float64
 8   NumberRealEstateLoansOrLines          150000 non-null  float64
 9   NumberOfTime60-89DaysPastDueNotWorse  150000 non-null  float64
 10  NumberOfDependents                    150000 non-null  float64
dtype

In [25]:
predictions_df_woe.head(10)

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,1,-0.796789,0.070228,-2.251409,-0.575353,0.111096,0.035635,0.176674,-0.59298,0.288208,-0.352272
1,0,-0.796789,-0.311206,0.541721,0.175061,-0.31117,-0.004249,0.176674,-0.02309,0.288208,0.078749
2,0,-0.796789,-0.311206,-1.091567,0.175061,-0.31117,-0.004249,0.176674,-0.02309,0.288208,0.078749
3,0,0.949764,-0.554284,0.541721,0.175061,-0.31117,-0.004249,0.176674,-0.02309,0.288208,0.078749
4,0,-0.796789,0.070228,-1.091567,0.175061,0.460433,-0.004249,0.176674,0.188639,0.288208,0.078749
5,0,0.949764,1.063738,0.541721,-0.067773,-0.31117,-0.004249,0.176674,0.188639,0.288208,0.078749
6,0,0.949764,0.070228,0.541721,0.193661,0.205179,-0.004249,0.176674,-0.59298,0.288208,0.078749
7,0,-0.796789,-0.311206,0.541721,0.175061,-0.31117,-0.004249,0.176674,-0.02309,0.288208,0.078749
8,0,0.949764,-0.554284,0.541721,0.193661,0.205179,-0.004249,0.176674,-0.02309,0.288208,-0.126763
9,0,0.949764,0.070228,0.541721,-0.067773,0.460433,-0.004249,0.176674,-0.59298,0.288208,-0.352272


##### Logistic Coefficients #####

In [26]:
logreg = LogisticRegression()

In [27]:
coefs = []
intercepts = []

In [28]:
# Fit a separate single-variable logistic regression for every WoE column.
# The result lists are reset here so that re-running this cell does not append a
# second copy of every coefficient and intercept.
coefs = []
intercepts = []
for var in var_list:
    # Double brackets keep the predictor as a one-column DataFrame (2-D input).
    logreg.fit(predictions_df_woe[[var]], predictions_df_woe['SeriousDlqin2yrs'])
    coefs.append(logreg.coef_[0][0])
    intercepts.append(logreg.intercept_[0])

In [29]:
# One row per variable with the coefficient and intercept from its single-variable fit.
# The column label is spelled 'Coefficient' (it was previously misspelled 'Coeffient').
coef_df = pd.DataFrame(zip(var_list, coefs, intercepts), columns=['Variable', 'Coefficient', 'Intercept'])

In [30]:
coef_df

Unnamed: 0,Variable,Coeffient,Intercept
0,RevolvingUtilizationOfUnsecuredLines,-1.025263,-2.534305
1,age,-0.983258,-2.61869
2,NumberOfTime30-59DaysPastDueNotWorse,-0.876607,-2.701037
3,DebtRatio,-0.968957,-2.624434
4,MonthlyIncome,-0.955729,-2.640271
5,NumberOfOpenCreditLinesAndLoans,-0.778155,-2.636247
6,NumberOfTimes90DaysLate,-0.996717,-2.636661
7,NumberRealEstateLoansOrLines,-0.765749,-2.603261
8,NumberOfTime60-89DaysPastDueNotWorse,-0.97395,-2.640623
9,NumberOfDependents,-0.780107,-2.66143


### Logistic Regression on the full WoE dataset ###

In [47]:
logreg = LogisticRegression()

In [48]:
predictions_df_woe.columns

Index(['SeriousDlqin2yrs', 'RevolvingUtilizationOfUnsecuredLines', 'age',
       'NumberOfTime30-59DaysPastDueNotWorse', 'DebtRatio', 'MonthlyIncome',
       'NumberOfOpenCreditLinesAndLoans', 'NumberOfTimes90DaysLate',
       'NumberRealEstateLoansOrLines', 'NumberOfTime60-89DaysPastDueNotWorse',
       'NumberOfDependents'],
      dtype='object')

In [49]:
X = predictions_df_woe.drop(columns=['SeriousDlqin2yrs', 'NumberOfOpenCreditLinesAndLoans'])
y = predictions_df_woe['SeriousDlqin2yrs']

NumberOfOpenCreditLinesAndLoans is dropped because it has a very low IV of 0.000375

In [50]:
logreg.fit(X, y);

In [51]:
logreg.coef_[0]

array([-0.67505692, -0.47860843, -0.49756609, -0.5557159 , -0.36903108,
       -0.51641583, -0.77320443, -0.44242076, -0.20012398])

In [55]:
scorecard_coefs = pd.DataFrame(zip(X.columns, logreg.coef_[0]), columns=['variable', 'coefficient'])

In [56]:
scorecard_coefs

Unnamed: 0,variable,coefficient
0,RevolvingUtilizationOfUnsecuredLines,-0.675057
1,age,-0.478608
2,NumberOfTime30-59DaysPastDueNotWorse,-0.497566
3,DebtRatio,-0.555716
4,MonthlyIncome,-0.369031
5,NumberOfTimes90DaysLate,-0.516416
6,NumberRealEstateLoansOrLines,-0.773204
7,NumberOfTime60-89DaysPastDueNotWorse,-0.442421
8,NumberOfDependents,-0.200124


In [71]:
# Add the model intercept as a final 'Intercept' row.
# pd.concat replaces DataFrame.append (deprecated in pandas 1.4, removed in 2.0). Any
# 'Intercept' row left by an earlier run of this cell is dropped first, so re-running
# the cell does not duplicate it.
intercept_row = pd.DataFrame([['Intercept', logreg.intercept_[0]]], columns=['variable', 'coefficient'])
scorecard_coefs = pd.concat(
    [scorecard_coefs[scorecard_coefs['variable'] != 'Intercept'], intercept_row],
    ignore_index=True,
)

In [36]:
predictions_df_woe.to_csv('../../data/woe_data.csv', index=False)

In [72]:
scorecard_coefs.to_csv('../../data/scorecard_coefs.csv', index=False)