In [9]:
# Load packages and libraries
import pandas as pd
import numpy as np
import scorecardpy as sc
import pickle

In [10]:
woe = pd.read_csv("../../data/woe_data.csv")
coefs = pd.read_csv("../../data/scorecard_coefs.csv")

In [11]:
woe.head()

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,1,-0.923599,-0.299612,-1.616726,-0.574709,0.12045,0.035635,0.176674,-0.59298,0.288208,0.031633
1,0,-0.923599,-0.299612,0.257826,0.144841,-0.274868,-0.004249,0.176674,-0.23597,0.288208,0.031633
2,0,-0.923599,-0.299612,0.257826,0.144841,-0.274868,-0.004249,0.176674,-0.23597,0.288208,0.031633
3,0,0.930297,-0.54958,0.257826,0.144841,-0.274868,-0.004249,0.176674,-0.23597,0.288208,0.031633
4,0,-0.923599,0.087465,0.257826,0.144841,0.460433,-0.004249,0.176674,0.256641,0.288208,0.031633


In [12]:
coefs

Unnamed: 0,variable,coefficient
0,RevolvingUtilizationOfUnsecuredLines,-0.687558
1,age,-0.462295
2,NumberOfTime30-59DaysPastDueNotWorse,-0.510296
3,DebtRatio,-0.839662
4,MonthlyIncome,-0.192723
5,NumberOfTimes90DaysLate,-0.509898
6,NumberRealEstateLoansOrLines,-0.716267
7,NumberOfTime60-89DaysPastDueNotWorse,-0.50017
8,NumberOfDependents,-0.439731
9,Intercept,-2.603919


In [13]:
with open('../../data/logreg.pkl', 'rb') as f:
    lr = pickle.load(f) # deserialize using load()
    print(type(lr))

<class 'sklearn.linear_model._logistic.LogisticRegression'>


In [14]:
pdo=20
factor=pdo/np.log(2)
offset = 600 - (factor*np.log(50))
print('Factor:', round(factor,2),'| Offset:', round(offset,2))

Factor: 28.85 | Offset: 487.12


## Calculate Scores for each bin

$Score_i= (βi × WoE_i + α/n) × Factor + Offset/n$

Where:
* βi — logistic regression coefficient for the variable Xi
* α — logistic regression intercept
* WoE — Weight of Evidence value for variable Xi
* n — number of independent variable Xi in the model
* Factor, Offset — known as scaling parameter, where

Factor = pdo/ln(2)

Offset = Target Score — (Factor × ln(Target Odds))

In [16]:
iv_df = pd.read_csv("../../data/report_iv.csv")
iv_df.drop(columns=['Unnamed: 0'], inplace=True)
iv_df.rename(columns={'IV':'total_iv'}, inplace=True)
iv_df

Unnamed: 0,Variable,total_iv
0,RevolvingUtilizationOfUnsecuredLines,0.980756
1,age,0.219824
2,NumberOfTime30-59DaysPastDueNotWorse,0.486538
3,DebtRatio,0.061371
4,MonthlyIncome,0.063546
5,NumberOfOpenCreditLinesAndLoans,0.000375
6,NumberOfTimes90DaysLate,0.485778
7,NumberRealEstateLoansOrLines,0.060411
8,NumberOfTime60-89DaysPastDueNotWorse,0.60005
9,NumberOfDependents,0.017337


In [20]:
woe_bins = pd.read_csv('../../data/report_woe.csv')
woe_bins.drop(columns=['Unnamed: 0'], inplace=True)
woe_bins.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39 entries, 0 to 38
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Variable         39 non-null     object 
 1   QuantileRange    37 non-null     object 
 2   EventsCount      39 non-null     int64  
 3   EventsSum        39 non-null     int64  
 4   % of Qualified   39 non-null     float64
 5   DelinquentCount  39 non-null     int64  
 6   % of Delinquent  39 non-null     float64
 7   WoE              39 non-null     float64
 8   IV               39 non-null     float64
dtypes: float64(4), int64(3), object(2)
memory usage: 2.9+ KB


In [None]:
woe_bins = woe_bins.merge(iv_df, left_on='Variable', right_on='Variable', how='left')

In [56]:
woe_bins.fillna('None', inplace=True)

In [59]:
woe_bins

Unnamed: 0,Variable,QuantileRange,EventsCount,EventsSum,% of Qualified,DelinquentCount,% of Delinquent,WoE,IV,total_iv
0,RevolvingUtilizationOfUnsecuredLines,"(-100.0, 0.5]",108712,2987,0.297925,105725,0.755319,0.930297,0.425512,0.980756
1,RevolvingUtilizationOfUnsecuredLines,"(0.5, 1.0]",37967,5802,0.578695,32165,0.229793,-0.923599,0.322246,0.980756
2,RevolvingUtilizationOfUnsecuredLines,"(1.0, 1.5]",2721,1081,0.10782,1640,0.011716,-2.219465,0.213298,0.980756
3,RevolvingUtilizationOfUnsecuredLines,"(1.5, 60000.0]",600,156,0.01556,444,0.003172,-1.590306,0.0197,0.980756
4,age,"(-100, 18]",1,0,5e-05,1,7e-06,-1.943128,8.3e-05,0.219824
5,age,"(18, 26]",4220,485,0.048374,3735,0.026684,-0.594921,0.012904,0.219824
6,age,"(26, 35]",17265,1906,0.190106,15359,0.109728,-0.54958,0.044174,0.219824
7,age,"(35, 45]",29819,2628,0.262118,27191,0.194258,-0.299612,0.020332,0.219824
8,age,"(45, 65]",70096,4317,0.43058,65779,0.469937,0.087465,0.003442,0.219824
9,age,"(65, 120]",28599,690,0.068821,27909,0.199387,1.063738,0.138888,0.219824


In [None]:
breaks_list = []

for variable in woe.columns:
    for item in woe_bins[(woe_bins['Variable'] == variable) & woe_bins['QuantileRange']].iterrows():
            if type(item[1]['QuantileRange']) == str:
                breaks_list.append(np.nan)
            else:
                range_bounds = item[1]['QuantileRange'][1:-1].split(', ')
                range_bounds = [float(num) for num in range_bounds]
                breaks_list.append(range_bounds[1])
            print(item[1]['QuantileRange'])

# len(breaks_list)

In [62]:
woe_bins['breaks'] = breaks_list

In [63]:
new_woe_column_names = {
    'Variable':"variable",
    'QuantileRange':'bin',
    '% of Qualified':'count_distr',
    'EventsCount':'good',
    'DelinquentCount':'bad',
    '% of Delinquent':'badprob',
    'WoE':'woe',
    'IV':'bin_iv'}
    
woe_bins.rename(columns=new_woe_column_names, inplace=True)


In [64]:
woe_bins['count'] = woe_bins['good'] + woe_bins['bad']

In [65]:
woe_bins['is_special_values'] = False

In [66]:
woe_bins.head()

Unnamed: 0,variable,bin,good,EventsSum,count_distr,bad,badprob,woe,bin_iv,total_iv,breaks,count,is_special_values
0,RevolvingUtilizationOfUnsecuredLines,"(-100.0, 0.5]",108712,2987,0.297925,105725,0.755319,0.930297,0.425512,0.980756,,214437,False
1,RevolvingUtilizationOfUnsecuredLines,"(0.5, 1.0]",37967,5802,0.578695,32165,0.229793,-0.923599,0.322246,0.980756,,70132,False
2,RevolvingUtilizationOfUnsecuredLines,"(1.0, 1.5]",2721,1081,0.10782,1640,0.011716,-2.219465,0.213298,0.980756,,4361,False
3,RevolvingUtilizationOfUnsecuredLines,"(1.5, 60000.0]",600,156,0.01556,444,0.003172,-1.590306,0.0197,0.980756,,1044,False
4,age,"(-100, 18]",1,0,5e-05,1,7e-06,-1.943128,8.3e-05,0.219824,,2,False


In [68]:
train = pd.read_csv("../../data/cs-training.csv")

In [69]:
bins = sc.woebin(train, y="SeriousDlqin2yrs")

[INFO] creating woe binning ...


In [86]:
breaks_adj = {
    'NumberOfDependents': [2, 20],
    'RevolvingUtilizationOfUnsecuredLines': [0.5, 1, 1.5],
    'age': [18, 26, 35, 45, 65],
    'NumberOfTime30-59DaysPastDueNotWorse': [1,2],
    'DebtRatio': [.3125, .625, 2.5,],
    'MonthlyIncome': [5000, 10_000],
    'NumberOfOpenCreditLinesAndLoans': [11.6, 17.4],
    'NumberOfTimes90DaysLate': [1, 1.5, 5, 10],
    'NumberRealEstateLoansOrLines': [0.5, 1, 3],
    'NumberOfTime60-89DaysPastDueNotWorse': [0, 1, 2]
}

In [88]:
# binning adjustment
bins_adj = sc.woebin(train, y="SeriousDlqin2yrs", breaks_list=breaks_adj)

[INFO] creating woe binning ...


In [89]:
# Drop columns that were not used in the logreg model
x_woe = woe.drop(columns=['SeriousDlqin2yrs', 'NumberOfOpenCreditLinesAndLoans'])
xcolumns = x_woe.columns

In [92]:
# create scorecard
card = sc.scorecard(bins_adj, lr, xcolumns)

In [93]:
card

{'basepoints':      variable  bin  points
 0  basepoints  NaN   575.0,
 'RevolvingUtilizationOfUnsecuredLines':                                 variable         bin  points
 36  RevolvingUtilizationOfUnsecuredLines  [-inf,0.5)   -46.0
 37  RevolvingUtilizationOfUnsecuredLines   [0.5,1.0)    46.0
 38  RevolvingUtilizationOfUnsecuredLines   [1.0,1.5)   110.0
 39  RevolvingUtilizationOfUnsecuredLines   [1.5,inf)    79.0,
 'age':    variable          bin  points
 30      age  [-inf,18.0)    84.0
 31      age  [18.0,26.0)    19.0
 32      age  [26.0,35.0)    19.0
 33      age  [35.0,45.0)    11.0
 34      age  [45.0,65.0)    -2.0
 35      age   [65.0,inf)   -34.0,
 'NumberOfTime30-59DaysPastDueNotWorse':                                variable         bin  points
 7  NumberOfTime30-59DaysPastDueNotWorse  [-inf,1.0)   -20.0
 8  NumberOfTime30-59DaysPastDueNotWorse   [1.0,2.0)    33.0
 9  NumberOfTime30-59DaysPastDueNotWorse   [2.0,inf)    70.0,
 'DebtRatio':      variable             bin  po

In [None]:
# credit score
# Example I # only total score
score1 = sc.scorecard_ply(dt_sel, card)
# Example II # credit score for both total and each variable
score2 = sc.scorecard_ply(dt_sel, card, only_total_score = False

In [None]:
   # scorecard
len_x = len(coef_df)
basepoints = a - b*coef_const
card = {}
if basepoints_eq0: 
    card['basepoints'] = pd.DataFrame({'variable':"basepoints", 'bin':np.nan, 'points':0}, index=np.arange(1))
    for i in coef_df.index:
        card[i] = bins.loc[bins['variable']==i, ['variable', 'bin', 'woe']]\
          .assign(points = lambda x: round(-b*x['woe']*coef_df[i] + basepoints/len_x), ndigits=digits)\
          [["variable", "bin", "points"]]
else:
    card['basepoints'] = pd.DataFrame({'variable':"basepoints", 'bin':np.nan, 'points':round(basepoints, ndigits=digits)}, index=np.arange(1))
    for i in coef_df.index:
        card[i] = bins.loc[bins['variable']==i, ['variable', 'bin', 'woe']]\
          .assign(points = lambda x: round(-b*x['woe']*coef_df[i]), ndigits=digits)\
          [["variable", "bin", "points"]]