In [1]:
# Load packages and libraries
import pandas as pd
import numpy as np
import scorecardpy as sc
import pickle

In [2]:
woe = pd.read_csv("../../data/woe_data.csv")
coefs = pd.read_csv("../../data/scorecard_coefs.csv")

In [3]:
woe.head()

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents,Qualified
0,1,-0.923599,-0.299612,-1.616726,-0.574709,0.12045,0.035635,0.176674,-0.59298,0.288208,0.031633,0
1,0,-0.923599,-0.299612,0.257826,0.144841,-0.274868,-0.004249,0.176674,-0.23597,0.288208,0.031633,1
2,0,-0.923599,-0.299612,0.257826,0.144841,-0.274868,-0.004249,0.176674,-0.23597,0.288208,0.031633,1
3,0,0.930297,-0.54958,0.257826,0.144841,-0.274868,-0.004249,0.176674,-0.23597,0.288208,0.031633,1
4,0,-0.923599,0.087465,0.257826,0.144841,0.460433,-0.004249,0.176674,0.256641,0.288208,0.031633,1


In [4]:
coefs

Unnamed: 0,variable,coefficient
0,RevolvingUtilizationOfUnsecuredLines,0.687558
1,age,0.462295
2,NumberOfTime30-59DaysPastDueNotWorse,0.510296
3,DebtRatio,0.839662
4,MonthlyIncome,0.192723
5,NumberOfTimes90DaysLate,0.509898
6,NumberRealEstateLoansOrLines,0.716267
7,NumberOfTime60-89DaysPastDueNotWorse,0.50017
8,NumberOfDependents,0.439731
9,Intercept,2.603919


In [5]:
with open('../../data/logreg.pkl', 'rb') as f:
    lr = pickle.load(f) # deserialize using load()
    print(type(lr))

<class 'sklearn.linear_model._logistic.LogisticRegression'>


In [6]:
pdo=20
factor=pdo/np.log(2)
offset = 600 - (factor*np.log(50))
print('Factor:', round(factor,2),'| Offset:', round(offset,2))

Factor: 28.85 | Offset: 487.12


## Calculate Scores for each bin

$Score_i= (βi × WoE_i + α/n) × Factor + Offset/n$

Where:
* βi — logistic regression coefficient for the variable Xi
* α — logistic regression intercept
* WoE — Weight of Evidence value for variable Xi
* n — number of independent variable Xi in the model
* Factor, Offset — known as scaling parameter, where

Factor = pdo/ln(2)

Offset = Target Score — (Factor × ln(Target Odds))

In [7]:
iv_df = pd.read_csv("../../data/report_iv.csv")
iv_df.drop(columns=['Unnamed: 0'], inplace=True)
iv_df.rename(columns={'IV':'total_iv'}, inplace=True)
iv_df

Unnamed: 0,Variable,total_iv
0,RevolvingUtilizationOfUnsecuredLines,0.980756
1,age,0.219824
2,NumberOfTime30-59DaysPastDueNotWorse,0.486538
3,DebtRatio,0.061371
4,MonthlyIncome,0.063546
5,NumberOfOpenCreditLinesAndLoans,0.000375
6,NumberOfTimes90DaysLate,0.485778
7,NumberRealEstateLoansOrLines,0.060411
8,NumberOfTime60-89DaysPastDueNotWorse,0.60005
9,NumberOfDependents,0.017337


In [8]:
woe_bins = pd.read_csv('../../data/report_woe.csv')
woe_bins.drop(columns=['Unnamed: 0'], inplace=True)
woe_bins.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39 entries, 0 to 38
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Variable         39 non-null     object 
 1   QuantileRange    37 non-null     object 
 2   EventsCount      39 non-null     int64  
 3   EventsSum        39 non-null     int64  
 4   % of Qualified   39 non-null     float64
 5   DelinquentCount  39 non-null     int64  
 6   % of Delinquent  39 non-null     float64
 7   WoE              39 non-null     float64
 8   IV               39 non-null     float64
dtypes: float64(4), int64(3), object(2)
memory usage: 2.9+ KB


In [9]:
woe_bins = woe_bins.merge(iv_df, left_on='Variable', right_on='Variable', how='left')

In [10]:
woe_bins.fillna('None', inplace=True)

In [11]:
woe_bins

Unnamed: 0,Variable,QuantileRange,EventsCount,EventsSum,% of Qualified,DelinquentCount,% of Delinquent,WoE,IV,total_iv
0,RevolvingUtilizationOfUnsecuredLines,"(-100.0, 0.5]",108712,2987,0.297925,105725,0.755319,0.930297,0.425512,0.980756
1,RevolvingUtilizationOfUnsecuredLines,"(0.5, 1.0]",37967,5802,0.578695,32165,0.229793,-0.923599,0.322246,0.980756
2,RevolvingUtilizationOfUnsecuredLines,"(1.0, 1.5]",2721,1081,0.10782,1640,0.011716,-2.219465,0.213298,0.980756
3,RevolvingUtilizationOfUnsecuredLines,"(1.5, 60000.0]",600,156,0.01556,444,0.003172,-1.590306,0.0197,0.980756
4,age,"(-100, 18]",1,0,5e-05,1,7e-06,-1.943128,8.3e-05,0.219824
5,age,"(18, 26]",4220,485,0.048374,3735,0.026684,-0.594921,0.012904,0.219824
6,age,"(26, 35]",17265,1906,0.190106,15359,0.109728,-0.54958,0.044174,0.219824
7,age,"(35, 45]",29819,2628,0.262118,27191,0.194258,-0.299612,0.020332,0.219824
8,age,"(45, 65]",70096,4317,0.43058,65779,0.469937,0.087465,0.003442,0.219824
9,age,"(65, 120]",28599,690,0.068821,27909,0.199387,1.063738,0.138888,0.219824


In [12]:
breaks_list = []

for variable in woe.columns:
    for item in woe_bins[(woe_bins['Variable'] == variable) & woe_bins['QuantileRange']].iterrows():
            if type(item[1]['QuantileRange']) == str:
                breaks_list.append(np.nan)
            else:
                range_bounds = item[1]['QuantileRange'][1:-1].split(', ')
                range_bounds = [float(num) for num in range_bounds]
                breaks_list.append(range_bounds[1])
            print(item[1]['QuantileRange'])

# len(breaks_list)

(-100.0, 0.5]
(0.5, 1.0]
(1.0, 1.5]
(1.5, 60000.0]
(-100, 18]
(18, 26]
(26, 35]
(35, 45]
(45, 65]
(65, 120]
(-0.098, 1.0]
(1.0, 2.0]
(2.0, 100.0]
(-100.0, 0.3125]
(0.3125, 0.625]
(0.625, 2.5]
(2.5, 330000.0]
(-100, 5000]
(10000, 1000000]
(5000, 10000]
None
(-100.0, 11.6]
(11.6, 17.4]
(17.4, 58.0]
(-100.0, 1.0]
(1.5, 5.0]
(10.0, 98.0]
(5.0, 10.0]
(-100.0, 0.5]
(0.5, 1.0]
(1.0, 3.0]
(3.0, 54.0]
(-100, 0]
(0, 1]
(1, 2]
(2, 100]
(-100, 2]
(2, 20]
None


In [13]:
woe_bins['breaks'] = breaks_list

In [14]:
new_woe_column_names = {
    'Variable':"variable",
    'QuantileRange':'bin',
    '% of Qualified':'count_distr',
    'EventsCount':'good',
    'DelinquentCount':'bad',
    '% of Delinquent':'badprob',
    'WoE':'woe',
    'IV':'bin_iv'}
    
woe_bins.rename(columns=new_woe_column_names, inplace=True)


In [15]:
woe_bins['count'] = woe_bins['good'] + woe_bins['bad']

In [16]:
woe_bins['is_special_values'] = False

In [17]:
woe_bins.head()

Unnamed: 0,variable,bin,good,EventsSum,count_distr,bad,badprob,woe,bin_iv,total_iv,breaks,count,is_special_values
0,RevolvingUtilizationOfUnsecuredLines,"(-100.0, 0.5]",108712,2987,0.297925,105725,0.755319,0.930297,0.425512,0.980756,,214437,False
1,RevolvingUtilizationOfUnsecuredLines,"(0.5, 1.0]",37967,5802,0.578695,32165,0.229793,-0.923599,0.322246,0.980756,,70132,False
2,RevolvingUtilizationOfUnsecuredLines,"(1.0, 1.5]",2721,1081,0.10782,1640,0.011716,-2.219465,0.213298,0.980756,,4361,False
3,RevolvingUtilizationOfUnsecuredLines,"(1.5, 60000.0]",600,156,0.01556,444,0.003172,-1.590306,0.0197,0.980756,,1044,False
4,age,"(-100, 18]",1,0,5e-05,1,7e-06,-1.943128,8.3e-05,0.219824,,2,False


In [18]:
train = pd.read_csv("../../data/cs-training.csv")

In [19]:
bins = sc.woebin(train, y="SeriousDlqin2yrs")

[INFO] creating woe binning ...


In [20]:
breaks_adj = {
    'NumberOfDependents': [2, 20],
    'RevolvingUtilizationOfUnsecuredLines': [0.5, 1, 1.5],
    'age': [18, 26, 35, 45, 65],
    'NumberOfTime30-59DaysPastDueNotWorse': [1,2],
    'DebtRatio': [.3125, .625, 2.5,],
    'MonthlyIncome': [5000, 10_000],
    'NumberOfOpenCreditLinesAndLoans': [11.6, 17.4],
    'NumberOfTimes90DaysLate': [1, 1.5, 5, 10],
    'NumberRealEstateLoansOrLines': [0.5, 1, 3],
    'NumberOfTime60-89DaysPastDueNotWorse': [0, 1, 2]
}

In [21]:
# binning adjustment
bins_adj = sc.woebin(train, y="SeriousDlqin2yrs", breaks_list=breaks_adj)

[INFO] creating woe binning ...


In [28]:
# Drop columns that were not used in the logreg model
x_woe = woe.drop(columns=['SeriousDlqin2yrs', 'NumberOfOpenCreditLinesAndLoans', 'Qualified'])
xcolumns = x_woe.columns

In [218]:
bins_adj

{'NumberOfTime30-59DaysPastDueNotWorse':                                variable         bin   count  count_distr  \
 0  NumberOfTime30-59DaysPastDueNotWorse  [-inf,1.0)  126018     0.840120   
 1  NumberOfTime30-59DaysPastDueNotWorse   [1.0,2.0)   16033     0.106887   
 2  NumberOfTime30-59DaysPastDueNotWorse   [2.0,inf)    7949     0.052993   
 
      good   bad   badprob       woe    bin_iv  total_iv breaks  \
 0  120977  5041  0.040002 -0.541721  0.195826  0.740481    1.0   
 1   13624  2409  0.150253  0.903654  0.129171  0.740481    2.0   
 2    5373  2576  0.324066  1.901126  0.415484  0.740481    inf   
 
    is_special_values  
 0              False  
 1              False  
 2              False  ,
 'DebtRatio':     variable             bin  count  count_distr   good   bad   badprob  \
 0  DebtRatio   [-inf,0.3125)  65035     0.433567  61240  3795  0.058353   
 1  DebtRatio  [0.3125,0.625)  38561     0.257073  35816  2745  0.071186   
 2  DebtRatio     [0.625,2.5)  15748     0

In [354]:
# create scorecard
card = sc.scorecard(bins_adj, lr, xcolumns, points0=1025, odds0=1/9, pdo=41, basepoints_eq0=False, digits=0)

In [355]:
scorecard = {}

for key, values in card.items():
    values.fillna('missing', inplace=True)
    points_list = []
    for row in values.iterrows():
        if row[1]['bin'] == 'missing':
            points_list.append(0)
        else:
            top_value = float(row[1]['bin'][1:-1].split(',')[1] )
            if top_value == np.inf:
                points_list.append(1_000_000_000)
            else:
                points_list.append(top_value)
        points_list.append(row[1]['points'])
    scorecard[key] = points_list

In [356]:
for key, values in scorecard.items():
    print(key)
    print(values)
    print("===")

basepoints
[0, 741.0]
===
RevolvingUtilizationOfUnsecuredLines
[0.5, 38.0, 1.0, -38.0, 1.5, -90.0, 1000000000, -65.0]
===
age
[18.0, -69.0, 26.0, -15.0, 35.0, -15.0, 45.0, -9.0, 65.0, 2.0, 1000000000, 28.0]
===
NumberOfTime30-59DaysPastDueNotWorse
[1.0, 16.0, 2.0, -27.0, 1000000000, -57.0]
===
DebtRatio
[0.3125, 7.0, 0.625, -3.0, 2.5, -29.0, 1000000000, 10.0]
===
MonthlyIncome
[0, 2.0, 5000.0, -3.0, 10000.0, 1.0, 1000000000, 5.0]
===
NumberOfTimes90DaysLate
[1.0, 12.0, 1.5, -59.0, 5.0, -84.0, 10.0, -100.0, 1000000000, -85.0]
===
NumberRealEstateLoansOrLines
[0.5, -10.0, 3.0, 10.0, 1000000000, -11.0]
===
NumberOfTime60-89DaysPastDueNotWorse
[1.0, 9.0, 2.0, -54.0, 1000000000, -82.0]
===
NumberOfDependents
[0, 11.0, 2.0, 2.0, 20.0, -7.0, 1000000000, -66.0]
===


In [361]:
scorecard

{'basepoints': [0, 741.0],
 'RevolvingUtilizationOfUnsecuredLines': [0.5,
  38.0,
  1.0,
  -38.0,
  1.5,
  -90.0,
  1000000000,
  -65.0],
 'age': [18.0,
  -69.0,
  26.0,
  -15.0,
  35.0,
  -15.0,
  45.0,
  -9.0,
  65.0,
  2.0,
  1000000000,
  28.0],
 'NumberOfTime30-59DaysPastDueNotWorse': [1.0,
  16.0,
  2.0,
  -27.0,
  1000000000,
  -57.0],
 'DebtRatio': [0.3125, 7.0, 0.625, -3.0, 2.5, -29.0, 1000000000, 10.0],
 'MonthlyIncome': [0, 2.0, 5000.0, -3.0, 10000.0, 1.0, 1000000000, 5.0],
 'NumberOfTimes90DaysLate': [1.0,
  12.0,
  1.5,
  -59.0,
  5.0,
  -84.0,
  10.0,
  -100.0,
  1000000000,
  -85.0],
 'NumberRealEstateLoansOrLines': [0.5, -10.0, 3.0, 10.0, 1000000000, -11.0],
 'NumberOfTime60-89DaysPastDueNotWorse': [1.0,
  9.0,
  2.0,
  -54.0,
  1000000000,
  -82.0],
 'NumberOfDependents': [0, 11.0, 2.0, 2.0, 20.0, -7.0, 1000000000, -66.0]}

In [357]:
{"":"DOB",
"":"NumberOfDependents",
"":"MonthlyIncome",
"":"MonthlyExpenses",
"":"Accounts",
"":"RevolvingUtilization",
"":"RealEstateLoans",
"":"ThirtyFiftyNinePastDue",
"":"SixtyEightyNinePastDue",
"":"NinetyDaysLate",}

{'': 'NinetyDaysLate'}

lowest values = -167
highest_value = 624

In [358]:
# credit score
train_score = sc.scorecard_ply(train, card, print_step=0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dat_score.loc[:,'score'] = card_basepoints + dat_score.sum(axis=1)


In [359]:
train_score.min()

score    364.0
dtype: float64

In [360]:
train_score.max()

score    877.0
dtype: float64