# Ensemble Voting Predictor

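For each predictor, independently calculate the median value. Each predictor then casts a vote, "up" (+1, towards rejection) or "down" (-1, towards approval), depending on its value relative to that median. At the end, the votes are summed: a total of 0 or less is assigned to "approve" and a total of 1 or more to "reject".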

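This notebook uses a similar logic for categorical variables: each category votes according to whether its approval rate is above or below the overall (baseline) approval rate.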

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import pandas
import numpy
import matplotlib.pyplot as plt
import itertools
import sklearn
from sklearn.metrics import log_loss

In [4]:
import loan_approval_lib
from loan_approval_lib import logspace, linspace

In [5]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.naive_bayes import CategoricalNB

In [6]:
import itertools

In [7]:
data = loan_approval_lib.load_original_data()

In [8]:
data

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,0.59,Y,3,1
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0.10,N,2,0
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,0.57,N,3,1
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,0.53,N,2,1
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,0.55,Y,4,1
...,...,...,...,...,...,...,...,...,...,...,...,...
32576,57,53000,MORTGAGE,1.0,PERSONAL,C,5800,13.16,0.11,N,30,0
32577,54,120000,MORTGAGE,4.0,PERSONAL,A,17625,7.49,0.15,N,19,0
32578,65,76000,RENT,3.0,HOMEIMPROVEMENT,B,35000,10.99,0.46,N,28,1
32579,56,150000,MORTGAGE,5.0,PERSONAL,B,15000,11.48,0.10,N,26,0


# Data Cleaning

In [9]:
# Columns that must be present (non-null) for a row to be kept.
columns = [
    'person_age',
    'person_income',
    'person_emp_length',
    'loan_amnt',
    'loan_int_rate',
    'loan_percent_income',
    'cb_person_default_on_file'
]

# Upper bounds for columns that contain implausible outliers
# (the raw data includes, for example, an employment length of 123 years).
max_valid_value = {
    'person_age': 100,
    'person_emp_length': 100,
}

for column in columns:
    row_count_1 = len(data)

    # Keep rows whose value is present ...
    keep = data[column].notna()
    # ... and, where a bound is defined for the column, does not exceed it.
    if column in max_valid_value:
        keep &= data[column] <= max_valid_value[column]
    data = data[keep]

    row_count_2 = len(data)
    row_count_diff = row_count_1 - row_count_2
    print(f'column {column}, number of removed rows: {row_count_diff}')

# `data` is now the result of repeated boolean indexing. Take an explicit copy
# so the vote/predict columns added in later cells are written to an
# independent DataFrame rather than to a slice of the loaded frame
# (which can raise SettingWithCopyWarning).
data = data.copy()
    

column person_age, number of removed rows: 5
column person_income, number of removed rows: 0
column person_emp_length, number of removed rows: 897
column loan_amnt, number of removed rows: 0
column loan_int_rate, number of removed rows: 3047
column loan_percent_income, number of removed rows: 0
column cb_person_default_on_file, number of removed rows: 0


# Binary voting logic for `loan_grade`

In [81]:
# Distinct grades in alphabetical order; reused by later cells.
loan_grades = sorted(data['loan_grade'].unique())

# Grades up to and including 'C' vote to approve (-1); all later grades vote to reject (+1).
data['loan_grade_vote'] = data['loan_grade'].map(lambda grade: -1 if grade <= 'C' else 1)

# Translate the vote into a 0/1 prediction that can be compared with `loan_status`.
data['loan_grade_predict'] = data['loan_grade_vote'].map({-1: 0, 1: 1})

In [82]:
data

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status,loan_grade_vote,person_home_ownership_vote,loan_intent_vote,loan_grade_predict
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0.10,N,2,0,-1,-1.0,-1.0,0
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,0.57,N,3,1,-1,-1.0,1.0,0
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,0.53,N,2,1,-1,1.0,1.0,0
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,0.55,Y,4,1,-1,1.0,1.0,0
5,21,9900,OWN,2.0,VENTURE,A,2500,7.14,0.25,N,2,1,-1,-1.0,-11.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32576,57,53000,MORTGAGE,1.0,PERSONAL,C,5800,13.16,0.11,N,30,0,-1,-1.0,-1.0,0
32577,54,120000,MORTGAGE,4.0,PERSONAL,A,17625,7.49,0.15,N,19,0,-1,-1.0,-1.0,0
32578,65,76000,RENT,3.0,HOMEIMPROVEMENT,B,35000,10.99,0.46,N,28,1,-1,1.0,1.0,0
32579,56,150000,MORTGAGE,5.0,PERSONAL,B,15000,11.48,0.10,N,26,0,-1,-1.0,-1.0,0


In [55]:
data[data['loan_grade'] == 'E']

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status,loan_grade_vote,person_home_ownership_vote
13,26,108160,RENT,4.0,EDUCATION,E,35000,18.39,0.32,N,4,1,1,1.0
21,25,137000,RENT,9.0,PERSONAL,E,34800,16.77,0.25,Y,2,0,1,1.0
26,21,11000,MORTGAGE,3.0,VENTURE,E,4575,17.74,0.42,Y,3,1,1,-1.0
36,22,48000,RENT,1.0,EDUCATION,E,30000,18.39,0.63,N,2,1,1,1.0
40,26,62050,RENT,6.0,MEDICAL,E,30000,17.99,0.41,N,2,1,1,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32411,52,71520,RENT,0.0,VENTURE,E,7000,16.82,0.10,N,28,1,1,1.0
32426,66,40000,RENT,2.0,EDUCATION,E,7000,15.68,0.17,Y,29,1,1,1.0
32473,64,56000,RENT,0.0,DEBTCONSOLIDATION,E,10000,16.82,0.18,Y,18,1,1,1.0
32559,51,58000,RENT,2.0,MEDICAL,E,1500,19.03,0.03,N,24,1,1,1.0


In [56]:
# Share of all rows with loan_status == 0; the reference rate for every category.
approved = data['loan_status'] == 0
p_baseline = approved.sum() / len(data)

# Per grade: share of rows with loan_status == 0, and its offset from the reference rate.
for loan_grade in loan_grades:
    in_grade = data['loan_grade'] == loan_grade
    p_tmp = (in_grade & approved).sum() / in_grade.sum()
    print(f'loan_grade={loan_grade}, {p_tmp}, {p_tmp - p_baseline}')

loan_grade=A, 0.903840017019466, 0.12045080215497872
loan_grade=B, 0.8411848289430539, 0.05779561407856659
loan_grade=C, 0.79691065473056, 0.013521439866072682
loan_grade=D, 0.4080689867570065, -0.37532022810748084
loan_grade=E, 0.35402298850574715, -0.42936622635874017
loan_grade=F, 0.3014354066985646, -0.4819538081659227
loan_grade=G, 0.01694915254237288, -0.7664400623221145


In [83]:
(data['loan_status'] == data['loan_grade_predict']).sum() / len(data)

np.float64(0.8180008382229673)

# Binary voting logic for `person_home_ownership`

- This one is different to the above
- I'm not sure if this is valid logic, really
- In the case of the above, some categories have probabilities of loan rejection greater than 0.5
- Whereas in this case, every category has a probability of loan rejection below 0.5
- However, the "baseline" probability of approval (`loan_status == 0`) over the whole dataset is 0.7834
- So subtract this value from the approval probability measured for each category
- The sign of the difference indicates whether the category is more likely (positive) or less likely (negative) than the average to be approved, and therefore whether it should vote to accept or to reject

I wonder whether I should apply the same logic to the above categories? (See above. Answer is it doesn't matter, result is the same.)

In [29]:
list(sorted(data['person_home_ownership'].unique()))

['MORTGAGE', 'OTHER', 'OWN', 'RENT']

In [43]:
# Are other values greater or smaller than this?
(data['loan_status'] == 0).sum() / len(data)

np.float64(0.7833892148644873)

In [40]:
(data['person_home_ownership'] == 'OWN').sum()

np.int64(2192)

In [41]:
((data['person_home_ownership'] == 'OWN') & (data['loan_status'] == 0)).sum()

np.int64(2046)

In [45]:
# own = accept
((data['person_home_ownership'] == 'OWN') & (data['loan_status'] == 0)).sum() / (data['person_home_ownership'] == 'OWN').sum()

np.float64(0.9333941605839416)

In [32]:
len(data[data['person_home_ownership'] == 'MORTGAGE'])

11798

In [31]:
(data['person_home_ownership'] == 'MORTGAGE').sum()

np.int64(11798)

In [33]:
((data['person_home_ownership'] == 'MORTGAGE') & (data['loan_status'] == 0)).sum()

np.int64(10313)

In [47]:
# mortgage = accept
((data['person_home_ownership'] == 'MORTGAGE') & (data['loan_status'] == 0)).sum() / (data['person_home_ownership'] == 'MORTGAGE').sum()

np.float64(0.8741312086794372)

In [37]:
(data['person_home_ownership'] == 'RENT').sum()

np.int64(14548)

In [39]:
((data['person_home_ownership'] == 'RENT') & (data['loan_status'] == 0)).sum()

np.int64(10004)

In [49]:
# rent = reject
((data['person_home_ownership'] == 'RENT') & (data['loan_status'] == 0)).sum() / (data['person_home_ownership'] == 'RENT').sum()

np.float64(0.687654660434424)

In [50]:
# other = reject
((data['person_home_ownership'] == 'OTHER') & (data['loan_status'] == 0)).sum() / (data['person_home_ownership'] == 'OTHER').sum()

np.float64(0.7127659574468085)

In [52]:
# Vote per category: -1 (accept) for OWN and MORTGAGE, +1 (reject) for RENT and OTHER.
# Float values keep the column dtype the same as when it is filled through `.loc`.
home_ownership_votes = {
    'OWN': -1.0,
    'MORTGAGE': -1.0,
    'RENT': 1.0,
    'OTHER': 1.0,
}
data['person_home_ownership_vote'] = data['person_home_ownership'].map(home_ownership_votes)

In [85]:
# A vote of -1 becomes prediction 0; any other value becomes prediction 1.
data['person_home_ownership_predict'] = data['person_home_ownership_vote'].ne(-1).astype('int64')

# Fraction of rows where the single-predictor prediction equals `loan_status`.
n_correct = (data['loan_status'] == data['person_home_ownership_predict']).sum()
n_correct / len(data)

np.float64(0.5912964515227718)

# Binary voting logic for `loan_intent`

In [57]:
list(sorted(data['loan_intent'].unique()))

['DEBTCONSOLIDATION',
 'EDUCATION',
 'HOMEIMPROVEMENT',
 'MEDICAL',
 'PERSONAL',
 'VENTURE']

In [73]:
loan_intentions = list(sorted(data['loan_intent'].unique()))

In [74]:
loan_intentions

['DEBTCONSOLIDATION',
 'EDUCATION',
 'HOMEIMPROVEMENT',
 'MEDICAL',
 'PERSONAL',
 'VENTURE']

In [76]:
# Share of all rows with loan_status == 0; the reference rate for every category.
approved = data['loan_status'] == 0
p_baseline = approved.sum() / len(data)

# Per intent: share of rows with loan_status == 0, and its offset from the reference rate.
for loan_intent in loan_intentions:
    has_intent = data['loan_intent'] == loan_intent
    p_tmp = (has_intent & approved).sum() / has_intent.sum()
    print(f'loan_intent={loan_intent}, {p_tmp}, {p_tmp - p_baseline}')

loan_intent=DEBTCONSOLIDATION, 0.7161007667031764, -0.06728844816131097
loan_intent=EDUCATION, 0.8297088740792704, 0.04631965921478309
loan_intent=HOMEIMPROVEMENT, 0.743277048155097, -0.04011216670939033
loan_intent=MEDICAL, 0.7315322123559418, -0.051857002508545524
loan_intent=PERSONAL, 0.8026666666666666, 0.019277451802179324
loan_intent=VENTURE, 0.8537707541508301, 0.07038153928634283


In [None]:
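# Vote per category, from the sign of (category approval rate - baseline approval rate):
# DEBTCONSOLIDATION = reject
# EDUCATION = accept
# HOMEIMPROVEMENT = reject
# MEDICAL = reject
# PERSONAL = accept
# VENTURE = accept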

In [65]:
((data['loan_intent'] == 'EDUCATION') & (data['loan_status'] == 0)).sum()

np.int64(4731)

In [66]:
(data['loan_intent'] == 'EDUCATION').sum()

np.int64(5702)

In [68]:
((data['loan_intent'] == 'EDUCATION') & (data['loan_status'] == 0)).sum()

np.int64(4731)

In [84]:
# Vote per loan intent: +1 (reject) or -1 (accept).
# Float values keep the column dtype the same as when it is filled through `.loc`.
loan_intent_votes = {
    'DEBTCONSOLIDATION': 1.0,
    'EDUCATION': -1.0,
    'HOMEIMPROVEMENT': 1.0,
    'MEDICAL': 1.0,
    'PERSONAL': -1.0,
    'VENTURE': -1.0,
}
data['loan_intent_vote'] = data['loan_intent'].map(loan_intent_votes)

In [86]:
# A vote of -1 becomes prediction 0; any other value becomes prediction 1.
data['loan_intent_predict'] = data['loan_intent_vote'].ne(-1).astype('int64')

# Fraction of rows where the single-predictor prediction equals `loan_status`.
n_correct = (data['loan_status'] == data['loan_intent_predict']).sum()
n_correct / len(data)

np.float64(0.5745319921765857)

# Binary voting logic for `cb_person_default_on_file`

In [100]:
# Share of all rows with loan_status == 0; the reference rate for every category.
approved = data['loan_status'] == 0
p_baseline = approved.sum() / len(data)

# Per flag value: share of rows with loan_status == 0, and its offset from the reference rate.
for cb_person_default_on_file in sorted(data['cb_person_default_on_file'].unique()):
    has_flag = data['cb_person_default_on_file'] == cb_person_default_on_file
    p_tmp = (has_flag & approved).sum() / has_flag.sum()
    print(f'cb_person_default_on_file={cb_person_default_on_file}, {p_tmp}, {p_tmp - p_baseline}')

cb_person_default_on_file=N, 0.8182745431364216, 0.034885328271934246
cb_person_default_on_file=Y, 0.62250098000784, -0.16088823485664727


In [101]:
# 'Y' votes to reject (+1), 'N' votes to accept (-1).
# Float values keep the column dtype the same as when it is filled through `.loc`.
default_on_file_votes = {'Y': 1.0, 'N': -1.0}
data['cb_person_default_on_file_vote'] = data['cb_person_default_on_file'].map(default_on_file_votes)

# A vote of -1 becomes prediction 0; any other value becomes prediction 1.
data['cb_person_default_on_file_predict'] = data['cb_person_default_on_file_vote'].ne(-1).astype('int64')

# Fraction of rows where the single-predictor prediction equals `loan_status`.
n_correct = (data['loan_status'] == data['cb_person_default_on_file_predict']).sum()
n_correct / len(data)

np.float64(0.739731768650461)

In [27]:
data.columns

Index(['person_age', 'person_income', 'person_home_ownership',
       'person_emp_length', 'loan_intent', 'loan_grade', 'loan_amnt',
       'loan_int_rate', 'loan_percent_income', 'cb_person_default_on_file',
       'cb_person_cred_hist_length', 'loan_status', 'loan_grade_vote'],
      dtype='object')

# Voting combinations

In [89]:
'loan_status_predict' in data.columns

False

In [118]:
# NOTE: this cell reads the 'loan_status_vote' column, which is assigned in the
# next code cell; on a fresh kernel that cell has to be run first.
# Counts the rows whose combined vote is negative.
(data['loan_status_vote'] < 0).sum()

# Row counts observed for each sign of the combined vote:
# 3913 > 0
# 1239 == 0
# 23480 < 0

np.int64(23480)

In [120]:
# Weight applied to each predictor's vote in the combined score.
# These are the weights of the combination reported in the notes below
# (accuracy 0.829).
vote_weights = {
    'person_home_ownership_vote': 1,
    'loan_intent_vote': 1,
    'loan_grade_vote': 4,
    'cb_person_default_on_file_vote': 2,
}

# Weighted sum of the individual votes (negative = accept, positive = reject).
data['loan_status_vote'] = sum(
    weight * data[column] for column, weight in vote_weights.items()
)

# A positive total predicts 1; a total of zero or less predicts 0.
# Stored as 0/1 integers, like the other `*_predict` columns, rather than as booleans.
data['loan_status_predict'] = (data['loan_status_vote'] > 0).astype('int64')

# Fraction of rows where the combined prediction equals `loan_status`.
(data['loan_status'] == data['loan_status_predict']).sum() / len(data)

np.float64(0.8285484772282761)

### Notes

- `loan_intent_vote` by itself produces accuracy of 0.575
- `person_home_ownership_vote` by itself produces accuracy of 0.591
- both together produce an accuracy of 0.713

This is less than the accuracy of 0.783 obtained by simply predicting that all loans are approved, but the significant improvement produced by combining two predictors suggests this voting-based method may be useful.

- Adding `loan_grade_vote` increases the accuracy to 0.736, which is only a marginal improvement
- Removing all except `loan_grade_vote` increases the accuracy to 0.818. This suggests the other variables are reducing the effectiveness of the prediction by introducing noise

###### `cb_person_default_on_file_vote`:

- `cb_person_default_on_file_vote`: 0.740
- `cb_person_default_on_file_vote` + `person_home_ownership_vote`: 0.777
- `cb_person_default_on_file_vote` + `loan_intent_vote`: 0.781
- `cb_person_default_on_file_vote` + `loan_grade_vote`: 0.801

Managed to get an accuracy of 0.829 by using the following combination:

- `person_home_ownership_vote` + `loan_intent_vote` + 4 * `loan_grade_vote` + 2 * `cb_person_default_on_file_vote`