# Ensemble Voting Predictor

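For each predictor, independently calculate the median value. Each predictor then votes "up" (+1, towards "reject") or "down" (-1, towards "approve") depending on whether its value is above or below its median. At the end, the votes are summed: a total of 0 or less is assigned to "approve" and 1 or more to "reject".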

In [127]:
# Enable IPython's autoreload extension; mode 2 re-imports modules before each cell executes.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
import pandas
import numpy
import matplotlib.pyplot as plt
import itertools
import sklearn
from sklearn.metrics import log_loss

In [4]:
import loan_approval_lib
from loan_approval_lib import logspace, linspace

In [5]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.naive_bayes import CategoricalNB

In [6]:
import itertools

In [7]:
# Load the loan dataset through the project helper; `data` is the frame that the
# cleaning and modelling cells below filter and read.
data = loan_approval_lib.load_original_data()

In [8]:
# Display the loaded frame (pandas elides the middle rows in the rendered output).
data

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,0.59,Y,3,1
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0.10,N,2,0
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,0.57,N,3,1
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,0.53,N,2,1
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,0.55,Y,4,1
...,...,...,...,...,...,...,...,...,...,...,...,...
32576,57,53000,MORTGAGE,1.0,PERSONAL,C,5800,13.16,0.11,N,30,0
32577,54,120000,MORTGAGE,4.0,PERSONAL,A,17625,7.49,0.15,N,19,0
32578,65,76000,RENT,3.0,HOMEIMPROVEMENT,B,35000,10.99,0.46,N,28,1
32579,56,150000,MORTGAGE,5.0,PERSONAL,B,15000,11.48,0.10,N,26,0


# Data Cleaning

In [30]:
# Columns that must be populated for a row to be kept.
columns = [
    'person_age',
    'person_income',
    'person_emp_length',
    'loan_amnt',
    'loan_int_rate',
    'loan_percent_income',
    'cb_person_default_on_file'
]

# Columns with an upper plausibility bound; rows above the bound are dropped.
# The `<=` comparison is False for NaN, so it also removes missing values.
upper_bound_by_column = {
    'person_age': 100,
    'person_emp_length': 100,
}

for column in columns:
    rows_before = len(data)

    if column in upper_bound_by_column:
        data = data[data[column] <= upper_bound_by_column[column]]

    # Keep only rows where this column is populated.
    data = data[data[column].notna()]

    rows_removed = rows_before - len(data)
    print(f'column {column}, number of removed rows: {rows_removed}')
    

column person_age, number of removed rows: 5
column person_income, number of removed rows: 0
column person_emp_length, number of removed rows: 897
column loan_amnt, number of removed rows: 0
column loan_int_rate, number of removed rows: 3047
column loan_percent_income, number of removed rows: 0
column cb_person_default_on_file, number of removed rows: 0


# Calculate median for all numerical values

# Calculate median and vote direction for `person_age`

The value of `P` is > 0.5, therefore if `person_age` is above the median value, vote "reject"

In [31]:
# Median age over the cleaned rows; used below as the vote threshold.
data_person_age = data.loc[:, 'person_age']
median_person_age = numpy.median(data_person_age)
median_person_age

np.float64(26.0)

In [32]:
# Direction is 1 when the age is strictly above the median, else 0.
# P is the share of rows whose direction equals loan_status.
age_offset = data['person_age'] - median_person_age
data_person_age_2 = data[['person_age', 'loan_status']].assign(
    person_age_sub_median=age_offset,
    person_age_direction=(age_offset > 0).astype(int),
)
age_direction_matches = data_person_age_2['person_age_direction'] == data_person_age_2['loan_status']
P_person_age = age_direction_matches.sum() / len(data_person_age_2)
P_person_age

np.float64(0.5166946074322436)

In [33]:
# Rows with direction 0 (age at or below the median) and loan_status 0.
direction_is_0 = data_person_age_2['person_age_direction'] == 0
status_is_0 = data_person_age_2['loan_status'] == 0
int((direction_is_0 & status_is_0).sum())

12136

In [34]:
# Rows with direction 1 (age above the median) and loan_status 1.
direction_is_1 = data_person_age_2['person_age_direction'] == 1
status_is_1 = data_person_age_2['loan_status'] == 1
int((direction_is_1 & status_is_1).sum())

2658

In [36]:
# Number of rows where the age direction agrees with loan_status:
# (direction 0, status 0) plus (direction 1, status 1).
# Computed from the frame instead of re-typing the two counts, so it stays
# correct if the data or the cleaning step changes.
age_direction = data_person_age_2['person_age_direction']
age_status = data_person_age_2['loan_status']
int(
    ((age_direction == 0) & (age_status == 0)).sum()
    + ((age_direction == 1) & (age_status == 1)).sum()
)

14794

In [37]:
# Rows with direction 0 (age at or below the median) but loan_status 1.
direction_is_0 = data_person_age_2['person_age_direction'] == 0
status_is_1 = data_person_age_2['loan_status'] == 1
int((direction_is_0 & status_is_1).sum())

3544

In [38]:
# Rows with direction 1 (age above the median) but loan_status 0.
direction_is_1 = data_person_age_2['person_age_direction'] == 1
status_is_0 = data_person_age_2['loan_status'] == 0
int((direction_is_1 & status_is_0).sum())

10294

In [39]:
# Number of rows where the age direction disagrees with loan_status:
# (direction 0, status 1) plus (direction 1, status 0).
# Computed from the frame instead of re-typing the two counts, so it stays
# correct if the data or the cleaning step changes.
age_direction = data_person_age_2['person_age_direction']
age_status = data_person_age_2['loan_status']
int(
    ((age_direction == 0) & (age_status == 1)).sum()
    + ((age_direction == 1) & (age_status == 0)).sum()
)

13838

# Calculate median and vote direction for `person_income`

The value of `P` is <= 0.5, therefore if `person_income` is below or equal to the median value, vote "reject"

In [89]:
# Median split for person_income: direction is 1 when the value is strictly above the median.
median_person_income = numpy.median(data['person_income'])
print(median_person_income)

person_income_offset = data['person_income'] - median_person_income
data_person_income = data[['person_income', 'loan_status']].assign(
    person_income_sub_median=person_income_offset,
    person_income_direction=(person_income_offset > 0).astype(int),
)

# P: fraction of rows whose direction equals loan_status.
person_income_matches = data_person_income['person_income_direction'] == data_person_income['loan_status']
P_person_income = person_income_matches.sum() / len(data_person_income)
print(P_person_income)

55900.0
0.4150600726459905


# Calculate median and vote direction for `person_emp_length`

The value of `P` is <= 0.5, therefore if `person_emp_length` is below or equal to the median value, vote "reject"

In [90]:
# Median split for person_emp_length: direction is 1 when the value is strictly above the median.
median_person_emp_length = numpy.median(data['person_emp_length'])
print(median_person_emp_length)

person_emp_length_offset = data['person_emp_length'] - median_person_emp_length
data_person_emp_length = data[['person_emp_length', 'loan_status']].assign(
    person_emp_length_sub_median=person_emp_length_offset,
    person_emp_length_direction=(person_emp_length_offset > 0).astype(int),
)

# P: fraction of rows whose direction equals loan_status.
person_emp_length_matches = data_person_emp_length['person_emp_length_direction'] == data_person_emp_length['loan_status']
P_person_emp_length = person_emp_length_matches.sum() / len(data_person_emp_length)
print(P_person_emp_length)

4.0
0.4895920648225761


# Calculate median and vote direction for `loan_amnt`

The value of `P` is > 0.5, therefore if `loan_amnt` is above the median value, vote "reject"

In [91]:
# Median split for loan_amnt: direction is 1 when the value is strictly above the median.
median_loan_amnt = numpy.median(data['loan_amnt'])
print(median_loan_amnt)

loan_amnt_offset = data['loan_amnt'] - median_loan_amnt
data_loan_amnt = data[['loan_amnt', 'loan_status']].assign(
    loan_amnt_sub_median=loan_amnt_offset,
    loan_amnt_direction=(loan_amnt_offset > 0).astype(int),
)

# P: fraction of rows whose direction equals loan_status.
loan_amnt_matches = data_loan_amnt['loan_amnt_direction'] == data_loan_amnt['loan_status']
P_loan_amnt = loan_amnt_matches.sum() / len(data_loan_amnt)
print(P_loan_amnt)

8000.0
0.5398505169041632


# Calculate median and vote direction for `loan_int_rate`

The value of `P` is > 0.5, therefore if `loan_int_rate` is above the median value, vote "reject"

In [92]:
# Median split for loan_int_rate: direction is 1 when the value is strictly above the median.
median_loan_int_rate = numpy.median(data['loan_int_rate'])
print(median_loan_int_rate)

loan_int_rate_offset = data['loan_int_rate'] - median_loan_int_rate
data_loan_int_rate = data[['loan_int_rate', 'loan_status']].assign(
    loan_int_rate_sub_median=loan_int_rate_offset,
    loan_int_rate_direction=(loan_int_rate_offset > 0).astype(int),
)

# P: fraction of rows whose direction equals loan_status.
loan_int_rate_matches = data_loan_int_rate['loan_int_rate_direction'] == data_loan_int_rate['loan_status']
P_loan_int_rate = loan_int_rate_matches.sum() / len(data_loan_int_rate)
print(P_loan_int_rate)

10.99
0.6015996088292819


# Calculate median and vote direction for `loan_percent_income`

The value of `P` is > 0.5, therefore if `loan_percent_income` is above the median value, vote "reject"

In [93]:
# Median split for loan_percent_income: direction is 1 when the value is strictly above the median.
median_loan_percent_income = numpy.median(data['loan_percent_income'])
print(median_loan_percent_income)

loan_percent_income_offset = data['loan_percent_income'] - median_loan_percent_income
data_loan_percent_income = data[['loan_percent_income', 'loan_status']].assign(
    loan_percent_income_sub_median=loan_percent_income_offset,
    loan_percent_income_direction=(loan_percent_income_offset > 0).astype(int),
)

# P: fraction of rows whose direction equals loan_status.
loan_percent_income_matches = data_loan_percent_income['loan_percent_income_direction'] == data_loan_percent_income['loan_status']
P_loan_percent_income = loan_percent_income_matches.sum() / len(data_loan_percent_income)
print(P_loan_percent_income)

0.15
0.6175956971221012


# Calculate median and vote direction for `cb_person_cred_hist_length`

The value of `P` is > 0.5, therefore if `cb_person_cred_hist_length` is above the median value, vote "reject"

In [94]:
# Median split for cb_person_cred_hist_length: direction is 1 when the value is strictly above the median.
median_cb_person_cred_hist_length = numpy.median(data['cb_person_cred_hist_length'])
print(median_cb_person_cred_hist_length)

cb_person_cred_hist_length_offset = data['cb_person_cred_hist_length'] - median_cb_person_cred_hist_length
data_cb_person_cred_hist_length = data[['cb_person_cred_hist_length', 'loan_status']].assign(
    cb_person_cred_hist_length_sub_median=cb_person_cred_hist_length_offset,
    cb_person_cred_hist_length_direction=(cb_person_cred_hist_length_offset > 0).astype(int),
)

# P: fraction of rows whose direction equals loan_status.
cb_person_cred_hist_length_matches = data_cb_person_cred_hist_length['cb_person_cred_hist_length_direction'] == data_cb_person_cred_hist_length['loan_status']
P_cb_person_cred_hist_length = cb_person_cred_hist_length_matches.sum() / len(data_cb_person_cred_hist_length)
print(P_cb_person_cred_hist_length)

4.0
0.5166946074322436


# Apply vote logic

In [113]:
data_with_vote = data.copy()

# One rule per predictor: (column, median, reject_when_above).
# A vote of +1 counts towards "reject" and -1 towards "approve".
# reject_when_above=True  -> +1 when value >  median, else -1
# reject_when_above=False -> +1 when value <= median, else -1
vote_rules = [
    ('person_age', median_person_age, True),
    ('person_income', median_person_income, False),
    ('person_emp_length', median_person_emp_length, False),
    ('loan_amnt', median_loan_amnt, True),
    ('loan_int_rate', median_loan_int_rate, True),
    ('loan_percent_income', median_loan_percent_income, True),
    ('cb_person_cred_hist_length', median_cb_person_cred_hist_length, True),
]

for rule_column, rule_median, reject_when_above in vote_rules:
    rule_values = data_with_vote[rule_column]
    if reject_when_above:
        votes_reject = rule_values > rule_median
    else:
        votes_reject = rule_values <= rule_median
    # Map True/False onto +1/-1.
    data_with_vote[f'{rule_column}_vote'] = 2 * votes_reject.astype(int) - 1

# Total vote per row: the sum of the seven individual +1/-1 votes.
data_with_vote['loan_status_vote'] = sum(
    data_with_vote[f'{rule_column}_vote'] for rule_column, _, _ in vote_rules
)

data_with_vote

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status,person_age_vote,person_income_vote,person_emp_length_vote,loan_amnt_vote,loan_int_rate_vote,loan_percent_income_vote,cb_person_cred_hist_length_vote,loan_status_vote
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0.10,N,2,0,-1,1,-1,-1,1,-1,-1,-3
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,0.57,N,3,1,-1,1,1,-1,1,1,-1,1
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,0.53,N,2,1,-1,-1,1,1,1,1,-1,1
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,0.55,Y,4,1,-1,1,-1,1,1,1,-1,1
5,21,9900,OWN,2.0,VENTURE,A,2500,7.14,0.25,N,2,1,-1,1,1,-1,-1,1,-1,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32576,57,53000,MORTGAGE,1.0,PERSONAL,C,5800,13.16,0.11,N,30,0,1,1,1,-1,1,-1,1,3
32577,54,120000,MORTGAGE,4.0,PERSONAL,A,17625,7.49,0.15,N,19,0,1,-1,1,1,-1,-1,1,1
32578,65,76000,RENT,3.0,HOMEIMPROVEMENT,B,35000,10.99,0.46,N,28,1,1,-1,1,1,-1,1,1,3
32579,56,150000,MORTGAGE,5.0,PERSONAL,B,15000,11.48,0.10,N,26,0,1,-1,-1,1,1,-1,1,1


In [114]:
# A positive vote total predicts loan_status 1; zero or negative predicts 0.
vote_total_is_positive = data_with_vote['loan_status_vote'] > 0
data_with_vote['loan_status_predict'] = vote_total_is_positive.astype(int)

# Accuracy: share of rows where the prediction equals loan_status.
prediction_is_correct = data_with_vote['loan_status_predict'] == data_with_vote['loan_status']
accuracy = prediction_is_correct.sum() / len(data_with_vote)
accuracy

np.float64(0.5997834590667784)

In [145]:
# test different combinations
#
# Comment entries of `selected_vote_columns` in or out to try a subset of predictors.
# The subset result is kept in local Series instead of overwriting
# data_with_vote['loan_status_vote'] / data_with_vote['loan_status_predict'], so the
# full 7-predictor model stored in `data_with_vote` is still intact when the frame is
# written to CSV and compared against the loop-built model further down.

selected_vote_columns = [
    #'person_age_vote',
    'person_income_vote',
    #'person_emp_length_vote',
    #'loan_amnt_vote',
    'loan_int_rate_vote',
    'loan_percent_income_vote',
    #'cb_person_cred_hist_length_vote',
]

# Row-wise vote total over the selected predictors only.
subset_vote = data_with_vote[selected_vote_columns].sum(axis=1)

# A positive vote total predicts loan_status 1; zero or negative predicts 0.
subset_predict = (subset_vote > 0).astype(int)
accuracy = (subset_predict == data_with_vote['loan_status']).sum() / len(data_with_vote)
accuracy

np.float64(0.6346046381670858)

In [146]:
# all = 0.600
# no person age: 0.690
# no person income: 0.642
# no person emp length: 0.666
# no loan amount: 0.662
# no loan int rate: 0.641
# no loan percent income: 0.637
# no credit history length: 0.690

# single variable
# person age: 0.517
# person income: 0.585
# person emp length: 0.510
# loan amount: 0.540
# loan int rate: 0.602
# loan percent income: 0.618
# credit hist length: 0.517

# best two
# loan int rate and loan percent income: 0.740

# best three
# loan int rate, loan percent income, person income: 0.635 (gets worse)

# TODO: for every possible combination of these 7 columns, calculate the accuracy

# the simple way to do this is to introduce a coefficient for each column, which is either set to 0 or 1

In [115]:
# Snapshot the hand-written vote model to disk so the loop-built model below can be
# compared against it. The index is not written (index=False).
data_with_vote.to_csv('data_with_vote.csv', index=False)

# Build model using for loop

In [98]:
# Numerical predictors that take part in the vote.
columns = [
    'person_age',
    'person_income',
    'person_emp_length',
    'loan_amnt',
    'loan_int_rate',
    'loan_percent_income',
    'cb_person_cred_hist_length',
]

median_by_column = {}
P_by_column = {}
sign_by_column = {}

for column in columns:
    median_value = numpy.median(data[column])

    # 1 where the value is strictly above the column median, else 0.
    above_median = (data[column] - median_value > 0).astype(int)

    # Share of rows where "above median" coincides with loan_status.
    P_value = (above_median == data['loan_status']).sum() / len(data)

    median_by_column[column] = median_value
    P_by_column[column] = P_value

    # P below 0.5: "above median" agrees with loan_status less than half the time,
    # so the vote direction for this column is flipped.
    sign_by_column[column] = -1 if P_value < 0.5 else 1

In [99]:
# Show the per-column medians computed by the loop above.
median_by_column

{'person_age': np.float64(26.0),
 'person_income': np.float64(55900.0),
 'person_emp_length': np.float64(4.0),
 'loan_amnt': np.float64(8000.0),
 'loan_int_rate': np.float64(10.99),
 'loan_percent_income': np.float64(0.15),
 'cb_person_cred_hist_length': np.float64(4.0)}

In [100]:
data_with_vote = data.copy()

vote_column_names = []
for column in columns:
    above_median = data_with_vote[column] > median_by_column[column]
    # +1 above the median, -1 at or below it; flipped when the column's sign is -1.
    data_with_vote[f'{column}_vote'] = sign_by_column[column] * (2 * above_median.astype(int) - 1)
    vote_column_names.append(f'{column}_vote')

# Total vote per row: the sum of every per-column vote.
data_with_vote['loan_status_vote'] = data_with_vote[vote_column_names].sum(axis=1)

data_with_vote

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status,person_age_vote,person_income_vote,person_emp_length_vote,loan_amnt_vote,loan_int_rate_vote,loan_percent_income_vote,cb_person_cred_hist_length_vote,loan_status_vote
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0.10,N,2,0,-1,1,-1,-1,1,-1,-1,-3
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,0.57,N,3,1,-1,1,1,-1,1,1,-1,1
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,0.53,N,2,1,-1,-1,1,1,1,1,-1,1
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,0.55,Y,4,1,-1,1,-1,1,1,1,-1,1
5,21,9900,OWN,2.0,VENTURE,A,2500,7.14,0.25,N,2,1,-1,1,1,-1,-1,1,-1,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32576,57,53000,MORTGAGE,1.0,PERSONAL,C,5800,13.16,0.11,N,30,0,1,1,1,-1,1,-1,1,3
32577,54,120000,MORTGAGE,4.0,PERSONAL,A,17625,7.49,0.15,N,19,0,1,-1,1,1,-1,-1,1,1
32578,65,76000,RENT,3.0,HOMEIMPROVEMENT,B,35000,10.99,0.46,N,28,1,1,-1,1,1,-1,1,1,3
32579,56,150000,MORTGAGE,5.0,PERSONAL,B,15000,11.48,0.10,N,26,0,1,-1,-1,1,1,-1,1,1


In [101]:
# A positive vote total predicts loan_status 1; zero or negative predicts 0.
vote_total_is_positive = data_with_vote['loan_status_vote'] > 0
data_with_vote['loan_status_predict'] = vote_total_is_positive.astype(int)

# Accuracy: share of rows where the prediction equals loan_status.
prediction_is_correct = data_with_vote['loan_status_predict'] == data_with_vote['loan_status']
accuracy = prediction_is_correct.sum() / len(data_with_vote)
accuracy

np.float64(0.5997834590667784)

In [120]:
# check that the new model which is built using a for loop produces the same results
# The CSV was written without its index, so the in-memory index is reused here to make
# the element-wise comparisons below line up row by row (this relies on both frames
# having the same number of rows in the same order).
data_with_vote_loaded = pandas.read_csv('data_with_vote.csv')
data_with_vote_loaded.index = data_with_vote.index

In [121]:
# check a single column of values
column = 'person_emp_length_vote'

# Number of rows where the loop-built vote differs from the vote loaded from CSV.
single_column_mismatch = data_with_vote[column] != data_with_vote_loaded[column]
single_column_mismatch.sum()

np.int64(0)

In [122]:
# view different values for single column
# Select the rows of the loop-built frame whose value differs from the loaded frame.
rows_differ = data_with_vote[column] != data_with_vote_loaded[column]
data_with_vote.loc[rows_differ]

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,...,loan_status,person_age_vote,person_income_vote,person_emp_length_vote,loan_amnt_vote,loan_int_rate_vote,loan_percent_income_vote,cb_person_cred_hist_length_vote,loan_status_vote,loan_status_predict


In [123]:
# check lengths
# Row count of the loop-built frame; it should equal the row count of the loaded frame.
len(data_with_vote)

28632

In [106]:
# Row count of the frame read back from data_with_vote.csv.
len(data_with_vote_loaded)

28632

In [124]:
# check data

In [107]:
# Display the loop-built frame for a visual comparison with the loaded frame.
data_with_vote

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,...,loan_status,person_age_vote,person_income_vote,person_emp_length_vote,loan_amnt_vote,loan_int_rate_vote,loan_percent_income_vote,cb_person_cred_hist_length_vote,loan_status_vote,loan_status_predict
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0.10,N,...,0,-1,1,-1,-1,1,-1,-1,-3,0
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,0.57,N,...,1,-1,1,1,-1,1,1,-1,1,1
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,0.53,N,...,1,-1,-1,1,1,1,1,-1,1,1
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,0.55,Y,...,1,-1,1,-1,1,1,1,-1,1,1
5,21,9900,OWN,2.0,VENTURE,A,2500,7.14,0.25,N,...,1,-1,1,1,-1,-1,1,-1,-1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32576,57,53000,MORTGAGE,1.0,PERSONAL,C,5800,13.16,0.11,N,...,0,1,1,1,-1,1,-1,1,3,1
32577,54,120000,MORTGAGE,4.0,PERSONAL,A,17625,7.49,0.15,N,...,0,1,-1,1,1,-1,-1,1,1,1
32578,65,76000,RENT,3.0,HOMEIMPROVEMENT,B,35000,10.99,0.46,N,...,1,1,-1,1,1,-1,1,1,3,1
32579,56,150000,MORTGAGE,5.0,PERSONAL,B,15000,11.48,0.10,N,...,0,1,-1,-1,1,1,-1,1,1,1


In [108]:
# Display the frame read back from CSV.
# NOTE(review): the saved output of this cell shows person_emp_length_vote and
# loan_status_vote values that differ from the data_with_vote display, while the
# validation cell reports 0 mismatches. The saved output looks stale (earlier
# execution count) - re-run to confirm.
data_with_vote_loaded

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,...,loan_status,person_age_vote,person_income_vote,person_emp_length_vote,loan_amnt_vote,loan_int_rate_vote,loan_percent_income_vote,cb_person_cred_hist_length_vote,loan_status_vote,loan_status_predict
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0.10,N,...,0,-1,1,1,-1,1,-1,-1,-1,0
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,0.57,N,...,1,-1,1,-1,-1,1,1,-1,-1,0
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,0.53,N,...,1,-1,-1,-1,1,1,1,-1,-1,0
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,0.55,Y,...,1,-1,1,1,1,1,1,-1,3,1
5,21,9900,OWN,2.0,VENTURE,A,2500,7.14,0.25,N,...,1,-1,1,-1,-1,-1,1,-1,-3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32576,57,53000,MORTGAGE,1.0,PERSONAL,C,5800,13.16,0.11,N,...,0,1,1,-1,-1,1,-1,1,1,1
32577,54,120000,MORTGAGE,4.0,PERSONAL,A,17625,7.49,0.15,N,...,0,1,-1,-1,1,-1,-1,1,-1,0
32578,65,76000,RENT,3.0,HOMEIMPROVEMENT,B,35000,10.99,0.46,N,...,1,1,-1,-1,1,-1,1,1,1,1
32579,56,150000,MORTGAGE,5.0,PERSONAL,B,15000,11.48,0.10,N,...,0,1,-1,1,1,1,-1,1,3,1


In [119]:
# validation, all ok

# For every column, print how many rows differ between the loop-built frame and the
# frame loaded from CSV (0 means the column is identical).
for column in data_with_vote.columns:
    mismatch_count = (data_with_vote[column] != data_with_vote_loaded[column]).sum()
    print(f'{column}: {mismatch_count}')


person_age: 0
person_income: 0
person_home_ownership: 0
person_emp_length: 0
loan_intent: 0
loan_grade: 0
loan_amnt: 0
loan_int_rate: 0
loan_percent_income: 0
cb_person_default_on_file: 0
cb_person_cred_hist_length: 0
loan_status: 0
person_age_vote: 0
person_income_vote: 0
person_emp_length_vote: 0
loan_amnt_vote: 0
loan_int_rate_vote: 0
loan_percent_income_vote: 0
cb_person_cred_hist_length_vote: 0
loan_status_vote: 0
loan_status_predict: 0


# Findings

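- Model accuracy (0.600 with all seven predictors) is worse than just assuming all loans are "approve" (about 0.783: 22430 of the 28632 cleaned rows have `loan_status` 0)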

## TODO:

- Add categorical variables. Some have obvious ordering, some do not
- For these variables, may need to assign +X or -Y values to each category
- When this has been done, try some manual optimization procedure to tune the boost/penalty values for each category
- This is some form of hyperparameter tuning
- When this is done, do some hyperparameter tuning on numerical values
- This might be adjusting the boost/penalty magnitude
- and/or: add more categories (eg 3/4 levels not binary)

What is the accuracy of the best numerical predictor?

- `loan_percent_income`

In [126]:
# Baseline: logistic regression on the single best numerical predictor.
#
# The estimator is imported explicitly. `import sklearn` on its own does not guarantee
# that the `sklearn.linear_model` submodule is available as an attribute, so
# `sklearn.linear_model.LogisticRegression()` can raise AttributeError on a fresh kernel.
from sklearn.linear_model import LogisticRegression

data_copy_logistic = data.copy()

column = 'loan_percent_income'

# scikit-learn expects a 2-D feature matrix, hence the reshape to a single column.
x = data_copy_logistic[column].to_numpy().reshape(-1, 1)
y = data_copy_logistic['loan_status'].to_numpy()

model = LogisticRegression()
clf = model.fit(x, y)

# NOTE: the score is computed on the same rows the model was fitted on (no train/test split).
print(f'clf score: {clf.score(x, y)}')

data_copy_logistic['loan_status_logistic'] = clf.predict(x)

# Manual accuracy, computed the same way as for the vote models.
accuracy = (data_copy_logistic['loan_status_logistic'] == data_copy_logistic['loan_status']).sum() / len(data_copy_logistic)

accuracy

clf score: 0.8147177982676725


np.float64(0.8147177982676725)

# Binary model built from successive categorical variables

See `binary_vote_model_categorical`