Compare several methods of predicting loan rejection (`loan_status == 1`).

1. Use a simple probability. The probability of rejection is < 50 %, therefore always predict 0. This has an accuracy of 0.782.
2. Use a categorical predictor such as the loan intent. This does not have much predictive power, yet its accuracy is also 0.782. Why? All categorical predictors except loan grade predict the same class (approved) for every category, so these models behave exactly like the always-predict-0 baseline and share its accuracy of 0.782.
3. Use a stronger categorical predictor. The loan grade has more predictive power. What accuracy does it have? 0.815.

In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell executes and edits to them are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import pandas
import numpy
import matplotlib.pyplot as plt
import itertools
import sklearn
from sklearn.metrics import log_loss

In [3]:
import loan_approval_lib
from loan_approval_lib import logspace, linspace

In [4]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.naive_bayes import CategoricalNB

In [5]:
# Load the loan data set through the project helper.
# NOTE(review): the data source and schema are defined inside
# `loan_approval_lib` and are not visible from this notebook — confirm there.
data = loan_approval_lib.load_original_data()

In [6]:
# Display the loaded frame to check its columns and typical values.
data

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,0.59,Y,3,1
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0.10,N,2,0
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,0.57,N,3,1
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,0.53,N,2,1
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,0.55,Y,4,1
...,...,...,...,...,...,...,...,...,...,...,...,...
32576,57,53000,MORTGAGE,1.0,PERSONAL,C,5800,13.16,0.11,N,30,0
32577,54,120000,MORTGAGE,4.0,PERSONAL,A,17625,7.49,0.15,N,19,0
32578,65,76000,RENT,3.0,HOMEIMPROVEMENT,B,35000,10.99,0.46,N,28,1
32579,56,150000,MORTGAGE,5.0,PERSONAL,B,15000,11.48,0.10,N,26,0


In [7]:
# Keep only the string-valued feature columns. The explicit `.copy()` makes
# this an independent frame, so columns added later do not touch `data`.
data_categorical = data.loc[
    :,
    ['person_home_ownership', 'loan_intent', 'loan_grade', 'cb_person_default_on_file'],
].copy()

In [8]:
# Inspect the categorical feature subset.
data_categorical

Unnamed: 0,person_home_ownership,loan_intent,loan_grade,cb_person_default_on_file
0,RENT,PERSONAL,D,Y
1,OWN,EDUCATION,B,N
2,MORTGAGE,MEDICAL,C,N
3,RENT,MEDICAL,C,N
4,RENT,MEDICAL,C,Y
...,...,...,...,...
32576,MORTGAGE,PERSONAL,C,N
32577,MORTGAGE,PERSONAL,A,N
32578,RENT,HOMEIMPROVEMENT,B,N
32579,MORTGAGE,PERSONAL,B,N


In [9]:
# Attach the target column. Assignment aligns on the row index, which the two
# frames share because `data_categorical` was selected from `data`.
data_categorical['loan_status'] = data['loan_status']

In [10]:
# Confirm that `loan_status` now sits alongside the categorical features.
data_categorical

Unnamed: 0,person_home_ownership,loan_intent,loan_grade,cb_person_default_on_file,loan_status
0,RENT,PERSONAL,D,Y,1
1,OWN,EDUCATION,B,N,0
2,MORTGAGE,MEDICAL,C,N,1
3,RENT,MEDICAL,C,N,1
4,RENT,MEDICAL,C,Y,1
...,...,...,...,...,...
32576,MORTGAGE,PERSONAL,C,N,0
32577,MORTGAGE,PERSONAL,A,N,0
32578,RENT,HOMEIMPROVEMENT,B,N,1
32579,MORTGAGE,PERSONAL,B,N,0


# Create model based on `person_home_ownership`

In [11]:
# Pair the home-ownership category with the target, as an independent copy,
# ready for a per-category summary.
data_categorical_tmp_person_home_ownership = data_categorical.loc[
    :, ['person_home_ownership', 'loan_status']
].copy()
data_categorical_tmp_person_home_ownership

Unnamed: 0,person_home_ownership,loan_status
0,RENT,1
1,OWN,0
2,MORTGAGE,1
3,RENT,1
4,RENT,1
...,...,...
32576,MORTGAGE,0
32577,MORTGAGE,0
32578,RENT,1
32579,MORTGAGE,0


In [13]:
# Per home-ownership category, compute the number of rejections (sum of the
# 0/1 `loan_status`) and the number of applications (row count).
#
# The aggregation reads from `data_categorical` rather than from the variable
# being assigned. The self-referential form could not be re-run once that
# variable held the aggregated table, because `loan_status` is no longer a
# plain column by then.
#
# The result deliberately keeps two-level columns (`loan_status` -> stat name);
# the following cell flattens them.
data_categorical_tmp_person_home_ownership = data_categorical.groupby(
    by=['person_home_ownership']
).aggregate({
    'loan_status': [('loan_status_sum', 'sum'), ('loan_status_count', 'count')],
})
data_categorical_tmp_person_home_ownership

Unnamed: 0_level_0,loan_status,loan_status
Unnamed: 0_level_1,loan_status_sum,loan_status_count
person_home_ownership,Unnamed: 1_level_2,Unnamed: 2_level_2
MORTGAGE,1690,13444
OTHER,33,107
OWN,193,2584
RENT,5192,16446


In [14]:
# Flatten the (`loan_status`, <stat>) column MultiIndex down to the stat names
# and turn the category index back into a regular column.
#
# `droplevel` raises if the columns are already flat, so an accidental re-run
# of this cell fails loudly. Indexing each label with `[1]` would instead
# silently rename plain string labels to their second character.
data_categorical_tmp_person_home_ownership = (
    data_categorical_tmp_person_home_ownership
    .droplevel(0, axis=1)
    .reset_index()
)
data_categorical_tmp_person_home_ownership

Unnamed: 0,person_home_ownership,loan_status_sum,loan_status_count
0,MORTGAGE,1690,13444
1,OTHER,33,107
2,OWN,193,2584
3,RENT,5192,16446


In [15]:
# Empirical rejection rate per category: rejections / applications.
data_categorical_tmp_person_home_ownership['loan_status_proba'] = (
    data_categorical_tmp_person_home_ownership['loan_status_sum']
    .div(data_categorical_tmp_person_home_ownership['loan_status_count'])
)
data_categorical_tmp_person_home_ownership

Unnamed: 0,person_home_ownership,loan_status_sum,loan_status_count,loan_status_proba
0,MORTGAGE,1690,13444,0.125707
1,OTHER,33,107,0.308411
2,OWN,193,2584,0.07469
3,RENT,5192,16446,0.3157


In [40]:
# Turn the rate into a hard prediction by rounding to the nearest integer:
# rates above 0.5 become 1.0, rates below 0.5 become 0.0.
data_categorical_tmp_person_home_ownership['loan_status_model'] = (
    data_categorical_tmp_person_home_ownership['loan_status_proba'].round()
)
data_categorical_tmp_person_home_ownership

Unnamed: 0,person_home_ownership,loan_status_sum,loan_status_count,loan_status_proba,loan_status_proba_model,loan_status_model
0,MORTGAGE,1690,13444,0.125707,0.0,0.0
1,OTHER,33,107,0.308411,0.0,0.0
2,OWN,193,2584,0.07469,0.0,0.0
3,RENT,5192,16446,0.3157,0.0,0.0


In [17]:
# Keep the finished lookup table under a `model_*` name. This binds a second
# name to the same DataFrame object; no copy is made.
model_person_home_ownership = data_categorical_tmp_person_home_ownership

# Create model based on `loan_intent`

In [18]:
# Pair the loan-intent category with the target, as an independent copy.
data_categorical_tmp_loan_intent = data_categorical.loc[
    :, ['loan_intent', 'loan_status']
].copy()
data_categorical_tmp_loan_intent

Unnamed: 0,loan_intent,loan_status
0,PERSONAL,1
1,EDUCATION,0
2,MEDICAL,1
3,MEDICAL,1
4,MEDICAL,1
...,...,...
32576,PERSONAL,0
32577,PERSONAL,0
32578,HOMEIMPROVEMENT,1
32579,PERSONAL,0


In [19]:
# Per loan intent, compute the number of rejections (sum of the 0/1
# `loan_status`) and the number of applications (row count).
#
# Reading from `data_categorical` instead of the variable being assigned keeps
# the cell idempotent. The self-referential version raised on a second run
# because `loan_status` had already been replaced by the aggregated columns.
# Named aggregation yields flat column names directly, so no separate
# MultiIndex-flattening step is needed.
data_categorical_tmp_loan_intent = (
    data_categorical
    .groupby('loan_intent')['loan_status']
    .agg(loan_status_sum='sum', loan_status_count='count')
    .reset_index()
)
data_categorical_tmp_loan_intent

Unnamed: 0,loan_intent,loan_status_sum,loan_status_count
0,DEBTCONSOLIDATION,1490,5212
1,EDUCATION,1111,6453
2,HOMEIMPROVEMENT,941,3605
3,MEDICAL,1621,6071
4,PERSONAL,1098,5521
5,VENTURE,847,5719


In [39]:
# Per-intent rejection rate (rejections / applications), followed by the hard
# prediction obtained by rounding that rate to the nearest integer.
data_categorical_tmp_loan_intent['loan_status_proba'] = (
    data_categorical_tmp_loan_intent['loan_status_sum']
    .div(data_categorical_tmp_loan_intent['loan_status_count'])
)
data_categorical_tmp_loan_intent['loan_status_model'] = (
    data_categorical_tmp_loan_intent['loan_status_proba'].round()
)
data_categorical_tmp_loan_intent

Unnamed: 0,loan_intent,loan_status_sum,loan_status_count,loan_status_proba,loan_status_proba_model,loan_statusmodel,loan_status_model
0,DEBTCONSOLIDATION,1490,5212,0.285879,0.0,0.0,0.0
1,EDUCATION,1111,6453,0.172168,0.0,0.0,0.0
2,HOMEIMPROVEMENT,941,3605,0.261026,0.0,0.0,0.0
3,MEDICAL,1621,6071,0.267007,0.0,0.0,0.0
4,PERSONAL,1098,5521,0.198877,0.0,0.0,0.0
5,VENTURE,847,5719,0.148103,0.0,0.0,0.0


In [21]:
# Keep the finished lookup table under a `model_*` name. This binds a second
# name to the same DataFrame object; no copy is made.
model_loan_intent = data_categorical_tmp_loan_intent

# Create model based on `loan_grade`

In [22]:
# Pair the loan-grade category with the target, as an independent copy.
data_categorical_tmp_loan_grade = data_categorical.loc[
    :, ['loan_grade', 'loan_status']
].copy()
data_categorical_tmp_loan_grade

Unnamed: 0,loan_grade,loan_status
0,D,1
1,B,0
2,C,1
3,C,1
4,C,1
...,...,...
32576,C,0
32577,A,0
32578,B,1
32579,B,0


In [23]:
# Per loan grade, compute the number of rejections (sum of the 0/1
# `loan_status`) and the number of applications (row count).
#
# Reading from `data_categorical` instead of the variable being assigned keeps
# the cell idempotent. The self-referential version raised on a second run
# because `loan_status` had already been replaced by the aggregated columns.
# Named aggregation yields flat column names directly, so no separate
# MultiIndex-flattening step is needed.
data_categorical_tmp_loan_grade = (
    data_categorical
    .groupby('loan_grade')['loan_status']
    .agg(loan_status_sum='sum', loan_status_count='count')
    .reset_index()
)
data_categorical_tmp_loan_grade

Unnamed: 0,loan_grade,loan_status_sum,loan_status_count
0,A,1073,10777
1,B,1701,10451
2,C,1339,6458
3,D,2141,3626
4,E,621,964
5,F,170,241
6,G,63,64


In [37]:
# Per-grade rejection rate (rejections / applications), followed by the hard
# prediction obtained by rounding that rate to the nearest integer.
data_categorical_tmp_loan_grade['loan_status_proba'] = (
    data_categorical_tmp_loan_grade['loan_status_sum']
    .div(data_categorical_tmp_loan_grade['loan_status_count'])
)
data_categorical_tmp_loan_grade['loan_status_model'] = (
    data_categorical_tmp_loan_grade['loan_status_proba'].round()
)
data_categorical_tmp_loan_grade

Unnamed: 0,loan_grade,loan_status_sum,loan_status_count,loan_status_proba,loan_status_proba_model,loan_status_model
0,A,1073,10777,0.099564,0.0,0.0
1,B,1701,10451,0.16276,0.0,0.0
2,C,1339,6458,0.20734,0.0,0.0
3,D,2141,3626,0.590458,1.0,1.0
4,E,621,964,0.644191,1.0,1.0
5,F,170,241,0.705394,1.0,1.0
6,G,63,64,0.984375,1.0,1.0


In [25]:
# Keep the finished lookup table under a `model_*` name. This binds a second
# name to the same DataFrame object; no copy is made.
model_loan_grade = data_categorical_tmp_loan_grade

# Create model based on `cb_person_default_on_file`

In [26]:
# Pair the default-on-file flag with the target, as an independent copy.
data_categorical_tmp_cb_person_default_on_file = data_categorical.loc[
    :, ['cb_person_default_on_file', 'loan_status']
].copy()
data_categorical_tmp_cb_person_default_on_file

Unnamed: 0,cb_person_default_on_file,loan_status
0,Y,1
1,N,0
2,N,1
3,N,1
4,Y,1
...,...,...
32576,N,0
32577,N,0
32578,N,1
32579,N,0


In [27]:
# Per default-on-file flag, compute the number of rejections (sum of the 0/1
# `loan_status`) and the number of applications (row count).
#
# Reading from `data_categorical` instead of the variable being assigned keeps
# the cell idempotent. The self-referential version raised on a second run
# because `loan_status` had already been replaced by the aggregated columns.
# Named aggregation yields flat column names directly, so no separate
# MultiIndex-flattening step is needed.
data_categorical_tmp_cb_person_default_on_file = (
    data_categorical
    .groupby('cb_person_default_on_file')['loan_status']
    .agg(loan_status_sum='sum', loan_status_count='count')
    .reset_index()
)
data_categorical_tmp_cb_person_default_on_file

Unnamed: 0,cb_person_default_on_file,loan_status_sum,loan_status_count
0,N,4936,26836
1,Y,2172,5745


In [36]:
# Rejection rate per default-on-file flag (rejections / applications), followed
# by the hard prediction obtained by rounding that rate to the nearest integer.
data_categorical_tmp_cb_person_default_on_file['loan_status_proba'] = (
    data_categorical_tmp_cb_person_default_on_file['loan_status_sum']
    .div(data_categorical_tmp_cb_person_default_on_file['loan_status_count'])
)
data_categorical_tmp_cb_person_default_on_file['loan_status_model'] = (
    data_categorical_tmp_cb_person_default_on_file['loan_status_proba'].round()
)
data_categorical_tmp_cb_person_default_on_file

Unnamed: 0,cb_person_default_on_file,loan_status_sum,loan_status_count,loan_status_proba,loan_status_proba_model,loan_status_model
0,N,4936,26836,0.183932,0.0,0.0
1,Y,2172,5745,0.378068,0.0,0.0


In [30]:
# Keep the finished lookup table under a `model_*` name. This binds a second
# name to the same DataFrame object; no copy is made.
model_cb_person_default_on_file = data_categorical_tmp_cb_person_default_on_file

# Apply Models

In [32]:
# Build an independent frame holding the categorical features and the target;
# the per-model prediction columns are added to it afterwards.
#
# The columns are selected first and the selection is then copied. Copying
# `data` and slicing the copy leaves a slice behind, and pandas may flag later
# column assignments on it with SettingWithCopyWarning. Select-then-copy is also
# how `data_categorical` is built.
data_with_model = data[
    ['person_home_ownership', 'loan_intent', 'loan_grade', 'cb_person_default_on_file', 'loan_status']
].copy()
data_with_model

Unnamed: 0,person_home_ownership,loan_intent,loan_grade,cb_person_default_on_file,loan_status
0,RENT,PERSONAL,D,Y,1
1,OWN,EDUCATION,B,N,0
2,MORTGAGE,MEDICAL,C,N,1
3,RENT,MEDICAL,C,N,1
4,RENT,MEDICAL,C,Y,1
...,...,...,...,...,...
32576,MORTGAGE,PERSONAL,C,N,0
32577,MORTGAGE,PERSONAL,A,N,0
32578,RENT,HOMEIMPROVEMENT,B,N,1
32579,MORTGAGE,PERSONAL,B,N,0


In [42]:
# Attach each single-variable model's hard prediction to every row by looking
# up the row's category in that model's table (category -> `loan_status_model`).
data_with_model['loan_status_person_home_ownership'] = data_with_model['person_home_ownership'].map(
    dict(zip(
        model_person_home_ownership['person_home_ownership'],
        model_person_home_ownership['loan_status_model'],
    ))
)

# The `loan_intent` and `cb_person_default_on_file` models predict 0 for every
# category, so their prediction columns would duplicate the home-ownership one
# and are not added here.
data_with_model['loan_status_loan_grade'] = data_with_model['loan_grade'].map(
    dict(zip(
        model_loan_grade['loan_grade'],
        model_loan_grade['loan_status_model'],
    ))
)

data_with_model

Unnamed: 0,person_home_ownership,loan_intent,loan_grade,cb_person_default_on_file,loan_status,loan_status_person_home_ownership,loan_status_loan_intent,loan_status_loan_grade,loan_status_cb_person_default_on_file
0,RENT,PERSONAL,D,Y,1,0.0,0.0,1.0,0.0
1,OWN,EDUCATION,B,N,0,0.0,0.0,0.0,0.0
2,MORTGAGE,MEDICAL,C,N,1,0.0,0.0,0.0,0.0
3,RENT,MEDICAL,C,N,1,0.0,0.0,0.0,0.0
4,RENT,MEDICAL,C,Y,1,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
32576,MORTGAGE,PERSONAL,C,N,0,0.0,0.0,0.0,0.0
32577,MORTGAGE,PERSONAL,A,N,0,0.0,0.0,0.0,0.0
32578,RENT,HOMEIMPROVEMENT,B,N,1,0.0,0.0,0.0,0.0
32579,MORTGAGE,PERSONAL,B,N,0,0.0,0.0,0.0,0.0


In [43]:
# Share of rows where the home-ownership model matches the true label. That
# model predicts 0 for every category, so this is also the accuracy of always
# predicting 0.
accuracy_all_zero = data_with_model['loan_status'].eq(
    data_with_model['loan_status_person_home_ownership']
).mean()
accuracy_all_zero

np.float64(0.7818360394094718)

In [44]:
# Share of rows where the loan-grade model matches the true label.
accuracy_loan_grade = data_with_model['loan_status'].eq(
    data_with_model['loan_status_loan_grade']
).mean()
accuracy_loan_grade

np.float64(0.815444584266904)

In [46]:
# Accuracy of the opposite constant baseline (always predict 1), i.e. the share
# of rows whose true label is 1.
accuracy_all_1 = data_with_model['loan_status'].eq(1).mean()
accuracy_all_1

np.float64(0.21816396059052823)

# Summary

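- Among the single-variable predictors, loan grade is the best (and the only useful one): its accuracy is 0.815, whereas every other predictor matches the always-predict-0 baseline at 0.782.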

## TODO:

- Compare with a 2 variable predictor, which is best? Are any effective?
- Compare with Naive Bayes predictor
- Compare with 2 variable Naive Bayes predictor
- If we filter by loan grade first, can we build a 2 level tree which uses another categorical variable to make further improved predictions?