This notebook was developed to investigate the results of the Categorical Bayes models.

It compares several methods of predicting loan rejection.

1. Use a simple probability. The probability of rejection is < 50 %, therefore always predict 0. This has an accuracy of X.
2. Use a categorical predictor such as the loan intent. This does not have much predictive power; however, the accuracy is also X. Why?
3. Use a stronger categorical predictor. The loan grade has more predictive power. What accuracy does it have?

There are no useful results in this notebook. However, some of the code may be useful.

For example, this Notebook contains:

- example code to calculate probability values using a groupby operation (probability depending on a single variable)
- example code to calculate probability values using a combination of a groupby operation and a map operation on an existing dataframe (probability depending on two variables)

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas
import numpy
import matplotlib.pyplot as plt
import itertools
import sklearn
from sklearn.metrics import log_loss

In [3]:
import loan_approval_lib
from loan_approval_lib import logspace, linspace

In [4]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.naive_bayes import CategoricalNB

In [5]:
# Load the loan dataset through the project helper; the displayed output
# below shows one row per loan with a binary `loan_status` column.
data = loan_approval_lib.load_original_data()

In [6]:
# Display the loaded frame (pandas truncates the middle rows in the rich repr).
data

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,0.59,Y,3,1
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0.10,N,2,0
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,0.57,N,3,1
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,0.53,N,2,1
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,0.55,Y,4,1
...,...,...,...,...,...,...,...,...,...,...,...,...
32576,57,53000,MORTGAGE,1.0,PERSONAL,C,5800,13.16,0.11,N,30,0
32577,54,120000,MORTGAGE,4.0,PERSONAL,A,17625,7.49,0.15,N,19,0
32578,65,76000,RENT,3.0,HOMEIMPROVEMENT,B,35000,10.99,0.46,N,28,1
32579,56,150000,MORTGAGE,5.0,PERSONAL,B,15000,11.48,0.10,N,26,0


In [10]:
# Keep only the categorical feature columns. The explicit copy means later
# column assignments act on an independent frame rather than on `data`.
data_categorical = data.loc[
    :,
    [
        'person_home_ownership',
        'loan_intent',
        'loan_grade',
        'cb_person_default_on_file',
    ],
].copy()

In [11]:
# Display the categorical-only subset.
data_categorical

Unnamed: 0,person_home_ownership,loan_intent,loan_grade,cb_person_default_on_file
0,RENT,PERSONAL,D,Y
1,OWN,EDUCATION,B,N
2,MORTGAGE,MEDICAL,C,N
3,RENT,MEDICAL,C,N
4,RENT,MEDICAL,C,Y
...,...,...,...,...
32576,MORTGAGE,PERSONAL,C,N
32577,MORTGAGE,PERSONAL,A,N
32578,RENT,HOMEIMPROVEMENT,B,N
32579,MORTGAGE,PERSONAL,B,N


In [12]:
# Attach the target column (aligned on the shared row index) next to the
# categorical features.
data_categorical = data_categorical.assign(loan_status=data['loan_status'])

In [13]:
# Display the categorical subset again, now including `loan_status`.
data_categorical

Unnamed: 0,person_home_ownership,loan_intent,loan_grade,cb_person_default_on_file,loan_status
0,RENT,PERSONAL,D,Y,1
1,OWN,EDUCATION,B,N,0
2,MORTGAGE,MEDICAL,C,N,1
3,RENT,MEDICAL,C,N,1
4,RENT,MEDICAL,C,Y,1
...,...,...,...,...,...
32576,MORTGAGE,PERSONAL,C,N,0
32577,MORTGAGE,PERSONAL,A,N,0
32578,RENT,HOMEIMPROVEMENT,B,N,1
32579,MORTGAGE,PERSONAL,B,N,0


In [73]:
# Working frame for the single-predictor example: one categorical feature,
# the target, and a constant `dummy` column that the groupby cells below
# use as the thing to count.
data_categorical_tmp = (
    data_categorical
    .loc[:, ['person_home_ownership', 'loan_status']]
    .assign(dummy=1)
)
data_categorical_tmp

Unnamed: 0,person_home_ownership,loan_status,dummy
0,RENT,1,1
1,OWN,0,1
2,MORTGAGE,1,1
3,RENT,1,1
4,RENT,1,1
...,...,...,...
32576,MORTGAGE,0,1
32577,MORTGAGE,0,1
32578,RENT,1,1
32579,MORTGAGE,0,1


In [77]:
# Number of rows per home-ownership category.
# Named aggregation yields a flat, already-named column, so there is no
# MultiIndex column header to flatten afterwards. The result is indexed by
# `person_home_ownership` with a single `person_home_ownership_count` column.
data_categorical_tmp_2 = (
    data_categorical_tmp
    .groupby(by=['person_home_ownership'])
    .agg(person_home_ownership_count=('dummy', 'count'))
)
data_categorical_tmp_2

Unnamed: 0_level_0,person_home_ownership_count
person_home_ownership,Unnamed: 1_level_1
MORTGAGE,13444
OTHER,107
OWN,2584
RENT,16446


In [80]:
# Marginal probability of each home-ownership category:
#   P(ownership) = count(ownership) / total row count.
# The category is then moved from the index into a regular column.
# NOTE: this cell rebinds `data_categorical_tmp_2`, so it is not idempotent;
# running it a second time would add a spurious `index` column via
# reset_index().
data_categorical_tmp_2 = (
    data_categorical_tmp_2
    .assign(
        person_home_ownership_proba=lambda frame: (
            frame['person_home_ownership_count']
            / frame['person_home_ownership_count'].sum()
        )
    )
    .reset_index()
)
data_categorical_tmp_2

Unnamed: 0,person_home_ownership,person_home_ownership_count,person_home_ownership_proba
0,MORTGAGE,13444,0.412633
1,OTHER,107,0.003284
2,OWN,2584,0.07931
3,RENT,16446,0.504773


In [79]:
# Re-display the per-row working frame before the mapped probability column is added.
data_categorical_tmp

Unnamed: 0,person_home_ownership,loan_status,dummy
0,RENT,1,1
1,OWN,0,1
2,MORTGAGE,1,1
3,RENT,1,1
4,RENT,1,1
...,...,...,...
32576,MORTGAGE,0,1
32577,MORTGAGE,0,1
32578,RENT,1,1
32579,MORTGAGE,0,1


In [81]:
# Broadcast the per-category marginal probability back onto every row by
# looking up each row's category in the summary table (category -> P(ownership)).
data_categorical_tmp['person_home_ownership_proba'] = (
    data_categorical_tmp['person_home_ownership'].map(
        data_categorical_tmp_2.set_index('person_home_ownership')['person_home_ownership_proba']
    )
)
data_categorical_tmp

Unnamed: 0,person_home_ownership,loan_status,dummy,person_home_ownership_proba
0,RENT,1,1,0.504773
1,OWN,0,1,0.079310
2,MORTGAGE,1,1,0.412633
3,RENT,1,1,0.504773
4,RENT,1,1,0.504773
...,...,...,...,...
32576,MORTGAGE,0,1,0.412633
32577,MORTGAGE,0,1,0.412633
32578,RENT,1,1,0.504773
32579,MORTGAGE,0,1,0.412633


In [85]:
# Row counts for every (home ownership, loan status) combination.
# Named aggregation produces a flat `count` column directly, so the
# MultiIndex column header never has to be flattened by hand. reset_index()
# turns the two grouping keys back into ordinary columns.
model_person_home_ownership = (
    data_categorical_tmp
    .groupby(by=['person_home_ownership', 'loan_status'])
    .agg(count=('dummy', 'count'))
    .reset_index()
)
model_person_home_ownership

Unnamed: 0,person_home_ownership,loan_status,count
0,MORTGAGE,0,11754
1,MORTGAGE,1,1690
2,OTHER,0,74
3,OTHER,1,33
4,OWN,0,2391
5,OWN,1,193
6,RENT,0,11254
7,RENT,1,5192


In [87]:
# Conditional probability of the loan status within each ownership category:
#   P(loan_status | ownership) = count(ownership, status) / count(ownership).
# The built-in 'sum' transform broadcasts each group's total back to its
# rows, avoiding a per-group Python lambda while giving the same values.
model_person_home_ownership['proba'] = (
    model_person_home_ownership['count']
    / model_person_home_ownership.groupby(
        by=['person_home_ownership']
    )['count'].transform('sum')
)
model_person_home_ownership

Unnamed: 0,person_home_ownership,loan_status,count,proba
0,MORTGAGE,0,11754,0.874293
1,MORTGAGE,1,1690,0.125707
2,OTHER,0,74,0.691589
3,OTHER,1,33,0.308411
4,OWN,0,2391,0.92531
5,OWN,1,193,0.07469
6,RENT,0,11254,0.6843
7,RENT,1,5192,0.3157


In [88]:
# Attach the marginal P(ownership) to each (ownership, status) row by looking
# the category up in the per-category summary table.
model_person_home_ownership['person_home_ownership_proba'] = (
    model_person_home_ownership['person_home_ownership'].map(
        data_categorical_tmp_2.set_index('person_home_ownership')['person_home_ownership_proba']
    )
)
model_person_home_ownership

Unnamed: 0,person_home_ownership,loan_status,count,proba,person_home_ownership_proba
0,MORTGAGE,0,11754,0.874293,0.412633
1,MORTGAGE,1,1690,0.125707,0.412633
2,OTHER,0,74,0.691589,0.003284
3,OTHER,1,33,0.308411,0.003284
4,OWN,0,2391,0.92531,0.07931
5,OWN,1,193,0.07469,0.07931
6,RENT,0,11254,0.6843,0.504773
7,RENT,1,5192,0.3157,0.504773


In [90]:
# Joint probability of each (ownership, status) pair:
#   P(ownership, status) = P(status | ownership) * P(ownership).
model_person_home_ownership['person_home_ownership_loan_status_proba'] = (
    model_person_home_ownership['proba']
    .mul(model_person_home_ownership['person_home_ownership_proba'])
)
model_person_home_ownership

Unnamed: 0,person_home_ownership,loan_status,count,proba,person_home_ownership_proba,person_home_ownership_loan_status_proba
0,MORTGAGE,0,11754,0.874293,0.412633,0.360762
1,MORTGAGE,1,1690,0.125707,0.412633,0.051871
2,OTHER,0,74,0.691589,0.003284,0.002271
3,OTHER,1,33,0.308411,0.003284,0.001013
4,OWN,0,2391,0.92531,0.07931,0.073386
5,OWN,1,193,0.07469,0.07931,0.005924
6,RENT,0,11254,0.6843,0.504773,0.345416
7,RENT,1,5192,0.3157,0.504773,0.159357


In [91]:
# Sanity check: the joint probabilities over all (ownership, status) pairs
# should sum to 1 (up to floating-point rounding).
model_person_home_ownership['person_home_ownership_loan_status_proba'].sum()

np.float64(0.9999999999999999)

In [92]:
# I made a mistake: the joint probabilities computed in the DataFrame above are not what the model needs.
# The probabilities I actually wanted, P(loan_status | person_home_ownership), are already in the 'proba' column.
# The problem is simpler than I thought.