# Categorical Naive Bayes Classifier

In [1]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# reloaded before each cell executes, so edits to local modules are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import pandas
import numpy
import matplotlib.pyplot as plt
import itertools
import sklearn
from sklearn.metrics import log_loss

In [3]:
import loan_approval_lib
from loan_approval_lib import logspace, linspace

In [4]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.naive_bayes import CategoricalNB

In [5]:
# Load the original loan dataset through the project helper. The frame holds
# the applicant/loan features plus the `loan_status` label used as the target.
data = loan_approval_lib.load_original_data()

In [6]:
# Preview the loaded training frame.
data

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,0.59,Y,3,1
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0.10,N,2,0
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,0.57,N,3,1
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,0.53,N,2,1
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,0.55,Y,4,1
...,...,...,...,...,...,...,...,...,...,...,...,...
32576,57,53000,MORTGAGE,1.0,PERSONAL,C,5800,13.16,0.11,N,30,0
32577,54,120000,MORTGAGE,4.0,PERSONAL,A,17625,7.49,0.15,N,19,0
32578,65,76000,RENT,3.0,HOMEIMPROVEMENT,B,35000,10.99,0.46,N,28,1
32579,56,150000,MORTGAGE,5.0,PERSONAL,B,15000,11.48,0.10,N,26,0


In [7]:
# Load the 'test' split from the project helper. Unlike `data`, it carries an
# `id` column and has no `loan_status` label.
data_test = loan_approval_lib.load_data()['test']

In [8]:
# Preview the test frame.
data_test

Unnamed: 0,id,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,58645,23,69000,RENT,3.0,HOMEIMPROVEMENT,F,25000,15.76,0.36,N,2
1,58646,26,96000,MORTGAGE,6.0,PERSONAL,C,10000,12.68,0.10,Y,4
2,58647,26,30000,RENT,5.0,VENTURE,E,4000,17.19,0.13,Y,2
3,58648,33,50000,RENT,4.0,DEBTCONSOLIDATION,A,7000,8.90,0.14,N,7
4,58649,26,102000,MORTGAGE,8.0,HOMEIMPROVEMENT,D,15000,16.32,0.15,Y,4
...,...,...,...,...,...,...,...,...,...,...,...,...
39093,97738,22,31200,MORTGAGE,2.0,DEBTCONSOLIDATION,B,3000,10.37,0.10,N,4
39094,97739,22,48000,MORTGAGE,6.0,EDUCATION,A,7000,6.03,0.15,N,3
39095,97740,51,60000,MORTGAGE,0.0,PERSONAL,A,15000,7.51,0.25,N,25
39096,97741,22,36000,MORTGAGE,4.0,PERSONAL,D,14000,15.62,0.39,Y,4


## Categorical Naive Bayes classifier: encode, fit, and predict

In [9]:
# Keep only the four string-valued categorical features; these are the sole
# inputs to the Naive Bayes model in this notebook.
data_categorical = data.loc[
    :,
    [
        'person_home_ownership',
        'loan_intent',
        'loan_grade',
        'cb_person_default_on_file',
    ],
]

In [10]:
# Preview the selected categorical columns.
data_categorical

Unnamed: 0,person_home_ownership,loan_intent,loan_grade,cb_person_default_on_file
0,RENT,PERSONAL,D,Y
1,OWN,EDUCATION,B,N
2,MORTGAGE,MEDICAL,C,N
3,RENT,MEDICAL,C,N
4,RENT,MEDICAL,C,Y
...,...,...,...,...
32576,MORTGAGE,PERSONAL,C,N
32577,MORTGAGE,PERSONAL,A,N
32578,RENT,HOMEIMPROVEMENT,B,N
32579,MORTGAGE,PERSONAL,B,N


In [11]:
# Count missing values per categorical column. Every count is zero, so there
# is nothing to impute or drop before encoding.
data_categorical.isna().sum()

person_home_ownership        0
loan_intent                  0
loan_grade                   0
cb_person_default_on_file    0
dtype: int64

In [12]:
# View the categorical columns as a NumPy object array of strings, which is
# the form handed to the encoder.
data_categorical.to_numpy()

array([['RENT', 'PERSONAL', 'D', 'Y'],
       ['OWN', 'EDUCATION', 'B', 'N'],
       ['MORTGAGE', 'MEDICAL', 'C', 'N'],
       ...,
       ['RENT', 'HOMEIMPROVEMENT', 'B', 'N'],
       ['MORTGAGE', 'PERSONAL', 'B', 'N'],
       ['RENT', 'MEDICAL', 'B', 'N']], shape=(32581, 4), dtype=object)

In [13]:
# Fit an ordinal encoder on the training categoricals: each column's string
# categories are mapped to integer codes. The same fitted encoder is reused
# for the test set later so both sets share one mapping.
encoder = OrdinalEncoder()
encoder.fit(data_categorical.to_numpy())

In [14]:
# Inspect the categories learned for each column; a value's position in its
# array is the code the encoder assigns to it.
encoder.categories_

[array(['MORTGAGE', 'OTHER', 'OWN', 'RENT'], dtype=object),
 array(['DEBTCONSOLIDATION', 'EDUCATION', 'HOMEIMPROVEMENT', 'MEDICAL',
        'PERSONAL', 'VENTURE'], dtype=object),
 array(['A', 'B', 'C', 'D', 'E', 'F', 'G'], dtype=object),
 array(['N', 'Y'], dtype=object)]

In [15]:
# Encode the training categoricals into a float array of category codes
# (one row per loan, one column per categorical feature).
data_categorical_encoded = encoder.transform(data_categorical.to_numpy())

In [16]:
# Preview the encoded training matrix.
data_categorical_encoded

array([[3., 4., 3., 1.],
       [2., 1., 1., 0.],
       [0., 3., 2., 0.],
       ...,
       [3., 2., 1., 0.],
       [0., 4., 1., 0.],
       [3., 3., 1., 0.]], shape=(32581, 4))

In [17]:
# Feature matrix: the ordinal-encoded categorical columns.
X = data_categorical_encoded
# Target: the `loan_status` label from the original frame.
y = data['loan_status']

# Fit a categorical Naive Bayes model with scikit-learn's default settings.
clf = CategoricalNB()
clf.fit(X, y)

In [18]:
# First encoded row, kept two-dimensional (shape (1, n_features)) because the
# classifier expects a 2-D batch even for a single sample.
data_categorical_encoded[[0], :]

array([[3., 4., 3., 1.]])

In [19]:
# Sanity check: predict the label for the first training row only, passed as
# a one-row 2-D batch.
clf.predict(data_categorical_encoded[[0], :])

array([1])

In [20]:
# True label of the first training row, for comparison with the prediction.
data.loc[0, 'loan_status']

np.int64(1)

In [21]:
# Store in-sample predictions (made on the same rows the model was fit on)
# next to the true labels. NOTE: this adds a column to `data` in place.
data['loan_status_predict'] = clf.predict(data_categorical_encoded)
data

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status,loan_status_predict
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,0.59,Y,3,1,1
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0.10,N,2,0,0
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,0.57,N,3,1,0
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,0.53,N,2,1,0
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,0.55,Y,4,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
32576,57,53000,MORTGAGE,1.0,PERSONAL,C,5800,13.16,0.11,N,30,0,0
32577,54,120000,MORTGAGE,4.0,PERSONAL,A,17625,7.49,0.15,N,19,0,0
32578,65,76000,RENT,3.0,HOMEIMPROVEMENT,B,35000,10.99,0.46,N,28,1,0
32579,56,150000,MORTGAGE,5.0,PERSONAL,B,15000,11.48,0.10,N,26,0,0


In [29]:
# Training-set accuracy: the fraction of rows whose predicted label matches
# the true label. The mean of a boolean Series is the share of True values.
# This is an in-sample figure, not a held-out estimate.
P = (data['loan_status'] == data['loan_status_predict']).mean()
P

np.float64(0.8114852214480832)

In [22]:
# Select the same four categorical features from the test frame, in the same
# order used for training so the encoder's columns line up.
data_test_categorical = data_test.loc[
    :,
    [
        'person_home_ownership',
        'loan_intent',
        'loan_grade',
        'cb_person_default_on_file',
    ],
]

In [23]:
# Preview the categorical columns of the test frame.
data_test_categorical

Unnamed: 0,person_home_ownership,loan_intent,loan_grade,cb_person_default_on_file
0,RENT,HOMEIMPROVEMENT,F,N
1,MORTGAGE,PERSONAL,C,Y
2,RENT,VENTURE,E,Y
3,RENT,DEBTCONSOLIDATION,A,N
4,MORTGAGE,HOMEIMPROVEMENT,D,Y
...,...,...,...,...
39093,MORTGAGE,DEBTCONSOLIDATION,B,N
39094,MORTGAGE,EDUCATION,A,N
39095,MORTGAGE,PERSONAL,A,N
39096,MORTGAGE,PERSONAL,D,Y


In [24]:
# Encode the test categoricals with the encoder fitted on the training data,
# so a given code means the same category in both sets.
data_test_categorical_encoded = encoder.transform(data_test_categorical.to_numpy())

In [25]:
# Preview the encoded test matrix.
data_test_categorical_encoded

array([[3., 2., 5., 0.],
       [0., 4., 2., 1.],
       [3., 5., 4., 1.],
       ...,
       [0., 4., 0., 0.],
       [0., 4., 3., 1.],
       [3., 0., 1., 0.]], shape=(39098, 4))

In [26]:
# Predict a label for every test row (displayed only; the result is not
# stored by this cell).
clf.predict(data_test_categorical_encoded)

array([1, 0, 1, ..., 0, 1, 0], shape=(39098,))

In [27]:
# Build a new frame holding the test rows plus the predicted `loan_status`.
# `assign` returns a new DataFrame, so `data_test` itself is left untouched;
# a plain `= data_test` would only alias the frame and the column assignment
# would silently modify the original test data as well.
data_test_with_loan_status = data_test.assign(
    loan_status=clf.predict(data_test_categorical_encoded),
)

In [43]:
# Preview the test frame together with its predicted `loan_status` column.
data_test_with_loan_status

Unnamed: 0,id,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status
0,58645,23,69000,RENT,3.0,HOMEIMPROVEMENT,F,25000,15.76,0.36,N,2,1
1,58646,26,96000,MORTGAGE,6.0,PERSONAL,C,10000,12.68,0.10,Y,4,0
2,58647,26,30000,RENT,5.0,VENTURE,E,4000,17.19,0.13,Y,2,1
3,58648,33,50000,RENT,4.0,DEBTCONSOLIDATION,A,7000,8.90,0.14,N,7,0
4,58649,26,102000,MORTGAGE,8.0,HOMEIMPROVEMENT,D,15000,16.32,0.15,Y,4,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
39093,97738,22,31200,MORTGAGE,2.0,DEBTCONSOLIDATION,B,3000,10.37,0.10,N,4,0
39094,97739,22,48000,MORTGAGE,6.0,EDUCATION,A,7000,6.03,0.15,N,3,0
39095,97740,51,60000,MORTGAGE,0.0,PERSONAL,A,15000,7.51,0.25,N,25,0
39096,97741,22,36000,MORTGAGE,4.0,PERSONAL,D,14000,15.62,0.39,Y,4,1


In [44]:
# Write the prediction file: only the `id` and predicted `loan_status`
# columns are exported, and the DataFrame index is omitted.
data_test_with_loan_status.to_csv(
    'data_test_predict-categorical_NB.csv',
    columns=['id', 'loan_status'],
    index=False,
)

See the "simple model" notebook files for the next steps.