# Predict the Loan Approval / Rejection Probability using the Loan Grade data

- Same as previous notebook, but include the data cleaning stages which were not previously included
- Also use original data to make predictions rather than the synthetic dataset

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas
import numpy
import matplotlib
import matplotlib.pyplot as plt
import itertools
import sklearn
from sklearn.metrics import log_loss
from sklearn import preprocessing

In [3]:
import loan_approval_lib
from loan_approval_lib import logspace, linspace

In [4]:
data = loan_approval_lib.load_original_data()

In [5]:
data

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,0.59,Y,3,1
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0.10,N,2,0
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,0.57,N,3,1
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,0.53,N,2,1
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,0.55,Y,4,1
...,...,...,...,...,...,...,...,...,...,...,...,...
32576,57,53000,MORTGAGE,1.0,PERSONAL,C,5800,13.16,0.11,N,30,0
32577,54,120000,MORTGAGE,4.0,PERSONAL,A,17625,7.49,0.15,N,19,0
32578,65,76000,RENT,3.0,HOMEIMPROVEMENT,B,35000,10.99,0.46,N,28,1
32579,56,150000,MORTGAGE,5.0,PERSONAL,B,15000,11.48,0.10,N,26,0


# Group data by loan grade and loan status

In [6]:
data_by_grade_and_status = data.groupby(
    by=['loan_grade', 'loan_status']
)['loan_int_rate'].count()

In [7]:
data_by_grade_and_status

loan_grade  loan_status
A           0              8797
            1               977
B           0              7867
            1              1528
C           0              4619
            1              1209
D           0              1343
            1              1971
E           0               310
            1               571
F           0                64
            1               150
G           0                 1
            1                58
Name: loan_int_rate, dtype: int64

In [8]:
data_by_grade_and_status['A', 0]

np.int64(8797)

In [9]:
data_by_grade_and_status['A', 0] / (data_by_grade_and_status['A', 0] + data_by_grade_and_status['A', 1])

np.float64(0.9000409249028034)

In [10]:
data_by_grade_and_status['A', 1] / (data_by_grade_and_status['A', 0] + data_by_grade_and_status['A', 1])

np.float64(0.09995907509719665)

In [11]:
data_by_grade_and_status['B', 0] / (data_by_grade_and_status['B', 0] + data_by_grade_and_status['B', 1])

np.float64(0.8373602980308675)

In [12]:
data_by_grade = data[['loan_grade', 'loan_status']].groupby(
    by=['loan_grade']
)['loan_status'].mean()
data_by_grade

loan_grade
A    0.099564
B    0.162760
C    0.207340
D    0.590458
E    0.644191
F    0.705394
G    0.984375
Name: loan_status, dtype: float64

In [13]:
for group,df in data[['loan_grade', 'loan_status']].groupby(
    by=['loan_grade']
):
    print(f'group={group}')
    print(df)
    print(df['loan_status'].mean())
    print(df['loan_status'].sum())
    print(df['loan_status'].count())
    break

group=('A',)
      loan_grade  loan_status
5              A            1
8              A            1
11             A            1
12             A            1
14             A            0
...          ...          ...
32567          A            0
32569          A            0
32572          A            0
32573          A            0
32577          A            0

[10777 rows x 2 columns]
0.09956388605363274
1073
10777


# Make Prediction

In [14]:
data_2 = loan_approval_lib.load_data()

#data_train = data['train'] # not used

data_test = data_2['test']

In [15]:
model_dict = {
    key: value for (key, value) in data_by_grade.items()
}
model_dict

{'A': 0.09956388605363274,
 'B': 0.16275954454119224,
 'C': 0.20733973366367298,
 'D': 0.590457804743519,
 'E': 0.6441908713692946,
 'F': 0.7053941908713693,
 'G': 0.984375}

In [16]:
f = numpy.vectorize(lambda x: model_dict[x])

In [17]:
data['loan_status_predict'] = f(data['loan_grade']).round()
P = (data['loan_status_predict'] == data['loan_status']).sum() / len(data)
P

np.float64(0.815444584266904)

In [18]:
data_test['loan_status'] = f(data_test['loan_grade'])

In [19]:
data_test

Unnamed: 0,id,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status
0,58645,23,69000,RENT,3.0,HOMEIMPROVEMENT,F,25000,15.76,0.36,N,2,0.705394
1,58646,26,96000,MORTGAGE,6.0,PERSONAL,C,10000,12.68,0.10,Y,4,0.207340
2,58647,26,30000,RENT,5.0,VENTURE,E,4000,17.19,0.13,Y,2,0.644191
3,58648,33,50000,RENT,4.0,DEBTCONSOLIDATION,A,7000,8.90,0.14,N,7,0.099564
4,58649,26,102000,MORTGAGE,8.0,HOMEIMPROVEMENT,D,15000,16.32,0.15,Y,4,0.590458
...,...,...,...,...,...,...,...,...,...,...,...,...,...
39093,97738,22,31200,MORTGAGE,2.0,DEBTCONSOLIDATION,B,3000,10.37,0.10,N,4,0.162760
39094,97739,22,48000,MORTGAGE,6.0,EDUCATION,A,7000,6.03,0.15,N,3,0.099564
39095,97740,51,60000,MORTGAGE,0.0,PERSONAL,A,15000,7.51,0.25,N,25,0.099564
39096,97741,22,36000,MORTGAGE,4.0,PERSONAL,D,14000,15.62,0.39,Y,4,0.590458


In [20]:
data_test[['id', 'loan_status']].to_csv('data_test_submission_predict_using_loan_grade_no_data_cleaning.csv', index=False)