In [1]:
#Importing the required libraries 
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import matplotlib 
matplotlib.rcParams["figure.figsize"] = (20,10)

**Loading the Loan Default dataset**


In [2]:
df1 = pd.read_csv("Loan_Default.csv")
df1.head()

Unnamed: 0,ID,year,loan_limit,Gender,approv_in_adv,loan_type,loan_purpose,Credit_Worthiness,open_credit,business_or_commercial,...,credit_type,Credit_Score,co-applicant_credit_type,age,submission_of_application,LTV,Region,Security_Type,Status,dtir1
0,24890,2019,cf,Sex Not Available,nopre,type1,p1,l1,nopc,nob/c,...,EXP,758,CIB,25-34,to_inst,98.728814,south,direct,1,45.0
1,24891,2019,cf,Male,nopre,type2,p1,l1,nopc,b/c,...,EQUI,552,EXP,55-64,to_inst,,North,direct,1,
2,24892,2019,cf,Male,pre,type1,p1,l1,nopc,nob/c,...,EXP,834,CIB,35-44,to_inst,80.019685,south,direct,0,46.0
3,24893,2019,cf,Male,nopre,type1,p4,l1,nopc,nob/c,...,EXP,587,CIB,45-54,not_inst,69.3769,North,direct,0,42.0
4,24894,2019,cf,Joint,pre,type1,p1,l1,nopc,nob/c,...,CRIF,602,EXP,25-34,not_inst,91.886544,North,direct,0,39.0


In [3]:
df1.columns

Index(['ID', 'year', 'loan_limit', 'Gender', 'approv_in_adv', 'loan_type',
       'loan_purpose', 'Credit_Worthiness', 'open_credit',
       'business_or_commercial', 'loan_amount', 'rate_of_interest',
       'Interest_rate_spread', 'Upfront_charges', 'term', 'Neg_ammortization',
       'interest_only', 'lump_sum_payment', 'property_value',
       'construction_type', 'occupancy_type', 'Secured_by', 'total_units',
       'income', 'credit_type', 'Credit_Score', 'co-applicant_credit_type',
       'age', 'submission_of_application', 'LTV', 'Region', 'Security_Type',
       'Status', 'dtir1'],
      dtype='object')

In [4]:
#Droping the columns that will not effect the output
df2 = df1.drop(['ID','year','rate_of_interest','Interest_rate_spread','Upfront_charges','Gender'],axis='columns')
df2.head()

Unnamed: 0,loan_limit,approv_in_adv,loan_type,loan_purpose,Credit_Worthiness,open_credit,business_or_commercial,loan_amount,term,Neg_ammortization,...,credit_type,Credit_Score,co-applicant_credit_type,age,submission_of_application,LTV,Region,Security_Type,Status,dtir1
0,cf,nopre,type1,p1,l1,nopc,nob/c,116500,360.0,not_neg,...,EXP,758,CIB,25-34,to_inst,98.728814,south,direct,1,45.0
1,cf,nopre,type2,p1,l1,nopc,b/c,206500,360.0,not_neg,...,EQUI,552,EXP,55-64,to_inst,,North,direct,1,
2,cf,pre,type1,p1,l1,nopc,nob/c,406500,360.0,neg_amm,...,EXP,834,CIB,35-44,to_inst,80.019685,south,direct,0,46.0
3,cf,nopre,type1,p4,l1,nopc,nob/c,456500,360.0,not_neg,...,EXP,587,CIB,45-54,not_inst,69.3769,North,direct,0,42.0
4,cf,pre,type1,p1,l1,nopc,nob/c,696500,360.0,not_neg,...,CRIF,602,EXP,25-34,not_inst,91.886544,North,direct,0,39.0


**Droping tthe NAN values from the dataframe**

In [5]:
df2.isnull().sum()

loan_limit                    3344
approv_in_adv                  908
loan_type                        0
loan_purpose                   134
Credit_Worthiness                0
open_credit                      0
business_or_commercial           0
loan_amount                      0
term                            41
Neg_ammortization              121
interest_only                    0
lump_sum_payment                 0
property_value               15098
construction_type                0
occupancy_type                   0
Secured_by                       0
total_units                      0
income                        9150
credit_type                      0
Credit_Score                     0
co-applicant_credit_type         0
age                            200
submission_of_application      200
LTV                          15098
Region                           0
Security_Type                    0
Status                           0
dtir1                        24121
dtype: int64

In [6]:
df3 = df2.dropna()
df3.isnull().sum()

loan_limit                   0
approv_in_adv                0
loan_type                    0
loan_purpose                 0
Credit_Worthiness            0
open_credit                  0
business_or_commercial       0
loan_amount                  0
term                         0
Neg_ammortization            0
interest_only                0
lump_sum_payment             0
property_value               0
construction_type            0
occupancy_type               0
Secured_by                   0
total_units                  0
income                       0
credit_type                  0
Credit_Score                 0
co-applicant_credit_type     0
age                          0
submission_of_application    0
LTV                          0
Region                       0
Security_Type                0
Status                       0
dtir1                        0
dtype: int64

**Converting the age range and string units into a float value**

In [7]:
#Function for the convertion of range string to numerical value
def convert_sqft_to_num(x):
    tokens = x.split('-')
    if len(tokens) == 2:
        return (float(tokens[0])+float(tokens[1]))/2
    try:
        return float(x)
    except:
        return None   

In [8]:
#applying the function of the dataset
df4 = df3.copy()
df4.age = df4.age.apply(convert_sqft_to_num)
df4 = df4[df4.age.notnull()]
df4.head()

Unnamed: 0,loan_limit,approv_in_adv,loan_type,loan_purpose,Credit_Worthiness,open_credit,business_or_commercial,loan_amount,term,Neg_ammortization,...,credit_type,Credit_Score,co-applicant_credit_type,age,submission_of_application,LTV,Region,Security_Type,Status,dtir1
0,cf,nopre,type1,p1,l1,nopc,nob/c,116500,360.0,not_neg,...,EXP,758,CIB,29.5,to_inst,98.728814,south,direct,1,45.0
2,cf,pre,type1,p1,l1,nopc,nob/c,406500,360.0,neg_amm,...,EXP,834,CIB,39.5,to_inst,80.019685,south,direct,0,46.0
3,cf,nopre,type1,p4,l1,nopc,nob/c,456500,360.0,not_neg,...,EXP,587,CIB,49.5,not_inst,69.3769,North,direct,0,42.0
4,cf,pre,type1,p1,l1,nopc,nob/c,696500,360.0,not_neg,...,CRIF,602,EXP,29.5,not_inst,91.886544,North,direct,0,39.0
5,cf,pre,type1,p1,l1,nopc,nob/c,706500,360.0,not_neg,...,EXP,864,EXP,39.5,not_inst,70.089286,North,direct,0,40.0


In [9]:
#Converting the stings(xu) into integer(x)
df4['total_units'] = df3['total_units'].apply(lambda x: int(x[0]))
df4.total_units.unique()

array([1, 2, 3, 4])

**Converting the Categorical data columns into numerical values using onehot encoding**

In [10]:
#Function  to list the columns having categorical data 
obj=[]
for i in df4.columns:
    if df4[i].dtype=='object':
        obj.append(i)
obj
l=[]
for i in obj:
    l.append(df4.columns.get_loc(i))
l

[0, 1, 2, 3, 4, 5, 6, 9, 10, 11, 13, 14, 15, 18, 20, 22, 24, 25]

In [11]:
#Dividing the input and output values from the dataset
y=df4['Status']
X = df4.drop(['Status'],axis='columns')

In [12]:
#Diving the dataset into train and test 
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25,random_state=10)

In [13]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers=[('encoder',OneHotEncoder(),l)],remainder='passthrough')
X_train = ct.fit_transform(X_train)
X_test = ct.transform(X_test)

**Model bulding using logistic regression**

In [14]:
#Fitting the Train Test data for model applying
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.fit_transform(X_test)

In [15]:
#Applying logistic regression
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier.fit(X_train, y_train)

In [16]:
#Predicting output values using the model
y_pred = classifier.predict(X_test)

In [17]:
#Making confusion matrix and calculating the accuracy
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[23581   227]
 [ 4172   489]]


0.8454810495626822