<h2> Introduction </h2<

In [1]:
# Importing python modules
import pandas as pd
import warnings
warnings.filterwarnings(action= "ignore", message= "Ignoring Warning")

In [2]:
loan_df = pd.read_csv('./dataset/loan_data_set.csv')  # using pandas read_csv function to load the loan data.

In [3]:
loan_df.head(20)  # returns first 20 samples of the loan data

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
5,LP001011,Male,Yes,2,Graduate,Yes,5417,4196.0,267.0,360.0,1.0,Urban,Y
6,LP001013,Male,Yes,0,Not Graduate,No,2333,1516.0,95.0,360.0,1.0,Urban,Y
7,LP001014,Male,Yes,3+,Graduate,No,3036,2504.0,158.0,360.0,0.0,Semiurban,N
8,LP001018,Male,Yes,2,Graduate,No,4006,1526.0,168.0,360.0,1.0,Urban,Y
9,LP001020,Male,Yes,1,Graduate,No,12841,10968.0,349.0,360.0,1.0,Semiurban,N


In [4]:
loan_df.describe()  # describes statistical information on integer columns of the data.

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,614.0,614.0,592.0,600.0,564.0
mean,5403.459283,1621.245798,146.412162,342.0,0.842199
std,6109.041673,2926.248369,85.587325,65.12041,0.364878
min,150.0,0.0,9.0,12.0,0.0
25%,2877.5,0.0,100.0,360.0,1.0
50%,3812.5,1188.5,128.0,360.0,1.0
75%,5795.0,2297.25,168.0,360.0,1.0
max,81000.0,41667.0,700.0,480.0,1.0


In [5]:
loan_df.info()  # shows information on the loan data like columns with null data and datatype of each column.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [6]:
loan_df.isnull().sum()  # counts number null values present in each columns.

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [7]:
# creating a list of columns with null values
col_null = []
column = list(loan_df.columns)
print(column)
for col in column:
    if loan_df[col].isna().sum() > 0:
        col_null.append(col)
print(col_null)

['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status']
['Gender', 'Married', 'Dependents', 'Self_Employed', 'LoanAmount', 'Loan_Amount_Term', 'Credit_History']


In [8]:
# creating a list of the modes for each of the columns with missing values
mode = []
for miss in col_null:
    mode.append(loan_df[miss].mode())
print(mode)

[0    Male
dtype: object, 0    Yes
dtype: object, 0    0
dtype: object, 0    No
dtype: object, 0    120.0
dtype: float64, 0    360.0
dtype: float64, 0    1.0
dtype: float64]


In [9]:
# picks the modes in each values inside the list.
real_mode = []
for modes in mode:
    real_mode.append(modes[0])
print(real_mode)

['Male', 'Yes', '0', 'No', 120.0, 360.0, 1.0]


In [10]:
# creates a dictionary where the keys are the missing columns and the values show the mode of each columns
val = {}
for miss in col_null:
    for m in real_mode:
        val[miss] = m
        real_mode.remove(m)
        break

In [11]:
print(val)

{'Gender': 'Male', 'Married': 'Yes', 'Dependents': '0', 'Self_Employed': 'No', 'LoanAmount': 120.0, 'Loan_Amount_Term': 360.0, 'Credit_History': 1.0}


In [12]:
# fills the missing values in each column
for i,k in val.items():
    loan_df[i].fillna(k, inplace= True)

In [13]:
loan_df.info()  # checking number of missing values present.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             614 non-null    object 
 2   Married            614 non-null    object 
 3   Dependents         614 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      614 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         614 non-null    float64
 9   Loan_Amount_Term   614 non-null    float64
 10  Credit_History     614 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [14]:
loan_df.isnull().sum()

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

In [15]:
#  identifying columns with object datatype
s = (loan_df.dtypes == 'object')

In [16]:
# creating list of objects with object datatypes.
obj_col = list(s[s].index)
obj_col.remove("Loan_ID")  # removing the id column

In [17]:
obj_col

['Gender',
 'Married',
 'Dependents',
 'Education',
 'Self_Employed',
 'Property_Area',
 'Loan_Status']

In [18]:
# importing the ordinal encoder class from the sklearn preprocessing module
from sklearn.preprocessing import OrdinalEncoder
ordinate = OrdinalEncoder()  # the ordinate object takes the OrdinalEncoder class as it's values.

In [19]:
loan_data = loan_df.copy()  # making a copy of the loan_df to be stored with loan_data

In [20]:
# transforming the columns with object datatypes.
loan_data[obj_col] = ordinate.fit_transform(loan_data[obj_col])

In [21]:
loan_data.head()  # outputs top 5 rows of the dataset.

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,1.0,0.0,0.0,0.0,0.0,5849,0.0,120.0,360.0,1.0,2.0,1.0
1,LP001003,1.0,1.0,1.0,0.0,0.0,4583,1508.0,128.0,360.0,1.0,0.0,0.0
2,LP001005,1.0,1.0,0.0,0.0,1.0,3000,0.0,66.0,360.0,1.0,2.0,1.0
3,LP001006,1.0,1.0,0.0,1.0,0.0,2583,2358.0,120.0,360.0,1.0,2.0,1.0
4,LP001008,1.0,0.0,0.0,0.0,0.0,6000,0.0,141.0,360.0,1.0,2.0,1.0


In [22]:
X = loan_data.drop(["Loan_Status", "Loan_ID"], axis= 1)  # variable X after dropping off the Loan_Status and Loan_ID
y = loan_data.Loan_Status  # variable y takes the Loan Status column.

In [23]:
# importing the Standard Scaler and Robust Scaler classes from the sklearn preprocessing module
from sklearn.preprocessing import StandardScaler, RobustScaler
scale = StandardScaler()  # creating a variable named scale saving the StandardScaler class
rob = RobustScaler()  # creating a variable named rob saving the RobustScaler class.

In [24]:
# transforming the variable X using the scale and robust class
scaled_X = scale.fit_transform(X)
rob_X = rob.fit_transform(X, y)

In [25]:
# importing train_test_split and the necessary algorithms used in building the voting classifier.
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
train_X, test_X, train_y, test_y = train_test_split(rob_X, y, random_state=1234, test_size= 0.2)

In [26]:
# creating variables to store algorithm classes
lr = LogisticRegression(random_state= 1234, n_jobs= 1)
svc = SVC(random_state= 1234, C =1, kernel= 'poly', degree= 0, max_iter= 1)
rf = RandomForestClassifier(n_estimators= 100, n_jobs= 1, max_leaf_nodes= 6)

In [27]:
voting_clf = VotingClassifier(
    estimators= [('lr', lr), ('rf', rf), ('svc', svc)],
    voting = "hard",
)

In [28]:
voting_clf.fit(train_X, train_y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


VotingClassifier(estimators=[('lr',
                              LogisticRegression(n_jobs=1, random_state=1234)),
                             ('rf',
                              RandomForestClassifier(max_leaf_nodes=6,
                                                     n_jobs=1)),
                             ('svc',
                              SVC(C=1, degree=0, kernel='poly', max_iter=1,
                                  random_state=1234))])

In [29]:
from sklearn.metrics import accuracy_score, f1_score

In [30]:
for clf in (lr, rf, svc, voting_clf):
    clf.fit(train_X, train_y)
    y_pred = clf.predict(test_X)
    print(clf.__class__.__name__, accuracy_score(y_pred, test_y), f1_score(y_pred, test_y))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression 0.8617886178861789 0.9081081081081082
RandomForestClassifier 0.8617886178861789 0.9081081081081082
SVC 0.6829268292682927 0.8115942028985507


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


VotingClassifier 0.8617886178861789 0.9081081081081082




In [31]:
new_file = {}
details = ['Male', 'No', 1, 'Not Graduate', 'No', int(2000), int(0), int(55), int(180), int(0), 'Rural']
column.remove('Loan_ID')
column.remove('Loan_Status')
for col in column :
    for det in details:
        new_file[col] = det
        details.remove(det)
        break

In [32]:
import numpy as np
detail = ['Male', 'No', 1, 'Not Graduate', 'No', int(2000), int(0), int(55), int(180), int(0), 'Rural']
np_det = np.array([detail])

In [33]:
print(np_det.shape)
new_data = pd.DataFrame(new_file, columns= column, index = range(1))


(1, 11)


In [34]:
new_fil = np.array(new_file)

In [35]:
new_file

{'Gender': 'Male',
 'Married': 'No',
 'Dependents': 1,
 'Education': 'Not Graduate',
 'Self_Employed': 'No',
 'ApplicantIncome': 2000,
 'CoapplicantIncome': 0,
 'LoanAmount': 55,
 'Loan_Amount_Term': 180,
 'Credit_History': 0,
 'Property_Area': 'Rural'}

In [36]:
new_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Gender             1 non-null      object
 1   Married            1 non-null      object
 2   Dependents         1 non-null      int64 
 3   Education          1 non-null      object
 4   Self_Employed      1 non-null      object
 5   ApplicantIncome    1 non-null      int64 
 6   CoapplicantIncome  1 non-null      int64 
 7   LoanAmount         1 non-null      int64 
 8   Loan_Amount_Term   1 non-null      int64 
 9   Credit_History     1 non-null      int64 
 10  Property_Area      1 non-null      object
dtypes: int64(6), object(5)
memory usage: 216.0+ bytes


In [37]:
new_data

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,Male,No,1,Not Graduate,No,2000,0,55,180,0,Rural


In [38]:
y = (new_data.dtypes == 'object')
obj = list(y[y].index)
new_data[obj] = ordinate.fit_transform(new_data[obj])

In [39]:
new_data

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,0.0,0.0,1,0.0,0.0,2000,0,55,180,0,0.0


In [40]:
new_pred = voting_clf.predict(new_data)



In [41]:
new_pred

array([1.])