In [173]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
%matplotlib inline

from warnings import simplefilter
simplefilter(action='ignore')

<b>Importing DataSet

In [104]:
df = pd.read_csv('loan_predict.csv')

In [105]:
df

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,LP002978,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
610,LP002979,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
611,LP002983,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y
612,LP002984,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y


In [106]:
df.dtypes

Loan_ID               object
Gender                object
Married               object
Dependents            object
Education             object
Self_Employed         object
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area         object
Loan_Status           object
dtype: object

In [107]:
df['Loan_Status'].value_counts()

Y    422
N    192
Name: Loan_Status, dtype: int64

In [108]:
df.shape

(614, 13)

In [109]:
df.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [110]:
df.dropna(axis=0, inplace=True)

In [111]:
df['Property_Area'].value_counts()

Semiurban    191
Urban        150
Rural        139
Name: Property_Area, dtype: int64

In [112]:
df['Dependents'][df['Dependents']=='3+']=4

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [128]:
df.drop(['Loan_ID'], axis=1, inplace=True)

<b>Encoding categorical data values (Transforming categorical data/ Strings to integers)

In [113]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
encoder_col = ['Gender','Married','Education','Self_Employed','Loan_Status']
for col in encoder_col:
    df[col] = encoder.fit_transform(df[col])

In [114]:
df['Dependents'] = df['Dependents'].astype('int32')

In [124]:
df = pd.concat([df,pd.get_dummies(df['Property_Area'])], axis=1).drop(['Property_Area'], axis=1)

In [176]:
df.drop(['Rural'],axis=1,inplace=True)
df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status,Semiurban,Urban
1,1,1,1,0,0,4583,1508.0,128.0,360.0,1.0,0,0,0
2,1,1,0,0,1,3000,0.0,66.0,360.0,1.0,1,0,1
3,1,1,0,1,0,2583,2358.0,120.0,360.0,1.0,1,0,1
4,1,0,0,0,0,6000,0.0,141.0,360.0,1.0,1,0,1
5,1,1,2,0,1,5417,4196.0,267.0,360.0,1.0,1,0,1


<b>Split the data into independent 'X' and dependent 'Y' variables

In [179]:
X = df.drop(['Loan_Status'], axis=1).values
Y = df.iloc[:, 10:11].values

<b>Split the dataset into 75% Training set and 25% Testing set

In [180]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=0, test_size=0.25)

<b>Feature scalling

In [181]:
from sklearn.preprocessing import StandardScaler

scaller = StandardScaler()
X_train = scaller.fit_transform(X_train)
X_test = scaller.transform(X_test)

<h2>Classification</h2>

In [182]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

def models(X_train,Y_tarin):
    #Using Logistic Regression Algorithm to the Training Set
    from sklearn.linear_model import LogisticRegression
    log = LogisticRegression()
    log.fit(X_train,Y_train)
    parameters = [{'solver':['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']}]
    logGS = GridSearchCV(estimator = log, param_grid = parameters, scoring = 'accuracy', n_jobs = -1)
    logGS.fit(X_train, Y_train)
    
    #Using KNeighborsClassifier Method of neighbors class to use Nearest Neighbor algorithm
    from sklearn.neighbors import KNeighborsClassifier
    knn = KNeighborsClassifier()
    knn.fit(X_train, Y_train)
    parameters = [{'n_neighbors':[1,2,3,4,5,6,7,8]}]
    knnGS = GridSearchCV(estimator = knn, param_grid = parameters, scoring = 'accuracy', n_jobs = -1)
    knnGS.fit(X_train, Y_train)
    
    #Using SVC method of svm class to use Support Vector Machine Algorithm
    from sklearn.svm import SVC
    svc = SVC()
    svc.fit(X_train, Y_train)
    parameters = [{'kernel':['linear', 'poly', 'rbf', 'sigmoid']}]
    svcGS = GridSearchCV(estimator = svc, param_grid = parameters, scoring = 'accuracy', n_jobs = -1)
    svcGS.fit(X_train, Y_train)
    
    #Using GaussianNB method of naïve_bayes class to use Naïve Bayes Algorithm
    from sklearn.naive_bayes import GaussianNB
    gauss = GaussianNB()
    gauss.fit(X_train, Y_train)
    
    #Using DecisionTreeClassifier of tree class to use Decision Tree Algorithm
    from sklearn.tree import DecisionTreeClassifier
    tree = DecisionTreeClassifier()
    tree.fit(X_train, Y_train)
    parameters = [{'criterion':['entropy', 'gini']}]
    treeGS = GridSearchCV(estimator = tree, param_grid = parameters, scoring = 'accuracy', n_jobs = -1)
    treeGS.fit(X_train, Y_train)
    
    #Using RandomForestClassifier method of ensemble class to use Random Forest Classification algorithm
    from sklearn.ensemble import RandomForestClassifier
    forest = DecisionTreeClassifier()
    forest.fit(X_train, Y_train)
    parameters = [{'criterion':['entropy', 'gini']}]
    forestGS = GridSearchCV(estimator = forest, param_grid = parameters, scoring = 'accuracy', n_jobs = -1)
    forestGS.fit(X_train, Y_train)
    
    model_name = ['Logistic Regression','KNeighbors','SVM', 'GaussianNB', 'Decision Tree', 'Random Forest']
    
    return logGS, knnGS, svcGS, gauss, treeGS, forestGS, model_name

In [183]:
model = models(X_train, Y_train)

In [184]:
for i in range(len(model)-1):
    print('Model : ',model[6][i])
    #Check precision, recall, f1-score
    print(classification_report(Y_test, model[i].predict(X_test)))
    print('------------------')
    print( accuracy_score(Y_test, model[i].predict(X_test)))
    print('-----------------------------------------------------')

Model :  Logistic Regression
              precision    recall  f1-score   support

           0       1.00      0.36      0.53        42
           1       0.74      1.00      0.85        78

    accuracy                           0.78       120
   macro avg       0.87      0.68      0.69       120
weighted avg       0.83      0.78      0.74       120

------------------
0.775
-----------------------------------------------------
Model :  KNeighbors
              precision    recall  f1-score   support

           0       0.89      0.38      0.53        42
           1       0.75      0.97      0.84        78

    accuracy                           0.77       120
   macro avg       0.82      0.68      0.69       120
weighted avg       0.80      0.77      0.74       120

------------------
0.7666666666666667
-----------------------------------------------------
Model :  SVM
              precision    recall  f1-score   support

           0       1.00      0.36      0.53        42
    

As the dataset is imbalanced, there are more number loan approval than disapproved loans.<br>
i.e Model does not have sufficient data to learn from..

<u>DataFrame in Loan_Status</u>
<br>
count 1 is more (332)<br>
count 0 is less (148)

In [185]:
df['Loan_Status'].value_counts()

1    332
0    148
Name: Loan_Status, dtype: int64