In [1]:
import pandas as pd
# Read the loan-approval dataset from the notebook's working directory.
data = pd.read_csv(filepath_or_buffer='LoanApprovalPrediction.csv')
# Trailing expression: the notebook renders it as (n_rows, n_columns).
data.shape

(598, 13)

In [2]:
# Print each column's name, non-null count and dtype to spot missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 598 entries, 0 to 597
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            598 non-null    object 
 1   Gender             598 non-null    object 
 2   Married            598 non-null    object 
 3   Dependents         586 non-null    float64
 4   Education          598 non-null    object 
 5   Self_Employed      598 non-null    object 
 6   ApplicantIncome    598 non-null    int64  
 7   CoapplicantIncome  598 non-null    float64
 8   LoanAmount         577 non-null    float64
 9   Loan_Amount_Term   584 non-null    float64
 10  Credit_History     549 non-null    float64
 11  Property_Area      598 non-null    object 
 12  Loan_Status        598 non-null    object 
dtypes: float64(5), int64(1), object(7)
memory usage: 60.9+ KB


In [3]:
# Number of missing entries in each column.
data.isnull().sum()

Loan_ID               0
Gender                0
Married               0
Dependents           12
Education             0
Self_Employed         0
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           21
Loan_Amount_Term     14
Credit_History       49
Property_Area         0
Loan_Status           0
dtype: int64

In [4]:
# Count distinct Loan_ID values to compare against the number of rows.
data['Loan_ID'].nunique()

598

In [5]:
# Drop the Loan_ID column: it holds a distinct value for every row (598 unique
# values in 598 rows), so it is an identifier rather than a feature.
# The frame is rebound instead of mutated in place, and errors='ignore' makes
# the cell safe to re-run: an in-place drop raises KeyError the second time,
# because the column is already gone.
data = data.drop(columns=['Loan_ID'], errors='ignore')

# Data Cleaning and Preparation


In [6]:
# Total number of missing cells in the whole frame (96 in the recorded run).
data.isna().to_numpy().sum()

96

In [7]:
# Encode Gender as integers (Male -> 0, Female -> 1).
# Series.map turns every value that is not a key of the dictionary into NaN,
# so running this cell a second time on the already-encoded 0/1 column would
# wipe it out. The mapping is therefore applied only while the column is still
# non-numeric, which makes the cell safe to re-run.
if not pd.api.types.is_numeric_dtype(data['Gender']):
    data['Gender'] = data['Gender'].map({'Male': 0, 'Female': 1})

In [8]:
from sklearn.preprocessing import LabelEncoder
# Integer-encode every column that still has the object dtype.
# A fresh LabelEncoder is fitted for each column and stored in
# `label_encoders`, so each column's code -> label mapping (encoder.classes_)
# stays available afterwards. Refitting one shared encoder would keep only the
# mapping of the last column processed.
label_encoders = {}
label_encoder = LabelEncoder()  # stays unfitted only if there is no object column
obj = (data.dtypes == 'object')
for col in list(obj[obj].index):
    label_encoder = LabelEncoder()
    data[col] = label_encoder.fit_transform(data[col])
    label_encoders[col] = label_encoder

In [9]:
# Fill each remaining missing value with the mean of its own column.
# data.mean() yields one mean per column; fillna aligns them by column name.
data = data.fillna(data.mean())

# Model Training

In [10]:
# Separate the target column (Loan_Status) from the predictor columns.
y = data['Loan_Status']
x = data.drop(columns=['Loan_Status'])

In [11]:
from sklearn.model_selection import train_test_split
# Hold out part of the rows for evaluation.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,   # 30 % of the rows go to the test split
    random_state=7,  # fixed seed so the split is the same on every run
)

In [12]:
pip install --upgrade scikit-learn

Requirement already up-to-date: scikit-learn in /opt/anaconda3/lib/python3.8/site-packages (1.3.0)
Note: you may need to restart the kernel to use updated packages.


In [13]:
pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [14]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

# Candidate classifiers to compare, as (short label, estimator) pairs.
# DecisionTreeClassifier and RandomForestClassifier use randomness while
# fitting, so they get a fixed random_state (the same seed as the train/test
# split). Without it their accuracy changes from one run to the next.
models = [
    ('LR', LogisticRegression(max_iter=1000)),
    ('LDA', LinearDiscriminantAnalysis()),
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier(random_state=7)),
    ('NB', GaussianNB()),
    ('SVC', SVC()),
    ('RC', RidgeClassifier()),
    ('RF', RandomForestClassifier(random_state=7)),
]


def modeling(model):
    """Fit ``model`` on the training split and score it on the test split.

    Reads the notebook-level ``x_train``/``y_train`` for fitting and
    ``x_test``/``y_test`` for evaluation. The estimator is fitted in place.

    Parameters
    ----------
    model : object exposing ``fit(X, y)`` and ``predict(X)``.

    Returns
    -------
    The test-set accuracy as a percentage (``accuracy_score`` times 100).
    """
    model.fit(x_train, y_train)
    predictions = model.predict(x_test)
    fraction_correct = accuracy_score(y_test, predictions)
    return fraction_correct * 100
     
# Fit and score each candidate in turn, printing one "<label> = <accuracy %>" line.
for name, model in models:
    score = modeling(model)
    print(f'{name} = {score}')
     
LR = 80.83333333333333
LDA = 82.5
KNN = 63.74999999999999
CART = 68.33333333333333
NB = 81.66666666666667
SVC = 69.16666666666667
RC = 82.91666666666667
RF = 81.66666666666667

LR = 81.66666666666667
LDA = 82.22222222222221
KNN = 66.11111111111111
CART = 74.44444444444444
NB = 82.77777777777777
SVC = 74.44444444444444
RC = 82.22222222222221
RF = 80.0


In [16]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import accuracy_score
import pickle


# Load the raw data and drop Loan_ID, which is a per-row identifier.
data = pd.read_csv('LoanApprovalPrediction.csv')
data = data.drop(columns=['Loan_ID'])

# Integer-encode every object-dtype column. A fresh LabelEncoder is fitted per
# column and kept in `label_encoders`, so each column's code -> label mapping
# (encoder.classes_) stays available; a single shared encoder would keep only
# the mapping of the last column processed.
# NOTE(review): the exploratory cell maps Gender manually (Male -> 0,
# Female -> 1), whereas here Gender goes through LabelEncoder, which numbers
# classes in sorted order (presumably Female -> 0, Male -> 1). Confirm that the
# code consuming the pickle encodes Gender the same way as this cell.
label_encoders = {}
label_encoder = LabelEncoder()  # stays unfitted only if there is no object column
obj = (data.dtypes == 'object')
for col in list(obj[obj].index):
    label_encoder = LabelEncoder()
    data[col] = label_encoder.fit_transform(data[col])
    label_encoders[col] = label_encoder

# Fill missing values with the mean of their column.
# NOTE(review): the means are computed over all rows, including the rows that
# end up in the test split below.
for col in data.columns:
    data[col] = data[col].fillna(data[col].mean())

# Split into features and target variable.
x = data.drop(['Loan_Status'], axis=1)
y = data.Loan_Status

# Split into training and testing data (30 % held out, fixed seed).
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=7)

# Define the model and fit it on the training data.
model = RidgeClassifier()
model.fit(x_train, y_train)

# Score the fitted model on the held-out rows before exporting it, so the
# quality of the saved artefact is recorded alongside it.
holdout_accuracy = accuracy_score(y_test, model.predict(x_test)) * 100
print(f'RC hold-out accuracy = {holdout_accuracy}')

# Save the trained model.
with open('train_model.pkl', mode='wb') as pkl:
    pickle.dump(model, pkl)