In [1]:
import pandas as pd

In [2]:
# Remote location of the loan training CSV; loading it in the next cell needs network access.
path = 'https://raw.githubusercontent.com/subashgandyer/datasets/main/loan_train.csv'

In [3]:
# Read the CSV at `path` into the working DataFrame used by every later cell.
data = pd.read_csv(path)

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [5]:
data.isna().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [6]:
# Categorical gaps: fill each column with its own most frequent label.
# The label is looked up from the column rather than hard-coded, so the fill
# value always matches an existing category exactly (a differently spelled or
# differently cased literal would silently create an extra class for the
# LabelEncoder step that follows).
for col in ['Gender', 'Married', 'Self_Employed']:
    data[col] = data[col].fillna(data[col].mode()[0])

# Dependents is an object column because of the '3+' bucket. Map that bucket
# to 3, coerce the remaining text digits to numbers and treat a missing value
# as 0 dependents, so the column ends up with a single integer dtype.
data['Dependents'] = (
    pd.to_numeric(data['Dependents'].replace('3+', 3))
    .fillna(0)
    .astype(int)
)

# The remaining gaps are in numeric columns: fill each with its column mean.
# The mean is restricted to numeric columns because DataFrame.mean() over
# object columns is deprecated in older pandas and raises in recent versions.
numeric_cols = data.select_dtypes(include='number').columns
data[numeric_cols] = data[numeric_cols].fillna(data[numeric_cols].mean())

  data.fillna(data.mean(),inplace=True)


In [7]:
data.isna().sum()

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

# CATEGORICAL DATA TO NUMERICAL DATA

In [8]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
# Single encoder instance that the next cell refits for each text column in turn.
le = LabelEncoder()

In [9]:
# Replace each text column with integer codes. The same encoder is refitted per
# column, so once the loop ends `le` holds the classes of the last column
# (Loan_Status).
for column in ('Gender', 'Married', 'Self_Employed', 'Education', 'Loan_Status'):
    data[column] = le.fit_transform(data[column])

In [None]:
# One-hot encode Property_Area; .toarray() turns the encoder output into a dense array.
ohe = OneHotEncoder()
area = ohe.fit_transform(data[['Property_Area']]).toarray()
# No column names are passed, so the dummy columns get integer labels (0, 1, ...);
# the feature-selection cell further down refers to them by those integers.
a_frame = pd.DataFrame(area)


# Append the dummy columns to the right of the original frame (aligned on the row index).
data_new = pd.concat([data,a_frame],axis=1)

In [None]:
# Preview only the first rows instead of rendering the whole frame.
data_new.head()

# DATA SPLITTING, DEPENDENT AND INDEPENDENT VARIABLES

In [None]:
# Model inputs: the encoded/imputed columns plus the one-hot Property_Area
# columns, which carry the integer labels 0, 1, 2.
feature_cols = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
                'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
                'Loan_Amount_Term', 'Credit_History', 0, 1, 2]

# rename(columns=str) gives every selected column a string label. scikit-learn
# (1.2 and later) raises a TypeError for DataFrames whose column names mix
# str and int, which the selection above would otherwise produce.
X = data_new.loc[:, feature_cols].rename(columns=str)

# Target: the label-encoded loan decision.
y = data['Loan_Status']

In [None]:
# Preview only the first rows of the feature matrix.
X.head()

In [None]:
# Preview only the first target values.
y.head()

### DATA SPLITTING, TRAIN AND TEST

In [None]:
from sklearn.model_selection import train_test_split
# 50/50 hold-out split of features and target; random_state pins the shuffle so
# the same rows land in each half on every run.
train_X,test_X,train_y,test_y = train_test_split(X, y,train_size=0.5,test_size=0.5,random_state=123) 

# DATA PREPROCESSING

In [None]:
from sklearn.preprocessing import scale,MinMaxScaler
# Scaler instance applied to the train/test features in the next cell.
# NOTE(review): `scale` is imported but not used in any cell shown here.
min_max = MinMaxScaler()

In [None]:
# Learn each feature's min/max from the training half only ...
x_train_minMax = min_max.fit_transform(train_X)
# ... and reuse those statistics for the test half. Refitting the scaler on the
# test set would map the two halves onto different scales and leak test-set
# information into preprocessing.
x_test_minMax = min_max.transform(test_X)

# DECISION TREE ALGORITHM 

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Baseline tree: Gini impurity, unlimited depth. random_state is pinned because
# the tree's split search permutes features randomly, so an unseeded tree can
# differ between runs.
tree = DecisionTreeClassifier(criterion='gini', max_depth=None, random_state=123)

In [None]:
# Fit the baseline tree on the min-max scaled training features.
tree.fit(x_train_minMax,train_y)

In [None]:
# Predict labels for the scaled hold-out features.
prediction = tree.predict(x_test_minMax)

In [None]:
from sklearn.metrics import confusion_matrix
# Rows are the true classes, columns the predicted classes.
print(confusion_matrix(test_y,prediction))

In [None]:
# Decision-tree search space: both split criteria, depths 1-7 plus unlimited depth.
param_dist = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [*range(1, 8), None],
}

In [None]:
from sklearn.model_selection import GridSearchCV 

In [None]:
# 10-fold cross-validated grid search over param_dist, starting from the
# baseline tree; n_jobs=-1 runs the fits on all available cores.
grid = GridSearchCV(tree, param_grid=param_dist, cv=10, n_jobs=-1) 

In [None]:
# Run the search on the scaled training half only.
grid.fit(x_train_minMax,train_y)

In [None]:
grid.best_params_

In [None]:
grid.best_estimator_

In [None]:
# Predict the scaled hold-out half with the search object.
grid_predictions = grid.predict(x_test_minMax)

In [None]:
from sklearn.metrics import accuracy_score

# The confusion matrix and test accuracy describe the held-out half.
# best_score_ is a different quantity: the mean cross-validated score of the
# best parameter setting on the training folds, so it is labelled separately.
print(confusion_matrix(test_y, grid_predictions))
print('test accuracy:', accuracy_score(test_y, grid_predictions))
print('best cross-validated accuracy (training folds):', grid.best_score_)

# KNN ALGORITHM 

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler

# KNN classifies by distance, so features on large scales (the income and
# loan-amount columns) would dominate the neighbour search if left unscaled.
# Bundling the scaler with the classifier lets callers keep passing the raw
# train_X / test_X frames: the scaler is fitted during fit() and reused,
# without refitting, during predict().
Knn = make_pipeline(
    MinMaxScaler(),
    KNeighborsClassifier(n_neighbors=5, weights='uniform',
                         algorithm='auto', leaf_size=30,
                         metric='minkowski', p=2,
                         metric_params=None, n_jobs=None),
)

In [None]:
# Fit the baseline KNN on train_X (the raw split, not the x_train_minMax array).
Knn.fit(train_X,train_y)

In [None]:
# Predict the hold-out half, passed as the raw test_X frame to match the fit call.
K_prediction = Knn.predict(test_X)

In [None]:
# Rows are the true classes, columns the predicted classes.
print(confusion_matrix(test_y,K_prediction))

In [None]:
# KNN search space: odd neighbour counts from 5 to 15, both vote weightings,
# three distance metrics.
param_distK = dict(
    n_neighbors=list(range(5, 16, 2)),
    weights=['uniform', 'distance'],
    metric=['minkowski', 'euclidean', 'manhattan'],
)

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler

# KNN is distance based, so the features are min-max scaled inside the
# estimator. Because the scaler is a pipeline step it is refitted on the
# training part of every CV fold, so validation folds never influence the
# scaling. Grid keys are prefixed with the step name make_pipeline assigns to
# the classifier ("kneighborsclassifier"); best_params_ reports the prefixed keys.
K_grid = GridSearchCV(
    make_pipeline(MinMaxScaler(), KNeighborsClassifier()),
    {f'kneighborsclassifier__{name}': values for name, values in param_distK.items()},
    verbose=1,
    cv=3,
    n_jobs=-1,
)

In [None]:
# Run the KNN search on train_X (the raw split, not the x_train_minMax array).
K_grid.fit(train_X,train_y)

In [None]:
K_grid.best_params_

In [None]:
K_grid.best_estimator_

In [None]:
# Predict the hold-out half with the KNN search object, using the raw test_X frame.
grid_predictionsK = K_grid.predict(test_X)

In [None]:
from sklearn.metrics import accuracy_score

# The confusion matrix and test accuracy describe the held-out half.
# best_score_ is the mean cross-validated score on the training folds, so it
# is labelled separately.
print(confusion_matrix(test_y, grid_predictionsK))
print('test accuracy:', accuracy_score(test_y, grid_predictionsK))
print('best cross-validated accuracy (training folds):', K_grid.best_score_)

# LOGISTIC REGRESSION ALGORITHM 

In [None]:
import numpy as np
from sklearn.linear_model import LogisticRegression

# Baseline logistic regression with library defaults; the same object is also
# the base estimator of the grid search further down.
logReg = LogisticRegression()

In [None]:
# Fit on train_X (the raw split, not the x_train_minMax array).
# NOTE(review): with unscaled features and the default iteration limit the
# solver may stop before converging - check the cell output for a ConvergenceWarning.
logReg.fit(train_X,train_y)

In [None]:
# Predict the hold-out half, passed as the raw test_X frame to match the fit call.
Reg_prediction = logReg.predict(test_X)

In [None]:
# Rows are the true classes, columns the predicted classes.
print(confusion_matrix(test_y,Reg_prediction))

In [None]:
# Logistic-regression search space, written as a list of sub-grids so that only
# penalty/solver pairs the solvers actually support are tried. A single crossed
# grid would spend most of its fits on combinations that error out (e.g. l1 with
# lbfgs, or elasticnet without an l1_ratio) and are scored as NaN.
# The unpenalised setting is not listed: its spelling differs between
# scikit-learn versions, and very large C values already approximate it.
reg_strengths = np.logspace(-4, 4, 20)
iteration_caps = [100, 1000, 2500, 5000]

param_dist_reg = [
    # l2 is supported by every solver.
    {'penalty': ['l2'],
     'C': reg_strengths,
     'solver': ['lbfgs', 'newton-cg', 'liblinear', 'sag', 'saga'],
     'max_iter': iteration_caps},
    # l1 is only supported by liblinear and saga.
    {'penalty': ['l1'],
     'C': reg_strengths,
     'solver': ['liblinear', 'saga'],
     'max_iter': iteration_caps},
    # elasticnet is saga-only and needs an explicit l1/l2 mixing ratio.
    {'penalty': ['elasticnet'],
     'C': reg_strengths,
     'solver': ['saga'],
     'l1_ratio': [0.25, 0.5, 0.75],
     'max_iter': iteration_caps},
]

In [None]:
# 3-fold cross-validated grid search over param_dist_reg, starting from logReg;
# n_jobs=-1 runs the fits on all available cores.
reg_grid = GridSearchCV(logReg, param_grid = param_dist_reg, verbose = 1, cv=3, n_jobs = -1)

In [None]:
# Run the logistic-regression search on train_X (the raw split, not x_train_minMax).
reg_grid.fit(train_X,train_y)

In [None]:
reg_grid.best_params_

In [None]:
reg_grid.best_estimator_

In [None]:
# Predict the hold-out half with the logistic-regression search object.
grid_predictionsReg = reg_grid.predict(test_X)

In [None]:
from sklearn.metrics import accuracy_score

# The confusion matrix and test accuracy describe the held-out half.
# best_score_ is the mean cross-validated score on the training folds, so it
# is labelled separately.
print(confusion_matrix(test_y, grid_predictionsReg))
print('test accuracy:', accuracy_score(test_y, grid_predictionsReg))
print('best cross-validated accuracy (training folds):', reg_grid.best_score_)

# SVM ALGORITHM

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC

# The RBF kernel depends on distances between samples, so unscaled features
# with large ranges (the income and loan-amount columns) would swamp the rest.
# Bundling the scaler with the classifier lets callers keep passing the raw
# train_X / test_X frames: the scaler is fitted during fit() and reused,
# without refitting, during predict().
svm_model = make_pipeline(
    MinMaxScaler(),
    SVC(C=1, gamma=0.01, kernel='rbf', random_state=50),
)

In [None]:
# Fit the baseline SVM on train_X (the raw split, not the x_train_minMax array).
svm_model.fit(train_X,train_y)

In [None]:
# prediction_svm was never assigned anywhere in the notebook, so this cell
# raised NameError on a fresh kernel; compute the hold-out predictions here.
prediction_svm = svm_model.predict(test_X)

# Rows are the true classes, columns the predicted classes.
print(confusion_matrix(test_y, prediction_svm))

In [None]:
# SVM search space: C and gamma on decade grids, RBF kernel only.
param_grid_svm = dict(
    C=[0.1, 1, 10, 100, 1000],
    gamma=[1, 0.1, 0.01, 0.001, 0.0001],
    kernel=['rbf'],
)

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler

# The RBF kernel is distance based, so the gamma grid only makes sense on
# scaled features. The scaler is a pipeline step and is therefore refitted on
# the training part of every CV fold. Grid keys are prefixed with the step
# name make_pipeline assigns to the classifier ("svc"); best_params_ reports
# the prefixed keys.
grid_svm = GridSearchCV(
    make_pipeline(MinMaxScaler(), SVC()),
    {f'svc__{name}': values for name, values in param_grid_svm.items()},
    refit=True,
    verbose=3,
)

In [None]:
# Run the SVM search on train_X (the raw split, not the x_train_minMax array).
grid_svm.fit(train_X,train_y)

In [None]:
grid_svm.best_params_

In [None]:
grid_svm.best_estimator_

In [None]:
# Predict with the SVM search object. The previous code called `grid`, which is
# the decision-tree search, so the SVM results reported below were actually
# tree predictions.
grid_prediction_svm = grid_svm.predict(test_X)

In [None]:
from sklearn.metrics import accuracy_score

# The confusion matrix and test accuracy describe the held-out half.
# best_score_ is the mean cross-validated score on the training folds, so it
# is labelled separately.
print(confusion_matrix(test_y, grid_prediction_svm))
print('test accuracy:', accuracy_score(test_y, grid_prediction_svm))
print('best cross-validated accuracy (training folds):', grid_svm.best_score_)

# RANDOM FOREST ALGORITHM 

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Baseline forest of 100 trees; random_state pins the randomness so results repeat.
rf_model = RandomForestClassifier(random_state =0, n_estimators=100)

In [None]:
# Fit the baseline forest on train_X (the raw split, not the x_train_minMax array).
rf_model.fit(train_X,train_y)

In [None]:
# Predict the hold-out half with the baseline forest.
prediction_rf = rf_model.predict(test_X)

In [None]:
# Rows are the true classes, columns the predicted classes.
print(confusion_matrix(test_y,prediction_rf))

In [None]:
# Random-forest search space: with/without bootstrap sampling, depth caps from
# 10 to 100 in steps of 10 plus unlimited depth, and three forest sizes.
param_grid_rf = dict(
    bootstrap=[True, False],
    max_depth=[*range(10, 101, 10), None],
    n_estimators=[200, 400, 600],
)

In [None]:
# 3-fold cross-validated grid search over param_grid_rf, starting from rf_model.
# No n_jobs is passed here, unlike the tree/KNN/logistic-regression searches above.
grid_rf = GridSearchCV(estimator = rf_model, param_grid = param_grid_rf,  cv = 3, verbose=2)

In [None]:
# Run the random-forest search on train_X (the raw split, not x_train_minMax).
grid_rf.fit(train_X,train_y)

In [None]:
grid_rf.best_params_

In [None]:
grid_rf.best_estimator_

In [None]:
# Predict the hold-out half with the random-forest search object.
grid_prediction_rf = grid_rf.predict(test_X)

In [None]:
from sklearn.metrics import accuracy_score

# The confusion matrix and test accuracy describe the held-out half.
# best_score_ is the mean cross-validated score on the training folds, so it
# is labelled separately.
print(confusion_matrix(test_y, grid_prediction_rf))
print('test accuracy:', accuracy_score(test_y, grid_prediction_rf))
print('best cross-validated accuracy (training folds):', grid_rf.best_score_)