In [53]:
import pandas as pd
import joblib

In [2]:
# Read the prepared dataset from its path relative to this notebook.
df = pd.read_csv(filepath_or_buffer='../dataset/final_data.csv')

# Bare last expression: renders the frame as this cell's output.
df

Unnamed: 0,Gender,Married,ApplicantIncome,LoanAmount,Credit_History,Loan_Status
0,1,0,5849,146.412162,1.0,1
1,1,0,4583,128.000000,1.0,0
2,1,0,3000,66.000000,1.0,1
3,1,0,2583,120.000000,1.0,1
4,1,0,6000,141.000000,1.0,1
...,...,...,...,...,...,...
609,0,0,2900,71.000000,1.0,1
610,1,0,4106,40.000000,1.0,1
611,1,0,8072,253.000000,1.0,1
612,1,0,7583,187.000000,1.0,1


In [3]:
from sklearn.model_selection import train_test_split

# Target column first, then every remaining column as the feature matrix.
y = df['Loan_Status']
X = df.drop(columns='Loan_Status')

In [4]:
# Hold out 25% of the rows for evaluation. The shuffle is seeded so the split
# (and every metric computed from it) is identical on each Restart & Run All;
# 42 matches the seed used for the final LogisticRegression in this notebook.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

In [5]:
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

# Columns that are standardised; every other column is passed through unchanged.
column_names = ['ApplicantIncome', 'LoanAmount']

ct = ColumnTransformer(
    [('scaler', StandardScaler(), column_names)],
    remainder='passthrough',
)

In [6]:
# Fit the transformer on the training split only, then apply the already-fitted
# transformer to the test split.
# NOTE(review): X_train / X_test are overwritten with the transformed output, so
# this cell is presumably not safe to run twice without re-running the split — confirm.
X_train, X_test = ct.fit_transform(X_train), ct.transform(X_test)

In [7]:
# Show the training features as returned by ct.fit_transform.
X_train

array([[ 0.14263852, -0.02970205,  1.        ,  0.        ,  0.        ],
       [-0.40170105, -0.96189107,  1.        ,  0.        ,  1.        ],
       [-0.30340938, -0.45181503,  1.        ,  0.        ,  0.        ],
       ...,
       [ 2.22273054,  3.83746073,  1.        ,  0.        ,  1.        ],
       [-0.03548767, -0.069258  ,  0.        ,  0.        ,  0.        ],
       [ 1.61247255, -0.73003832,  1.        ,  0.        ,  1.        ]],
      shape=(460, 5))

In [8]:
# Show the test features as returned by ct.transform.
X_test

array([[-4.07706951e-01, -6.83667776e-01,  0.00000000e+00,
         0.00000000e+00,  1.00000000e+00],
       [-3.53507371e-01, -1.27221190e-01,  1.00000000e+00,
         0.00000000e+00,  1.00000000e+00],
       [-4.72306992e-01, -8.57557334e-01,  1.00000000e+00,
         0.00000000e+00,  1.00000000e+00],
       [-4.63957327e-01, -2.97020461e-02,  1.00000000e+00,
         0.00000000e+00,  1.00000000e+00],
       [-3.39737748e-01,  1.18904563e-02,  1.00000000e+00,
         0.00000000e+00,  0.00000000e+00],
       [-3.52921429e-01,  2.43743200e-01,  1.00000000e+00,
         0.00000000e+00,  1.00000000e+00],
       [-1.39492270e-01, -1.12948181e-02,  1.00000000e+00,
         0.00000000e+00,  1.00000000e+00],
       [ 3.91663621e-01,  4.40818033e-01,  1.00000000e+00,
         0.00000000e+00,  1.00000000e+00],
       [-3.10587162e-01, -1.08941008e+00,  1.00000000e+00,
         0.00000000e+00,  1.00000000e+00],
       [ 1.05714658e+00,  2.96801294e+00,  1.00000000e+00,
         0.00000000e+00

In [33]:
def create_model(model, xtrain, ytrain, xtest):
    """Train ``model`` and predict on ``xtest``.

    Args:
        model: Object exposing ``fit(X, y)`` and ``predict(X)``; it is fitted
            in place.
        xtrain: Training features passed to ``fit``.
        ytrain: Training targets passed to ``fit``.
        xtest: Features passed to ``predict``.

    Returns:
        Whatever ``model.predict(xtest)`` returns.
    """
    model.fit(xtrain, ytrain)
    return model.predict(xtest)

In [41]:
def accuracy(ytest, prediction, accuracy):
    """Score predictions with a caller-supplied metric function.

    Args:
        ytest: True labels, passed as the metric's first argument.
        prediction: Predicted labels, passed as the metric's second argument.
        accuracy: Callable taking ``(ytest, prediction)``. Inside this body the
            parameter shadows the function's own name.

    Returns:
        The value returned by the supplied callable.
    """
    metric = accuracy
    return metric(ytest, prediction)

In [43]:
def f1score(ytest, prediction, f1):
    """Score predictions with a caller-supplied metric function.

    Args:
        ytest: True labels, passed as the metric's first argument.
        prediction: Predicted labels, passed as the metric's second argument.
        f1: Callable taking ``(ytest, prediction)``.

    Returns:
        The value returned by the supplied callable.
    """
    metric = f1
    return metric(ytest, prediction)

In [44]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score

# One entry per candidate model is appended to each list, so both lists are
# index-aligned with `models`.
metric_accuracy = []
metric_f1 = []
models = [
    LogisticRegression(),
    KNeighborsClassifier(),
    GaussianNB(),
    SVC(),
    # The forest is trained with randomness; seed it so this comparison gives
    # the same scores every time the cell is run on the same split.
    RandomForestClassifier(random_state=42),
]

for model in models:
    # Fit on the training split and predict the held-out split.
    model_prediction = create_model(model=model, xtrain=X_train, ytrain=y_train, xtest=X_test)
    # The metric callables are injected into the thin wrapper helpers.
    model_accuracy = accuracy(ytest=y_test, prediction=model_prediction, accuracy=accuracy_score)
    model_f1 = f1score(ytest=y_test, prediction=model_prediction, f1=f1_score)
    metric_accuracy.append(model_accuracy)
    metric_f1.append(model_f1)

In [50]:
# Print each candidate's accuracy, numbered from 1 in evaluation order.
# The loop variable deliberately does not reuse the name `accuracy`: rebinding
# it at notebook scope would replace the accuracy() helper with a float and
# make the model-comparison cell fail when it is run again.
for i, acc in enumerate(metric_accuracy, start=1):
    print(f'Index:{i} Accuracy: {acc}')

Index:1 Accuracy: 0.8051948051948052
Index:2 Accuracy: 0.7142857142857143
Index:3 Accuracy: 0.8051948051948052
Index:4 Accuracy: 0.7987012987012987
Index:5 Accuracy: 0.7207792207792207


In [51]:
# Print each candidate's F1 score, numbered from 1 in evaluation order.
for i, score in enumerate(metric_f1, start=1):
    print(f'Index:{i} F1_Score: {score}')

Index:1 F1_Score: 0.88
Index:2 F1_Score: 0.8135593220338984
Index:3 F1_Score: 0.88
Index:4 F1_Score: 0.8764940239043825
Index:5 F1_Score: 0.8170212765957446


In [52]:
# Logistic Regression is kept as the final ML model: refit it with a fixed
# seed and report its scores on the test split.
model = LogisticRegression(max_iter=1000, random_state=42).fit(X_train, y_train)
y_pred = model.predict(X_test)

accu, f1 = accuracy_score(y_test, y_pred), f1_score(y_test, y_pred)

print(f'Accuracy: {accu}')
print(f'F1 Score: {f1}')

Accuracy: 0.8051948051948052
F1 Score: 0.88


In [57]:
# The classifier was trained on features already transformed by `ct`, so the
# fitted transformer is persisted too; without it the same scaling cannot be
# re-applied to new inputs before calling predict.
joblib.dump(ct, 'column_transformer.pkl')
# Kept as the last expression so the cell output is still the model artifact path.
joblib.dump(model, 'ml_pipeline.pkl')

['ml_pipeline.pkl']