In [1]:
import os

import mlflow
import sklearn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## About data set
This data set contains information about load applicants
We will solve a binary classification problem to determine if their loan will be approved
Note: The point of this project is to explore MLflow, so I will not be putting too much effort into EDA

In [2]:
loan_df = pd.read_csv("data/loan_train.csv")
loan_df

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,Applicant_Income,Coapplicant_Income,Loan_Amount,Term,Credit_History,Area,Status
0,Male,No,0,Graduate,No,584900,0.0,15000000,360.0,1.0,Urban,Y
1,Male,Yes,1,Graduate,No,458300,150800.0,12800000,360.0,1.0,Rural,N
2,Male,Yes,0,Graduate,Yes,300000,0.0,6600000,360.0,1.0,Urban,Y
3,Male,Yes,0,Not Graduate,No,258300,235800.0,12000000,360.0,1.0,Urban,Y
4,Male,No,0,Graduate,No,600000,0.0,14100000,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...
609,Female,No,0,Graduate,No,290000,0.0,7100000,360.0,1.0,Rural,Y
610,Male,Yes,3+,Graduate,No,410600,0.0,4000000,180.0,1.0,Rural,Y
611,Male,Yes,1,Graduate,No,807200,24000.0,25300000,360.0,1.0,Urban,Y
612,Male,Yes,2,Graduate,No,758300,0.0,18700000,360.0,1.0,Urban,Y


## EDA

In [3]:
loan_df.shape

(614, 12)

In [4]:
loan_df.describe()

Unnamed: 0,Applicant_Income,Coapplicant_Income,Loan_Amount,Term,Credit_History
count,614.0,614.0,614.0,600.0,564.0
mean,540345.9,162124.6,14141040.0,342.0,0.842199
std,610904.2,292624.8,8815682.0,65.12041,0.364878
min,15000.0,0.0,0.0,12.0,0.0
25%,287750.0,0.0,9800000.0,360.0,1.0
50%,381250.0,118850.0,12500000.0,360.0,1.0
75%,579500.0,229725.0,16475000.0,360.0,1.0
max,8100000.0,4166700.0,70000000.0,480.0,1.0


In [5]:
loan_df.isnull().sum()

Gender                13
Married                3
Dependents            15
Education              0
Self_Employed         32
Applicant_Income       0
Coapplicant_Income     0
Loan_Amount            0
Term                  14
Credit_History        50
Area                   0
Status                 0
dtype: int64

### Data Preparation

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

y = loan_df[["Status"]]
X = loan_df.drop(["Status"], axis=1)

# I want to use KNN for the categorical variables. This can not be done directly
# First label encode the categories while keeping NaNs
X = X.apply(lambda series: pd.Series(
    LabelEncoder().fit_transform(series[series.notnull()]),
    index=series[series.notnull()].index
))

X_train, X_test, y_train , y_test = train_test_split(X, y, test_size=.2, random_state=42)

In [7]:
from sklearn.impute import KNNImputer

# We will perform KNN imputation to fill in our data. Why? Quick and easy for out purposes

knn_imputer = KNNImputer()


X_train_tr = knn_imputer.fit_transform(X_train)

X_test_tr = knn_imputer.transform(X_test)





### MLflow

Logging a logistic regression model

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
with mlflow.start_run():
    lr = LogisticRegression()
    lr.fit(X_train_tr, y_train)

    mlflow.sklearn.log_model(lr, "logistic regression model")
    y_pred = lr.predict(X_test_tr)
    accuracy = accuracy_score(y_test, y_pred)
    mlflow.log_metric("model_accuracy", accuracy)

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Now I can view the mlflow UI on my local host by running "mlflow ui" in the terminal