In [120]:
import pandas as pd

In [121]:
df = pd.read_csv("Loan_Data.csv")

In [122]:
df.drop(columns="Loan_ID")
df.isna().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

## PROCESSING

In [123]:
from sklearn.model_selection import train_test_split

In [124]:
X = df.drop(columns=["Loan_ID","Loan_Status"], axis=1)
y = df["Loan_Status"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


In [125]:
y_train = y_train.apply(lambda x: 1 if x == "Y" else 0)
y_test = y_test.apply(lambda x: 1 if x == "Y" else 0)


In [126]:
num_cols = []
cat_cols = []

for i in range(len(X.dtypes)):
    if(X.dtypes[i] == "object"):
        cat_cols.append(X.dtypes.index[i])
    else:
        num_cols.append(X.dtypes.index[i])

In [141]:
cat_cols

['Gender',
 'Married',
 'Dependents',
 'Education',
 'Self_Employed',
 'Property_Area']

In [142]:
num_cols

['ApplicantIncome',
 'CoapplicantIncome',
 'LoanAmount',
 'Loan_Amount_Term',
 'Credit_History']

In [2]:
pip freeze | Select-String 'sklearn'

Note: you may need to restart the kernel to use updated packages.


'Select-String' is not recognized as an internal or external command,
operable program or batch file.


## MODELING

In [127]:
from imblearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [128]:
cat_transformer = Pipeline([
    ("c_i", SimpleImputer(strategy="most_frequent")),
    ("e", OneHotEncoder())
])

In [129]:
num_transformer = Pipeline([
    ("n_i", SimpleImputer(strategy="mean"))
])

In [130]:
transformer = [
    ("n_t", num_transformer, num_cols),
    ("c_t", cat_transformer, cat_cols)
]

Logistic Regression

In [131]:
model_lr = Pipeline([
        ("pre", ColumnTransformer(transformers=transformer)),
        ("model", LogisticRegression())
    ])

In [132]:
# print(model_lr)
model_lr.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [133]:
print(classification_report(y_test, model_lr.predict(X_test)))

              precision    recall  f1-score   support

           0       0.86      0.40      0.55        45
           1       0.74      0.96      0.83        78

    accuracy                           0.76       123
   macro avg       0.80      0.68      0.69       123
weighted avg       0.78      0.76      0.73       123



RandomForest Classifier

In [134]:
model_rf = Pipeline([
        ("pre", ColumnTransformer(transformers=transformer)),
        ("model", RandomForestClassifier())
    ])

In [135]:
model_rf.fit(X_train, y_train)

In [136]:
print(classification_report(y_test, model_rf.predict(X_test)))

              precision    recall  f1-score   support

           0       0.84      0.47      0.60        45
           1       0.76      0.95      0.84        78

    accuracy                           0.77       123
   macro avg       0.80      0.71      0.72       123
weighted avg       0.79      0.77      0.75       123



## Model Export

In [137]:
import joblib

In [138]:
joblib.dump(model_lr, "model_lr.joblib")

['model_lr.joblib']

In [139]:
joblib.dump(model_rf, "model_rf.joblib")

['model_rf.joblib']