In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.impute import MissingIndicator
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

from sklearn.model_selection import train_test_split

from catboost import CatBoostClassifier, Pool, cv, CatBoost

from sklearn.inspection import permutation_importance

from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, f1_score

In [5]:
# Read both splits in one pass; the first CSV column becomes the index.
# The test split does not contain targets.
train, test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ("data/train.csv", "data/test.csv")
)

# Gender
# Rows with a missing Gender are dropped (the alternative of dropping the
# whole column was tried and abandoned), then the labels are integer-encoded.
train_no_gender = train.dropna(subset=["Gender"]).copy()
test_no_gender = test.dropna(subset=["Gender"]).copy()

# The encoder is fitted on the training labels only; the same mapping is
# then applied to the test labels.
le = LabelEncoder()
train_no_gender["Gender"] = le.fit_transform(train_no_gender["Gender"])
test_no_gender["Gender"] = le.transform(test_no_gender["Gender"])


# Married
# Discard rows with an unknown marital status, then one-hot encode the
# column; drop_first=True leaves a single indicator column.
train_no_nan_married = pd.get_dummies(
    train_no_gender.dropna(subset=["Married"]),
    columns=["Married"],
    drop_first=True,
)

test_no_nan_married = pd.get_dummies(
    test_no_gender.dropna(subset=["Married"]),
    columns=["Married"],
    drop_first=True,
)

# Dependents
def _dependents_to_numeric(frame):
    """Return a copy of ``frame`` whose ``Dependents`` column is numeric.

    The column is expected to hold the strings "0", "1", "2" and "3+".
    "3+" is capped at 3, the other digit strings are parsed as numbers and
    missing values stay NaN. Only the ``Dependents`` column is touched, so
    no other column can be altered by accident.

    Raises:
        ValueError: if the column contains a string that is not a number
            (after the "3+" substitution).
    """
    converted = frame.copy()
    converted["Dependents"] = pd.to_numeric(
        converted["Dependents"].replace("3+", "3")
    )
    return converted


# The same helper is applied to both splits. The previous copy-pasted loop
# wrote the converted *test* frame into the *train* variable, so the test
# split was never converted and the train variable was overwritten.
train_dependent_only_int = _dependents_to_numeric(train_no_nan_married)
train_dependents_no_nan = train_dependent_only_int.copy()

test_dependent_only_int = _dependents_to_numeric(test_no_nan_married)
test_dependents_no_nan = test_dependent_only_int.copy()

# Education
# One-hot encode Education on both splits, keeping a single indicator
# column (drop_first=True). get_dummies returns a new frame, so the inputs
# are left untouched.
train_education_dummies = pd.get_dummies(
    train_dependents_no_nan,
    columns=["Education"],
    drop_first=True,
)
test_education_dummies = pd.get_dummies(
    test_dependents_no_nan,
    columns=["Education"],
    drop_first=True,
)

# Self_Employed
# Encode "No" -> 0 and "Yes" -> 1; any other value (including NaN) is left
# as it is. assign() returns a new frame and keeps the column position.
train_self_employed_encoded = train_education_dummies.assign(
    Self_Employed=lambda frame: frame["Self_Employed"].replace({"No": 0, "Yes": 1})
)

test_self_employed_encoded = test_education_dummies.assign(
    Self_Employed=lambda frame: frame["Self_Employed"].replace({"No": 0, "Yes": 1})
)

# Loan_Amount_Term
# Fill missing loan terms with the median learned from the training split;
# the test split reuses that fitted imputer rather than fitting its own.
si = SimpleImputer(strategy="median")

train_imputed_loan_amount_term = train_self_employed_encoded.copy()
train_term_column = train_imputed_loan_amount_term["Loan_Amount_Term"].values.reshape(-1, 1)
# The imputer works on 2-D input; flatten its (n, 1) output back to a column.
train_imputed_loan_amount_term["Loan_Amount_Term"] = si.fit_transform(train_term_column).ravel()

test_imputed_loan_amount_term = test_self_employed_encoded.copy()
test_term_column = test_imputed_loan_amount_term["Loan_Amount_Term"].values.reshape(-1, 1)
test_imputed_loan_amount_term["Loan_Amount_Term"] = si.transform(test_term_column).ravel()
# Credit_History
# No dedicated step is implemented for this column.
# NOTE(review): its missing values appear to be left for the IterativeImputer
# applied further down in this cell; confirm that this is intended.

# Property_Area and Loan_Status
# One-hot encode the property area and, for the training split only, the
# target column; the test split has no Loan_Status to encode.
train_property_area_n_target = pd.get_dummies(
    train_imputed_loan_amount_term,
    columns=["Property_Area", "Loan_Status"],
    drop_first=True,
)

test_property_area_n_target = pd.get_dummies(
    test_imputed_loan_amount_term,
    columns=["Property_Area"],
    drop_first=True,
)

# Loan amount
# Every value still missing in the feature columns is filled by an
# IterativeImputer fitted on the training features and reused for the test
# features.
train_LoanAmount_itterative_imputer = train_property_area_n_target.copy()
test_LoanAmount_itterative_imputer = test_property_area_n_target.copy()

# The last column of the training frame is the target; all others are features.
y = train_LoanAmount_itterative_imputer.iloc[:, -1]

imp_mean = IterativeImputer(random_state=0)

# The imputer returns a plain array, so the frame is rebuilt with the
# original column names. The rebuilt X gets a fresh 0..n-1 index while y
# keeps the index of the source frame, so the two line up by position only.
X = pd.DataFrame(
    imp_mean.fit_transform(train_LoanAmount_itterative_imputer.iloc[:, :-1]),
    columns=train_LoanAmount_itterative_imputer.columns[:-1],
)

X_test = pd.DataFrame(
    imp_mean.transform(test_LoanAmount_itterative_imputer),
    columns=test_LoanAmount_itterative_imputer.columns,
)

## Train our model on the Train data, then make predictions on our Test data

In [7]:
# Model for Streamlit: a small CatBoost classifier.
model = CatBoostClassifier(
    iterations=16,
    learning_rate=0.1,
    depth=6,
    l2_leaf_reg=1,
)
# The features are picked by column position.
# NOTE(review): going by the displayed test frame, these positions presumably
# correspond to Gender, ApplicantIncome, CoapplicantIncome, LoanAmount,
# Loan_Amount_Term, Credit_History and Property_Area_Semiurban; confirm that
# X has the same column order.
model.fit(X.iloc[:, [0, 3, 4, 5, 6, 7, 10]], y)

0:	learn: 0.6595022	total: 143ms	remaining: 2.14s
1:	learn: 0.6300864	total: 144ms	remaining: 1.01s
2:	learn: 0.6068011	total: 146ms	remaining: 633ms
3:	learn: 0.5848395	total: 147ms	remaining: 442ms
4:	learn: 0.5668290	total: 149ms	remaining: 328ms
5:	learn: 0.5496625	total: 150ms	remaining: 251ms
6:	learn: 0.5339164	total: 152ms	remaining: 195ms
7:	learn: 0.5216715	total: 154ms	remaining: 154ms
8:	learn: 0.5106223	total: 156ms	remaining: 121ms
9:	learn: 0.5018110	total: 157ms	remaining: 94.4ms
10:	learn: 0.4938930	total: 159ms	remaining: 72.2ms
11:	learn: 0.4869199	total: 160ms	remaining: 53.4ms
12:	learn: 0.4812908	total: 162ms	remaining: 37.3ms
13:	learn: 0.4757017	total: 163ms	remaining: 23.3ms
14:	learn: 0.4708437	total: 165ms	remaining: 11ms
15:	learn: 0.4651778	total: 166ms	remaining: 0us


<catboost.core.CatBoostClassifier at 0x1b9755d2b20>

In [20]:
# Predict on the same column positions that the model was trained on.
predicted_labels = model.predict(X_test.iloc[:, [0, 3, 4, 5, 6, 7, 10]])
predictions = pd.Series(predicted_labels, dtype=int)

# Attach the predictions as a readable "N"/"Y" column. assign() aligns on
# the index, exactly as a column assignment on a copy would.
test_data = X_test.assign(Loan_Status=predictions.replace({0: "N", 1: "Y"}))

In [21]:
# Display the test features together with the predicted Loan_Status column.
test_data

Unnamed: 0,Gender,Dependents,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Married_Yes,Education_Not Graduate,Property_Area_Semiurban,Property_Area_Urban,Loan_Status
0,1.0,0.0,0.0,5720.0,0.0,110.0,360.0,1.000000,1.0,0.0,0.0,1.0,Y
1,1.0,1.0,0.0,3076.0,1500.0,126.0,360.0,1.000000,1.0,0.0,0.0,1.0,Y
2,1.0,2.0,0.0,5000.0,1800.0,208.0,360.0,1.000000,1.0,0.0,0.0,1.0,Y
3,1.0,2.0,0.0,2340.0,2546.0,100.0,360.0,0.848999,1.0,0.0,0.0,1.0,Y
4,1.0,0.0,0.0,3276.0,0.0,78.0,360.0,1.000000,0.0,1.0,0.0,1.0,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...
351,1.0,3.0,1.0,4009.0,1777.0,113.0,360.0,1.000000,1.0,1.0,0.0,1.0,Y
352,1.0,0.0,0.0,4158.0,709.0,115.0,360.0,1.000000,1.0,0.0,0.0,1.0,Y
353,1.0,0.0,0.0,3250.0,1993.0,126.0,360.0,0.847238,0.0,0.0,1.0,0.0,Y
354,1.0,0.0,0.0,5000.0,2393.0,158.0,360.0,1.000000,1.0,0.0,0.0,0.0,Y


## Summary and Visualization

In [22]:
# Count how many test rows were predicted as each Loan_Status value.
test_data["Loan_Status"].value_counts()

Y    295
N     61
Name: Loan_Status, dtype: int64