In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.impute import MissingIndicator
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

from sklearn.model_selection import train_test_split

from catboost import CatBoostClassifier, Pool, cv, CatBoost

from sklearn.inspection import permutation_importance

from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, f1_score

In [5]:
# Load both splits; the first CSV column of each file becomes the row index.
train, test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ("data/train.csv", "data/test.csv")  # test.csv does not contain targets
)

# Gender
# Rows with a missing Gender are removed from both splits (the column itself
# is kept). The remaining labels are integer-encoded with a LabelEncoder that
# is fitted on the training split only and then reused for the test split.
le = LabelEncoder()

train_no_gender = train.copy().dropna(subset=["Gender"])
train_no_gender = train_no_gender.assign(
    Gender=le.fit_transform(train_no_gender["Gender"])
)

test_no_gender = test.copy().dropna(subset=["Gender"])
test_no_gender = test_no_gender.assign(
    Gender=le.transform(test_no_gender["Gender"])
)


# Married
# Discard rows whose Married value is missing, then one-hot encode the column.
# drop_first=True keeps k-1 indicator columns by omitting the first level.
train_no_nan_married = pd.get_dummies(
    train_no_gender.dropna(subset=["Married"]),
    columns=["Married"],
    drop_first=True,
)

test_no_nan_married = pd.get_dummies(
    test_no_gender.dropna(subset=["Married"]),
    columns=["Married"],
    drop_first=True,
)

# Dependents
def _dependents_to_int(frame):
    """Return a new frame with dependents-count strings replaced by integers.

    The string "3+" becomes 3 and the strings "0", "1" and "2" become the
    matching integers. The replacement is applied frame-wide (to every
    column); values that do not match, including missing values, are left
    unchanged.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame to convert. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Converted copy of ``frame``.
    """
    converted = frame.replace("3+", 3)
    for count in range(0, 3):
        converted = converted.replace(f"{count}", count)
    return converted


# Both splits go through the same helper so that the test frame receives
# exactly the same conversion as the training frame.
train_dependent_only_int = _dependents_to_int(train_no_nan_married)
test_dependent_only_int = _dependents_to_int(test_no_nan_married)

# NOTE: despite the "_no_nan" suffix, these are plain copies; no rows are
# dropped and no missing values are filled at this step.
train_dependents_no_nan = train_dependent_only_int.copy()
test_dependents_no_nan = test_dependent_only_int.copy()

# Education
# One-hot encode Education in each split; drop_first=True omits the indicator
# for the first level. get_dummies returns a new frame, so the inputs are
# left untouched.
train_education_dummies = pd.get_dummies(
    data=train_dependents_no_nan,
    columns=["Education"],
    drop_first=True,
)

test_education_dummies = pd.get_dummies(
    data=test_dependents_no_nan,
    columns=["Education"],
    drop_first=True,
)

# Self_Employed
# Replace the "No"/"Yes" labels with 0/1 in both splits. Values that match
# neither label (including missing values) are left as they are.
train_self_employed_encoded = train_education_dummies.assign(
    Self_Employed=lambda frame: frame["Self_Employed"].replace(
        to_replace=["No", "Yes"], value=[0, 1]
    )
)

test_self_employed_encoded = test_education_dummies.assign(
    Self_Employed=lambda frame: frame["Self_Employed"].replace(
        to_replace=["No", "Yes"], value=[0, 1]
    )
)

# Loan_Amount_Term
# Fill missing terms with the median learned from the training split; the
# same fitted imputer is then applied to the test split.
si = SimpleImputer(strategy="median")

train_imputed_loan_amount_term = train_self_employed_encoded.copy()
test_imputed_loan_amount_term = test_self_employed_encoded.copy()

# The imputer works on 2-D input, hence the reshape to a single-column array.
train_imputed_loan_amount_term["Loan_Amount_Term"] = si.fit_transform(
    train_imputed_loan_amount_term["Loan_Amount_Term"].values.reshape(-1, 1)
)
test_imputed_loan_amount_term["Loan_Amount_Term"] = si.transform(
    test_imputed_loan_amount_term["Loan_Amount_Term"].values.reshape(-1, 1)
)
# Credit_History (placeholder: no dedicated preprocessing is applied to this column in this cell)



# Property_Area and Loan_Status
# One-hot encode Property_Area in both splits; the training split also encodes
# Loan_Status. Only the training frame is encoded for Loan_Status because the
# test file carries no target column.
train_property_area_n_target = pd.get_dummies(
    data=train_imputed_loan_amount_term,
    columns=["Property_Area", "Loan_Status"],
    drop_first=True,
)

test_property_area_n_target = pd.get_dummies(
    data=test_imputed_loan_amount_term,
    columns=["Property_Area"],
    drop_first=True,
)

# Loan amount
# An IterativeImputer is fitted on every training column except the last one
# and then applied unchanged to the test frame, so any remaining missing
# values in the feature columns (not only LoanAmount) are filled here.
train_LoanAmount_itterative_imputer = train_property_area_n_target.copy()
test_LoanAmount_itterative_imputer = test_property_area_n_target.copy()

# The last training column is taken as the target.
# presumably this is the Loan_Status indicator created above - TODO confirm
y = train_LoanAmount_itterative_imputer.iloc[:, -1]

imp_mean = IterativeImputer(random_state=0)

# NOTE(review): the imputer returns plain arrays, and the frames rebuilt below
# get a default RangeIndex, while `y` keeps the original row index. Positional
# use is fine, but label-aligned operations between X and y will not line up.
X = pd.DataFrame(
    imp_mean.fit_transform(train_LoanAmount_itterative_imputer.iloc[:, :-1]),
    columns=train_LoanAmount_itterative_imputer.columns[:-1],
)

X_test = pd.DataFrame(
    imp_mean.transform(test_LoanAmount_itterative_imputer),
    columns=test_LoanAmount_itterative_imputer.columns,
)