In [1]:
import pandas as pd
import numpy as np

from sklearn.impute import MissingIndicator
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

## Preprocessing - with missing indicators

In [None]:
# Preprocess the training data, adding explicit "Missing_*" indicator columns
# for Dependents, Self_Employed and Credit_History before imputing them.
# Produces X (feature DataFrame) and y (last column after target encoding).
#
# NOTE(review): every imputer in this cell is fit on the full training frame,
# i.e. before the train/test split performed later in the notebook, so held-out
# rows influence the imputed values. Consider fitting the imputers after the
# split (e.g. inside a Pipeline).
train = pd.read_csv("data/train.csv", index_col=0)
test = pd.read_csv("data/test.csv", index_col=0)  # does not contain targets

# Gender: dropped from both frames (drop() already returns a new frame).
train_no_gender = train.drop(columns="Gender")
test_no_gedner = test.drop(columns="Gender")

# Married: drop the rows where it is missing, then one-hot encode it.
train_no_nan_married = train_no_gender.dropna(axis=0, subset=["Married"])
train_no_nan_married = pd.get_dummies(train_no_nan_married, columns=["Married"], drop_first=True)

# Dependents: turn "3+" into 3 and parse the column as numbers. Only this
# column is touched (a frame-wide replace would rewrite matching strings in
# any column), and to_numeric gives a numeric dtype without relying on
# pandas' deprecated silent downcasting inside replace().
train_dependent_only_int = train_no_nan_married.copy()
train_dependent_only_int["Dependents"] = pd.to_numeric(
    train_dependent_only_int["Dependents"].replace("3+", "3")
)

# Flag the missing Dependents values, then fill them with the column median.
train_dependents_no_nan = train_dependent_only_int.copy()
median = np.nanmedian(train_dependent_only_int.Dependents)
train_dependents_no_nan["Missing_Dependents"] = (
    train_dependent_only_int.Dependents.isnull().astype("int64")
)
train_dependents_no_nan.Dependents = train_dependent_only_int.Dependents.fillna(median)

# Education
train_education_dummies = pd.get_dummies(train_dependents_no_nan, columns=["Education"], drop_first=True)

# Self_Employed: 0/1 missing indicator, then encode Yes -> 1 and No/missing -> 0.
# features="all" always yields exactly one indicator column per input column;
# "missing-only" yields zero columns when nothing is missing, which would
# break the column assignment below.
missing_ind = MissingIndicator(features="all")
train_self_employed_encoded = train_education_dummies.copy()
train_self_employed_encoded["Missing_Self_Employed"] = (
    missing_ind.fit_transform(train_self_employed_encoded.Self_Employed.values.reshape(-1, 1))
    .ravel()
    .astype("int64")
)
# A missing value compares unequal to "Yes", so it is encoded as 0 like "No".
train_self_employed_encoded.Self_Employed = (
    train_self_employed_encoded.Self_Employed.eq("Yes").astype("int64")
)

# Loan_Amount_Term: median imputation (ravel() so a 1-D array is assigned).
si = SimpleImputer(strategy="median")

train_imputed_loan_amount_term = train_self_employed_encoded.copy()
train_imputed_loan_amount_term.Loan_Amount_Term = si.fit_transform(
    train_imputed_loan_amount_term.Loan_Amount_Term.values.reshape(-1, 1)
).ravel()

# Credit_History: 0/1 missing indicator followed by median imputation.
train_credit_history_no_nan = train_imputed_loan_amount_term.copy()

missing_ind = MissingIndicator(features="all")
si = SimpleImputer(strategy="median")

train_credit_history_no_nan["Missing_Credit_History"] = (
    missing_ind.fit_transform(train_credit_history_no_nan.Credit_History.values.reshape(-1, 1))
    .ravel()
    .astype("int64")
)
train_credit_history_no_nan.Credit_History = si.fit_transform(
    train_credit_history_no_nan.Credit_History.values.reshape(-1, 1)
).ravel()

# Property_Area and Loan_Status: one-hot encode; the encoded target ends up
# as the last column of the frame.
train_property_area_n_target = pd.get_dummies(
    train_credit_history_no_nan, columns=["Property_Area", "Loan_Status"], drop_first=True
)

# Loan amount: split features/target, then fill the remaining gaps in the
# features with IterativeImputer.
train_LoanAmount_itterative_imputer = train_property_area_n_target.copy()

X = train_LoanAmount_itterative_imputer.iloc[:, :-1]
y = train_LoanAmount_itterative_imputer.iloc[:, -1]

imp_mean = IterativeImputer(random_state=0)
# Keep the original row index so X and y stay aligned by label as well as by position.
X = pd.DataFrame(imp_mean.fit_transform(X), columns=X.columns, index=X.index)

## Preprocessing - containing nans

In [3]:
# Preprocess the training data WITHOUT missing-value indicator columns and
# without per-column imputation of Dependents, Self_Employed or Credit_History.
# Produces X (feature DataFrame) and y (last column after target encoding).
train = pd.read_csv("data/train.csv", index_col=0)
test = pd.read_csv("data/test.csv", index_col=0)  # does not contain targets

# Gender: dropped from both frames (drop() already returns a new frame).
train_no_gender = train.drop(columns="Gender")
test_no_gedner = test.drop(columns="Gender")

# Married: drop the rows where it is missing, then one-hot encode it.
train_no_nan_married = train_no_gender.dropna(axis=0, subset=["Married"])
train_no_nan_married = pd.get_dummies(train_no_nan_married, columns=["Married"], drop_first=True)

# Dependents: turn "3+" into 3 and parse the column as numbers; NaNs are kept.
# Only this column is touched (a frame-wide replace would rewrite matching
# strings in any column), and to_numeric gives a numeric dtype without relying
# on pandas' deprecated silent downcasting inside replace().
train_dependent_only_int = train_no_nan_married.copy()
train_dependent_only_int["Dependents"] = pd.to_numeric(
    train_dependent_only_int["Dependents"].replace("3+", "3")
)

# No indicator column / median fill for Dependents in this variant.
train_dependents_no_nan = train_dependent_only_int.copy()

# Education
train_education_dummies = pd.get_dummies(train_dependents_no_nan, columns=["Education"], drop_first=True)

# Self_Employed: encode No -> 0 and Yes -> 1; missing values stay NaN
# (map() leaves unmapped/missing entries as NaN).
train_self_employed_encoded = train_education_dummies.copy()
train_self_employed_encoded.Self_Employed = train_self_employed_encoded.Self_Employed.map(
    {"No": 0, "Yes": 1}
)

# Loan_Amount_Term: median imputation (ravel() so a 1-D array is assigned).
si = SimpleImputer(strategy="median")

train_imputed_loan_amount_term = train_self_employed_encoded.copy()
train_imputed_loan_amount_term.Loan_Amount_Term = si.fit_transform(
    train_imputed_loan_amount_term.Loan_Amount_Term.values.reshape(-1, 1)
).ravel()

# Credit_History: intentionally left untouched in this variant.

# Property_Area and Loan_Status: one-hot encode; the encoded target ends up
# as the last column of the frame.
train_property_area_n_target = pd.get_dummies(
    train_imputed_loan_amount_term, columns=["Property_Area", "Loan_Status"], drop_first=True
)

# Loan amount: split features/target, then run IterativeImputer on the features.
train_LoanAmount_itterative_imputer = train_property_area_n_target.copy()

X = train_LoanAmount_itterative_imputer.iloc[:, :-1]
y = train_LoanAmount_itterative_imputer.iloc[:, -1]

# NOTE(review): the imputer is applied to all of X, so besides LoanAmount any
# NaNs still present in other columns (Dependents, Self_Employed and
# Credit_History are not imputed above) get filled here too; the resulting X
# therefore no longer contains NaNs, despite the section title. It is also fit
# before the train/test split done later in the notebook. Confirm both are intended.
imp_mean = IterativeImputer(random_state=0)
# Keep the original row index so X and y stay aligned by label as well as by position.
X = pd.DataFrame(imp_mean.fit_transform(X), columns=X.columns, index=X.index)

In [48]:
# Display the preprocessed feature matrix.
# NOTE(review): the saved output below contains Missing_* indicator columns and
# has execution count 48, so it appears to come from a run of the
# "with missing indicators" cell rather than the cell directly above —
# re-run with Restart & Run All to confirm which variant feeds the model.
X

Unnamed: 0,Dependents,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Married_Yes,Missing_Dependents,Education_Not Graduate,Missing_Self_Employed,Missing_Credit_History,Property_Area_Semiurban,Property_Area_Urban
0,0.0,0.0,5849.0,0.0,138.028436,360.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1.0,0.0,4583.0,1508.0,128.000000,360.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,1.0,3000.0,0.0,66.000000,360.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,2583.0,2358.0,120.000000,360.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
4,0.0,0.0,6000.0,0.0,141.000000,360.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,0.0,0.0,2900.0,0.0,71.000000,360.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
607,3.0,0.0,4106.0,0.0,40.000000,180.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
608,1.0,0.0,8072.0,240.0,253.000000,360.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
609,2.0,0.0,7583.0,0.0,187.000000,360.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0


## Split data into train and test sets

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing. The split is stratified on the target
# so both parts keep the same class proportions, and seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=123,
    stratify=y,
)

In [5]:
from xgboost import XGBClassifier

# Base XGBoost classifier with 1000 estimators and a fixed seed; the remaining
# hyper-parameters are left at their defaults.
# NOTE(review): `use_label_encoder` is reported as deprecated in newer xgboost
# releases — confirm it is still accepted by the installed version.
xgb = XGBClassifier(
    n_estimators=1000,
    random_state=100,
    use_label_encoder=False,
)

# Show the full parameter set of the untuned estimator.
xgb.get_params()

{'objective': 'binary:logistic',
 'use_label_encoder': False,
 'base_score': None,
 'booster': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': None,
 'gamma': None,
 'gpu_id': None,
 'importance_type': 'gain',
 'interaction_constraints': None,
 'learning_rate': None,
 'max_delta_step': None,
 'max_depth': None,
 'min_child_weight': None,
 'missing': nan,
 'monotone_constraints': None,
 'n_estimators': 1000,
 'n_jobs': None,
 'num_parallel_tree': None,
 'random_state': 100,
 'reg_alpha': None,
 'reg_lambda': None,
 'scale_pos_weight': None,
 'subsample': None,
 'tree_method': None,
 'validate_parameters': None,
 'verbosity': None}

In [6]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter candidates for the XGBoost classifier:
# 3 learning rates x 8 tree depths (2..9) = 24 combinations.
param_grid = [
    {
        'learning_rate': [0.1, 0.05, 0.001],
        'max_depth': list(range(2, 10)),
    }
]

# Exhaustive search over every combination in param_grid, scored by accuracy
# with 5-fold cross-validation, using all available cores.
gs = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=3,
)

In [7]:
%%time
# Run the grid search on the training split only; the cell magic above reports
# the wall time of the whole search.
gs = gs.fit(X_train, y_train)  # Fit the training data to grid search to find the parameters that give the highest score

Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   10.3s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:  1.1min finished


Wall time: 1min 4s


In [8]:

# Report the best mean cross-validation score and the parameters that achieved it.
best_score_line = f"Best score: {gs.best_score_}"
best_params_line = f"Best params:\n{gs.best_params_}"
print(best_score_line, best_params_line, sep="\n")


Best score: 0.8245417236662107
Best params:
{'learning_rate': 0.001, 'max_depth': 6}


In [9]:
# One line per candidate: mean CV score +/- its standard deviation across folds,
# followed by the parameter combination that produced it.
cv_results = gs.cv_results_
for mean_score, std_score, candidate in zip(
    cv_results['mean_test_score'],
    cv_results['std_test_score'],
    cv_results['params'],
):
    print("%0.4f +/- %0.4f %r" % (mean_score, std_score, candidate))

0.7637 +/- 0.0407 {'learning_rate': 0.1, 'max_depth': 2}
0.7685 +/- 0.0688 {'learning_rate': 0.1, 'max_depth': 3}
0.7777 +/- 0.0452 {'learning_rate': 0.1, 'max_depth': 4}
0.7801 +/- 0.0522 {'learning_rate': 0.1, 'max_depth': 5}
0.7801 +/- 0.0538 {'learning_rate': 0.1, 'max_depth': 6}
0.7825 +/- 0.0552 {'learning_rate': 0.1, 'max_depth': 7}
0.7848 +/- 0.0559 {'learning_rate': 0.1, 'max_depth': 8}
0.7731 +/- 0.0533 {'learning_rate': 0.1, 'max_depth': 9}
0.7871 +/- 0.0416 {'learning_rate': 0.05, 'max_depth': 2}
0.7895 +/- 0.0622 {'learning_rate': 0.05, 'max_depth': 3}
0.7731 +/- 0.0488 {'learning_rate': 0.05, 'max_depth': 4}
0.7801 +/- 0.0544 {'learning_rate': 0.05, 'max_depth': 5}
0.7661 +/- 0.0599 {'learning_rate': 0.05, 'max_depth': 6}
0.7824 +/- 0.0538 {'learning_rate': 0.05, 'max_depth': 7}
0.7871 +/- 0.0587 {'learning_rate': 0.05, 'max_depth': 8}
0.7731 +/- 0.0528 {'learning_rate': 0.05, 'max_depth': 9}
0.8175 +/- 0.0393 {'learning_rate': 0.001, 'max_depth': 2}
0.8175 +/- 0.0393 {'l

In [10]:
# GridSearchCV was created with its default refit=True, so best_estimator_ has
# already been refit on the full training split with the best parameters;
# fitting it again here would only repeat that work.
clf = gs.best_estimator_
print(f"Test accuracy: {clf.score(X_test, y_test):.3f}")  # How the tuned model performed on the held-out test set

Test accuracy: 0.755


