In [None]:
# import relevant libraries
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
import xgboost as xgb

In [None]:
# load the loan data, separate the target from the features, and hold out a test set
loans = pd.read_csv("../data/loan_data.csv")
y = loans["loan_status"]
X = loans.drop(columns=["loan_status"])

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,     # 20% of the rows are held out for testing
    stratify=y,        # stratify the split on the target
    random_state=666,  # fixed seed so the split is repeatable
)
X_train.head()

Unnamed: 0,person_age,person_gender,person_education,person_income,person_emp_exp,person_home_ownership,loan_amnt,loan_intent,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,previous_loan_defaults_on_file
10591,24.0,male,Master,84007.0,2,MORTGAGE,7200.0,MEDICAL,7.88,0.09,2.0,601,Yes
20512,27.0,male,Bachelor,60018.0,3,RENT,4800.0,DEBTCONSOLIDATION,14.65,0.08,7.0,676,Yes
42547,24.0,female,Bachelor,71962.0,0,RENT,9958.0,MEDICAL,14.71,0.14,4.0,674,No
1526,26.0,male,High School,49040.0,4,RENT,1400.0,VENTURE,11.86,0.03,4.0,611,Yes
33678,26.0,male,Associate,116414.0,2,MORTGAGE,5000.0,VENTURE,11.84,0.04,5.0,661,No


In [49]:
# build preprocessor that one hot encodes categorical features and passes through numerical features
categorical_features = X.select_dtypes(include="object").columns

preprocessor = ColumnTransformer(
    # drop="first" drops one indicator column per categorical feature
    transformers=[("ohe", OneHotEncoder(drop="first"), categorical_features)],
    remainder="passthrough",
    # Always return a dense array. OneHotEncoder produces sparse output and
    # ColumnTransformer keeps the stacked result sparse when its density is
    # below the default threshold (0.3); the StandardScaler applied after this
    # step centers the data and cannot do that on a sparse matrix.
    sparse_threshold=0,
)

In [None]:
# dictionary of models to be compared
# keys are reused as the final pipeline step name, so they must match the
# "<key>__" prefixes used in the hyperparameter grids
# the tree ensembles are seeded (same seed as the train/test split) so that
# scores and selected hyperparameters are repeatable across notebook runs
models = {"lr": LogisticRegression(max_iter=500),
          "rf": RandomForestClassifier(random_state=666),
          "xgb": xgb.XGBClassifier(random_state=666)}

In [None]:
# define model preprocessing and evaluation pipelines
def create_pipeline(name, model):
    """Wrap ``model`` in a three-stage pipeline: preprocess, scale, estimate.

    Parameters
    ----------
    name : str
        Step name given to the estimator; grid-search parameter keys must be
        prefixed with ``"<name>__"`` to reach it.
    model : estimator
        The estimator placed as the final pipeline step.

    Returns
    -------
    Pipeline
        Unfitted pipeline. It holds the notebook-level ``preprocessor`` object
        itself (not a copy), followed by a new ``StandardScaler`` and ``model``.
    """
    stages = [
        ("preprocessor", preprocessor),
        ("scaler", StandardScaler()),
        (name, model),
    ]
    return Pipeline(steps=stages)

def fit_predict_f1_params(model, ensemble=False):
    """Fit ``model`` on the training split and report its F1 on the test split.

    Uses the notebook-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit`` and ``predict``. When ``ensemble`` is true it
        must also expose ``best_estimator_`` and ``best_params_`` once fitted
        (the callers in this notebook pass ``GridSearchCV`` objects).
    ensemble : bool, default False
        If true, predict with ``model.best_estimator_`` and also return
        ``model.best_params_``.

    Returns
    -------
    tuple
        ``(f1, params)`` where ``f1`` is the test-set F1 rounded to two
        decimals and ``params`` is ``model.best_params_`` or ``None``.
    """
    model.fit(X_train, y_train)
    predictor = model.best_estimator_ if ensemble else model
    y_pred = predictor.predict(X_test)
    params = model.best_params_ if ensemble else None
    return round(f1_score(y_test, y_pred), 2), params

In [None]:
# hyperparameter grids for the random forest and xgboost pipelines
# keys use the "<pipeline step name>__<parameter>" form
rf_params = {
    "rf__n_estimators": [300, 400, 500],
    "rf__max_depth": [20, 30, 50],
}
xgb_params = {
    "xgb__n_estimators": [300, 400, 500],
    "xgb__max_depth": [20, 30, 50],
}

In [None]:
# fit every candidate model and collect its test-set F1 (and tuned parameters, if any)
model, f1_scores, best_params = [], [], []

for name, clf in models.items():
    pipeline = create_pipeline(name, clf)
    if name not in ("rf", "xgb"):
        # no hyperparameter grid for this model: fit the pipeline directly
        f1, params = fit_predict_f1_params(pipeline)
    elif name == "rf":
        # 10-fold grid search, selecting on F1, using all available cores
        rf_grid_search = GridSearchCV(
            pipeline, param_grid=rf_params, cv=10, scoring="f1", n_jobs=-1
        )
        f1, params = fit_predict_f1_params(rf_grid_search, ensemble=True)
    else:
        # same search settings as the random forest, with the xgboost grid
        xgb_grid_search = GridSearchCV(
            pipeline, param_grid=xgb_params, cv=10, scoring="f1", n_jobs=-1
        )
        f1, params = fit_predict_f1_params(xgb_grid_search, ensemble=True)
    model.append(name)
    f1_scores.append(f1)
    best_params.append(params)

# one row per model, in the order the models were evaluated
scores = pd.DataFrame({"Models": model, "F1": f1_scores, "Best Params": best_params})
scores

Unnamed: 0,Models,F1,Best Params
0,lr,0.77,
1,rf,0.83,"{'rf__max_depth': 30, 'rf__n_estimators': 500}"
2,xgb,0.84,"{'xgb__max_depth': 20, 'xgb__n_estimators': 400}"
