In [None]:
import pandas as pd
import numpy as np
import random
import os
def seed_everything(seed: int = 42):
    """Seed the global random number generators used by this notebook.

    Seeds Python's ``random`` module and NumPy's global ``np.random`` state
    with ``seed`` and stores ``str(seed)`` in the ``PYTHONHASHSEED``
    environment variable.

    Args:
        seed: Integer seed applied to every generator. Defaults to 42.
    """
    # NOTE(review): PYTHONHASHSEED is normally read when the interpreter
    # starts, so setting it here presumably only influences processes that
    # are launched afterwards - confirm this is the intent.
    os.environ["PYTHONHASHSEED"] = str(seed)
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)


# Seed once, up front, with the default seed.
seed_everything()
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score, recall_score, precision_score
from sklearn.preprocessing import LabelEncoder,OrdinalEncoder,StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier,ExtraTreesClassifier,StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

from sklearn.feature_selection import RFECV

import warnings
warnings.filterwarnings('ignore')

In [5]:
# Load the prepared loan dataset and separate the features from the target.
df = pd.read_csv("data/modified_0420.csv")
features = df.drop(columns=['loan_status', 'Unnamed: 0'])
target = df['loan_status']

# Drop every row with at least one missing feature value, keeping the target
# aligned with the surviving rows.
nan_indices = features.isna().any(axis=1)
features = features[~nan_indices]
target = target[~nan_indices]

# NOTE(review): x_scaled is not consumed by any cell visible in this notebook
# (the models below are trained on the unscaled split). The scaler is also fit
# on all rows before the split, so if x_scaled is ever used for training it
# would leak validation statistics - fit on x_train only in that case.
scale = StandardScaler()
x_scaled = scale.fit_transform(features)

# random_state is pinned so that re-running this cell reproduces the same
# stratified 80/20 split instead of depending on how much of NumPy's global
# random state has already been consumed.
x_train, x_validation, y_train, y_validation = train_test_split(
    features,
    target,
    test_size=0.2,
    stratify=target,
    random_state=42,
)

In [6]:
# Base learners for the stacking ensemble. Every model uses random_state=42.
# NOTE(review): the hyperparameter values look like the output of a tuning
# run - confirm their provenance before changing any of them.

# Random forest (gini split criterion).
rf = RandomForestClassifier(
    n_estimators=305,
    criterion='gini',
    max_depth=62,
    min_samples_split=7,
    min_samples_leaf=1,
    random_state=42,
)

# Single decision tree (entropy split criterion).
dt = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=25,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=42,
)

# Extremely randomized trees (entropy split criterion).
et = ExtraTreesClassifier(
    n_estimators=930,
    criterion='entropy',
    max_depth=65,
    min_samples_split=6,
    min_samples_leaf=1,
    random_state=42,
)

# Gradient-boosted trees using the "exact" tree construction method.
xgb = XGBClassifier(
    n_estimators=665,
    learning_rate=0.4046062291148477,
    max_depth=64,
    min_child_weight=2,
    subsample=0.6579847353498132,
    colsample_bytree=0.7664006730032823,
    reg_alpha=0.8831857977740336,
    reg_lambda=0.04614513317156364,
    tree_method="exact",
    random_state=42,
)

In [7]:
# Stack the four base learners under a logistic-regression meta-learner,
# fit the ensemble on the training split and predict the validation split.
estimators = [
    ('et', et),
    ('xgb', xgb),
    ('dt', dt),
    ('rf', rf),
]
meta_learner = LogisticRegression()
stack = StackingClassifier(
    estimators=estimators,
    final_estimator=meta_learner,
    verbose=1,
)
stack.fit(x_train, y_train)
pred = stack.predict(x_validation)

KeyboardInterrupt: 

In [None]:
# Score the stacked model's predictions on the validation split.
# NOTE(review): precision/recall/F1 are called with their default arguments,
# which assumes loan_status is a binary label - confirm.
conf_matrix = confusion_matrix(y_validation, pred)
accuracy = accuracy_score(y_validation, pred)
precision = precision_score(y_validation, pred)
recall = recall_score(y_validation, pred)
f1score = f1_score(y_validation, pred)

print(f"Accuracy = {accuracy}")
print(f"Precision = {precision}")
print(f"Recall = {recall}")
print(f"F1 Score = {f1score}")
# The confusion matrix was computed but never reported; show it as well
# (rows are true classes, columns are predicted classes).
print(f"Confusion matrix =\n{conf_matrix}")

In [None]:
# Persist the XGBoost base learner that was trained inside the stack.
# StackingClassifier fits clones of the estimators it is given, so the
# module-level `xgb` object is never fitted and calling save_model on it
# fails; the fitted copy is exposed through `stack.named_estimators_`.
# Only this one base learner is saved, not the whole stacked ensemble.
os.makedirs("statistical_model", exist_ok=True)  # make sure the target folder exists
fitted_xgb = stack.named_estimators_['xgb']
fitted_xgb.save_model("statistical_model/xgboost_stacking.model")