In [1]:
import pandas as pd
import numpy as np
# https://www.kaggle.com/code/carlmcbrideellis/feature-selection-using-the-boruta-shap-package
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from xgboost import XGBClassifier
from sklearn.metrics import f1_score
import random
import pickle

In [2]:
data = pd.read_csv("data_with_new_features.csv")
features = pd.read_csv('feature_importances_xgb.csv', nrows = 20)
features.head()
selected_features = list[features['Unnamed: 0'].values]
selected_features

list[array(['months_employed', 'credit_score', 'age', 'interest_rate',
       'credit_age_factor', 'dtiratio', 'interest_payment_burden',
       'income_to_interest_ratio', 'credit_income_inter',
       'income_to_loan_ratio', 'loan_pay_to_inc_ratio', 'loan_amount',
       'empl_sta', 'credit_util_ratio', 'income', 'has_co_signer=No',
       'employment_type=Full-time', 'loan_term',
       'employment_type=Unemployed', 'has_dependents=No'], dtype=object)]

In [3]:
data_new = data.copy()
y = data_new["default"]
del data_new["default"]

data_new = data_new[selected_features]

In [4]:
df_full_train, df_test, y_full_train, y_test = train_test_split(data_new, y, stratify = y, test_size = 0.2, random_state = 1)

df_train, df_val, y_train, y_val = train_test_split(df_full_train, y_full_train, stratify =y_full_train,  test_size = 0.25, random_state = 1)

In [5]:
df_full_train = df_full_train.reset_index(drop = True)
df_train = df_train.reset_index(drop = True)
df_val = df_val.reset_index(drop = True)
df_test = df_test.reset_index(drop = True)

In [6]:
y_full_train = y_full_train.values
y_train = y_train.values
y_val = y_val.values
y_test = y_test.values

In [7]:
full_train_dicts = df_full_train.to_dict(orient = 'records')
train_dicts = df_train.to_dict(orient = 'records')
val_dicts = df_val.to_dict(orient = 'records')
test_dicts = df_test.to_dict(orient = 'records')

In [8]:
# I was getting a memory error, so I set sparse=True for the DictVectorizer.
dv = DictVectorizer(sparse = True)
dv.fit(train_dicts)
X_full_train_new = dv.transform(full_train_dicts)
X_train_new = dv.transform(train_dicts)
X_val_new = dv.transform(val_dicts)
X_test_new = dv.transform(test_dicts)

In [9]:



from sklearn.utils.class_weight import compute_sample_weight

sample_weights = compute_sample_weight(class_weight='balanced', y=y_full_train)

In [10]:
# fit model no training data
model = XGBClassifier(params = {'subsample': 0.9, 'scale_pos_weight': 1, 'reg_lambda': 1,'reg_alpha': 0.1, 'n_estimators': 200, 'max_depth': 5,
'learning_rate': 0.05, 'gamma': 0.3, 'colsample_bytree': 0.9}, objective="binary:logistic", feature_name=dv.get_feature_names_out(), random_state = 1)
model.fit(X_full_train_new, y_full_train,sample_weight=sample_weights)

Parameters: { "feature_name", "params" } are not used.



In [11]:
y_pred = model.predict_proba(X_test_new)[:,1]

In [12]:
print(y_pred)

[0.75899124 0.67681766 0.54432464 ... 0.37157747 0.14638917 0.1884372 ]


In [13]:
y_pred > 0.5
y_decision = (y_pred >= 0.5).astype(int)
acc = (y_test == y_decision).mean().round(2)
print(f"Accuracy is calculated as \033[1m{acc}\033[0m.")
f_macro = f1_score(y_test, y_decision, average='macro').round(2)
print(f"Macro F1 score is calculated as \033[1m{f_macro}\033[0m.")
f_weighted = f1_score(y_test, y_decision, average='weighted').round(2)
print(f"Weighted F1 score is calculated as \033[1m{f_weighted}\033[0m.")

Accuracy is calculated as [1m0.71[0m.
Macro F1 score is calculated as [1m0.58[0m.
Weighted F1 score is calculated as [1m0.76[0m.


In [14]:
seeds = [5, 13, 29, 43, 57, 87, 29, 42,1, 93]

In [31]:
#I initiannly get same results for each run. I tried the change subsample, colsample_bytree parameters but it did not help.
#To devaluate non deterministic results I st booster to gblinear readinf the xgboost documentation 
#https://readthedocs.org/projects/xgboost/downloads/pdf/latest/

results = []
for i in [5, 13, 29, 43, 57, 87, 29, 42,1, 93]:
    print(i)
    # I have deleted subsample and colsample_bytree parameters since I have evaluated same result at each run
    model = XGBClassifier(params = {'subsample': 0.9, 'scale_pos_weight': 1, 'reg_lambda': 1,'reg_alpha': 0.1, 'n_estimators': 200, 'max_depth': 5,
'learning_rate': 0.05, 'gamma': 0.3, 'colsample_bytree': 0.9}, objective="binary:logitraw", feature_selector= "shuffle", booster = 'gblinear',
                          feature_name=dv.get_feature_names_out(), random_state = i)
    model.fit(X_full_train_new, y_full_train,sample_weight=sample_weights)
    y_pred = model.predict_proba(X_test_new)[:,1]
    y_pred > 0.5
    y_decision = (y_pred >= 0.5).astype(int)
    acc = (y_test == y_decision).mean().round(4)
    print(f"Accuracy is calculated as \033[1m{acc}\033[0m.")
    f_macro = f1_score(y_test, y_decision, average='macro').round(4)
    print(f"Macro F1 score is calculated as \033[1m{f_macro}\033[0m.")
    f_weighted = f1_score(y_test, y_decision, average='weighted').round(4)
    print(f"Weighted F1 score is calculated as \033[1m{f_weighted}\033[0m.")
    results.append((acc,f_macro,f_weighted))
    output_file = f'modelxgb_i={i}.bin'
    with open(output_file, 'wb') as f_out:
        pickle.dump((dv, model), f_out)
    

5


Parameters: { "feature_name", "params" } are not used.



Accuracy is calculated as [1m0.7999[0m.
Macro F1 score is calculated as [1m0.6245[0m.
Weighted F1 score is calculated as [1m0.8215[0m.
13


Parameters: { "feature_name", "params" } are not used.



Accuracy is calculated as [1m0.8004[0m.
Macro F1 score is calculated as [1m0.6244[0m.
Weighted F1 score is calculated as [1m0.8218[0m.
29


Parameters: { "feature_name", "params" } are not used.



Accuracy is calculated as [1m0.7999[0m.
Macro F1 score is calculated as [1m0.6244[0m.
Weighted F1 score is calculated as [1m0.8215[0m.
43


Parameters: { "feature_name", "params" } are not used.



Accuracy is calculated as [1m0.7995[0m.
Macro F1 score is calculated as [1m0.6241[0m.
Weighted F1 score is calculated as [1m0.8213[0m.
57


Parameters: { "feature_name", "params" } are not used.



Accuracy is calculated as [1m0.7995[0m.
Macro F1 score is calculated as [1m0.6245[0m.
Weighted F1 score is calculated as [1m0.8213[0m.
87


Parameters: { "feature_name", "params" } are not used.



Accuracy is calculated as [1m0.8[0m.
Macro F1 score is calculated as [1m0.6245[0m.
Weighted F1 score is calculated as [1m0.8216[0m.
29


Parameters: { "feature_name", "params" } are not used.



Accuracy is calculated as [1m0.7996[0m.
Macro F1 score is calculated as [1m0.6242[0m.
Weighted F1 score is calculated as [1m0.8213[0m.
42


Parameters: { "feature_name", "params" } are not used.



Accuracy is calculated as [1m0.8005[0m.
Macro F1 score is calculated as [1m0.6251[0m.
Weighted F1 score is calculated as [1m0.822[0m.
1


Parameters: { "feature_name", "params" } are not used.



Accuracy is calculated as [1m0.8[0m.
Macro F1 score is calculated as [1m0.6247[0m.
Weighted F1 score is calculated as [1m0.8216[0m.
93


Parameters: { "feature_name", "params" } are not used.



Accuracy is calculated as [1m0.7999[0m.
Macro F1 score is calculated as [1m0.6241[0m.
Weighted F1 score is calculated as [1m0.8215[0m.
