In [5]:
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, balanced_accuracy_score
from imblearn.metrics import classification_report_imbalanced
from imblearn.combine import SMOTEENN
from imblearn.ensemble import BalancedRandomForestClassifier

In [2]:
# Read the train/test feature and label CSVs in one pass; column 0 of every
# file is used as the row index.
X_train, X_test, y_train, y_test = [
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        'data/X_train.csv',
        'data/X_test.csv',
        'data/y_train.csv',
        'data/y_test.csv',
    )
]

In [3]:
# Narrow each label frame to its 'loan_status' column.
# The isinstance guard makes this cell safe to re-run: once a label object has
# already been narrowed it is no longer a DataFrame, and indexing it with
# 'loan_status' again would raise a KeyError.
y_train, y_test = (
    labels['loan_status'] if isinstance(labels, pd.DataFrame) else labels
    for labels in (y_train, y_test)
)

In [4]:
# Resample the training split only (the test split is not passed in) with
# imblearn's SMOTEENN sampler, seeded so the result is repeatable.
smote_enn = SMOTEENN(random_state=0)
X_resampled, y_resampled = smote_enn.fit_resample(
    X=X_train,
    y=y_train,
)

In [7]:
# Seeded balanced random forest; all other hyperparameters keep the library defaults.
model = BalancedRandomForestClassifier(
    random_state=0,
)

In [8]:
# Train on the resampled training data. Kept as the cell's last expression so
# the fitted estimator's repr is displayed.
model.fit(X=X_resampled, y=y_resampled)

BalancedRandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                               criterion='gini', max_depth=None,
                               max_features='auto', max_leaf_nodes=None,
                               max_samples=None, min_impurity_decrease=0.0,
                               min_samples_leaf=2, min_samples_split=2,
                               min_weight_fraction_leaf=0.0, n_estimators=100,
                               n_jobs=None, oob_score=False, random_state=0,
                               replacement=False, sampling_strategy='auto',
                               verbose=0, warm_start=False)

In [9]:
# Print the fitted forest's raw feature_importances_ array.
importances = model.feature_importances_
print(importances)

[1.11600057e-02 3.08663175e-02 1.02995668e-02 6.75430497e-03
 4.97239946e-03 9.36632416e-03 3.43549771e-02 3.69698206e-03
 7.86804987e-03 3.25539496e-03 3.36665850e-03 1.17549383e-02
 1.01563375e-02 3.75765226e-02 3.75064116e-02 4.62933227e-02
 2.80611295e-02 1.43110929e-02 0.00000000e+00 0.00000000e+00
 2.41930139e-02 8.21325505e-04 0.00000000e+00 0.00000000e+00
 5.25279470e-03 8.13737663e-03 4.14630761e-02 8.49416759e-03
 2.34631960e-02 1.58077787e-02 9.17012428e-03 5.40403967e-03
 3.03351790e-03 1.03466011e-02 1.15745297e-02 3.54245981e-03
 4.43983092e-03 5.19498775e-03 2.38978772e-02 1.54209251e-02
 2.51811609e-02 7.04016459e-03 6.28723790e-03 4.15946374e-03
 3.40621987e-03 4.99168574e-04 0.00000000e+00 3.26688184e-03
 3.14639924e-03 1.09730330e-02 1.18960909e-02 6.20810835e-03
 5.52289041e-03 8.34996436e-03 8.75614167e-03 6.46365994e-03
 4.23193267e-03 4.51851971e-03 3.67056449e-03 4.79771647e-03
 3.13477794e-03 3.31600623e-03 4.22707482e-03 3.57842345e-03
 0.00000000e+00 0.000000

In [10]:
# Predict labels for the test features, then display the confusion matrix of
# the true test labels against those predictions.
y_pred = model.predict(X=X_test)
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[   35,    66],
       [    6, 17098]])

In [11]:
# Score the test predictions with sklearn's balanced_accuracy_score.
balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

0.6730919291648529

In [12]:
# Build imblearn's per-class report for the test predictions and print it so
# the table renders as plain text.
report = classification_report_imbalanced(y_test, y_pred)
print(report)

                   pre       rec       spe        f1       geo       iba       sup

          0       0.85      0.35      1.00      0.49      0.59      0.32       101
          1       1.00      1.00      0.35      1.00      0.59      0.37     17104

avg / total       1.00      1.00      0.35      0.99      0.59      0.37     17205

