In [1]:
import pandas as pd
import numpy as np
import warnings
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold

# Silence every Python warning for the rest of the session: no category or
# module filter is passed, so library deprecation notices are hidden as well.
warnings.filterwarnings(action='ignore')

In [2]:
# Read the dataset used throughout this notebook; the path is resolved
# relative to the current working directory.
df = pd.read_csv(filepath_or_buffer="data/modified_0420.csv")

In [4]:
# Display the column index to check which features and which target column
# were loaded from the CSV.
df.columns

Index(['Unnamed: 0', 'loan_amnt', 'term', 'int_rate', 'installment',
       'sub_grade', 'emp_length', 'loan_status', 'delinq_2yrs',
       'inq_last_6mths', 'pub_rec', 'revol_util', 'collections_12_mths_ex_med',
       'application_type', 'dti_joint', 'acc_now_delinq',
       'chargeoff_within_12_mths', 'mths_since_recent_inq', 'num_tl_120dpd_2m',
       'num_tl_30dpd', 'num_tl_90g_dpd_24m', 'num_tl_op_past_12m',
       'pct_tl_nvr_dlq', 'is_after_2015', 'is_after_2012', 'MORTGAGE',
       'OTHERS', 'OWN', 'RENT', 'car', 'credit_card', 'debt_consolidation',
       'educational', 'home_improvement', 'house', 'major_purchase', 'medical',
       'moving', 'other', 'renewable_energy', 'small_business', 'vacation',
       'wedding', 'Not Verified', 'Source Verified', 'Verified', 'fico_avg',
       'all_util_log', 'annual_inc_log', 'annual_inc_joint_log',
       'bc_open_to_buy_log', 'delinq_amnt_log', 'dti_log', 'max_bal_bc_log',
       'mo_sin_old_il_acct_log', 'mo_sin_old_rev_tl_op_log',

In [None]:
# Target vector: the 'loan_status' column.
y_train = df.loc[:, 'loan_status']
# Feature matrix: everything except the target and the 'Unnamed: 0' column.
x_train = df.drop(['loan_status', 'Unnamed: 0'], axis=1)

In [8]:
# Flag every row that has at least one missing feature value.
# DataFrame.isna() is used instead of np.isnan() because np.isnan() raises a
# TypeError as soon as one column is not float-typed.
nan_indices = x_train.isna().any(axis=1)

# Drop the flagged rows from features and labels with the same mask so the two
# stay aligned.
x_train = x_train[~nan_indices]
y_train = y_train[~nan_indices]

# Stratified 80/20 train/validation split. random_state is fixed so the split,
# and therefore every score computed from it, is reproducible across runs.
# NOTE: this rebinds x_train / y_train, so re-running this cell on its own
# splits the already-reduced training set again; re-run the cell that builds
# x_train / y_train from df first.
x_train, x_validation, y_train, y_validation = train_test_split(
    x_train,
    y_train,
    test_size=0.2,
    stratify=y_train,
    random_state=0,
)

In [9]:
# Random forest with library-default hyperparameters.
# NOTE(review): `model` is not referenced by any of the following cells shown
# here (they build `rf_clf` / `rf_clf1` instead) — confirm whether it is still needed.
model = RandomForestClassifier()

In [10]:
# Hyperparameter grid: 3 * 4 * 3 * 3 = 108 candidate combinations.
params = dict(
    n_estimators=[10, 30, 50],
    max_depth=[6, 8, 10, 12],
    min_samples_leaf=[8, 12, 18],
    min_samples_split=[8, 16, 20],
)

# Shuffled 5-fold cross-validation with a fixed seed.
kfold = KFold(n_splits=5, shuffle=True, random_state=30)

# Create the base RandomForestClassifier, then run GridSearchCV over the grid.
rf_clf = RandomForestClassifier(n_jobs=-1, random_state=0)
grid_search = GridSearchCV(
    estimator=rf_clf,
    param_grid=params,
    cv=kfold,
    n_jobs=-1,
)
grid_search.fit(x_train, y_train)

# Best hyperparameter combination found by the search.
print(f"Best params: {grid_search.best_params_}")
# Mean cross-validated score obtained with that best combination
# (no `scoring` is passed, so the estimator's default score is used).
print(f"Best average accuracy: {grid_search.best_score_}")

In [None]:
# Fit a random forest with hand-picked hyperparameters and score it on the
# hold-out split.
# NOTE(review): n_estimators=100 lies outside the grid searched earlier
# ([10, 30, 50]) — confirm this value is intentional.
rf_clf1 = RandomForestClassifier(n_estimators = 100, 
                                max_depth = 12,
                                min_samples_leaf = 8,
                                min_samples_split = 8,
                                random_state = 0,
                                n_jobs = -1)
rf_clf1.fit(x_train, y_train)

# Evaluate on x_validation / y_validation, the hold-out set created by
# train_test_split. The previous X_test / y_test names are not defined by any
# earlier cell shown here, so a fresh "Restart & Run All" would raise NameError.
pred = rf_clf1.predict(x_validation)
# The printed label is Korean for "prediction accuracy".
print('예측 정확도: {:.4f}'.format(accuracy_score(y_validation, pred)))

In [43]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Pair each importance score from the fitted forest with its feature name.
ftr_importances_values = rf_clf1.feature_importances_
ftr_importances = pd.Series(data=ftr_importances_values, index=x_train.columns)

# Keep the 30 highest-scoring features, largest first.
ftr_top30 = ftr_importances.sort_values(ascending=False).iloc[:30]

# Horizontal bar chart: importance on the x-axis, feature name on the y-axis.
fig, ax = plt.subplots(figsize=(8, 6))
ax.set_title('top 30 Feature Importances')
sns.barplot(x=ftr_top30, y=ftr_top30.index, ax=ax)
plt.show()

{'mean_fit_time': array([ 45.42095404,  47.89461579, 104.45527601,  97.54565234,
        149.18702364, 140.41665769, 247.88077388, 140.99477043,
         19.06272731,  18.97233076,  43.81525879,  43.86375208,
         60.45980515,  60.96489749, 106.32526121, 101.73821311,
         18.62472057,  18.23737502,  41.62129273,  41.66838169,
         54.65958519,  57.03063121,  92.40877023,  95.03137202]),
 'std_fit_time': array([ 4.18659111,  4.7309525 ,  1.78704312,  0.94590561,  3.2688449 ,
         3.31714687, 47.99313522,  6.83385589,  0.17794671,  0.29221182,
         1.35748998,  2.25478417,  2.81170764,  5.39197645,  5.52072732,
         2.64836496,  0.95918566,  0.34961127,  1.1002516 ,  0.55731421,
         0.5650627 ,  1.46955747,  1.76527115,  1.66301756]),
 'mean_score_time': array([0.28272729, 0.35230808, 0.31787553, 0.30284367, 0.33635268,
        0.29905705, 0.34909291, 0.22709961, 0.19568734, 0.19638619,
        0.18784876, 0.23205476, 0.2659811 , 0.24409332, 0.22987838,
    

아래는 위의 모델을 생성한 결과를 바탕으로 hyperparameter를 설정한 모델입니다.