In [2]:
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score, roc_auc_score
from sklearn.svm import SVC
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

import random
import pandas as pd
import numpy as np


In [3]:
data = pd.read_csv('./datasets/부정기사비율_재무비율.csv')
data.rename(columns={'운전자산총자본비율':'운전자본비율'}, inplace=True)
data.fillna(0, inplace=True)
#data.drop(['회사명', '회계년도', '폐지일자'], axis=1, inplace=True)
input = data.iloc[:,2:8]
target = data.iloc[:,8]

np.random.seed(42)

In [19]:
부실기업부정기사비율 = np.mean(data[data.부실기업여부==1].부정기사비율)
정상기업부정기사비율 = np.mean(data[data.부실기업여부==0].부정기사비율)
부정기사비율차이 = 부실기업부정기사비율 - 정상기업부정기사비율

print(f'부실기업의 부정기사 비율 : {부실기업부정기사비율}')
print(f'정상기업의 부정기사 비율 : {정상기업부정기사비율}')
print(f'부정기사 비율의 차이 : {부정기사비율차이}')

부실기업의 부정기사 비율 : 29.711582791883014
정상기업의 부정기사 비율 : 13.883113450994268
부정기사 비율의 차이 : 15.828469340888747


## XGBClassifier
##### 설명 참고 : https://wooono.tistory.com/97

In [4]:
x_train, x_test, y_train, y_test = train_test_split(input, target, random_state=42, test_size=0.2)

rs = RobustScaler()
x_train = rs.fit_transform(x_train)
x_test = rs.fit_transform(x_test)

xgb = XGBClassifier(random_state=42)

xgb_param_grid = {'n_estimators' : [100, 200],
                'learning_rate' : [0.01, 0.05, 0.1],
                'max_depth' : [3, 5, 7],
                'gamma' : [0, 1, 2]}

xgb_grid = GridSearchCV(xgb, param_grid=xgb_param_grid, scoring='roc_auc', verbose=0, n_jobs=1)
xgb_grid.fit(x_train, y_train)

print(f'best roc : {xgb_grid.best_score_}')
print('best param : ', xgb_grid.best_params_)

## 참고 : https://cjh34544.tistory.com/m/4
## http://aispiration.com/model/model-python-xgboost-hyper.html

best roc : 0.9258756395120031
best param :  {'gamma': 2, 'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 100}


## Logistic

In [5]:
x_train, x_test, y_train, y_test = train_test_split(input, target, random_state=42, test_size=0.2)

rs = RobustScaler()
x_train = rs.fit_transform(x_train)
x_test = rs.fit_transform(x_test)

lr = LogisticRegression(random_state=42)

lr_param_grid = {'C' : [0.001, 0.01, 0.1, 1, 10],
                'penalty' : ['l1', 'l2']}

lr_grid = GridSearchCV(lr, param_grid=lr_param_grid, scoring='roc_auc', verbose=0, n_jobs=1)
lr_grid.fit(x_train, y_train)

print(f'best roc : {lr_grid.best_score_}')
print('best param : ', lr_grid.best_params_)

# 참고 : https://wikidocs.net/16594


best roc : 0.9432703659976387
best param :  {'C': 0.1, 'penalty': 'l2'}


25 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
25 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\dbswo\AppData\Local\Programs\Python\Python38\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\dbswo\AppData\Local\Programs\Python\Python38\lib\site-packages\sklearn\linear_model\_logistic.py", line 1091, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\dbswo\AppData\Local\Programs\Python\Python38\lib\site-packages\sklearn\linear_model\_logistic.py", line 61, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs

## RandomForest

In [6]:
x_train, x_test, y_train, y_test = train_test_split(input, target, random_state=42, test_size=0.2)

rs = RobustScaler()
x_train = rs.fit_transform(x_train)
x_test = rs.fit_transform(x_test)

rf = RandomForestClassifier(random_state=42)

rf_param_grid = {'n_estimators' : [100, 200],
                'max_depth' : [3, 5, 7],
                'min_samples_leaf' : [8, 12, 16],
                'min_samples_split' : [8, 16, 20]}

rf_grid = GridSearchCV(rf, param_grid=rf_param_grid, scoring='roc_auc', verbose=0, n_jobs=1)
rf_grid.fit(x_train, y_train)

print(f'best roc : {rf_grid.best_score_}')
print('best param : ', rf_grid.best_params_)

## 참고 : https://techblog-history-younghunjo1.tistory.com/102
## https://jaaamj.tistory.com/35


best roc : 0.9454348681621407
best param :  {'max_depth': 3, 'min_samples_leaf': 16, 'min_samples_split': 8, 'n_estimators': 100}


## SVM

In [7]:
x_train, x_test, y_train, y_test = train_test_split(input, target, random_state=42, test_size=0.2)

rs = RobustScaler()
x_train = rs.fit_transform(x_train)
x_test = rs.fit_transform(x_test)

svc = SVC(random_state=42, probability=True)

svc_param_grid = {'C' : [0.001, 0.01, 0.1, 1, 10],
                'gamma' : [0.001, 0.01, 0.1, 1, 10]}

svc_grid = GridSearchCV(svc, param_grid=svc_param_grid, scoring='roc_auc', verbose=0, n_jobs=1)
svc_grid.fit(x_train, y_train)

print(f'best roc : {svc_grid.best_score_}')
print('best param : ', svc_grid.best_params_)

best roc : 0.9440377804014168
best param :  {'C': 10, 'gamma': 0.01}


### CatBoostClassifier


In [8]:
x_train, x_test, y_train, y_test = train_test_split(input, target, random_state=42, test_size=0.2)

rs = RobustScaler()
x_train = rs.fit_transform(x_train)
x_test = rs.fit_transform(x_test)

cat = CatBoostClassifier(random_state=42, verbose=0)
cat.fit(x_train, y_train)
pred = cat.predict(x_test)

acc = accuracy_score(y_test, pred)
recall = recall_score(y_test, pred)
precision = precision_score(y_test, pred)
roc = roc_auc_score(y_test, pred)

print(roc)

0.8159340659340659


##### https://velog.io/@jus6886/Catboost
##### https://undeadkwandoll.tistory.com/61
##### https://www.kci.go.kr/kciportal/ci/sereArticleSearch/ciSereArtiView.kci?sereArticleSearchBean.artiId=ART002698429
#### CatBoost 설명
##### https://dailyheumsi.tistory.com/136
##### https://techblog-history-younghunjo1.tistory.com/199
##### https://heeya-stupidbutstudying.tistory.com/43?category=950711

#### Boosting Model 비교
##### https://medium.com/@divyagera2402/boosting-algorithms-adaboost-gradient-boosting-xgb-light-gbm-and-catboost-e7d2dbc4e4ca
##### http://dmqm.korea.ac.kr/activity/seminar/323
##### https://hyunlee103.tistory.com/25
##### https://neptune.ai/blog/when-to-choose-catboost-over-xgboost-or-lightgbm


