In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.feature_selection import SelectFromModel,SequentialFeatureSelector,SelectKBest

from sklearn.metrics import f1_score,accuracy_score,recall_score,precision_score,confusion_matrix,roc_auc_score

import warnings
warnings.simplefilter('ignore')

In [2]:
def model_basic(x_train, y_train, x_test, y_test):
    models = [
        LogisticRegression(),
        LinearDiscriminantAnalysis(),
        GaussianNB(),
        RandomForestClassifier(),
        XGBClassifier(),
        
    ]

    rdict = {'model': [], 'accuracy': [], 'precision': [], 'recall': [], 'f1_score': [], 'auc_score': []}

    for clf in models:
        clf = clf.fit(x_train, y_train)
        pred = clf.predict(x_test)
        pred_prob_rf = clf.predict_proba(x_test)[:, 1].reshape(-1, 1)
        auc_score = roc_auc_score(y_test, pred_prob_rf)
        results = (
            round(accuracy_score(y_test, pred), 4),
            round(precision_score(y_test, pred), 4),
            round(recall_score(y_test, pred), 4),
            round(f1_score(y_test, pred), 4),
            round(auc_score, 4)
        )

        rdict['model'].append(clf)
        rdict['accuracy'].append(results[0])
        rdict['precision'].append(results[1])
        rdict['recall'].append(results[2])
        rdict['f1_score'].append(results[3])
        rdict['auc_score'].append(results[4])

        confusion = confusion_matrix(y_test, pred)
        print(confusion)

    rdf = pd.DataFrame(data=rdict)
    return rdf

In [3]:
train = pd.read_csv('./datasets/통계검증완료/코스닥_원본_train.csv',index_col=0)
test =  pd.read_csv('./datasets/통계검증완료/코스닥_원본_test.csv',index_col=0)



In [4]:
print(test["target_1"].value_counts())
print(test["target_2"].value_counts())
print(test["target_3"].value_counts())

0    2475
1      74
Name: target_1, dtype: int64
0    2477
1      72
Name: target_2, dtype: int64
0    2477
1      72
Name: target_3, dtype: int64


---
# 산업별 더미변수 - 제조업,건설업,정보통신업,그외
---

In [5]:
# train.loc[~(train['산업군'].str.contains('제조업') | train['산업군'].str.contains('건설업') | train['산업군'].str.contains('정보통신업')), '산업군'] = '그 외'
# dummy_df = pd.get_dummies(train['산업군'])
# train = pd.concat([train,dummy_df],axis=1)

In [6]:
# test.loc[~(test['산업군'].str.contains('제조업') | test['산업군'].str.contains('건설업') | test['산업군'].str.contains('정보통신업')), '산업군'] = '그 외'
# dummy_df = pd.get_dummies(test['산업군'])
# test = pd.concat([test,dummy_df],axis=1)

----

In [7]:
train["회계년도"] = train["회계년도"].astype("object")
train["거래소코드"] = train["거래소코드"].astype("object")
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4510 entries, 0 to 4509
Data columns (total 33 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   회사명                    4510 non-null   object 
 1   거래소코드                  4510 non-null   object 
 2   회계년도                   4510 non-null   object 
 3   산업군                    4510 non-null   object 
 4   WW지수                   4510 non-null   float64
 5   총자본회전률                 4510 non-null   float64
 6   타인자본회전률                4510 non-null   float64
 7   차입금의존도                 4510 non-null   float64
 8   총자본순이익률                4510 non-null   float64
 9   대주주지분율                 4510 non-null   float64
 10  EBIT/총자산               4510 non-null   float64
 11  총자본정상영업이익률             4510 non-null   float64
 12  순운전자본비율                4510 non-null   float64
 13  EBITDA마진율              4510 non-null   float64
 14  현금흐름 대 자산              4510 non-null   float64
 15  매출액총

In [8]:
col_int = train.select_dtypes("float").columns
col_int

Index(['WW지수', '총자본회전률', '타인자본회전률', '차입금의존도', '총자본순이익률', '대주주지분율', 'EBIT/총자산',
       '총자본정상영업이익률', '순운전자본비율', 'EBITDA마진율', '현금흐름 대 자산', '매출액총이익률', '자본금회전률',
       '현금흐름 대 매출액', 'abs(영업현금흐름-당기순이익)/매출액', '누적수익성비율', '당기전기영업손익',
       '합계_기말인원(명)', '금융비용부담률', 'TMD', '총자본증가율', 'FINDEP', '자기자본증가율', '매출액증가율',
       '정상영업이익증가율', '자기자본순이익률'],
      dtype='object')

---
#target1
---

In [9]:
x_train = train[col_int]
y_train = train[['target_1']]

x_test = test[col_int]
y_test = test[['target_1']]

In [10]:
y_test.value_counts()

target_1
0           2475
1             74
dtype: int64

In [11]:
#  선정된 피쳐를 저장하기 위한 데이터프레임 생성 
df_select = pd.DataFrame()
# 생성된 데이터프레임의 인덱스 지정
df_select.index = x_train.columns

---
# lasso - 임베디드기법
---

In [12]:
#  lasso
selector = SelectFromModel(estimator=LogisticRegression(penalty='l1',solver='liblinear',C=0.01)).fit(x_train, y_train)
df_select["lasso_0.01"] = selector.get_support()

selector = SelectFromModel(estimator=LogisticRegression(penalty='l1',solver='liblinear',C=0.05)).fit(x_train, y_train)
df_select["lasso_0.05"] = selector.get_support()

selector = SelectFromModel(estimator=LogisticRegression(penalty='l1',solver='liblinear',C=0.1)).fit(x_train, y_train)
df_select["lasso_0.1"] = selector.get_support()


---
# stepwise - 래퍼기법
---

In [13]:
# logit foward
estimator = LogisticRegression(penalty='l1',solver='liblinear',C=0.05)
selector = SequentialFeatureSelector(estimator,direction='forward',cv=5,n_jobs=-1)
selector.fit(x_train, y_train)
df_select['logit_fwd'] = selector.get_support().tolist()
# logit backward
selector = SequentialFeatureSelector(estimator,direction='backward',cv=5,n_jobs=-1)
selector.fit(x_train, y_train)
df_select['logit_bwd'] = selector.get_support().tolist()

In [14]:
# lda foward
estimator = LinearDiscriminantAnalysis(solver='svd')
selector = SequentialFeatureSelector(estimator,direction='forward',cv=5,n_jobs=-1)
selector.fit(x_train, y_train)
df_select['lda_fwd'] = selector.get_support().tolist()
# lda backward
selector = SequentialFeatureSelector(estimator,direction='backward',cv=5,n_jobs=-1,)
selector.fit(x_train, y_train)
df_select['lda_bwd'] = selector.get_support().tolist()

---
## 비모수적 방법
---

In [15]:
# # RandomForest_foward
# estimator = RandomForestClassifier()
# selector = SequentialFeatureSelector(estimator,direction='forward',cv=2,n_jobs=-1)
# selector.fit(x_train, y_train)
# df_select['RFC_fwd'] = selector.get_support().tolist()
# # RandomForest _ backward
# selector = SequentialFeatureSelector(estimator,direction='backward',cv=2,n_jobs=-1)
# selector.fit(x_train, y_train)
# df_select['RFC_bwd'] = selector.get_support().tolist()

---
## 최종 select
---

In [16]:
for col in df_select.columns.to_list():
    df_select[col] = df_select[col].apply(lambda x : 1 if x==True else 0)
df_select['total'] = df_select.sum(axis=1)
df_select[df_select['total']>=5].index

Index(['WW지수', '총자본순이익률', '대주주지분율', '매출액총이익률', '누적수익성비율', '당기전기영업손익',
       '금융비용부담률', 'TMD', '총자본증가율', '매출액증가율', '자기자본순이익률'],
      dtype='object')

In [17]:
df_select[df_select['total']>=5]

Unnamed: 0,lasso_0.01,lasso_0.05,lasso_0.1,logit_fwd,logit_bwd,lda_fwd,lda_bwd,total
WW지수,1,1,1,1,1,0,0,5
총자본순이익률,1,1,1,1,1,0,0,5
대주주지분율,1,1,1,0,0,1,1,5
매출액총이익률,1,1,1,0,1,0,1,5
누적수익성비율,0,1,1,1,0,1,1,5
당기전기영업손익,1,1,1,0,1,1,1,6
금융비용부담률,1,1,1,1,1,1,1,7
TMD,1,1,1,1,1,1,1,7
총자본증가율,1,1,1,1,0,1,1,6
매출액증가율,1,1,1,0,1,0,1,5


In [18]:
최종_col_1 =df_select[df_select['total']>=5].index.to_list()


x_train_1 = x_train[최종_col_1]
x_test_1 = x_test[최종_col_1]

In [19]:
model_basic(x_train_1,y_train,x_train_1,y_train)

[[4323   17]
 [  86   84]]
[[4323   17]
 [  82   88]]
[[4220  120]
 [  45  125]]
[[4340    0]
 [   0  170]]
[[4340    0]
 [   0  170]]


Unnamed: 0,model,accuracy,precision,recall,f1_score,auc_score
0,LogisticRegression(),0.9772,0.8317,0.4941,0.6199,0.9379
1,LinearDiscriminantAnalysis(),0.978,0.8381,0.5176,0.64,0.9382
2,GaussianNB(),0.9634,0.5102,0.7353,0.6024,0.9516
3,"(DecisionTreeClassifier(max_features='sqrt', r...",1.0,1.0,1.0,1.0,1.0
4,"XGBClassifier(base_score=None, booster=None, c...",1.0,1.0,1.0,1.0,1.0


In [20]:
model_basic(x_train_1,y_train,x_test_1,y_test)

[[2458   17]
 [  29   45]]
[[2458   17]
 [  26   48]]
[[2381   94]
 [  11   63]]
[[2466    9]
 [  13   61]]
[[2466    9]
 [  18   56]]


Unnamed: 0,model,accuracy,precision,recall,f1_score,auc_score
0,LogisticRegression(),0.982,0.7258,0.6081,0.6618,0.9434
1,LinearDiscriminantAnalysis(),0.9831,0.7385,0.6486,0.6906,0.9622
2,GaussianNB(),0.9588,0.4013,0.8514,0.5455,0.9833
3,"(DecisionTreeClassifier(max_features='sqrt', r...",0.9914,0.8714,0.8243,0.8472,0.9907
4,"XGBClassifier(base_score=None, booster=None, c...",0.9894,0.8615,0.7568,0.8058,0.9853


In [21]:
최종_col_1 =df_select[df_select['total']>=6].index.to_list()

x_train_1 = x_train[최종_col_1]
x_test_1 = x_test[최종_col_1]

In [22]:
model_basic(x_train_1,y_train,x_train_1,y_train)

[[4331    9]
 [  95   75]]
[[4334    6]
 [ 114   56]]
[[4245   95]
 [  60  110]]
[[4340    0]
 [   0  170]]
[[4340    0]
 [   0  170]]


Unnamed: 0,model,accuracy,precision,recall,f1_score,auc_score
0,LogisticRegression(),0.9769,0.8929,0.4412,0.5906,0.9061
1,LinearDiscriminantAnalysis(),0.9734,0.9032,0.3294,0.4828,0.8857
2,GaussianNB(),0.9656,0.5366,0.6471,0.5867,0.9444
3,"(DecisionTreeClassifier(max_features='sqrt', r...",1.0,1.0,1.0,1.0,1.0
4,"XGBClassifier(base_score=None, booster=None, c...",1.0,1.0,1.0,1.0,1.0


In [23]:
model_basic(x_train_1,y_train,x_test_1,y_test)

[[2458   17]
 [  32   42]]
[[2464   11]
 [  36   38]]
[[2386   89]
 [  19   55]]
[[2460   15]
 [  28   46]]
[[2459   16]
 [  24   50]]


Unnamed: 0,model,accuracy,precision,recall,f1_score,auc_score
0,LogisticRegression(),0.9808,0.7119,0.5676,0.6316,0.9477
1,LinearDiscriminantAnalysis(),0.9816,0.7755,0.5135,0.6179,0.9425
2,GaussianNB(),0.9576,0.3819,0.7432,0.5046,0.9622
3,"(DecisionTreeClassifier(max_features='sqrt', r...",0.9831,0.7541,0.6216,0.6815,0.9554
4,"XGBClassifier(base_score=None, booster=None, c...",0.9843,0.7576,0.6757,0.7143,0.9546


-----
# target2

In [24]:
x_train = train[col_int]
y_train = train[['target_2']]

x_test = test[col_int]
y_test = test[['target_2']]

#  선정된 피쳐를 저장하기 위한 데이터프레임 생성 
df_select_2 = pd.DataFrame()
# 생성된 데이터프레임의 인덱스 지정
df_select_2.index = x_train.columns

In [25]:
#  lasso
selector = SelectFromModel(estimator=LogisticRegression(penalty='l1',solver='liblinear',C=0.01)).fit(x_train, y_train)
df_select_2["lasso_0.01"] = selector.get_support()

selector = SelectFromModel(estimator=LogisticRegression(penalty='l1',solver='liblinear',C=0.05)).fit(x_train, y_train)
df_select_2["lasso_0.05"] = selector.get_support()

selector = SelectFromModel(estimator=LogisticRegression(penalty='l1',solver='liblinear',C=0.1)).fit(x_train, y_train)
df_select_2["lasso_0.1"] = selector.get_support()

# logit foward
estimator = LogisticRegression(penalty='l1',solver='liblinear',C=0.05)
selector = SequentialFeatureSelector(estimator,direction='forward',cv=5,n_jobs=-1)
selector.fit(x_train, y_train)
df_select_2['logit_fwd'] = selector.get_support().tolist()
# logit backward
selector = SequentialFeatureSelector(estimator,direction='backward',cv=5,n_jobs=-1)
selector.fit(x_train, y_train)
df_select_2['logit_bwd'] = selector.get_support().tolist()


# lda foward
estimator = LinearDiscriminantAnalysis(solver='svd')
selector = SequentialFeatureSelector(estimator,direction='forward',cv=5,n_jobs=-1)
selector.fit(x_train, y_train)
df_select_2['lda_fwd'] = selector.get_support().tolist()
# lda backward
selector = SequentialFeatureSelector(estimator,direction='backward',cv=5,n_jobs=-1,)
selector.fit(x_train, y_train)
df_select_2['lda_bwd'] = selector.get_support().tolist()

# # RandomForest_foward
# estimator = RandomForestClassifier()
# selector = SequentialFeatureSelector(estimator,direction='forward',cv=2,n_jobs=-1)
# selector.fit(x_train, y_train)
# df_select_2['RFC_fwd'] = selector.get_support().tolist()
# # RandomForest _ backward
# selector = SequentialFeatureSelector(estimator,direction='backward',cv=2,n_jobs=-1)
# selector.fit(x_train, y_train)
# df_select_2['RFC_bwd'] = selector.get_support().tolist()

In [26]:
for col in df_select_2.columns.to_list():
    df_select_2[col] = df_select_2[col].apply(lambda x : 1 if x==True else 0)
df_select_2['total'] = df_select_2.sum(axis=1)
df_select_2[df_select_2['total']>=5].index


Index(['WW지수', '대주주지분율', '순운전자본비율', 'EBITDA마진율', '매출액총이익률', '자본금회전률',
       '누적수익성비율', '당기전기영업손익', '금융비용부담률', 'TMD', '총자본증가율', '매출액증가율',
       '자기자본순이익률'],
      dtype='object')

In [27]:
df_select_2[df_select_2['total']>=5]

Unnamed: 0,lasso_0.01,lasso_0.05,lasso_0.1,logit_fwd,logit_bwd,lda_fwd,lda_bwd,total
WW지수,1,1,1,0,1,0,1,5
대주주지분율,1,1,1,0,0,1,1,5
순운전자본비율,1,1,1,0,1,0,1,5
EBITDA마진율,1,1,1,0,1,0,1,5
매출액총이익률,1,1,1,1,1,1,0,6
자본금회전률,1,1,1,1,0,1,0,5
누적수익성비율,0,1,1,1,1,1,0,5
당기전기영업손익,1,1,1,0,1,1,0,5
금융비용부담률,1,1,1,0,1,1,0,5
TMD,1,1,1,1,1,1,1,7


In [28]:
최종_col =df_select_2[df_select_2['total']>=5].index.to_list()

x_train_2 = x_train[최종_col]
x_test_2 = x_test[최종_col]

In [29]:
model_basic(x_train_2,y_train,x_train_2,y_train)

[[4286   22]
 [ 119   83]]
[[4281   27]
 [ 117   85]]
[[4204  104]
 [  65  137]]
[[4308    0]
 [   0  202]]
[[4308    0]
 [   0  202]]


Unnamed: 0,model,accuracy,precision,recall,f1_score,auc_score
0,LogisticRegression(),0.9687,0.7905,0.4109,0.5407,0.9249
1,LinearDiscriminantAnalysis(),0.9681,0.7589,0.4208,0.5414,0.9215
2,GaussianNB(),0.9625,0.5685,0.6782,0.6185,0.9231
3,"(DecisionTreeClassifier(max_features='sqrt', r...",1.0,1.0,1.0,1.0,1.0
4,"XGBClassifier(base_score=None, booster=None, c...",1.0,1.0,1.0,1.0,1.0


In [30]:
model_basic(x_train_2,y_train,x_test_2,y_test)

[[2463   14]
 [  30   42]]
[[2451   26]
 [  15   57]]
[[2378   99]
 [  10   62]]
[[2466   11]
 [  12   60]]
[[2458   19]
 [  21   51]]


Unnamed: 0,model,accuracy,precision,recall,f1_score,auc_score
0,LogisticRegression(),0.9827,0.75,0.5833,0.6562,0.9564
1,LinearDiscriminantAnalysis(),0.9839,0.6867,0.7917,0.7355,0.9759
2,GaussianNB(),0.9572,0.3851,0.8611,0.5322,0.9824
3,"(DecisionTreeClassifier(max_features='sqrt', r...",0.991,0.8451,0.8333,0.8392,0.9959
4,"XGBClassifier(base_score=None, booster=None, c...",0.9843,0.7286,0.7083,0.7183,0.9872


In [31]:
최종_col =df_select_2[df_select_2['total']>=6].index.to_list()

x_train_2 = x_train[최종_col]
x_test_2 = x_test[최종_col]

In [32]:
model_basic(x_train_2,y_train,x_train_2,y_train)

[[4291   17]
 [ 122   80]]
[[4304    4]
 [ 158   44]]
[[4194  114]
 [  91  111]]
[[4308    0]
 [   1  201]]
[[4308    0]
 [   0  202]]


Unnamed: 0,model,accuracy,precision,recall,f1_score,auc_score
0,LogisticRegression(),0.9692,0.8247,0.396,0.5351,0.8911
1,LinearDiscriminantAnalysis(),0.9641,0.9167,0.2178,0.352,0.7941
2,GaussianNB(),0.9545,0.4933,0.5495,0.5199,0.8813
3,"(DecisionTreeClassifier(max_features='sqrt', r...",0.9998,1.0,0.995,0.9975,1.0
4,"XGBClassifier(base_score=None, booster=None, c...",1.0,1.0,1.0,1.0,1.0


In [33]:
model_basic(x_train_2,y_train,x_test_2,y_test)

[[2462   15]
 [  44   28]]
[[2477    0]
 [  58   14]]
[[2404   73]
 [  23   49]]
[[2461   16]
 [  25   47]]
[[2458   19]
 [  25   47]]


Unnamed: 0,model,accuracy,precision,recall,f1_score,auc_score
0,LogisticRegression(),0.9769,0.6512,0.3889,0.487,0.8598
1,LinearDiscriminantAnalysis(),0.9772,1.0,0.1944,0.3256,0.8509
2,GaussianNB(),0.9623,0.4016,0.6806,0.5052,0.9562
3,"(DecisionTreeClassifier(max_features='sqrt', r...",0.9839,0.746,0.6528,0.6963,0.9528
4,"XGBClassifier(base_score=None, booster=None, c...",0.9827,0.7121,0.6528,0.6812,0.9294


----
# target3

In [34]:
x_train = train[col_int]
y_train = train[['target_3']]

x_test = test[col_int]
y_test = test[['target_3']]

#  선정된 피쳐를 저장하기 위한 데이터프레임 생성 
df_select_3 = pd.DataFrame()
# 생성된 데이터프레임의 인덱스 지정
df_select_3.index = x_train.columns

In [35]:
#  lasso
selector = SelectFromModel(estimator=LogisticRegression(penalty='l1',solver='liblinear',C=0.01)).fit(x_train, y_train)
df_select_3["lasso_0.01"] = selector.get_support()

selector = SelectFromModel(estimator=LogisticRegression(penalty='l1',solver='liblinear',C=0.05)).fit(x_train, y_train)
df_select_3["lasso_0.05"] = selector.get_support()

selector = SelectFromModel(estimator=LogisticRegression(penalty='l1',solver='liblinear',C=0.1)).fit(x_train, y_train)
df_select_3["lasso_0.1"] = selector.get_support()

# logit foward
estimator = LogisticRegression(penalty='l1',solver='liblinear',C=0.05)
selector = SequentialFeatureSelector(estimator,direction='forward',cv=5,n_jobs=-1)
selector.fit(x_train, y_train)
df_select_3['logit_fwd'] = selector.get_support().tolist()
# logit backward
selector = SequentialFeatureSelector(estimator,direction='backward',cv=5,n_jobs=-1)
selector.fit(x_train, y_train)
df_select_3['logit_bwd'] = selector.get_support().tolist()


# lda foward
estimator = LinearDiscriminantAnalysis(solver='svd')
selector = SequentialFeatureSelector(estimator,direction='forward',cv=5,n_jobs=-1)
selector.fit(x_train, y_train)
df_select_3['lda_fwd'] = selector.get_support().tolist()
# lda backward
selector = SequentialFeatureSelector(estimator,direction='backward',cv=5,n_jobs=-1,)
selector.fit(x_train, y_train)
df_select_3['lda_bwd'] = selector.get_support().tolist()

# # RandomForest_foward
# estimator = RandomForestClassifier()
# selector = SequentialFeatureSelector(estimator,direction='forward',cv=2,n_jobs=-1)
# selector.fit(x_train, y_train)
# df_select_3['RFC_fwd'] = selector.get_support().tolist()
# # RandomForest _ backward
# selector = SequentialFeatureSelector(estimator,direction='backward',cv=2,n_jobs=-1)
# selector.fit(x_train, y_train)
# df_select_3['RFC_bwd'] = selector.get_support().tolist()

In [36]:
for col in df_select_3.columns.to_list():
    df_select_3[col] = df_select_3[col].apply(lambda x : 1 if x==True else 0)
df_select_3['total'] = df_select_3.sum(axis=1)
df_select_3[df_select_3['total']>=5].index

Index(['WW지수', '순운전자본비율', '자본금회전률', 'abs(영업현금흐름-당기순이익)/매출액', '누적수익성비율',
       '당기전기영업손익', '합계_기말인원(명)', '금융비용부담률', 'TMD', '총자본증가율', '정상영업이익증가율',
       '자기자본순이익률'],
      dtype='object')

In [37]:
df_select_3[df_select_3['total']>=5]

Unnamed: 0,lasso_0.01,lasso_0.05,lasso_0.1,logit_fwd,logit_bwd,lda_fwd,lda_bwd,total
WW지수,1,1,1,1,1,1,0,6
순운전자본비율,1,1,1,0,1,0,1,5
자본금회전률,1,1,1,0,1,1,0,5
abs(영업현금흐름-당기순이익)/매출액,0,0,1,1,1,1,1,5
누적수익성비율,0,1,1,1,1,1,1,6
당기전기영업손익,1,1,1,0,1,0,1,5
합계_기말인원(명),1,1,1,0,1,0,1,5
금융비용부담률,1,1,1,1,0,1,0,5
TMD,1,1,1,1,1,1,1,7
총자본증가율,1,1,1,1,0,0,1,5


In [38]:
최종_col =df_select_3[df_select_3['total']>=5].index.to_list()

x_train_3 = x_train[최종_col]
x_test_3 = x_test[최종_col]

In [39]:
model_basic(x_train_3,y_train,x_train_3,y_train)

[[4336    9]
 [  90   75]]
[[4338    7]
 [  87   78]]
[[4269   76]
 [  48  117]]
[[4345    0]
 [   0  165]]
[[4345    0]
 [   0  165]]


Unnamed: 0,model,accuracy,precision,recall,f1_score,auc_score
0,LogisticRegression(),0.978,0.8929,0.4545,0.6024,0.9319
1,LinearDiscriminantAnalysis(),0.9792,0.9176,0.4727,0.624,0.9373
2,GaussianNB(),0.9725,0.6062,0.7091,0.6536,0.9425
3,"(DecisionTreeClassifier(max_features='sqrt', r...",1.0,1.0,1.0,1.0,1.0
4,"XGBClassifier(base_score=None, booster=None, c...",1.0,1.0,1.0,1.0,1.0


In [40]:
model_basic(x_train_3,y_train,x_test_3,y_test)

[[2466   11]
 [  34   38]]
[[2464   13]
 [  22   50]]
[[2395   82]
 [  12   60]]
[[2468    9]
 [  19   53]]
[[2467   10]
 [  21   51]]


Unnamed: 0,model,accuracy,precision,recall,f1_score,auc_score
0,LogisticRegression(),0.9823,0.7755,0.5278,0.6281,0.9507
1,LinearDiscriminantAnalysis(),0.9863,0.7937,0.6944,0.7407,0.9748
2,GaussianNB(),0.9631,0.4225,0.8333,0.5607,0.9757
3,"(DecisionTreeClassifier(max_features='sqrt', r...",0.989,0.8548,0.7361,0.791,0.9764
4,"XGBClassifier(base_score=None, booster=None, c...",0.9878,0.8361,0.7083,0.7669,0.9628


In [41]:
최종_col =df_select_3[df_select_3['total']>=6].index.to_list()

x_train_3 = x_train[최종_col]
x_test_3 = x_test[최종_col]

In [42]:
model_basic(x_train_3,y_train,x_train_3,y_train)

[[4334   11]
 [  79   86]]
[[4330   15]
 [ 100   65]]
[[4250   95]
 [  57  108]]
[[4345    0]
 [   1  164]]
[[4345    0]
 [   0  165]]


Unnamed: 0,model,accuracy,precision,recall,f1_score,auc_score
0,LogisticRegression(),0.98,0.8866,0.5212,0.6565,0.9417
1,LinearDiscriminantAnalysis(),0.9745,0.8125,0.3939,0.5306,0.9306
2,GaussianNB(),0.9663,0.532,0.6545,0.587,0.9422
3,"(DecisionTreeClassifier(max_features='sqrt', r...",0.9998,1.0,0.9939,0.997,1.0
4,"XGBClassifier(base_score=None, booster=None, c...",1.0,1.0,1.0,1.0,1.0


In [43]:
model_basic(x_train_3,y_train,x_test_3,y_test)

[[2471    6]
 [  30   42]]
[[2470    7]
 [  29   43]]
[[2428   49]
 [  17   55]]
[[2467   10]
 [  24   48]]
[[2466   11]
 [  28   44]]


Unnamed: 0,model,accuracy,precision,recall,f1_score,auc_score
0,LogisticRegression(),0.9859,0.875,0.5833,0.7,0.9677
1,LinearDiscriminantAnalysis(),0.9859,0.86,0.5972,0.7049,0.967
2,GaussianNB(),0.9741,0.5288,0.7639,0.625,0.9657
3,"(DecisionTreeClassifier(max_features='sqrt', r...",0.9867,0.8276,0.6667,0.7385,0.9622
4,"XGBClassifier(base_score=None, booster=None, c...",0.9847,0.8,0.6111,0.6929,0.968


In [44]:
print("최종_col_1 : ",x_train_1.columns)
print("최종_col_2 : ",x_train_2.columns)
print("최종_col_3 : ",x_train_3.columns)


최종_col_1 :  Index(['당기전기영업손익', '금융비용부담률', 'TMD', '총자본증가율', '자기자본순이익률'], dtype='object')
최종_col_2 :  Index(['매출액총이익률', 'TMD', '총자본증가율', '매출액증가율', '자기자본순이익률'], dtype='object')
최종_col_3 :  Index(['WW지수', '누적수익성비율', 'TMD', '자기자본순이익률'], dtype='object')
