In [1]:
import pandas as pd

from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN, BorderlineSMOTE, SMOTENC
from imblearn.under_sampling import RandomUnderSampler, TomekLinks, NearMiss, ClusterCentroids, EditedNearestNeighbours

import pandas as pd 
import numpy as np


from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.feature_selection import SelectFromModel, SequentialFeatureSelector
from sklearn.naive_bayes import GaussianNB

from sklearn.metrics import f1_score,accuracy_score,recall_score,precision_score,confusion_matrix,roc_auc_score

import warnings
warnings.simplefilter('ignore')

In [2]:
def model_basic(x_train, y_train, x_test, y_test): 
    models = [
        LogisticRegression(),
        LinearDiscriminantAnalysis(),
        GaussianNB(),
        RandomForestClassifier(),
        XGBClassifier()
    ]

    rdict={'model':[],'acc_train':[], 'auc_train':[], 'acc_test':[],'precision':[],'recall':[],'f1_score':[], 'AUC_test':[]}


    for clf in models:
        clf = clf.fit(x_train, y_train)
    #1열:Train
        y_hat = clf.predict(x_train)
        results_train  = (round(accuracy_score(y_train,y_hat),2),round(roc_auc_score(y_train,y_hat),2))
    #2열:Test
        y_hat = clf.predict(x_test)
        results = (round(accuracy_score(y_test,y_hat),2),
                        round(precision_score(y_test,y_hat),2),
                        round(recall_score(y_test,y_hat),2),
                        round(f1_score(y_test,y_hat),2),
                        round(roc_auc_score(y_test,y_hat),2))

        rdict['model'].append(clf); 
        rdict['acc_train'].append(results_train[0])
        rdict['auc_train'].append(results_train[1])
        
        rdict['acc_test'].append(results[0])
        rdict['precision'].append(results[1])
        rdict['recall'].append(results[2])
        rdict['f1_score'].append(results[3])
        rdict['AUC_test'].append(results[4])   

        confusion = confusion_matrix(y_test, y_hat)

        print(confusion)

    rdf_final = pd.DataFrame(data=rdict)
    return rdf_final

In [3]:
# def model_basic(x_train, y_train, x_test, y_test):
#     models = [
#         LogisticRegression(),
#         LinearDiscriminantAnalysis(),
#         GaussianNB(),
#         RandomForestClassifier(),
#         XGBClassifier(),
        
#     ]

#     rdict = {'model': [], 'accuracy': [], 'precision': [], 'recall': [], 'f1_score': [], 'auc_score': []}

#     for clf in models:
#         clf = clf.fit(x_train, y_train)
#         pred = clf.predict(x_test)
#         pred_prob_rf = clf.predict_proba(x_test)[:, 1].reshape(-1, 1)
#         auc_score = roc_auc_score(y_test, pred_prob_rf)
#         results = (
#             round(accuracy_score(y_test, pred), 4),
#             round(precision_score(y_test, pred), 4),
#             round(recall_score(y_test, pred), 4),
#             round(f1_score(y_test, pred), 4),
#             round(auc_score, 4)
#         )

#         rdict['model'].append(clf)
#         rdict['accuracy'].append(results[0])
#         rdict['precision'].append(results[1])
#         rdict['recall'].append(results[2])
#         rdict['f1_score'].append(results[3])
#         rdict['auc_score'].append(results[4])

#         confusion = confusion_matrix(y_test, pred)
#         print(confusion)

#     rdf = pd.DataFrame(data=rdict)
#     return rdf

In [4]:
def over(df, target, method='random', sampling_strategy=1.0):
    
    # 타겟 변수와 특성 변수를 분리
    X = df
    y = target
    
    if method == 'random':
        # RandomOverSampler 객체 생성, sampling_strategy 값 설정
        oversampler = RandomOverSampler(sampling_strategy=sampling_strategy)

    elif method == 'smote':
        # SMOTE 객체 생성, sampling_strategy 값 설정
        oversampler = SMOTE(sampling_strategy=sampling_strategy)

    elif method == 'adasyn':
        # adasyn 객체 생성, sampling_strategy 값 설정
        oversampler = ADASYN(sampling_strategy=sampling_strategy)

    elif method == 'Borderline-SMOTE':
        # Borderline-SMOTE 객체 생성, sampling_strategy 값 설정
        oversampler = BorderlineSMOTE(sampling_strategy=sampling_strategy)

    elif method == 'SMOTENC':
        # SMOTENC 객체 생성, sampling_strategy 값 설정
        oversampler = SMOTENC(sampling_strategy=sampling_strategy)

    
    # 오버샘플링 수행
    X_resampled, y_resampled = oversampler.fit_resample(X, y)

    # 오버샘플링된 데이터 프레임 생성
    df_over = pd.concat([X_resampled, y_resampled], axis=1)

    return df_over


In [5]:
def under(df, target, method='random', sampling_strategy=1.0):
    
    # 타겟 변수와 특성 변수를 분리
    X = df
    y = target

    if method == 'random':
        # RandomUnderSampler 객체 생성, sampling_strategy 값 설정
        undersampler = RandomUnderSampler(sampling_strategy=sampling_strategy)

    elif method == 'tomek':
        # TomekLinks 객체 생성, sampling_strategy 값 설정
        undersampler = TomekLinks(sampling_strategy=sampling_strategy)

    elif method == 'NearMiss':
        # NearMiss 객체 생성, sampling_strategy 값 설정
        undersampler = NearMiss(sampling_strategy=sampling_strategy)

    elif method == 'cluster_centroids':
        # ClusterCentroids 객체 생성, sampling_strategy 값 설정
        undersampler = ClusterCentroids(sampling_strategy=sampling_strategy)

    elif method == 'edited_nn':
        # EditedNearestNeighbours 객체 생성, sampling_strategy 값 설정
        undersampler = EditedNearestNeighbours(sampling_strategy=sampling_strategy)

    # 언더샘플링 수행
    X_resampled, y_resampled = undersampler.fit_resample(X, y)

    # 언더샘플링된 데이터 프레임 생성
    df_under = pd.concat([X_resampled, y_resampled], axis=1)

    return df_under

In [6]:
def evaluate_resampling_model(x_train, y_train, x_test, y_test, method, num_iterations=10, over_ratio = 0.5 ,under_ratio=0.5,random_state=None):
    

    # 모델 성능 저장을 위한 리스트
    score = {"accuracy_scores_mean" : [] ,"f1_scores_mean" :[], "recall_scores_mean" : [] , "precision_scores_mean" : [],  "auc_mean" : []}
    std_score = {"accuracy_scores_std" : [] ,"f1_scores_std" :[], "recall_scores_std" : [] , "precision_scores_std" : [],  "auc_std" : []}  

    recall_scores=[]
    accuracy_scores=[]
    f1_scores=[]
    precision_scores=[]
    auc_scores=[]
    
    
    # 랜덤샘플러 생성
    sampler_over = BorderlineSMOTE(sampling_strategy=over_ratio, random_state=random_state)
    sampler_under = RandomUnderSampler(sampling_strategy=under_ratio, random_state=random_state)

    

    # 랜덤 샘플링 반복 수행
    for i in range(num_iterations):
        
        x_train_over, y_train_over = sampler_over.fit_resample(x_train, y_train)
        x_test_under, y_test_under = sampler_under.fit_resample(x_test, y_test)

        if method == "Logistic":
            model = LogisticRegression().fit(x_train_over, y_train_over)
            y_pred = model.predict(x_test_under)
            y_pred_proba = model.predict_proba(x_test_under)[:,1].reshape(-1,1)

        if method == "LDA":
            model = LinearDiscriminantAnalysis().fit(x_train_over, y_train_over)
            y_pred = model.predict(x_test_under)
            y_pred_proba = model.predict_proba(x_test_under)[:,1].reshape(-1,1)
        
        if method == "Gaussian":
            model = GaussianNB().fit(x_train_over, y_train_over)
            y_pred = model.predict(x_test_under)
            y_pred_proba = model.predict_proba(x_test_under)[:,1].reshape(-1,1)
        
        #모델 성능 평가 
        accuracy = accuracy_score(y_test_under, y_pred)
        f1 = f1_score(y_test_under, y_pred)
        recall = recall_score(y_test_under, y_pred)
        precision = precision_score(y_test_under, y_pred)
        auc = roc_auc_score(y_test_under, y_pred_proba)
        
        # 리스트에 결과값 담아주기
        accuracy_scores.append(accuracy)
        f1_scores.append(f1)
        recall_scores.append(recall)
        precision_scores.append(precision)
        auc_scores.append(auc)

    

    # 모델 성능 평균 및 표준편차 계산
    accuracy_mean = np.mean(accuracy_scores)
    f1_mean = np.mean(f1_scores)
    recall_mean = np.mean(recall_scores)
    precision_mean = np.mean(precision_scores)
    auc_scores_mean = np.mean(auc_scores)


    score["accuracy_scores_mean"].append(accuracy_mean)
    score["f1_scores_mean"].append(f1_mean)
    score["recall_scores_mean"].append(recall_mean)
    score["precision_scores_mean"].append(precision_mean)
    score["auc_mean"].append(auc_scores_mean)

    accuracy_std = np.std(accuracy_scores)
    f1_std = np.std(f1_scores)
    recall_std = np.std(recall_scores)
    precision_std = np.std(precision_scores)
    auc_scores_std = np.std(auc_scores)

    std_score["accuracy_scores_std"].append(accuracy_std)
    std_score["f1_scores_std"].append(f1_std)
    std_score["recall_scores_std"].append(recall_std)
    std_score["precision_scores_std"].append(precision_std)
    std_score["auc_std"].append(auc_scores_std)
    
    print(score)
    print(std_score)

---
# target1

In [7]:
train = pd.read_csv("./datasets/통계검증완료/코스피_standard_train.csv", index_col=0)
test = pd.read_csv("./datasets/통계검증완료/코스피_standard_test.csv", index_col=0)


최종_col_1=['EBITDA마진율', '누적수익성비율', 'TMD', '자기자본순이익률', '금융비용부담률', '정상영업이익증가율']


x_train = train[최종_col_1]
y_train = train["target_1"]

x_test = test[최종_col_1]
y_test = test["target_1"]




In [8]:
y_train.value_counts()

0.0    3428
1.0      40
Name: target_1, dtype: int64

In [9]:
y_test.value_counts()

0.0    1648
1.0       6
Name: target_1, dtype: int64

In [10]:
## 불균형 처리 안한 적합
model_basic(x_train, y_train, x_test, y_test)

[[1645    3]
 [   4    2]]
[[1648    0]
 [   4    2]]
[[1603   45]
 [   1    5]]
[[1648    0]
 [   4    2]]
[[1646    2]
 [   4    2]]


Unnamed: 0,model,acc_train,auc_train,acc_test,precision,recall,f1_score,AUC_test
0,LogisticRegression(),1.0,0.86,1.0,0.4,0.33,0.36,0.67
1,LinearDiscriminantAnalysis(),1.0,0.85,1.0,1.0,0.33,0.5,0.67
2,GaussianNB(),0.97,0.91,0.97,0.1,0.83,0.18,0.9
3,"(DecisionTreeClassifier(max_features='sqrt', r...",1.0,1.0,1.0,1.0,0.33,0.5,0.67
4,"XGBClassifier(base_score=None, booster=None, c...",1.0,1.0,1.0,0.5,0.33,0.4,0.67


In [11]:
## train 데이터만 1:2 오버샘플링후 적합

train_1 =over(train[최종_col_1],train["target_1"], method='Borderline-SMOTE', sampling_strategy=0.5)

model_basic(train_1[최종_col_1], train_1["target_1"], x_test, y_test)

[[1598   50]
 [   0    6]]
[[1632   16]
 [   1    5]]
[[1562   86]
 [   1    5]]
[[1625   23]
 [   1    5]]
[[1631   17]
 [   2    4]]


Unnamed: 0,model,acc_train,auc_train,acc_test,precision,recall,f1_score,AUC_test
0,LogisticRegression(),0.96,0.96,0.97,0.11,1.0,0.19,0.98
1,LinearDiscriminantAnalysis(),0.94,0.91,0.99,0.24,0.83,0.37,0.91
2,GaussianNB(),0.93,0.92,0.95,0.05,0.83,0.1,0.89
3,"(DecisionTreeClassifier(max_features='sqrt', r...",1.0,1.0,0.99,0.18,0.83,0.29,0.91
4,"XGBClassifier(base_score=None, booster=None, c...",1.0,1.0,0.99,0.19,0.67,0.3,0.83


In [12]:
## train 데이터 오버 1:0.5, test 데이터 언더 1:0.5 

train_1 =over(train[최종_col_1],train["target_1"], method='Borderline-SMOTE', sampling_strategy=0.5)

test_1 =under(test[최종_col_1],test["target_1"], method='random', sampling_strategy=0.5)

model_basic(train_1[최종_col_1], train_1["target_1"], test_1[최종_col_1], test_1["target_1"])


[[12  0]
 [ 0  6]]
[[12  0]
 [ 1  5]]
[[12  0]
 [ 1  5]]
[[12  0]
 [ 1  5]]
[[12  0]
 [ 2  4]]


Unnamed: 0,model,acc_train,auc_train,acc_test,precision,recall,f1_score,AUC_test
0,LogisticRegression(),0.96,0.95,1.0,1.0,1.0,1.0,1.0
1,LinearDiscriminantAnalysis(),0.95,0.93,0.94,1.0,0.83,0.91,0.92
2,GaussianNB(),0.93,0.92,0.94,1.0,0.83,0.91,0.92
3,"(DecisionTreeClassifier(max_features='sqrt', r...",1.0,1.0,0.94,1.0,0.83,0.91,0.92
4,"XGBClassifier(base_score=None, booster=None, c...",1.0,1.0,0.89,1.0,0.67,0.8,0.83


In [13]:
# 반복샘플링
evaluate_resampling_model(x_train, y_train,x_test,y_test,
                          method='Logistic',over_ratio=0.5,under_ratio=0.5,num_iterations=1000)

{'accuracy_scores_mean': [0.9800555555555557], 'f1_scores_mean': [0.9729494505494506], 'recall_scores_mean': [1.0], 'precision_scores_mean': [0.9505357142857144], 'auc_mean': [0.9959861111111111]}
{'accuracy_scores_std': [0.03211279311242612], 'f1_scores_std': [0.04275451990545955], 'recall_scores_std': [0.0], 'precision_scores_std': [0.07721316516080443], 'auc_std': [0.01173162307348271]}


In [14]:
evaluate_resampling_model(x_train, y_train,x_test,y_test,
                          method='LDA',over_ratio=0.5,under_ratio=0.5,num_iterations=1000)

{'accuracy_scores_mean': [0.938388888888889], 'f1_scores_mean': [0.9009265734265732], 'recall_scores_mean': [0.8333333333333335], 'precision_scores_mean': [0.9822142857142858], 'auc_mean': [0.9946944444444443]}
{'accuracy_scores_std': [0.018685093418267486], 'f1_scores_std': [0.024971524771376465], 'recall_scores_std': [1.1102230246251565e-16], 'precision_scores_std': [0.054037606582546444], 'auc_std': [0.014712234557639857]}


--- 
# targer2

In [15]:
train = pd.read_csv("./datasets/통계검증완료/코스피_standard_train.csv", index_col=0)
test = pd.read_csv("./datasets/통계검증완료/코스피_standard_test.csv", index_col=0)


최종_col_2= ['EBITDA마진율', '누적수익성비율', 'TMD', '자기자본순이익률', '금융비용부담률', '정상영업이익증가율']

x_train = train[최종_col_2]
y_train = train["target_2"]

x_test = test[최종_col_2]
y_test = test["target_2"]



In [16]:
## 불균형 처리 안한 적합
model_basic(x_train, y_train, x_test, y_test)

[[1645    3]
 [   4    2]]
[[1648    0]
 [   4    2]]
[[1603   45]
 [   1    5]]
[[1647    1]
 [   4    2]]
[[1646    2]
 [   4    2]]


Unnamed: 0,model,acc_train,auc_train,acc_test,precision,recall,f1_score,AUC_test
0,LogisticRegression(),1.0,0.86,1.0,0.4,0.33,0.36,0.67
1,LinearDiscriminantAnalysis(),1.0,0.85,1.0,1.0,0.33,0.5,0.67
2,GaussianNB(),0.97,0.91,0.97,0.1,0.83,0.18,0.9
3,"(DecisionTreeClassifier(max_features='sqrt', r...",1.0,1.0,1.0,0.67,0.33,0.44,0.67
4,"XGBClassifier(base_score=None, booster=None, c...",1.0,1.0,1.0,0.5,0.33,0.4,0.67


In [17]:
## train 데이터만 1:2 오버샘플링후 적합

train_1 =over(train[최종_col_2],train["target_2"], method='Borderline-SMOTE', sampling_strategy=0.5)

model_basic(train_1[최종_col_2], train_1["target_2"], x_test, y_test)

[[1603   45]
 [   0    6]]
[[1631   17]
 [   1    5]]
[[1562   86]
 [   1    5]]
[[1627   21]
 [   1    5]]
[[1631   17]
 [   2    4]]


Unnamed: 0,model,acc_train,auc_train,acc_test,precision,recall,f1_score,AUC_test
0,LogisticRegression(),0.96,0.96,0.97,0.12,1.0,0.21,0.99
1,LinearDiscriminantAnalysis(),0.95,0.92,0.99,0.23,0.83,0.36,0.91
2,GaussianNB(),0.93,0.92,0.95,0.05,0.83,0.1,0.89
3,"(DecisionTreeClassifier(max_features='sqrt', r...",1.0,1.0,0.99,0.19,0.83,0.31,0.91
4,"XGBClassifier(base_score=None, booster=None, c...",1.0,1.0,0.99,0.19,0.67,0.3,0.83


In [18]:
## train 데이터 오버 1:0.5, test 데이터 언더 1:0.5 

train_1 =over(train[최종_col_2],train["target_2"], method='Borderline-SMOTE', sampling_strategy=0.5)

test_1 =under(test[최종_col_2],test["target_2"], method='random', sampling_strategy=0.5)

model_basic(train_1[최종_col_2], train_1["target_2"], test_1[최종_col_2], test_1["target_2"])

[[11  1]
 [ 0  6]]
[[12  0]
 [ 1  5]]
[[11  1]
 [ 1  5]]
[[12  0]
 [ 0  6]]
[[12  0]
 [ 2  4]]


Unnamed: 0,model,acc_train,auc_train,acc_test,precision,recall,f1_score,AUC_test
0,LogisticRegression(),0.96,0.95,0.94,0.86,1.0,0.92,0.96
1,LinearDiscriminantAnalysis(),0.94,0.91,0.94,1.0,0.83,0.91,0.92
2,GaussianNB(),0.92,0.91,0.89,0.83,0.83,0.83,0.88
3,"(DecisionTreeClassifier(max_features='sqrt', r...",1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,"XGBClassifier(base_score=None, booster=None, c...",1.0,1.0,0.89,1.0,0.67,0.8,0.83


In [19]:
evaluate_resampling_model(x_train, y_train,x_test,y_test,
                          method='Logistic',over_ratio=0.5,under_ratio=0.5,num_iterations=1000)

{'accuracy_scores_mean': [0.9801666666666667], 'f1_scores_mean': [0.973123076923077], 'recall_scores_mean': [1.0], 'precision_scores_mean': [0.9508809523809525], 'auc_mean': [0.995875]}
{'accuracy_scores_std': [0.03227729967134914], 'f1_scores_std': [0.04288534203297056], 'recall_scores_std': [0.0], 'precision_scores_std': [0.07735175639691111], 'auc_std': [0.012870969044163018]}


In [20]:
evaluate_resampling_model(x_train, y_train,x_test,y_test,
                          method='LDA',over_ratio=0.5,under_ratio=0.5,num_iterations=1000)

{'accuracy_scores_mean': [0.9366111111111112], 'f1_scores_mean': [0.8985139860139857], 'recall_scores_mean': [0.8333333333333335], 'precision_scores_mean': [0.9769285714285715], 'auc_mean': [0.9939166666666667]}
{'accuracy_scores_std': [0.020721477496299094], 'f1_scores_std': [0.027750662906206515], 'recall_scores_std': [1.1102230246251565e-16], 'precision_scores_std': [0.06015870450642329], 'auc_std': [0.015578826838982854]}


----
# target3 


In [21]:
test.columns

Index(['회사명', '거래소코드', '회계년도', '산업군', 'target_1', 'target_2', 'target_3',
       '현금흐름/총부채비율', '총자본정상영업이익률', '총자본순이익률', '현금흐름 대 자산', '타인자본회전률', '총자본회전률',
       '차입금의존도', 'EBITDA마진율', '누적수익성비율', '순운전자본비율', 'TMD', '자기자본순이익률', '부채비율',
       '금융비용부담률', '외국인지분율', '당기전기영업손익', 'ROA변화율', '총자본증가율', '매출액총이익률',
       '매출액증가율', '영업현금흐름-단기차입금', '자본금회전률', '대주주지분율', '정상영업이익증가율'],
      dtype='object')

In [22]:
train = pd.read_csv("./datasets/통계검증완료/코스피_standard_train.csv", index_col=0)
test = pd.read_csv("./datasets/통계검증완료/코스피_standard_test.csv", index_col=0)

최종_col_3= ['EBITDA마진율', '누적수익성비율', 'TMD', '자기자본순이익률', '금융비용부담률', '정상영업이익증가율']


x_train = train[최종_col_3]
y_train = train["target_3"]

x_test = test[최종_col_3]
y_test = test["target_3"]



In [23]:
## 불균형 처리 안한 적합
model_basic(x_train, y_train, x_test, y_test)

[[1645    3]
 [   4    2]]
[[1648    0]
 [   4    2]]
[[1603   45]
 [   1    5]]
[[1648    0]
 [   4    2]]
[[1646    2]
 [   4    2]]


Unnamed: 0,model,acc_train,auc_train,acc_test,precision,recall,f1_score,AUC_test
0,LogisticRegression(),1.0,0.86,1.0,0.4,0.33,0.36,0.67
1,LinearDiscriminantAnalysis(),1.0,0.85,1.0,1.0,0.33,0.5,0.67
2,GaussianNB(),0.97,0.91,0.97,0.1,0.83,0.18,0.9
3,"(DecisionTreeClassifier(max_features='sqrt', r...",1.0,1.0,1.0,1.0,0.33,0.5,0.67
4,"XGBClassifier(base_score=None, booster=None, c...",1.0,1.0,1.0,0.5,0.33,0.4,0.67


In [24]:
## train 데이터만 1:2 오버샘플링후 적합

train_1 =over(train[최종_col_3],train["target_3"], method='Borderline-SMOTE', sampling_strategy=0.5)

model_basic(train_1[최종_col_3], train_1["target_3"], x_test, y_test)

[[1601   47]
 [   0    6]]
[[1634   14]
 [   1    5]]
[[1564   84]
 [   1    5]]
[[1628   20]
 [   1    5]]
[[1629   19]
 [   2    4]]


Unnamed: 0,model,acc_train,auc_train,acc_test,precision,recall,f1_score,AUC_test
0,LogisticRegression(),0.96,0.95,0.97,0.11,1.0,0.2,0.99
1,LinearDiscriminantAnalysis(),0.94,0.92,0.99,0.26,0.83,0.4,0.91
2,GaussianNB(),0.93,0.92,0.95,0.06,0.83,0.11,0.89
3,"(DecisionTreeClassifier(max_features='sqrt', r...",1.0,1.0,0.99,0.2,0.83,0.32,0.91
4,"XGBClassifier(base_score=None, booster=None, c...",1.0,1.0,0.99,0.17,0.67,0.28,0.83


In [25]:
## train 데이터 오버 1:0.5, test 데이터 언더 1:0.5 

train_1 =over(train[최종_col_3],train["target_3"], method='Borderline-SMOTE', sampling_strategy=0.5)

test_1 =under(test[최종_col_3],test["target_3"], method='random', sampling_strategy=0.5)

model_basic(train_1[최종_col_3], train_1["target_3"], test_1[최종_col_3], test_1["target_3"])


[[12  0]
 [ 0  6]]
[[12  0]
 [ 1  5]]
[[12  0]
 [ 1  5]]
[[12  0]
 [ 0  6]]
[[12  0]
 [ 2  4]]


Unnamed: 0,model,acc_train,auc_train,acc_test,precision,recall,f1_score,AUC_test
0,LogisticRegression(),0.96,0.96,1.0,1.0,1.0,1.0,1.0
1,LinearDiscriminantAnalysis(),0.95,0.93,0.94,1.0,0.83,0.91,0.92
2,GaussianNB(),0.93,0.92,0.94,1.0,0.83,0.91,0.92
3,"(DecisionTreeClassifier(max_features='sqrt', r...",1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,"XGBClassifier(base_score=None, booster=None, c...",1.0,1.0,0.89,1.0,0.67,0.8,0.83


In [26]:
evaluate_resampling_model(x_train, y_train,x_test,y_test,
                          method='Logistic',over_ratio=0.5,under_ratio=0.5,num_iterations=1000)

{'accuracy_scores_mean': [0.9804444444444446], 'f1_scores_mean': [0.9735670329670332], 'recall_scores_mean': [1.0], 'precision_scores_mean': [0.9517738095238095], 'auc_mean': [0.9960138888888888]}
{'accuracy_scores_std': [0.03277758945331964], 'f1_scores_std': [0.043292701940389], 'recall_scores_std': [0.0], 'precision_scores_std': [0.07779959920240222], 'auc_std': [0.012443010520065847]}


In [27]:
evaluate_resampling_model(x_train, y_train,x_test,y_test,
                          method='LDA',over_ratio=0.5,under_ratio=0.5,num_iterations=1000)

{'accuracy_scores_mean': [0.9378333333333335], 'f1_scores_mean': [0.9001223776223773], 'recall_scores_mean': [0.8333333333333335], 'precision_scores_mean': [0.9803571428571429], 'auc_mean': [0.9946805555555556]}
{'accuracy_scores_std': [0.018661953770131322], 'f1_scores_std': [0.025196452523769043], 'recall_scores_std': [1.1102230246251565e-16], 'precision_scores_std': [0.054991109361818995], 'auc_std': [0.013835390018908354]}


----
## target1 - 정상:부실 = 1:0.5    - boderine_smote 했을때 성능이 가장 좋음 


In [28]:
id_col = ['회사명', '거래소코드', '회계년도', '산업군','target_3']
가져가는_col = id_col + 최종_col_3 
train_1.to_csv('./datasets/불균형처리/boder_코스피_standard_train.csv',encoding='utf-8-sig')
test[가져가는_col].to_csv('./datasets/불균형처리/코스피_standard_test.csv',encoding='utf-8-sig')