In [1]:
import pandas as pd
from sklearn.datasets import make_classification
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN, BorderlineSMOTE, SMOTENC
from imblearn.under_sampling import RandomUnderSampler, TomekLinks, NearMiss, ClusterCentroids, EditedNearestNeighbours

import pandas as pd 
import numpy as np


from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression, LinearRegression

from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.feature_selection import SelectFromModel, SequentialFeatureSelector
from sklearn.naive_bayes import GaussianNB

from sklearn.metrics import f1_score,accuracy_score,recall_score,precision_score,confusion_matrix,roc_auc_score

import warnings
warnings.simplefilter('ignore')



In [11]:
def model_basic(x_train, y_train, x_test, y_test):
    """Fit five default classifiers and tabulate their train/test metrics.

    Each of LogisticRegression, LinearDiscriminantAnalysis, GaussianNB,
    RandomForestClassifier and XGBClassifier is fitted on the training data
    with default hyper-parameters. The test confusion matrix of every model
    is printed as a side effect.

    Parameters
    ----------
    x_train, y_train : training features and labels passed to ``fit``.
    x_test, y_test : held-out features and labels used for the test metrics.

    Returns
    -------
    pandas.DataFrame
        One row per model with columns ``model``, ``acc_train``,
        ``auc_train``, ``acc_test``, ``precision``, ``recall``, ``f1_score``
        and ``AUC_test``; every metric is rounded to two decimals.
    """
    candidates = (
        LogisticRegression(),
        LinearDiscriminantAnalysis(),
        GaussianNB(),
        RandomForestClassifier(),
        XGBClassifier(),
    )

    columns = ('model', 'acc_train', 'auc_train', 'acc_test',
               'precision', 'recall', 'f1_score', 'AUC_test')
    rdict = {column: [] for column in columns}

    def positive_scores(estimator, features):
        # Probability of the class in column 1, kept as a column vector.
        return estimator.predict_proba(features)[:, 1].reshape(-1, 1)

    for estimator in candidates:
        fitted = estimator.fit(x_train, y_train)

        # Part 1: predictions on the training data
        train_pred = fitted.predict(x_train)
        train_scores = positive_scores(fitted, x_train)

        # Part 2: predictions on the test data
        test_pred = fitted.predict(x_test)
        test_scores = positive_scores(fitted, x_test)

        row = (
            fitted,
            round(accuracy_score(y_train, train_pred), 2),
            round(roc_auc_score(y_train, train_scores), 2),
            round(accuracy_score(y_test, test_pred), 2),
            round(precision_score(y_test, test_pred), 2),
            round(recall_score(y_test, test_pred), 2),
            round(f1_score(y_test, test_pred), 2),
            round(roc_auc_score(y_test, test_scores), 2),
        )
        for column, value in zip(columns, row):
            rdict[column].append(value)

        print(confusion_matrix(y_test, test_pred))

    return pd.DataFrame(data=rdict)

In [3]:
# def model_basic(x_train, y_train, x_test, y_test):
#     models = [
#         LogisticRegression(),
#         LinearDiscriminantAnalysis(),
#         GaussianNB(),
#         RandomForestClassifier(class_weight={0:1,1:10}),
#         XGBClassifier(),
        
#     ]

#     rdict = {'model': [], 'accuracy': [], 'precision': [], 'recall': [], 'f1_score': [], 'auc_score': []}

#     for clf in models:
#         clf = clf.fit(x_train, y_train)
#         pred = clf.predict(x_test)
#         pred_prob_rf = clf.predict_proba(x_test)[:, 1].reshape(-1, 1)
#         auc_score = roc_auc_score(y_test, pred_prob_rf)
#         results = (
#             round(accuracy_score(y_test, pred), 4),
#             round(precision_score(y_test, pred), 4),
#             round(recall_score(y_test, pred), 4),
#             round(f1_score(y_test, pred), 4),
#             round(auc_score, 4)
#         )

#         rdict['model'].append(clf)
#         rdict['accuracy'].append(results[0])
#         rdict['precision'].append(results[1])
#         rdict['recall'].append(results[2])
#         rdict['f1_score'].append(results[3])
#         rdict['auc_score'].append(results[4])

#         confusion = confusion_matrix(y_test, pred)
#         print(confusion)

#     rdf = pd.DataFrame(data=rdict)
#     return rdf

In [4]:
def over(df, target, method='random', sampling_strategy=1.0, random_state=None, **sampler_kwargs):
    """Oversample a dataset with one of the imbalanced-learn oversamplers.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns.
    target : pandas.Series
        Label column aligned with ``df``.
    method : str
        One of ``'random'`` (RandomOverSampler), ``'smote'`` (SMOTE),
        ``'adasyn'`` (ADASYN), ``'Borderline-SMOTE'`` (BorderlineSMOTE) or
        ``'SMOTENC'`` (SMOTENC).
    sampling_strategy : float or other value accepted by the sampler
        Forwarded unchanged to the sampler.
    random_state : int, RandomState instance or None
        Forwarded to the sampler so a run can be reproduced.
    **sampler_kwargs
        Extra keyword arguments forwarded to the sampler constructor.
        ``method='SMOTENC'`` needs ``categorical_features=...`` here.

    Returns
    -------
    pandas.DataFrame
        Resampled features with the resampled target appended as the last
        column (``pd.concat(..., axis=1)``).

    Raises
    ------
    ValueError
        If ``method`` is not supported, or ``method='SMOTENC'`` is requested
        without ``categorical_features``.
    """
    supported = ('random', 'smote', 'adasyn', 'Borderline-SMOTE', 'SMOTENC')
    # Fail early with a readable message instead of hitting an unbound
    # ``oversampler`` further down.
    if method not in supported:
        raise ValueError(
            f"Unknown oversampling method {method!r}; expected one of {supported}"
        )
    # SMOTENC cannot be built without knowing which columns are categorical.
    if method == 'SMOTENC' and 'categorical_features' not in sampler_kwargs:
        raise ValueError(
            "method='SMOTENC' requires the categorical_features keyword argument"
        )

    # Split into features and target
    X = df
    y = target

    options = dict(sampling_strategy=sampling_strategy, random_state=random_state)
    options.update(sampler_kwargs)

    if method == 'random':
        oversampler = RandomOverSampler(**options)
    elif method == 'smote':
        oversampler = SMOTE(**options)
    elif method == 'adasyn':
        oversampler = ADASYN(**options)
    elif method == 'Borderline-SMOTE':
        oversampler = BorderlineSMOTE(**options)
    else:  # 'SMOTENC'
        oversampler = SMOTENC(**options)

    # Run the oversampling
    X_resampled, y_resampled = oversampler.fit_resample(X, y)

    # Build the oversampled data frame
    df_over = pd.concat([X_resampled, y_resampled], axis=1)

    return df_over


In [5]:
def under(df, target, method='random', sampling_strategy=1.0, random_state=None):
    """Undersample a dataset with one of the imbalanced-learn undersamplers.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns.
    target : pandas.Series
        Label column aligned with ``df``.
    method : str
        One of ``'random'`` (RandomUnderSampler), ``'tomek'`` (TomekLinks),
        ``'NearMiss'`` (NearMiss), ``'cluster_centroids'`` (ClusterCentroids)
        or ``'edited_nn'`` (EditedNearestNeighbours).
    sampling_strategy : float or other value accepted by the sampler
        Forwarded to the sampler. TomekLinks and EditedNearestNeighbours are
        cleaning methods that cannot target a numeric ratio, so a numeric
        value is replaced by ``'auto'`` for those two methods.
    random_state : int, RandomState instance or None
        Forwarded to the samplers that take it (RandomUnderSampler and
        ClusterCentroids); ignored by the others.

    Returns
    -------
    pandas.DataFrame
        Resampled features with the resampled target appended as the last
        column (``pd.concat(..., axis=1)``).

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    supported = ('random', 'tomek', 'NearMiss', 'cluster_centroids', 'edited_nn')
    # Fail early with a readable message instead of hitting an unbound
    # ``undersampler`` further down.
    if method not in supported:
        raise ValueError(
            f"Unknown undersampling method {method!r}; expected one of {supported}"
        )

    # Split into features and target
    X = df
    y = target

    # Cleaning samplers reject float ratios; with the default of 1.0 they
    # would otherwise always fail, so fall back to their 'auto' strategy.
    if method in ('tomek', 'edited_nn') and isinstance(sampling_strategy, (int, float)):
        sampling_strategy = 'auto'

    if method == 'random':
        undersampler = RandomUnderSampler(sampling_strategy=sampling_strategy,
                                          random_state=random_state)
    elif method == 'tomek':
        undersampler = TomekLinks(sampling_strategy=sampling_strategy)
    elif method == 'NearMiss':
        undersampler = NearMiss(sampling_strategy=sampling_strategy)
    elif method == 'cluster_centroids':
        undersampler = ClusterCentroids(sampling_strategy=sampling_strategy,
                                        random_state=random_state)
    else:  # 'edited_nn'
        undersampler = EditedNearestNeighbours(sampling_strategy=sampling_strategy)

    # Run the undersampling
    X_resampled, y_resampled = undersampler.fit_resample(X, y)

    # Build the undersampled data frame
    df_under = pd.concat([X_resampled, y_resampled], axis=1)

    return df_under

In [6]:
def evaluate_resampling_model(x_train, y_train, x_test, y_test, method, num_iterations=10, over_ratio=0.5, under_ratio=0.5, random_state=None):
    """Repeat resample-fit-score and print the mean and std of the test metrics.

    On every iteration the training data is oversampled with BorderlineSMOTE
    (``over_ratio``), the test data is undersampled with RandomUnderSampler
    (``under_ratio``), the chosen classifier is fitted on the oversampled
    training data and scored on the undersampled test data.

    Parameters
    ----------
    x_train, y_train : training features and labels.
    x_test, y_test : test features and labels.
    method : str
        ``"Logistic"`` (LogisticRegression), ``"LDA"``
        (LinearDiscriminantAnalysis) or ``"Gaussian"`` (GaussianNB).
    num_iterations : int
        Number of resample/fit/score repetitions; must be at least 1.
    over_ratio, under_ratio : float
        ``sampling_strategy`` for the oversampler and the undersampler.
    random_state : int, numpy.random.RandomState or None
        Seed for the whole run. A single RandomState instance is shared by
        both samplers so each iteration draws a different resample while the
        run as a whole stays reproducible. (Passing the raw int to the
        samplers would re-seed them identically on every iteration.)

    Returns
    -------
    None
        Two dicts are printed: metric means, then metric standard deviations.

    Raises
    ------
    ValueError
        If ``method`` is not supported or ``num_iterations`` is below 1.

    Notes
    -----
    NOTE(review): the test set is undersampled before scoring, so accuracy
    and precision are measured at a different class balance than the
    original test data - confirm this is the intended evaluation protocol.
    """
    supported_methods = ("Logistic", "LDA", "Gaussian")
    if method not in supported_methods:
        raise ValueError(
            f"Unknown method {method!r}; expected one of {supported_methods}"
        )
    if num_iterations < 1:
        raise ValueError(f"num_iterations must be >= 1, got {num_iterations!r}")

    # One generator whose state advances across fit_resample calls.
    if isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    # Create the samplers
    sampler_over = BorderlineSMOTE(sampling_strategy=over_ratio, random_state=rng)
    sampler_under = RandomUnderSampler(sampling_strategy=under_ratio, random_state=rng)

    # Per-iteration metric values; the names double as prefixes of the printed keys.
    history = {
        "accuracy_scores": [],
        "f1_scores": [],
        "recall_scores": [],
        "precision_scores": [],
        "auc": [],
    }

    # Repeat the random resampling
    for _ in range(num_iterations):
        x_train_over, y_train_over = sampler_over.fit_resample(x_train, y_train)
        x_test_under, y_test_under = sampler_under.fit_resample(x_test, y_test)

        if method == "Logistic":
            model = LogisticRegression()
        elif method == "LDA":
            model = LinearDiscriminantAnalysis()
        else:  # "Gaussian"
            model = GaussianNB()
        model.fit(x_train_over, y_train_over)

        y_pred = model.predict(x_test_under)
        y_pred_proba = model.predict_proba(x_test_under)[:, 1]

        # Evaluate the model and store the results
        history["accuracy_scores"].append(accuracy_score(y_test_under, y_pred))
        history["f1_scores"].append(f1_score(y_test_under, y_pred))
        history["recall_scores"].append(recall_score(y_test_under, y_pred))
        history["precision_scores"].append(precision_score(y_test_under, y_pred))
        history["auc"].append(roc_auc_score(y_test_under, y_pred_proba))

    # Mean and standard deviation of every metric, each wrapped in a
    # one-element list to keep the printed format unchanged.
    score = {f"{name}_mean": [np.mean(values)] for name, values in history.items()}
    std_score = {f"{name}_std": [np.std(values)] for name, values in history.items()}

    print(score)
    print(std_score)

---
# target1

In [27]:
# Feature columns used for target_1.
최종_col_1 = [
    'EBIT/총자산', '현금흐름 대 자산', '순운전자본비율', 'abs(영업현금흐름-당기순이익)/매출액',
    '누적수익성비율', '금융비용부담률', '당기전기영업손익', 'TMD',
]

# Load the train/test CSV files, using the first column as the index.
train = pd.read_csv("./datasets/통계검증완료/코스닥_standard_train.csv", index_col=0)
test = pd.read_csv("./datasets/통계검증완료/코스닥_standard_test.csv", index_col=0)

# Split each frame into features and the target_1 label.
x_train, y_train = train[최종_col_1], train["target_1"]
x_test, y_test = test[최종_col_1], test["target_1"]



In [28]:
# Class proportions of the training labels. The original passed the builtin
# `property` positionally, which only normalised because it is truthy.
y_train.value_counts(normalize=True)

0    0.965517
1    0.034483
Name: target_1, dtype: float64

In [29]:
# Class proportions of the test labels. The original passed the builtin
# `property` positionally, which only normalised because it is truthy.
y_test.value_counts(normalize=True)

0    0.974012
1    0.025988
Name: target_1, dtype: float64

In [30]:
## Baseline fit without any imbalance handling
model_basic(x_train, y_train, x_test, y_test)

[[2793   18]
 [  31   44]]
[[2767   44]
 [  30   45]]
[[2668  143]
 [  16   59]]
[[2802    9]
 [  31   44]]
[[2798   13]
 [  33   42]]


Unnamed: 0,model,acc_train,auc_train,acc_test,precision,recall,f1_score,AUC_test
0,LogisticRegression(),0.98,0.73,0.98,0.71,0.59,0.64,0.79
1,LinearDiscriminantAnalysis(),0.97,0.72,0.97,0.51,0.6,0.55,0.79
2,GaussianNB(),0.96,0.79,0.94,0.29,0.79,0.43,0.87
3,"(DecisionTreeClassifier(max_features='sqrt', r...",1.0,1.0,0.99,0.83,0.59,0.69,0.79
4,"XGBClassifier(base_score=None, booster=None, c...",1.0,1.0,0.98,0.76,0.56,0.65,0.78


In [31]:
## Oversample only the train data (sampling_strategy=0.5), then fit; the test data is left as loaded

train_1 =over(train[최종_col_1],train["target_1"], method='Borderline-SMOTE', sampling_strategy=0.5)

model_basic(train_1[최종_col_1], train_1["target_1"], x_test, y_test)

[[2586  225]
 [  13   62]]
[[2633  178]
 [  14   61]]
[[2551  260]
 [  10   65]]
[[2738   73]
 [  27   48]]
[[2734   77]
 [  25   50]]


Unnamed: 0,model,acc_train,auc_train,acc_test,precision,recall,f1_score,AUC_test
0,LogisticRegression(),0.91,0.9,0.92,0.22,0.83,0.34,0.87
1,LinearDiscriminantAnalysis(),0.9,0.88,0.93,0.26,0.81,0.39,0.88
2,GaussianNB(),0.9,0.88,0.91,0.2,0.87,0.32,0.89
3,"(DecisionTreeClassifier(max_features='sqrt', r...",1.0,1.0,0.97,0.4,0.64,0.49,0.81
4,"XGBClassifier(base_score=None, booster=None, c...",1.0,1.0,0.96,0.39,0.67,0.5,0.82


In [32]:
## Oversample the train data (sampling_strategy=0.5) and undersample the test data (sampling_strategy=0.5), then fit

train_1 =over(train[최종_col_1],train["target_1"], method='Borderline-SMOTE', sampling_strategy=0.5)

test_1 =under(test[최종_col_1],test["target_1"], method='random', sampling_strategy=0.5)

model_basic(train_1[최종_col_1], train_1["target_1"], test_1[최종_col_1], test_1["target_1"])


[[141   9]
 [ 12  63]]
[[142   8]
 [ 12  63]]
[[139  11]
 [ 10  65]]
[[148   2]
 [ 26  49]]
[[146   4]
 [ 22  53]]


Unnamed: 0,model,acc_train,auc_train,acc_test,precision,recall,f1_score,AUC_test
0,LogisticRegression(),0.91,0.9,0.91,0.88,0.84,0.86,0.89
1,LinearDiscriminantAnalysis(),0.9,0.87,0.91,0.89,0.84,0.86,0.89
2,GaussianNB(),0.9,0.88,0.91,0.86,0.87,0.86,0.9
3,"(DecisionTreeClassifier(max_features='sqrt', r...",1.0,1.0,0.88,0.96,0.65,0.78,0.82
4,"XGBClassifier(base_score=None, booster=None, c...",1.0,1.0,0.88,0.93,0.71,0.8,0.84


In [33]:
# 1000 resampling iterations with LogisticRegression (train over 0.5, test under 0.5); prints metric means and stds
evaluate_resampling_model(x_train, y_train,x_test,y_test,
                          method='Logistic',over_ratio=0.5,under_ratio=0.5,num_iterations=1000)

{'accuracy_scores_mean': [0.8924622222222222], 'f1_scores_mean': [0.8387076576880815], 'recall_scores_mean': [0.8364533333333333], 'precision_scores_mean': [0.8417913028812349], 'auc_mean': [0.9457384888888889]}
{'accuracy_scores_std': [0.014706735838123486], 'f1_scores_std': [0.01877770323060012], 'recall_scores_std': [0.007856988396750163], 'precision_scores_std': [0.03655256361213723], 'auc_std': [0.006346211397303059]}


In [34]:
# 1000 resampling iterations with LinearDiscriminantAnalysis (train over 0.5, test under 0.5); prints metric means and stds
evaluate_resampling_model(x_train, y_train,x_test,y_test,
                          method='LDA',over_ratio=0.5,under_ratio=0.5,num_iterations=1000)

{'accuracy_scores_mean': [0.8999022222222223], 'f1_scores_mean': [0.8468560141127079], 'recall_scores_mean': [0.8282266666666664], 'precision_scores_mean': [0.8670999614903789], 'auc_mean': [0.9533179555555555]}
{'accuracy_scores_std': [0.013246767879722938], 'f1_scores_std': [0.017343612560620707], 'recall_scores_std': [0.007071834588311886], 'precision_scores_std': [0.03520161840581928], 'auc_std': [0.006141945046648448]}


--- 
# target2

In [35]:
# Feature columns used for target_2.
최종_col_2 = [
    'EBIT/총자산', '현금흐름 대 자산', '순운전자본비율', '누적수익성비율', '총자본회전률', 'TMD',
    '매출액총이익률',
]

# Load the train/test CSV files, using the first column as the index.
train = pd.read_csv("./datasets/통계검증완료/코스닥_power_train.csv", index_col=0)
test = pd.read_csv("./datasets/통계검증완료/코스닥_power_test.csv", index_col=0)

# Split each frame into features and the target_2 label.
x_train, y_train = train[최종_col_2], train["target_2"]
x_test, y_test = test[최종_col_2], test["target_2"]



In [36]:
## Baseline fit without any imbalance handling
model_basic(x_train, y_train, x_test, y_test)

[[3476   67]
 [  35   38]]
[[3421  122]
 [  40   33]]
[[3246  297]
 [  17   56]]
[[3480   63]
 [  37   36]]
[[3476   67]
 [  34   39]]


Unnamed: 0,model,acc_train,auc_train,acc_test,precision,recall,f1_score,AUC_test
0,LogisticRegression(),0.97,0.63,0.97,0.36,0.52,0.43,0.75
1,LinearDiscriminantAnalysis(),0.96,0.63,0.96,0.21,0.45,0.29,0.71
2,GaussianNB(),0.95,0.76,0.91,0.16,0.77,0.26,0.84
3,"(DecisionTreeClassifier(max_features='sqrt', r...",1.0,1.0,0.97,0.36,0.49,0.42,0.74
4,"XGBClassifier(base_score=None, booster=None, c...",1.0,1.0,0.97,0.37,0.53,0.44,0.76


In [37]:
## Oversample only the train data (sampling_strategy=0.5), then fit; the test data is left as loaded

train_1 =over(train[최종_col_2],train["target_2"], method='Borderline-SMOTE', sampling_strategy=0.5)

model_basic(train_1[최종_col_2], train_1["target_2"], x_test, y_test)

[[3033  510]
 [  10   63]]
[[2976  567]
 [  11   62]]
[[2906  637]
 [   6   67]]
[[3374  169]
 [  24   49]]
[[3343  200]
 [  24   49]]


Unnamed: 0,model,acc_train,auc_train,acc_test,precision,recall,f1_score,AUC_test
0,LogisticRegression(),0.88,0.86,0.86,0.11,0.86,0.2,0.86
1,LinearDiscriminantAnalysis(),0.87,0.85,0.84,0.1,0.85,0.18,0.84
2,GaussianNB(),0.87,0.86,0.82,0.1,0.92,0.17,0.87
3,"(DecisionTreeClassifier(max_features='sqrt', r...",1.0,1.0,0.95,0.22,0.67,0.34,0.81
4,"XGBClassifier(base_score=None, booster=None, c...",1.0,1.0,0.94,0.2,0.67,0.3,0.81


In [38]:
## Oversample the train data (sampling_strategy=0.5) and undersample the test data (sampling_strategy=0.5), then fit

train_1 =over(train[최종_col_2],train["target_2"], method='Borderline-SMOTE', sampling_strategy=0.5)

test_1 =under(test[최종_col_2],test["target_2"], method='random', sampling_strategy=0.5)

model_basic(train_1[최종_col_2], train_1["target_2"], test_1[최종_col_2], test_1["target_2"])

[[130  16]
 [ 10  63]]
[[128  18]
 [ 10  63]]
[[122  24]
 [  6  67]]
[[143   3]
 [ 24  49]]
[[142   4]
 [ 24  49]]


Unnamed: 0,model,acc_train,auc_train,acc_test,precision,recall,f1_score,AUC_test
0,LogisticRegression(),0.88,0.86,0.88,0.8,0.86,0.83,0.88
1,LinearDiscriminantAnalysis(),0.87,0.86,0.87,0.78,0.86,0.82,0.87
2,GaussianNB(),0.87,0.87,0.86,0.74,0.92,0.82,0.88
3,"(DecisionTreeClassifier(max_features='sqrt', r...",1.0,1.0,0.88,0.94,0.67,0.78,0.83
4,"XGBClassifier(base_score=None, booster=None, c...",1.0,1.0,0.87,0.92,0.67,0.78,0.82


In [39]:
# 1000 resampling iterations with LogisticRegression (train over 0.5, test under 0.5); prints metric means and stds
evaluate_resampling_model(x_train, y_train,x_test,y_test,
                          method='Logistic',over_ratio=0.5,under_ratio=0.5,num_iterations=1000)

{'accuracy_scores_mean': [0.8570593607305936], 'f1_scores_mean': [0.8013864260611946], 'recall_scores_mean': [0.8620410958904111], 'precision_scores_mean': [0.7495776195225464], 'auc_mean': [0.9109999061737662]}
{'accuracy_scores_std': [0.019252727097157793], 'f1_scores_std': [0.021457643883243848], 'recall_scores_std': [0.0035710886362093877], 'precision_scores_std': [0.03737421993869979], 'auc_std': [0.00964874064517834]}


In [40]:
# 1000 resampling iterations with LinearDiscriminantAnalysis (train over 0.5, test under 0.5); prints metric means and stds
evaluate_resampling_model(x_train, y_train,x_test,y_test,
                          method='LDA',over_ratio=0.5,under_ratio=0.5,num_iterations=1000)

{'accuracy_scores_mean': [0.8463972602739727], 'f1_scores_mean': [0.7897639146745058], 'recall_scores_mean': [0.8623150684931509], 'precision_scores_mean': [0.729370382233371], 'auc_mean': [0.9039242822293113]}
{'accuracy_scores_std': [0.020452685837256367], 'f1_scores_std': [0.022165533746078495], 'recall_scores_std': [0.0030136674967258734], 'precision_scores_std': [0.0377626143110009], 'auc_std': [0.011051193164874023]}


----
# target3

In [12]:
# Identifier columns (plus the label) carried along when the test set is saved.
id_col = ['회사명', '거래소코드', '회계년도', '산업군','target_3']

# Feature columns used for target_3.
최종_col_3 = [
    'abs(영업현금흐름-당기순이익)/매출액', '금융비용부담률', 'WW지수',
    '누적수익성비율', 'TMD', '순운전자본비율',
]

# Load the train/test CSV files, using the first column as the index.
train = pd.read_csv("./datasets/통계검증완료/코스닥_standard_train.csv", index_col=0)
test = pd.read_csv("./datasets/통계검증완료/코스닥_standard_test.csv", index_col=0)

# Split each frame into features and the target_3 label.
x_train, y_train = train[최종_col_3], train["target_3"]
x_test, y_test = test[최종_col_3], test["target_3"]



In [13]:
## Baseline fit without any imbalance handling
model_basic(x_train, y_train, x_test, y_test)

[[2464   13]
 [  21   51]]
[[2462   15]
 [  21   51]]
[[2383   94]
 [  12   60]]
[[2466   11]
 [  20   52]]
[[2464   13]
 [  26   46]]


Unnamed: 0,model,acc_train,auc_train,acc_test,precision,recall,f1_score,AUC_test
0,LogisticRegression(),0.98,0.94,0.99,0.8,0.71,0.75,0.97
1,LinearDiscriminantAnalysis(),0.98,0.94,0.99,0.77,0.71,0.74,0.97
2,GaussianNB(),0.96,0.94,0.96,0.39,0.83,0.53,0.97
3,"(DecisionTreeClassifier(max_features='sqrt', r...",1.0,1.0,0.99,0.83,0.72,0.77,0.97
4,"XGBClassifier(base_score=None, booster=None, c...",1.0,1.0,0.98,0.78,0.64,0.7,0.97


In [24]:
## Oversample only the train data, then fit; the test data is left as loaded
## NOTE(review): the original comment said 1:2, but the code passes sampling_strategy=0.3 - confirm which is intended

train_1 =over(train[최종_col_3],train["target_3"], method='Borderline-SMOTE', sampling_strategy=0.3)

model_basic(train_1[최종_col_3], train_1["target_3"], x_test, y_test)

[[2375  102]
 [  10   62]]
[[2395   82]
 [  12   60]]
[[2306  171]
 [   9   63]]
[[2441   36]
 [  18   54]]
[[2443   34]
 [  18   54]]


Unnamed: 0,model,acc_train,auc_train,acc_test,precision,recall,f1_score,AUC_test
0,LogisticRegression(),0.92,0.96,0.96,0.38,0.86,0.53,0.98
1,LinearDiscriminantAnalysis(),0.9,0.95,0.96,0.42,0.83,0.56,0.97
2,GaussianNB(),0.91,0.95,0.93,0.27,0.88,0.41,0.97
3,"(DecisionTreeClassifier(max_features='sqrt', r...",1.0,1.0,0.98,0.6,0.75,0.67,0.97
4,"XGBClassifier(base_score=None, booster=None, c...",1.0,1.0,0.98,0.61,0.75,0.68,0.97


In [19]:
## Oversample the train data (sampling_strategy=0.5) and undersample the test data (sampling_strategy=0.5), then fit

train_1 =over(train[최종_col_3],train["target_3"], method='Borderline-SMOTE', sampling_strategy=0.5)

test_1 =under(test[최종_col_3],test["target_3"], method='random', sampling_strategy=0.5)

model_basic(train_1[최종_col_3], train_1["target_3"], test_1[최종_col_3], test_1["target_3"])


[[136   8]
 [  8  64]]
[[136   8]
 [  7  65]]
[[129  15]
 [  7  65]]
[[142   2]
 [ 18  54]]
[[139   5]
 [ 17  55]]


Unnamed: 0,model,acc_train,auc_train,acc_test,precision,recall,f1_score,AUC_test
0,LogisticRegression(),0.91,0.96,0.93,0.89,0.89,0.89,0.97
1,LinearDiscriminantAnalysis(),0.9,0.96,0.93,0.89,0.9,0.9,0.97
2,GaussianNB(),0.91,0.95,0.9,0.81,0.9,0.86,0.97
3,"(DecisionTreeClassifier(max_features='sqrt', r...",1.0,1.0,0.91,0.96,0.75,0.84,0.96
4,"XGBClassifier(base_score=None, booster=None, c...",1.0,1.0,0.9,0.92,0.76,0.83,0.96


In [21]:
# 1000 resampling iterations with LogisticRegression (train over 0.1, test under 0.1); prints metric means and stds
evaluate_resampling_model(x_train, y_train,x_test,y_test,
                          method='Logistic',over_ratio=0.1,under_ratio=0.1,num_iterations=1000)

{'accuracy_scores_mean': [0.9681085858585857], 'f1_scores_mean': [0.818987344335192], 'recall_scores_mean': [0.7919999999999999], 'precision_scores_mean': [0.8485931617385765], 'auc_mean': [0.9748690972222223]}
{'accuracy_scores_std': [0.0034280793660569917], 'f1_scores_std': [0.016042816215772023], 'recall_scores_std': [0.004423213177636239], 'precision_scores_std': [0.033974901970202344], 'auc_std': [0.0014816954914506858]}


In [12]:
# 1000 resampling iterations with LinearDiscriminantAnalysis (train over 0.5, test under 0.1); prints metric means and stds
# NOTE(review): over_ratio here is 0.5 while the Logistic run for target_3 uses 0.1, so the two runs are not like-for-like - confirm this is intended
evaluate_resampling_model(x_train, y_train,x_test,y_test,
                          method='LDA',over_ratio=0.5,under_ratio=0.1,num_iterations=1000)

{'accuracy_scores_mean': [0.9133333333333332], 'f1_scores_mean': [0.8709972889132218], 'recall_scores_mean': [0.875], 'precision_scores_mean': [0.8677153702409295], 'auc_mean': [0.9680507330246914]}
{'accuracy_scores_std': [0.013471824481846716], 'f1_scores_std': [0.017439209413209644], 'recall_scores_std': [0.0], 'precision_scores_std': [0.03450559253257963], 'auc_std': [0.005319401070170391]}


-----------------------------
# target3: train 정상 : 부실 = 1 : 0.5 오버샘플링에서 성능이 가장 좋음
 
------------------------------

In [47]:
# Persist the oversampled target_3 training frame and the matching test frame.
가져가는_col = id_col + 최종_col_3

# `train_1` and `test` are reassigned by every section of this notebook, so a
# stale kernel can hold another target's data. Check both frames before
# writing anything, so a mismatched pair of files is never produced.
expected_train_cols = set(최종_col_3) | {"target_3"}
if set(train_1.columns) != expected_train_cols:
    raise ValueError(
        "train_1 does not hold the target_3 oversampled data "
        f"(columns: {list(train_1.columns)}); re-run the target_3 resampling cell first"
    )
missing_test_cols = [col for col in 가져가는_col if col not in test.columns]
if missing_test_cols:
    raise ValueError(
        f"test is missing columns {missing_test_cols}; re-run the target_3 loading cell first"
    )

# NOTE(review): 'boder' / 'standar' in the file name look like typos of
# 'border' / 'standard'; left unchanged because other code may read this path.
train_1.to_csv('./datasets/불균형처리/boder_코스닥_standar_train.csv',encoding='utf-8-sig')
test[가져가는_col].to_csv('./datasets/불균형처리/코스닥_standard_test.csv',encoding='utf-8-sig')