# 아래 경로의 폴더 안에 있는 파일들에 대해서 ML models 실험.
경로 : "/home/user18/pnu_ckd/hexa_preprocessing_after95/0911_dl_models/data/0922_data"

- labeling 기준 : eGFR < 60 기준 만을 사용. => tight3.csv 파일 사용.
- original vs under-sampling vs over-sampling
    - (basic) vs (food feature) vs (basic + food feature)
- SVM, RF, LR (Logistic Regression), GBC

In [12]:
import random
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import torch
from scipy import stats
from scipy.stats import randint, loguniform


from sklearn import preprocessing
from sklearn import tree

from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV 

from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression 

from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.metrics import make_scorer, recall_score, precision_score, accuracy_score, roc_auc_score

from sklearn.preprocessing import label_binarize

In [13]:
# set the seed
def set_seed(seed):
    """Seed Python's `random`, NumPy's global RNG and PyTorch with the same value.

    When CUDA is available, every CUDA device is seeded as well.
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

# Seed the global RNGs (random / NumPy / PyTorch) once, before any experiment cell runs.
set_seed(1109)

#### functions

In [14]:
# test에 나머지 control sample 추가해서 idx 만 반환
def divide_testset(unbalanced_data, ratio, seed=0):
    """Split a 3-class table into a class-balanced test set and the remaining training set.

    A fraction ``ratio`` of the rows labelled 1 in ``onset_tight`` is drawn for the
    test set, and the same number of rows is drawn from label 0 and from label 2,
    so the test set holds equally many rows of each class. Every row that was not
    drawn goes to the training set.

    Args:
        unbalanced_data: DataFrame with an ``onset_tight`` column holding labels 0, 1 and 2.
        ratio: Fraction of the label-1 rows to move into the test set
            (the count is truncated with ``int``).
        seed: Seed of the NumPy generator used for sampling. The default (0)
            reproduces the previously hard-coded split.

    Returns:
        tuple: ``(train_df, test_df)`` selected from ``unbalanced_data`` by index label.

    Raises:
        ValueError: Raised by the sampler if label 0 or label 2 has fewer rows than
            the number of test rows drawn from label 1.
    """
    labels = unbalanced_data['onset_tight']
    total_idx = unbalanced_data.index
    ckd2_idx = unbalanced_data[labels == 2].index        # eGFR > 60 & urine protein > 1+
    ckd1_idx = unbalanced_data[labels == 1].index        # actual CKD
    control_idx = unbalanced_data[labels == 0].index     # actual control

    n_test_per_class = int(len(ckd1_idx) * ratio)

    # The draw order (label 1, then label 0, then label 2) is kept fixed so that a
    # given seed always produces the same split.
    rng = np.random.default_rng(seed=seed)
    sampled_ckd1_idx = pd.Index(rng.choice(ckd1_idx, size=n_test_per_class, replace=False))
    sampled_control_idx = pd.Index(rng.choice(control_idx, size=n_test_per_class, replace=False))
    sampled_ckd2_idx = pd.Index(rng.choice(ckd2_idx, size=n_test_per_class, replace=False))

    test_idx = sampled_ckd1_idx.append(sampled_control_idx).append(sampled_ckd2_idx)
    train_idx = total_idx.difference(test_idx)

    return unbalanced_data.loc[train_idx], unbalanced_data.loc[test_idx]

In [15]:
### Oversampling
def oversampling(unbalanced_dataframe, seed):
    """Oversample the training table with SMOTE and return ``(X, y)``.

    The ``RID`` column is removed before resampling. The ``onset_tight`` label
    column is still part of the matrix handed to SMOTE and is dropped from the
    resampled features afterwards, so the returned ``X`` contains neither
    ``RID`` nor ``onset_tight``.

    Args:
        unbalanced_dataframe: DataFrame with ``RID`` and ``onset_tight`` columns.
        seed: ``random_state`` passed to SMOTE.
    """
    from imblearn.over_sampling import SMOTE

    without_rid = unbalanced_dataframe.drop(columns=['RID'])
    resampler = SMOTE(random_state=seed)
    X_train, y_train = resampler.fit_resample(without_rid, without_rid['onset_tight'])

    features = X_train.drop(columns=['onset_tight'])
    return features, y_train

### Undersampling
# test에 나머지 control sample 추가해서 idx 만 반환
def _under_sampling_idx(unbalanced_data, seed):
    # train에서 ckd, control index 확인
    ckd2_idx = unbalanced_data[unbalanced_data['onset_tight'] == 2].index        # eGFR > 60 & urineprotein > 1+
    ckd1_idx = unbalanced_data[unbalanced_data['onset_tight'] == 1].index        # 실제 ckd
    control_idx = unbalanced_data[unbalanced_data['onset_tight'] == 0].index    # 실제 control
    # print(control_idx, ckd_idx)

    # ckd 갯수와 동일하게 control idx sampling
    """
    Control CKD 비율 조정
    """
    rng = np.random.default_rng(seed=seed) 
    sampled_control_idx = pd.Index(rng.choice(control_idx, size=len(ckd1_idx), replace=False)) # ckd 갯수와 동일하게 sampling
    not_sampled_control_idx = control_idx.difference(sampled_control_idx)
    
    sampled_ckd2_idx = pd.Index(rng.choice(ckd2_idx, size=len(ckd1_idx), replace=False)) # ckd 갯수와 동일하게 sampling
    not_sampled_control_idx = ckd2_idx.difference(sampled_ckd2_idx)

    # 잘 sampling 되었는지 확인
    assert set(sampled_control_idx).issubset(set(control_idx))
    assert set(sampled_ckd2_idx).issubset(set(ckd2_idx))
    # print(len(sampled_control_idx))

    balanced_idx = sampled_control_idx.append(ckd1_idx)
    balanced_idx = balanced_idx.append(sampled_ckd2_idx)

    # return 실제 ckd, 실제 ckd 갯수와 동일한 갯수의 subject, control_idx - ckd_idx
    return ckd1_idx, sampled_control_idx, not_sampled_control_idx, balanced_idx

def undersampling(unbalanced_data, seed):
    """Return ``(X, y)`` for the class-balanced subset chosen by ``_under_sampling_idx``.

    ``X`` is the selected rows without the ``RID`` and ``onset_tight`` columns;
    ``y`` is the ``onset_tight`` column of the same rows.
    """
    # Only the last element of the 4-tuple (the balanced index) is needed here.
    balanced_idx = _under_sampling_idx(unbalanced_data, seed)[3]
    balanced_rows = unbalanced_data.loc[balanced_idx]
    return balanced_rows.drop(columns=['RID', 'onset_tight']), balanced_rows['onset_tight']

In [16]:
def get_results(y_test, final_prediction, final_probabilities):
    """Print the confusion matrix followed by recall, precision, accuracy and AUC.

    Recall, precision and accuracy are derived from the first two rows/columns
    of the confusion matrix only (entries [0][0], [0][1], [1][0], [1][1]);
    any further rows/columns are not used for these three values.
    AUC is computed by ``roc_auc_score`` with ``multi_class="ovr"``.

    Args:
        y_test: True labels.
        final_prediction: Predicted labels.
        final_probabilities: Scores/probabilities passed to ``roc_auc_score``.
    """
    cm = confusion_matrix(list(y_test), list(final_prediction))
    print(cm)

    # Row index = true label, column index = predicted label.
    tn, fp = cm[0][0], cm[0][1]
    fn, tp = cm[1][0], cm[1][1]

    recall = tp / (fn + tp)
    precision = tp / (fp + tp)
    acc = (tp + tn) / (tn + fn + tp + fp)
    auc = roc_auc_score(y_test, final_probabilities, multi_class="ovr")

    print("Recall \t Precision \t Acc \t AUC")
    print(f"{np.round(recall, 4)} {np.round(precision, 4)} {np.round(acc, 4)} {np.round(auc, 4)}")

In [17]:
def model_fitting(model, X_train, y_train):
    """Fit ``model`` in place on the training data and return that same object.

    The return value of ``model.fit`` is ignored; the passed-in ``model`` is returned.
    """
    fitted = model
    fitted.fit(X_train, y_train)
    return fitted

def model_eval(fitted_model, X_test, y_test):
    """Print and return test-set recall, precision and accuracy of a fitted classifier.

    Precision and recall are macro-averaged over the classes. Only hard
    predictions are used: class probabilities are not requested because no AUC
    is reported here, so the model does not need to implement ``predict_proba``.

    Args:
        fitted_model: Already fitted estimator exposing ``predict``.
        X_test: Test features.
        y_test: True test labels.

    Returns:
        tuple: ``(recall, precision, accuracy)`` as unrounded scores; the printed
        values are rounded to 4 decimals.
    """
    y_pred = fitted_model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='macro')  # macro average: multi-class labels
    recall = recall_score(y_test, y_pred, average='macro')        # macro average: multi-class labels

    print("Recall \t Precision \t Acc")
    print(f"{np.round(recall, 4)} {np.round(precision, 4)} {np.round(accuracy, 4)}")

    return recall, precision, accuracy
    
# def model_eval(fitted_model, X_test, y_test):
#     model_prediction = fitted_model.predict(X_test)
#     model_probabilities = fitted_model.predict_proba(X_test)[:, 1]
#     model_score = fitted_model.score(X_test, y_test)
#     print(f"Score with simple {fitted_model} model")
#     print(0.5, np.round(model_score, 4))     # accuracy

#     get_results(y_test, model_prediction, model_probabilities)

# 1. Food sum

In [18]:
# Load the 3-way table for this section (file name: 3way_basic_food_sum.csv).
tight3_foodsum = pd.read_csv("/home/user18/pnu_ckd/hexa_preprocessing_after95/0911_dl_models/data/0922_data/0922_3way_data/3way_basic_food_sum.csv")

In [19]:
# Show how many subjects fall into each (onset, onset_tight) combination before relabeling.
print(tight3_foodsum[['onset', 'onset_tight']].value_counts())

# Patients with urine protein 1+ and eGFR >= 60 (1137 subjects), i.e. onset == 1 but
# onset_tight == 0, become a separate third class with label 2.
tight3_foodsum.loc[
    tight3_foodsum['onset'].eq(1) & tight3_foodsum['onset_tight'].eq(0),
    'onset_tight',
] = 2
tight3_foodsum['onset_tight'].value_counts()  # result is discarded (not the cell's last expression)

# 'onset' is no longer needed once the 3-class label is built.
tight3_foodsum.drop(columns=['onset'], inplace=True)

onset  onset_tight
0      0              55569
1      0               1137
       1                525
Name: count, dtype: int64


In [25]:
n_split = 5

# Hold-out split. divide_testset is called without a seed argument here, so the
# same test subjects are used for every seed of the loop below.
tight3_foodsum_train, tight3_foodsum_test = divide_testset(tight3_foodsum, 0.2)        # 0.2 = fraction of label-1 rows sent to test
y_tight3_test = tight3_foodsum_test['onset_tight']
X_tight3_test = tight3_foodsum_test.drop(['RID', 'onset_tight'], axis=1)

# The un-resampled training set does not depend on the seed, so build it once.
y_tight3_foodsum_train = tight3_foodsum_train['onset_tight']
X_tight3_foodsum_train = tight3_foodsum_train.drop(['RID', 'onset_tight'], axis=1)

# Metrics collected by cross_validate. AUC is not collected in this run.
scoring = {
    'recall': 'recall_macro',        # recall for each class, then averaged
    'precision': 'precision_macro',  # precision for each class, then averaged
    'accuracy': 'accuracy',
}

# Training-set variants to evaluate; add 'original' and/or 'oversampling' to include them.
dataset_names = ['undersampling']

seed_results = {}
for seed in [42, 59, 63, 79, 101]:
    # Build only the variants that are actually evaluated, so an unused
    # resampling (e.g. SMOTE over the full training table) is never computed.
    datas = {"test": (X_tight3_test, y_tight3_test)}
    if 'original' in dataset_names:
        datas['original'] = (X_tight3_foodsum_train, y_tight3_foodsum_train)
    if 'undersampling' in dataset_names:
        datas['undersampling'] = undersampling(tight3_foodsum_train, seed=seed)
    if 'oversampling' in dataset_names:
        datas['oversampling'] = oversampling(tight3_foodsum_train, seed=seed)

    for data in dataset_names:
        print(f"For {data} dataset!!!")
        X_test, y_test = datas['test']
        X_train, y_train = datas[data]
        print(y_train.value_counts())

        # Standardize with statistics of the training variant and apply them to the test set.
        # NOTE(review): the scaler is fitted before cross_validate, so every CV validation
        # fold contributes to the scaling statistics. Wrapping scaler + model in a Pipeline
        # would remove that; left unchanged to keep the reported numbers comparable.
        wei_train_scaler = StandardScaler()
        X_train = wei_train_scaler.fit_transform(X_train)
        X_test = wei_train_scaler.transform(X_test)
        print(f"{data} dataset loaded and scaled")

        models = (
            SVC(kernel='linear', random_state=seed, probability=True),
            RandomForestClassifier(random_state=seed, max_depth=3),
            LogisticRegression(max_iter=1000, random_state=seed),
            GradientBoostingClassifier(random_state=seed, max_depth=1, learning_rate=0.01),
        )

        for model in models:
            print("=" * 30)
            print(f"Cross Validation for {model} in fold {n_split}, with seed {seed}")
            # Stratified K-fold cross-validation on the (resampled) training set.
            skf = StratifiedKFold(n_splits=n_split, shuffle=True, random_state=seed)
            results = cross_validate(model, X_train, y_train, cv=skf, scoring=scoring, return_train_score=False)

            # Per-fold scores.
            print("각 폴드의 Recall:", results['test_recall'])
            print("각 폴드의 Precision:", results['test_precision'])
            print("각 폴드의 Accuracy:", results['test_accuracy'])

            # Fold averages, computed once and reused for printing and bookkeeping.
            mean_recall = round(np.mean(results['test_recall']), 4)
            mean_precision = round(np.mean(results['test_precision']), 4)
            mean_accuracy = round(np.mean(results['test_accuracy']), 4)
            # Header lists exactly the three values printed (no AUC is computed).
            print("평균 Recall, Precision, Accuracy:")
            print(f"{mean_recall}, {mean_precision}, {mean_accuracy}")

            # One (recall, precision, accuracy) tuple per model, in the order of `models`.
            seed_results.setdefault(seed, []).append((mean_recall, mean_precision, mean_accuracy))

            print()
            print("-- Final Model Train & Eval --")
            # Refit on the whole training variant and score on the fixed hold-out set.
            final_model = model_fitting(model, X_train, y_train)
            model_eval(final_model, X_test, y_test)
            print()

print(seed_results)

For undersampling dataset!!!
onset_tight
0    420
1    420
2    420
Name: count, dtype: int64
undersampling dataset loaded and scaled
Cross Validation for SVC(kernel='linear', probability=True, random_state=42) in fold 5, with seed 42
각 폴드의 Recall: [0.54761905 0.58333333 0.58333333 0.60714286 0.56746032]
각 폴드의 Precision: [0.53706246 0.57842919 0.58888426 0.59193644 0.56529791]
각 폴드의 Accuracy: [0.54761905 0.58333333 0.58333333 0.60714286 0.56746032]
평균 Recall, Precision, Accuracy, AUC:
0.5778, 0.5723, 0.5778

-- Final Model Train & Eval --
Recall 	 Precision 	 Acc
0.5841 0.5855 0.5841

Cross Validation for RandomForestClassifier(max_depth=3, random_state=42) in fold 5, with seed 42
각 폴드의 Recall: [0.57936508 0.57539683 0.5952381  0.56746032 0.57142857]
각 폴드의 Precision: [0.57303355 0.55951152 0.58259932 0.54673282 0.55859686]
각 폴드의 Accuracy: [0.57936508 0.57539683 0.5952381  0.56746032 0.57142857]
평균 Recall, Precision, Accuracy, AUC:
0.5778, 0.5641, 0.5778

-- Final Model Train & Eval --


{42: [(0.5778, 0.5723, 0.5778), (0.5778, 0.5641, 0.5778), (0.581, 0.5739, 0.581), (0.5556, 0.5459, 0.5556)], 59: [(0.5762, 0.5657, 0.5762), (0.5635, 0.5422, 0.5635), (0.5627, 0.5495, 0.5627), (0.5421, 0.531, 0.5421)], 63: [(0.5833, 0.5758, 0.5833), (0.5722, 0.5519, 0.5722), (0.5802, 0.5707, 0.5802), (0.5397, 0.5245, 0.5397)], 79: [(0.5817, 0.5775, 0.5817), (0.5881, 0.5726, 0.5881), (0.5897, 0.5811, 0.5897), (0.5468, 0.5472, 0.5468)], 101: [(0.5833, 0.5766, 0.5833), (0.5802, 0.5672, 0.5802), (0.5897, 0.5818, 0.5897), (0.5524, 0.5555, 0.5524)]}


# 2. Food mean

In [27]:
# Load the 3-way table for this section (file name: 3way_basic_food_mean.csv).
# NOTE: the variable is still called tight3_foodsum although it now holds the food-mean file.
tight3_foodsum = pd.read_csv("/home/user18/pnu_ckd/hexa_preprocessing_after95/0911_dl_models/data/0922_data/0922_3way_data/3way_basic_food_mean.csv")

In [28]:
# Show how many subjects fall into each (onset, onset_tight) combination before relabeling.
print(tight3_foodsum[['onset', 'onset_tight']].value_counts())

# Patients with urine protein 1+ and eGFR >= 60 (1137 subjects), i.e. onset == 1 but
# onset_tight == 0, become a separate third class with label 2.
tight3_foodsum.loc[
    tight3_foodsum['onset'].eq(1) & tight3_foodsum['onset_tight'].eq(0),
    'onset_tight',
] = 2
tight3_foodsum['onset_tight'].value_counts()  # result is discarded (not the cell's last expression)

# 'onset' is no longer needed once the 3-class label is built.
tight3_foodsum.drop(columns=['onset'], inplace=True)

onset  onset_tight
0      0              55569
1      0               1137
       1                525
Name: count, dtype: int64


In [29]:
n_split = 5

# Hold-out split. divide_testset is called without a seed argument here, so the
# same test subjects are used for every seed of the loop below.
tight3_foodsum_train, tight3_foodsum_test = divide_testset(tight3_foodsum, 0.2)        # 0.2 = fraction of label-1 rows sent to test
y_tight3_test = tight3_foodsum_test['onset_tight']
X_tight3_test = tight3_foodsum_test.drop(['RID', 'onset_tight'], axis=1)

# The un-resampled training set does not depend on the seed, so build it once.
y_tight3_foodsum_train = tight3_foodsum_train['onset_tight']
X_tight3_foodsum_train = tight3_foodsum_train.drop(['RID', 'onset_tight'], axis=1)

# Metrics collected by cross_validate. AUC is not collected in this run.
scoring = {
    'recall': 'recall_macro',        # recall for each class, then averaged
    'precision': 'precision_macro',  # precision for each class, then averaged
    'accuracy': 'accuracy',
}

# Training-set variants to evaluate; add 'original' and/or 'oversampling' to include them.
dataset_names = ['undersampling']

seed_results = {}
for seed in [42, 59, 63, 79, 101]:
    # Build only the variants that are actually evaluated, so an unused
    # resampling (e.g. SMOTE over the full training table) is never computed.
    datas = {"test": (X_tight3_test, y_tight3_test)}
    if 'original' in dataset_names:
        datas['original'] = (X_tight3_foodsum_train, y_tight3_foodsum_train)
    if 'undersampling' in dataset_names:
        datas['undersampling'] = undersampling(tight3_foodsum_train, seed=seed)
    if 'oversampling' in dataset_names:
        datas['oversampling'] = oversampling(tight3_foodsum_train, seed=seed)

    for data in dataset_names:
        print(f"For {data} dataset!!!")
        X_test, y_test = datas['test']
        X_train, y_train = datas[data]
        print(y_train.value_counts())

        # Standardize with statistics of the training variant and apply them to the test set.
        # NOTE(review): the scaler is fitted before cross_validate, so every CV validation
        # fold contributes to the scaling statistics. Wrapping scaler + model in a Pipeline
        # would remove that; left unchanged to keep the reported numbers comparable.
        wei_train_scaler = StandardScaler()
        X_train = wei_train_scaler.fit_transform(X_train)
        X_test = wei_train_scaler.transform(X_test)
        print(f"{data} dataset loaded and scaled")

        models = (
            SVC(kernel='linear', random_state=seed, probability=True),
            RandomForestClassifier(random_state=seed, max_depth=3),
            LogisticRegression(max_iter=1000, random_state=seed),
            GradientBoostingClassifier(random_state=seed, max_depth=1, learning_rate=0.01),
        )

        for model in models:
            print("=" * 30)
            print(f"Cross Validation for {model} in fold {n_split}, with seed {seed}")
            # Stratified K-fold cross-validation on the (resampled) training set.
            skf = StratifiedKFold(n_splits=n_split, shuffle=True, random_state=seed)
            results = cross_validate(model, X_train, y_train, cv=skf, scoring=scoring, return_train_score=False)

            # Per-fold scores.
            print("각 폴드의 Recall:", results['test_recall'])
            print("각 폴드의 Precision:", results['test_precision'])
            print("각 폴드의 Accuracy:", results['test_accuracy'])

            # Fold averages, computed once and reused for printing and bookkeeping.
            mean_recall = round(np.mean(results['test_recall']), 4)
            mean_precision = round(np.mean(results['test_precision']), 4)
            mean_accuracy = round(np.mean(results['test_accuracy']), 4)
            # Header lists exactly the three values printed (no AUC is computed).
            print("평균 Recall, Precision, Accuracy:")
            print(f"{mean_recall}, {mean_precision}, {mean_accuracy}")

            # One (recall, precision, accuracy) tuple per model, in the order of `models`.
            seed_results.setdefault(seed, []).append((mean_recall, mean_precision, mean_accuracy))

            print()
            print("-- Final Model Train & Eval --")
            # Refit on the whole training variant and score on the fixed hold-out set.
            final_model = model_fitting(model, X_train, y_train)
            model_eval(final_model, X_test, y_test)
            print()

print(seed_results)

For undersampling dataset!!!
onset_tight
0    420
1    420
2    420
Name: count, dtype: int64
undersampling dataset loaded and scaled
Cross Validation for SVC(kernel='linear', probability=True, random_state=42) in fold 5, with seed 42
각 폴드의 Recall: [0.54761905 0.58333333 0.58333333 0.60714286 0.56746032]
각 폴드의 Precision: [0.53706246 0.57842919 0.58888426 0.59193644 0.56529791]
각 폴드의 Accuracy: [0.54761905 0.58333333 0.58333333 0.60714286 0.56746032]
평균 Recall, Precision, Accuracy, AUC:
0.5778, 0.5723, 0.5778

-- Final Model Train & Eval --
Recall 	 Precision 	 Acc
0.5841 0.5855 0.5841

Cross Validation for RandomForestClassifier(max_depth=3, random_state=42) in fold 5, with seed 42
각 폴드의 Recall: [0.57936508 0.57539683 0.5952381  0.56746032 0.57142857]
각 폴드의 Precision: [0.57303355 0.55951152 0.58259932 0.54673282 0.55859686]
각 폴드의 Accuracy: [0.57936508 0.57539683 0.5952381  0.56746032 0.57142857]
평균 Recall, Precision, Accuracy, AUC:
0.5778, 0.5641, 0.5778

-- Final Model Train & Eval --


# 3. Food adj sum

In [30]:
# Load the 3-way table for this section (file name: 3way_basic_food_adjusted_sum.csv).
# NOTE: the variable is still called tight3_foodsum although it now holds the adjusted-sum file.
tight3_foodsum = pd.read_csv("/home/user18/pnu_ckd/hexa_preprocessing_after95/0911_dl_models/data/0922_data/0922_3way_data/3way_basic_food_adjusted_sum.csv")

In [31]:
# Show how many subjects fall into each (onset, onset_tight) combination before relabeling.
print(tight3_foodsum[['onset', 'onset_tight']].value_counts())

# Patients with urine protein 1+ and eGFR >= 60 (1137 subjects), i.e. onset == 1 but
# onset_tight == 0, become a separate third class with label 2.
tight3_foodsum.loc[
    tight3_foodsum['onset'].eq(1) & tight3_foodsum['onset_tight'].eq(0),
    'onset_tight',
] = 2
tight3_foodsum['onset_tight'].value_counts()  # result is discarded (not the cell's last expression)

# 'onset' is no longer needed once the 3-class label is built.
tight3_foodsum.drop(columns=['onset'], inplace=True)

onset  onset_tight
0      0              55569
1      0               1137
       1                525
Name: count, dtype: int64


In [32]:
n_split = 5

# Hold-out split. divide_testset is called without a seed argument here, so the
# same test subjects are used for every seed of the loop below.
tight3_foodsum_train, tight3_foodsum_test = divide_testset(tight3_foodsum, 0.2)        # 0.2 = fraction of label-1 rows sent to test
y_tight3_test = tight3_foodsum_test['onset_tight']
X_tight3_test = tight3_foodsum_test.drop(['RID', 'onset_tight'], axis=1)

# The un-resampled training set does not depend on the seed, so build it once.
y_tight3_foodsum_train = tight3_foodsum_train['onset_tight']
X_tight3_foodsum_train = tight3_foodsum_train.drop(['RID', 'onset_tight'], axis=1)

# Metrics collected by cross_validate. AUC is not collected in this run.
scoring = {
    'recall': 'recall_macro',        # recall for each class, then averaged
    'precision': 'precision_macro',  # precision for each class, then averaged
    'accuracy': 'accuracy',
}

# Training-set variants to evaluate; add 'original' and/or 'oversampling' to include them.
dataset_names = ['undersampling']

seed_results = {}
for seed in [42, 59, 63, 79, 101]:
    # Build only the variants that are actually evaluated, so an unused
    # resampling (e.g. SMOTE over the full training table) is never computed.
    datas = {"test": (X_tight3_test, y_tight3_test)}
    if 'original' in dataset_names:
        datas['original'] = (X_tight3_foodsum_train, y_tight3_foodsum_train)
    if 'undersampling' in dataset_names:
        datas['undersampling'] = undersampling(tight3_foodsum_train, seed=seed)
    if 'oversampling' in dataset_names:
        datas['oversampling'] = oversampling(tight3_foodsum_train, seed=seed)

    for data in dataset_names:
        print(f"For {data} dataset!!!")
        X_test, y_test = datas['test']
        X_train, y_train = datas[data]
        print(y_train.value_counts())

        # Standardize with statistics of the training variant and apply them to the test set.
        # NOTE(review): the scaler is fitted before cross_validate, so every CV validation
        # fold contributes to the scaling statistics. Wrapping scaler + model in a Pipeline
        # would remove that; left unchanged to keep the reported numbers comparable.
        wei_train_scaler = StandardScaler()
        X_train = wei_train_scaler.fit_transform(X_train)
        X_test = wei_train_scaler.transform(X_test)
        print(f"{data} dataset loaded and scaled")

        models = (
            SVC(kernel='linear', random_state=seed, probability=True),
            RandomForestClassifier(random_state=seed, max_depth=3),
            LogisticRegression(max_iter=1000, random_state=seed),
            GradientBoostingClassifier(random_state=seed, max_depth=1, learning_rate=0.01),
        )

        for model in models:
            print("=" * 30)
            print(f"Cross Validation for {model} in fold {n_split}, with seed {seed}")
            # Stratified K-fold cross-validation on the (resampled) training set.
            skf = StratifiedKFold(n_splits=n_split, shuffle=True, random_state=seed)
            results = cross_validate(model, X_train, y_train, cv=skf, scoring=scoring, return_train_score=False)

            # Per-fold scores.
            print("각 폴드의 Recall:", results['test_recall'])
            print("각 폴드의 Precision:", results['test_precision'])
            print("각 폴드의 Accuracy:", results['test_accuracy'])

            # Fold averages, computed once and reused for printing and bookkeeping.
            mean_recall = round(np.mean(results['test_recall']), 4)
            mean_precision = round(np.mean(results['test_precision']), 4)
            mean_accuracy = round(np.mean(results['test_accuracy']), 4)
            # Header lists exactly the three values printed (no AUC is computed).
            print("평균 Recall, Precision, Accuracy:")
            print(f"{mean_recall}, {mean_precision}, {mean_accuracy}")

            # One (recall, precision, accuracy) tuple per model, in the order of `models`.
            seed_results.setdefault(seed, []).append((mean_recall, mean_precision, mean_accuracy))

            print()
            print("-- Final Model Train & Eval --")
            # Refit on the whole training variant and score on the fixed hold-out set.
            final_model = model_fitting(model, X_train, y_train)
            model_eval(final_model, X_test, y_test)
            print()

print(seed_results)

For undersampling dataset!!!
onset_tight
0    420
1    420
2    420
Name: count, dtype: int64
undersampling dataset loaded and scaled
Cross Validation for SVC(kernel='linear', probability=True, random_state=42) in fold 5, with seed 42
각 폴드의 Recall: [0.54761905 0.58333333 0.58730159 0.60714286 0.56746032]
각 폴드의 Precision: [0.53706246 0.57842919 0.59262646 0.59193644 0.56529791]
각 폴드의 Accuracy: [0.54761905 0.58333333 0.58730159 0.60714286 0.56746032]
평균 Recall, Precision, Accuracy, AUC:
0.5786, 0.5731, 0.5786

-- Final Model Train & Eval --
Recall 	 Precision 	 Acc
0.5873 0.5881 0.5873

Cross Validation for RandomForestClassifier(max_depth=3, random_state=42) in fold 5, with seed 42
각 폴드의 Recall: [0.56349206 0.59920635 0.58333333 0.59126984 0.58333333]
각 폴드의 Precision: [0.55572836 0.59387374 0.5710728  0.57607691 0.57031626]
각 폴드의 Accuracy: [0.56349206 0.59920635 0.58333333 0.59126984 0.58333333]
평균 Recall, Precision, Accuracy, AUC:
0.5841, 0.5734, 0.5841

-- Final Model Train & Eval --


# 4. Food adj mean

In [33]:
# Load the 3-way table for this section (file name: 3way_basic_food_adjusted_mean.csv).
# NOTE: the variable is still called tight3_foodsum although it now holds the adjusted-mean file.
tight3_foodsum = pd.read_csv("/home/user18/pnu_ckd/hexa_preprocessing_after95/0911_dl_models/data/0922_data/0922_3way_data/3way_basic_food_adjusted_mean.csv")

In [34]:
# Show how many subjects fall into each (onset, onset_tight) combination before relabeling.
print(tight3_foodsum[['onset', 'onset_tight']].value_counts())

# Patients with urine protein 1+ and eGFR >= 60 (1137 subjects), i.e. onset == 1 but
# onset_tight == 0, become a separate third class with label 2.
tight3_foodsum.loc[
    tight3_foodsum['onset'].eq(1) & tight3_foodsum['onset_tight'].eq(0),
    'onset_tight',
] = 2
tight3_foodsum['onset_tight'].value_counts()  # result is discarded (not the cell's last expression)

# 'onset' is no longer needed once the 3-class label is built.
tight3_foodsum.drop(columns=['onset'], inplace=True)

onset  onset_tight
0      0              55569
1      0               1137
       1                525
Name: count, dtype: int64


In [35]:
n_split = 5

# Hold-out split. divide_testset is called without a seed argument here, so the
# same test subjects are used for every seed of the loop below.
tight3_foodsum_train, tight3_foodsum_test = divide_testset(tight3_foodsum, 0.2)        # 0.2 = fraction of label-1 rows sent to test
y_tight3_test = tight3_foodsum_test['onset_tight']
X_tight3_test = tight3_foodsum_test.drop(['RID', 'onset_tight'], axis=1)

# The un-resampled training set does not depend on the seed, so build it once.
y_tight3_foodsum_train = tight3_foodsum_train['onset_tight']
X_tight3_foodsum_train = tight3_foodsum_train.drop(['RID', 'onset_tight'], axis=1)

# Metrics collected by cross_validate. AUC is not collected in this run.
scoring = {
    'recall': 'recall_macro',        # recall for each class, then averaged
    'precision': 'precision_macro',  # precision for each class, then averaged
    'accuracy': 'accuracy',
}

# Training-set variants to evaluate; add 'original' and/or 'oversampling' to include them.
dataset_names = ['undersampling']

seed_results = {}
for seed in [42, 59, 63, 79, 101]:
    # Build only the variants that are actually evaluated, so an unused
    # resampling (e.g. SMOTE over the full training table) is never computed.
    datas = {"test": (X_tight3_test, y_tight3_test)}
    if 'original' in dataset_names:
        datas['original'] = (X_tight3_foodsum_train, y_tight3_foodsum_train)
    if 'undersampling' in dataset_names:
        datas['undersampling'] = undersampling(tight3_foodsum_train, seed=seed)
    if 'oversampling' in dataset_names:
        datas['oversampling'] = oversampling(tight3_foodsum_train, seed=seed)

    for data in dataset_names:
        print(f"For {data} dataset!!!")
        X_test, y_test = datas['test']
        X_train, y_train = datas[data]
        print(y_train.value_counts())

        # Standardize with statistics of the training variant and apply them to the test set.
        # NOTE(review): the scaler is fitted before cross_validate, so every CV validation
        # fold contributes to the scaling statistics. Wrapping scaler + model in a Pipeline
        # would remove that; left unchanged to keep the reported numbers comparable.
        wei_train_scaler = StandardScaler()
        X_train = wei_train_scaler.fit_transform(X_train)
        X_test = wei_train_scaler.transform(X_test)
        print(f"{data} dataset loaded and scaled")

        models = (
            SVC(kernel='linear', random_state=seed, probability=True),
            RandomForestClassifier(random_state=seed, max_depth=3),
            LogisticRegression(max_iter=1000, random_state=seed),
            GradientBoostingClassifier(random_state=seed, max_depth=1, learning_rate=0.01),
        )

        for model in models:
            print("=" * 30)
            print(f"Cross Validation for {model} in fold {n_split}, with seed {seed}")
            # Stratified K-fold cross-validation on the (resampled) training set.
            skf = StratifiedKFold(n_splits=n_split, shuffle=True, random_state=seed)
            results = cross_validate(model, X_train, y_train, cv=skf, scoring=scoring, return_train_score=False)

            # Per-fold scores.
            print("각 폴드의 Recall:", results['test_recall'])
            print("각 폴드의 Precision:", results['test_precision'])
            print("각 폴드의 Accuracy:", results['test_accuracy'])

            # Fold averages, computed once and reused for printing and bookkeeping.
            mean_recall = round(np.mean(results['test_recall']), 4)
            mean_precision = round(np.mean(results['test_precision']), 4)
            mean_accuracy = round(np.mean(results['test_accuracy']), 4)
            # Header lists exactly the three values printed (no AUC is computed).
            print("평균 Recall, Precision, Accuracy:")
            print(f"{mean_recall}, {mean_precision}, {mean_accuracy}")

            # One (recall, precision, accuracy) tuple per model, in the order of `models`.
            seed_results.setdefault(seed, []).append((mean_recall, mean_precision, mean_accuracy))

            print()
            print("-- Final Model Train & Eval --")
            # Refit on the whole training variant and score on the fixed hold-out set.
            final_model = model_fitting(model, X_train, y_train)
            model_eval(final_model, X_test, y_test)
            print()

print(seed_results)

For undersampling dataset!!!
onset_tight
0    420
1    420
2    420
Name: count, dtype: int64
undersampling dataset loaded and scaled
Cross Validation for SVC(kernel='linear', probability=True, random_state=42) in fold 5, with seed 42
각 폴드의 Recall: [0.54761905 0.58333333 0.58730159 0.60714286 0.56746032]
각 폴드의 Precision: [0.53706246 0.57842919 0.59262646 0.59193644 0.56529791]
각 폴드의 Accuracy: [0.54761905 0.58333333 0.58730159 0.60714286 0.56746032]
평균 Recall, Precision, Accuracy, AUC:
0.5786, 0.5731, 0.5786

-- Final Model Train & Eval --
Recall 	 Precision 	 Acc
0.5873 0.5881 0.5873

Cross Validation for RandomForestClassifier(max_depth=3, random_state=42) in fold 5, with seed 42
각 폴드의 Recall: [0.56349206 0.59920635 0.58333333 0.59126984 0.58333333]
각 폴드의 Precision: [0.55572836 0.59387374 0.5710728  0.57607691 0.57031626]
각 폴드의 Accuracy: [0.56349206 0.59920635 0.58333333 0.59126984 0.58333333]
평균 Recall, Precision, Accuracy, AUC:
0.5841, 0.5734, 0.5841

-- Final Model Train & Eval --
