In [1]:
import os
import pickle
import random
import warnings

import numpy as np
import pandas as pd
import shap
import torch
import xgboost as xgb
from matplotlib import pyplot as plt

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, make_scorer, recall_score, precision_score, accuracy_score, roc_auc_score
from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# ConvergenceWarning 경고를 무시
warnings.filterwarnings("ignore", category=ConvergenceWarning)

In [2]:
"""FUNCTIONS"""

# test에 나머지 control sample 추가해서 idx 만 반환
def divide_testset(unbalanced_data, ratio):
    # train에서 ckd, control index 확인
    total_idx = unbalanced_data.index
    ckd_idx = unbalanced_data[unbalanced_data['onset_tight'] == 1].index        # 실제 ckd
    control_idx = unbalanced_data[unbalanced_data['onset_tight'] == 0].index    # 실제 control
    # print(control_idx, ckd_idx)

    # ckd 갯수와 동일하게 control idx sampling
    rng = np.random.default_rng(seed=0) 
    sampled_ckd_idx = pd.Index(rng.choice(ckd_idx, size=int(len(ckd_idx)*ratio), replace=False))
    sampled_control_idx = pd.Index(rng.choice(control_idx, size=len(sampled_ckd_idx), replace=False)) # test_ckd 갯수와 동일하게 sampling
    
    test_idx = sampled_ckd_idx.append(sampled_control_idx)
    train_idx = total_idx.difference(test_idx)

    # return 실제 ckd, 실제 ckd 갯수와 동일한 갯수의 subject, control_idx - ckd_idx
    return unbalanced_data.loc[train_idx], unbalanced_data.loc[test_idx]

### Oversampling
def oversampling(unbalanced_dataframe, seed):
    from imblearn.over_sampling import SMOTE
    smote = SMOTE(random_state=seed)
    temp = unbalanced_dataframe.drop(['RID'], axis=1)
    X_train, y_train = smote.fit_resample(temp, temp['onset_tight'])

    # X_train에는 RID, onset_3 없음.
    return X_train.drop(['onset_tight'], axis=1), y_train

### Undersampling
# test에 나머지 control sample 추가해서 idx 만 반환
def _under_sampling_idx(unbalanced_data, seed):
    # train에서 ckd, control index 확인
    ckd_idx = unbalanced_data[unbalanced_data['onset_tight'] == 1].index        # 실제 ckd
    control_idx = unbalanced_data[unbalanced_data['onset_tight'] == 0].index    # 실제 control
    # print(control_idx, ckd_idx)

    # ckd 갯수와 동일하게 control idx sampling
    """
    Control CKD 비율 조정
    """
    rng = np.random.default_rng(seed=seed) 
    sampled_control_idx = pd.Index(rng.choice(control_idx, size=len(ckd_idx), replace=False)) # ckd 갯수와 동일하게 sampling
    not_sampled_control_idx = control_idx.difference(sampled_control_idx)

    # 잘 sampling 되었는지 확인
    assert set(sampled_control_idx).issubset(set(control_idx))
    # print(len(sampled_control_idx))

    balanced_idx = sampled_control_idx.append(ckd_idx)

    # return 실제 ckd, 실제 ckd 갯수와 동일한 갯수의 subject, control_idx - ckd_idx
    return ckd_idx, sampled_control_idx, not_sampled_control_idx, balanced_idx

def undersampling(unbalanced_data, seed):
    a, b, c, d = _under_sampling_idx(unbalanced_data, seed)
    under_sampled_data = unbalanced_data.loc[d]
    X_undersampled = under_sampled_data.drop(['RID', 'onset_tight'], axis=1)
    y_undersampled = under_sampled_data['onset_tight']
    return X_undersampled, y_undersampled

def print_cv_results_ML(results_all_seed):
    # 모델별로 시드들의 평균과 표준편차를 구하는 부분
    results_summary = {}

    # 각 시드에 저장된 모델별 결과에 접근
    for seed in results_all_seed:
        for model_name in results_all_seed[seed]:
            if model_name not in results_summary:
                results_summary[model_name] = {
                    'recall': [],
                    'precision': [],
                    'accuracy': [],
                    'auc': [],
                    'recall_std': [],
                    'precision_std': [],
                    'accuracy_std': [],
                    'auc_std': []
                }
            # 각 시드의 성능 값을 추가
            results_summary[model_name]['recall'].append(results_all_seed[seed][model_name]['recall'][0])
            results_summary[model_name]['precision'].append(results_all_seed[seed][model_name]['precision'][0])
            results_summary[model_name]['accuracy'].append(results_all_seed[seed][model_name]['accuracy'][0])
            results_summary[model_name]['auc'].append(results_all_seed[seed][model_name]['auc'][0])
            results_summary[model_name]['recall_std'].append(results_all_seed[seed][model_name]['recall_std'][0])
            results_summary[model_name]['precision_std'].append(results_all_seed[seed][model_name]['precision_std'][0])
            results_summary[model_name]['accuracy_std'].append(results_all_seed[seed][model_name]['accuracy_std'][0])
            results_summary[model_name]['auc_std'].append(results_all_seed[seed][model_name]['auc_std'][0])

    # 모델별로 평균과 표준편차 계산
    for model_name in results_summary:
        print(f"Model: {model_name}")
        
        recall_mean = np.mean(results_summary[model_name]['recall'])
        recall_std = np.mean(results_summary[model_name]['recall_std'])
        
        precision_mean = np.mean(results_summary[model_name]['precision'])
        precision_std = np.mean(results_summary[model_name]['precision_std'])
        
        accuracy_mean = np.mean(results_summary[model_name]['accuracy'])
        accuracy_std = np.mean(results_summary[model_name]['accuracy_std'])
        
        auc_mean = np.mean(results_summary[model_name]['auc'])
        auc_std = np.mean(results_summary[model_name]['auc_std'])
        
        print(f"Recall Precision Accuracy AUC")
        print(f"{recall_mean:.4f}, {precision_mean:.4f}, {accuracy_mean:.4f}, {auc_mean:.4f}")
        print(f"{recall_std:.4f}, {precision_std:.4f}, {accuracy_std:.4f}, {auc_std:.4f}")    
        print("=" * 50)
        
def get_results(y_test, final_prediction, final_probabilities):
    cm = confusion_matrix(list(y_test), list(final_prediction))
    print(cm)
    tn, fn, tp, fp  = cm[0][0], cm[1][0], cm[1][1], cm[0][1]
    recall = tp / (fn + tp)
    precision = tp / (fp + tp)
    acc = (tp + tn) / (tn + fn + tp + fp)
    
    # Calculate AUC score
    auc = roc_auc_score(y_test, final_probabilities)
    
    print("Recall \t Precision \t Acc \t AUC")
    print(f"{np.round(recall, 4)} {np.round(precision, 4)} {np.round(acc, 4)} {np.round(auc, 4)}")
    
def model_fitting(model, X_train, y_train):
    model.fit(X_train, y_train)
    return model

def model_eval(fitted_model, X_test, y_test):
    model_prediction = fitted_model.predict(X_test)
    model_probabilities = fitted_model.predict_proba(X_test)[:, 1]
    model_score = fitted_model.score(X_test, y_test)
    print(f"Score with simple {fitted_model} model")
    print(0.5, np.round(model_score, 4))     # accuracy

    get_results(y_test, model_prediction, model_probabilities)

### Use Adjusted mean food

In [3]:
root_path = "/home/user19/pnu_ckd/hexa_preprocessing_after95/0911_dl_models/data/0922_data"
total = pd.read_csv(f"{root_path}/0922_basic_food_adjusted_mean.csv")
basic = pd.read_csv(f"{root_path}/0922_basic_only.csv")
food = pd.read_csv(f"{root_path}/0922_food_adjusted_mean_only.csv")

##### No HB, ALBUMIN, TCHL

In [4]:
hbaltchl = ['CT1_HB', 'Imp_CT1_ALBUMIN', 'CT1_TCHL']

In [5]:
total_no_hb = total.drop(hbaltchl, axis=1)
basic_no_hb = basic.drop(hbaltchl, axis=1)
# food_no_hb = food.drop(hbaltchl, axis=1)

In [6]:
len(total.columns)

38

In [8]:
""" Total Dataset ( Food + Basic ) """

results_all_seed = {}

n_split = 10
for seed in [42, 59, 63, 79, 101]:    
    datas = {"original":(), "undersampling":(), "oversampling":()}

    datas['undersampling'] = undersampling(basic_no_hb, seed=seed)        # eGFR 만을 빼면 성능이 줄어든다..
    datas['oversampling'] = oversampling(basic_no_hb, seed=seed)
    y_tight3_foodsum_train = basic_no_hb['onset_tight']
    X_tight3_foodsum_train = basic_no_hb.drop(['RID', 'onset_tight'], axis=1)
    datas['original'] = (X_tight3_foodsum_train, y_tight3_foodsum_train)
    
    results_all_seed[seed] = {}  # 각 시드마다 딕셔너리 생성
    
    for data in ['undersampling']:
        print(f"For {data} dataset!!!")
        # X_test, y_test = datas['test']
        X_train, y_train = datas[data]
        
        wei_train_scaler = StandardScaler()
        X_train = wei_train_scaler.fit_transform(X_train)
        # X_test = wei_train_scaler.transform(X_test)
        print(f"{data} dataset loaded and scaled")
        print(f"X_train :: {X_train.shape}, y_train :: {y_train.shape}")
        
        scoring = {
        'recall': 'recall_macro',      # recall for each class, then averaged
        'precision': 'precision_macro',# precision for each class, then averaged
        'accuracy': 'accuracy',        # accuracy
        'auc': make_scorer(roc_auc_score, multi_class='ovr')  # AUC 계산 (이진 분류시)
        }
        
        models = (
            # SVC(kernel='linear', random_state=seed, probability=True),
            # RandomForestClassifier(random_state=seed, max_depth=3),
            # LogisticRegression(max_iter=1000, random_state=seed),
            GradientBoostingClassifier(random_state=seed, max_depth=3, learning_rate=0.01),
            # xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss', use_label_encoder=False, random_state=seed)
        )
        
        for i, model in enumerate(models):
            model_name = model.__class__.__name__  # 모델 이름 저장
            print("=" * 30)
            print(f"Cross Validation for {model} in fold {n_split}, with seed {seed}")
            # Stratified K-Fold 교차 검증 (K=5)
            skf = StratifiedKFold(n_splits=n_split, shuffle=True, random_state=seed)
            results = cross_validate(model, X_train, y_train, cv=skf, scoring=scoring, return_train_score=False)

            # 결과 출력
            print("각 폴드의 Recall:", results['test_recall'])
            print("각 폴드의 Precision:", results['test_precision'])
            print("각 폴드의 Accuracy:", results['test_accuracy'])
            print("각 폴드의 AUC:", results['test_auc'])

            # 결과 저장
            if model_name not in results_all_seed[seed]:
                results_all_seed[seed][model_name] = {
                    'recall': [], 
                    'precision': [], 
                    'accuracy': [], 
                    'auc': [],
                    'recall_std': [], 
                    'precision_std': [], 
                    'accuracy_std': [], 
                    'auc_std': []
                }
            results_all_seed[seed][model_name]['recall'].append(np.mean(results['test_recall']))
            results_all_seed[seed][model_name]['precision'].append(np.mean(results['test_precision']))
            results_all_seed[seed][model_name]['accuracy'].append(np.mean(results['test_accuracy']))
            results_all_seed[seed][model_name]['auc'].append(np.mean(results['test_auc']))
            results_all_seed[seed][model_name]['recall_std'].append(np.std(results['test_recall']))
            results_all_seed[seed][model_name]['precision_std'].append(np.std(results['test_precision']))
            results_all_seed[seed][model_name]['accuracy_std'].append(np.std(results['test_accuracy']))
            results_all_seed[seed][model_name]['auc_std'].append(np.std(results['test_auc']))
            
            
            # 평균값 계산
            print("평균 Recall, Precision, Accuracy, AUC:")
            print(f"{round(np.mean(results['test_recall']), 4)}, {round(np.mean(results['test_precision']),4)}, {round(np.mean(results['test_accuracy']), 4)}, {round(np.mean(results['test_auc']), 4)}")
            print("표준편차 Recall, Precision, Accuracy, AUC:")
            print(f"{round(np.std(results['test_recall']), 4)}, {round(np.std(results['test_precision']),4)}, {round(np.std(results['test_accuracy']), 4)}, {round(np.std(results['test_auc']), 4)}")
            print()
            
print_cv_results_ML(results_all_seed)

For undersampling dataset!!!
undersampling dataset loaded and scaled
X_train :: (1050, 12), y_train :: (1050,)
Cross Validation for GradientBoostingClassifier(learning_rate=0.01, random_state=42) in fold 10, with seed 42
각 폴드의 Recall: [0.80986938 0.85703919 0.77231495 0.78120464 0.80025399 0.94285196
 0.84760522 0.88588534 0.82855588 0.82928157]
각 폴드의 Precision: [0.81122449 0.85729847 0.78150788 0.78181818 0.80090909 0.94285196
 0.84760522 0.88616558 0.82855588 0.83611111]
각 폴드의 Accuracy: [0.80952381 0.85714286 0.77142857 0.78095238 0.8        0.94285714
 0.84761905 0.88571429 0.82857143 0.82857143]
각 폴드의 AUC: [0.80986938 0.85703919 0.77231495 0.78120464 0.80025399 0.94285196
 0.84760522 0.88588534 0.82855588 0.82928157]
평균 Recall, Precision, Accuracy, AUC:
0.8355, 0.8374, 0.8352, 0.8355
표준편차 Recall, Precision, Accuracy, AUC:
0.0486, 0.0472, 0.0488, 0.0486

For undersampling dataset!!!
undersampling dataset loaded and scaled
X_train :: (1050, 12), y_train :: (1050,)
Cross Validation fo

##### 기저질환 환자 명수 확인

- 기저질환이 모두 없는 환자 : 37943 ( 약 66.30% )

In [25]:
col_giju = ['new_CT1_gohyeol', 'CT1_dangnyo', 'CT1_simhyeol']
print(total[col_giju].value_counts())
print((total[col_giju].value_counts() / total[col_giju].value_counts().sum()) * 100)

new_CT1_gohyeol  CT1_dangnyo  CT1_simhyeol
0                0            0               37943
1                0            0               12542
0                1            0                2382
1                1            0                2239
                 0            1                 844
0                0            1                 836
1                1            1                 278
0                1            1                 167
Name: count, dtype: int64
new_CT1_gohyeol  CT1_dangnyo  CT1_simhyeol
0                0            0               66.297985
1                0            0               21.914697
0                1            0                4.162080
1                1            0                3.912215
                 0            1                1.474725
0                0            1                1.460747
1                1            1                0.485751
0                1            1                0.291800
Name: count, dtype: floa

- 기저 질환이 모두 없는 환자 ( 37943명 중에서 CKD 발생이 몇 명인지 확인)
- 37943명 중 Control 37814명, CKD 129명 → CKD 비율 약 0.34 %

In [26]:
no_jilhwan_total = total[(total['new_CT1_gohyeol'] == 0) & (total['CT1_dangnyo'] == 0) & (total['CT1_simhyeol'] == 0)]
no_jilhwan_basic = basic[(total['new_CT1_gohyeol'] == 0) & (total['CT1_dangnyo'] == 0) & (total['CT1_simhyeol'] == 0)]

# 고혈압, 당뇨, 심혈관질환 컬럼 제외
no_jilhwan_total = no_jilhwan_total.drop(col_giju, axis=1)      # 12 + 21
no_jilhwan_basic = no_jilhwan_basic.drop(col_giju, axis=1)      # 12

# 추가로 TC, albumin, HB 컬럼 제외
no_jilhwan_total_no_hb = no_jilhwan_total.drop(hbaltchl, axis=1)        # 12 + 21 - 3
no_jilhwan_basic_no_hb = no_jilhwan_basic.drop(hbaltchl, axis=1)        # 12 -3

In [27]:
print(no_jilhwan_basic['onset_tight'].value_counts())
print((no_jilhwan_total['onset_tight'].value_counts() / no_jilhwan_total['onset_tight'].value_counts().sum())*100)

onset_tight
0    37814
1      129
Name: count, dtype: int64
onset_tight
0    99.660016
1     0.339984
Name: count, dtype: float64


In [35]:
""" Total Dataset ( Food + Basic ) """

results_all_seed = {}

n_split = 10
for seed in [42, 59, 63, 79, 101]:    
    datas = {"original":(), "undersampling":(), "oversampling":()}

    datas['undersampling'] = undersampling(no_jilhwan_basic_no_hb, seed=seed)        # eGFR 만을 빼면 성능이 줄어든다..
    datas['oversampling'] = oversampling(no_jilhwan_basic_no_hb, seed=seed)
    y_tight3_foodsum_train = no_jilhwan_basic_no_hb['onset_tight']
    X_tight3_foodsum_train = no_jilhwan_basic_no_hb.drop(['RID', 'onset_tight'], axis=1)
    datas['original'] = (X_tight3_foodsum_train, y_tight3_foodsum_train)
    
    results_all_seed[seed] = {}  # 각 시드마다 딕셔너리 생성
    
    for data in ['undersampling']:
        print(f"For {data} dataset!!!")
        # X_test, y_test = datas['test']
        X_train, y_train = datas[data]
        
        wei_train_scaler = StandardScaler()
        X_train = wei_train_scaler.fit_transform(X_train)
        # X_test = wei_train_scaler.transform(X_test)
        print(f"{data} dataset loaded and scaled")
        print(f"X_train :: {X_train.shape}, y_train :: {y_train.shape}")
        
        scoring = {
        'recall': 'recall_macro',      # recall for each class, then averaged
        'precision': 'precision_macro',# precision for each class, then averaged
        'accuracy': 'accuracy',        # accuracy
        'auc': make_scorer(roc_auc_score, multi_class='ovr')  # AUC 계산 (이진 분류시)
        }
        
        models = (
            SVC(kernel='linear', random_state=seed, probability=True),
            RandomForestClassifier(random_state=seed, max_depth=3),
            LogisticRegression(max_iter=1000, random_state=seed),
            GradientBoostingClassifier(random_state=seed, max_depth=3, learning_rate=0.01),
            # xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss', use_label_encoder=False, random_state=seed)
        )
        
        for i, model in enumerate(models):
            model_name = model.__class__.__name__  # 모델 이름 저장
            print("=" * 30)
            print(f"Cross Validation for {model} in fold {n_split}, with seed {seed}")
            # Stratified K-Fold 교차 검증 (K=5)
            skf = StratifiedKFold(n_splits=n_split, shuffle=True, random_state=seed)
            results = cross_validate(model, X_train, y_train, cv=skf, scoring=scoring, return_train_score=False)

            # 결과 출력
            print("각 폴드의 Recall:", results['test_recall'])
            print("각 폴드의 Precision:", results['test_precision'])
            print("각 폴드의 Accuracy:", results['test_accuracy'])
            print("각 폴드의 AUC:", results['test_auc'])

            # 결과 저장
            if model_name not in results_all_seed[seed]:
                results_all_seed[seed][model_name] = {
                    'recall': [], 
                    'precision': [], 
                    'accuracy': [], 
                    'auc': [],
                    'recall_std': [], 
                    'precision_std': [], 
                    'accuracy_std': [], 
                    'auc_std': []
                }
            results_all_seed[seed][model_name]['recall'].append(np.mean(results['test_recall']))
            results_all_seed[seed][model_name]['precision'].append(np.mean(results['test_precision']))
            results_all_seed[seed][model_name]['accuracy'].append(np.mean(results['test_accuracy']))
            results_all_seed[seed][model_name]['auc'].append(np.mean(results['test_auc']))
            results_all_seed[seed][model_name]['recall_std'].append(np.std(results['test_recall']))
            results_all_seed[seed][model_name]['precision_std'].append(np.std(results['test_precision']))
            results_all_seed[seed][model_name]['accuracy_std'].append(np.std(results['test_accuracy']))
            results_all_seed[seed][model_name]['auc_std'].append(np.std(results['test_auc']))
            
            
            # 평균값 계산
            print("평균 Recall, Precision, Accuracy, AUC:")
            print(f"{round(np.mean(results['test_recall']), 4)}, {round(np.mean(results['test_precision']),4)}, {round(np.mean(results['test_accuracy']), 4)}, {round(np.mean(results['test_auc']), 4)}")
            print("표준편차 Recall, Precision, Accuracy, AUC:")
            print(f"{round(np.std(results['test_recall']), 4)}, {round(np.std(results['test_precision']),4)}, {round(np.std(results['test_accuracy']), 4)}, {round(np.std(results['test_auc']), 4)}")
            print()
            
print_cv_results_ML(results_all_seed)

For undersampling dataset!!!
undersampling dataset loaded and scaled
X_train :: (258, 9), y_train :: (258,)
Cross Validation for SVC(kernel='linear', probability=True, random_state=42) in fold 10, with seed 42
각 폴드의 Recall: [0.92307692 0.76923077 0.80769231 0.80769231 0.88461538 0.84615385
 0.84615385 0.76923077 0.79807692 0.83974359]
각 폴드의 Precision: [0.93333333 0.77575758 0.80952381 0.80952381 0.88690476 0.88235294
 0.88235294 0.77575758 0.80194805 0.83974359]
각 폴드의 Accuracy: [0.92307692 0.76923077 0.80769231 0.80769231 0.88461538 0.84615385
 0.84615385 0.76923077 0.8        0.84      ]
각 폴드의 AUC: [0.92307692 0.76923077 0.80769231 0.80769231 0.88461538 0.84615385
 0.84615385 0.76923077 0.79807692 0.83974359]
평균 Recall, Precision, Accuracy, AUC:
0.8292, 0.8397, 0.8294, 0.8292
표준편차 Recall, Precision, Accuracy, AUC:
0.0465, 0.051, 0.0464, 0.0465

Cross Validation for RandomForestClassifier(max_depth=3, random_state=42) in fold 10, with seed 42
각 폴드의 Recall: [0.88461538 0.84615385 0.8076

### shap value load

In [49]:
import pickle

# 저장된 shap_values_all_models 불러오기
shap_values_save_path = '/home/user19/pnu_ckd/hexa_preprocessing_after95/0911_dl_models/shap_figs/basic_foodadjmean_under_seed42/shap_values_all_models.pk'  # 저장된 경로 지정

# pickle을 사용해 파일 불러오기
with open(shap_values_save_path, 'rb') as f:
    shap_values_all_models_loaded = pickle.load(f)

print(shap_values_all_models_loaded.keys())
shap_abs_mean = shap_values_all_models_loaded['LogisticRegression'][0]

dict_keys(['SVC', 'RandomForest', 'LogisticRegression', 'GradientBoosting', 'XGBoost'])


(105, 36)

In [81]:
for i in range(10):
    print(f"{i+1}-th fold shap scale")
    print(np.abs(shap_values_all_models_loaded['LogisticRegression'][i]).max(), np.abs(shap_values_all_models_loaded['LogisticRegression'][i]).min())
    print()

1-th fold shap scale
4.286749061609158 1.41187184172254e-06

2-th fold shap scale
3.747013758829214 5.076857247376431e-07

3-th fold shap scale
4.303282668124774 1.591530099128552e-05

4-th fold shap scale
3.5290529379354543 2.8820354409382918e-06

5-th fold shap scale
3.580234334300884 1.7128125810304588e-06

6-th fold shap scale
4.982286312347576 4.299012648649807e-06

7-th fold shap scale
3.4649812373145394 6.755026160410896e-05

8-th fold shap scale
2.9875469628432634 9.053051306757784e-07

9-th fold shap scale
4.017277107283533 5.38522143218238e-06

10-th fold shap scale
4.13459710829984 1.3408297561485307e-06



In [82]:
for i in range(10):
    print(f"{i+1}-th fold shap scale")
    print(np.abs(shap_values_all_models_loaded['GradientBoosting'][i]).max(), np.abs(shap_values_all_models_loaded['GradientBoosting'][i]).min())
    print()

1-th fold shap scale
1.2479852890957288 0.0

2-th fold shap scale
1.212216992917501 0.0

3-th fold shap scale
1.4257061903826438 0.0

4-th fold shap scale
1.2246024438149983 0.0

5-th fold shap scale
1.1816490861238718 0.0

6-th fold shap scale
1.282482581213648 0.0

7-th fold shap scale
1.2406411938535014 0.0

8-th fold shap scale
1.3375645971275252 0.0

9-th fold shap scale
1.2465252556032014 0.0

10-th fold shap scale
1.298005765441243 0.0



In [88]:
for i in range(10):
    print(f"{i+1}-th fold shap scale")
    print(np.abs(shap_values_all_models_loaded['SVC'][i][1]).max(), np.abs(shap_values_all_models_loaded['SVC'][i][1]).min())
    print()

1-th fold shap scale
0.43232088168233 0.0

2-th fold shap scale
0.4904088140437238 0.0

3-th fold shap scale
0.5157758828684269 0.0

4-th fold shap scale
0.4577115588539231 0.0

5-th fold shap scale
0.38982626043568236 0.0

6-th fold shap scale
0.4948495550605818 0.0

7-th fold shap scale
0.4490888622274815 0.0

8-th fold shap scale
0.40417296415978415 0.0

9-th fold shap scale
0.4209066783995242 0.0

10-th fold shap scale
0.5070299408536776 0.0



In [90]:
for i in range(10):
    print(f"{i+1}-th fold shap scale")
    print(np.abs(shap_values_all_models_loaded['RandomForest'][i][1]).max(), np.abs(shap_values_all_models_loaded['RandomForest'][i][1]).min())
    print(np.abs(shap_values_all_models_loaded['RandomForest'][i][1]).mean(axis=0))
    print()

1-th fold shap scale
0.1587132695610463 0.0
[0.00461031 0.07612653 0.10156494 0.01755901 0.01475149 0.00060418
 0.00131622 0.00062813 0.00080405 0.00758513 0.         0.00032048
 0.03386446 0.01429116 0.00409383 0.00181028 0.02324033 0.00134914
 0.00610387 0.00228371 0.00076543 0.00243884 0.00117226 0.00219424
 0.000522   0.0007942  0.00242447 0.00037002 0.00115734 0.00150833
 0.00260207 0.00061042 0.00032596 0.00109502 0.00597259 0.00346242]

2-th fold shap scale
0.16300538214938887 1.293182878623055e-07
[2.58343151e-03 7.11849080e-02 1.03565634e-01 1.71076449e-02
 1.11386901e-02 8.45767539e-04 2.04384754e-03 8.13740509e-04
 1.86978166e-03 7.33711092e-03 6.87847928e-05 7.97422812e-04
 2.89678703e-02 1.83408810e-02 2.67170763e-03 1.32452150e-03
 2.44576532e-02 1.12481287e-03 5.22354139e-03 1.37103208e-03
 7.09749599e-04 9.26858193e-04 9.06328362e-04 9.54541071e-04
 1.20496955e-03 9.04167786e-04 5.77481085e-03 6.08010061e-04
 1.22728031e-03 7.90397647e-04 6.44623133e-04 7.42860043e-04
 

In [108]:
for i in range(10):
    print(shap_values_all_models_loaded['SVC'][i][0].shape)

(105, 36)
(105, 36)
(105, 36)
(105, 36)
(105, 36)
(105, 36)
(105, 36)
(105, 36)
(105, 36)
(105, 36)


In [101]:
shap_values_all_models_loaded['SVC'][0][1].shape

(105, 36)

### Bayesian model 성능확인

In [4]:
print(total.shape, basic.shape, food.shape)

(57231, 38) (57231, 17) (57231, 23)


In [22]:
""" Total Dataset ( Food + Basic ) """

results_all_seed = {}

# total, basic, food
a, b = divide_testset(total, ratio=0.1)

n_split = 10
for seed in [42, 59, 63, 79, 101]:    
# for seed in [42]:
    datas = {"original":(), "undersampling":(), "oversampling":()}

    datas['undersampling'] = undersampling(a, seed=seed)        # eGFR 만을 빼면 성능이 줄어든다..
    datas['oversampling'] = oversampling(a, seed=seed)
    y_tight3_foodsum_train = a['onset_tight']
    X_tight3_foodsum_train = a.drop(['RID', 'onset_tight'], axis=1)
    datas['original'] = (X_tight3_foodsum_train, y_tight3_foodsum_train)
    
    results_all_seed[seed] = {}  # 각 시드마다 딕셔너리 생성
    
    for data in ['oversampling']:
        print(f"For {data} dataset!!!")
        # X_test, y_test = datas['test']
        X_train, y_train = datas[data]
        
        # wei_train_scaler = StandardScaler()
        # X_train = wei_train_scaler.fit_transform(X_train)
        # X_test = wei_train_scaler.transform(X_test)
        print(f"{data} dataset loaded and scaled")
        print(f"X_train :: {X_train.shape}, y_train :: {y_train.shape}")
        
        models = (
            # SVC(kernel='linear', random_state=seed, probability=True),
            # RandomForestClassifier(random_state=seed, max_depth=3),
            LogisticRegression(max_iter=1000, random_state=seed),
            # GradientBoostingClassifier(random_state=seed, max_depth=3, learning_rate=0.01),
            # GaussianNB(var_smoothing=1e-7),     # 1e-11, 1e-10, 1e-9, 1e-8, 1e-7
            # xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss', use_label_encoder=False, random_state=seed)
        )
        
        skf = StratifiedKFold(n_splits=n_split, shuffle=True, random_state=seed)
            
        for fold_num, (train_index, test_index) in enumerate(skf.split(X_train, y_train), 1):
            # Train/Test 데이터 분할
            X_fold_train, X_fold_test = X_train.iloc[train_index], X_train.iloc[test_index]
            y_fold_train, y_fold_test = y_train.iloc[train_index], y_train.iloc[test_index]

            for model in models:
                print(f"Training {model.__class__.__name__} model on fold {fold_num}...")
                model.fit(X_fold_train, y_fold_train)
                print(f"Model fitted!!")
        
        y_tight3_foodsum_test = b['onset_tight']
        X_tight3_foodsum_test = b.drop(['RID', 'onset_tight'], axis=1)
        
        for trained_model in models:
            print("="*30)
            print(f"Predicting {model.__class__.__name__}!!")
            preds = trained_model.predict(X_tight3_foodsum_test)
            model_probabilities = trained_model.predict_proba(X_tight3_foodsum_test)[:, 1]
            print("=" * 30)
            print(seed)
            get_results(y_tight3_foodsum_test, np.array(preds), model_probabilities)
            print("=" * 30)
            
# print_cv_results_ML(results_all_seed)

For oversampling dataset!!!
oversampling dataset loaded and scaled
X_train :: (113308, 36), y_train :: (113308,)
Training LogisticRegression model on fold 1...


KeyboardInterrupt: 

In [18]:
results_all_seed = {}

# total, basic, food
a, b = divide_testset(total, ratio=0.1)

In [17]:
""" Total Dataset ( Food + Basic ) """

# results_all_seed = {}

# # total, basic, food
# a, b = divide_testset(total, ratio=0.1)

n_split = 10
for seed in [42, 59, 63, 79, 101]:    
# for seed in [42]:    
    datas = {"original":(), "undersampling":(), "oversampling":()}

    datas['undersampling'] = undersampling(a, seed=seed)        # eGFR 만을 빼면 성능이 줄어든다..
    datas['oversampling'] = oversampling(a, seed=seed)
    y_tight3_foodsum_train = a['onset_tight']
    X_tight3_foodsum_train = a.drop(['RID', 'onset_tight'], axis=1)
    datas['original'] = (X_tight3_foodsum_train, y_tight3_foodsum_train)
    
    results_all_seed[seed] = {}  # 각 시드마다 딕셔너리 생성
    
    for data in ['original']:
        print(f"For {data} dataset!!!")
        # X_test, y_test = datas['test']
        X_train, y_train = datas[data]
        print(X_train.shape, y_train.shape)
        
        # wei_train_scaler = StandardScaler()
        # X_train = wei_train_scaler.fit_transform(X_train)
        # X_test = wei_train_scaler.transform(X_test)
        print(f"{data} dataset loaded and scaled")
        print(f"X_train :: {X_train.shape}, y_train :: {y_train.shape}")
        
        models = (
            # SVC(kernel='linear', random_state=seed, probability=True),
            # RandomForestClassifier(random_state=seed, max_depth=3),
            # LogisticRegression(max_iter=1000, random_state=seed),
            GradientBoostingClassifier(random_state=seed, max_depth=3, learning_rate=0.01),
            # GaussianNB(var_smoothing=1e-7),     # 1e-11, 1e-10, 1e-9, 1e-8, 1e-7
            # xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss', use_label_encoder=False, random_state=seed)
        )
        
        # skf = StratifiedKFold(n_splits=n_split, shuffle=True, random_state=seed)
            
        # for fold_num, (train_index, test_index) in enumerate(skf.split(X_train, y_train), 1):
        #     # Train/Test 데이터 분할
        #     X_fold_train, X_fold_test = X_train.iloc[train_index], X_train.iloc[test_index]
        #     y_fold_train, y_fold_test = y_train.iloc[train_index], y_train.iloc[test_index]

        #     for model in models:
        #         print(f"Training {model.__class__.__name__} model on fold {fold_num}...")
        #         model.fit(X_fold_train, y_fold_train)
        #         print(f"Model fitted!!")
        
        model = models[0]
        model.fit(X_train, y_train)
        print(f"Model fitted!!")
        
        y_tight3_foodsum_test = b['onset_tight']
        X_tight3_foodsum_test = b.drop(['RID', 'onset_tight'], axis=1)
        
        for trained_model in models:
            print("="*30)
            print(f"Predicting {model.__class__.__name__}!!")
            preds = trained_model.predict(X_tight3_foodsum_test)
            model_probabilities = trained_model.predict_proba(X_tight3_foodsum_test)[:, 1]
            print("=" * 30)
            print(seed)
            get_results(y_tight3_foodsum_test, np.array(preds), model_probabilities)
            print("=" * 30)
            
# print_cv_results_ML(results_all_seed)

For original dataset!!!
(57127, 21) (57127,)
original dataset loaded and scaled
X_train :: (57127, 21), y_train :: (57127,)
Model fitted!!
Predicting GaussianNB!!
42
[[52  0]
 [52  0]]
Recall 	 Precision 	 Acc 	 AUC
0.0 nan 0.5 0.5411
For original dataset!!!
(57127, 21) (57127,)
original dataset loaded and scaled
X_train :: (57127, 21), y_train :: (57127,)
Model fitted!!
Predicting GaussianNB!!
59
[[52  0]
 [52  0]]
Recall 	 Precision 	 Acc 	 AUC
0.0 nan 0.5 0.5411


invalid value encountered in long_scalars
invalid value encountered in long_scalars


For original dataset!!!
(57127, 21) (57127,)
original dataset loaded and scaled
X_train :: (57127, 21), y_train :: (57127,)
Model fitted!!
Predicting GaussianNB!!
63
[[52  0]
 [52  0]]
Recall 	 Precision 	 Acc 	 AUC
0.0 nan 0.5 0.5411
For original dataset!!!
(57127, 21) (57127,)
original dataset loaded and scaled
X_train :: (57127, 21), y_train :: (57127,)
Model fitted!!
Predicting GaussianNB!!
79
[[52  0]
 [52  0]]
Recall 	 Precision 	 Acc 	 AUC
0.0 nan 0.5 0.5411


invalid value encountered in long_scalars
invalid value encountered in long_scalars


For original dataset!!!
(57127, 21) (57127,)
original dataset loaded and scaled
X_train :: (57127, 21), y_train :: (57127,)
Model fitted!!
Predicting GaussianNB!!
101
[[52  0]
 [52  0]]
Recall 	 Precision 	 Acc 	 AUC
0.0 nan 0.5 0.5411


invalid value encountered in long_scalars
