# 아래 경로의 폴더 안에 있는 파일들에 대해서 ML models 실험.
경로 : "/home/user18/pnu_ckd/hexa_preprocessing_after95/0911_dl_models/data/0922_data"

- labeling 기준 : eGFR < 60 기준 만을 사용. => tight3.csv 파일 사용.
- original vs under-sampling vs over-sampling
    - (basic) vs (food feature) vs (basic + food feature)
- SVM, RF, LR, GBC

In [2]:
import random
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import torch
from scipy import stats
from scipy.stats import randint, loguniform


from sklearn import preprocessing
from sklearn import tree

from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV 

from sklearn.svm import SVC
from sklearn.metrics import make_scorer, roc_auc_score, confusion_matrix

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression 

In [3]:
# set the seed
def set_seed(seed):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU generator, plus every CUDA device when CUDA is available).

    Args:
        seed: Seed value handed unchanged to each library.
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)

    # Nothing more to seed on a machine without CUDA.
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)

# Notebook-wide seed: applied to the global RNGs here and reused further down
# as `random_state` for SMOTE and for the RF / LR / GBC models.
seed = 1109
set_seed(seed)

#### functions

In [4]:
# Hold out a class-balanced test set and return the (train, test) DataFrames.
def divide_testset(unbalanced_data, ratio, label_col='onset_tight', random_state=42):
    """Split an imbalanced table into a training part and a balanced test part.

    ``int(n_positive * ratio)`` rows with label 1 are drawn without
    replacement, followed by the same number of rows with label 0. Those rows
    form the test set; every remaining row forms the training set.

    Args:
        unbalanced_data: DataFrame holding a binary (0/1) label column. The
            split is done on index labels, so the index is expected to be
            unique.
        ratio: Fraction of the positive rows to move into the test set;
            must lie in [0, 1].
        label_col: Name of the label column. Defaults to ``'onset_tight'``.
        random_state: Seed for the dedicated NumPy ``Generator`` used for the
            draw. The default (42) reproduces the original fixed split and is
            independent of the global NumPy seed.

    Returns:
        Tuple ``(train_df, test_df)``. ``test_df`` lists the sampled positives
        first, then the sampled negatives. ``train_df`` follows the order
        produced by ``Index.difference`` (sorted when the labels are sortable).

    Raises:
        ValueError: If ``ratio`` is outside [0, 1], or if there are fewer
            negative rows than positives requested for the test set.
    """
    if not 0 <= ratio <= 1:
        raise ValueError(f"ratio must be within [0, 1], got {ratio!r}")

    total_idx = unbalanced_data.index
    labels = unbalanced_data[label_col]
    ckd_idx = unbalanced_data[labels == 1].index        # positive (CKD) rows
    control_idx = unbalanced_data[labels == 0].index    # negative (control) rows

    n_test_per_class = int(len(ckd_idx) * ratio)
    if n_test_per_class > len(control_idx):
        raise ValueError(
            f"cannot draw {n_test_per_class} control rows for the test set: "
            f"only {len(control_idx)} available"
        )

    # Positives are drawn first, then controls, from the same generator; this
    # order determines which rows end up in the test set for a given seed.
    rng = np.random.default_rng(seed=random_state)
    sampled_ckd_idx = pd.Index(rng.choice(ckd_idx, size=n_test_per_class, replace=False))
    sampled_control_idx = pd.Index(
        rng.choice(control_idx, size=len(sampled_ckd_idx), replace=False)
    )

    test_idx = sampled_ckd_idx.append(sampled_control_idx)
    train_idx = total_idx.difference(test_idx)

    return unbalanced_data.loc[train_idx], unbalanced_data.loc[test_idx]

In [5]:
### Oversampling
def oversampling(unbalanced_dataframe):
    """Balance the classes by generating synthetic samples with SMOTE.

    Args:
        unbalanced_dataframe: DataFrame containing the columns 'RID' and
            'onset_tight' (label) plus the feature columns.

    Returns:
        Tuple ``(X_train, y_train)`` as returned by ``SMOTE.fit_resample``:
        the resampled features (without 'RID' and 'onset_tight') and the
        resampled labels.
    """
    # Function-scope import: imbalanced-learn is only required when this
    # function actually runs.
    from imblearn.over_sampling import SMOTE

    # Separate the label from the features *before* resampling, so that the
    # label column is never part of the matrix SMOTE searches and interpolates.
    features = unbalanced_dataframe.drop(['RID', 'onset_tight'], axis=1)
    target = unbalanced_dataframe['onset_tight']

    # `seed` is the notebook-level seed defined near the top of the file.
    smote = SMOTE(random_state=seed)
    X_train, y_train = smote.fit_resample(features, target)

    return X_train, y_train

### Undersampling
# test에 나머지 control sample 추가해서 idx 만 반환
def _under_sampling_idx(unbalanced_data):
    # train에서 ckd, control index 확인
    ckd_idx = unbalanced_data[unbalanced_data['onset_tight'] == 1].index        # 실제 ckd
    control_idx = unbalanced_data[unbalanced_data['onset_tight'] == 0].index    # 실제 control
    # print(control_idx, ckd_idx)

    # ckd 갯수와 동일하게 control idx sampling
    sampled_control_idx = pd.Index(np.random.choice(control_idx, size=len(ckd_idx), replace=False)) # ckd 갯수와 동일하게 sampling
    not_sampled_control_idx = control_idx.difference(sampled_control_idx)

    # 잘 sampling 되었는지 확인
    assert set(sampled_control_idx).issubset(set(control_idx))
    # print(len(sampled_control_idx))

    balanced_idx = sampled_control_idx.append(ckd_idx)

    # return 실제 ckd, 실제 ckd 갯수와 동일한 갯수의 subject, control_idx - ckd_idx
    return ckd_idx, sampled_control_idx, not_sampled_control_idx, balanced_idx

def undersampling(unbalanced_data):
    """Build a 1:1 under-sampled (features, labels) pair.

    Args:
        unbalanced_data: DataFrame with 'RID', 'onset_tight' and feature columns.

    Returns:
        Tuple ``(X_undersampled, y_undersampled)``: the balanced rows without
        'RID' / 'onset_tight', and their 'onset_tight' labels.
    """
    # Only the last element (the balanced index) of the helper's result is needed.
    *_, balanced_idx = _under_sampling_idx(unbalanced_data)
    balanced_rows = unbalanced_data.loc[balanced_idx]

    y_undersampled = balanced_rows['onset_tight']
    X_undersampled = balanced_rows.drop(columns=['RID', 'onset_tight'])
    return X_undersampled, y_undersampled

In [6]:
# def get_results(y_test, final_prediction):
#     cm = confusion_matrix(list(y_test), list(final_prediction))
#     print(cm)
#     tn, fn, tp, fp  = cm[0][0], cm[1][0], cm[1][1], cm[0][1]
#     recall = tp / (fn + tp)
#     precision = tp / (fp + tp)
#     acc = (tp + tn) / (tn + fn + tp+fp)
#     print("Recall \t Precision \t Acc")
#     print(f"{np.round(recall,4)} {np.round(precision,4)} {np.round(acc,4)}")

def get_results(y_test, final_prediction, final_probabilities):
    """Print the confusion matrix, recall, precision, accuracy and ROC AUC.

    Args:
        y_test: True binary labels (0 = control, 1 = CKD).
        final_prediction: Predicted labels, same length as ``y_test``.
        final_probabilities: Scores for the positive class, used for the AUC.

    Returns:
        Dict with the keys ``'recall'``, ``'precision'``, ``'acc'`` and
        ``'auc'``. A ratio whose denominator is zero (e.g. precision when no
        sample is predicted positive) is reported as ``nan``.

    Raises:
        ValueError: Propagated from ``roc_auc_score``, e.g. when ``y_test``
            contains a single class.
    """
    # Fixing the label order guarantees a 2x2 matrix laid out as
    # [[tn, fp], [fn, tp]] even if one class is missing from the inputs.
    cm = confusion_matrix(list(y_test), list(final_prediction), labels=[0, 1])
    print(cm)
    tn, fp, fn, tp = cm.ravel()

    # Undefined ratios are set to nan explicitly instead of dividing 0 by 0,
    # which would print the same "nan" but also emit a RuntimeWarning.
    recall = tp / (fn + tp) if (fn + tp) > 0 else np.nan
    precision = tp / (fp + tp) if (fp + tp) > 0 else np.nan
    total = tn + fn + tp + fp
    acc = (tp + tn) / total if total > 0 else np.nan

    # Calculate AUC score
    auc = roc_auc_score(y_test, final_probabilities)

    print("Recall \t Precision \t Acc \t AUC")
    print(f"{np.round(recall, 4)} {np.round(precision, 4)} {np.round(acc, 4)} {np.round(auc, 4)}")

    return {"recall": recall, "precision": precision, "acc": acc, "auc": auc}

In [7]:
def model_fitting(model, X_train, y_train):
    """Fit ``model`` on the training data and return that same object.

    Args:
        model: Any object exposing ``fit(X, y)``.
        X_train: Training features.
        y_train: Training labels.

    Returns:
        The object passed in as ``model`` (whatever ``fit`` returns is ignored).
    """
    fitted_model = model
    fitted_model.fit(X_train, y_train)
    return fitted_model

def model_eval(fitted_model, X_test, y_test):
    """Evaluate a fitted classifier on the test set and print its metrics.

    Prints the model's ``score`` on the test data, then delegates to
    ``get_results`` for the confusion matrix and derived metrics.

    Args:
        fitted_model: Fitted estimator exposing ``predict``, ``predict_proba``
            and ``score``.
        X_test: Test features.
        y_test: True test labels.
    """
    predicted_labels = fitted_model.predict(X_test)
    # Column 1 of predict_proba is passed on as the positive-class score.
    positive_scores = fitted_model.predict_proba(X_test)[:, 1]
    test_score = fitted_model.score(X_test, y_test)

    print(f"Score with simple {fitted_model} model")
    # NOTE(review): the literal 0.5 is printed as-is; presumably it labels the
    # decision threshold - confirm.
    print(0.5, np.round(test_score, 4))     # accuracy

    get_results(y_test, predicted_labels, positive_scores)

# 1. Food sum

In [8]:
# Section 1 input: basic covariates plus the F01-F21 "_Sum" food features.
tight3_foodsum = pd.read_csv("/home/user18/pnu_ckd/hexa_preprocessing_after95/0911_dl_models/data/0922_data/0922_basic_food_sum.csv")

In [9]:
# Display the column names of the loaded table.
tight3_foodsum.columns

Index(['RID', 'CT1_SEX', 'CT1_AGE', 'CT1_eGFR', 'imp_CT1_SBP', 'imp_CT1_BMI',
       'CT1_HB', 'Imp_CT1_ALBUMIN', 'CT1_TCHL', 'imp_new_EDU',
       'imp_new_INCOME', 'imp_new_DRINK', 'imp_new_SMOKE', 'new_CT1_gohyeol',
       'CT1_dangnyo', 'CT1_simhyeol', 'onset_tight', 'F01_Sum', 'F02_Sum',
       'F03_Sum', 'F04_Sum', 'F05_Sum', 'F06_Sum', 'F07_Sum', 'F08_Sum',
       'F09_Sum', 'F10_Sum', 'F11_Sum', 'F12_Sum', 'F13_Sum', 'F14_Sum',
       'F15_Sum', 'F16_Sum', 'F17_Sum', 'F18_Sum', 'F19_Sum', 'F20_Sum',
       'F21_Sum'],
      dtype='object')

In [10]:
# Two column subsets of the loaded table; both keep 'RID' and 'onset_tight'.
# Basic covariates only:
tight3_foodsum_basic = tight3_foodsum[[
    'RID',
    'CT1_SEX', 'CT1_AGE', 'CT1_eGFR', 'imp_CT1_SBP', 'imp_CT1_BMI',
    'CT1_HB', 'Imp_CT1_ALBUMIN', 'CT1_TCHL',
    'imp_new_EDU', 'imp_new_INCOME', 'imp_new_DRINK', 'imp_new_SMOKE',
    'new_CT1_gohyeol', 'CT1_dangnyo', 'CT1_simhyeol',
    'onset_tight',
]]
# Food features only (F01_Sum ... F21_Sum):
tight3_foodsum_food = tight3_foodsum[
    ['RID', 'onset_tight'] + [f'F{i:02d}_Sum' for i in range(1, 22)]
]

### 1. Original vs Under-sampling vs Over-sampling

Make Dataset

In [11]:
# Hold out a balanced test set from the full table (basic + food features):
# 10% of the CKD rows plus the same number of control rows.
tight3_foodsum_train, tight3_foodsum_test = divide_testset(tight3_foodsum, 0.1)

# Test features / labels shared by every training variant below.
X_tight3_test = tight3_foodsum_test.drop(columns=['RID', 'onset_tight'])
y_tight3_test = tight3_foodsum_test['onset_tight']

In [12]:
## key : dataset name
## value : (features, labels) of that dataset
X_tight3_foodsum_train = tight3_foodsum_train.drop(columns=['RID', 'onset_tight'])
y_tight3_foodsum_train = tight3_foodsum_train['onset_tight']

# Key order fixes the print order below; undersampling() is still evaluated
# before oversampling().
datas = {
    "original": (X_tight3_foodsum_train, y_tight3_foodsum_train),
    "undersampling": undersampling(tight3_foodsum_train),
    "oversampling": oversampling(tight3_foodsum_train),
    "test": (X_tight3_test, y_tight3_test),
}

# !!!data shape check!!!
for k, v in datas.items():
    print(k, v[0].shape, v[1].shape, v[1].value_counts())

original (57127, 36) (57127,) onset_tight
0    56654
1      473
Name: count, dtype: int64
undersampling (946, 36) (946,) onset_tight
0    473
1    473
Name: count, dtype: int64
oversampling (113308, 36) (113308,) onset_tight
0    56654
1    56654
Name: count, dtype: int64
test (104, 36) (104,) onset_tight
1    52
0    52
Name: count, dtype: int64


Train and Eval
- SVM (linear kernel), Random Forest, Logistic Regression, Gradient Boosting classifier

In [13]:
# 159m 37.5s  (timing note kept from the original cell; presumably its run time)
# For each training variant: standardise the features, then fit and evaluate
# every classifier on the shared test set.
for data in ['undersampling', 'original', 'oversampling']:
    print(f"For {data} dataset!!!")
    X_train, y_train = datas[data]
    X_test, y_test = datas['test']

    # The scaler is fitted on this variant's training features only and then
    # applied to the test features.
    wei_train_scaler = StandardScaler()
    X_train = wei_train_scaler.fit_transform(X_train)
    X_test = wei_train_scaler.transform(X_test)
    print(f"{data} dataset loaded and scaled")

    ### Train and Eval per each Model ###
    svm_model = SVC(kernel='linear', random_state=42, probability=True)
    rf_model = RandomForestClassifier(random_state=seed, max_depth=3)
    lr_model = LogisticRegression(max_iter=1000, random_state=seed)
    gbc_model = GradientBoostingClassifier(random_state=seed, max_depth=1, learning_rate=0.01)

    # model_fitting fits in place and returns the same object, so the
    # *_model names above refer to the fitted estimators afterwards.
    for banner, estimator in (
        ("!!! SVM !!!", svm_model),
        ("!!! RF !!!", rf_model),
        ("!!! LR !!!", lr_model),
        ("!!! GBC !!!", gbc_model),
    ):
        print()
        print(banner)
        model_eval(model_fitting(estimator, X_train, y_train), X_test, y_test)

For undersampling dataset!!!
undersampling dataset loaded and scaled

!!! SVM !!!
Score with simple SVC(kernel='linear', probability=True, random_state=42) model
0.5 0.8654
[[41 11]
 [ 3 49]]
Recall 	 Precision 	 Acc 	 AUC
0.9423 0.8167 0.8654 0.9393

!!! RF !!!
Score with simple RandomForestClassifier(max_depth=3, random_state=1109) model
0.5 0.8654
[[42 10]
 [ 4 48]]
Recall 	 Precision 	 Acc 	 AUC
0.9231 0.8276 0.8654 0.9153

!!! LR !!!
Score with simple LogisticRegression(max_iter=1000, random_state=1109) model
0.5 0.8846
[[42 10]
 [ 2 50]]
Recall 	 Precision 	 Acc 	 AUC
0.9615 0.8333 0.8846 0.939

!!! GBC !!!
Score with simple GradientBoostingClassifier(learning_rate=0.01, max_depth=1, random_state=1109) model
0.5 0.8269
[[37 15]
 [ 3 49]]
Recall 	 Precision 	 Acc 	 AUC
0.9423 0.7656 0.8269 0.9057
For original dataset!!!
original dataset loaded and scaled

!!! SVM !!!
Score with simple SVC(kernel='linear', probability=True, random_state=42) model
0.5 0.5
[[52  0]
 [52  0]]
Recall 	

  precision = tp / (fp + tp)


Score with simple RandomForestClassifier(max_depth=3, random_state=1109) model
0.5 0.5
[[52  0]
 [52  0]]
Recall 	 Precision 	 Acc 	 AUC
0.0 nan 0.5 0.9271

!!! LR !!!
Score with simple LogisticRegression(max_iter=1000, random_state=1109) model
0.5 0.5096
[[52  0]
 [51  1]]
Recall 	 Precision 	 Acc 	 AUC
0.0192 1.0 0.5096 0.9427

!!! GBC !!!


  precision = tp / (fp + tp)


Score with simple GradientBoostingClassifier(learning_rate=0.01, max_depth=1, random_state=1109) model
0.5 0.5
[[52  0]
 [52  0]]
Recall 	 Precision 	 Acc 	 AUC
0.0 nan 0.5 0.8719
For oversampling dataset!!!
oversampling dataset loaded and scaled

!!! SVM !!!


  precision = tp / (fp + tp)


Score with simple SVC(kernel='linear', probability=True, random_state=42) model
0.5 0.7788
[[46  6]
 [17 35]]
Recall 	 Precision 	 Acc 	 AUC
0.6731 0.8537 0.7788 0.909

!!! RF !!!
Score with simple RandomForestClassifier(max_depth=3, random_state=1109) model
0.5 0.7596
[[42 10]
 [15 37]]
Recall 	 Precision 	 Acc 	 AUC
0.7115 0.7872 0.7596 0.882

!!! LR !!!
Score with simple LogisticRegression(max_iter=1000, random_state=1109) model
0.5 0.7692
[[46  6]
 [18 34]]
Recall 	 Precision 	 Acc 	 AUC
0.6538 0.85 0.7692 0.9087

!!! GBC !!!
Score with simple GradientBoostingClassifier(learning_rate=0.01, max_depth=1, random_state=1109) model
0.5 0.8365
[[40 12]
 [ 5 47]]
Recall 	 Precision 	 Acc 	 AUC
0.9038 0.7966 0.8365 0.8794


# 2. Food mean

In [14]:
# Section 2 input: basic covariates plus the F01-F21 "_Mean" food features.
# NOTE: the variable keeps the name `tight3_foodsum` although it now holds the mean features.
tight3_foodsum = pd.read_csv("/home/user18/pnu_ckd/hexa_preprocessing_after95/0911_dl_models/data/0922_data/0922_basic_food_mean.csv")

In [15]:
# Display the column names of the loaded table.
tight3_foodsum.columns

Index(['RID', 'CT1_SEX', 'CT1_AGE', 'CT1_eGFR', 'imp_CT1_SBP', 'imp_CT1_BMI',
       'CT1_HB', 'Imp_CT1_ALBUMIN', 'CT1_TCHL', 'imp_new_EDU',
       'imp_new_INCOME', 'imp_new_DRINK', 'imp_new_SMOKE', 'new_CT1_gohyeol',
       'CT1_dangnyo', 'CT1_simhyeol', 'onset_tight', 'F01_Mean', 'F02_Mean',
       'F03_Mean', 'F04_Mean', 'F05_Mean', 'F06_Mean', 'F07_Mean', 'F08_Mean',
       'F09_Mean', 'F10_Mean', 'F11_Mean', 'F12_Mean', 'F13_Mean', 'F14_Mean',
       'F15_Mean', 'F16_Mean', 'F17_Mean', 'F18_Mean', 'F19_Mean', 'F20_Mean',
       'F21_Mean'],
      dtype='object')

In [17]:
# Two column subsets of the loaded table; both keep 'RID' and 'onset_tight'.
# Basic covariates only:
tight3_foodsum_basic = tight3_foodsum[[
    'RID',
    'CT1_SEX', 'CT1_AGE', 'CT1_eGFR', 'imp_CT1_SBP', 'imp_CT1_BMI',
    'CT1_HB', 'Imp_CT1_ALBUMIN', 'CT1_TCHL',
    'imp_new_EDU', 'imp_new_INCOME', 'imp_new_DRINK', 'imp_new_SMOKE',
    'new_CT1_gohyeol', 'CT1_dangnyo', 'CT1_simhyeol',
    'onset_tight',
]]
# Food features only (F01_Mean ... F21_Mean):
tight3_foodsum_food = tight3_foodsum[
    ['RID', 'onset_tight'] + [f'F{i:02d}_Mean' for i in range(1, 22)]
]

### 1. Original vs Under-sampling vs Over-sampling

Make Dataset

In [18]:
# Hold out a balanced test set from the full table (basic + food features):
# 10% of the CKD rows plus the same number of control rows.
tight3_foodsum_train, tight3_foodsum_test = divide_testset(tight3_foodsum, 0.1)

# Test features / labels shared by every training variant below.
X_tight3_test = tight3_foodsum_test.drop(columns=['RID', 'onset_tight'])
y_tight3_test = tight3_foodsum_test['onset_tight']

In [19]:
## key : dataset name
## value : (features, labels) of that dataset
X_tight3_foodsum_train = tight3_foodsum_train.drop(columns=['RID', 'onset_tight'])
y_tight3_foodsum_train = tight3_foodsum_train['onset_tight']

# Key order fixes the print order below; undersampling() is still evaluated
# before oversampling().
datas = {
    "original": (X_tight3_foodsum_train, y_tight3_foodsum_train),
    "undersampling": undersampling(tight3_foodsum_train),
    "oversampling": oversampling(tight3_foodsum_train),
    "test": (X_tight3_test, y_tight3_test),
}

# !!!data shape check!!!
for k, v in datas.items():
    print(k, v[0].shape, v[1].shape, v[1].value_counts())

original (57127, 36) (57127,) onset_tight
0    56654
1      473
Name: count, dtype: int64
undersampling (946, 36) (946,) onset_tight
0    473
1    473
Name: count, dtype: int64
oversampling (113308, 36) (113308,) onset_tight
0    56654
1    56654
Name: count, dtype: int64
test (104, 36) (104,) onset_tight
1    52
0    52
Name: count, dtype: int64


Train and Eval
- SVM (linear kernel), Random Forest, Logistic Regression, Gradient Boosting classifier

In [21]:
# 197m 37.5s  (timing note kept from the original cell; presumably its run time)
# For each training variant: standardise the features, then fit and evaluate
# every classifier on the shared test set.
for data in ['undersampling', 'original', 'oversampling']:
    print(f"For {data} dataset!!!")
    X_train, y_train = datas[data]
    X_test, y_test = datas['test']

    # The scaler is fitted on this variant's training features only and then
    # applied to the test features.
    wei_train_scaler = StandardScaler()
    X_train = wei_train_scaler.fit_transform(X_train)
    X_test = wei_train_scaler.transform(X_test)
    print(f"{data} dataset loaded and scaled")

    ### Train and Eval per each Model ###
    svm_model = SVC(kernel='linear', random_state=42, probability=True)
    rf_model = RandomForestClassifier(random_state=seed, max_depth=3)
    lr_model = LogisticRegression(max_iter=1000, random_state=seed)
    gbc_model = GradientBoostingClassifier(random_state=seed, max_depth=1, learning_rate=0.01)

    # model_fitting fits in place and returns the same object, so the
    # *_model names above refer to the fitted estimators afterwards.
    for banner, estimator in (
        ("!!! SVM !!!", svm_model),
        ("!!! RF !!!", rf_model),
        ("!!! LR !!!", lr_model),
        ("!!! GBC !!!", gbc_model),
    ):
        print()
        print(banner)
        model_eval(model_fitting(estimator, X_train, y_train), X_test, y_test)

For undersampling dataset!!!
undersampling dataset loaded and scaled

!!! SVM !!!
Score with simple SVC(kernel='linear', probability=True, random_state=42) model
0.5 0.875
[[42 10]
 [ 3 49]]
Recall 	 Precision 	 Acc 	 AUC
0.9423 0.8305 0.875 0.9419

!!! RF !!!
Score with simple RandomForestClassifier(max_depth=3, random_state=1109) model
0.5 0.875
[[43  9]
 [ 4 48]]
Recall 	 Precision 	 Acc 	 AUC
0.9231 0.8421 0.875 0.912

!!! LR !!!
Score with simple LogisticRegression(max_iter=1000, random_state=1109) model
0.5 0.8846
[[43  9]
 [ 3 49]]
Recall 	 Precision 	 Acc 	 AUC
0.9423 0.8448 0.8846 0.9323

!!! GBC !!!
Score with simple GradientBoostingClassifier(learning_rate=0.01, max_depth=1, random_state=1109) model
0.5 0.8654
[[43  9]
 [ 5 47]]
Recall 	 Precision 	 Acc 	 AUC
0.9038 0.8393 0.8654 0.8963
For original dataset!!!
original dataset loaded and scaled

!!! SVM !!!
Score with simple SVC(kernel='linear', probability=True, random_state=42) model
0.5 0.5
[[52  0]
 [52  0]]
Recall 	 Pre

  precision = tp / (fp + tp)


Score with simple RandomForestClassifier(max_depth=3, random_state=1109) model
0.5 0.5
[[52  0]
 [52  0]]
Recall 	 Precision 	 Acc 	 AUC
0.0 nan 0.5 0.9271

!!! LR !!!
Score with simple LogisticRegression(max_iter=1000, random_state=1109) model
0.5 0.5096
[[52  0]
 [51  1]]
Recall 	 Precision 	 Acc 	 AUC
0.0192 1.0 0.5096 0.9427

!!! GBC !!!


  precision = tp / (fp + tp)


Score with simple GradientBoostingClassifier(learning_rate=0.01, max_depth=1, random_state=1109) model
0.5 0.5
[[52  0]
 [52  0]]
Recall 	 Precision 	 Acc 	 AUC
0.0 nan 0.5 0.8719
For oversampling dataset!!!
oversampling dataset loaded and scaled

!!! SVM !!!


  precision = tp / (fp + tp)


Score with simple SVC(kernel='linear', probability=True, random_state=42) model
0.5 0.7788
[[45  7]
 [16 36]]
Recall 	 Precision 	 Acc 	 AUC
0.6923 0.8372 0.7788 0.9094

!!! RF !!!
Score with simple RandomForestClassifier(max_depth=3, random_state=1109) model
0.5 0.7596
[[42 10]
 [15 37]]
Recall 	 Precision 	 Acc 	 AUC
0.7115 0.7872 0.7596 0.8857

!!! LR !!!
Score with simple LogisticRegression(max_iter=1000, random_state=1109) model
0.5 0.7885
[[46  6]
 [16 36]]
Recall 	 Precision 	 Acc 	 AUC
0.6923 0.8571 0.7885 0.9105

!!! GBC !!!
Score with simple GradientBoostingClassifier(learning_rate=0.01, max_depth=1, random_state=1109) model
0.5 0.8462
[[41 11]
 [ 5 47]]
Recall 	 Precision 	 Acc 	 AUC
0.9038 0.8103 0.8462 0.8955


# 3. Food adjsum

In [22]:
# Section 3 input: basic covariates plus the F01-F21 "_Sum_res" (adjusted sum) food features.
# NOTE: the variable keeps the name `tight3_foodsum` from section 1.
tight3_foodsum = pd.read_csv("/home/user18/pnu_ckd/hexa_preprocessing_after95/0911_dl_models/data/0922_data/0922_basic_food_adjusted_sum.csv")

In [23]:
# Display the column names of the loaded table.
tight3_foodsum.columns

Index(['RID', 'CT1_SEX', 'CT1_AGE', 'CT1_eGFR', 'imp_CT1_SBP', 'imp_CT1_BMI',
       'CT1_HB', 'Imp_CT1_ALBUMIN', 'CT1_TCHL', 'imp_new_EDU',
       'imp_new_INCOME', 'imp_new_DRINK', 'imp_new_SMOKE', 'new_CT1_gohyeol',
       'CT1_dangnyo', 'CT1_simhyeol', 'onset_tight', 'F01_Sum_res',
       'F02_Sum_res', 'F03_Sum_res', 'F04_Sum_res', 'F05_Sum_res',
       'F06_Sum_res', 'F07_Sum_res', 'F08_Sum_res', 'F09_Sum_res',
       'F10_Sum_res', 'F11_Sum_res', 'F12_Sum_res', 'F13_Sum_res',
       'F14_Sum_res', 'F15_Sum_res', 'F16_Sum_res', 'F17_Sum_res',
       'F18_Sum_res', 'F19_Sum_res', 'F20_Sum_res', 'F21_Sum_res'],
      dtype='object')

In [24]:
# Two column subsets of the loaded table; both keep 'RID' and 'onset_tight'.
# Basic covariates only:
tight3_foodsum_basic = tight3_foodsum[[
    'RID',
    'CT1_SEX', 'CT1_AGE', 'CT1_eGFR', 'imp_CT1_SBP', 'imp_CT1_BMI',
    'CT1_HB', 'Imp_CT1_ALBUMIN', 'CT1_TCHL',
    'imp_new_EDU', 'imp_new_INCOME', 'imp_new_DRINK', 'imp_new_SMOKE',
    'new_CT1_gohyeol', 'CT1_dangnyo', 'CT1_simhyeol',
    'onset_tight',
]]
# Food features only (F01_Sum_res ... F21_Sum_res):
tight3_foodsum_food = tight3_foodsum[
    ['RID', 'onset_tight'] + [f'F{i:02d}_Sum_res' for i in range(1, 22)]
]

### 1. Original vs Under-sampling vs Over-sampling

Make Dataset

In [25]:
# Hold out a balanced test set from the full table (basic + food features):
# 10% of the CKD rows plus the same number of control rows.
tight3_foodsum_train, tight3_foodsum_test = divide_testset(tight3_foodsum, 0.1)

# Test features / labels shared by every training variant below.
X_tight3_test = tight3_foodsum_test.drop(columns=['RID', 'onset_tight'])
y_tight3_test = tight3_foodsum_test['onset_tight']

In [26]:
## key : dataset name
## value : (features, labels) of that dataset
X_tight3_foodsum_train = tight3_foodsum_train.drop(columns=['RID', 'onset_tight'])
y_tight3_foodsum_train = tight3_foodsum_train['onset_tight']

# Key order fixes the print order below; undersampling() is still evaluated
# before oversampling().
datas = {
    "original": (X_tight3_foodsum_train, y_tight3_foodsum_train),
    "undersampling": undersampling(tight3_foodsum_train),
    "oversampling": oversampling(tight3_foodsum_train),
    "test": (X_tight3_test, y_tight3_test),
}

# !!!data shape check!!!
for k, v in datas.items():
    print(k, v[0].shape, v[1].shape, v[1].value_counts())

original (57127, 36) (57127,) onset_tight
0    56654
1      473
Name: count, dtype: int64
undersampling (946, 36) (946,) onset_tight
0    473
1    473
Name: count, dtype: int64
oversampling (113308, 36) (113308,) onset_tight
0    56654
1    56654
Name: count, dtype: int64
test (104, 36) (104,) onset_tight
1    52
0    52
Name: count, dtype: int64


Train and Eval
- SVM (linear kernel), Random Forest, Logistic Regression, Gradient Boosting classifier

In [27]:
# 222m  (timing note kept from the original cell; presumably its run time)
# For each training variant: standardise the features, then fit and evaluate
# every classifier on the shared test set.
for data in ['undersampling', 'original', 'oversampling']:
    print(f"For {data} dataset!!!")
    X_train, y_train = datas[data]
    X_test, y_test = datas['test']

    # The scaler is fitted on this variant's training features only and then
    # applied to the test features.
    wei_train_scaler = StandardScaler()
    X_train = wei_train_scaler.fit_transform(X_train)
    X_test = wei_train_scaler.transform(X_test)
    print(f"{data} dataset loaded and scaled")

    ### Train and Eval per each Model ###
    svm_model = SVC(kernel='linear', random_state=42, probability=True)
    rf_model = RandomForestClassifier(random_state=seed, max_depth=3)
    lr_model = LogisticRegression(max_iter=1000, random_state=seed)
    gbc_model = GradientBoostingClassifier(random_state=seed, max_depth=1, learning_rate=0.01)

    # model_fitting fits in place and returns the same object, so the
    # *_model names above refer to the fitted estimators afterwards.
    for banner, estimator in (
        ("!!! SVM !!!", svm_model),
        ("!!! RF !!!", rf_model),
        ("!!! LR !!!", lr_model),
        ("!!! GBC !!!", gbc_model),
    ):
        print()
        print(banner)
        model_eval(model_fitting(estimator, X_train, y_train), X_test, y_test)

For undersampling dataset!!!
undersampling dataset loaded and scaled

!!! SVM !!!
Score with simple SVC(kernel='linear', probability=True, random_state=42) model
0.5 0.875
[[42 10]
 [ 3 49]]
Recall 	 Precision 	 Acc 	 AUC
0.9423 0.8305 0.875 0.9316

!!! RF !!!
Score with simple RandomForestClassifier(max_depth=3, random_state=1109) model
0.5 0.875
[[44  8]
 [ 5 47]]
Recall 	 Precision 	 Acc 	 AUC
0.9038 0.8545 0.875 0.9046

!!! LR !!!
Score with simple LogisticRegression(max_iter=1000, random_state=1109) model
0.5 0.8654
[[42 10]
 [ 4 48]]
Recall 	 Precision 	 Acc 	 AUC
0.9231 0.8276 0.8654 0.9331

!!! GBC !!!
Score with simple GradientBoostingClassifier(learning_rate=0.01, max_depth=1, random_state=1109) model
0.5 0.8462
[[41 11]
 [ 5 47]]
Recall 	 Precision 	 Acc 	 AUC
0.9038 0.8103 0.8462 0.9136
For original dataset!!!
original dataset loaded and scaled

!!! SVM !!!
Score with simple SVC(kernel='linear', probability=True, random_state=42) model
0.5 0.5
[[52  0]
 [52  0]]
Recall 	 Pr

  precision = tp / (fp + tp)


Score with simple RandomForestClassifier(max_depth=3, random_state=1109) model
0.5 0.5
[[52  0]
 [52  0]]
Recall 	 Precision 	 Acc 	 AUC
0.0 nan 0.5 0.9294

!!! LR !!!


  precision = tp / (fp + tp)


Score with simple LogisticRegression(max_iter=1000, random_state=1109) model
0.5 0.5096
[[52  0]
 [51  1]]
Recall 	 Precision 	 Acc 	 AUC
0.0192 1.0 0.5096 0.9427

!!! GBC !!!
Score with simple GradientBoostingClassifier(learning_rate=0.01, max_depth=1, random_state=1109) model
0.5 0.5
[[52  0]
 [52  0]]
Recall 	 Precision 	 Acc 	 AUC
0.0 nan 0.5 0.8719
For oversampling dataset!!!
oversampling dataset loaded and scaled

!!! SVM !!!


  precision = tp / (fp + tp)


Score with simple SVC(kernel='linear', probability=True, random_state=42) model
0.5 0.8077
[[46  6]
 [14 38]]
Recall 	 Precision 	 Acc 	 AUC
0.7308 0.8636 0.8077 0.9079

!!! RF !!!
Score with simple RandomForestClassifier(max_depth=3, random_state=1109) model
0.5 0.8269
[[43  9]
 [ 9 43]]
Recall 	 Precision 	 Acc 	 AUC
0.8269 0.8269 0.8269 0.8828

!!! LR !!!
Score with simple LogisticRegression(max_iter=1000, random_state=1109) model
0.5 0.7981
[[46  6]
 [15 37]]
Recall 	 Precision 	 Acc 	 AUC
0.7115 0.8605 0.7981 0.9094

!!! GBC !!!
Score with simple GradientBoostingClassifier(learning_rate=0.01, max_depth=1, random_state=1109) model
0.5 0.8365
[[40 12]
 [ 5 47]]
Recall 	 Precision 	 Acc 	 AUC
0.9038 0.7966 0.8365 0.8996


# 4. Food adj mean

In [28]:
# Section 4 input: basic covariates plus the F01-F21 "_Mean_res" (adjusted mean) food features.
# NOTE: the variable keeps the name `tight3_foodsum` from section 1.
tight3_foodsum = pd.read_csv("/home/user18/pnu_ckd/hexa_preprocessing_after95/0911_dl_models/data/0922_data/0922_basic_food_adjusted_mean.csv")

In [29]:
# Display the column names of the loaded table.
tight3_foodsum.columns

Index(['RID', 'CT1_SEX', 'CT1_AGE', 'CT1_eGFR', 'imp_CT1_SBP', 'imp_CT1_BMI',
       'CT1_HB', 'Imp_CT1_ALBUMIN', 'CT1_TCHL', 'imp_new_EDU',
       'imp_new_INCOME', 'imp_new_DRINK', 'imp_new_SMOKE', 'new_CT1_gohyeol',
       'CT1_dangnyo', 'CT1_simhyeol', 'onset_tight', 'F01_Mean_res',
       'F02_Mean_res', 'F03_Mean_res', 'F04_Mean_res', 'F05_Mean_res',
       'F06_Mean_res', 'F07_Mean_res', 'F08_Mean_res', 'F09_Mean_res',
       'F10_Mean_res', 'F11_Mean_res', 'F12_Mean_res', 'F13_Mean_res',
       'F14_Mean_res', 'F15_Mean_res', 'F16_Mean_res', 'F17_Mean_res',
       'F18_Mean_res', 'F19_Mean_res', 'F20_Mean_res', 'F21_Mean_res'],
      dtype='object')

In [30]:
# Two column subsets of the loaded table; both keep 'RID' and 'onset_tight'.
# Basic covariates only:
tight3_foodsum_basic = tight3_foodsum[[
    'RID',
    'CT1_SEX', 'CT1_AGE', 'CT1_eGFR', 'imp_CT1_SBP', 'imp_CT1_BMI',
    'CT1_HB', 'Imp_CT1_ALBUMIN', 'CT1_TCHL',
    'imp_new_EDU', 'imp_new_INCOME', 'imp_new_DRINK', 'imp_new_SMOKE',
    'new_CT1_gohyeol', 'CT1_dangnyo', 'CT1_simhyeol',
    'onset_tight',
]]
# Food features only (F01_Mean_res ... F21_Mean_res):
tight3_foodsum_food = tight3_foodsum[
    ['RID', 'onset_tight'] + [f'F{i:02d}_Mean_res' for i in range(1, 22)]
]

### 1. Original vs Under-sampling vs Over-sampling

Make Dataset

In [31]:
# Hold out a balanced test set from the full table (basic + food features):
# 10% of the CKD rows plus the same number of control rows.
tight3_foodsum_train, tight3_foodsum_test = divide_testset(tight3_foodsum, 0.1)

# Test features / labels shared by every training variant below.
X_tight3_test = tight3_foodsum_test.drop(columns=['RID', 'onset_tight'])
y_tight3_test = tight3_foodsum_test['onset_tight']

In [32]:
## key : dataset name
## value : (features, labels) of that dataset
X_tight3_foodsum_train = tight3_foodsum_train.drop(columns=['RID', 'onset_tight'])
y_tight3_foodsum_train = tight3_foodsum_train['onset_tight']

# Key order fixes the print order below; undersampling() is still evaluated
# before oversampling().
datas = {
    "original": (X_tight3_foodsum_train, y_tight3_foodsum_train),
    "undersampling": undersampling(tight3_foodsum_train),
    "oversampling": oversampling(tight3_foodsum_train),
    "test": (X_tight3_test, y_tight3_test),
}

# !!!data shape check!!!
for k, v in datas.items():
    print(k, v[0].shape, v[1].shape, v[1].value_counts())

original (57127, 36) (57127,) onset_tight
0    56654
1      473
Name: count, dtype: int64
undersampling (946, 36) (946,) onset_tight
0    473
1    473
Name: count, dtype: int64
oversampling (113308, 36) (113308,) onset_tight
0    56654
1    56654
Name: count, dtype: int64
test (104, 36) (104,) onset_tight
1    52
0    52
Name: count, dtype: int64


Train and Eval
- SVM (linear kernel), Random Forest, Logistic Regression, Gradient Boosting classifier

In [33]:
# 159m 37.5s  (timing note kept from the original cell; presumably its run time)
# For each training variant: standardise the features, then fit and evaluate
# every classifier on the shared test set.
for data in ['undersampling', 'original', 'oversampling']:
    print(f"For {data} dataset!!!")
    X_train, y_train = datas[data]
    X_test, y_test = datas['test']

    # The scaler is fitted on this variant's training features only and then
    # applied to the test features.
    wei_train_scaler = StandardScaler()
    X_train = wei_train_scaler.fit_transform(X_train)
    X_test = wei_train_scaler.transform(X_test)
    print(f"{data} dataset loaded and scaled")

    ### Train and Eval per each Model ###
    svm_model = SVC(kernel='linear', random_state=42, probability=True)
    rf_model = RandomForestClassifier(random_state=seed, max_depth=3)
    lr_model = LogisticRegression(max_iter=1000, random_state=seed)
    gbc_model = GradientBoostingClassifier(random_state=seed, max_depth=1, learning_rate=0.01)

    # model_fitting fits in place and returns the same object, so the
    # *_model names above refer to the fitted estimators afterwards.
    for banner, estimator in (
        ("!!! SVM !!!", svm_model),
        ("!!! RF !!!", rf_model),
        ("!!! LR !!!", lr_model),
        ("!!! GBC !!!", gbc_model),
    ):
        print()
        print(banner)
        model_eval(model_fitting(estimator, X_train, y_train), X_test, y_test)

For undersampling dataset!!!
undersampling dataset loaded and scaled

!!! SVM !!!
Score with simple SVC(kernel='linear', probability=True, random_state=42) model
0.5 0.8654
[[42 10]
 [ 4 48]]
Recall 	 Precision 	 Acc 	 AUC
0.9231 0.8276 0.8654 0.9283

!!! RF !!!
Score with simple RandomForestClassifier(max_depth=3, random_state=1109) model
0.5 0.8654
[[43  9]
 [ 5 47]]
Recall 	 Precision 	 Acc 	 AUC
0.9038 0.8393 0.8654 0.9116

!!! LR !!!
Score with simple LogisticRegression(max_iter=1000, random_state=1109) model
0.5 0.8558
[[42 10]
 [ 5 47]]
Recall 	 Precision 	 Acc 	 AUC
0.9038 0.8246 0.8558 0.9283

!!! GBC !!!
Score with simple GradientBoostingClassifier(learning_rate=0.01, max_depth=1, random_state=1109) model
0.5 0.8462
[[41 11]
 [ 5 47]]
Recall 	 Precision 	 Acc 	 AUC
0.9038 0.8103 0.8462 0.912
For original dataset!!!
original dataset loaded and scaled

!!! SVM !!!
Score with simple SVC(kernel='linear', probability=True, random_state=42) model
0.5 0.5
[[52  0]
 [52  0]]
Recall 	

  precision = tp / (fp + tp)


Score with simple RandomForestClassifier(max_depth=3, random_state=1109) model
0.5 0.5
[[52  0]
 [52  0]]
Recall 	 Precision 	 Acc 	 AUC
0.0 nan 0.5 0.9294

!!! LR !!!
Score with simple LogisticRegression(max_iter=1000, random_state=1109) model
0.5 0.5096
[[52  0]
 [51  1]]
Recall 	 Precision 	 Acc 	 AUC
0.0192 1.0 0.5096 0.9427

!!! GBC !!!


  precision = tp / (fp + tp)


Score with simple GradientBoostingClassifier(learning_rate=0.01, max_depth=1, random_state=1109) model
0.5 0.5
[[52  0]
 [52  0]]
Recall 	 Precision 	 Acc 	 AUC
0.0 nan 0.5 0.8719
For oversampling dataset!!!
oversampling dataset loaded and scaled

!!! SVM !!!


  precision = tp / (fp + tp)


Score with simple SVC(kernel='linear', probability=True, random_state=42) model
0.5 0.8269
[[46  6]
 [12 40]]
Recall 	 Precision 	 Acc 	 AUC
0.7692 0.8696 0.8269 0.9153

!!! RF !!!
Score with simple RandomForestClassifier(max_depth=3, random_state=1109) model
0.5 0.8365
[[44  8]
 [ 9 43]]
Recall 	 Precision 	 Acc 	 AUC
0.8269 0.8431 0.8365 0.8946

!!! LR !!!
Score with simple LogisticRegression(max_iter=1000, random_state=1109) model
0.5 0.8173
[[46  6]
 [13 39]]
Recall 	 Precision 	 Acc 	 AUC
0.75 0.8667 0.8173 0.9157

!!! GBC !!!
Score with simple GradientBoostingClassifier(learning_rate=0.01, max_depth=1, random_state=1109) model
0.5 0.8462
[[41 11]
 [ 5 47]]
Recall 	 Precision 	 Acc 	 AUC
0.9038 0.8103 0.8462 0.9031
