In [1]:
# Switch the working directory to the project root so the relative paths used
# later in this notebook ("Data/...", "Outputs/...") resolve.
# NOTE(review): this is a machine-specific absolute path; adjust before running elsewhere.
%cd /root/Real/KAIST/700G/

/root/Real/KAIST/700G


In [19]:
!pip install numpy pandas matplotlib==3.5.3 seaborn scikit-learn==1.2.2 lightgbm shap tqdm 

Collecting matplotlib==3.5.3
  Downloading matplotlib-3.5.3-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl (11.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.3/11.3 MB[0m [31m53.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: matplotlib
  Attempting uninstall: matplotlib
    Found existing installation: matplotlib 3.6.0
    Uninstalling matplotlib-3.6.0:
      Successfully uninstalled matplotlib-3.6.0
Successfully installed matplotlib-3.5.3


In [2]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.utils import class_weight
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.decomposition import PCA

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, average_precision_score

from lightgbm import LGBMClassifier
import shap
from tqdm.notebook import tqdm

import os

In [4]:
# Make sklearn transformers return pandas DataFrames so column names survive
# preprocessing (the ColumnTransformer in build_pipeline selects columns by name).
sklearn.set_config(transform_output="pandas") #python version >= 3.8, sklearn version >= 1.2.0
# Give LGBMClassifier an identity transform(): featimp() and train() call
# transform()/fit_transform() on a pipeline whose final step wraps this classifier.
LGBMClassifier.transform = lambda self,x:x

In [12]:
import warnings


def warn(*args, **kwargs):
    """Accept any arguments, do nothing, and return ``None``."""
    return None


# Install the no-op in place of warnings.warn so that calls made through the
# module attribute are silently discarded for the rest of the session.
warnings.warn = warn

In [29]:
def load_dataset(disease_no, test_size=0.2, data_path="Data/phr_data_dropped.csv", random_state=None):
    """Load the PHR table and build a stratified train/test split for one disease.

    Rows with a missing value in any of the disease's target variables are
    removed, the first 64 columns (genetic features) are dropped, a binary
    label is derived from the disease's threshold rule, and the target
    variables themselves are removed from the feature frame.

    Parameters
    ----------
    disease_no : int
        Disease index, one of 0-6 (see ``target_variables`` below).
    test_size : float, default 0.2
        Forwarded to ``train_test_split``.
    data_path : str, default "Data/phr_data_dropped.csv"
        CSV file to read.
    random_state : int, RandomState instance or None, default None
        Forwarded to ``train_test_split``. ``None`` keeps the previous
        behaviour of drawing from NumPy's global RNG, which ``train`` seeds.

    Returns
    -------
    x_train, x_test : pandas.DataFrame
        Feature frames without the target variables.
    y_train, y_test : pandas.Series
        0/1 labels sharing the index of the corresponding feature frame.

    Raises
    ------
    KeyError
        If ``disease_no`` is not a known disease index.
    """
    target_variables = {
        0: ['bmi', 'height', 'weight', 'waist'],
        1: ['blood_sugar'],
        2: ['neutral_fat'],
        3: ['hdl', 'ldl'],  # neutral fat and cholesterol are controversial
        4: ['got', 'gpt'],  # gamma gtp
        5: ['hemoglobin'],
        6: ['max_bp', 'min_bp'],
    }

    # Each rule is a callable so that only the requested disease's columns are
    # touched; evaluating every rule eagerly would fail on (and waste time
    # with) columns that are irrelevant to the current disease.
    # NOTE(review): rules 2 and 4 compare against log1p(threshold) — presumably
    # those columns were log1p-transformed upstream; confirm.
    criterion = {
        0: lambda d: d['bmi'] >= 25,
        1: lambda d: d['blood_sugar'] >= 100,
        2: lambda d: d['neutral_fat'] >= np.log1p(200),
        3: lambda d: (d['hdl'] < 40) | (d['ldl'] >= 160),
        4: lambda d: (d['got'] > np.log1p(40)) | (d['gpt'] > np.log1p(40)),
        # (hemoglobin + gender) <= 13, modified from < 13.
        # NOTE(review): presumably gender is a numeric code that shifts the cut-off — confirm.
        5: lambda d: d['hemoglobin'] + d['gender'] <= 13,
        6: lambda d: (d['max_bp'] >= 130) | (d['min_bp'] >= 80),
    }

    # Fail fast, before any file I/O, with a message that names the valid keys.
    if disease_no not in target_variables:
        raise KeyError(
            "unknown disease_no {!r}; expected one of {}".format(disease_no, sorted(target_variables))
        )
    targets = target_variables[disease_no]

    df = pd.read_csv(data_path)

    # Dropping NA values in target variables
    df = df.dropna(subset=targets)

    # Dropping genetic features
    df = df.drop(df.columns[:64], axis=1)

    diseases = pd.Series(np.where(criterion[disease_no](df), 1, 0), index=df.index)

    # Remove the variables the label was derived from so they cannot leak into the features.
    x = df.drop(columns=targets)

    x_train, x_test, y_train, y_test = train_test_split(
        x, diseases, test_size=test_size, stratify=diseases, random_state=random_state
    )

    return x_train, x_test, y_train, y_test

In [30]:
def build_pipeline():
    """Build the unfitted preprocessing + model-selection pipeline.

    Steps, in order:

    1. ``imputer``  - ``IterativeImputer`` with default settings.
    2. ``scaler``   - ``RobustScaler``.
    3. ``pca_for_smoking`` - a ``ColumnTransformer`` that compresses four
       groups of related questionnaire columns with PCA and passes every
       other column through unchanged.
    4. ``estimator`` - ``GridSearchCV`` over ``LGBMClassifier`` using a
       shuffled 5-fold ``KFold``.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The unfitted pipeline.

    Notes
    -----
    The ColumnTransformer selects columns by name, so the preceding steps
    must emit DataFrames; the notebook arranges this globally via
    ``sklearn.set_config(transform_output="pandas")``.
    """
    pca_for_features = ColumnTransformer([
        ('pca_smoking_all', PCA(n_components=1), ['have_smoking', 'smoking_duration_all', 'smoking_all_count']),
        ('pca_secondary_smoking_home', PCA(n_components=1), ['secondary_smoking_home', 'secondary_smoking_home_count_per_week', 'secondary_smoking_duration_home', 'secondary_smoking_hour_home']),
        ('pca_secondary_smoking_work', PCA(n_components=1), ['secondary_smoking_work', 'secondary_smoking_work_per_week', 'secondary_smoking_duration_work', 'secondary_smoking_hour_work']),
        ('pca_recent_symptom', PCA(n_components=2), ['last2week_symptom_decreasedintertest_in_last2weeks', 'last2week_symptom_depressed_in_last2weeks',
                                                    'last2week_symptom_sleepdisorder_in_last2weeks', 'last2week_symptom_tiredness_in_last2weeks',
                                                    'last2week_symptom_eatingdisorder_in_last2weeks', 'last2week_symptom_discourage_in_last2weeks',
                                                    'last2week_symptom_decreasedconcentration_in_last2weeks', 'last2week_symptom_anxious_in_last2weeks',
                                                    'last2week_symptom_selfharm_in_last2weeks'])
        ],
        remainder='passthrough'
    )

    lgbm_params = {
        'learning_rate': [0.005, 0.01],
        'n_estimators': [8,16,24],
        'num_leaves': [6,8,12], # large num_leaves helps improve accuracy but might lead to over-fitting
        'boosting_type' : ['gbdt', 'dart'], # for better accuracy -> try dart
        'objective' : ['binary'],
        'max_bin':[255, 510], # large max_bin helps improve accuracy but might slow down training progress
        'random_state' : [500],
        'colsample_bytree' : [0.64, 0.65, 0.66],
        'subsample' : [0.7,0.75],
        # LightGBM only performs row subsampling (bagging) when subsample_freq > 0;
        # with the default of 0 the 'subsample' axis above is searched but has no effect.
        'subsample_freq' : [1],
        'reg_alpha' : [1,1.2],
        'reg_lambda' : [1,1.2,1.4],
    }

    # Identity transform() on the classifier so that transform()/fit_transform()
    # can be called on the whole pipeline (the last step wraps LGBMClassifier).
    # Re-applied here so the function does not depend on an earlier cell having run.
    LGBMClassifier.transform = lambda self,x:x

    main_pipeline = Pipeline([
        ('imputer', IterativeImputer()),
        ('scaler', RobustScaler()),
        ('pca_for_smoking', pca_for_features),
        # n_jobs=-3: joblib convention for "all CPUs but two".
        ('estimator', GridSearchCV(estimator=LGBMClassifier(), param_grid=lgbm_params, cv=KFold(n_splits=5, shuffle=True), n_jobs=-3))
        ]
    )

    return main_pipeline

In [31]:
def scoring(y_test, y_predproba, random_state, criteria=0.5):
    """Summarise classification quality as a one-row DataFrame.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_predproba : 2-D array
        Predicted probabilities; column 1 is used as the positive-class score.
    random_state : int
        Only used to label the row (``'experiment<random_state>'``).
    criteria : float, default 0.5
        Scores greater than or equal to this value are predicted positive.

    Returns
    -------
    pandas.DataFrame
        Columns ``accuracy``, ``precision``, ``recall``, ``f1_score``,
        ``auroc``, ``auprc``; a single row.
    """
    positive_score = y_predproba[:, 1]
    y_pred = positive_score >= criteria

    #TODO: confusion matrix
    # Threshold-based metrics take the hard predictions; the ranking metrics
    # (auroc / auprc) take the raw positive-class score.
    metrics = {
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred),
        'recall': recall_score(y_test, y_pred),
        'f1_score': f1_score(y_test, y_pred),
        'auroc': roc_auc_score(y_test, positive_score),
        'auprc': average_precision_score(y_test, positive_score),
    }

    return pd.DataFrame(metrics, index=['experiment' + str(random_state)])

In [32]:
def featimp(main_pipeline, x_test, random_state):
    """Compute mean absolute SHAP values per transformed feature.

    Parameters
    ----------
    main_pipeline : fitted pipeline
        Must support ``transform`` and expose an ``'estimator'`` step with a
        ``best_estimator_`` attribute (the fitted tree model).
    x_test : pandas.DataFrame
        Raw test features; they are run through the pipeline before being
        explained.
    random_state : int
        Only used to label the row (``'shap_value<random_state>'``).

    Returns
    -------
    pandas.DataFrame
        One row; one column per transformed feature.
    """
    # Transform once and reuse the result for both the explanation and the column names.
    transformed_x_test = main_pipeline.transform(x_test)

    explainer = shap.TreeExplainer(main_pipeline['estimator'].best_estimator_)
    shap_values = explainer.shap_values(transformed_x_test)

    # The return format of shap_values differs between shap releases: a list
    # with one (n_samples, n_features) array per class, a single
    # (n_samples, n_features) array, or an (n_samples, n_features, n_classes)
    # array. Select the positive class (index 1) in each case.
    if isinstance(shap_values, list):
        positive_values = np.asarray(shap_values[1])
    else:
        positive_values = np.asarray(shap_values)
        if positive_values.ndim == 3:
            positive_values = positive_values[:, :, 1]

    vals = np.abs(positive_values).mean(0)
    shap_importance = pd.DataFrame(
        [vals],
        index=['shap_value' + str(random_state)],
        columns=transformed_x_test.columns,
    )

    return shap_importance

In [33]:
def export_to_csv(file_lists):
    """Write each ``(DataFrame, path)`` pair to CSV, appending to existing files.

    A file that does not exist yet is created with a header row; a file that
    already exists gets the rows appended without a header. The index is
    always written and the encoding is ``utf-8-sig``.
    """
    for frame, target in file_lists:
        already_there = os.path.exists(target)
        frame.to_csv(
            target,
            index=True,
            mode='a' if already_there else 'w',
            encoding='utf-8-sig',
            header=not already_there,  # header only when the file is first created
        )

In [34]:
def train(disease_no, score_file_name, featimp_file_name, random_state=0):
    """Run one experiment for one disease and append its results to CSV.

    Seeds NumPy's global RNG, loads the split for ``disease_no``, fits a fresh
    pipeline with class-balanced sample weights, then writes the test-set
    scores to ``score_file_name`` and the SHAP-based feature importances to
    ``featimp_file_name`` via ``export_to_csv``. Returns ``None``.
    """
    np.random.seed(random_state)

    train_x, test_x, train_y, test_y = load_dataset(disease_no=disease_no)
    pipeline = build_pipeline()

    # 'balanced' weights are routed to the fit of the step named 'estimator'.
    balanced_weights = class_weight.compute_sample_weight('balanced', train_y)

    # fit_transform is called for its fitting side effect; the returned frame is discarded.
    pipeline.fit_transform(train_x, train_y, estimator__sample_weight=balanced_weights)
    test_proba = pipeline.predict_proba(test_x)

    results = [
        (scoring(test_y, test_proba, random_state), score_file_name),
        (featimp(pipeline, test_x, random_state), featimp_file_name),
    ]
    export_to_csv(results)

In [35]:
def main(disease_no, path='Outputs/tmp', random_state_list=range(0, 10)):
    """Run ``train`` for one disease once per random state.

    The output directory ``path`` is created if needed. Every run writes to
    the same two files, ``<path>/<disease_no>_eval.csv`` and
    ``<path>/<disease_no>_featimp.csv``.
    """
    os.makedirs(path, exist_ok=True)

    for seed in random_state_list:
        print("Current training: Disease {}, random state: {}".format(disease_no, seed))
        file_stem = path + '/' + str(disease_no)
        train(
            disease_no,
            score_file_name=file_stem + '_eval.csv',
            featimp_file_name=file_stem + '_featimp.csv',
            random_state=seed,
        )

In [36]:
# Run the full experiment (default random states of main) for every disease
# index 0-6, writing results under Outputs/without_healthcheck.
for num in range(0, 7):
    main(num, path='Outputs/without_healthcheck')

Current training: Disease 0, random state: 0
Current training: Disease 0, random state: 1
Current training: Disease 0, random state: 2
Current training: Disease 0, random state: 3
Current training: Disease 0, random state: 4
Current training: Disease 0, random state: 5
Current training: Disease 0, random state: 6
Current training: Disease 0, random state: 7
Current training: Disease 0, random state: 8
Current training: Disease 0, random state: 9
Current training: Disease 1, random state: 0
Current training: Disease 1, random state: 1
Current training: Disease 1, random state: 2
Current training: Disease 1, random state: 3
Current training: Disease 1, random state: 4
Current training: Disease 1, random state: 5
Current training: Disease 1, random state: 6
Current training: Disease 1, random state: 7
Current training: Disease 1, random state: 8
Current training: Disease 1, random state: 9
Current training: Disease 2, random state: 0
Current training: Disease 2, random state: 1
Current tr