In [1]:
# Switch the working directory to the project root. Later cells use paths relative to it
# (e.g. "Data/phr_data_dropped.csv"); presumably the local modules imported further down
# (model_flow, constant, utils, main) also live here -- confirm.
%cd /root/share/Real/KAIST/700G_new/
# Reload imported modules automatically before each cell execution, so edits to the
# local .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

/root/share/Real/KAIST/700G_new


In [2]:
import sklearn 
# Record the installed scikit-learn version; a later cell calls
# sklearn.set_config(transform_output="pandas"), which it notes needs sklearn >= 1.2.0.
print(sklearn.__version__)

1.2.2


In [3]:
!pip install git+https://github.com/VLL-HD/FrEIA.git tensorboard pytorch-ignite==0.4.2

Collecting git+https://github.com/VLL-HD/FrEIA.git
  Cloning https://github.com/VLL-HD/FrEIA.git to /tmp/pip-req-build-umfh2kxm
  Running command git clone --filter=blob:none --quiet https://github.com/VLL-HD/FrEIA.git /tmp/pip-req-build-umfh2kxm
  Resolved https://github.com/VLL-HD/FrEIA.git to commit a4d3a7db135460e4dd11d4fd7f24b1c97fe7c0d3
  Preparing metadata (setup.py) ... [?25ldone
[0m

In [4]:
import numpy as np
import pandas as pd
import random

from sklearn.preprocessing import LabelEncoder
import sklearn
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.utils import class_weight
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.decomposition import PCA

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, average_precision_score

import FrEIA.framework as Ff
import FrEIA.modules as Fm

import torch
import torch.nn as nn
import torch.nn.functional as F

import model_flow
import constant as const
import utils
import main

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Make scikit-learn transformers return pandas DataFrames instead of NumPy arrays.
# build_pipeline() depends on this: its ColumnTransformer selects columns by name
# after the scaler and imputer steps, so those steps must keep the column labels.
# Requires python version >= 3.8, sklearn version >= 1.2.0
sklearn.set_config(transform_output="pandas")

In [6]:
def init_seeds(seed=0):
    """Seed every random number generator this notebook relies on.

    The same seed is handed, in order, to Python's ``random`` module, NumPy's
    global generator, and PyTorch's CPU and CUDA generators.

    Args:
        seed: Integer seed passed to each generator. Defaults to 0.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

In [26]:
def load_dataset(disease_no, test_size=None):
    """Load the PHR table and build a normal-only train set plus a mixed test set.

    Rows are labelled diseased/normal by a threshold rule on the target
    variables of the selected disease. Only *normal* rows are split into train
    and test (stratified by ``gender``); every diseased row is appended to the
    test set. The target variables themselves are removed from the features,
    and the condition columns (``age``, ``gender``) are returned separately.

    Args:
        disease_no: Integer key (0-6) selecting the disease definition, i.e.
            which target variables and which threshold rule are used.
        test_size: Fraction of the normal rows held out for the test set.
            ``None`` (default) uses ``1 - const.TRAIN_RATIO``, which is what
            this function always used before the argument was honored.

    Returns:
        Tuple ``(x_train, x_test, y_train, y_test, c_train, c_test)`` of
        DataFrames: features without target/condition columns, a single-column
        ``'diseases'`` label frame (0.0 normal, 1.0 diseased; ``y_train`` is all
        zeros), and the unscaled condition columns.

    Raises:
        KeyError: If ``disease_no`` is not one of the defined diseases.

    Note:
        The split uses no explicit ``random_state``; it draws from NumPy's
        global generator, so call ``init_seeds`` first for reproducible splits.
    """
    target_variables = {
        0: ['bmi', 'height', 'weight', 'waist'],
        1: ['blood_sugar'],
        2: ['neutral_fat'],
        3: ['hdl', 'ldl'],  # neutral fat and cholesterol are controversial
        4: ['got', 'gpt'],  # gamma gtp
        5: ['hemoglobin'],
        6: ['max_bp', 'min_bp'],
    }

    # Rules are wrapped in lambdas so only the selected one is evaluated.
    # NOTE(review): rules 2 and 4 compare against log1p(threshold), so those
    # columns are presumably stored log1p-transformed -- confirm against the CSV.
    # NOTE(review): rule 5 adds `gender` to hemoglobin, which presumably encodes
    # a sex-specific cut-off via a 0/1 gender column -- confirm the encoding.
    criterion = {
        0: lambda d: d['bmi'] >= 25,
        1: lambda d: d['blood_sugar'] >= 126,
        2: lambda d: d['neutral_fat'] >= np.log1p(200),
        3: lambda d: (d['hdl'] < 40) | (d['ldl'] >= 160),
        4: lambda d: (d['got'] > np.log1p(40)) | (d['gpt'] > np.log1p(40)),
        5: lambda d: d['hemoglobin'] + d['gender'] <= 13,  # modified from <13
        6: lambda d: (d['max_bp'] >= 130) | (d['min_bp'] >= 80),
    }

    # Look up before reading the file so an invalid disease_no fails fast.
    targets = target_variables[disease_no]
    if test_size is None:
        test_size = 1 - const.TRAIN_RATIO

    df = pd.read_csv("Data/phr_data_dropped.csv")

    # Dropping NA values in target variables
    df = df.dropna(subset=targets)

    # Dropping genetic features
    df = df.drop(df.columns[:64], axis=1)

    # Boolean label per row. Used directly as a mask: re-encoding the labels
    # (as the previous LabelEncoder step did) would turn a lone class 1 into 0.
    is_diseased = np.where(criterion[disease_no](df), 1, 0) == 1

    # Leaving lifelog and survey etc.; only the target variables are removed.
    features = df.drop(columns=targets)
    x_normal = features[~is_diseased]
    x_diseases = features[is_diseased]

    y_normal = pd.DataFrame(np.zeros(len(x_normal)), columns=['diseases'], index=x_normal.index)
    y_diseases = pd.DataFrame(np.ones(len(x_diseases)), columns=['diseases'], index=x_diseases.index)

    # Only normal rows are split; the model is trained on normal data alone.
    x_train, x_test, y_train, y_test = train_test_split(
        x_normal, y_normal, test_size=test_size, stratify=x_normal['gender']
    )

    # Every diseased row goes to the test set.
    x_test = pd.concat([x_test, x_diseases])
    y_test = pd.concat([y_test, y_diseases])

    # Conditions are not scaled.
    con_vec = ["age", "gender"]

    c_train = x_train[con_vec]
    c_test = x_test[con_vec]

    # Non-inplace drops: x_train/x_test may be views of the split input.
    x_train = x_train.drop(columns=con_vec)
    x_test = x_test.drop(columns=con_vec)

    return x_train, x_test, y_train, y_test, c_train, c_test

In [27]:
def build_pipeline():
    """Create the (unfitted) preprocessing pipeline applied to the feature frames.

    Steps, in order:
        1. ``'scaler'``: ``RobustScaler`` over all columns.
        2. ``'imputer'``: ``IterativeImputer`` to fill missing values.
        3. ``'pca_for_smoking'``: a ``ColumnTransformer`` that compresses four
           groups of related columns with PCA (1 component for each of the
           three smoking groups, 2 for the recent-symptom group) and passes
           every other column through unchanged.

    The column groups are selected by name, so the preceding steps must emit
    DataFrames (this notebook enables that globally via ``sklearn.set_config``).

    Returns:
        sklearn.pipeline.Pipeline: The assembled, unfitted pipeline.
    """
    smoking_cols = ['have_smoking', 'smoking_duration_all', 'smoking_all_count']
    secondhand_home_cols = [
        'secondary_smoking_home',
        'secondary_smoking_home_count_per_week',
        'secondary_smoking_duration_home',
        'secondary_smoking_hour_home',
    ]
    secondhand_work_cols = [
        'secondary_smoking_work',
        'secondary_smoking_work_per_week',
        'secondary_smoking_duration_work',
        'secondary_smoking_hour_work',
    ]
    recent_symptom_cols = [
        'last2week_symptom_decreasedintertest_in_last2weeks',
        'last2week_symptom_depressed_in_last2weeks',
        'last2week_symptom_sleepdisorder_in_last2weeks',
        'last2week_symptom_tiredness_in_last2weeks',
        'last2week_symptom_eatingdisorder_in_last2weeks',
        'last2week_symptom_discourage_in_last2weeks',
        'last2week_symptom_decreasedconcentration_in_last2weeks',
        'last2week_symptom_anxious_in_last2weeks',
        'last2week_symptom_selfharm_in_last2weeks',
    ]

    # One PCA per column group; columns outside the groups are kept as they are.
    grouped_pca = ColumnTransformer(
        transformers=[
            ('pca_smoking_all', PCA(n_components=1), smoking_cols),
            ('pca_secondary_smoking_home', PCA(n_components=1), secondhand_home_cols),
            ('pca_secondary_smoking_work', PCA(n_components=1), secondhand_work_cols),
            ('pca_recent_symptom', PCA(n_components=2), recent_symptom_cols),
        ],
        remainder='passthrough',
    )

    steps = [
        ('scaler', RobustScaler()),
        ('imputer', IterativeImputer()),
        ('pca_for_smoking', grouped_pca),
    ]
    return Pipeline(steps=steps)

In [29]:
def train(num, random_state):
    """Run one full experiment: seed, load, preprocess, build the model, train.

    Args:
        num: Disease number forwarded to ``load_dataset(disease_no=num)``.
        random_state: Seed passed to ``init_seeds`` before any data handling, so
            the split and the model initialisation are tied to it.

    Returns:
        None. Training and evaluation are delegated to ``main.train``.
    """
    init_seeds(random_state)

    # y_train is not needed: the training set contains normal samples only.
    x_train, x_test, _, y_test, c_train, c_test = load_dataset(disease_no=num)

    # Fit the preprocessing on the training features only, then reuse it on test.
    preproc_pipeline = build_pipeline()
    x_train = preproc_pipeline.fit_transform(x_train)
    x_test = preproc_pipeline.transform(x_test)

    # Use the GPU when present; fall back to CPU instead of crashing otherwise.
    # NOTE(review): main.train may itself assume CUDA -- confirm before relying
    # on the CPU fallback.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    def to_device(frame):
        # DataFrame -> float32 tensor on the selected device.
        return torch.FloatTensor(frame.values).to(device)

    x_train = to_device(x_train)
    c_train = to_device(c_train)
    x_test = to_device(x_test)
    c_test = to_device(c_test)

    model = model_flow.CD_Flow(
        dim_features=x_train.shape[1],
        flow_steps=const.FLOW_STEPS,
        # Derived from the condition matrix so it follows load_dataset's columns.
        cond_dims=c_train.shape[1],
    ).to(device)

    # y_test is passed through as returned by load_dataset (not converted).
    main.train(model, x_train, x_test, y_test, c_train, c_test, multiple=True)

In [38]:
import shutil

# Remove results of a previous sweep, if any. Best-effort: a missing directory or
# a failed delete is ignored, but unlike a bare `except:` this does not swallow
# KeyboardInterrupt or unrelated errors.
# NOTE(review): '_experiments' is presumably where main.train writes its logs -- confirm.
shutil.rmtree('_experiments', ignore_errors=True)

# One training run per seed: 42, 43, ..., 91 (50 seeds), all for disease number 1.
for ran in range(42, 92):
    print("\n\n------RANDOM STATE:", ran, "------")
    train(1, ran)



------RANDOM STATE: 42 ------


2023-05-25 18:30:21,367 eval INFO: [epoch 0] [AUROC 0.8298284449363594] [AUPRC 0.08810499277578981]
2023-05-25 18:30:22,875 eval INFO: [epoch 1] [AUROC 0.817653569452131] [AUPRC 0.08085667301109026]
2023-05-25 18:30:24,383 eval INFO: [epoch 2] [AUROC 0.8049252905368011] [AUPRC 0.07541374264849963]
2023-05-25 18:30:25,876 eval INFO: [epoch 3] [AUROC 0.794133923630327] [AUPRC 0.0703175108607919]
2023-05-25 18:30:27,376 eval INFO: [epoch 4] [AUROC 0.7866629773104593] [AUPRC 0.06786533564942676]
2023-05-25 18:30:28,876 eval INFO: [epoch 5] [AUROC 0.7899833978970666] [AUPRC 0.06844387111358076]
2023-05-25 18:30:30,401 eval INFO: [epoch 6] [AUROC 0.7921970116214719] [AUPRC 0.06874731391239597]
2023-05-25 18:30:31,912 eval INFO: [epoch 7] [AUROC 0.7960708356391809] [AUPRC 0.06991917081036776]
2023-05-25 18:30:33,418 eval INFO: [epoch 8] [AUROC 0.796624239070282] [AUPRC 0.06995644899160598]
2023-05-25 18:30:34,923 eval INFO: [epoch 9] [AUROC 0.7971776425013832] [AUPRC 0.07031031889082964]
2023

KeyboardInterrupt: 