# chasv_60model_tpot.v4.1
Train/test split 9:1 (stratified on the outcome within each age group)

In [1]:
# --- configuration -----------------------------------------------------------
SEED = 0     # shared seed: numpy global RNG, train_test_split and TPOT
cores = 16   # NOTE(review): not referenced by any cell visible in this notebook

# Seed numpy's legacy global RNG before anything else is imported or run.
from numpy.random import seed
seed(SEED)

# --- imports -----------------------------------------------------------------
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import KBinsDiscretizer, StandardScaler
from tpot import TPOTClassifier



In [2]:
# Load the development cohort. `id` is read as a string so identifiers are not
# coerced to numbers. The builtin `str` is used instead of `np.str`: the latter
# was only an alias of `str`, deprecated in NumPy 1.20 and removed in 1.24.
df = pd.read_csv("/home/danssa/proj_ua/data/chasv_development.v1.csv", dtype={'id': str})

# Keep every row whose `from` column is not "knhanes".
df2 = df.loc[df['from']!="knhanes"]
df2.describe()

Unnamed: 0,eGFR_ab,eGFR_ckd,male,age,he_uph,he_unitr,he_usg,he_upro,he_uglu,he_uket,he_ubil,he_ubld,he_uro,leucocyte,dm,htn
count,220020.0,220020.0,220020.0,220020.0,220020.0,220020.0,220020.0,220020.0,220020.0,220020.0,220020.0,220020.0,220020.0,220020.0,220020.0,220020.0
mean,0.067598,93.71938,0.428929,47.179829,6.183206,0.02028,1.018628,0.331024,0.234011,0.20559,0.039356,0.757068,0.168417,0.516135,0.025325,0.039046
std,0.251056,22.147902,0.494924,15.570474,0.824044,0.140957,0.007915,0.769946,0.88869,0.669646,0.324305,1.252923,0.560018,1.024153,0.157111,0.193706
min,0.0,1.704754,0.0,18.0,5.0,0.0,1.005,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,80.80301,0.0,35.0,5.5,0.0,1.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,95.81374,0.0,45.0,6.0,0.0,1.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,110.264,1.0,58.0,7.0,0.0,1.025,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
max,1.0,257.176,1.0,95.0,9.0,1.0,1.03,5.0,5.0,5.0,4.0,5.0,5.0,4.0,1.0,1.0


In [3]:
# Three-group age split.

## Step 1: find the age cut points.
# The edges are learned only from rows with eGFR_ab == 1, using a 3-bin
# k-means discretiser on the single `age` column.
abnormal_disc = df2.loc[df2['eGFR_ab'] == 1, ['age']].copy()

est = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='kmeans')
est.fit(abnormal_disc)

ab_disc = est.transform(abnormal_disc).astype('float')
print("edges : ", est.bin_edges_[0])

# bin_edges_[0] holds [min, edge, edge, max] for the one fitted column;
# entries 1 and 2 are the interior cut points, entry 3 is the upper bound.
age0_edge = est.bin_edges_[0][1]
age1_edge = est.bin_edges_[0][2]
age2_edge = est.bin_edges_[0][3]
print('age0_edge:', age0_edge, '\nage1_edge:', age1_edge, '\nage2_edge:', age2_edge)

# Vectorised replacement for the row-wise apply/lambda:
#   age <  age0_edge              -> 0
#   age0_edge <= age < age1_edge  -> 1
#   age >= age1_edge              -> 2
# np.digitize with its default right=False yields exactly these labels.
abnormal_disc['level'] = np.digitize(abnormal_disc['age'].to_numpy(), [age0_edge, age1_edge])
print('age group:\n',abnormal_disc['level'].value_counts().sort_index())

edges :  [18.         53.80721514 70.94886739 95.        ]
age0_edge: 53.80721513971053 
age1_edge: 70.94886739427912 
age2_edge: 95.0
age group:
 0    2104
1    5505
2    7264
Name: level, dtype: int64


In [4]:
## Assign every row of the cohort to one of the three age groups.

df3 = df2.copy()

# Vectorised equivalent of the former row-wise apply/lambda (which ran a Python
# function once per row): 0 if age < age0_edge, 1 if age < age1_edge, else 2.
df3['level'] = np.digitize(df3['age'].to_numpy(), [age0_edge, age1_edge])
print(df3['level'].value_counts())

0    147140
1     52377
2     20503
Name: level, dtype: int64


In [5]:
## age0 group: rows labelled level == 0
X_age0 = df3.loc[df3['level'] == 0]
y_age0 = X_age0['eGFR_ab'].astype("int64")

print("total cases = %d" % len(X_age0))
print("total abnormal function of kidney = %d" % y_age0.sum())

# 90/10 split, stratified on the outcome so both parts keep the same
# proportion of abnormal cases.
X_train0, X_test0, y_train0, y_test0 = train_test_split(
    X_age0,
    y_age0,
    test_size=0.1,
    random_state=SEED,
    stratify=y_age0,
)
print("train0 : %d" % y_train0.sum(), "test0 : %d" % y_test0.sum())

total cases = 147140
total abnormal function of kidney = 2104
train0 : 1894 test0 : 210


In [6]:
## age1 group: rows labelled level == 1
X_age1 = df3[df3['level']==1]
# Cast the label to int64, as the age0 cell does, so the concatenated
# y_train / y_test share a single integer dtype.
y_age1 = X_age1['eGFR_ab'].astype("int64")

print("total cases = %d" %X_age1.shape[0])
print("total abnormal function of kidney = %d" %sum(y_age1))

# 90/10 split, stratified on the outcome.
X_train1, X_test1, y_train1, y_test1 = train_test_split(X_age1, y_age1, test_size=0.1, stratify=y_age1, random_state=SEED)
# Label fixed: this count belongs to the age1 test set (it was printed as "test0").
print("train1 : %d" % sum(y_train1), "test1 : %d" % sum(y_test1))

total cases = 52377
total abnormal function of kidney = 5505
train1 : 4954 test0 : 551


In [7]:
## age2 group: rows labelled level == 2
X_age2 = df3[df3['level']==2]
# Cast the label to int64, as the age0 cell does, so the concatenated
# y_train / y_test share a single integer dtype.
y_age2 = X_age2['eGFR_ab'].astype("int64")

print("total cases = %d" %X_age2.shape[0])
print("total abnormal function of kidney = %d" %sum(y_age2))

# 90/10 split, stratified on the outcome.
X_train2, X_test2, y_train2, y_test2 = train_test_split(X_age2, y_age2, test_size=0.1, stratify=y_age2, random_state=SEED)
print("train2 : %d" % sum(y_train2), "test2 : %d" % sum(y_test2))

total cases = 20503
total abnormal function of kidney = 7264
train2 : 6537 test2 : 727


In [8]:
## Stack the three per-age-group splits back into one train set and one test set.
X_train = pd.concat([X_train0, X_train1, X_train2])
X_test = pd.concat([X_test0, X_test1, X_test2])

y_train = pd.concat([y_train0, y_train1, y_train2])
y_test = pd.concat([y_test0, y_test1, y_test2])

# Sanity print: the totals should match the cohort loaded earlier.
print("total cases = %d" % (len(X_train) + len(X_test)))
print("total abnormal function of kidney = %d" % (y_train.sum() + y_test.sum()))

total cases = 220020
total abnormal function of kidney = 14873


In [9]:
# Model inputs: the contiguous block of columns from 'male' through 'leucocyte'.
# An explicit copy is taken because the standardisation cell assigns into this
# frame; without it that assignment targets a slice of X_train (chained
# assignment, SettingWithCopyWarning).
X_train_features = X_train.loc[:, 'male':'leucocyte'].copy()

print('%d train cases, %d variables' % (X_train_features.shape[0], X_train_features.shape[1]))
print('%d test cases'%X_test.shape[0])

198017 train cases, 12 variables
22003 test cases


In [10]:
# Standardisation: z-score the three continuous predictors, leaving the other
# columns on their original scale.
std_cols = ['age', 'he_uph', 'he_usg']
scaler = StandardScaler()

std_df = X_train_features[std_cols]
# The scaler is fitted on the training rows only.
X_train_features[std_cols] = scaler.fit(std_df).transform(std_df)

X_train_features.describe()

Unnamed: 0,male,age,he_uph,he_unitr,he_usg,he_upro,he_uglu,he_uket,he_ubil,he_ubld,he_uro,leucocyte
count,198017.0,198017.0,198017.0,198017.0,198017.0,198017.0,198017.0,198017.0,198017.0,198017.0,198017.0,198017.0
mean,0.42914,1.607555e-17,3.87679e-16,0.020332,7.31179e-15,0.331451,0.235051,0.206831,0.039593,0.756268,0.168546,0.518011
std,0.494955,1.000003,1.000003,0.141132,1.000003,0.770043,0.89065,0.671309,0.325001,1.252869,0.559666,1.025491
min,0.0,-1.873717,-1.436703,0.0,-1.722663,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,-0.7818369,-0.8297511,0.0,-1.091099,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,-0.1395544,-0.2227995,0.0,0.172029,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,0.6954129,0.9911036,0.0,0.8035928,0.0,0.0,0.0,0.0,1.0,0.0,1.0
max,1.0,3.071858,3.41891,1.0,1.435157,5.0,5.0,5.0,4.0,5.0,5.0,4.0


In [11]:
from collections import Counter

# Class imbalance of the training labels: rounded ratio of label-0 rows to
# label-1 rows. `estimate` is later passed as XGBoost's scale_pos_weight.
counter = Counter(y_train)
n_label0, n_label1 = counter[0], counter[1]
estimate = round(n_label0 / n_label1)

# NOTE(review): `step` is not used by any cell visible in this notebook.
step = round((estimate - 1) / 3)

estimate

14

In [12]:
# Reference: https://dask-cuda.readthedocs.io/en/latest/quickstart.html
from dask.distributed import Client
from dask_cuda import LocalCUDACluster

# Start a local Dask cluster with one worker per GPU (as in the quickstart),
# then attach a client to it.
# NOTE(review): the recorded output warns that a cluster may already be
# running -- confirm no stale cluster is alive before re-running this cell.
cluster = LocalCUDACluster()
client = Client(cluster)

Perhaps you already have a cluster running?
Hosting the HTTP server on port 38721 instead


In [13]:
# Hyper-parameter grid for the only operator offered to TPOT: XGBoost trained
# with the GPU histogram tree method.
xgb_param_grid = {
    'n_estimators': [100, 250 ,500, 750, 1000],
    'learning_rate': [1e-2, 1e-1, 0.3],
    'max_depth': range(2, 11),
    'min_child_weight': range(1, 21),
    'gamma':np.arange(0, 2.01, 0.2),
    'subsample': np.arange(0.2, 1.01, 0.2),
    'colsample_bytree': np.arange(0.4,1.01,0.2),
    "reg_alpha": [0, 0.25, 0.5, 0.75, 1],
    "reg_lambda": [1, 2, 4, 6, 8],
    # Single fixed value: the rounded label-0 / label-1 ratio computed earlier.
    'scale_pos_weight': [estimate],
    'objective': ['binary:logistic'],
    'tree_method' : ['gpu_hist'],
    'n_jobs': [1],
    'verbosity': [0]
}

classifier_config_dict = {
    'xgboost.XGBClassifier': xgb_param_grid,
}

# ROC-AUC scored with 5-fold CV; 100 generations of 100 pipelines each,
# evaluated through Dask.
# NOTE(review): the recorded run of this cell stopped in generation 2 with an
# XGBoostError (CUDA out of memory) -- check free GPU memory / concurrency
# before re-running.
tpot = TPOTClassifier(config_dict=classifier_config_dict,
                      template='Classifier',
                      scoring="roc_auc",
                      cv=5,
                      generations=100,
                      population_size=100,
                      n_jobs=4,
                      use_dask=True,
                      warm_start=False,
                      random_state=SEED,
                      verbosity=3)

# Fit on a deep copy of the feature frame.
training_features = X_train_features.copy(deep=True)
tpot.fit(training_features, y_train)

# NOTE(review): the export path contains a literal '*' in a directory name --
# confirm that this is the intended directory.
tpot.export('/home/danssa/proj_ua/progress/CHA+SV*/60model/chasv_60model.v4.1.py')

1 operators have been imported by TPOT.


Version 0.11.6.post3 of tpot is outdated. Version 0.11.7 was released Wednesday January 06, 2021.


Optimization Progress:   0%|          | 0/10100 [00:00<?, ?pipeline/s]


Generation 1 - Current Pareto front scores:

-1	0.9352215917594261	XGBClassifier(input_matrix, XGBClassifier__colsample_bytree=0.8000000000000002, XGBClassifier__gamma=0.6000000000000001, XGBClassifier__learning_rate=0.01, XGBClassifier__max_depth=2, XGBClassifier__min_child_weight=1, XGBClassifier__n_estimators=750, XGBClassifier__n_jobs=1, XGBClassifier__objective=binary:logistic, XGBClassifier__reg_alpha=0.5, XGBClassifier__reg_lambda=6, XGBClassifier__scale_pos_weight=14, XGBClassifier__subsample=0.2, XGBClassifier__tree_method=gpu_hist, XGBClassifier__verbosity=0)
Pipeline encountered that has previously been evaluated during the optimization process. Using the score from the previous evaluation.

Generation 2 - Current Pareto front scores:

-1	0.9352215917594261	XGBClassifier(input_matrix, XGBClassifier__colsample_bytree=0.8000000000000002, XGBClassifier__gamma=0.6000000000000001, XGBClassifier__learning_rate=0.01, XGBClassifier__max_depth=2, XGBClassifier__min_child_weight=1, X

XGBoostError: [06:20:33] /opt/conda/envs/rapids/conda-bld/xgboost_1607619219243/work/src/tree/updater_gpu_hist.cu:786: Exception in gpu_hist: [06:20:33] /opt/conda/envs/rapids/conda-bld/xgboost_1607619219243/work/src/c_api/../data/../common/device_helpers.cuh:400: Memory allocation error on worker 0: std::bad_alloc: CUDA error at: /home/danssa/anaconda3/envs/rapids-0.17/include/rmm/mr/device/cuda_memory_resource.hpp:69: cudaErrorMemoryAllocation out of memory
- Free memory: 8978432
- Requested memory: 3591368

Stack trace:
  [bt] (0) /home/danssa/anaconda3/envs/rapids-0.17/lib/libxgboost.so(+0x14eb6f) [0x7fae9f7fbb6f]
  [bt] (1) /home/danssa/anaconda3/envs/rapids-0.17/lib/libxgboost.so(dh::detail::ThrowOOMError(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, unsigned long)+0x3ad) [0x7fae9fa354bd]
  [bt] (2) /home/danssa/anaconda3/envs/rapids-0.17/lib/libxgboost.so(dh::detail::XGBDefaultDeviceAllocatorImpl<xgboost::Entry>::allocate(unsigned long)+0x1df) [0x7fae9fa559df]
  [bt] (3) /home/danssa/anaconda3/envs/rapids-0.17/lib/libxgboost.so(thrust::detail::vector_base<xgboost::Entry, dh::detail::XGBDefaultDeviceAllocatorImpl<xgboost::Entry> >::vector_base<__gnu_cxx::__normal_iterator<xgboost::Entry const*, std::vector<xgboost::Entry, std::allocator<xgboost::Entry> > > >(__gnu_cxx::__normal_iterator<xgboost::Entry const*, std::vector<xgboost::Entry, std::allocator<xgboost::Entry> > >, __gnu_cxx::__normal_iterator<xgboost::Entry const*, std::vector<xgboost::Entry, std::allocator<xgboost::Entry> > >)+0x5c) [0x7fae9fa55aac]
  [bt] (4) /home/danssa/anaconda3/envs/rapids-0.17/lib/libxgboost.so(xgboost::common::ProcessBatch(int, xgboost::MetaInfo const&, xgboost::SparsePage const&, unsigned long, unsigned long, xgboost::common::SketchContainer*, int, unsigned long)+0x83) [0x7fae9fa45b73]
  [bt] (5) /home/danssa/anaconda3/envs/rapids-0.17/lib/libxgboost.so(xgboost::common::DeviceSketch(int, xgboost::DMatrix*, int, unsigned long)+0x752) [0x7fae9fa46982]
  [bt] (6) /home/danssa/anaconda3/envs/rapids-0.17/lib/libxgboost.so(xgboost::EllpackPageImpl::EllpackPageImpl(xgboost::DMatrix*, xgboost::BatchParam const&)+0x3a9) [0x7fae9faabe49]
  [bt] (7) /home/danssa/anaconda3/envs/rapids-0.17/lib/libxgboost.so(xgboost::EllpackPage::EllpackPage(xgboost::DMatrix*, xgboost::BatchParam const&)+0x2e) [0x7fae9faac2ee]
  [bt] (8) /home/danssa/anaconda3/envs/rapids-0.17/lib/libxgboost.so(xgboost::data::SimpleDMatrix::GetEllpackBatches(xgboost::BatchParam const&)+0x9b) [0x7fae9f8a3a7b]



Stack trace:
  [bt] (0) /home/danssa/anaconda3/envs/rapids-0.17/lib/libxgboost.so(+0x14eb6f) [0x7fae9f7fbb6f]
  [bt] (1) /home/danssa/anaconda3/envs/rapids-0.17/lib/libxgboost.so(xgboost::tree::GPUHistMakerSpecialised<xgboost::detail::GradientPairInternal<double> >::Update(xgboost::HostDeviceVector<xgboost::detail::GradientPairInternal<float> >*, xgboost::DMatrix*, std::vector<xgboost::RegTree*, std::allocator<xgboost::RegTree*> > const&)+0x763) [0x7fae9fc26623]
  [bt] (2) /home/danssa/anaconda3/envs/rapids-0.17/lib/libxgboost.so(xgboost::gbm::GBTree::BoostNewTrees(xgboost::HostDeviceVector<xgboost::detail::GradientPairInternal<float> >*, xgboost::DMatrix*, int, std::vector<std::unique_ptr<xgboost::RegTree, std::default_delete<xgboost::RegTree> >, std::allocator<std::unique_ptr<xgboost::RegTree, std::default_delete<xgboost::RegTree> > > >*)+0x997) [0x7fae9f8e95e7]
  [bt] (3) /home/danssa/anaconda3/envs/rapids-0.17/lib/libxgboost.so(xgboost::gbm::GBTree::DoBoost(xgboost::DMatrix*, xgboost::HostDeviceVector<xgboost::detail::GradientPairInternal<float> >*, xgboost::PredictionCacheEntry*)+0x106) [0x7fae9f8ea8a6]
  [bt] (4) /home/danssa/anaconda3/envs/rapids-0.17/lib/libxgboost.so(+0x25183d) [0x7fae9f8fe83d]
  [bt] (5) /home/danssa/anaconda3/envs/rapids-0.17/lib/libxgboost.so(XGBoosterUpdateOneIter+0x68) [0x7fae9f802728]
  [bt] (6) /home/danssa/anaconda3/envs/rapids-0.17/lib/python3.8/lib-dynload/../../libffi.so.7(+0x69ed) [0x7fb3c32929ed]
  [bt] (7) /home/danssa/anaconda3/envs/rapids-0.17/lib/python3.8/lib-dynload/../../libffi.so.7(+0x6077) [0x7fb3c3292077]
  [bt] (8) /home/danssa/anaconda3/envs/rapids-0.17/lib/python3.8/lib-dynload/_ctypes.cpython-38-x86_64-linux-gnu.so(_ctypes_callproc+0x2b4) [0x7fb3c32a88b4]



In [14]:
# Export the pipeline held by `tpot` as a Python script. The preceding cell's
# recorded output ends in an XGBoostError, so its own export line was
# presumably never reached; this cell issues the same export again.
export_path = '/home/danssa/proj_ua/progress/CHA+SV*/60model/chasv_60model.v4.1.py'
tpot.export(export_path)