# chasv_60model_tpot.v3.4, age 35-75
split 7:3

In [1]:
SEED = 0
cores = 16
from numpy.random import seed
seed(SEED)
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tpot import TPOTClassifier
from sklearn.preprocessing import KBinsDiscretizer



In [2]:
df = pd.read_csv("/home/danssa/proj_ua/data/chasv_development.v1.csv", dtype={'id':np.str}).query('age>=35&age<75')
df2 = df.loc[df['from']!="knhanes"]
df2.describe()

Unnamed: 0,eGFR_ab,eGFR_ckd,male,age,he_uph,he_unitr,he_usg,he_upro,he_uglu,he_uket,he_ubil,he_ubld,he_uro,leucocyte,dm,htn
count,154841.0,154841.0,154841.0,154841.0,154841.0,154841.0,154841.0,154841.0,154841.0,154841.0,154841.0,154841.0,154841.0,154841.0,154841.0,154841.0
mean,0.060262,90.906962,0.458206,50.713377,6.156848,0.018161,1.018472,0.306405,0.25727,0.173526,0.035314,0.736995,0.15472,0.454744,0.033066,0.050432
std,0.237972,19.873652,0.498252,11.012321,0.829076,0.133532,0.007956,0.748006,0.937064,0.605177,0.308274,1.2243,0.524733,0.970793,0.17881,0.218836
min,0.0,1.704754,0.0,35.0,5.0,0.0,1.005,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,79.11799,0.0,41.0,5.5,0.0,1.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,93.15899,0.0,49.0,6.0,0.0,1.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,105.837633,1.0,59.0,7.0,0.0,1.025,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
max,1.0,257.176,1.0,74.0,9.0,1.0,1.03,5.0,5.0,5.0,4.0,5.0,4.0,4.0,1.0,1.0


In [3]:
#3group age split  

##step 1 finding edge value
abnormal_disc = df2.query('eGFR_ab==1').loc[:,'age']
abnormal_disc = pd.DataFrame(abnormal_disc)

est = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='kmeans')
est.fit(abnormal_disc)

ab_disc = est.transform(abnormal_disc).astype('float')
print("edges : ", est.bin_edges_[0])

age0_edge = est.bin_edges_[0][1]
age1_edge = est.bin_edges_[0][2]
age2_edge = est.bin_edges_[0][3]
print('age0_edge:', age0_edge, '\nage1_edge:', age1_edge, '\nage2_edge:', age2_edge)

abnormal_disc['level'] = abnormal_disc.apply(lambda x : 0 if x['age']<age0_edge else 1 if x['age']<age1_edge else 2, axis=1)
print('age group:\n',abnormal_disc['level'].value_counts().sort_index())

edges :  [35.         51.66352511 63.73024576 74.        ]
age0_edge: 51.663525113906786 
age1_edge: 63.73024575557389 
age2_edge: 74.0
age group:
 0    1443
1    2751
2    5137
Name: level, dtype: int64


In [4]:
##make 3group by age

df3 = df2.copy()

df3['level'] = df3.apply(lambda x : 0 if x['age']<age0_edge else 1 if x['age']<age1_edge else 2, axis=1)
print(df3['level'].value_counts())

0    86910
1    42377
2    25554
Name: level, dtype: int64


In [5]:
##age0 group
X_age0 = df3[df3['level']==0]
y_age0 = X_age0['eGFR_ab'].astype("int64")

print("total cases = %d" %X_age0.shape[0])
print("total abnormal function of kidney = %d" %sum(y_age0))

X_train0, X_test0, y_train0, y_test0 = train_test_split(X_age0, y_age0, test_size=0.3, stratify=y_age0, random_state=SEED)
print("train0 : %d" % sum(y_train0), "test0 : %d" % sum(y_test0))

total cases = 86910
total abnormal function of kidney = 1443
train0 : 1010 test0 : 433


In [6]:
##age1 group
X_age1 = df3[df3['level']==1]
y_age1 = X_age1['eGFR_ab']

print("total cases = %d" %X_age1.shape[0])
print("total abnormal function of kidney = %d" %sum(y_age1))

X_train1, X_test1, y_train1, y_test1 = train_test_split(X_age1, y_age1, test_size=0.3, stratify=y_age1, random_state=SEED)
print("train1 : %d" % sum(y_train1), "test0 : %d" % sum(y_test1))

total cases = 42377
total abnormal function of kidney = 2751
train1 : 1926 test0 : 825


In [7]:
##age2 group
X_age2 = df3[df3['level']==2]
y_age2 = X_age2['eGFR_ab']

print("total cases = %d" %X_age2.shape[0])
print("total abnormal function of kidney = %d" %sum(y_age2))

X_train2, X_test2, y_train2, y_test2 = train_test_split(X_age2, y_age2, test_size=0.3, stratify=y_age2, random_state=SEED)
print("train2 : %d" % sum(y_train2), "test2 : %d" % sum(y_test2))

total cases = 25554
total abnormal function of kidney = 5137
train2 : 3596 test2 : 1541


In [8]:
##concat both trainset and testset
X_train = pd.concat([X_train0, X_train1, X_train2])
y_train = pd.concat([y_train0, y_train1, y_train2])

X_test = pd.concat([X_test0, X_test1, X_test2])
y_test = pd.concat([y_test0, y_test1, y_test2])

print("total cases = %d" % (X_train.shape[0] + X_test.shape[0]))
print("total abnormal function of kidney = %d" % (sum(y_train) + sum(y_test)))

total cases = 154841
total abnormal function of kidney = 9331


In [9]:
X_train_features = X_train.loc[:, 'male':'leucocyte']

print('%d train cases, %d variables' % (X_train_features.shape[0], X_train_features.shape[1]))
print('%d test cases'%X_test.shape[0])

108387 train cases, 12 variables
46454 test cases


In [10]:
#standardization

scaler = StandardScaler()
std_cols=['age','he_uph','he_usg']
std_df=X_train_features[std_cols]

X_train_features[std_cols]=scaler.fit_transform(std_df)
X_train_features.describe()

Unnamed: 0,male,age,he_uph,he_unitr,he_usg,he_upro,he_uglu,he_uket,he_ubil,he_ubld,he_uro,leucocyte
count,108387.0,108387.0,108387.0,108387.0,108387.0,108387.0,108387.0,108387.0,108387.0,108387.0,108387.0,108387.0
mean,0.459483,-1.762148e-16,2.055839e-16,0.018129,-2.518645e-14,0.303828,0.254809,0.171866,0.035124,0.735596,0.154954,0.452942
std,0.498358,1.000005,1.000005,0.13342,1.000005,0.745698,0.931188,0.600703,0.305977,1.22178,0.52562,0.968983
min,0.0,-1.428323,-1.398607,0.0,-1.693318,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,-0.8829833,-0.7948687,0.0,-1.064578,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,-0.1558632,-0.1911305,0.0,0.1929025,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,0.7530369,1.016346,0.0,0.8216426,0.0,0.0,0.0,0.0,1.0,0.0,0.0
max,1.0,2.116387,3.431298,1.0,1.450383,5.0,5.0,5.0,4.0,5.0,4.0,4.0


In [11]:
from collections import Counter
counter = Counter(y_train)
estimate = round(counter[0]/counter[1])
step = round((estimate - 1)/3)
estimate

16

In [12]:
#https://dask-cuda.readthedocs.io/en/latest/quickstart.html

from dask_cuda import LocalCUDACluster
from dask.distributed import Client

# Create a Dask Cluster with one worker per GPU
cluster = LocalCUDACluster()
client = Client(cluster)

Perhaps you already have a cluster running?
Hosting the HTTP server on port 39527 instead


In [13]:
classifier_config_dict = {

    # xgboost tree method = gpu hist
    
    'xgboost.XGBClassifier': {
        'n_estimators': [100, 250 ,500, 750, 1000],
        'learning_rate': [1e-2, 1e-1, 0.3],
        'max_depth': range(2, 11),
        'min_child_weight': range(1, 21),
        'gamma':np.arange(0, 2.01, 0.2),
        'subsample': np.arange(0.2, 1.01, 0.2),
        'colsample_bytree': np.arange(0.4,1.01,0.2),
        "reg_alpha": [0, 0.25, 0.5, 0.75, 1],
        "reg_lambda": [1, 2, 4, 6, 8],
        'scale_pos_weight': [estimate],
        'objective': ['binary:logistic'],
        'tree_method' : ['gpu_hist'],
        'n_jobs': [1],
        'verbosity': [0]
    },

}

tpot = TPOTClassifier(scoring="roc_auc",
                      cv=5,
                      random_state=SEED,
                      n_jobs=4,
                      verbosity=3,
                      generations=100,
                      population_size=100,
                      use_dask=True,
                      warm_start=False,
                      config_dict=classifier_config_dict,
                      template='Classifier')

training_features=X_train_features.copy(deep=True)
tpot.fit(training_features, y_train)

tpot.export('/home/danssa/proj_ua/progress/CHA+SV*/60model/chasv_60model.v3.4_age3575.py')

1 operators have been imported by TPOT.


Version 0.11.6.post3 of tpot is outdated. Version 0.11.7 was released Wednesday January 06, 2021.


Optimization Progress:   0%|          | 0/10100 [00:00<?, ?pipeline/s]


Generation 1 - Current Pareto front scores:

-1	0.8435419705563717	XGBClassifier(input_matrix, XGBClassifier__colsample_bytree=1.0000000000000002, XGBClassifier__gamma=0.4, XGBClassifier__learning_rate=0.01, XGBClassifier__max_depth=2, XGBClassifier__min_child_weight=5, XGBClassifier__n_estimators=750, XGBClassifier__n_jobs=1, XGBClassifier__objective=binary:logistic, XGBClassifier__reg_alpha=1, XGBClassifier__reg_lambda=2, XGBClassifier__scale_pos_weight=16, XGBClassifier__subsample=1.0, XGBClassifier__tree_method=gpu_hist, XGBClassifier__verbosity=0)

Generation 2 - Current Pareto front scores:

-1	0.8435419705563717	XGBClassifier(input_matrix, XGBClassifier__colsample_bytree=1.0000000000000002, XGBClassifier__gamma=0.4, XGBClassifier__learning_rate=0.01, XGBClassifier__max_depth=2, XGBClassifier__min_child_weight=5, XGBClassifier__n_estimators=750, XGBClassifier__n_jobs=1, XGBClassifier__objective=binary:logistic, XGBClassifier__reg_alpha=1, XGBClassifier__reg_lambda=2, XGBClassifi

In [16]:
tpot.export('/home/danssa/proj_ua/progress/CHA+SV*/60model/chasv_60model.v3.4_age3575.py')