In [1]:
import pandas as pd
import numpy as np
import re
import datetime as dt
from IPython.display import display

from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import roc_auc_score
from sklearn.tree import export_text

from sklearn.ensemble import RandomForestClassifier

import xgboost as xgb


In [191]:
## Install access to UCI data repo
# pip install ucimlrepo

In [3]:
# from ucimlrepo import fetch_ucirepo 
  
# # fetch dataset 
# chronic_kidney_disease = fetch_ucirepo(id=336) 
  
# # data (as pandas dataframes) 
# X = chronic_kidney_disease.data.features 
# y = chronic_kidney_disease.data.targets 
  
# # metadata 
# print(chronic_kidney_disease.metadata) 
  
# # variable information 
# print(chronic_kidney_disease.variables) 


{'uci_id': 336, 'name': 'Chronic Kidney Disease', 'repository_url': 'https://archive.ics.uci.edu/dataset/336/chronic+kidney+disease', 'data_url': 'https://archive.ics.uci.edu/static/public/336/data.csv', 'abstract': 'This dataset can be used to predict the chronic kidney disease and it can be collected from the hospital nearly 2 months of period.', 'area': 'Other', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 400, 'num_features': 24, 'feature_types': ['Real'], 'demographics': ['Age'], 'target_col': ['class'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 2015, 'last_updated': 'Mon Mar 04 2024', 'dataset_doi': '10.24432/C5G020', 'creators': ['L. Rubini', 'P. Soundarapandian', 'P. Eswaran'], 'intro_paper': None, 'additional_info': {'summary': 'We use the following representation to collect the dataset\r\n                        age\t\t-\tage\t\r\n\t\t\tbp\t\t-\tblood pressure\r\n\t\t\tsg\t

In [466]:
## Create Single DataFrame using UCI data fetch package
    # X has Dataframe of features
    # y has Dataframe of prediction variable y
# df = X
# df['class'] = y['class'].to_list()
# df.to_csv('ckd_data.csv')

# df

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wbcc,rbcc,htn,dm,cad,appet,pe,ane,class
0,48.0,80.0,1.020,1.0,0.0,,normal,notpresent,notpresent,121.0,...,44.0,7800.0,5.2,yes,yes,no,good,no,no,ckd
1,7.0,50.0,1.020,4.0,0.0,,normal,notpresent,notpresent,,...,38.0,6000.0,,no,no,no,good,no,no,ckd
2,62.0,80.0,1.010,2.0,3.0,normal,normal,notpresent,notpresent,423.0,...,31.0,7500.0,,no,yes,no,poor,no,yes,ckd
3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,117.0,...,32.0,6700.0,3.9,yes,no,no,poor,yes,yes,ckd
4,51.0,80.0,1.010,2.0,0.0,normal,normal,notpresent,notpresent,106.0,...,35.0,7300.0,4.6,no,no,no,good,no,no,ckd
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,55.0,80.0,1.020,0.0,0.0,normal,normal,notpresent,notpresent,140.0,...,47.0,6700.0,4.9,no,no,no,good,no,no,notckd
396,42.0,70.0,1.025,0.0,0.0,normal,normal,notpresent,notpresent,75.0,...,54.0,7800.0,6.2,no,no,no,good,no,no,notckd
397,12.0,80.0,1.020,0.0,0.0,normal,normal,notpresent,notpresent,100.0,...,49.0,6600.0,5.4,no,no,no,good,no,no,notckd
398,17.0,60.0,1.025,0.0,0.0,normal,normal,notpresent,notpresent,114.0,...,51.0,7200.0,5.9,no,no,no,good,no,no,notckd


In [27]:
## Read ckd_data.csv file (derived from UCI data fetch)

df = pd.read_csv('ckd_data.csv')
del df['Unnamed: 0']
df.head(3)

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wbcc,rbcc,htn,dm,cad,appet,pe,ane,class
0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,121.0,...,44.0,7800.0,5.2,yes,yes,no,good,no,no,ckd
1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,,...,38.0,6000.0,,no,no,no,good,no,no,ckd
2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,423.0,...,31.0,7500.0,,no,yes,no,poor,no,yes,ckd


# EDA and Data Preparation

In [21]:
### Attribute Information about features and Y prediction in the dataset
    ## taken from UCI ML Repository .info  page

In [352]:
df.iloc[:, :-1]    ## X Features

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,hemo,pcv,wbcc,rbcc,htn,dm,cad,appet,pe,ane
0,48.0,80.0,1.020,1.0,0.0,,normal,notpresent,notpresent,121.0,...,15.4,44.0,7800.0,5.2,yes,yes,no,good,no,no
1,7.0,50.0,1.020,4.0,0.0,,normal,notpresent,notpresent,,...,11.3,38.0,6000.0,,no,no,no,good,no,no
2,62.0,80.0,1.010,2.0,3.0,normal,normal,notpresent,notpresent,423.0,...,9.6,31.0,7500.0,,no,yes,no,poor,no,yes
3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,117.0,...,11.2,32.0,6700.0,3.9,yes,no,no,poor,yes,yes
4,51.0,80.0,1.010,2.0,0.0,normal,normal,notpresent,notpresent,106.0,...,11.6,35.0,7300.0,4.6,no,no,no,good,no,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,55.0,80.0,1.020,0.0,0.0,normal,normal,notpresent,notpresent,140.0,...,15.7,47.0,6700.0,4.9,no,no,no,good,no,no
396,42.0,70.0,1.025,0.0,0.0,normal,normal,notpresent,notpresent,75.0,...,16.5,54.0,7800.0,6.2,no,no,no,good,no,no
397,12.0,80.0,1.020,0.0,0.0,normal,normal,notpresent,notpresent,100.0,...,15.8,49.0,6600.0,5.4,no,no,no,good,no,no
398,17.0,60.0,1.025,0.0,0.0,normal,normal,notpresent,notpresent,114.0,...,14.2,51.0,7200.0,5.9,no,no,no,good,no,no


In [354]:
df.iloc[:,[-1]]   ## y prediction variable

Unnamed: 0,class
0,ckd
1,ckd
2,ckd
3,ckd
4,ckd
...,...
395,notckd
396,notckd
397,notckd
398,notckd


In [9]:

df['class'].unique()   ## there are tabs of white-space within this variable

array(['ckd', 'ckd\t', 'notckd'], dtype=object)

In [11]:
df[df['class']=="ckd\t"]   ## MUST REMOVE WhiteSpace "\t"

Unnamed: 0.1,Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,...,pcv,wbcc,rbcc,htn,dm,cad,appet,pe,ane,class
37,37,72.0,80.0,,,,,,notpresent,notpresent,...,28.0,6900.0,2.5,yes,yes,no,poor,no,yes,ckd\t
230,230,65.0,60.0,1.01,2.0,0.0,normal,abnormal,present,notpresent,...,,9500.0,,yes,yes,no,poor,no,no,ckd\t


In [13]:
df['class'].str.strip().unique()

array(['ckd', 'notckd'], dtype=object)

In [366]:
df.head(2)

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wbcc,rbcc,htn,dm,cad,appet,pe,ane,class
0,48.0,80.0,1.02,1.0,0.0,999999,normal,notpresent,notpresent,121.0,...,44.0,7800.0,5.2,yes,yes,no,good,no,no,ckd
1,7.0,50.0,1.02,4.0,0.0,999999,normal,notpresent,notpresent,999999.0,...,38.0,6000.0,999999.0,no,no,no,good,no,no,ckd


In [29]:
## Check if feature columns also contain "\t" white space
white_space = {}

for cols in df.columns:
    if df[cols].astype(str).str.contains("\t").sum() > 0:
        white_space[cols] = df[df[cols].astype(str).str.contains("\t")][cols]
    else:
        pass
    
white_space
    

{'dm': 188    \tno
 Name: dm, dtype: object,
 'class': 37     ckd\t
 230    ckd\t
 Name: class, dtype: object}

### "\t" whitespace is found in both some X features and in a few observations for y_prediction variables
  Will remove whitespace

In [31]:
df = df.map(lambda x: x.strip() if isinstance(x, str) else x )

### Will replace null values into weighted median value of that feature from total "True" and total "False" observations in dataset

In [19]:
## Numerical columns will contain median value of that feature that is weighted by total "True" and Total "False"

## median calculations for "True" ckd
(df[df['class']=="ckd"]).describe().iloc[5, :]

Unnamed: 0     124.500
age             59.000
bp              80.000
sg               1.015
al               2.000
su               0.000
bgr            143.500
bu              53.000
sc               2.250
sod            136.000
pot              4.300
hemo            10.900
pcv             33.000
wbcc          8800.000
rbcc             3.900
Name: 50%, dtype: float64

In [33]:
## save median values for "false" ckd to a dictionary
    ## will be used to map onto Null values where applicable
ckd_median = (df[df['class']=="ckd"]).describe().iloc[5,:]
ckd_median = ckd_median.to_dict()
ckd_median

{'age': 59.0,
 'bp': 80.0,
 'sg': 1.015,
 'al': 2.0,
 'su': 0.0,
 'bgr': 143.5,
 'bu': 53.0,
 'sc': 2.25,
 'sod': 136.0,
 'pot': 4.3,
 'hemo': 10.9,
 'pcv': 33.0,
 'wbcc': 8800.0,
 'rbcc': 3.9}

In [420]:
(df[df['class']=="notckd"]).describe()

Unnamed: 0,age,bp,sg,al,su,bgr,bu,sc,sod,pot,hemo,pcv,wbcc,rbcc
count,149.0,148.0,145.0,145.0,145.0,144.0,144.0,145.0,145.0,145.0,144.0,146.0,143.0,143.0
mean,46.516779,71.351351,1.022414,0.0,0.0,107.722222,32.798611,0.868966,141.731034,4.337931,15.188194,46.335616,7705.594406,5.379021
std,15.631138,8.543498,0.002507,0.0,0.0,18.56474,11.450459,0.255092,4.817871,0.587257,1.277536,4.134426,1839.770968,0.596097
min,12.0,60.0,1.02,0.0,0.0,70.0,10.0,0.4,135.0,3.3,13.0,40.0,4300.0,4.4
25%,34.0,60.0,1.02,0.0,0.0,93.75,23.75,0.6,138.0,3.7,14.1,43.0,6300.0,4.9
50%,46.0,70.0,1.02,0.0,0.0,107.5,33.0,0.9,141.0,4.5,15.0,46.0,7500.0,5.3
75%,58.0,80.0,1.025,0.0,0.0,123.25,44.0,1.1,146.0,4.9,16.2,50.0,9250.0,5.9
max,80.0,80.0,1.025,0.0,0.0,140.0,50.0,1.2,150.0,5.0,17.8,54.0,11000.0,6.5


In [35]:
## save median values for "false" ckd to a dictionary
    ## will be used to map onto Null values where applicable
notckd_median = (df[df['class']=="notckd"]).describe().iloc[5,:]
notckd_median = notckd_median.to_dict()
notckd_median

{'age': 46.0,
 'bp': 70.0,
 'sg': 1.02,
 'al': 0.0,
 'su': 0.0,
 'bgr': 107.5,
 'bu': 33.0,
 'sc': 0.9,
 'sod': 141.0,
 'pot': 4.5,
 'hemo': 15.0,
 'pcv': 46.0,
 'wbcc': 7500.0,
 'rbcc': 5.3}

In [37]:
df['class'].value_counts()

class
ckd       250
notckd    150
Name: count, dtype: int64

In [39]:
df.shape[0]

400

In [41]:
true_pct = df['class'].value_counts().iloc[0] / df.shape[0]
false_pct = df['class'].value_counts().iloc[1] / df.shape[0]

print(true_pct)
print(false_pct)

0.625
0.375


In [47]:
## CALCULATE NEW DICTIONARY WITH WEIGHTED MEDIAN VALUES WHEN "TRUE" AND "FALSE" PRESENT FOR CKD

weighted_fill_values = {}

for key in ckd_median.keys():
    weighted_median = (ckd_median[key] * true_pct) +\
    (notckd_median[key] * false_pct)

    weighted_fill_values[key] = weighted_median

In [49]:
weighted_fill_values   ## these are the median weighted values per numerical columns in the dataset

{'age': 54.125,
 'bp': 76.25,
 'sg': 1.016875,
 'al': 1.25,
 'su': 0.0,
 'bgr': 130.0,
 'bu': 45.5,
 'sc': 1.74375,
 'sod': 137.875,
 'pot': 4.375,
 'hemo': 12.4375,
 'pcv': 37.875,
 'wbcc': 8312.5,
 'rbcc': 4.425}

In [51]:
df.isnull().sum()

age        9
bp        12
sg        47
al        46
su        49
rbc      152
pc        65
pcc        4
ba         4
bgr       44
bu        19
sc        17
sod       87
pot       88
hemo      52
pcv       71
wbcc     106
rbcc     131
htn        2
dm         2
cad        2
appet      1
pe         1
ane        1
class      0
dtype: int64

In [53]:
## Fill Null values with weighted median calculated values for numerical Null occurrences per column
df = df.fillna(value=weighted_fill_values)

### Categorical Null Values

Most Categorical Columns have few occurrences of Null values, these rows can be removed from entire dataset.

Columns "rbc" and "pc" can be removed entirely since those columns have a substantial amount of Null values

In [55]:
df.isnull().sum()   ## only Categorical Null values are present    

age        0
bp         0
sg         0
al         0
su         0
rbc      152
pc        65
pcc        4
ba         4
bgr        0
bu         0
sc         0
sod        0
pot        0
hemo       0
pcv        0
wbcc       0
rbcc       0
htn        2
dm         2
cad        2
appet      1
pe         1
ane        1
class      0
dtype: int64

In [57]:
del df['rbc']
del df['pc']


In [59]:
### Delete remaining rows with Null Values in Categorical Columns
df = df.dropna()

In [61]:
### Copy Dataframe "df"
    ## rows from DataFrame with Null categorical values
df1 = df.copy()

print(df1.shape[0])
df1.head(3)

393


Unnamed: 0,age,bp,sg,al,su,pcc,ba,bgr,bu,sc,...,pcv,wbcc,rbcc,htn,dm,cad,appet,pe,ane,class
0,48.0,80.0,1.02,1.0,0.0,notpresent,notpresent,121.0,36.0,1.2,...,44.0,7800.0,5.2,yes,yes,no,good,no,no,ckd
1,7.0,50.0,1.02,4.0,0.0,notpresent,notpresent,130.0,18.0,0.8,...,38.0,6000.0,4.425,no,no,no,good,no,no,ckd
2,62.0,80.0,1.01,2.0,3.0,notpresent,notpresent,423.0,53.0,1.8,...,31.0,7500.0,4.425,no,yes,no,poor,no,yes,ckd


In [63]:
## Verify No Null Values
df1.isnull().sum()

age      0
bp       0
sg       0
al       0
su       0
pcc      0
ba       0
bgr      0
bu       0
sc       0
sod      0
pot      0
hemo     0
pcv      0
wbcc     0
rbcc     0
htn      0
dm       0
cad      0
appet    0
pe       0
ane      0
class    0
dtype: int64

# Prepare y_prediction variable

In [517]:
# df1['class'].unique()

array(['1', '0'], dtype=object)

In [515]:
# df1['class'] = np.where((df1['class']=="ckd"), '1', '0')

In [525]:
# df1['class'] = df1['class'].astype(int)

In [527]:
# df1['class'].unique()

array([1, 0])

# Find Most Important Features

In [531]:
## find most important features
    ### Use Scikit-learn API wrapper "XGBClassifier"
dv = DictVectorizer(sparse=False)

X_features = df1.iloc[: :-1]
X_dicts = X_features.to_dict(orient='records')
X_vector = dv.fit_transform(X_dicts)
y_prediction = df1.iloc[:,-1].values

model = xgb.XGBClassifier()
model.fit(X_vector, y_prediction)
model.feature_importances_


array([0.02010769, 0.01195716, 0.01176336, 0.        , 0.01579635,
       0.        , 0.02020901, 0.        , 0.01865145, 0.01513068,
       0.01904527, 0.00979893, 0.        , 0.5983011 , 0.00773508,
       0.        , 0.01360436, 0.01916808, 0.        , 0.0181212 ,
       0.        , 0.01966519, 0.04551147, 0.        , 0.01979158,
       0.00962759, 0.01520665, 0.01937743, 0.02217956, 0.02869093,
       0.02055987], dtype=float32)

In [535]:
### Combine Features into DataFrame and sort based on highest ranked features
feature_names = dv.get_feature_names_out()
importances = model.feature_importances_


In [541]:
df_features = pd.DataFrame(
    {'features': feature_names,
    'importance': importances}
)

df_features.sort_values('importance', ascending=False)

Unnamed: 0,features,importance
13,class,0.598301
22,pe=no,0.045511
29,su,0.028691
28,sod,0.02218
30,wbcc,0.02056
6,ba=notpresent,0.020209
0,age,0.020108
24,pot,0.019792
21,pcv,0.019665
27,sg,0.019377


# Train Models

Try baseline Decision Tree model

Try Random Forest Model

Try XGBoost model

In [124]:
# df1.to_csv('df1.csv')
df1 = pd.read_csv('df1.csv')

try:
    del df1['Unnamed: 0']
except:
    pass

df1.isnull().sum()

age      0
bp       0
sg       0
al       0
su       0
pcc      0
ba       0
bgr      0
bu       0
sc       0
sod      0
pot      0
hemo     0
pcv      0
wbcc     0
rbcc     0
htn      0
dm       0
cad      0
appet    0
pe       0
ane      0
class    0
dtype: int64

In [87]:
### Prepare predictor variable
df1['class'] = np.where((df1['class']=="ckd"), '1', '0')

df1['class']= df1['class'].astype(int)

### Train / Val / Test split the dataset

In [126]:
df_full_train, df_test = train_test_split(df1, test_size=0.2, random_state=3)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=3)

In [128]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [130]:
y_train = df_train['class'].values
y_val = df_val['class'].values
y_test = df_test['class'].values

In [132]:
del df_train['class']
del df_val['class']
del df_test['class']

### Decision Tree Classifier -- The Baseline Model

In [114]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import roc_auc_score
from sklearn.tree import export_text

In [138]:
### TRAIN DECISION TREE CLASSIFIER

## save training features to dictionary
train_dicts = df_train.to_dict(orient='records')

## instantiate Dictionary Vectorizer
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform (train_dicts)

## instantiate decision tree classifier model
dt = DecisionTreeClassifier()

## fit model 
dt.fit(X_train, y_train)

In [144]:
### PREDICT WITH DECISION TREE CLASSIFIER ON VALIDATION SET

## create dictionary for validation data set features
val_dicts = df_val.to_dict(orient='records')

## vectorize dictionary of features
X_val = dv.transform(val_dicts)

## pass vectorize validation set features to decision tree model and predict
y_pred = dt.predict_proba(X_val)[:,1]

## calculate ROC AUC score on validation set predictions compared to actual validation predictions
roc_auc_score(y_val, y_pred)

0.939795918367347

In [148]:
## Training Score is near perfect, meaning the DT model may have just memorized all of the splits
y_pred = dt.predict_proba(X_train)[:,1]

roc_auc_score(y_train, y_pred)

1.0

### Random Forest Model

In [152]:
from sklearn.ensemble import RandomForestClassifier

In [158]:
### instantiate a Random Forest Classifier model (baseline)
rf = RandomForestClassifier(n_estimators=10, random_state=3, n_jobs=-1)

### fit vectorized features and prediction variables onto model
rf.fit(X_train, y_train)

### Predict with model on Validation dataset
y_pred = rf.predict_proba(X_val)[:, 1]

### cauclate ROC AUC score
roc_auc_score(y_val, y_pred)

## has pretty good score on validation data set



0.9979591836734695

In [None]:
### Tune Random Forest model
    ## tune different parameters for number of trees and their max depth for nodes

In [162]:
## predict on test dataset
test_dicts = df_test.to_dict(orient='records')

## vectorize dictionary of features
X_test = dv.transform(test_dicts)

## pass vectorize validation set features to decision tree model and predict
y_pred = dt.predict_proba(X_test)[:,1]

## calculate ROC AUC score on validation set predictions compared to actual validation predictions
roc_auc_score(y_test, y_pred)

0.9731292517006802

# XGBoost Classifier Model

In [167]:
features = list(dv.get_feature_names_out())

### Create dmatrices needed to fit onto XGBoost model
dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=features)
dval = xgb.DMatrix(X_val, label=y_val, feature_names=features)

In [169]:
### Create XGBoost model and train on training data

xgb_params = {
    'eta': 0.3,                                   ## learning rate
    'max_depth': 6,                               ## max number of nodes that split in each decision tree
    'min_child_weight': 1,                        ## min number of observations per node
    
    'objective': 'binary:logistic',
    'nthread': 8,                                ## can be "n_jobs=-1"  which means use all available CPUC cores
    
    'seed': 1,
    'verbosity': 1,                                     ## num_boost_round is the number of Trees that boost and learn to fix residual errors of previous trees before them
}

model = xgb.train(xgb_params, dtrain, num_boost_round=10)

In [171]:
### Predict on validation data set
y_pred = model.predict(dval)

## Assess Model Accuracy using ROC AUC score
roc_auc_score(y_val, y_pred)      

0.9952380952380953

### Observation: XGBOOST model yields the best ROC AUC score on validation set

In [176]:
### Verify XGBoost results on test data set


## get feature names
features = list(dv.get_feature_names_out())

## predict on test dataset
test_dicts = df_test.to_dict(orient='records')

## vectorize dictionary of features
X_test = dv.transform(test_dicts)

### Create dmatrix on test dataset
dtest = xgb.DMatrix(X_test, label=y_test, feature_names=features)

### Predict on validation data set
y_pred = model.predict(dtest)

## Assess Model Accuracy using ROC AUC score
roc_auc_score(y_test, y_pred) 

0.9965986394557823

#### XGBoost Classifier is by far the best model

# Tune XGBoost Classifier model to try and achieve best ROC AUC Score

Want to achieve best ROC AUC score on validation and test dataset and supass score of 0.9965986 using baseline XGBoost model with light parameters set

Will tune ETA learning rate, max_depth and min_child_weight parameters, and num_boost_rounds

In [None]:
#### BASELINE XGBOOST MODEL

xgb_params = {
    'eta': 0.3,                                   ## learning rate
    'max_depth': 6,                               ## max number of nodes that split in each decision tree
    'min_child_weight': 1,                        ## min number of observations per node
    
    'objective': 'binary:logistic',
    'nthread': 8,                                ## can be "n_jobs=-1"  which means use all available CPUC cores
    
    'seed': 1,
    'verbosity': 1,                                     ## num_boost_round is the number of Trees that boost and learn to fix residual errors of previous trees before them
}

model = xgb.train(xgb_params, dtrain, num_boost_round=10)

### Further improvement on baseline XGBoost model

In [184]:
### 100 ROUNDS OF TRAINING ON BASELINE MODEL

xgb_params = {
    'eta': 0.3,                                   ## learning rate
    'max_depth': 6,                               ## max number of nodes that split in each decision tree
    'min_child_weight': 1,                        ## min number of observations per node
    
    'objective': 'binary:logistic',
    'nthread': 8,                                ## can be "n_jobs=-1"  which means use all available CPUC cores
    
    'seed': 1,
    'verbosity': 1,                                     ## num_boost_round is the number of Trees that boost and learn to fix residual errors of previous trees before them
}

model = xgb.train(xgb_params, dtrain, num_boost_round=100)

## prediction
y_pred = model.predict(dval)

## roc auc score
roc_auc_score(y_val, y_pred)    ### further improvement on XGBoost baseline model

0.9972789115646259

In [187]:
## double check on score of test dataset

y_pred = model.predict(dtest)

roc_auc_score(y_test, y_pred)

0.9979591836734695

### 

In [189]:
### 100 ROUNDS OF TRAINING ON BASELINE MODEL with ETA 0.1

xgb_params = {
    'eta': 0.1,                                   ## learning rate
    'max_depth': 6,                               ## max number of nodes that split in each decision tree
    'min_child_weight': 1,                        ## min number of observations per node
    
    'objective': 'binary:logistic',
    'nthread': 8,                                ## can be "n_jobs=-1"  which means use all available CPUC cores
    
    'seed': 1,
    'verbosity': 1,                                     ## num_boost_round is the number of Trees that boost and learn to fix residual errors of previous trees before them
}

model = xgb.train(xgb_params, dtrain, num_boost_round=100)

## prediction
y_pred = model.predict(dval)

## roc auc score
roc_auc_score(y_val, y_pred)    ### further improvement on XGBoost baseline model

0.9965986394557823

In [191]:
## check ROC AUC score on test dataset

y_pred = model.predict(dtest)

roc_auc_score(y_test, y_pred)

0.998639455782313

In [193]:
### Validation ROC AUC score drops when learning rate is reduced possibly requiring more boost rounds to fully learn

In [199]:
### 150 ROUNDS OF TRAINING ON BASELINE MODEL with ETA of 0.1

xgb_params = {
    'eta': 0.1,                                   ## learning rate
    'max_depth': 6,                               ## max number of nodes that split in each decision tree
    'min_child_weight': 1,                        ## min number of observations per node
    
    'objective': 'binary:logistic',
    'nthread': 8,                                ## can be "n_jobs=-1"  which means use all available CPUC cores
    
    'seed': 1,
    'verbosity': 1,                                     ## num_boost_round is the number of Trees that boost and learn to fix residual errors of previous trees before them
}

model = xgb.train(xgb_params, dtrain, num_boost_round=150)

## prediction
y_pred = model.predict(dval)

## roc auc score
roc_auc_score(y_val, y_pred)    ### further improvement on XGBoost baseline model

0.9972789115646259

In [234]:
### 150 ROUNDS OF TRAINING ON BASELINE MODEL with ETA of 0.1, and max_depth of 10

xgb_params = {
    'eta': 0.1,                                   ## learning rate
    'max_depth': 10,                               ## max number of nodes that split in each decision tree
    'min_child_weight': 1,                        ## min number of observations per node
    
    'objective': 'binary:logistic',
    'nthread': 8,                                ## can be "n_jobs=-1"  which means use all available CPUC cores
    
    'seed': 1,
    'verbosity': 1,                                     ## num_boost_round is the number of Trees that boost and learn to fix residual errors of previous trees before them
}

model = xgb.train(xgb_params, dtrain, num_boost_round=150)

## prediction
y_pred = model.predict(dval)

## roc auc score
roc_auc_score(y_val, y_pred)    ### further im

0.9972789115646259

In [236]:
### check on test dataset
y_pred = model.predict(dtest)

roc_auc_score(y_test, y_pred)

0.9979591836734695

### Best ROC AUC Score achieved with XGBoost Classifier Model. Validation ROC AUC score = 0.9973, and Test ROC AUC score = 0.998. Model has following parameters:

eta learning rate = 0.1

max_depth = 10

min_child_weight = 1

num_boost_rounds = 150


# Switch to Sklearn's XGBClassifier model for smoother API interaction 

No longer need to fit a DMatrix onto xgboost, can only fit a DictVectorizer onto model

In [249]:
### create Sklearn compatible XGBClassifier
from xgboost import XGBClassifier

## import pipeline
from sklearn.pipeline import make_pipeline

In [241]:
## create dictionaries for all feature datasets
train_dicts = df_train.to_dict(orient='records')
val_dicts = df_val.to_dict(orient='records')
test_dicts = df_test.to_dict(orient='records')

In [247]:
### Create Model with winning parameters
xgb_clf = XGBClassifier(
    learning_rate=0.1,        # eta
    max_depth=10,
    min_child_weight=1,
    n_estimators=150,          # num_boost_round
    objective='binary:logistic',
    n_jobs=8,                  # nthread
    random_state=1,            # seed
    verbosity=1,
)

In [251]:
## Create Pipeline
pipeline = make_pipeline(
    DictVectorizer(sparse=False),
    xgb_clf
)

In [253]:
### Train XGBClassifer model
pipeline.fit(train_dicts, y_train)

In [259]:
### Generate prediction
y_pred = pipeline.predict_proba(val_dicts)[:, 1]

## generate ROC AUC score
roc_auc_score(y_val, y_pred)

0.9972789115646259

In [261]:
### Generate prediction on test dataset
y_pred = pipeline.predict_proba(test_dicts)[:, 1]

roc_auc_score(y_test, y_pred)

0.9979591836734695

In [263]:
## Save out model using Pickle
import pickle

with open("pipeline_v2.bin", "wb") as f_out:
    pickle.dump(pipeline, f_out)

### EDA to create Pydantic Schema for FAST API

In [270]:
### look at typical JSON values that will be passed to model in FAST API
train_dicts[:3]

[{'age': 73.0,
  'bp': 70.0,
  'sg': 1.005,
  'al': 0.0,
  'su': 0.0,
  'pcc': 'notpresent',
  'ba': 'notpresent',
  'bgr': 70.0,
  'bu': 32.0,
  'sc': 0.9,
  'sod': 125.0,
  'pot': 4.0,
  'hemo': 10.0,
  'pcv': 29.0,
  'wbcc': 18900.0,
  'rbcc': 3.5,
  'htn': 'yes',
  'dm': 'yes',
  'cad': 'no',
  'appet': 'good',
  'pe': 'yes',
  'ane': 'no'},
 {'age': 34.0,
  'bp': 60.0,
  'sg': 1.02,
  'al': 0.0,
  'su': 0.0,
  'pcc': 'notpresent',
  'ba': 'notpresent',
  'bgr': 117.0,
  'bu': 28.0,
  'sc': 2.2,
  'sod': 138.0,
  'pot': 3.8,
  'hemo': 12.4375,
  'pcv': 37.875,
  'wbcc': 8312.5,
  'rbcc': 4.425,
  'htn': 'no',
  'dm': 'no',
  'cad': 'no',
  'appet': 'good',
  'pe': 'yes',
  'ane': 'no'},
 {'age': 68.0,
  'bp': 80.0,
  'sg': 1.015,
  'al': 0.0,
  'su': 0.0,
  'pcc': 'notpresent',
  'ba': 'notpresent',
  'bgr': 171.0,
  'bu': 30.0,
  'sc': 1.0,
  'sod': 137.875,
  'pot': 4.375,
  'hemo': 13.7,
  'pcv': 43.0,
  'wbcc': 4900.0,
  'rbcc': 5.2,
  'htn': 'no',
  'dm': 'yes',
  'cad': 'no',

In [272]:
## need column types for relevant constraints in FAST API
df_train.dtypes

age      float64
bp       float64
sg       float64
al       float64
su       float64
pcc       object
ba        object
bgr      float64
bu       float64
sc       float64
sod      float64
pot      float64
hemo     float64
pcv      float64
wbcc     float64
rbcc     float64
htn       object
dm        object
cad       object
appet     object
pe        object
ane       object
dtype: object

In [274]:
### See Unique Values for Categorical Columns
    ### these will become constraints in FAST API 
for col in df_train.select_dtypes(include=['object']).columns:
    print(col, df_train[col].unique())

pcc ['notpresent' 'present']
ba ['notpresent' 'present']
htn ['yes' 'no']
dm ['yes' 'no']
cad ['no' 'yes']
appet ['good' 'poor']
pe ['yes' 'no']
ane ['no' 'yes']


# TEST SENDING REQUEST TO FAST API HOSTED IN DOCKER CONTAINER

In [277]:
import requests

url = "http://localhost:9696/predict"

payload = {
    "age": 73.0,
    "bp": 70.0,
    "sg": 1.005,
    "al": 0.0,
    "su": 0.0,
    "pcc": "notpresent",
    "ba": "notpresent",
    "bgr": 70.0,
    "bu": 32.0,
    "sc": 0.9,
    "sod": 125.0,
    "pot": 4.0,
    "hemo": 10.0,
    "pcv": 29.0,
    "wbcc": 18900.0,
    "rbcc": 3.5,
    "htn": "yes",
    "dm": "yes",
    "cad": "no",
    "appet": "good",
    "pe": "yes",
    "ane": "no"
}

response = requests.post(url, json=payload)

print(response.json())


{'ckd_probability': 0.9905727505683899, 'ckd': True}


In [279]:
df_train.shape

(235, 22)