# Chronic Kidney Disease Prediction

## Data Preprocessing

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Force pandas to show all the columns.
# None removes the column limit entirely, so the setting keeps working
# regardless of how many columns the loaded frame has.

pd.set_option('display.max_columns', None)

In [3]:
# Load the imputed chronic-kidney-disease CSV into the working DataFrame
DATA_PATH = '../data/chronic_kidney_disease_imputed.csv'
df = pd.read_csv(DATA_PATH)

In [4]:
# Print each column's name, non-null count and dtype for the freshly loaded frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 26 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  400 non-null    int64  
 1   age         400 non-null    float64
 2   bp          400 non-null    float64
 3   sg          400 non-null    float64
 4   al          400 non-null    float64
 5   su          400 non-null    float64
 6   rbc         400 non-null    object 
 7   pc          400 non-null    object 
 8   pcc         400 non-null    object 
 9   ba          400 non-null    object 
 10  bgr         400 non-null    float64
 11  bu          400 non-null    float64
 12  sc          400 non-null    float64
 13  sod         400 non-null    float64
 14  pot         400 non-null    float64
 15  hemo        400 non-null    float64
 16  pcv         400 non-null    float64
 17  wbcc        400 non-null    float64
 18  rbcc        400 non-null    float64
 19  htn         400 non-null    o

In [5]:
# Drop the unnamed (id) column.
# The column is removed by name rather than by position, and a missing column
# is ignored, so re-running this cell never deletes a real feature column.

df = df.drop(columns="Unnamed: 0", errors="ignore")

In [6]:
# Replace the abbreviated headers with descriptive names.
# The assignment is positional: the list must contain one name per column,
# in the same order as the columns currently in df.
# NOTE(review): "peda_edema" presumably means pedal edema; the spelling is
# kept because it becomes a model feature name.

friendly_names = [
    "age", "blood_pressure", "specific_gravity", "albumin", "sugar",
    "red_blood_cells", "pus_cell", "pus_cell_clumps", "bacteria",
    "blood_glucose_random", "blood_urea", "serum_creatinine", "sodium",
    "potassium", "haemoglobin", "packed_cell_volume",
    "white_blood_cell_count", "red_blood_cell_count",
    "hypertension", "diabetes_mellitus", "coronary_artery_disease",
    "appetite", "peda_edema", "anemia", "class",
]
df.columns = friendly_names
len(df.columns)

25

In [7]:
# (rows, columns) of the frame after dropping the id column
df.shape

(400, 25)

In [8]:
# Preview the first 10 rows under the renamed headers
df.head(10)

Unnamed: 0,age,blood_pressure,specific_gravity,albumin,sugar,red_blood_cells,pus_cell,pus_cell_clumps,bacteria,blood_glucose_random,blood_urea,serum_creatinine,sodium,potassium,haemoglobin,packed_cell_volume,white_blood_cell_count,red_blood_cell_count,hypertension,diabetes_mellitus,coronary_artery_disease,appetite,peda_edema,anemia,class
0,48.0,80.0,1.02,1.0,0.0,normal,normal,notpresent,notpresent,121.0,36.0,1.2,147.0,5.0,15.4,44.0,7800.0,5.2,yes,yes,no,good,no,no,ckd
1,7.0,50.0,1.02,4.0,0.0,normal,normal,notpresent,notpresent,123.0,18.0,0.8,138.0,4.0,11.3,38.0,6000.0,4.5,no,no,no,good,no,no,ckd
2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,423.0,53.0,1.8,141.0,4.0,9.6,31.0,7500.0,5.5,no,yes,no,poor,no,yes,ckd
3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,117.0,56.0,3.8,111.0,2.5,11.2,32.0,6700.0,3.9,yes,no,no,poor,yes,yes,ckd
4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,106.0,26.0,1.4,145.0,4.9,11.6,35.0,7300.0,4.6,no,no,no,good,no,no,ckd
5,60.0,90.0,1.015,3.0,0.0,normal,normal,notpresent,notpresent,74.0,25.0,1.1,142.0,3.2,12.2,39.0,7800.0,4.4,yes,yes,no,good,yes,no,ckd
6,68.0,70.0,1.01,0.0,0.0,normal,normal,notpresent,notpresent,100.0,54.0,24.0,104.0,4.0,12.4,36.0,10400.0,5.2,no,no,no,good,no,no,ckd
7,24.0,80.0,1.015,2.0,4.0,normal,abnormal,notpresent,notpresent,410.0,31.0,1.1,140.0,4.2,12.4,44.0,6900.0,5.0,no,yes,no,good,yes,no,ckd
8,52.0,100.0,1.015,3.0,0.0,normal,abnormal,present,notpresent,138.0,60.0,1.9,150.0,2.9,10.8,33.0,9600.0,4.0,yes,yes,no,good,no,yes,ckd
9,53.0,90.0,1.02,2.0,0.0,abnormal,abnormal,present,notpresent,70.0,107.0,7.2,114.0,3.7,9.5,29.0,12100.0,3.7,yes,yes,no,poor,no,yes,ckd


In [9]:
# Partition the columns in one pass:
#   cat_cols - columns stored with the "object" dtype (categorical text values)
#   num_cols - every other column (numeric data)
cat_cols, num_cols = [], []
for col in df.columns:
    if df[col].dtype == "object":
        cat_cols.append(col)
    else:
        num_cols.append(col)

In [10]:
# Display the names of the object-dtype (categorical) columns
cat_cols

['red_blood_cells',
 'pus_cell',
 'pus_cell_clumps',
 'bacteria',
 'hypertension',
 'diabetes_mellitus',
 'coronary_artery_disease',
 'appetite',
 'peda_edema',
 'anemia',
 'class']

In [11]:
# Display the names of the non-object (numeric) columns
num_cols

['age',
 'blood_pressure',
 'specific_gravity',
 'albumin',
 'sugar',
 'blood_glucose_random',
 'blood_urea',
 'serum_creatinine',
 'sodium',
 'potassium',
 'haemoglobin',
 'packed_cell_volume',
 'white_blood_cell_count',
 'red_blood_cell_count']

In [12]:
# Count missing values per column and list the columns with the most nulls first

null_counts = df.isna().sum()
null_counts.sort_values(ascending=False)

age                        0
potassium                  0
anemia                     0
peda_edema                 0
appetite                   0
coronary_artery_disease    0
diabetes_mellitus          0
hypertension               0
red_blood_cell_count       0
white_blood_cell_count     0
packed_cell_volume         0
haemoglobin                0
sodium                     0
blood_pressure             0
serum_creatinine           0
blood_urea                 0
blood_glucose_random       0
bacteria                   0
pus_cell_clumps            0
pus_cell                   0
red_blood_cells            0
sugar                      0
albumin                    0
specific_gravity           0
class                      0
dtype: int64

In [13]:
# List the distinct values present in each categorical column

for name in cat_cols:
    distinct_values = df[name].unique()
    print(f"{name} has {distinct_values} values\n")

red_blood_cells has ['normal' 'abnormal'] values

pus_cell has ['normal' 'abnormal'] values

pus_cell_clumps has ['notpresent' 'present'] values

bacteria has ['notpresent' 'present'] values

hypertension has ['yes' 'no'] values

diabetes_mellitus has ['yes' 'no'] values

coronary_artery_disease has ['no' 'yes'] values

appetite has ['good' 'poor'] values

peda_edema has ['no' 'yes'] values

anemia has ['no' 'yes'] values

class has ['ckd' 'notckd'] values



In [14]:
# Per-column missing-value counts, largest first
# (same check as the earlier null-count cell)

df.isnull().sum().sort_values(ascending=False)

age                        0
potassium                  0
anemia                     0
peda_edema                 0
appetite                   0
coronary_artery_disease    0
diabetes_mellitus          0
hypertension               0
red_blood_cell_count       0
white_blood_cell_count     0
packed_cell_volume         0
haemoglobin                0
sodium                     0
blood_pressure             0
serum_creatinine           0
blood_urea                 0
blood_glucose_random       0
bacteria                   0
pus_cell_clumps            0
pus_cell                   0
red_blood_cells            0
sugar                      0
albumin                    0
specific_gravity           0
class                      0
dtype: int64

In [15]:
# Preview the first rows (head() defaults to 5) before the target is encoded
df.head()

Unnamed: 0,age,blood_pressure,specific_gravity,albumin,sugar,red_blood_cells,pus_cell,pus_cell_clumps,bacteria,blood_glucose_random,blood_urea,serum_creatinine,sodium,potassium,haemoglobin,packed_cell_volume,white_blood_cell_count,red_blood_cell_count,hypertension,diabetes_mellitus,coronary_artery_disease,appetite,peda_edema,anemia,class
0,48.0,80.0,1.02,1.0,0.0,normal,normal,notpresent,notpresent,121.0,36.0,1.2,147.0,5.0,15.4,44.0,7800.0,5.2,yes,yes,no,good,no,no,ckd
1,7.0,50.0,1.02,4.0,0.0,normal,normal,notpresent,notpresent,123.0,18.0,0.8,138.0,4.0,11.3,38.0,6000.0,4.5,no,no,no,good,no,no,ckd
2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,423.0,53.0,1.8,141.0,4.0,9.6,31.0,7500.0,5.5,no,yes,no,poor,no,yes,ckd
3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,117.0,56.0,3.8,111.0,2.5,11.2,32.0,6700.0,3.9,yes,no,no,poor,yes,yes,ckd
4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,106.0,26.0,1.4,145.0,4.9,11.6,35.0,7300.0,4.6,no,no,no,good,no,no,ckd


In [16]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

Unnamed: 0,age,blood_pressure,specific_gravity,albumin,sugar,blood_glucose_random,blood_urea,serum_creatinine,sodium,potassium,haemoglobin,packed_cell_volume,white_blood_cell_count,red_blood_cell_count
count,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0
mean,51.675,76.475,1.0174,1.0225,0.46,149.1025,58.1805,3.139375,137.69375,4.5755,12.42675,38.775,8355.5,4.69825
std,17.022008,13.519665,0.005751,1.34033,1.10972,79.588742,50.853468,5.826721,9.67799,2.843433,2.938579,8.810985,2913.260625,1.039964
min,2.0,50.0,1.005,0.0,0.0,22.0,1.5,0.4,4.5,2.5,3.1,9.0,2200.0,2.1
25%,42.0,70.0,1.01,0.0,0.0,100.0,27.0,0.9,135.0,3.8,10.275,32.75,6500.0,3.9
50%,55.0,80.0,1.02,0.0,0.0,122.0,42.0,1.3,138.0,4.4,12.6,40.0,7900.0,4.7
75%,64.0,80.0,1.02,2.0,0.0,163.0,66.0,2.8,142.0,4.9,14.925,45.0,9800.0,5.4
max,90.0,180.0,1.025,5.0,5.0,490.0,391.0,76.0,163.0,47.0,17.8,54.0,26400.0,8.0


In [17]:
# Column names, non-null counts and dtypes after the id column was dropped and headers renamed
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 25 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   age                      400 non-null    float64
 1   blood_pressure           400 non-null    float64
 2   specific_gravity         400 non-null    float64
 3   albumin                  400 non-null    float64
 4   sugar                    400 non-null    float64
 5   red_blood_cells          400 non-null    object 
 6   pus_cell                 400 non-null    object 
 7   pus_cell_clumps          400 non-null    object 
 8   bacteria                 400 non-null    object 
 9   blood_glucose_random     400 non-null    float64
 10  blood_urea               400 non-null    float64
 11  serum_creatinine         400 non-null    float64
 12  sodium                   400 non-null    float64
 13  potassium                400 non-null    float64
 14  haemoglobin              4

In [18]:
# Converting class values to numeric class (ckd -> 0, notckd -> 1)

class_codes = {"ckd": 0, "notckd": 1}

# Map only while the column still holds the raw text labels. Series.map turns
# every value missing from the dict into NaN, so re-running this cell on the
# already-encoded 0/1 column would otherwise wipe out the target.
if not pd.api.types.is_numeric_dtype(df["class"]):
    unexpected = set(df["class"].unique()) - set(class_codes)
    if unexpected:
        # Fail loudly instead of letting unknown labels become NaN silently
        raise ValueError(f"Unexpected class labels: {sorted(map(str, unexpected))}")
    df["class"] = df["class"].map(class_codes)

In [19]:
# Inspect the target column after mapping: 0 = ckd, 1 = notckd
df["class"]

0      0
1      0
2      0
3      0
4      0
      ..
395    1
396    1
397    1
398    1
399    1
Name: class, Length: 400, dtype: int64

## Feature Encoding

In [20]:
# Report how many distinct categories each column listed in cat_cols holds
for name in cat_cols:
    n_categories = df[name].nunique()
    print(f"{name} has {n_categories} categories\n")

red_blood_cells has 2 categories

pus_cell has 2 categories

pus_cell_clumps has 2 categories

bacteria has 2 categories

hypertension has 2 categories

diabetes_mellitus has 2 categories

coronary_artery_disease has 2 categories

appetite has 2 categories

peda_edema has 2 categories

anemia has 2 categories

class has 2 categories



In [21]:
# Encode the data into numeric data

from sklearn.preprocessing import LabelEncoder

# One fitted encoder per column, so every category <-> integer mapping stays
# available afterwards (label_encoders[col].classes_) and can be reused to
# encode new data or decode values. A single shared encoder would only
# remember the last column it was fitted on.
label_encoders = {}

for col in cat_cols:
    # Columns that are already numeric (such as "class", mapped to 0/1 earlier)
    # need no encoding; fitting on them would only record an integer->integer
    # mapping.
    if pd.api.types.is_numeric_dtype(df[col]):
        continue
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

In [22]:
# Preview the first rows after the categorical columns were label-encoded
df.head()

Unnamed: 0,age,blood_pressure,specific_gravity,albumin,sugar,red_blood_cells,pus_cell,pus_cell_clumps,bacteria,blood_glucose_random,blood_urea,serum_creatinine,sodium,potassium,haemoglobin,packed_cell_volume,white_blood_cell_count,red_blood_cell_count,hypertension,diabetes_mellitus,coronary_artery_disease,appetite,peda_edema,anemia,class
0,48.0,80.0,1.02,1.0,0.0,1,1,0,0,121.0,36.0,1.2,147.0,5.0,15.4,44.0,7800.0,5.2,1,1,0,0,0,0,0
1,7.0,50.0,1.02,4.0,0.0,1,1,0,0,123.0,18.0,0.8,138.0,4.0,11.3,38.0,6000.0,4.5,0,0,0,0,0,0,0
2,62.0,80.0,1.01,2.0,3.0,1,1,0,0,423.0,53.0,1.8,141.0,4.0,9.6,31.0,7500.0,5.5,0,1,0,1,0,1,0
3,48.0,70.0,1.005,4.0,0.0,1,0,1,0,117.0,56.0,3.8,111.0,2.5,11.2,32.0,6700.0,3.9,1,0,0,1,1,1,0
4,51.0,80.0,1.01,2.0,0.0,1,1,0,0,106.0,26.0,1.4,145.0,4.9,11.6,35.0,7300.0,4.6,0,0,0,0,0,0,0


## Model Building

In [23]:
# Features are every column except the target; the target is the encoded "class" column
target_col = "class"
X = df.drop(columns=target_col)
y = df[target_col]

In [24]:
# Hold out 30% of the rows for testing; random_state pins the shuffle so the
# same split is produced on every run

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [25]:
# Training features: (rows, feature columns)
X_train.shape

(280, 24)

In [26]:
# Training target: one label per training row
y_train.shape

(280,)

In [27]:
# Test features: (rows, feature columns)
X_test.shape

(120, 24)

In [28]:
# Test target: one label per test row
y_test.shape

(120,)

### 1. Using Decision Trees

In [29]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# A fixed random_state makes the fitted tree, and therefore every metric below,
# reproducible across kernel restarts.
decision_tree = DecisionTreeClassifier(random_state=0)
clf = decision_tree.fit(X_train, y_train)

# Predict once per split and reuse the results for all metrics
dt_train_pred = decision_tree.predict(X_train)
dt_test_pred = decision_tree.predict(X_test)

print(f"Training Accuracy: {accuracy_score(y_train, dt_train_pred)}")
print(f"Test Accuracy: {accuracy_score(y_test, dt_test_pred)} \n")

# Label 0 is "ckd" and label 1 is "notckd" (see the class mapping cell)
print(f"Confusion Matrix: \n{confusion_matrix(y_test, dt_test_pred)}\n")
print(f"Classification Report: \n{classification_report(y_test, dt_test_pred)}")

Training Accuracy: 1.0
Test Accuracy: 0.9333333333333333 

Confusion Matrix: 
[[67  5]
 [ 3 45]]

Classification Report: 
              precision    recall  f1-score   support

           0       0.96      0.93      0.94        72
           1       0.90      0.94      0.92        48

    accuracy                           0.93       120
   macro avg       0.93      0.93      0.93       120
weighted avg       0.93      0.93      0.93       120



#### Leave-one-out cross-validation

In [30]:
from sklearn import model_selection

# Leave-one-out: each row is predicted by a model trained on all the other rows
loo_splitter = model_selection.LeaveOneOut()
predictions = model_selection.cross_val_predict(decision_tree, X, y, cv=loo_splitter)

# Fraction of rows whose held-out prediction matches the true label
print((predictions == y).mean())

0.9625


### 2. Using Random Forest

In [31]:
from sklearn.ensemble import RandomForestClassifier

# A fixed random_state pins the bootstrap samples and feature sub-sampling,
# so the fitted forest and the metrics below are reproducible across runs.
random_forest_classifier = RandomForestClassifier(
    criterion = "entropy",
    max_depth = 11,
    min_samples_leaf = 2,
    min_samples_split = 3,
    n_estimators = 130,
    random_state = 0
)
random_forest_classifier.fit(X_train, y_train)

# Predict once per split and reuse the results for all metrics
rf_train_pred = random_forest_classifier.predict(X_train)
rf_test_pred = random_forest_classifier.predict(X_test)

print(f"Training Accuracy: {accuracy_score(y_train, rf_train_pred)}")
print(f"Test Accuracy: {accuracy_score(y_test, rf_test_pred)} \n")

# Label 0 is "ckd" and label 1 is "notckd" (see the class mapping cell)
print(f"Confusion Matrix: \n{confusion_matrix(y_test, rf_test_pred)}\n")
print(f"Classification Report: \n{classification_report(y_test, rf_test_pred)}")

Training Accuracy: 1.0
Test Accuracy: 0.9833333333333333 

Confusion Matrix: 
[[72  0]
 [ 2 46]]

Classification Report: 
              precision    recall  f1-score   support

           0       0.97      1.00      0.99        72
           1       1.00      0.96      0.98        48

    accuracy                           0.98       120
   macro avg       0.99      0.98      0.98       120
weighted avg       0.98      0.98      0.98       120



In [33]:
import pickle
from pathlib import Path

# open(..., "wb") does not create missing folders, so make sure the output
# directory exists before writing (no-op when it is already there).
models_dir = Path("../models")
models_dir.mkdir(parents=True, exist_ok=True)

# Persist both fitted classifiers, one pickle file per model
models_to_save = (
    ("dt_model.pkl", decision_tree),
    ("rf_model.pkl", random_forest_classifier),
)
for filename, model in models_to_save:
    with open(models_dir / filename, "wb") as f:
        pickle.dump(model, f)