# Chronic Kidney Disease Prediction

## Data Preprocessing

In [1]:
import numpy as np
import pandas as pd
from scipy.io import arff

In [2]:
# Force pandas to show all the columns.
# None removes the column cap entirely, so the setting stays correct if the
# frame ever has more columns than a hard-coded limit would allow.

pd.set_option("display.max_columns", None)

In [3]:
# Load the CKD dataset from a path relative to the notebook's working directory.
# NOTE: the file yields an extra leading "Unnamed: 0" column (visible in the
# df.info() output) — presumably a saved index; confirm against the export step.
df = pd.read_csv('../data/chronic_kidney_disease_cleaned.csv')

In [4]:
# Inspect column names, dtypes and non-null counts of the freshly loaded frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 158 entries, 0 to 157
Data columns (total 26 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  158 non-null    int64  
 1   age         158 non-null    float64
 2   bp          158 non-null    float64
 3   sg          158 non-null    float64
 4   al          158 non-null    float64
 5   su          158 non-null    float64
 6   rbc         158 non-null    object 
 7   pc          158 non-null    object 
 8   pcc         158 non-null    object 
 9   ba          158 non-null    object 
 10  bgr         158 non-null    float64
 11  bu          158 non-null    float64
 12  sc          158 non-null    float64
 13  sod         158 non-null    float64
 14  pot         158 non-null    float64
 15  hemo        158 non-null    float64
 16  pcv         158 non-null    float64
 17  wbcc        158 non-null    float64
 18  rbcc        158 non-null    float64
 19  htn         158 non-null    o

In [5]:
# Drop the unnamed (id) column.
# It is removed by name with errors="ignore", and the frame is rebound rather
# than mutated in place, so re-running this cell is a no-op instead of
# silently dropping whichever real feature column has become the first one.

df = df.drop(columns=["Unnamed: 0"], errors="ignore")

In [6]:
# Give every column a descriptive header.
# The assignment is positional: the list must contain exactly one name per
# column, in the frame's current column order.

friendly_names = [
    "age",
    "blood_pressure",
    "specific_gravity",
    "albumin",
    "sugar",
    "red_blood_cells",
    "pus_cell",
    "pus_cell_clumps",
    "bacteria",
    "blood_glucose_random",
    "blood_urea",
    "serum_creatinine",
    "sodium",
    "potassium",
    "haemoglobin",
    "packed_cell_volume",
    "white_blood_cell_count",
    "red_blood_cell_count",
    "hypertension",
    "diabetes_mellitus",
    "coronary_artery_disease",
    "appetite",
    "peda_edema",
    "anemia",
    "class",
]
df.columns = friendly_names
len(df.columns)

25

In [7]:
# Confirm the (rows, columns) dimensions of the frame
df.shape

(158, 25)

In [8]:
# Preview the first 10 rows under the descriptive column names
df.head(10)

Unnamed: 0,age,blood_pressure,specific_gravity,albumin,sugar,red_blood_cells,pus_cell,pus_cell_clumps,bacteria,blood_glucose_random,blood_urea,serum_creatinine,sodium,potassium,haemoglobin,packed_cell_volume,white_blood_cell_count,red_blood_cell_count,hypertension,diabetes_mellitus,coronary_artery_disease,appetite,peda_edema,anemia,class
0,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,117.0,56.0,3.8,111.0,2.5,11.2,32.0,6700.0,3.9,yes,no,no,poor,yes,yes,ckd
1,53.0,90.0,1.02,2.0,0.0,abnormal,abnormal,present,notpresent,70.0,107.0,7.2,114.0,3.7,9.5,29.0,12100.0,3.7,yes,yes,no,poor,no,yes,ckd
2,63.0,70.0,1.01,3.0,0.0,abnormal,abnormal,present,notpresent,380.0,60.0,2.7,131.0,4.2,10.8,32.0,4500.0,3.8,yes,yes,no,poor,yes,no,ckd
3,68.0,80.0,1.01,3.0,2.0,normal,abnormal,present,present,157.0,90.0,4.1,130.0,6.4,5.6,16.0,11000.0,2.6,yes,yes,yes,poor,yes,no,ckd
4,61.0,80.0,1.015,2.0,0.0,abnormal,abnormal,notpresent,notpresent,173.0,148.0,3.9,135.0,5.2,7.7,24.0,9200.0,3.2,yes,yes,yes,poor,yes,yes,ckd
5,48.0,80.0,1.025,4.0,0.0,normal,abnormal,notpresent,notpresent,95.0,163.0,7.7,136.0,3.8,9.8,32.0,6900.0,3.4,yes,no,no,good,no,yes,ckd
6,69.0,70.0,1.01,3.0,4.0,normal,abnormal,notpresent,notpresent,264.0,87.0,2.7,130.0,4.0,12.5,37.0,9600.0,4.1,yes,yes,yes,good,yes,no,ckd
7,73.0,70.0,1.005,0.0,0.0,normal,normal,notpresent,notpresent,70.0,32.0,0.9,125.0,4.0,10.0,29.0,18900.0,3.5,yes,yes,no,good,yes,no,ckd
8,73.0,80.0,1.02,2.0,0.0,abnormal,abnormal,notpresent,notpresent,253.0,142.0,4.6,138.0,5.8,10.5,33.0,7200.0,4.3,yes,yes,yes,good,no,no,ckd
9,46.0,60.0,1.01,1.0,0.0,normal,normal,notpresent,notpresent,163.0,92.0,3.3,141.0,4.0,9.8,28.0,14600.0,3.2,yes,yes,no,good,no,no,ckd


In [9]:
# Partition the column names by dtype in a single pass:
# object-typed columns are treated as categorical, everything else as numeric.
cat_cols = []
num_cols = []

for column_name, column_dtype in df.dtypes.items():
    if column_dtype == "object":
        cat_cols.append(column_name)
    else:
        num_cols.append(column_name)

In [10]:
# Display the names of the columns classified as categorical
cat_cols

['red_blood_cells',
 'pus_cell',
 'pus_cell_clumps',
 'bacteria',
 'hypertension',
 'diabetes_mellitus',
 'coronary_artery_disease',
 'appetite',
 'peda_edema',
 'anemia',
 'class']

In [11]:
# Display the names of the columns classified as numeric
num_cols

['age',
 'blood_pressure',
 'specific_gravity',
 'albumin',
 'sugar',
 'blood_glucose_random',
 'blood_urea',
 'serum_creatinine',
 'sodium',
 'potassium',
 'haemoglobin',
 'packed_cell_volume',
 'white_blood_cell_count',
 'red_blood_cell_count']

In [12]:
# List the distinct values found in each categorical column

for column_name in cat_cols:
    distinct_values = df[column_name].unique()
    print(f"{column_name} has {distinct_values} values\n")

red_blood_cells has ['normal' 'abnormal'] values

pus_cell has ['abnormal' 'normal'] values

pus_cell_clumps has ['present' 'notpresent'] values

bacteria has ['notpresent' 'present'] values

hypertension has ['yes' 'no'] values

diabetes_mellitus has ['no' 'yes'] values

coronary_artery_disease has ['no' 'yes'] values

appetite has ['poor' 'good'] values

peda_edema has ['yes' 'no'] values

anemia has ['yes' 'no'] values

class has ['ckd' 'notckd'] values



In [13]:
# Count the missing values in every column and show the largest counts first

null_counts = df.isna().sum()
null_counts.sort_values(ascending=False)

age                        0
potassium                  0
anemia                     0
peda_edema                 0
appetite                   0
coronary_artery_disease    0
diabetes_mellitus          0
hypertension               0
red_blood_cell_count       0
white_blood_cell_count     0
packed_cell_volume         0
haemoglobin                0
sodium                     0
blood_pressure             0
serum_creatinine           0
blood_urea                 0
blood_glucose_random       0
bacteria                   0
pus_cell_clumps            0
pus_cell                   0
red_blood_cells            0
sugar                      0
albumin                    0
specific_gravity           0
class                      0
dtype: int64

In [14]:
# Preview the first five rows (head() defaults to 5)
df.head()

Unnamed: 0,age,blood_pressure,specific_gravity,albumin,sugar,red_blood_cells,pus_cell,pus_cell_clumps,bacteria,blood_glucose_random,blood_urea,serum_creatinine,sodium,potassium,haemoglobin,packed_cell_volume,white_blood_cell_count,red_blood_cell_count,hypertension,diabetes_mellitus,coronary_artery_disease,appetite,peda_edema,anemia,class
0,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,117.0,56.0,3.8,111.0,2.5,11.2,32.0,6700.0,3.9,yes,no,no,poor,yes,yes,ckd
1,53.0,90.0,1.02,2.0,0.0,abnormal,abnormal,present,notpresent,70.0,107.0,7.2,114.0,3.7,9.5,29.0,12100.0,3.7,yes,yes,no,poor,no,yes,ckd
2,63.0,70.0,1.01,3.0,0.0,abnormal,abnormal,present,notpresent,380.0,60.0,2.7,131.0,4.2,10.8,32.0,4500.0,3.8,yes,yes,no,poor,yes,no,ckd
3,68.0,80.0,1.01,3.0,2.0,normal,abnormal,present,present,157.0,90.0,4.1,130.0,6.4,5.6,16.0,11000.0,2.6,yes,yes,yes,poor,yes,no,ckd
4,61.0,80.0,1.015,2.0,0.0,abnormal,abnormal,notpresent,notpresent,173.0,148.0,3.9,135.0,5.2,7.7,24.0,9200.0,3.2,yes,yes,yes,poor,yes,yes,ckd


In [15]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

Unnamed: 0,age,blood_pressure,specific_gravity,albumin,sugar,blood_glucose_random,blood_urea,serum_creatinine,sodium,potassium,haemoglobin,packed_cell_volume,white_blood_cell_count,red_blood_cell_count
count,158.0,158.0,158.0,158.0,158.0,158.0,158.0,158.0,158.0,158.0,158.0,158.0,158.0,158.0
mean,49.563291,74.050633,1.019873,0.797468,0.253165,131.341772,52.575949,2.188608,138.848101,4.636709,13.687342,41.917722,8475.949367,4.891772
std,15.512244,11.175381,0.005499,1.41313,0.813397,64.939832,47.395382,3.077615,7.489421,3.476351,2.882204,9.105164,3126.880181,1.019364
min,6.0,50.0,1.005,0.0,0.0,70.0,10.0,0.4,111.0,2.5,3.1,9.0,3800.0,2.1
25%,39.25,60.0,1.02,0.0,0.0,97.0,26.0,0.7,135.0,3.7,12.6,37.5,6525.0,4.5
50%,50.5,80.0,1.02,0.0,0.0,115.5,39.5,1.1,139.0,4.5,14.25,44.0,7800.0,4.95
75%,60.0,80.0,1.025,1.0,0.0,131.75,49.75,1.6,144.0,4.9,15.775,48.0,9775.0,5.6
max,83.0,110.0,1.025,4.0,5.0,490.0,309.0,15.2,150.0,47.0,17.8,54.0,26400.0,8.0


In [16]:
# Re-check dtypes and non-null counts under the descriptive column names
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 158 entries, 0 to 157
Data columns (total 25 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   age                      158 non-null    float64
 1   blood_pressure           158 non-null    float64
 2   specific_gravity         158 non-null    float64
 3   albumin                  158 non-null    float64
 4   sugar                    158 non-null    float64
 5   red_blood_cells          158 non-null    object 
 6   pus_cell                 158 non-null    object 
 7   pus_cell_clumps          158 non-null    object 
 8   bacteria                 158 non-null    object 
 9   blood_glucose_random     158 non-null    float64
 10  blood_urea               158 non-null    float64
 11  serum_creatinine         158 non-null    float64
 12  sodium                   158 non-null    float64
 13  potassium                158 non-null    float64
 14  haemoglobin              1

In [17]:
# Convert the target labels to integers: "ckd" -> 0, "notckd" -> 1.
# The mapping is applied only while the column still holds the raw labels, so
# re-running this cell no longer turns already-encoded values into NaN.
# A label missing from the mapping raises instead of silently becoming NaN.

class_codes = {"ckd": 0, "notckd": 1}

if not pd.api.types.is_numeric_dtype(df["class"]):
    encoded_class = df["class"].map(class_codes)
    unmapped_rows = encoded_class.isna()
    if unmapped_rows.any():
        unknown_labels = df.loc[unmapped_rows, "class"].unique().tolist()
        raise ValueError(f"Unmapped values in 'class': {unknown_labels}")
    df["class"] = encoded_class.astype("int64")

In [18]:
# Display the target column to verify the integer encoding
df["class"]

0      0
1      0
2      0
3      0
4      0
      ..
153    1
154    1
155    1
156    1
157    1
Name: class, Length: 158, dtype: int64

## Feature Encoding

In [19]:
# Report how many distinct categories each categorical column contains
for column_name in cat_cols:
    category_count = df[column_name].nunique()
    print(f"{column_name} has {category_count} categories\n")

red_blood_cells has 2 categories

pus_cell has 2 categories

pus_cell_clumps has 2 categories

bacteria has 2 categories

hypertension has 2 categories

diabetes_mellitus has 2 categories

coronary_artery_disease has 2 categories

appetite has 2 categories

peda_edema has 2 categories

anemia has 2 categories

class has 2 categories



In [20]:
# Encode the categorical columns into integer codes.
# Each column gets its own fitted encoder, stored in `label_encoders`, so the
# category -> code mapping of every column stays available (e.g. via
# `label_encoders[col].classes_` or `.inverse_transform`). Refitting one shared
# encoder per column would discard every mapping except the last one.

from sklearn.preprocessing import LabelEncoder

label_encoders = {}

for col in cat_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

In [21]:
# Preview the first five rows after the categorical columns were encoded
df.head()

Unnamed: 0,age,blood_pressure,specific_gravity,albumin,sugar,red_blood_cells,pus_cell,pus_cell_clumps,bacteria,blood_glucose_random,blood_urea,serum_creatinine,sodium,potassium,haemoglobin,packed_cell_volume,white_blood_cell_count,red_blood_cell_count,hypertension,diabetes_mellitus,coronary_artery_disease,appetite,peda_edema,anemia,class
0,48.0,70.0,1.005,4.0,0.0,1,0,1,0,117.0,56.0,3.8,111.0,2.5,11.2,32.0,6700.0,3.9,1,0,0,1,1,1,0
1,53.0,90.0,1.02,2.0,0.0,0,0,1,0,70.0,107.0,7.2,114.0,3.7,9.5,29.0,12100.0,3.7,1,1,0,1,0,1,0
2,63.0,70.0,1.01,3.0,0.0,0,0,1,0,380.0,60.0,2.7,131.0,4.2,10.8,32.0,4500.0,3.8,1,1,0,1,1,0,0
3,68.0,80.0,1.01,3.0,2.0,1,0,1,1,157.0,90.0,4.1,130.0,6.4,5.6,16.0,11000.0,2.6,1,1,1,1,1,0,0
4,61.0,80.0,1.015,2.0,0.0,0,0,0,0,173.0,148.0,3.9,135.0,5.2,7.7,24.0,9200.0,3.2,1,1,1,1,1,1,0


## Model Building

In [22]:
# Features: every column except the target; target: the "class" column
X = df.drop(columns="class")
y = df["class"]

In [23]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable

from sklearn.model_selection import train_test_split

split_parts = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)
X_train, X_test, y_train, y_test = split_parts

In [24]:
# Shape of the training feature matrix
X_train.shape

(126, 24)

In [25]:
# Shape of the training target vector
y_train.shape

(126,)

In [26]:
# Shape of the test feature matrix
X_test.shape

(32, 24)

In [27]:
# Shape of the test target vector
y_test.shape

(32,)

### 1. Using Decision Trees

In [28]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Fit a decision tree with default hyperparameters.
# A fixed random_state makes the fitted tree, and therefore the metrics below,
# reproducible across runs (same seed value as the train/test split).
decision_tree = DecisionTreeClassifier(random_state=0)
decision_tree.fit(X_train, y_train)

# Predict once per split and reuse the results for every metric
dt_train_pred = decision_tree.predict(X_train)
dt_test_pred = decision_tree.predict(X_test)

print(f"Training Accuracy: {accuracy_score(y_train, dt_train_pred)}")
print(f"Test Accuracy: {accuracy_score(y_test, dt_test_pred)} \n")

print(f"Confusion Matrix: \n{confusion_matrix(y_test, dt_test_pred)}\n")
print(f"Classification Report: \n{classification_report(y_test, dt_test_pred)}")

Training Accuracy: 1.0
Test Accuracy: 0.96875 

Confusion Matrix: 
[[ 7  1]
 [ 0 24]]

Classification Report: 
              precision    recall  f1-score   support

           0       1.00      0.88      0.93         8
           1       0.96      1.00      0.98        24

    accuracy                           0.97        32
   macro avg       0.98      0.94      0.96        32
weighted avg       0.97      0.97      0.97        32



### 2. Using Random Forest

In [29]:
from sklearn.ensemble import RandomForestClassifier

# Fit a random forest with hand-picked hyperparameters.
# A fixed random_state pins the bootstrap samples and feature sub-sampling, so
# the metrics below are reproducible across runs (same seed value as the
# train/test split).
random_forest_classifier = RandomForestClassifier(
    criterion = "entropy",
    max_depth = 11,
    min_samples_leaf = 2,
    min_samples_split = 3,
    n_estimators = 130,
    random_state = 0
)
random_forest_classifier.fit(X_train, y_train)

# Predict once per split and reuse the results for every metric
rf_train_pred = random_forest_classifier.predict(X_train)
rf_test_pred = random_forest_classifier.predict(X_test)

print(f"Training Accuracy: {accuracy_score(y_train, rf_train_pred)}")
print(f"Test Accuracy: {accuracy_score(y_test, rf_test_pred)} \n")

print(f"Confusion Matrix: \n{confusion_matrix(y_test, rf_test_pred)}\n")
print(f"Classification Report: \n{classification_report(y_test, rf_test_pred)}")

Training Accuracy: 1.0
Test Accuracy: 1.0 

Confusion Matrix: 
[[ 8  0]
 [ 0 24]]

Classification Report: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         8
           1       1.00      1.00      1.00        24

    accuracy                           1.00        32
   macro avg       1.00      1.00      1.00        32
weighted avg       1.00      1.00      1.00        32

