In [1]:
import pandas as pd           # For data manipulation
import numpy as np            # For numerical analysis
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno      # For visualizing missing values

# Scikit-learn libraries
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression

import pickle                 # For saving/loading model


Matplotlib is building the font cache; this may take a moment.


In [4]:
df = pd.read_csv("dataset/kidney_disease.csv")

In [5]:
df.head()

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,...,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,...,38,6000,,no,no,no,good,no,no,ckd
2,2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,...,31,7500,,no,yes,no,poor,no,yes,ckd
3,3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,...,35,7300,4.6,no,no,no,good,no,no,ckd


In [7]:
df.columns


Index(['id', 'age', 'bp', 'sg', 'al', 'su', 'rbc', 'pc', 'pcc', 'ba', 'bgr',
       'bu', 'sc', 'sod', 'pot', 'hemo', 'pcv', 'wc', 'rc', 'htn', 'dm', 'cad',
       'appet', 'pe', 'ane', 'classification'],
      dtype='object')

In [17]:
df.columns = [
    'age', 'blood_pressure', 'specific_gravity', 'albumin', 'sugar',
    'red_blood_cells', 'pus_cell', 'pus_cell_clumps', 'bacteria',
    'blood_glucose_random', 'blood_urea', 'serum_creatinine', 'sodium', 'potassium',
    'hemoglobin', 'packed_cell_volume', 'white_blood_cell_count', 'red_blood_cell_count',
    'hypertension', 'diabetes_mellitus', 'coronary_artery_disease', 'appetite',
    'pedal_edema', 'anemia', 'classification', 'id'
]


In [18]:
df.columns

Index(['age', 'blood_pressure', 'specific_gravity', 'albumin', 'sugar',
       'red_blood_cells', 'pus_cell', 'pus_cell_clumps', 'bacteria',
       'blood_glucose_random', 'blood_urea', 'serum_creatinine', 'sodium',
       'potassium', 'hemoglobin', 'packed_cell_volume',
       'white_blood_cell_count', 'red_blood_cell_count', 'hypertension',
       'diabetes_mellitus', 'coronary_artery_disease', 'appetite',
       'pedal_edema', 'anemia', 'classification', 'id'],
      dtype='object')

In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 26 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   age                      400 non-null    int64  
 1   blood_pressure           391 non-null    float64
 2   specific_gravity         388 non-null    float64
 3   albumin                  353 non-null    float64
 4   sugar                    354 non-null    float64
 5   red_blood_cells          351 non-null    float64
 6   pus_cell                 248 non-null    object 
 7   pus_cell_clumps          335 non-null    object 
 8   bacteria                 396 non-null    object 
 9   blood_glucose_random     396 non-null    object 
 10  blood_urea               356 non-null    float64
 11  serum_creatinine         381 non-null    float64
 12  sodium                   383 non-null    float64
 13  potassium                313 non-null    float64
 14  hemoglobin               3

In [23]:
df.isnull().any()

age                        False
blood_pressure              True
specific_gravity            True
albumin                     True
sugar                       True
red_blood_cells             True
pus_cell                    True
pus_cell_clumps             True
bacteria                    True
blood_glucose_random        True
blood_urea                  True
serum_creatinine            True
sodium                      True
potassium                   True
hemoglobin                  True
packed_cell_volume          True
white_blood_cell_count      True
red_blood_cell_count        True
hypertension                True
diabetes_mellitus           True
coronary_artery_disease     True
appetite                    True
pedal_edema                 True
anemia                      True
classification              True
id                         False
dtype: bool

In [24]:
# Checking the shape and data type info
print("Shape of the dataset:", df.shape)
print("\nData Type Info:")
df.info()


Shape of the dataset: (400, 26)

Data Type Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 26 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   age                      400 non-null    int64  
 1   blood_pressure           391 non-null    float64
 2   specific_gravity         388 non-null    float64
 3   albumin                  353 non-null    float64
 4   sugar                    354 non-null    float64
 5   red_blood_cells          351 non-null    float64
 6   pus_cell                 248 non-null    object 
 7   pus_cell_clumps          335 non-null    object 
 8   bacteria                 396 non-null    object 
 9   blood_glucose_random     396 non-null    object 
 10  blood_urea               356 non-null    float64
 11  serum_creatinine         381 non-null    float64
 12  sodium                   383 non-null    float64
 13  potassium                313 no