In [1]:
#Install all necessary modules
!pip install pandas
!pip install numpy
!pip install matplotlib
!pip install seaborn
!pip install missingno
!pip install sklearn
!pip install pickle

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sklearn
  Downloading sklearn-0.0.tar.gz (1.1 kB)
Building wheels for collected packages: sklearn
  Building wheel for sklearn (setup.py) ... [?25l[?25hdone
  Created wheel for sklearn: filename=sklearn-0.0-py2.py3-none-any.whl size=1310 sha256=2cff3d6c4aa8294102500e8745cf2f9f884fcd32f628255a720b70d0aa3866a3
  Stored in directory: /root/.cache/pip/wheels/46/ef/c3/157e41f5ee1372d1be90b09f74f

In [2]:
#Import all necessary modules
import pandas as pd
import numpy as np
from collections import Counter as C
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
from sklearn.metrics import accuracy_score,confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
import pickle

In [4]:
#Read data from csv file
data = pd.read_csv("chronickidneydisease.csv")

In [5]:
#return the first five rows
data.head()
#return the last five rows
data.tail()

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
395,395,55.0,80.0,1.02,0.0,0.0,normal,normal,notpresent,notpresent,...,47,6700,4.9,no,no,no,good,no,no,notckd
396,396,42.0,70.0,1.025,0.0,0.0,normal,normal,notpresent,notpresent,...,54,7800,6.2,no,no,no,good,no,no,notckd
397,397,12.0,80.0,1.02,0.0,0.0,normal,normal,notpresent,notpresent,...,49,6600,5.4,no,no,no,good,no,no,notckd
398,398,17.0,60.0,1.025,0.0,0.0,normal,normal,notpresent,notpresent,...,51,7200,5.9,no,no,no,good,no,no,notckd
399,399,58.0,80.0,1.025,0.0,0.0,normal,normal,notpresent,notpresent,...,53,6800,6.1,no,no,no,good,no,no,notckd


In [6]:
#drop/remove the id column which is unnecessary for training/prediction
data.drop(["id"],axis=1,inplace=True)

In [7]:
#return all the current columns in the dataframe
data.columns

Index(['age', 'bp', 'sg', 'al', 'su', 'rbc', 'pc', 'pcc', 'ba', 'bgr', 'bu',
       'sc', 'sod', 'pot', 'hemo', 'pcv', 'wc', 'rc', 'htn', 'dm', 'cad',
       'appet', 'pe', 'ane', 'classification'],
      dtype='object')

In [8]:
#rename all the column names for easy understanding
data.columns = [
    "age","blood_pressure","specific_gravity","albumin","sugar","red_blood_cells","pus_cell","pus_cell_clumps","bacteria","blood_glucose_random","blood_urea","serum_creatinine","sodium","potassium","hemoglobin","packed_cell_volume","wbc_count","rbc_count","hypertension","diabetesmellitus","coronary_artery_disease","appetite","pedal_edema","anemia","class"
]
data.columns

Index(['age', 'blood_pressure', 'specific_gravity', 'albumin', 'sugar',
       'red_blood_cells', 'pus_cell', 'pus_cell_clumps', 'bacteria',
       'blood_glucose_random', 'blood_urea', 'serum_creatinine', 'sodium',
       'potassium', 'hemoglobin', 'packed_cell_volume', 'wbc_count',
       'rbc_count', 'hypertension', 'diabetesmellitus',
       'coronary_artery_disease', 'appetite', 'pedal_edema', 'anemia',
       'class'],
      dtype='object')

In [9]:
#returns the information(col name,not null count,data type) of all columns
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 25 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   age                      391 non-null    float64
 1   blood_pressure           388 non-null    float64
 2   specific_gravity         353 non-null    float64
 3   albumin                  354 non-null    float64
 4   sugar                    351 non-null    float64
 5   red_blood_cells          248 non-null    object 
 6   pus_cell                 335 non-null    object 
 7   pus_cell_clumps          396 non-null    object 
 8   bacteria                 396 non-null    object 
 9   blood_glucose_random     356 non-null    float64
 10  blood_urea               381 non-null    float64
 11  serum_creatinine         383 non-null    float64
 12  sodium                   313 non-null    float64
 13  potassium                312 non-null    float64
 14  hemoglobin               3

In [10]:
#returns the unique values present in the 'class' column
data['class'].unique()

array(['ckd', 'ckd\t', 'notckd'], dtype=object)

In [11]:
#remove/replace mismatched column label
data['class'] = data['class'].replace("ckd\t","ckd")
data['class'].unique()

array(['ckd', 'notckd'], dtype=object)

In [12]:
#get all categorical columns
cat_cols = set(data.dtypes[data.dtypes=='O'].index.values)
print(cat_cols)

{'bacteria', 'red_blood_cells', 'rbc_count', 'coronary_artery_disease', 'diabetesmellitus', 'hypertension', 'pedal_edema', 'class', 'pus_cell', 'wbc_count', 'appetite', 'pus_cell_clumps', 'packed_cell_volume', 'anemia'}


In [13]:
#get number of rows of each category of all categorical columns
for col in cat_cols:
  print("Columns : ",col)
  print(C(data[col]))
  print('*'*10+'\n')

Columns :  bacteria
Counter({'notpresent': 374, 'present': 22, nan: 4})
**********

Columns :  red_blood_cells
Counter({'normal': 201, nan: 152, 'abnormal': 47})
**********

Columns :  rbc_count
Counter({nan: 130, '5.2': 18, '4.5': 16, '4.9': 14, '4.7': 11, '3.9': 10, '4.8': 10, '4.6': 9, '3.4': 9, '3.7': 8, '5.0': 8, '6.1': 8, '5.5': 8, '5.9': 8, '3.8': 7, '5.4': 7, '5.8': 7, '5.3': 7, '4.3': 6, '4.2': 6, '5.6': 6, '4.4': 5, '3.2': 5, '4.1': 5, '6.2': 5, '5.1': 5, '6.4': 5, '5.7': 5, '6.5': 5, '3.6': 4, '6.0': 4, '6.3': 4, '4.0': 3, '4': 3, '3.5': 3, '3.3': 3, '5': 2, '2.6': 2, '2.8': 2, '2.5': 2, '3.1': 2, '2.1': 2, '2.9': 2, '2.7': 2, '3.0': 2, '2.3': 1, '8.0': 1, '3': 1, '2.4': 1, '\t?': 1})
**********

Columns :  coronary_artery_disease
Counter({'no': 362, 'yes': 34, '\tno': 2, nan: 2})
**********

Columns :  diabetesmellitus
Counter({'no': 258, 'yes': 134, '\tno': 3, '\tyes': 2, nan: 2, ' yes': 1})
**********

Columns :  hypertension
Counter({'no': 251, 'yes': 147, nan: 2})
*****

In [14]:
#remove non continuous columns from categorical columns list
cat_cols.remove("rbc_count")
cat_cols.remove("wbc_count")
cat_cols.remove("packed_cell_volume")
print(cat_cols)

{'bacteria', 'red_blood_cells', 'coronary_artery_disease', 'diabetesmellitus', 'hypertension', 'pedal_edema', 'class', 'pus_cell', 'appetite', 'pus_cell_clumps', 'anemia'}


In [15]:
#get all continuous columns
cont_cols = set(data.dtypes[data.dtypes!='O'].index.values)
print(cont_cols)

{'blood_pressure', 'sugar', 'blood_glucose_random', 'hemoglobin', 'potassium', 'age', 'specific_gravity', 'sodium', 'albumin', 'blood_urea', 'serum_creatinine'}


In [16]:
#get number of rows of each value of all continuous columns
for col in cont_cols:
  print("Columns : ",col)
  print(C(data[col]))
  print('*'*10+'\n')

Columns :  blood_pressure
Counter({80.0: 116, 70.0: 112, 60.0: 71, 90.0: 53, 100.0: 25, 50.0: 5, 110.0: 3, nan: 1, nan: 1, 140.0: 1, 180.0: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, 120.0: 1, nan: 1, nan: 1, nan: 1})
**********

Columns :  sugar
Counter({0.0: 290, 2.0: 18, 3.0: 14, 4.0: 13, 1.0: 13, 5.0: 3, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1})
**********

Columns :  blood_glucose_random
Counter({99.0: 10, 100.0: 9, 93.0: 9, 107.0: 8, 117.0: 6, 140.0: 6, 92.0: 6, 109.0: 6, 131.0: 6, 130.0: 6, 70.0: 5, 114.0: 5, 95.0: 5, 123.0: 5, 124.0: 5, 102.0: 5, 132.0: 5, 104.0: 5, 125.0: 5, 122.0: 5, 121.0: 4, 106.0: 4, 76.0: 4, 91.

In [17]:
#remove non categorical columns from categorical columns list
cont_cols.remove("specific_gravity")
cont_cols.remove("albumin")
cont_cols.remove("sugar")
print(cont_cols)

{'blood_pressure', 'blood_glucose_random', 'hemoglobin', 'potassium', 'age', 'sodium', 'blood_urea', 'serum_creatinine'}


In [18]:
#add categorical columns
cat_cols.add("specific_gravity")
cat_cols.add("albumin")
cat_cols.add("sugar")
print(cat_cols)

{'bacteria', 'red_blood_cells', 'sugar', 'coronary_artery_disease', 'diabetesmellitus', 'hypertension', 'pedal_edema', 'class', 'pus_cell', 'specific_gravity', 'albumin', 'appetite', 'pus_cell_clumps', 'anemia'}


In [19]:
#add continuous columns
cont_cols.add("rbc_count")
cont_cols.add("wbc_count")
cont_cols.add("packed_cell_volume")
print(cont_cols)

{'blood_pressure', 'blood_glucose_random', 'rbc_count', 'hemoglobin', 'potassium', 'age', 'sodium', 'wbc_count', 'blood_urea', 'packed_cell_volume', 'serum_creatinine'}


In [20]:
#clean input labels
data["coronary_artery_disease"] = data["coronary_artery_disease"].replace("\tno","no")
C(data["coronary_artery_disease"])
data["diabetesmellitus"] = data["diabetesmellitus"].replace(to_replace={" yes":"yes","\tyes":"yes","\tno":"no"})
C(data["diabetesmellitus"])

Counter({'yes': 137, 'no': 261, nan: 2})

In [21]:
#check for null values in each column
data.isnull().any()

age                         True
blood_pressure              True
specific_gravity            True
albumin                     True
sugar                       True
red_blood_cells             True
pus_cell                    True
pus_cell_clumps             True
bacteria                    True
blood_glucose_random        True
blood_urea                  True
serum_creatinine            True
sodium                      True
potassium                   True
hemoglobin                  True
packed_cell_volume          True
wbc_count                   True
rbc_count                   True
hypertension                True
diabetesmellitus            True
coronary_artery_disease     True
appetite                    True
pedal_edema                 True
anemia                      True
class                      False
dtype: bool

In [22]:
#get count of null values in each column
data.isnull().sum()

age                          9
blood_pressure              12
specific_gravity            47
albumin                     46
sugar                       49
red_blood_cells            152
pus_cell                    65
pus_cell_clumps              4
bacteria                     4
blood_glucose_random        44
blood_urea                  19
serum_creatinine            17
sodium                      87
potassium                   88
hemoglobin                  52
packed_cell_volume          70
wbc_count                  105
rbc_count                  130
hypertension                 2
diabetesmellitus             2
coronary_artery_disease      2
appetite                     1
pedal_edema                  1
anemia                       1
class                        0
dtype: int64

In [23]:
#convert object data type present in continuous columns to numeric data type
data["packed_cell_volume"] = pd.to_numeric(data["packed_cell_volume"],errors='coerce')
data["rbc_count"] = pd.to_numeric(data["rbc_count"],errors='coerce')
data["wbc_count"] = pd.to_numeric(data["wbc_count"],errors='coerce')

In [24]:
#replace all null values in each of the continuous columns with it's columnar mean value
data["blood_glucose_random"].fillna(data["blood_glucose_random"].mean(),inplace=True)
data["blood_pressure"].fillna(data["blood_pressure"].mean(),inplace=True)
data["blood_urea"].fillna(data["blood_urea"].mean(),inplace=True)
data["hemoglobin"].fillna(data["hemoglobin"].mean(),inplace=True)
data["packed_cell_volume"].fillna(data["packed_cell_volume"].mean(),inplace=True)
data["potassium"].fillna(data["potassium"].mean(),inplace=True)
data["rbc_count"].fillna(data["rbc_count"].mean(),inplace=True)
data["serum_creatinine"].fillna(data["serum_creatinine"].mean(),inplace=True)
data["sodium"].fillna(data["sodium"].mean(),inplace=True)
data["wbc_count"].fillna(data["wbc_count"].mean(),inplace=True)

In [25]:
#replace all null values in each of the categorical columns with it's columnar mode value
data["age"].fillna(data["age"].mode()[0],inplace=True)
data["hypertension"].fillna(data["hypertension"].mode()[0],inplace=True)
data["pus_cell_clumps"].fillna(data["pus_cell_clumps"].mode()[0],inplace=True)
data["appetite"].fillna(data["appetite"].mode()[0],inplace=True)
data["albumin"].fillna(data["albumin"].mode()[0],inplace=True)
data["pus_cell"].fillna(data["pus_cell"].mode()[0],inplace=True)
data["red_blood_cells"].fillna(data["red_blood_cells"].mode()[0],inplace=True)
data["coronary_artery_disease"].fillna(data["coronary_artery_disease"].mode()[0],inplace=True)
data["bacteria"].fillna(data["bacteria"].mode()[0],inplace=True)
data["anemia"].fillna(data["anemia"].mode()[0],inplace=True)
data["sugar"].fillna(data["sugar"].mode()[0],inplace=True)
data["diabetesmellitus"].fillna(data["diabetesmellitus"].mode()[0],inplace=True)
data["pedal_edema"].fillna(data["pedal_edema"].mode()[0],inplace=True)
data["specific_gravity"].fillna(data["specific_gravity"].mode()[0],inplace=True)

In [26]:
#encoding labels of each of the categorical columns
for col in cat_cols:
  print("Label Encoding of : ",col)
  label_encoder = LabelEncoder()
  print(C(data[col]))
  data[col] = label_encoder.fit_transform(data[col])
  print(C(data[col]))
  print("*"*10+'\n')

Label Encoding of :  bacteria
Counter({'notpresent': 378, 'present': 22})
Counter({0: 378, 1: 22})
**********

Label Encoding of :  red_blood_cells
Counter({'normal': 353, 'abnormal': 47})
Counter({1: 353, 0: 47})
**********

Label Encoding of :  sugar
Counter({0.0: 339, 2.0: 18, 3.0: 14, 4.0: 13, 1.0: 13, 5.0: 3})
Counter({0: 339, 2: 18, 3: 14, 4: 13, 1: 13, 5: 3})
**********

Label Encoding of :  coronary_artery_disease
Counter({'no': 366, 'yes': 34})
Counter({0: 366, 1: 34})
**********

Label Encoding of :  diabetesmellitus
Counter({'no': 263, 'yes': 137})
Counter({0: 263, 1: 137})
**********

Label Encoding of :  hypertension
Counter({'no': 253, 'yes': 147})
Counter({0: 253, 1: 147})
**********

Label Encoding of :  pedal_edema
Counter({'no': 324, 'yes': 76})
Counter({0: 324, 1: 76})
**********

Label Encoding of :  class
Counter({'ckd': 250, 'notckd': 150})
Counter({0: 250, 1: 150})
**********

Label Encoding of :  pus_cell
Counter({'normal': 324, 'abnormal': 76})
Counter({1: 324,

In [27]:
#returns the information(col name,not null count,data type) of all columns
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 25 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   age                      400 non-null    float64
 1   blood_pressure           400 non-null    float64
 2   specific_gravity         400 non-null    int64  
 3   albumin                  400 non-null    int64  
 4   sugar                    400 non-null    int64  
 5   red_blood_cells          400 non-null    int64  
 6   pus_cell                 400 non-null    int64  
 7   pus_cell_clumps          400 non-null    int64  
 8   bacteria                 400 non-null    int64  
 9   blood_glucose_random     400 non-null    float64
 10  blood_urea               400 non-null    float64
 11  serum_creatinine         400 non-null    float64
 12  sodium                   400 non-null    float64
 13  potassium                400 non-null    float64
 14  hemoglobin               4

In [28]:
#seaprate the independent & dependent columns to train the model and print the shape of the input and ouptut data frames
inp_cols = ["red_blood_cells","pus_cell","blood_glucose_random","blood_urea","pedal_edema","anemia","diabetesmellitus","coronary_artery_disease"]
x = pd.DataFrame(data,columns=inp_cols)
y = pd.DataFrame(data,columns=["class"])
print(x.shape)
print(y.shape)


(400, 8)
(400, 1)


In [29]:
print(x.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 8 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   red_blood_cells          400 non-null    int64  
 1   pus_cell                 400 non-null    int64  
 2   blood_glucose_random     400 non-null    float64
 3   blood_urea               400 non-null    float64
 4   pedal_edema              400 non-null    int64  
 5   anemia                   400 non-null    int64  
 6   diabetesmellitus         400 non-null    int64  
 7   coronary_artery_disease  400 non-null    int64  
dtypes: float64(2), int64(6)
memory usage: 25.1 KB
None


In [30]:
#split the dataset to training and testing data
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=2)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(320, 8)
(80, 8)
(320, 1)
(80, 1)


In [31]:
#build the logistic regression model with the training data
lgr_model = LogisticRegression()
lgr_model.fit(x_train,y_train)

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

In [32]:
#predict with the testing data
y_pred = lgr_model.predict(x_test)
print(y_pred)


[0 0 0 0 1 0 0 0 1 0 0 0 1 1 0 0 0 1 1 0 1 1 0 1 0 1 0 0 1 0 0 1 0 0 0 0 1
 0 0 1 0 1 0 0 0 1 0 1 1 1 0 0 0 1 0 1 0 1 1 0 0 1 1 0 0 0 0 1 0 1 1 0 0 1
 0 1 0 1 1 0]


In [34]:
#predict with custom data
y_samp_pred = lgr_model.predict([[129,99,1,0,0,1,0,1]])

print(y_samp_pred)


[1]


  "X does not have valid feature names, but"


In [33]:
#return the accuracy score of the built model
accuracy_score(y_test,y_pred)

0.925

In [35]:
#return the confusion matrix of the testing data against the predicted result
conf_mat = confusion_matrix(y_test,y_pred)
conf_mat

array([[48,  6],
       [ 0, 26]])

In [36]:
#save the model in a pickle file to use with flask web application
pickle.dump(lgr_model,open('CKD_NLP.pkl','wb'))