In [1]:
import pandas as pd
import numpy as np

# Load data
df = pd.read_csv("dataset/kidney_disease.csv")

# Rename columns (if needed) to consistent names
df.columns = [
    'age', 'blood_pressure', 'specific_gravity', 'albumin', 'sugar',
    'red_blood_cells', 'pus_cell', 'pus_cell_clumps', 'bacteria',
    'blood_glucose_random', 'blood_urea', 'serum_creatinine', 'sodium', 'potassium',
    'hemoglobin', 'packed_cell_volume', 'white_blood_cell_count', 'red_blood_cell_count',
    'hypertension', 'diabetes_mellitus', 'coronary_artery_disease', 'appetite',
    'pedal_edema', 'anemia', 'classification', 'id'
]

# Drop ID column
df.drop(columns=['id'], inplace=True)

# Convert numeric-looking object columns to float
for col in ['packed_cell_volume', 'white_blood_cell_count', 'red_blood_cell_count', 'blood_glucose_random']:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Strip and standardize object columns
df['classification'] = df['classification'].str.strip().str.lower()
df['classification'] = df['classification'].map({'yes': 1, 'no': 0})


In [3]:
# Clean and standardize
df['classification'] = df['classification'].astype(str).str.strip().str.lower()

# Check unique cleaned values
print(df['classification'].unique())



['0.0' '1.0' 'nan']


In [4]:
# Numerical features
numerical_cols = [
    'age', 'blood_pressure', 'specific_gravity', 'albumin', 'sugar',
    'blood_glucose_random', 'blood_urea', 'serum_creatinine', 'sodium',
    'potassium', 'hemoglobin', 'packed_cell_volume',
    'white_blood_cell_count', 'red_blood_cell_count'
]

# Categorical features
categorical_cols = [
    'red_blood_cells', 'pus_cell', 'pus_cell_clumps', 'bacteria',
    'hypertension', 'diabetes_mellitus', 'coronary_artery_disease',
    'appetite', 'pedal_edema', 'anemia'
]

# Handle missing values
for col in numerical_cols:
    df[col].fillna(df[col].median(), inplace=True)

for col in categorical_cols:
    df[col] = df[col].str.strip().str.lower()
    df[col].fillna(df[col].mode()[0], inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values

AttributeError: Can only use .str accessor with string values!