## Part 1.1 Data Preprocessing

#### Importing Libraries

In [31]:
import pandas as pd
import numpy as np
import re

#### Preprocessing

In [32]:
# Load the raw CKD table, skip the first two rows that follow the header, and
# renumber the index from 0.
# NOTE(review): the skipped rows are presumably non-patient rows in the raw CSV —
# confirm against the file.
kidney_data = (
    pd.read_csv("../Dataset/ckd-dataset-v2.csv")
    .iloc[2:]
    .reset_index(drop=True)
)
kidney_data.head()

Unnamed: 0,bp (Diastolic),bp limit,sg,al,class,rbc,su,pc,pcc,ba,...,htn,dm,cad,appet,pe,ane,grf,stage,affected,age
0,0,0,1.019 - 1.021,1 - 1,ckd,0,< 0,0,0,0,...,0,0,0,0,0,0,≥ 227.944,s1,1,< 12
1,0,0,1.009 - 1.011,< 0,ckd,0,< 0,0,0,0,...,0,0,0,0,0,0,≥ 227.944,s1,1,< 12
2,0,0,1.009 - 1.011,≥ 4,ckd,1,< 0,1,0,1,...,0,0,0,1,0,0,127.281 - 152.446,s1,1,< 12
3,1,1,1.009 - 1.011,3 - 3,ckd,0,< 0,0,0,0,...,0,0,0,0,0,0,127.281 - 152.446,s1,1,< 12
4,0,0,1.015 - 1.017,< 0,ckd,0,< 0,0,0,0,...,0,1,0,1,1,0,127.281 - 152.446,s1,1,12 - 20


In [33]:
def encode_range_variables(df, column_name):
    """
    Encodes ordered ranges into ordinal numeric labels.

    Every distinct non-null value of the column is ranked by a numeric sort key
    and replaced by its rank (0, 1, 2, ...). The column is overwritten in place.

    Sort key, by the shape of the value:
    - "< x"   -> just below x
    - "≥ x"   -> just above x
    - "a - b" -> midpoint of a and b
    - a single number, or a label with an embedded number ("2", "s3") -> that number
    - no number at all (e.g. "p") -> sorted after every numeric value

    Only unsigned numbers are recognised. Values with equal keys are ordered by
    their text so the resulting codes do not depend on row order.

    Parameters:
    - df: Pandas DataFrame (modified in place)
    - column_name: str, column to encode

    Returns:
    - the same df, with column_name replaced by its ordinal codes
      (missing values stay missing)
    """
    number_pattern = r'\d+(?:\.\d+)?'

    def get_sorted_values(val):
        # str() lets already-numeric columns pass through instead of failing on .strip()
        text = str(val).strip()
        numbers = [float(n) for n in re.findall(number_pattern, text)]

        if not numbers:
            key = float('inf')            # non-numeric placeholder: goes last
        elif text.startswith('<'):
            key = numbers[0] - 1e-5       # open lower range sits just below its bound
        elif text.startswith('≥'):
            key = numbers[0] + 1e-5       # open upper range sits just above its bound
        elif len(numbers) == 2:
            key = (numbers[0] + numbers[1]) / 2
        else:
            # Single values such as "1" or "s3" used to collapse to inf, which made
            # their codes depend on first-appearance order; rank them by number.
            key = numbers[0]

        # Secondary text key gives a deterministic order for ties.
        return (key, text)

    unique_values = df[column_name].dropna().unique()
    sorted_values = sorted(unique_values, key=get_sorted_values)

    range_mapping = {val: idx for idx, val in enumerate(sorted_values)}

    df[column_name] = df[column_name].map(range_mapping)

    return df

def encode_binary_variables(df, column):
    """
    Encodes binary strings into binary integer labels.

    Values are compared after conversion to string, trimming whitespace and
    lower-casing. Recognised spellings are '0'/'1', their float forms
    '0.0'/'1.0', and the target labels 'ckd'/'notckd'. Anything else
    (including missing values) becomes NaN. The column is overwritten in place.

    Parameters:
    - df: Pandas DataFrame (modified in place)
    - column: str, column to encode

    Returns:
    - the same df, with the column replaced by 0/1 codes
    """
    mapping = {
        '0': 0, '1': 1,
        # A 0/1 column stored as float stringifies to '0.0'/'1.0'; without these
        # entries such a column (or a second run over an already-encoded column
        # that contains NaN) would silently turn into all-NaN.
        '0.0': 0, '1.0': 1,
        'ckd': 1, 'notckd': 0, # for class (target) variable
    }

    normalized = df[column].astype(str).str.strip().str.lower()
    df[column] = normalized.map(mapping)

    return df

In [34]:
# Binary Variables Encoding
# Columns holding two-valued flags (plus the 'class' target); each is converted
# to 0/1 integer codes by encode_binary_variables.
binary_vars = [
    'bp (Diastolic)',
    'rbc', 'pc', 'pcc', 'ba',
    'htn', 'dm', 'cad',
    'appet', 'pe', 'ane',
    'class',
]

for column in binary_vars:
    kidney_data = encode_binary_variables(kidney_data, column)

In [35]:
# Range Variables Encoding
# Every column that was not handled as a binary flag is ordinal-encoded.
range_vars = [
    column
    for column in kidney_data.columns
    if column not in binary_vars
]

for column in range_vars:
    kidney_data = encode_range_variables(kidney_data, column)

#### Unknown Value Handling

In [36]:
# Handle missing values in 'grf' column (unknown value 'p')
# NOTE(review): this assumes ordinal code 10 is the 'p' placeholder, i.e. that the
# range encoder ranked it after ten numeric categories — re-check if the data changes.
grf_known = kidney_data['grf'].replace(10, np.nan)

# Mode imputation: fill the unknown entries with the most frequent known code.
mode_value = grf_known.mode().iloc[0]
kidney_data['grf'] = grf_known.fillna(mode_value).astype(int)

#### Further processing

In [37]:
# Normalise the two blood-pressure column names to snake_case and remove the
# 'affected' column.
column_renames = {
    'bp (Diastolic)': 'bp_diastolic',
    'bp limit': 'bp_limit',
}
kidney_data = (
    kidney_data
    .rename(columns=column_renames)
    .drop(columns=['affected'])  # affected = class
)

In [38]:
# Preview the first five rows of the encoded table.
kidney_data.head()

Unnamed: 0,bp_diastolic,bp_limit,sg,al,class,rbc,su,pc,pcc,ba,...,wbcc,htn,dm,cad,appet,pe,ane,grf,stage,age
0,0,0,3,1,1,0,0,0,0,0,...,2,0,0,0,0,0,0,9,0,0
1,0,0,1,0,1,0,0,0,0,0,...,4,0,0,0,0,0,0,9,0,0
2,0,0,1,4,1,1,0,1,0,1,...,5,0,0,0,1,0,0,5,0,0
3,1,1,1,3,1,0,0,0,0,0,...,2,0,0,0,0,0,0,5,0,0
4,0,0,2,0,1,0,0,0,0,0,...,2,0,1,0,1,1,0,5,0,1


In [39]:
# Write the cleaned table next to the raw dataset, without the index column.
kidney_data.to_csv(path_or_buf="../Dataset/cleaned_kidney_data.csv", index=False)