Import Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


Load Dataset

In [None]:
data_set = 'data_set.csv'

In [None]:
# read data_set
df = pd.read_csv(data_set)

Data Info

In [None]:
df.info()

In [None]:
df.shape

In [None]:
df.head(3)

In [None]:
df.tail(3)

Delete Extra Column 

In [None]:
# Remove the 'id' column (treated as an extra column in this analysis).
# Re-assigning instead of using inplace=True, together with errors='ignore',
# keeps this cell safe to re-run once 'id' has already been removed.
df = df.drop(columns='id', errors='ignore')

In [None]:
df.head(2)

Summary statistics (mean, max, std)

In [None]:
# numeric values
df.describe()

In [None]:
# All values
df.describe(include='all')

In [None]:
# Assign descriptive column names, positionally (one name per existing column).
df.columns = [
    'age',
    'blood_Pressure',
    'Specific Gravity of urine',
    'Albumin',
    'Sugar',
    'Red Blood Cells',
    'Pus Cells',
    'Pus Cell Clumps',
    'Bacteria',
    'Blood Glucose Random',
    'Blood Urea',
    'Serum Creatinine',
    'Sodium',
    'Potassium',
    'Hemoglobin',
    'Packed Cell Volume (%)',
    'White Blood Cell Count',
    'Red Blood Cell Count',
    'Hypertension',
    'Diabetes Mellitus',
    'Coronary Artery Disease',
    'Appetite',
    'Pedal Edema',
    'Anemia',
    'Kidney Disease Status',
]

In [None]:
df.head(3)

In [None]:
# Columns suspected of having the wrong (text) data type
text_col = [
    'Packed Cell Volume (%)',
    'White Blood Cell Count',
    'Red Blood Cell Count',
]

# Report the dtype pandas currently holds for each of them
for name in text_col:
    print(f'{name} : {df[name].dtype}')


In [None]:
def convert_text_to_number(df, column):
    """Convert ``df[column]`` to a numeric dtype in place.

    Entries that cannot be parsed as numbers become NaN. Returns None.
    """
    numeric_values = pd.to_numeric(df[column], errors='coerce')
    df[column] = numeric_values

# Coerce each listed column to numeric, then show the dtype it ended up with
for name in text_col:
    convert_text_to_number(df, name)
    print(f'{name} : {df[name].dtype}')


Find Null Values

In [None]:
# Missing-value count per column; display only the columns that have any
null = df.isna().sum()
null.loc[null > 0].sort_values(ascending=False)

Fill Null values
Numeric data: fill with the mean (or median)
Object data: fill with the mode

In [None]:
def fill_missing_values(df):
    """Impute missing values column by column, modifying ``df`` in place.

    Numeric columns are filled with their mean. Every other column (object,
    string, categorical, ...) is filled with its most frequent value (mode).
    A column with no observed values has no mode, so it is left untouched
    instead of raising.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. It is mutated and also returned.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after imputation.
    """
    for col in df.columns:
        if pd.api.types.is_numeric_dtype(df[col]):
            # numeric column → fill with mean
            df[col] = df[col].fillna(df[col].mean())
        else:
            # categorical column → fill with mode (if one exists)
            modes = df[col].mode()
            if modes.empty:
                # every value is missing: nothing sensible to impute with
                continue
            df[col] = df[col].fillna(modes.iloc[0])
    return df


# Impute every column of the working frame and rebind df to the returned frame
df = fill_missing_values(df) # call function

print('filled datas')

In [None]:
# Re-count missing values AFTER imputation. The `null` series from the earlier
# cell was computed before filling, so displaying it here would show stale counts.
null = df.isnull().sum()
null[null > 0].sort_values(ascending=False)  # an empty result means nothing is missing

Find Unique Values

In [None]:
# Show the distinct raw labels present in three of the categorical columns
for label in ('Diabetes Mellitus', 'Coronary Artery Disease', 'Kidney Disease Status'):
    print(f"{label} :{df[label].unique()}")

In [None]:
# Same inspection again, this time tolerating columns that are absent
recheck_columns = ('Diabetes Mellitus', 'Coronary Artery Disease', 'Kidney Disease Status')
for col in recheck_columns:
    if col not in df.columns:
        print(f"{col} not found in DataFrame")
        continue
    print(f"{col}: {df[col].unique()}")


In [None]:
df.head(2)

Feature Encoding
Map binary categories to 0 or 1

In [None]:
# --- Binary Feature Encoding ---

# Canonical labels of each two-valued column and the 0/1 code they map to.
expected_binary_maps = {
    'Red Blood Cells': {'normal': 1, 'abnormal': 0},
    'Pus Cells': {'normal': 1, 'abnormal': 0},
    'Pus Cell Clumps': {'present': 1, 'notpresent': 0},
    'Bacteria': {'present': 1, 'notpresent': 0},
    'Hypertension': {'yes': 1, 'no': 0},
    'Diabetes Mellitus': {'yes': 1, 'no': 0},
    'Coronary Artery Disease': {'yes': 1, 'no': 0},
    'Appetite': {'good': 1, 'poor': 0},
    'Pedal Edema': {'yes': 1, 'no': 0},
    'Anemia': {'yes': 1, 'no': 0},
    'Kidney Disease Status': {'ckd': 1, 'notckd': 0}
}

for column_name, mapping in expected_binary_maps.items():
    if column_name not in df.columns:
        continue
    if pd.api.types.is_numeric_dtype(df[column_name]):
        # Already encoded (e.g. this cell was re-run): pushing 0/1 through the
        # label dictionary would turn every value into NaN, so leave it alone.
        continue
    # Normalise surrounding whitespace/tabs and letter case first, so a label
    # that differs from its canonical key only in those respects is still
    # encoded instead of silently becoming NaN.
    labels = df[column_name].astype(str).str.strip().str.lower()
    df[column_name] = labels.map(mapping)

# After mapping, fill any NaNs that may occur due to unexpected values
for column_name in expected_binary_maps:
    if column_name not in df.columns or not df[column_name].isna().any():
        continue
    if column_name == 'Kidney Disease Status':
        # Target: drop rows whose label is unknown rather than inventing one
        df = df.dropna(subset=[column_name])
    else:
        # Feature: impute with the most frequent code, if any value survived
        modes = df[column_name].mode()
        if not modes.empty:
            df[column_name] = df[column_name].fillna(modes.iloc[0])
print('done')

In [None]:
df.head(3)

Find Correlation

In [None]:
# Annotated heatmap of the pairwise correlations between all columns
fig, ax = plt.subplots(figsize=(15, 8))
sns.heatmap(df.corr(), annot=True, linewidths=0.5, ax=ax)
plt.show()

In [None]:
# Plot every column of the frame. The hand-written list that used to live here
# was a copy of the names assigned to df.columns, so derive it from the frame
# instead of maintaining two copies that can drift apart.
x_axis = list(df.columns)

# The loop variable is deliberately not called `x`: that name is used for the
# feature matrix further down the notebook.
for col_name in x_axis:
    fig, axes = plt.subplots(1, 2, figsize=(18, 4))

    # Histogram (distribution)
    sns.histplot(df[col_name], ax=axes[0], kde=False)
    axes[0].set_title(f'Distribution of {col_name}')

    # Boxplot
    sns.boxplot(x=df[col_name], ax=axes[1])
    axes[1].set_title(f'Boxplot of {col_name}')

    fig.tight_layout()
    plt.show()
    plt.close(fig)  # release the figure; one is created per column

In [None]:
# Absolute correlation of every feature with the target, strongest first.
# The target's own entry is removed by label; slicing off the first row after
# sorting would drop the wrong entry if a feature tied with it at |r| = 1.
target_corr = (
    df.corr()['Kidney Disease Status']
    .drop('Kidney Disease Status')
    .abs()
    .sort_values(ascending=False)
)
print(target_corr)

In [None]:
df.info()

In [None]:
df.columns

In [None]:
# Feature matrix: every column except the target
x = df.drop(columns=['Kidney Disease Status'])

In [None]:
x.head()

In [None]:
# Target vector: the encoded disease-status column
y = df.loc[:, 'Kidney Disease Status']
y.head()

Split Train and Test 

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 25% of the rows for evaluation; the fixed seed makes the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=25,
)

In [None]:
print('x_train : ' , x_train.shape)
print('x_test : ' , x_test.shape)

Train and Compare Models

In [None]:
from sklearn.linear_model import LinearRegression

# Linear Regression
# NOTE(review): the target was encoded as 0/1 above, so this regressor returns
# continuous values rather than class labels.
lr = LinearRegression()
# Fit on the training split; as the last expression it also displays the fitted estimator
lr.fit(x_train ,y_train)


Model Accuracy (R² score)

In [None]:
from sklearn.metrics import r2_score

# R^2 of the linear model on the held-out split
y_pred = lr.predict(x_test)
score1 = r2_score(y_test, y_pred)
score1

In [None]:
from sklearn.svm import SVR

# Support vector regressor with an RBF kernel
# (other kernel options: 'linear', 'poly', 'sigmoid')
svm = SVR(kernel='rbf')
svm.fit(x_train, y_train)

# R^2 on the held-out split
y_pred = svm.predict(x_test)
score2 = r2_score(y_test, y_pred)
score2

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Seed the forest so the fitted model and its score are reproducible on a
# fresh run (25 is the same seed the train/test split uses).
rf = RandomForestRegressor(random_state=25)   # create an instance of the model
rf.fit(x_train, y_train)                      # train the model

# R^2 on the held-out split
y_pred = rf.predict(x_test)
score3 = r2_score(y_test, y_pred)
score3

In [None]:
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier


In [None]:
# Decision Tree (Regression)
# Seeded so the fitted tree and its score are reproducible across runs.
dt_reg = DecisionTreeRegressor(random_state=25)

dt_reg.fit(x_train, y_train)

# Score the tree that was just fitted. Previously this used rf.predict, so
# score4 merely repeated the random-forest result.
y_pred = dt_reg.predict(x_test)
score4 = r2_score(y_test, y_pred)
score4



In [None]:
# Decision Tree (Classification)
# Seeded so the fitted tree and its score are reproducible across runs.
dt_clf = DecisionTreeClassifier(random_state=25)

dt_clf.fit(x_train, y_train)

# Score the classifier that was just fitted. Previously this used rf.predict,
# so score5 merely repeated the random-forest result.
# NOTE(review): R^2 is kept so score5 is on the same scale as score1..score4;
# dt_clf.score(x_test, y_test) would give classification accuracy instead.
y_pred = dt_clf.predict(x_test)
score5 = r2_score(y_test, y_pred)
score5

In [None]:
df.tail(3)

New Prediction

In [None]:
# One hand-entered sample, using the same feature names as the training frame
new_data = {
    'age': 48,
    'blood_Pressure': 80,
    'Specific Gravity of urine': 1.02,
    'Albumin': 1,
    'Sugar': 0,
    'Red Blood Cells': 1,
    'Pus Cells': 1,
    'Pus Cell Clumps': 0,
    'Bacteria': 0,
    'Blood Glucose Random': 121.000000,
    'Blood Urea': 36,
    'Serum Creatinine': 1.2,
    'Sodium': 137.52875399361,
    'Potassium': 4.62724358974359,
    'Hemoglobin': 15.4,
    'Packed Cell Volume (%)': 44,
    'White Blood Cell Count': 7800,
    'Red Blood Cell Count': 5.2,
    'Hypertension': 1,
    'Diabetes Mellitus': 1,
    'Coronary Artery Disease': 0,
    'Appetite': 1,
    'Pedal Edema': 0,
    'Anemia': 0,
}

# A list holding one record yields a single-row frame with index 0
new_df = pd.DataFrame([new_data])
new_df

In [None]:
# Run the single-row sample through every fitted model, in a fixed order
lr_score, svm_score, rf_score, dt_reg_score, dt_clf_score = (
    model.predict(new_df) for model in (lr, svm, rf, dt_reg, dt_clf)
)

# One line per model
report = (
    f'lr : {lr_score}\n'
    f'svm : {svm_score}\n'
    f'rf : {rf_score}\n'
    f'dt_reg : {dt_reg_score}\n'
    f'dt_clf : {dt_clf_score}\n'
)
print(report)


In [None]:
# Second hand-entered sample, using the same feature names as the first
new_data_1 = dict([
    ('age', 67.0),
    ('blood_Pressure', 60.0),
    ('Specific Gravity of urine', 1.025),
    ('Albumin', 1.0),
    ('Sugar', 1.0),
    ('Red Blood Cells', 1),
    ('Pus Cells', 1),
    ('Pus Cell Clumps', 0),
    ('Bacteria', 0),
    ('Blood Glucose Random', 114.0),
    ('Blood Urea', 36.0),
    ('Serum Creatinine', 1.2),
    ('Sodium', 137.0),
    ('Potassium', 4.5),
    ('Hemoglobin', 15.0),
    ('Packed Cell Volume (%)', 51.0),
    ('White Blood Cell Count', 7200.0),
    ('Red Blood Cell Count', 5.9),
    ('Hypertension', 1),
    ('Diabetes Mellitus', 1),
    ('Coronary Artery Disease', 0),
    ('Appetite', 0),
    ('Pedal Edema', 1),
    ('Anemia', 1),
])

# A list holding one record yields a single-row frame with index 0
new_df = pd.DataFrame([new_data_1])
new_df

In [None]:
# Run the second sample through the same five fitted models
fitted_models = (lr, svm, rf, dt_reg, dt_clf)
lr_score, svm_score, rf_score, dt_reg_score, dt_clf_score = [
    estimator.predict(new_df) for estimator in fitted_models
]

# One line per model
summary = (
    f'lr : {lr_score}\n'
    f'svm : {svm_score}\n'
    f'rf : {rf_score}\n'
    f'dt_reg : {dt_reg_score}\n'
    f'dt_clf : {dt_clf_score}\n'
)
print(summary)


Save Model

In [None]:
import joblib

# Persist the fitted decision-tree regressor to the given relative path.
# NOTE(review): only dt_reg is saved; confirm it is the model meant to be kept.
joblib.dump(dt_reg , 'Chronic_dices_1.pkl')