In [None]:
# Importing necessary libraries 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.pipeline import make_pipeline
from scipy.stats import chi2_contingency, mannwhitneyu, median_test, ttest_ind, kruskal
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings(action='ignore')

In [None]:
# importing data
# NOTE(review): this is an absolute, machine-specific Windows path, so the cell only
# runs where that exact file exists; consider a configurable, relative data directory.
ckd_df = pd.read_csv(r"C:\Users\visha\Desktop\Dissertation\Chronic_Kidney_Disease\chronic_kidney_disease_full.csv")
# preview the first five rows
ckd_df.head(5)

In [None]:
ckd_df.info()

In [None]:
# the literal '?' marks a missing entry in this file; turn every such cell into NaN
ckd_df = ckd_df.replace(to_replace='?', value=np.nan)

In [None]:
ckd_df = ckd_df.drop('id',axis=1)

In [None]:
# Give the columns descriptive names. The assignment is positional: the list must hold
# exactly one name per column of ckd_df, in the frame's current column order.
ckd_df.columns = ['age','blood_pressure','specific_gravity','albumin','sugar','red_blood_cells','pus_cell','puss_cell_clumps','bacteria',
                  'blood_glucose','blood_urea','serum_creatinine','sodium','potassium','haemoglobin','packed_cell_volume',
                  'white_blood_cells_count','red_blood_cell_count','hypertension','diabetes_mellitus','coronary_artery_disease',
                  'appetite','pedal_edema','anemia','chronic_kidney_disease']

# display the resulting column index
ckd_df.columns


In [None]:
ckd_df.info()

In [None]:
# Every column is read as object dtype, so cast the measurement columns to numbers.
# errors='coerce' turns any entry that cannot be parsed into NaN instead of raising.
columns_to_convert = [
    'age', 'blood_pressure', 'specific_gravity', 'albumin', 'sugar',
    'blood_glucose', 'blood_urea', 'serum_creatinine', 'sodium', 'potassium',
    'haemoglobin', 'packed_cell_volume', 'white_blood_cells_count',
    'red_blood_cell_count',
]

for numeric_col in columns_to_convert:
    ckd_df[numeric_col] = pd.to_numeric(ckd_df[numeric_col], errors='coerce')

In [None]:
ckd_df.info()

In [None]:
# Partition the columns by dtype for later cells: object columns are treated as
# categorical, every other column as numerical.
cat_cols = [col for col in ckd_df.columns if ckd_df[col].dtype == 'object']

num_cols = [col for col in ckd_df.columns if ckd_df[col].dtype != 'object']


In [None]:
# checking for unique values in each feature to understand if data needs any further processing

for col in cat_cols:
    print(f'{col} feature has {ckd_df[col].unique()} values')

In [None]:
for col in num_cols:
    print(f'{col} feature has {ckd_df[col].unique()} values')

# Exploratory Data Analysis

In [None]:
#descriptive statistics
ckd_df.describe()

In [None]:
import matplotlib.style as style
style.use('fivethirtyeight')

In [None]:
# One density histogram (with KDE overlay) per numerical feature, each annotated
# with the share of missing values.
n_cols = 2
n_rows = math.ceil(len(num_cols) / n_cols)  # grid size follows the number of features

figure, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(20, 50), squeeze=False)
figure.suptitle('\nDistributions of Numerical Features', fontsize=60)

for index, column in enumerate(num_cols):

    i, j = divmod(index, n_cols)
    ax = axes[i, j]

    # percentage of rows in which this feature is NaN
    miss_perc = "%.2f" % (100 * (1 - (ckd_df[column].dropna().shape[0]) / ckd_df.shape[0]))

    collabel = column + "\n({}% is missing)".format(miss_perc)

    # sns.distplot is deprecated; histplot with stat="density" plus a KDE line is the
    # documented replacement (cut=3 extends the KDE past the data as distplot did).
    sns.histplot(ckd_df[column], color="g", label=collabel, stat="density",
                 kde=True, kde_kws={"cut": 3}, line_kws={"lw": 4}, ax=ax)

    ax.legend(loc='best', fontsize=18)

    ax.set_ylabel("Probability Density", fontsize='medium')

    ax.set_xlabel(None)

# hide any grid cell that has no feature to display
for unused_ax in axes.flat[len(num_cols):]:
    unused_ax.set_visible(False)

plt.show()

In [None]:
# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and later
# removed the old names, so use whichever spelling this installation provides.
darkgrid_style = ('seaborn-v0_8-darkgrid'
                  if 'seaborn-v0_8-darkgrid' in style.available
                  else 'seaborn-darkgrid')
style.use(darkgrid_style)


In [None]:

# Count plot per categorical feature; each panel title carries the missing-value share.
grid_shape = (6, 2)
n_rows, n_cols = grid_shape

figure, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(30, 50))
figure.suptitle('\nCountplots of Categorical Features', fontsize=60)

# axes.flat walks the grid row by row, i.e. panel k sits at (k // n_cols, k % n_cols)
for panel, column in zip(axes.flat, cat_cols):

    present_rows = ckd_df[column].dropna().shape[0]
    miss_perc = "%.2f" % (100 * (1 - present_rows / ckd_df.shape[0]))
    collabel = column + "\n({}% is missing)".format(miss_perc)

    bar_colours = sns.cubehelix_palette(rot=-.35, light=0.85, hue=1)
    sns.countplot(x=column, data=ckd_df, label=collabel, palette=bar_colours, ax=panel)

    panel.set_title(collabel, fontsize=30)
    panel.set_xlabel(None)
    panel.set_ylabel("Count", fontsize=20)
    # re-apply the existing tick labels only to enlarge their font
    panel.set_xticklabels(panel.get_xticklabels(), fontsize=28)

plt.show()

In [None]:
ckd_df.info()

In [None]:
ckd_df.head()

In [None]:
# Notched box plots of every numerical feature, to get a feel for the outliers
n_rows, n_cols = 7, 2
fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(18, 25))

for index, col in enumerate(num_cols):
    i, j = divmod(index, n_cols)  # row-major position of this feature in the grid
    sns.boxplot(
        data=ckd_df,
        x=col,
        ax=axes[i, j],
        notch=True,
        flierprops={"marker": "x"},  # draw outliers as crosses
        color="#B4E2B0",
    )

plt.tight_layout()

   

In [None]:
# Correlation heatmap of the numerical features plus a numeric copy of the target.

# work on a copy so the numeric target column is not added to ckd_df itself
df = ckd_df[num_cols].copy()

plt.figure(figsize=(12,8))

# encode the target as numbers (ckd -> 1, notckd -> 0) so it can enter the correlation matrix
df["target"] = ckd_df["chronic_kidney_disease"].map({'ckd':1,'notckd':0})

df_cor = df.corr()

# mask the upper triangle (diagonal included); it only mirrors the lower one
mask = np.triu(np.ones_like(df_cor))

sns.heatmap(data=df_cor,mask=mask,annot=True,linewidths=3,fmt='.2f')

# tight_layout has to be called; a bare attribute reference does nothing
plt.tight_layout()


### High positive correlations:
1. blood_glucose and sugar
2. packed_cell_volume and haemoglobin
3. red_blood_cell_count and haemoglobin
4. red_blood_cell_count and packed_cell_volume

### High negative correlations:
1. specific_gravity and target
2. haemoglobin and target
3. packed_cell_volume and target
4. red_blood_cell_count and target

In [None]:
#violin plots and scatter plot to understand the correlation:
def violin(col):
    """Draw a violin plot of ``ckd_df[col]`` for each ``chronic_kidney_disease`` class.

    Args:
        col: name of the ``ckd_df`` column plotted on the y-axis.

    Returns:
        The result of ``plt.show()``; the figure itself is rendered as a side effect.
    """
    # violinplot has no `box` option; the box-plot summary inside each violin is
    # requested through inner="box".
    sns.violinplot(ckd_df, y=col, x="chronic_kidney_disease", inner="box")
    return plt.show()

def scatter(col1, col2):
    """Scatter two ``ckd_df`` columns against each other, coloured by target class.

    Args:
        col1: column name used for the x-axis.
        col2: column name used for the y-axis.

    Returns:
        The result of ``plt.show()``; the figure itself is rendered as a side effect.
    """
    axis_mapping = dict(x=col1, y=col2, hue="chronic_kidney_disease")
    sns.scatterplot(ckd_df, **axis_mapping)
    shown = plt.show()
    return shown

In [None]:
# 1. blood_glucose and sugar 

scatter("blood_glucose","sugar")

In [None]:
# 2. packed_cell_volume and haemoglobin
scatter("packed_cell_volume","haemoglobin")

In [None]:
# 3. red_blood_cell_count and haemoglobin
scatter("red_blood_cell_count","haemoglobin")

In [None]:
#4. red_blood_cell_count and packed_cell_volume
scatter("red_blood_cell_count","packed_cell_volume")

In [None]:
# 1. specific_gravity and target
violin("specific_gravity")

In [None]:
# 2. haemoglobin and target
violin("haemoglobin")


In [None]:
# 3. packed_cell_volume and target
violin("packed_cell_volume")

In [None]:
# 4. red_blood_cell_count and target
violin("red_blood_cell_count")

In [None]:
#one hot encoding
# Object columns become 0/1 dummy columns named '<feature>:<level>' (prefix_sep=':').
# drop_first=True omits the first level of each feature; dummy_na=False means no
# separate NaN indicator column is produced.
ohe_data = pd.get_dummies(ckd_df,drop_first=True,prefix_sep=':',dtype=int,dummy_na=False)
# count the NaNs remaining in each encoded column
ohe_data.isna().sum()


In [None]:
# get_dummies encodes a missing category as 0 in every dummy column of that feature,
# which cannot be told apart from the dropped reference level. Put NaN back into those
# cells so the imputer later treats them as missing.

names = {}
for name in cat_cols:
    # dummy columns are named '<feature>:<level>'; match on the prefix rather than on
    # a substring so one feature name appearing inside another cannot match
    dummy_cols = [ohe for ohe in ohe_data.columns if ohe.startswith(name + ':')]
    if not dummy_cols:
        continue
    names[name] = dummy_cols[-1]  # feature -> its last dummy column

    # integer columns cannot hold NaN, so switch to float before writing it
    ohe_data[dummy_cols] = ohe_data[dummy_cols].astype(float)
    # the boolean mask is aligned on the shared row index, so no row count is hard-coded
    ohe_data.loc[ckd_df[name].isna(), dummy_cols] = np.nan




In [None]:
ohe_data.isna().sum()

In [None]:
# Split the encoded frame's columns into numerical features and one-hot dummies by
# membership in num_cols, instead of relying on the first 14 positions.
ohe_num_cols = [col for col in ohe_data.columns if col in num_cols]
ohe_cat_cols = [col for col in ohe_data.columns if col not in num_cols]

In [None]:
ohe_data.iloc[:,14:]

In [None]:
pipe = make_pipeline(StandardScaler())
print(pipe)

In [None]:
# df[0] is the raw encoded frame, df[1] its standardised counterpart
df1 = pd.DataFrame(pipe.fit_transform(ohe_data), columns=ohe_data.columns)
df = [ohe_data, df1]

In [None]:
df


In [None]:
import missingno as msno

msno.bar(ckd_df,color="turquoise",sort="ascending")

In [None]:
#KNN imputation 

imputer = KNNImputer(weights='distance',n_neighbors=8)

In [None]:
# rrr[0]: raw encoded values; rrr[1]: standardised values with the gaps filled by KNN
knn_filled = imputer.fit_transform(df[1])
rrr = [ohe_data.to_numpy(), knn_filled]

In [None]:
# Undo the standardisation of every imputed array; entry i was scaled by pipeline step i-1.
arr = [rrr[0]] + [pipe[step - 1].inverse_transform(rrr[step]) for step in range(1, len(rrr))]

In [None]:
# wrap each array back into a DataFrame carrying the encoded column names
imputed_df = [pd.DataFrame(values, columns=ohe_data.columns) for values in arr]

In [None]:
imputed_df

In [None]:
ohe_data = imputed_df[1].copy()

# KNN imputation averages neighbouring rows, and the scale / inverse-scale round trip
# can add floating-point noise, so one-hot columns may hold values other than exactly
# 0 or 1. Snap every non-numerical column back to a 0/1 integer so the `== 0` / `== 1`
# filters and the classification target used further down see clean labels.
dummy_columns = [col for col in ohe_data.columns if col not in num_cols]
ohe_data[dummy_columns] = ohe_data[dummy_columns].round().clip(0, 1).astype(int)

In [None]:
ohe_data.dropna().shape #no missing values 

In [None]:
msno.bar(ohe_data,color="aquamarine")

In [None]:
ohe_data.dropna().shape

# Hypothesis Testing

In [None]:
#plotting categorical variables with target variable

figure, axes = plt.subplots(6, 2,figsize=(50, 100))
figure.suptitle('\nCrossTabs of Categorical Variables with Target Variable', fontsize=70)

# crosstab sorts its columns, which puts 'ckd' before 'notckd'. Derive each tick label
# from the class the column actually shows instead of assuming an order.
target_labels = {'ckd': 'CKD', 'notckd': 'No CKD'}

for index, col in enumerate(cat_cols):

    i,j = (index // 2), (index % 2)
    ax = axes[i,j]

    sns.heatmap(pd.crosstab(ckd_df[col],ckd_df['chronic_kidney_disease']),
                ax=ax,
                square=True,
                cbar=False,
                annot=True,
                annot_kws={'fontsize':90},
                fmt='d')

    ax.set_xlabel("Disease", fontsize=90)

    ax.set_ylabel(col,fontsize=90)

    ax.set_yticklabels(ax.get_yticklabels(),fontsize=50)

    ax.set_xticklabels([target_labels.get(tick.get_text(), tick.get_text())
                        for tick in ax.get_xticklabels()],
                       fontsize=50)

plt.show()

Nothing interesting stands out in the categorical variables. Let's see whether the numerical variables have any effect.

In [None]:
#plotting numerical variables with target variable

figure, axes = plt.subplots(7, 2,figsize=(50, 100))
figure.suptitle('\nBox Plots of Numerical Variables with Target Variable', fontsize=100)

for index, col in enumerate(num_cols):

    i,j = (index // 2), (index % 2)
    ax = axes[i,j]

    # pin the category order so the boxes always line up with the tick labels set
    # below, whichever class happens to appear first in the data
    sns.boxenplot(data=ckd_df,y=ckd_df[col],x=ckd_df['chronic_kidney_disease'],
                  order=['notckd', 'ckd'],
                  ax=ax,color='aquamarine')

    ax.set_xlabel("Disease", fontsize=90)

    ax.set_ylabel(col,fontsize=90)

    # enlarge the numeric tick labels without overwriting their text
    ax.tick_params(axis='y', labelsize=30)
    ax.set_xticklabels(["No CKD","CKD"],fontsize=30)

plt.show()

In [None]:
ohe_data.columns

In [None]:
# Hypothesis 1 - Chi-Square Test for Impact of Specific Gravity on Chronic Kidney Disease
# rows: specific_gravity values, columns: the 0/1 'notckd' indicator
contingency_table = pd.crosstab(index=ohe_data['specific_gravity'],
                                columns=ohe_data['chronic_kidney_disease:notckd'])
chi2, p_value, dof, expected = chi2_contingency(contingency_table)
# decide at the 5% significance level
print(" Reject the null hypothesis" if p_value < 0.05 else "Fail to reject the null hypothesis")

In [None]:
# Hypothesis 2 - No significant difference in albumin levels between noCKD and CKD groups
target_flag = ohe_data['chronic_kidney_disease:notckd']
albumin_noCKD = ohe_data.loc[target_flag == 1, 'albumin']
albumin_CKD = ohe_data.loc[target_flag == 0, 'albumin']

# two-sided Mann-Whitney U test on the two samples, decided at the 5% level
statistic, p_value = mannwhitneyu(albumin_noCKD, albumin_CKD, alternative='two-sided')
print(" Reject the null hypothesis" if p_value < 0.05 else "Fail to reject the null hypothesis")

In [None]:
# Hypothesis 3 - No significant difference in sugar levels between noCKD and CKD groups
target_flag = ohe_data['chronic_kidney_disease:notckd']
sugar_noCKD = ohe_data.loc[target_flag == 1, 'sugar']
sugar_CKD = ohe_data.loc[target_flag == 0, 'sugar']

# two-sided Mann-Whitney U test on the two samples, decided at the 5% level
statistic, p_value = mannwhitneyu(sugar_noCKD, sugar_CKD, alternative='two-sided')
print(" Reject the null hypothesis" if p_value < 0.05 else "Fail to reject the null hypothesis")

In [None]:
#Hypothesis 4 - No significant difference in blood glucose levels between nockd and ckd
target_flag = ohe_data['chronic_kidney_disease:notckd']
blood_glucose_noCKD = ohe_data.loc[target_flag == 1, 'blood_glucose']
blood_glucose_CKD = ohe_data.loc[target_flag == 0, 'blood_glucose']

# Mood's median test on the two samples, decided at the 5% level
statistic, p_value, medians, table = median_test(blood_glucose_noCKD, blood_glucose_CKD)
print(" Reject the null hypothesis" if p_value < 0.05 else "Fail to reject the null hypothesis")

In [None]:
#Hypothesis 5 - No significant difference in blood urea levels between nockd and ckd
target_flag = ohe_data['chronic_kidney_disease:notckd']
blood_urea_noCKD = ohe_data.loc[target_flag == 1, 'blood_urea']
blood_urea_CKD = ohe_data.loc[target_flag == 0, 'blood_urea']

# Mood's median test on the two samples, decided at the 5% level
statistic, p_value, medians, table = median_test(blood_urea_noCKD, blood_urea_CKD)
print(" Reject the null hypothesis" if p_value < 0.05 else "Fail to reject the null hypothesis")

In [None]:
# Hypothesis 6 - No significant difference in serum creatinine levels between noCKD and CKD groups
target_flag = ohe_data['chronic_kidney_disease:notckd']
serum_noCKD = ohe_data.loc[target_flag == 1, 'serum_creatinine']
serum_CKD = ohe_data.loc[target_flag == 0, 'serum_creatinine']

# two-sided Mann-Whitney U test on the two samples, decided at the 5% level
statistic, p_value = mannwhitneyu(serum_noCKD, serum_CKD, alternative='two-sided')
print(" Reject the null hypothesis" if p_value < 0.05 else "Fail to reject the null hypothesis")

In [None]:
#Hypothesis 7 - No significant difference in sodium levels between nockd and ckd
target_flag = ohe_data['chronic_kidney_disease:notckd']
sodium_noCKD = ohe_data.loc[target_flag == 1, 'sodium']
sodium_CKD = ohe_data.loc[target_flag == 0, 'sodium']

# Mood's median test on the two samples, decided at the 5% level
statistic, p_value, medians, table = median_test(sodium_noCKD, sodium_CKD)
print(" Reject the null hypothesis" if p_value < 0.05 else "Fail to reject the null hypothesis")

In [None]:
#Hypothesis 8 - No significant difference in haemoglobin levels between nockd and ckd
target_flag = ohe_data['chronic_kidney_disease:notckd']
haemoglobin_noCKD = ohe_data.loc[target_flag == 1, 'haemoglobin']
haemoglobin_CKD = ohe_data.loc[target_flag == 0, 'haemoglobin']

# independent two-sample t-test (scipy defaults), decided at the 5% level
statistic, p_value = ttest_ind(haemoglobin_noCKD, haemoglobin_CKD)
print(" Reject the null hypothesis" if p_value < 0.05 else "Fail to reject the null hypothesis")

In [None]:
#Hypothesis 9 - blood_glucose, sugar and the target variable

# Kruskal-Wallis H-test per feature; the noCKD / CKD samples are the ones built in the
# Hypothesis 3 (sugar) and Hypothesis 4 (blood glucose) cells.
sugar_kw = kruskal(sugar_noCKD, sugar_CKD)
blood_glucose_kw = kruskal(blood_glucose_noCKD, blood_glucose_CKD)

kw_statistic_blood_glucose, p_value_blood_glucose = blood_glucose_kw
kw_statistic_sugar, p_value_sugar = sugar_kw

print(f"Kruskal-Wallis - Blood Glucose: H = {kw_statistic_blood_glucose}, p = {p_value_blood_glucose}")
print(f"Kruskal-Wallis - Sugar: H = {kw_statistic_sugar}, p = {p_value_sugar}")


In [None]:
#Hypothesis 10 - packed_cell_volume, red_blood_cell_count and CKD

target_flag = ohe_data['chronic_kidney_disease:notckd']
not_ckd_rows = target_flag == 1
ckd_rows = target_flag == 0

pcv_noCKD = ohe_data.loc[not_ckd_rows, 'packed_cell_volume']
pcv_CKD = ohe_data.loc[ckd_rows, 'packed_cell_volume']

rbc_noCKD = ohe_data.loc[not_ckd_rows, 'red_blood_cell_count']
rbc_CKD = ohe_data.loc[ckd_rows, 'red_blood_cell_count']

# independent two-sample t-tests, one per feature
t_statistic_pcv, p_value_pcv = ttest_ind(pcv_noCKD, pcv_CKD)
t_statistic_rbc, p_value_rbc = ttest_ind(rbc_noCKD, rbc_CKD)

# report both results
print(f"Two-sample t-test - Packed Cell Volume: t = {t_statistic_pcv}, p = {p_value_pcv}")
print(f"Two-sample t-test - Red Blood Cell Count: t = {t_statistic_rbc}, p = {p_value_rbc}")


# Scaling

In [None]:
# separate the target column ('chronic_kidney_disease:notckd') from the predictors
y = ohe_data["chronic_kidney_disease:notckd"]
X = ohe_data.drop(columns=["chronic_kidney_disease:notckd"])

In [None]:
ohe_scaled_df = StandardScaler().fit_transform(ohe_data)
scaled_df = StandardScaler().fit_transform(X)

# Splitting Data

In [None]:
# Split the unscaled features first, then fit the scaler on the training rows only, so
# the test rows never influence the scaling statistics (no train/test leakage).
# random_state keeps the row assignment reproducible.
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=30)

feature_scaler = StandardScaler().fit(X_train)
X_train = feature_scaler.transform(X_train)
X_test = feature_scaler.transform(X_test)

# KNN Classifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, auc, roc_curve

In [None]:
k_values = list(range(3, 22, 2))  # odd neighbourhood sizes 3, 5, ..., 21
accuracy_scores = []

for k in k_values:
    # fit a k-nearest-neighbours classifier and score it on the test split
    knn_model = KNeighborsClassifier(n_neighbors=k)
    knn_model.fit(X_train, y_train)
    y_pred_test = knn_model.predict(X_test)

    knn_accuracy = accuracy_score(y_test,y_pred_test)
    accuracy_scores.append(knn_accuracy)

    print(f"Results for k = {k}")
    print(f"Test Accuracy of KNN is {knn_accuracy} \n")

    print(f"Confusion Matrix :- \n{confusion_matrix(y_test, y_pred_test)}\n")
    print(f"Classification Report :- \n{classification_report(y_test, y_pred_test)}")
    print("=" * 50)






In [None]:
# accuracy of the KNN model for every evaluated k
plt.figure(figsize=(10, 6))
plt.plot(k_values, accuracy_scores, marker='o')
plt.xlabel('K-Value')
plt.ylabel('Accuracy')
plt.title('K-Value vs. Accuracy')
# tick exactly at the evaluated k values, so the last one (21) gets a tick as well
plt.xticks(k_values)
plt.grid(True)
plt.show()

In [None]:
#considering k=15 as optimal k-value
knn_model = KNeighborsClassifier(n_neighbors=15).fit(X_train, y_train)

# hard class predictions drive the accuracy and the confusion matrix
y_pred = knn_model.predict(X_test)
knn_accuracy = accuracy_score(y_test,y_pred)
confusion = confusion_matrix(y_test,y_pred)

# column 1 of predict_proba belongs to the second entry of knn_model.classes_;
# presumably label 1, i.e. 'notckd' for this target column - TODO confirm labels are 0/1
class_probabilities = knn_model.predict_proba(X_test)
y_pred_probs = class_probabilities[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_pred_probs)
roc_auc = auc(fpr, tpr)

print(f"AUC of ROC curve: {roc_auc}\n")


In [None]:
# confusion_matrix orders rows and columns by sorted label. The target column is
# 'chronic_kidney_disease:notckd', so label 0 (first) is CKD and label 1 is noCKD.
class_names = ["CKD", "noCKD"]

plt.figure(figsize=(8, 6))
sns.heatmap(confusion, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names, yticklabels=class_names, cbar=False)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')

plt.show()

## Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# random_state pins the randomised splitter so the tree, and every metric derived from
# it, is reproducible from run to run
dt = DecisionTreeClassifier(criterion='entropy',splitter='random',random_state=30)
dt.fit(X_train,y_train)

y_pred = dt.predict(X_test)
dt_accuracy = accuracy_score(y_test, y_pred)

# second predict_proba column = second class in dt.classes_
y_pred_probs = dt.predict_proba(X_test)[:,1]
fpr, tpr, _ = roc_curve(y_test,y_pred_probs)
roc_auc = auc(fpr, tpr)
confusion = confusion_matrix(y_test, y_pred)

print(f"Test Accuracy is {dt_accuracy} \n")

print(f"Confusion Matrix :- \n{confusion}\n")
# reuse y_pred instead of predicting the test set a second time
print(f"Classification Report :- \n {classification_report(y_test, y_pred)}")



print(f"AUC of ROC curve: {roc_auc}\n")

In [None]:
# ROC curve built from the fpr / tpr / roc_auc values currently in the namespace
roc_fig, roc_ax = plt.subplots(figsize=(10, 6))
roc_ax.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
roc_ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # chance diagonal
roc_ax.set_xlim([0.0, 1.0])
roc_ax.set_ylim([0.0, 1.05])
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('Receiver Operating Characteristic (ROC) Curve')
roc_ax.legend(loc='lower right')
plt.show()

In [None]:
# confusion_matrix orders rows and columns by sorted label. The target column is
# 'chronic_kidney_disease:notckd', so label 0 (first) is CKD and label 1 is noCKD.
class_names = ["CKD", "noCKD"]

plt.figure(figsize=(8, 6))
sns.heatmap(confusion, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names, yticklabels=class_names, cbar=False)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')

plt.show()

## Random forest 

In [None]:
from sklearn.ensemble import RandomForestClassifier
from matplotlib.legend_handler import HandlerLine2D

In [None]:
#lets see how default random forest classifier performs

rf = RandomForestClassifier().fit(X_train, y_train)  # fit() returns the estimator itself

# class probabilities feed the ROC curve, hard predictions feed the accuracy
y_pred_probs = rf.predict_proba(X_test)[:,1]
y_pred = rf.predict(X_test)
rf_accuracy = accuracy_score(y_test,y_pred)

fpr, tpr, _ = roc_curve(y_test,y_pred_probs)
roc_auc = auc(fpr,tpr)

for message in (f"The accuracy score is {rf_accuracy}", f"the ROC AUC is {roc_auc}"):
    print(message)

In [None]:
#finding the optimum number of trees in forest
n_estimators = [2,3,4,5,10,15,20,25,30,50, 100, 150, 200, 250, 300]
train_results = []
test_results = []
for estimator in n_estimators:
    rf = RandomForestClassifier(n_estimators=estimator, n_jobs=-1)
    rf.fit(X_train, y_train)

    # ROC AUC needs continuous scores: with hard 0/1 predictions the curve collapses to
    # a single operating point, so use the predicted probability of the second class.
    train_scores = rf.predict_proba(X_train)[:, 1]
    fpr, tpr, _ = roc_curve(y_train, train_scores)
    roc_auc = auc(fpr, tpr)
    train_results.append(roc_auc)

    test_scores = rf.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, test_scores)
    roc_auc = auc(fpr, tpr)
    test_results.append(roc_auc)


line1, = plt.plot(n_estimators, train_results, 'b', label="Train AUC")
line2, = plt.plot(n_estimators, test_results, 'r', label="Test AUC")
plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)})
plt.ylabel("AUC score")
plt.xlabel("n_estimators")
plt.show()

In [None]:
#checking model with recommended parameters
# max_features='sqrt' is what 'auto' meant for classifiers; recent scikit-learn
# releases no longer accept 'auto'.
rf = RandomForestClassifier(criterion = 'entropy', max_depth = 11,
                            max_features = 'sqrt', min_samples_leaf = 2,
                            min_samples_split = 3, n_estimators = 130)
rf.fit(X_train, y_train)

# accuracy score, confusion matrix and classification report of random forest

# store the predictions of THIS model; without the assignment the accuracy below would
# be computed from whatever y_pred an earlier cell left behind
y_pred = rf.predict(X_test)
rf_accuracy = accuracy_score(y_test, y_pred)

y_pred_probs = rf.predict_proba(X_test)[:, 1]  # probability of the second class in rf.classes_
fpr, tpr, _ = roc_curve(y_test, y_pred_probs)
roc_auc = auc(fpr, tpr)

print(f"Training Accuracy is {accuracy_score(y_train, rf.predict(X_train))}")
print(f"Test Accuracy is {rf_accuracy} \n")

print(f"Confusion Matrix :- \n{confusion_matrix(y_test, y_pred)}\n")
print(f"Classification Report :- \n {classification_report(y_test, y_pred)}")



print(f"AUC of ROC curve: {roc_auc}\n")

## AdaBoost/Adaptive Boost Classifier

In [None]:
from sklearn.ensemble import AdaBoostClassifier

In [None]:
#training and fitting model, then printing scores
# The base learner is passed positionally: scikit-learn renamed the keyword from
# `base_estimator` to `estimator`, but it is the first parameter under either name.
ada = AdaBoostClassifier(dt)
ada.fit(X_train, y_train)

y_pred = ada.predict(X_test)
ada_accuracy = accuracy_score(y_test, y_pred)

# second predict_proba column = second class in ada.classes_
y_pred_probs = ada.predict_proba(X_test)[:,1]
fpr, tpr, _ = roc_curve(y_test,y_pred_probs)
roc_auc = auc(fpr, tpr)
confusion = confusion_matrix(y_test, y_pred)

print(f"Test Accuracy is {ada_accuracy} \n")

print(f"Confusion Matrix :- \n{confusion}\n")
# reuse y_pred instead of predicting the test set a second time
print(f"Classification Report :- \n {classification_report(y_test, y_pred)}")



print(f"AUC of ROC curve: {roc_auc}\n")

In [None]:
#plotting ROC
# drawn from the fpr / tpr / roc_auc values currently in the namespace
roc_fig, roc_ax = plt.subplots(figsize=(10, 6))
roc_ax.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
roc_ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # chance diagonal
roc_ax.set_xlim([0.0, 1.0])
roc_ax.set_ylim([0.0, 1.05])
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('Receiver Operating Characteristic (ROC) Curve')
roc_ax.legend(loc='lower right')
plt.show()

In [None]:
#plotting confusion matrix
# confusion_matrix orders rows and columns by sorted label. The target column is
# 'chronic_kidney_disease:notckd', so label 0 (first) is CKD and label 1 is noCKD.
class_names = ["CKD", "noCKD"]

plt.figure(figsize=(8, 6))
sns.heatmap(confusion, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names, yticklabels=class_names, cbar=False)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')

plt.show()

## Gradient Boosting Classifier

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
#finding the optimum number of n_estimator
n_estimators = [2,3,4,5,10,15,20,25,30,50]
train_results = []
test_results = []
for estimator in n_estimators:
    gb = GradientBoostingClassifier(n_estimators=estimator,criterion='squared_error')
    gb.fit(X_train, y_train)

    # ROC AUC needs continuous scores: with hard 0/1 predictions the curve collapses to
    # a single operating point, so use the predicted probability of the second class.
    train_scores = gb.predict_proba(X_train)[:, 1]
    fpr, tpr, _ = roc_curve(y_train, train_scores)
    roc_auc = auc(fpr, tpr)
    train_results.append(roc_auc)

    test_scores = gb.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, test_scores)
    roc_auc = auc(fpr, tpr)
    test_results.append(roc_auc)


line1, = plt.plot(n_estimators, train_results, 'b', label="Train AUC")
line2, = plt.plot(n_estimators, test_results, 'r', label="Test AUC")
plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)})
plt.ylabel("AUC score")
plt.xlabel("n_estimators")
plt.show()

In [None]:
# shallow boosted trees; early stopping monitors a 20% validation fraction and stops
# after 10 iterations without improvement
gb_params = dict(n_estimators=25, validation_fraction=0.2, n_iter_no_change=10,
                 learning_rate=0.1, max_depth=2)
gb = GradientBoostingClassifier(**gb_params)
gb.fit(X_train, y_train)

# hard predictions for accuracy / confusion matrix / report
y_pred = gb.predict(X_test)
gb_accuracy = accuracy_score(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)

# probabilities of the second class for the ROC curve
y_pred_probs = gb.predict_proba(X_test)[:,1]
fpr, tpr, _ = roc_curve(y_test, y_pred_probs)
roc_auc = auc(fpr, tpr)

print(f"Test Accuracy is {gb_accuracy} \n")
print(f"Confusion Matrix:\n{confusion}\n")
print(f"Classification Report:\n{classification_report(y_test, y_pred)}")
print(f"AUC of ROC curve: {roc_auc}\n")

In [None]:
#plotting ROC
# drawn from the fpr / tpr / roc_auc values currently in the namespace
roc_fig, roc_ax = plt.subplots(figsize=(10, 6))
roc_ax.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
roc_ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # chance diagonal
roc_ax.set_xlim([0.0, 1.0])
roc_ax.set_ylim([0.0, 1.05])
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('Receiver Operating Characteristic (ROC) Curve')
roc_ax.legend(loc='lower right')
plt.show()

In [None]:
#plotting confusion matrix
# confusion_matrix orders rows and columns by sorted label. The target column is
# 'chronic_kidney_disease:notckd', so label 0 (first) is CKD and label 1 is noCKD.
class_names = ["CKD", "noCKD"]

plt.figure(figsize=(8, 6))
sns.heatmap(confusion, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names, yticklabels=class_names, cbar=False)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')

plt.show()

## Stochastic Gradient Boosting Classifier


In [None]:
#finding the optimum number of n_estimator
n_estimators = [2,3,4,5,10,15,20,25,30,50,100,200,300]
train_results = []
test_results = []
for estimator in n_estimators:
    sgb = GradientBoostingClassifier(n_estimators=estimator,criterion='squared_error')
    sgb.fit(X_train, y_train)

    # Score the model fitted in THIS iteration (`sgb`), not the `gb` object from the
    # previous section. ROC AUC is computed from predicted probabilities of the second
    # class, since hard 0/1 predictions collapse the curve to a single point.
    train_scores = sgb.predict_proba(X_train)[:, 1]
    fpr, tpr, _ = roc_curve(y_train, train_scores)
    roc_auc = auc(fpr, tpr)
    train_results.append(roc_auc)

    test_scores = sgb.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, test_scores)
    roc_auc = auc(fpr, tpr)
    test_results.append(roc_auc)


line1, = plt.plot(n_estimators, train_results, 'b', label="Train AUC")
line2, = plt.plot(n_estimators, test_results, 'r', label="Test AUC")
plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)})
plt.ylabel("AUC score")
plt.xlabel("n_estimators")
plt.show()

In [None]:
# NOTE(review): subsample=1 fits every tree on all training rows, so no row subsampling
# takes place; the randomness here comes from max_features=1. Confirm that this is the
# intended "stochastic" configuration.
sgb_params = dict(max_depth=2, subsample=1, max_features=1, n_estimators=25, random_state=42)
sgb = GradientBoostingClassifier(**sgb_params)
sgb.fit(X_train, y_train)

# hard predictions for accuracy / confusion matrix / report
y_pred = sgb.predict(X_test)
sgb_accuracy = accuracy_score(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)

# probabilities of the second class for the ROC curve
y_pred_probs = sgb.predict_proba(X_test)[:,1]
fpr, tpr, _ = roc_curve(y_test, y_pred_probs)
roc_auc = auc(fpr, tpr)

print(f"Test Accuracy is {sgb_accuracy} \n")
print(f"Confusion Matrix:\n{confusion}\n")
print(f"Classification Report:\n{classification_report(y_test, y_pred)}")
print(f"AUC of ROC curve: {roc_auc}\n")

In [None]:
#plotting ROC
# drawn from the fpr / tpr / roc_auc values currently in the namespace
roc_fig, roc_ax = plt.subplots(figsize=(10, 6))
roc_ax.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
roc_ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # chance diagonal
roc_ax.set_xlim([0.0, 1.0])
roc_ax.set_ylim([0.0, 1.05])
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('Receiver Operating Characteristic (ROC) Curve')
roc_ax.legend(loc='lower right')
plt.show()

In [None]:
#plotting confusion matrix
# confusion_matrix orders rows and columns by sorted label. The target column is
# 'chronic_kidney_disease:notckd', so label 0 (first) is CKD and label 1 is noCKD.
class_names = ["CKD", "noCKD"]

plt.figure(figsize=(8, 6))
sns.heatmap(confusion, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names, yticklabels=class_names, cbar=False)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')

plt.show()

## XgBoost Classifier

In [None]:
from xgboost import XGBClassifier

In [None]:
# gradient-boosted trees with a logistic objective for the binary target
xgb_params = dict(objective = 'binary:logistic', learning_rate = 0.5, max_depth = 4, n_estimators = 200)
xgb = XGBClassifier(**xgb_params)
xgb.fit(X_train, y_train)

# hard predictions for accuracy / confusion matrix / report
y_pred = xgb.predict(X_test)
xgb_accuracy = accuracy_score(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)

# probabilities of the second class for the ROC curve
y_pred_probs = xgb.predict_proba(X_test)[:,1]
fpr, tpr, _ = roc_curve(y_test, y_pred_probs)
roc_auc = auc(fpr, tpr)

print(f"Test Accuracy is {xgb_accuracy} \n")
print(f"Confusion Matrix:\n{confusion}\n")
print(f"Classification Report:\n{classification_report(y_test, y_pred)}")
print(f"AUC of ROC curve: {roc_auc}\n")

In [None]:
#plotting ROC
# drawn from the fpr / tpr / roc_auc values currently in the namespace
roc_fig, roc_ax = plt.subplots(figsize=(10, 6))
roc_ax.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
roc_ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # chance diagonal
roc_ax.set_xlim([0.0, 1.0])
roc_ax.set_ylim([0.0, 1.05])
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('Receiver Operating Characteristic (ROC) Curve')
roc_ax.legend(loc='lower right')
plt.show()

In [None]:
#plotting confusion matrix
# confusion_matrix orders rows and columns by sorted label. The target column is
# 'chronic_kidney_disease:notckd', so label 0 (first) is CKD and label 1 is noCKD.
class_names = ["CKD", "noCKD"]

plt.figure(figsize=(8, 6))
sns.heatmap(confusion, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names, yticklabels=class_names, cbar=False)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')

plt.show()

## CatBoost Classifier

In [None]:
from catboost import CatBoostClassifier

In [None]:
# a deliberately small CatBoost model: 7 boosting iterations, training log silenced
cat = CatBoostClassifier(iterations=7,verbose=0)
cat.fit(X_train, y_train)

# hard predictions for accuracy / confusion matrix / report
y_pred = cat.predict(X_test)
cat_accuracy = accuracy_score(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)

# probabilities of the second class for the ROC curve
y_pred_probs = cat.predict_proba(X_test)[:,1]
fpr, tpr, _ = roc_curve(y_test, y_pred_probs)
roc_auc = auc(fpr, tpr)

print(f"Test Accuracy is {cat_accuracy} \n")
print(f"Confusion Matrix:\n{confusion}\n")
print(f"Classification Report:\n{classification_report(y_test, y_pred)}")
print(f"AUC of ROC curve: {roc_auc}\n")


In [None]:
#plotting ROC
# drawn from the fpr / tpr / roc_auc values currently in the namespace
roc_fig, roc_ax = plt.subplots(figsize=(10, 6))
roc_ax.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
roc_ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # chance diagonal
roc_ax.set_xlim([0.0, 1.0])
roc_ax.set_ylim([0.0, 1.05])
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('Receiver Operating Characteristic (ROC) Curve')
roc_ax.legend(loc='lower right')
plt.show()

In [None]:
#plotting confusion matrix
# confusion_matrix orders rows and columns by sorted label. The target column is
# 'chronic_kidney_disease:notckd', so label 0 (first) is CKD and label 1 is noCKD.
class_names = ["CKD", "noCKD"]

plt.figure(figsize=(8, 6))
sns.heatmap(confusion, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names, yticklabels=class_names, cbar=False)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')

plt.show()

## Extra Trees Classifier

In [None]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import cross_val_predict

In [None]:
et = ExtraTreesClassifier(n_estimators=100, max_depth=5, random_state=30)

# 5-fold cross-validated accuracy on the training split (a check against overfitting)
y_pred_cv = cross_val_predict(et, X_train, y_train, cv=5)
et_cv_accuracy = accuracy_score(y_train, y_pred_cv)

et.fit(X_train, y_train)

y_pred = et.predict(X_test)

# Accuracy on the held-out test split. This is the number printed as "Test Accuracy"
# and stored in et_accuracy, so it is measured on y_test rather than on the
# cross-validated training predictions.
et_accuracy = accuracy_score(y_test, y_pred)

# probabilities of the second class for the ROC curve
y_pred_probs = et.predict_proba(X_test)[:,1]
fpr, tpr, _ = roc_curve(y_test, y_pred_probs)
roc_auc = auc(fpr, tpr)
confusion = confusion_matrix(y_test, y_pred)

print(f"Cross-validated Training Accuracy is {et_cv_accuracy} \n")
print(f"Test Accuracy is {et_accuracy} \n")
print(f"Confusion Matrix:\n{confusion}\n")
print(f"Classification Report:\n{classification_report(y_test, y_pred)}")
print(f"AUC of ROC curve: {roc_auc}\n")

In [None]:
#plotting ROC
# drawn from the fpr / tpr / roc_auc values currently in the namespace
roc_fig, roc_ax = plt.subplots(figsize=(10, 6))
roc_ax.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
roc_ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # chance diagonal
roc_ax.set_xlim([0.0, 1.0])
roc_ax.set_ylim([0.0, 1.05])
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('Receiver Operating Characteristic (ROC) Curve')
roc_ax.legend(loc='lower right')
plt.show()

In [None]:
#plotting confusion matrix
# confusion_matrix orders rows and columns by sorted label. The target column is
# 'chronic_kidney_disease:notckd', so label 0 (first) is CKD and label 1 is noCKD.
class_names = ["CKD", "noCKD"]

plt.figure(figsize=(8, 6))
sns.heatmap(confusion, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names, yticklabels=class_names, cbar=False)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')

plt.show()

## LightGBM Classifier

In [None]:
from lightgbm import LGBMClassifier

In [None]:
# NOTE(review): learning_rate = 2 is unusually large for boosting - confirm it is intended.
lgbm = LGBMClassifier(learning_rate = 2, n_estimators=100,max_depth=3)

# 5-fold cross-validated accuracy on the training split (a check against overfitting)
y_pred_cv = cross_val_predict(lgbm, X_train, y_train, cv=5)
lgbm_cv_accuracy = accuracy_score(y_train, y_pred_cv)

lgbm.fit(X_train, y_train)

y_pred = lgbm.predict(X_test)

# Accuracy on the held-out test split. This is the number printed as "Test Accuracy"
# and stored in lgbm_accuracy, so it is measured on y_test rather than on the
# cross-validated training predictions.
lgbm_accuracy = accuracy_score(y_test, y_pred)

# probabilities of the second class for the ROC curve
y_pred_probs = lgbm.predict_proba(X_test)[:,1]
fpr, tpr, _ = roc_curve(y_test, y_pred_probs)
roc_auc = auc(fpr, tpr)
confusion = confusion_matrix(y_test, y_pred)

print(f"Cross-validated Training Accuracy is {lgbm_cv_accuracy} \n")
print(f"Test Accuracy is {lgbm_accuracy} \n")
print(f"Confusion Matrix:\n{confusion}\n")
print(f"Classification Report:\n{classification_report(y_test, y_pred)}")
print(f"AUC of ROC curve: {roc_auc}\n")


In [None]:
#plotting confusion matrix
# confusion_matrix orders rows and columns by sorted label. The target column is
# 'chronic_kidney_disease:notckd', so label 0 (first) is CKD and label 1 is noCKD.
class_names = ["CKD", "noCKD"]

plt.figure(figsize=(8, 6))
sns.heatmap(confusion, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names, yticklabels=class_names, cbar=False)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')

plt.show()

## Performance Comparison

In [None]:
# Accuracy recorded for every model trained above; each *_accuracy variable is set in
# that model's own section. The Random Forest section sets rf_accuracy, so it is
# included in the comparison as well.
model_scores = {
    'KNN': knn_accuracy,
    'Decision Tree Classifier': dt_accuracy,
    'Random Forest Classifier': rf_accuracy,
    'Ada Boost Classifier': ada_accuracy,
    'Gradient Boosting Classifier': gb_accuracy,
    'Stochastic Gradient Boosting': sgb_accuracy,
    'XgBoost': xgb_accuracy,
    'Cat Boost': cat_accuracy,
    'Extra Trees Classifier': et_accuracy,
    'LGBM Classifier': lgbm_accuracy,
}

performance = pd.DataFrame({
    'Model' : list(model_scores.keys()),
    'Score' : list(model_scores.values())
})


In [None]:
# Sort ascending by score. drop=True discards the old row labels instead of adding them
# as an extra 'index' column, which also keeps the cell safe to re-run.
performance = performance.sort_values(by='Score', ascending=True).reset_index(drop=True)


In [None]:

# bar chart of the model scores, one shade of blue per row of `performance`
plt.figure(figsize=(20, 15))
bar_palette = sns.color_palette("Blues", len(performance))
sns.barplot(data=performance, x='Model', y='Score', palette=bar_palette)

plt.title('\nAccuracy of Various Models\n', fontsize=50)
plt.xlabel('Model',fontsize=25)
plt.ylabel('Score',fontsize=25)
plt.xticks(rotation=45, ha='right', fontsize=20)
plt.yticks(fontsize=20)

# write each score, rounded to two decimals, above its bar; the row label of
# `performance` is used as the bar's x position
for bar_position, model_row in performance.iterrows():
    score = model_row['Score']
    plt.text(bar_position, score, round(score, 2), ha='center', va='bottom', fontsize=20)

plt.tight_layout()
plt.show()
