In [28]:
# Plot data below with three imports
import plotly as plot
import plotly.express as px
import matplotlib as matplot

# Manipulate data below with three imports
import numpy as np
import pandas as pd
import scipy

In [29]:
# Allow for OS-level manipulation
import os
import sys

In [30]:
contents = os.listdir("../dataset")
contents

['bp_disease_pavanbodanki.csv',
 'BP_jimohyusuf.csv',
 'diabetes.csv',
 'kidney_disease.csv']

In [31]:
df_bp_datasets = [ pd.read_csv(f"../dataset/{x}") for x in contents ]

In [32]:
df_bp_datasets[0]

Unnamed: 0,Patient_Number,Blood_Pressure_Abnormality,Level_of_Hemoglobin,Genetic_Pedigree_Coefficient,Age,BMI,Sex,Pregnancy,Smoking,Physical_activity,salt_content_in_the_diet,alcohol_consumption_per_day,Level_of_Stress,Chronic_kidney_disease,Adrenal_and_thyroid_disorders
0,1,1,11.28,0.90,34,23,1,1.0,0,45961,48071,,2,1,1
1,2,0,9.75,0.23,54,33,1,,0,26106,25333,205.0,3,0,0
2,3,1,10.79,0.91,70,49,0,,0,9995,29465,67.0,2,1,0
3,4,0,11.00,0.43,71,50,0,,0,10635,7439,242.0,1,1,0
4,5,1,14.17,0.83,52,19,0,,0,15619,49644,397.0,2,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,1996,1,10.14,0.02,69,26,1,,1,26118,47568,144.0,3,1,0
1996,1997,1,11.77,1.00,24,45,1,1.0,1,2572,8063,,3,1,1
1997,1998,1,16.91,0.22,18,42,0,,0,14933,24753,,2,1,1
1998,1999,0,11.15,0.72,46,45,1,,1,18157,15275,253.0,3,0,1


In [33]:
df_bp_datasets[1]

Unnamed: 0,age,SBP,DBP,BS,BodyTemp,HeartRate,RiskLevel,MinExercise
0,25,130,80,15.0,98.0,86,high risk,32.0
1,35,140,90,13.0,98.0,70,high risk,36.0
2,29,90,70,8.0,100.0,80,high risk,28.0
3,30,140,85,7.0,98.0,70,high risk,34.0
4,35,120,60,6.1,98.0,76,low risk,24.0
...,...,...,...,...,...,...,...,...
1009,22,120,60,15.0,98.0,80,high risk,24.0
1010,55,120,90,18.0,98.0,60,high risk,36.0
1011,35,85,60,19.0,98.0,86,high risk,24.0
1012,43,120,90,18.0,98.0,70,high risk,36.0


In [34]:
px.imshow(df_bp_datasets[0].corr())

In [35]:
# MLXTEND CAN BE AN OPTIONAL IMPORT
from sklearn.preprocessing import \
    OneHotEncoder, LabelEncoder, \
    Normalizer

LabelEncoder = LabelEncoder()
df_bp_datasets[1]["RiskLevel"] = LabelEncoder.fit_transform(df_bp_datasets[1]["RiskLevel"])
df_bp_datasets[1]

Unnamed: 0,age,SBP,DBP,BS,BodyTemp,HeartRate,RiskLevel,MinExercise
0,25,130,80,15.0,98.0,86,0,32.0
1,35,140,90,13.0,98.0,70,0,36.0
2,29,90,70,8.0,100.0,80,0,28.0
3,30,140,85,7.0,98.0,70,0,34.0
4,35,120,60,6.1,98.0,76,1,24.0
...,...,...,...,...,...,...,...,...
1009,22,120,60,15.0,98.0,80,0,24.0
1010,55,120,90,18.0,98.0,60,0,36.0
1011,35,85,60,19.0,98.0,86,0,24.0
1012,43,120,90,18.0,98.0,70,0,36.0


In [36]:
# Visualize AGE Histogram
px.histogram(df_bp_datasets[1][0:3])

In [37]:
px.imshow(df_bp_datasets[1].corr())

# What does SBP, DBP, BS mean?
# SBP = Systolic Blood Pressure
# DBP = Diastolic Blood Pressure
# BS = Blood Sugar

print(df_bp_datasets[1].columns)

Index(['age', 'SBP', 'DBP', 'BS', 'BodyTemp', 'HeartRate', 'RiskLevel',
       'MinExercise'],
      dtype='object')


In [38]:
df_bp_datasets[2]
print(df_bp_datasets[2].columns)
px.imshow(df_bp_datasets[2].corr())

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')


In [39]:
df_bp_datasets[3]=df_bp_datasets[3].dropna()
df_bp_datasets[3].shape
print(df_bp_datasets[3].columns)
df_bp_datasets[3].head()

def estimate_gfr(age, serum_creatinine):
    # Basic formula (simplified MDRD equation — can be refined later)
    return 175 * (serum_creatinine ** -1.154) * (age ** -0.203)

df_bp_datasets[3]['gfr'] = df_bp_datasets[3].apply(lambda row: estimate_gfr(row['age'], row['sc']), axis=1)

def classify_ckd_stage(gfr):
    if gfr >= 90:
        return "Stage 1 - Normal Function"
    elif gfr >= 60:
        return "Stage 2 - Mild CKD"
    elif gfr >= 45:
        return "Stage 3a - Mild to Moderate CKD"
    elif gfr >= 30:
        return "Stage 3b - Moderate to Severe CKD"
    elif gfr >= 15:
        return "Stage 4 - Severe CKD"
    else:
        return "Stage 5 - Kidney Failure"

df_bp_datasets[3]['ckd_stage'] = df_bp_datasets[3]['gfr'].apply(classify_ckd_stage)

print(df_bp_datasets[3][['gfr', 'ckd_stage']].head())


Index(['id', 'age', 'bp', 'sg', 'al', 'su', 'rbc', 'pc', 'pcc', 'ba', 'bgr',
       'bu', 'sc', 'sod', 'pot', 'hemo', 'pcv', 'wc', 'rc', 'htn', 'dm', 'cad',
       'appet', 'pe', 'ane', 'classification'],
      dtype='object')
          gfr                 ckd_stage
3   17.087393      Stage 4 - Severe CKD
9    8.010292  Stage 5 - Kidney Failure
11  23.987121      Stage 4 - Severe CKD
14  14.584319  Stage 5 - Kidney Failure
20  15.795288      Stage 4 - Severe CKD


In [40]:
df_bp_datasets[3]

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,...,rc,htn,dm,cad,appet,pe,ane,classification,gfr,ckd_stage
3,3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,...,3.9,yes,no,no,poor,yes,yes,ckd,17.087393,Stage 4 - Severe CKD
9,9,53.0,90.0,1.020,2.0,0.0,abnormal,abnormal,present,notpresent,...,3.7,yes,yes,no,poor,no,yes,ckd,8.010292,Stage 5 - Kidney Failure
11,11,63.0,70.0,1.010,3.0,0.0,abnormal,abnormal,present,notpresent,...,3.8,yes,yes,no,poor,yes,no,ckd,23.987121,Stage 4 - Severe CKD
14,14,68.0,80.0,1.010,3.0,2.0,normal,abnormal,present,present,...,2.6,yes,yes,yes,poor,yes,no,ckd,14.584319,Stage 5 - Kidney Failure
20,20,61.0,80.0,1.015,2.0,0.0,abnormal,abnormal,notpresent,notpresent,...,3.2,yes,yes,yes,poor,yes,yes,ckd,15.795288,Stage 4 - Severe CKD
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,395,55.0,80.0,1.020,0.0,0.0,normal,normal,notpresent,notpresent,...,4.9,no,no,no,good,no,no,notckd,172.636665,Stage 1 - Normal Function
396,396,42.0,70.0,1.025,0.0,0.0,normal,normal,notpresent,notpresent,...,6.2,no,no,no,good,no,no,notckd,66.396209,Stage 2 - Mild CKD
397,397,12.0,80.0,1.020,0.0,0.0,normal,normal,notpresent,notpresent,...,5.4,no,no,no,good,no,no,notckd,190.536294,Stage 1 - Normal Function
398,398,17.0,60.0,1.025,0.0,0.0,normal,normal,notpresent,notpresent,...,5.9,no,no,no,good,no,no,notckd,98.459339,Stage 1 - Normal Function


In [41]:
kidney_disease_data_headers = df_bp_datasets[3].columns
kidney_disease_data_headers_dtypes = df_bp_datasets[3].dtypes

for v in zip(kidney_disease_data_headers, kidney_disease_data_headers_dtypes):
    if str.lower(str(v[1])) == "object":
        df_bp_datasets[3][v[0]] = LabelEncoder.fit_transform(df_bp_datasets[3][v[0]])

df_bp_datasets[3].head()
px.imshow(df_bp_datasets[3].corr())

In [42]:
[df_bp_datasets[3][['age', 'wc', 'sod', 'bp']].min(), df_bp_datasets[3][['age', 'wc', 'sod', 'bp']].max()]

[age      6.0
 wc       0.0
 sod    111.0
 bp      50.0
 dtype: float64,
 age     83.0
 wc      70.0
 sod    150.0
 bp     110.0
 dtype: float64]

In [43]:
df_bp_datasets[3].head()
df_bp_datasets[3] = df_bp_datasets[3].drop('id', axis=1)

In [44]:
from sklearn.decomposition import PCA, IncrementalPCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler, Normalizer

def pca_analysis(df_bp_datasets, index):
    # Standardize the data
    data = df_bp_datasets[index]
    scaler = StandardScaler()
    standardized_data = scaler.fit_transform(data)

    # Normalize the data if required
    normalized_data = Normalizer().fit_transform(standardized_data)

    # Apply PCA
    pca = PCA()
    pca.fit(normalized_data)

    # Calculate explained variance ratio
    explained_variance = pca.explained_variance_ratio_

    # Calculate cumulative explained variance
    cumulative_explained_variance = np.cumsum(explained_variance)

    # Create a plot with Plotly
    fig = px.line(
        x=range(1, len(cumulative_explained_variance) + 1),
        y=cumulative_explained_variance,
        markers=True,
        labels={
            'x': 'Number of Components',
            'y': 'Cumulative Explained Variance'
        },
        title='Cumulative Explained Variance by Number of Principal Components'
    )

    # Add a horizontal line at 95% cumulative explained variance
    fig.add_shape(
        type='line',
        x0=1,
        y0=0.95,
        x1=len(cumulative_explained_variance),
        y1=0.95,
        line=dict(color='red', dash='dash')
    )

    fig.show()

    # Find the optimal number of components that explain at least 95% of the variance
    optimal_n_components = np.argmax(cumulative_explained_variance >= 0.95) + 1

    print(f'Optimal number of components: {optimal_n_components}')
    return optimal_n_components


In [45]:
data = df_bp_datasets[3]
scaler = StandardScaler()
standardized_data = scaler.fit_transform(data)

# Normalize the data if required
normalized_data = Normalizer().fit_transform(standardized_data)

# Apply PCA
pca = PCA(2)
data_transformed = pca.fit_transform(normalized_data)
data_transformed

px.scatter(x=data_transformed[:, 0], y=data_transformed[:, 1], labels={'x': 'PC1', 'y': 'PC2'}, title='PCA Analysis', color=df_bp_datasets[3]['ckd_stage'])


In [46]:
# Try to make model...

import warnings
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score
import pandas as pd
from imblearn.over_sampling import SMOTE
import plotly.express as px

# Suppress specific warnings
warnings.filterwarnings('ignore', category=UserWarning)
warnings.filterwarnings('ignore', category=FutureWarning)

# Load your dataset
data = df_bp_datasets[3]
X = data.drop(columns=['ckd_stage'])  # Features
y = data['ckd_stage']  # Target variable

# Check feature category counts with a BARCHART
px.bar(x=y.value_counts().index, y=y.value_counts().values, title='CKD Stage Distribution').show()

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply SMOTE to the training data with adjusted k_neighbors
smote = SMOTE(random_state=42, k_neighbors=1)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Check the distribution of the target variable in the resampled training set
px.bar(x=y_train_resampled.value_counts().index, y=y_train_resampled.value_counts().values, title='Resampled CKD Stage Distribution').show()

# Define the base estimator
base_estimator = DecisionTreeClassifier()

# Define the parameter grid for Bagging
param_grid_bagging = {
    'n_estimators': [10, 50, 100],
    'max_samples': [0.5, 1.0],
    'max_features': [0.5, 1.0]
}

# Define the parameter grid for Boosting
param_grid_boosting = {
    'n_estimators': [10, 50, 100],
    'learning_rate': [0.01, 0.1, 1.0]
}

# Perform Grid Search for Bagging with resampled data
grid_search_bagging = GridSearchCV(BaggingClassifier(base_estimator), param_grid_bagging, cv=5, scoring='accuracy')
grid_search_bagging.fit(X_train_resampled, y_train_resampled)
best_bagging = grid_search_bagging.best_estimator_

# Perform Grid Search for Boosting with resampled data
grid_search_boosting = GridSearchCV(AdaBoostClassifier(base_estimator), param_grid_boosting, cv=5, scoring='accuracy')
grid_search_boosting.fit(X_train_resampled, y_train_resampled)
best_boosting = grid_search_boosting.best_estimator_

# Evaluate the best Bagging model
y_pred_bagging = best_bagging.predict(X_test)
print("Best Bagging Classifier with SMOTE")
print(f'Accuracy: {accuracy_score(y_test, y_pred_bagging)}')
print(classification_report(y_test, y_pred_bagging))

# Evaluate the best Boosting model
y_pred_boosting = best_boosting.predict(X_test)
print("Best Boosting Classifier with SMOTE")
print(f'Accuracy: {accuracy_score(y_test, y_pred_boosting)}')
print(classification_report(y_test, y_pred_boosting))

Best Bagging Classifier with SMOTE
Accuracy: 0.96875
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       0.92      1.00      0.96        11
           2       0.00      0.00      0.00         1
           3       1.00      1.00      1.00         1
           4       1.00      1.00      1.00         2
           5       1.00      1.00      1.00         5

    accuracy                           0.97        32
   macro avg       0.82      0.83      0.83        32
weighted avg       0.94      0.97      0.95        32

Best Boosting Classifier with SMOTE
Accuracy: 0.96875
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       0.92      1.00      0.96        11
           2       0.00      0.00      0.00         1
           3       1.00      1.00      1.00         1
           4       1.00      1.00      1.00         2
           5       1.00  

In [47]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.decomposition import PCA
import plotly.express as px

# Assuming 'gfr' is one of the columns in df_bp_datasets[3]
data = df_bp_datasets[3].drop(columns=['gfr'])
scaler = StandardScaler()
standardized_data = scaler.fit_transform(data)

# Normalize the data if required
normalized_data = Normalizer().fit_transform(standardized_data)

# Apply PCA
pca = PCA(2)
data_transformed = pca.fit_transform(normalized_data)

# Create a DataFrame for the loadings
loadings = pd.DataFrame(pca.components_.T, columns=['PC1', 'PC2'], index=data.columns)

# Display the loadings
print(loadings)

# Create a spider plot for the loadings
loadings_reset = loadings.reset_index().melt(id_vars='index')
fig = px.line_polar(loadings_reset, r='value', theta='index', color='variable', line_close=True,
                    title='PCA Loadings Spider Plot')
fig.show()

                     PC1       PC2
age            -0.110878  0.856505
bp             -0.083850 -0.152052
sg              0.235593 -0.054948
al             -0.272684 -0.040623
su             -0.139489 -0.016149
rbc             0.173913  0.018078
pc              0.228917  0.058602
pcc            -0.147645 -0.039474
ba             -0.133494 -0.037752
bgr            -0.169499  0.059485
bu             -0.195367 -0.055911
sc             -0.203609 -0.051915
sod             0.176624  0.203417
pot            -0.030336 -0.023673
hemo            0.252109  0.029274
pcv             0.236880  0.057155
wc              0.117581 -0.409631
rc              0.227968 -0.035917
htn            -0.255302 -0.024927
dm             -0.225768 -0.015469
cad            -0.126829 -0.012417
appet          -0.178129 -0.033879
pe             -0.184718 -0.028897
ane            -0.153815 -0.055902
classification  0.297895  0.030853
ckd_stage      -0.269564 -0.004415


In [48]:
# plot loadings as a barplot (horizontal, but absolute, but sorted)
loadings_reset['abs_value'] = loadings_reset['value'].abs()
loadings_reset = loadings_reset.sort_values('abs_value', ascending=False)
fig = px.bar(loadings_reset, x='abs_value', y='index', color='variable', orientation='h',
             title='PCA Loadings Barplot')
fig.show()

In [49]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.decomposition import PCA
import plotly.express as px

# Assuming 'gfr' is one of the columns in df_bp_datasets[3]
data = df_bp_datasets[3].drop(columns=['gfr'])
scaler = StandardScaler()
standardized_data = scaler.fit_transform(data)

# Normalize the data if required
normalized_data = Normalizer().fit_transform(standardized_data)

# Apply PCA
pca = PCA(2)
data_transformed = pca.fit_transform(normalized_data)

# Create a DataFrame for the loadings
loadings = pd.DataFrame(pca.components_.T, columns=['PC1', 'PC2'], index=data.columns)

# Display the loadings
#print(loadings)

# Create a spider plot for the loadings
loadings_reset = loadings.reset_index().melt(id_vars='index')
print(loadings_reset)  # Display the reshaped DataFrame
fig = px.line_polar(loadings_reset, r='value', theta='index', color='variable', line_close=True,
                    title='PCA Loadings Spider Plot')
fig.show()

             index variable     value
0              age      PC1 -0.110878
1               bp      PC1 -0.083850
2               sg      PC1  0.235593
3               al      PC1 -0.272684
4               su      PC1 -0.139489
5              rbc      PC1  0.173913
6               pc      PC1  0.228917
7              pcc      PC1 -0.147645
8               ba      PC1 -0.133494
9              bgr      PC1 -0.169499
10              bu      PC1 -0.195367
11              sc      PC1 -0.203609
12             sod      PC1  0.176624
13             pot      PC1 -0.030336
14            hemo      PC1  0.252109
15             pcv      PC1  0.236880
16              wc      PC1  0.117581
17              rc      PC1  0.227968
18             htn      PC1 -0.255302
19              dm      PC1 -0.225768
20             cad      PC1 -0.126829
21           appet      PC1 -0.178129
22              pe      PC1 -0.184718
23             ane      PC1 -0.153815
24  classification      PC1  0.297895
25       ckd

In [50]:
loadings_reset

Unnamed: 0,index,variable,value
0,age,PC1,-0.110878
1,bp,PC1,-0.08385
2,sg,PC1,0.235593
3,al,PC1,-0.272684
4,su,PC1,-0.139489
5,rbc,PC1,0.173913
6,pc,PC1,0.228917
7,pcc,PC1,-0.147645
8,ba,PC1,-0.133494
9,bgr,PC1,-0.169499


In [51]:
px.imshow(df_bp_datasets[1].corr())

# What does SBP, DBP, BS mean?
# SBP = Systolic Blood Pressure
# DBP = Diastolic Blood Pressure
# BS = Blood Sugar

print(df_bp_datasets[1].columns)

Index(['age', 'SBP', 'DBP', 'BS', 'BodyTemp', 'HeartRate', 'RiskLevel',
       'MinExercise'],
      dtype='object')


In [52]:
#to export:
#- standardized_data (standardized transformer)
#- normalized_data (normalized transformer)
#- data_transformed (PCA'd transformer)
#- grid_search_bagging model
#- grid_search_boosting model

import joblib

# Export the transformers and models
joblib.dump(standardized_data, '../exports/standardized_data.pkl')
joblib.dump(normalized_data, '../exports/normalized_data.pkl')
joblib.dump(data_transformed, '../exports/data_transformed.pkl')
joblib.dump(grid_search_bagging, '../exports/grid_search_bagging.pkl')
joblib.dump(grid_search_boosting, '../exports/grid_search_boosting.pkl')

['../exports/grid_search_boosting.pkl']