## Kidney Chronic Dataset

In [2]:
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
import networkx as nx
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import _tree
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import AgglomerativeClustering
import plotly.express as px
from pulp import LpProblem, LpVariable, LpMaximize, lpSum, LpStatus, value
from sklearn.decomposition import PCA

import seaborn as sns
from sklearn.ensemble import RandomForestClassifier

In [3]:
df = pd.read_csv("./data/Chronic_Kidney_Dsease_data.csv")

In [4]:
df.head()

Unnamed: 0,PatientID,Age,Gender,Ethnicity,SocioeconomicStatus,EducationLevel,BMI,Smoking,AlcoholConsumption,PhysicalActivity,...,Itching,QualityOfLifeScore,HeavyMetalsExposure,OccupationalExposureChemicals,WaterQuality,MedicalCheckupsFrequency,MedicationAdherence,HealthLiteracy,Diagnosis,DoctorInCharge
0,1,71,0,0,0,2,31.069414,1,5.128112,1.67622,...,7.556302,76.0768,0,0,1,1.018824,4.966808,9.871449,1,Confidential
1,2,34,0,0,1,3,29.692119,1,18.609552,8.377574,...,6.836766,40.128498,0,0,0,3.923538,8.189275,7.161765,1,Confidential
2,3,80,1,1,0,1,37.394822,1,11.882429,9.607401,...,2.144722,92.872842,0,1,1,1.429906,7.624028,7.354632,1,Confidential
3,4,40,0,2,0,1,31.32968,0,16.020165,0.408871,...,7.077188,90.080321,0,0,0,3.226416,3.282688,6.629587,1,Confidential
4,5,43,0,1,1,2,23.726311,0,7.944146,0.780319,...,3.553118,5.258372,0,0,1,0.285466,3.849498,1.437385,1,Confidential


In [5]:
df.columns

Index(['PatientID', 'Age', 'Gender', 'Ethnicity', 'SocioeconomicStatus',
       'EducationLevel', 'BMI', 'Smoking', 'AlcoholConsumption',
       'PhysicalActivity', 'DietQuality', 'SleepQuality',
       'FamilyHistoryKidneyDisease', 'FamilyHistoryHypertension',
       'FamilyHistoryDiabetes', 'PreviousAcuteKidneyInjury',
       'UrinaryTractInfections', 'SystolicBP', 'DiastolicBP',
       'FastingBloodSugar', 'HbA1c', 'SerumCreatinine', 'BUNLevels', 'GFR',
       'ProteinInUrine', 'ACR', 'SerumElectrolytesSodium',
       'SerumElectrolytesPotassium', 'SerumElectrolytesCalcium',
       'SerumElectrolytesPhosphorus', 'HemoglobinLevels', 'CholesterolTotal',
       'CholesterolLDL', 'CholesterolHDL', 'CholesterolTriglycerides',
       'ACEInhibitors', 'Diuretics', 'NSAIDsUse', 'Statins',
       'AntidiabeticMedications', 'Edema', 'FatigueLevels', 'NauseaVomiting',
       'MuscleCramps', 'Itching', 'QualityOfLifeScore', 'HeavyMetalsExposure',
       'OccupationalExposureChemicals', 'WaterQu

In [6]:
df.drop(['PatientID','Ethnicity', 'SocioeconomicStatus','EducationLevel','DoctorInCharge'], axis = 1, inplace = True)

In [7]:
df.dropna(inplace = True)

In [8]:
X = df[df.columns[:48]]
y = df["Diagnosis"]

feature_names = X.columns
target_names = ["Diagnosis"]

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2, random_state=42)

In [10]:
rf = RandomForestClassifier(n_estimators=30, random_state=42)
rf.fit(X_train, y_train)


#### SHAP Analysis

In [None]:
%%time
import shap
explainer = shap.TreeExplainer(rf)
shap_values = explainer.shap_interaction_values(X_train)



CPU times: user 48.8 s, sys: 255 ms, total: 49.1 s
Wall time: 49.1 s


## Heart Dataset


In [2]:
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
import networkx as nx
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import _tree
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import AgglomerativeClustering
import plotly.express as px
from pulp import LpProblem, LpVariable, LpMaximize, lpSum, LpStatus, value
from sklearn.decomposition import PCA

import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import plot_tree



In [None]:
df = pd.read_csv("../data/heart.csv")
df = df.rename(
    columns = {'cp':'chest_pain_type', 
               'trestbps':'resting_blood_pressure', 
               'chol': 'cholesterol',
               'fbs': 'fasting_blood_sugar',
               'restecg' : 'resting_electrocardiogram', 
               'thalach': 'max_heart_rate_achieved', 
               'exang': 'exercise_induced_angina',
               'oldpeak': 'st_depression', 
               'slope': 'st_slope', 
               'ca':'num_major_vessels', 
               'thal': 'thalassemia'}, 
    errors="raise")
X = df.drop("target", axis = 1)
y = df[["target"]]

feature_names = X.columns
target_names = y.columns
X = pd.DataFrame(X, columns = feature_names)
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2, random_state=42)
# Train RandomForestClassifier
rf = RandomForestClassifier(n_estimators=10, random_state=42)
rf.fit(X_train, y_train)



In [7]:
%%time
import shap
explainer = shap.TreeExplainer(rf)
shap_values = explainer.shap_interaction_values(X)



CPU times: user 284 ms, sys: 5.15 ms, total: 289 ms
Wall time: 286 ms


## Diabetes Dataset

In [3]:
df = pd.read_csv("../data/diabetes.csv")
X = df.drop("Outcome", axis = 1)
y = df[["Outcome"]]

feature_names = X.columns
target_names = y.columns

X = pd.DataFrame(X, columns = feature_names)
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2, random_state=42)
# Train RandomForestClassifier
rf = RandomForestClassifier(n_estimators=10, random_state=42)
rf.fit(X_train, y_train)


  return fit_method(estimator, *args, **kwargs)


In [4]:
%%time
import shap
explainer = shap.TreeExplainer(rf)
shap_values = explainer.shap_interaction_values(X)

CPU times: user 1.02 s, sys: 6.98 ms, total: 1.03 s
Wall time: 1.03 s
