In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [None]:
!wget https://raw.githubusercontent.com/plotly/datasets/master/diabetes.csv

In [2]:
# Read the downloaded CSV into a DataFrame and preview its first rows.
csv_path = "diabetes.csv"
df = pd.read_csv(csv_path)
df.head()

In [3]:
# Separate the label column from the features. The guard keeps the cell
# re-runnable: once 'Outcome' has been dropped from df, an unguarded second
# run would raise KeyError on df['Outcome'].
if 'Outcome' in df.columns:
    target = df['Outcome']
    df = df.drop(columns=['Outcome'])

# train-test split: hold out 20% of the rows, with a fixed seed so the split
# is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    df, target, test_size=0.2, random_state=42
)

# fit the model: a random forest of depth-3 trees using the entropy criterion
rfc = RandomForestClassifier(random_state=1234, max_depth=3, criterion='entropy')
rfc.fit(X_train, y_train)

# evaluate the results: score on the held-out rows, shown as the cell output
rfc.score(X_test, y_test)

In [4]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid, searched exhaustively with 5-fold cross-validation.
# 'sqrt' replaces 'auto' for max_features: for classifiers 'auto' meant
# 'sqrt', was deprecated in scikit-learn 1.1 and is rejected by 1.3+, which
# made every fit using it fail. 'sqrt' is accepted by old and new releases.
parameters = {
    "n_estimators": [10, 20, 50, 100, 200],
    "max_depth": [2, 3],
    "min_samples_split": [2, 3, 4],
    "max_features": ('sqrt', 'log2'),
    "criterion": ('gini', 'entropy'),
}

# rfc is used as the template estimator for every candidate in the grid.
clf = GridSearchCV(rfc, parameters, cv=5)
clf.fit(X_train, y_train)
print(clf.best_params_)

# Keep the winning model and report its score on the held-out test rows.
estimator = clf.best_estimator_
print(estimator.score(X_test, y_test))

In [5]:
import seaborn as sns

# Global plot styling: white background, colour-blind-friendly palette,
# enlarged fonts and a 12x9 default figure size.
figure_defaults = {"figure.figsize": (12, 9)}
sns.set(style="white", palette="colorblind", font_scale=1.2, rc=figure_defaults)

# Shared notebook constants: random seed and worker count.
RANDOM_STATE, N_JOBS = 420, 8

In [6]:
# Feature names used to label the importance columns.
# NOTE(review): assumed to match the column order of X_train - confirm.
features = [
    "Pregnancies",
    "Glucose",
    "BloodPressure",
    "SkinThickness",
    "Insulin",
    "BMI",
    "DiabetesPedigreeFunction",
    "Age",
]

# One row per fitted tree in rfc, one column per feature.
per_tree_importances = [member.feature_importances_ for member in rfc]
all_feat_imp_df = pd.DataFrame(data=per_tree_importances, columns=features)

# Box plot showing how each feature's importance varies across the trees;
# the trailing semicolon suppresses the repr of the returned value.
importance_ax = sns.boxplot(data=all_feat_imp_df)
importance_ax.set(title='Feature Importance Distributions', ylabel='Importance');

In [7]:
!pip install pydotplus
!sudo add-apt-repository universe
!sudo apt update
!pip install sklearn_gbmi
!pip install pdpbox

#You must go into terminal to install graphviz

# !sudo apt-get install graphviz

In [8]:
from IPython.display import Image
from sklearn.tree import export_graphviz
import graphviz
import pydotplus
from io import StringIO

# Keep only the forest members whose fitted tree reached depth 3, then take
# the first of them for plotting.
depths3 = list(filter(lambda member: member.tree_.max_depth == 3, rfc.estimators_))
tree = depths3[0]

# Export the tree as DOT text into an in-memory buffer ...
dot_data = StringIO()
export_graphviz(
    tree,
    out_file=dot_data,
    feature_names=features,
    filled=True,
    rounded=True,
    special_characters=True,
)

# ... then parse the DOT text and display the rendered PNG as the cell output.
dot_source = dot_data.getvalue()
graph = pydotplus.graph_from_dot_data(dot_source)
Image(graph.create_png())

# Local Interpretable Model-agnostic Explanations (LIME)

In [13]:
import lime
import lime.lime_tabular

classes = ['non-diabetic', 'diabetic']

# Build the explainer on the same values the model was trained on. The earlier
# .astype(int) cast truncated any fractional feature (presumably BMI and
# DiabetesPedigreeFunction - confirm against the data), so LIME sampled around
# values the model never saw.
explainer = lime.lime_tabular.LimeTabularExplainer(
    X_train.values,
    mode='classification',
    training_labels=y_train,
    feature_names=features,
    class_names=classes,
)

# Let's take a look at the patient in the 100th row of the training set.
# Positional lookup (.iloc) is used because X_train keeps the original index
# labels after the shuffled split, so the label 100 is not guaranteed to be
# present in X_train and .loc[100] could raise KeyError.
i = 100
exp = explainer.explain_instance(
    X_train.iloc[i][features].values, estimator.predict_proba, num_features=5
)
# visualize the explanation
exp.show_in_notebook(show_table=True)

# SHAP

In [14]:
import shap

# create our SHAP explainer for the tuned forest
shap_explainer = shap.TreeExplainer(estimator)

# calculate the shapley values for a single training row (position 7)
row = X_train.iloc[7]
shap_values = shap_explainer.shap_values(row)

# Older SHAP releases return a list with one array per class; newer releases
# (0.45+ per the SHAP release notes) return a single array whose last axis is
# the class. Pick the values for class index 1 in either layout, instead of
# assuming the list form.
if isinstance(shap_values, list):
    positive_shap = shap_values[1]
else:
    positive_shap = shap_values[..., 1]

shap.initjs()
shap.force_plot(shap_explainer.expected_value[1], positive_shap, row)

In [15]:
# SHAP values for every row of the training set.
shap_values = shap_explainer.shap_values(X_train)

# Older SHAP releases return a list with one array per class; newer releases
# (0.45+ per the SHAP release notes) return a single array whose last axis is
# the class. Pick the values for class index 1 in either layout.
if isinstance(shap_values, list):
    positive_shap = shap_values[1]
else:
    positive_shap = shap_values[..., 1]

# plot_size=None leaves the current figure size untouched. It replaces
# auto_size_plot=False, which SHAP deprecated in favour of plot_size=None and
# no longer honours.
shap.summary_plot(positive_shap, X_train, plot_size=None)