In [None]:
"""
****************************************************************************************

Project: Diabetes Prediction Using Decision Tree
Author: Clarizza Morales
Date: Fall 2022
Description: 
This notebook analyzes health indicators from the BRFSS 2015 dataset to predict diabetes status 
using a decision tree classifier. It includes data cleaning, entropy calculation, model training, 
and visualization of the decision tree.

****************************************************************************************
"""

In [10]:
#Import needed libraries 
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import tree
import graphviz

# Import Dataset as a Excel file and inspect
df = pd.read_excel("diabetes_binary_health_indicators_BRFSS2015.xlsx")
print(df.columns)
# Delete duplicated data
print(df.shape)
df.drop_duplicates(inplace=True)
print(df.shape)

Index(['Diabetes_binary', 'HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker',
       'Stroke', 'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies',
       'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'GenHlth',
       'MentHlth', 'PhysHlth', 'DiffWalk', 'Sex', 'Age', 'Education',
       'Income'],
      dtype='object')
(253680, 22)
(229474, 22)


In [11]:
# Setup features and class
df.loc[df["Diabetes_binary"] == 0, "Diabetes_binary"] = 'No Diabetes'
df.loc[df["Diabetes_binary"] == 1, "Diabetes_binary"] = 'Diabetes'
X = df.drop(['Diabetes_binary','HighChol','CholCheck','BMI','Smoker','Stroke','HeartDiseaseorAttack','Fruits','AnyHealthcare','NoDocbcCost','GenHlth','MentHlth','PhysHlth','DiffWalk','Sex','Age','Education','Income'], axis=1, inplace=False)
print(X.columns)
y = df['Diabetes_binary']
print(y.unique())

Index(['HighBP', 'PhysActivity', 'Veggies', 'HvyAlcoholConsump'], dtype='object')
['No Diabetes' 'Diabetes']


In [12]:
# Compute the maximum entropy value
k = len(y.unique())
maxE = np.log2(k) 
p_data = y.value_counts(normalize=True)
entropy = scipy.stats.entropy(p_data)
normalizedE = entropy/maxE
print("Entropy value:", entropy)
print("Normalized Value:", normalizedE)
# Train model
avg_score = 0.0
ntimes = 30
for _ in range(ntimes):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    model = DecisionTreeClassifier(criterion='entropy')
    model.fit(X_train,y_train)
    predictions = model.predict(X_test)
    avg_score += accuracy_score(y_test, predictions)
avg_score /= ntimes
print('Normalized entropy value: %.3f'% normalizedE)
print('Average accuracy score: %.3f' % avg_score)

Entropy value: 0.42778439649376787
Normalized Value: 0.42778439649376787
Normalized entropy value: 0.428
Average accuracy score: 0.847


In [13]:
# Clear data structures for space
del df, X, X_train, X_test, y_train, y_test
# Generate pdf of last calculated decision tree and save file
dot_data = tree.export_graphviz(model, out_file=None,
                    feature_names=['HighBP', 'PhysActivity', 'Veggies', 'HvyAlcoholConsump'],
                    class_names=sorted(y.unique()),
                    label='all',
                    rounded=True,
                    filled=True)
graph = graphviz.Source(dot_data)
graph.render("decision_tree")

'decision_tree.pdf'