Using the Pima Diabetes DataSet to train and test a decision tree classifier as well as a random forest classifier to predict the diabetes label (positive or negative).

In [81]:
#import dependecies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [82]:
#read in data for analysis
data = pd.read_csv('diabetes.csv')
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [83]:
#check for missing data
data.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [84]:
#overview of data
data.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [85]:
#define the predictor variables and the target variables
X = data.drop(['Outcome','Pregnancies'], axis=1)
y = data['Outcome']

In [86]:
#split data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y)

# Decision Tree Classifier

In [87]:
#train and fit the model
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier(max_depth=5, min_samples_leaf=5)

model.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=5, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [88]:
#score the model
from sklearn.metrics import mean_squared_error
predictions = model.predict(X_test)

score = model.score(X_test, y_test)
mse = mean_squared_error(predictions, y_test)

print(f'Model score: {score}, Model error: {mse}')

Model score: 0.78125, Model error: 0.21875


# Random Forest Classifier

In [89]:
#train and fit random forest model for comparison
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(max_depth=5, min_samples_leaf=5)

rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=5, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=5, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [90]:
#score the model
rf_predictions = rf.predict(X_test)

rf_score = rf.score(X_test, y_test)
rf_mse = mean_squared_error(rf_predictions, y_test)

print(f'Model score: {rf_score}, Model error: {rf_mse}')

Model score: 0.7708333333333334, Model error: 0.22916666666666666


In [93]:
#what are the most important features in our dataset?
feature_names = X.columns
pd.DataFrame({'Feature Name':feature_names, 'Importance':rf.feature_importances_}).sort_values(by='Importance', ascending=False).reset_index(drop=True)

Unnamed: 0,Feature Name,Importance
0,Glucose,0.395328
1,BMI,0.17642
2,Age,0.17581
3,Insulin,0.092453
4,DiabetesPedigreeFunction,0.072515
5,BloodPressure,0.052792
6,SkinThickness,0.034681
