In [1]:
# Load libraries
import pandas as pd
from sklearn.tree import DecisionTreeClassifier 
from sklearn.model_selection import train_test_split 
from sklearn import metrics

In [2]:
# Short column labels for the dataset; 'label' is the column later used as the target.
# NOTE(review): assumes this order matches the column order of the file — confirm.
col_names = [
    'pregnant', 'glucose', 'bp', 'skin', 'insulin',
    'bmi', 'pedigree', 'age', 'label',
]

# Remote location of the raw data file.
location = "https://github.com/gridflowai/gridflowAI-datasets-icons/raw/master/AI-DATASETS/01-MISC/pima.txt"

# Read the file as CSV, then replace the column names it was read with by the short labels.
pima = pd.read_csv(location)
pima.columns = col_names

In [3]:
# Preview the first five rows to check that the new column labels line up with the data.
pima.head(5)

Unnamed: 0,pregnant,glucose,bp,skin,insulin,bmi,pedigree,age,label
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
# Show the dtype pandas inferred for each column.
pima.dtypes

pregnant      int64
glucose       int64
bp            int64
skin          int64
insulin       int64
bmi         float64
pedigree    float64
age           int64
label         int64
dtype: object

In [5]:
# Separate the predictor columns from the target column.
# NOTE(review): 'skin' is the only non-target column left out of the predictors —
# confirm that the omission is intentional.
feature_cols = [
    'pregnant', 'insulin', 'bmi', 'age',
    'glucose', 'bp', 'pedigree',
]

y = pima['label']        # target variable
X = pima[feature_cols]   # feature matrix, columns in the order listed in feature_cols

In [6]:
# Hold out 30% of the rows for testing and train on the remaining 70%;
# the fixed random_state keeps the split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

In [7]:
# Create the decision tree classifier, using entropy (information gain) to pick splits.
# random_state is pinned because scikit-learn randomly permutes the candidate
# features at every split: without a seed, ties between equally good splits can be
# broken differently on each run, which changes the fitted tree, the accuracy and
# the feature importances. The value 1 mirrors the seed used for the train/test split.
clf = DecisionTreeClassifier(criterion='entropy', random_state=1)

# Train the classifier on the training portion (fit returns the estimator itself).
clf = clf.fit(X_train, y_train)

# Predict the labels for the held-out test rows.
y_pred = clf.predict(X_test)

In [8]:
# Model accuracy: the share of test rows whose predicted label equals the true label.
print(
    "Accuracy:",
    metrics.accuracy_score(y_true=y_test, y_pred=y_pred),
)

Accuracy: 0.7532467532467533


In [17]:
# Disabled on purpose: these imports are only needed by the commented-out
# tree-rendering cell and require the optional graphviz package.
# import graphviz 
# from sklearn.tree import export_graphviz

In [18]:
# Disabled on purpose: exports the fitted tree `clf` to DOT text and displays it
# with graphviz. To use it, uncomment this cell together with the graphviz /
# export_graphviz imports.
# dot_data = export_graphviz(clf, 
#                            out_file=None, 
#                            feature_names=X.columns,  
#                            class_names=['0', '1'],  
#                            filled=True, 
#                            rounded=True,  
#                            special_characters=True)  

# graph = graphviz.Source(dot_data)  

# NOTE(review): the render target below is an absolute, machine-specific Windows
# path; switch to a relative path if this line is ever re-enabled.
# #graph.render("C:\\Users\\Admin\\Desktop\\pima") 
# graph 

## Feature importance

- The higher the score, the more that feature contributed to the tree's splits; the scores sum to 1.

In [13]:
# Importance score of each feature, in the same order as the columns of X.
clf.feature_importances_

array([0.09266266, 0.03672581, 0.19085581, 0.13032268, 0.27314371,
       0.12743073, 0.1488586 ])

In [14]:
# Column names of X, for matching each importance score to its feature.
X.columns

Index(['pregnant', 'insulin', 'bmi', 'age', 'glucose', 'bp', 'pedigree'], dtype='object')

In [15]:
# Pair every feature name with its importance score in a single table.
df_feature_importance = pd.DataFrame(
    {
        'col_name': X.columns,
        'feature_importance': clf.feature_importances_,
    }
)

In [16]:
# Rank the features from most to least important.
df_feature_importance.sort_values(by='feature_importance', ascending=False)

Unnamed: 0,col_name,feature_importance
4,glucose,0.273144
2,bmi,0.190856
6,pedigree,0.148859
3,age,0.130323
5,bp,0.127431
0,pregnant,0.092663
1,insulin,0.036726
