In [1]:
import numpy as np
import pandas as pd
from collections import Counter as ctr
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

## Get Data

In [2]:
# Read the CSV, then keep only the rows in which none of the listed columns
# equals 0 (presumably a 0 there marks a missing measurement -- TODO confirm).
df = pd.read_csv("diabetes.csv").loc[
    lambda frame: frame[['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
                         'BMI', 'DiabetesPedigreeFunction', 'Age']].ne(0).all(axis=1)
]
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
6,3,78,50,32,88,31.0,0.248,26,1
8,2,197,70,45,543,30.5,0.158,53,1
13,1,189,60,23,846,30.1,0.398,59,1


## Node Class

In [3]:
class Node:
    '''One node of a decision tree; simply stores the values it is given.

    feature, threshold, l_child, r_child and info_gain describe a split,
    while value is the field reserved for leaf nodes. Every field defaults
    to None.
    '''

    def __init__(self, feature=None, threshold=None, l_child=None, r_child=None, info_gain=None, value=None):
        # split description
        self.feature, self.threshold = feature, threshold
        self.l_child, self.r_child = l_child, r_child
        self.info_gain = info_gain

        # leaf payload
        self.value = value

## Entropy & Information Gain Functions

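**Function to compute the entropy of a label array:** $H = -\sum_i p_i \log_2 p_i$, where $p_i$ is the fraction of samples belonging to class $i$.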

In [4]:
def entropy(c):
    '''Compute the Shannon entropy, in bits, of a collection of class labels.

    Parameters
    ----------
    c : array-like
        Class labels. Any dtype that ``np.unique`` can handle is accepted
        (ints, floats, strings), not only the non-negative integers that
        ``np.bincount`` requires.

    Returns
    -------
    float
        ``-sum(p * log2(p))`` over the classes present in ``c``, where ``p``
        is the fraction of samples in each class. ``0.0`` for empty input.
    '''
    labels = np.asarray(c)
    if labels.size == 0:
        return 0.0

    # np.unique reports only classes that actually occur, so every
    # proportion is strictly positive and log2 never sees a zero.
    _, counts = np.unique(labels, return_counts=True)
    percents = counts / labels.size

    return float(-np.sum(percents * np.log2(percents)))

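**Function to compute the information gain of a split:** the parent's entropy minus the size-weighted entropies of the left and right children.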

In [5]:
def info_gain(parent, l_child, r_child):
    '''Return the entropy of parent minus the weighted entropy of its children.

    Each child's entropy is weighted by len(child) / len(parent).
    '''
    n_parent = len(parent)

    # size-weighted entropy of the two children
    children_entropy = sum(
        (len(child) / n_parent) * entropy(child)
        for child in (l_child, r_child)
    )

    return entropy(parent) - children_entropy

## Decision Tree Class

In [93]:
class DTClassifier:
    '''Decision tree classifier that grows binary splits chosen by information gain.

    Depends on three names defined elsewhere in this notebook: the ``Node``
    class, the ``info_gain`` function and ``ctr`` (``collections.Counter``).
    '''

    def __init__(self, min_samp_split=5, max_depth=10):
        '''Store the stopping criteria; the tree itself is built by ``fit``.

        min_samp_split : a node holding fewer rows than this becomes a leaf.
        max_depth      : a node at this depth (the root is depth 0) becomes a
                         leaf, so no node lies deeper than ``max_depth``.
        '''

        # root node, set by fit()
        self.root = None
        # stopping criteria
        self.min_samp_split = min_samp_split
        self.max_depth = max_depth

    def best_split(self, features, label):
        '''Find the (feature, threshold) pair with the highest information gain.

        Every unique value of every column is tried as a threshold; rows with
        ``value <= threshold`` go left and all remaining rows go right.

        Returns a dict with keys ``'feature'`` (column index), ``'thresh'``,
        ``'left_mask'`` / ``'right_mask'`` (boolean row masks) and ``'gain'``.
        Returns an empty dict when no threshold leaves rows on both sides.
        '''

        features = np.asarray(features)
        label = np.asarray(label)

        best_split = {}
        best_ig = -1
        num_cols = features.shape[1]

        # for every feature in dataset
        for feat in range(num_cols):
            curr_feat = features[:, feat]

            # for every unique value in feature set
            for thresh in np.unique(curr_feat):
                # Masks keep the labels in their own array, so their dtype is
                # not upcast by being stacked next to float features.
                left_mask = curr_feat <= thresh
                right_mask = ~left_mask

                # only splits with data on both sides are candidates
                if not left_mask.any() or not right_mask.any():
                    continue

                gain = info_gain(label, label[left_mask], label[right_mask])
                # strict '>' keeps the first split found among equal gains
                if gain > best_ig:
                    best_split = {
                        'feature' : feat,
                        'thresh' : thresh,
                        'left_mask' : left_mask,
                        'right_mask' : right_mask,
                        'gain' : gain
                    }
                    best_ig = gain

        return best_split

    def build_tree(self, features, label, depth=0):
        '''Recursively build the subtree for the given rows and return its root Node.

        ``depth`` is the depth of the node being built (0 for the root).
        '''

        features = np.asarray(features)
        label = np.asarray(label)
        num_rows = features.shape[0]

        # only try to split when neither stopping criterion has been reached
        if num_rows >= self.min_samp_split and depth < self.max_depth:
            split = self.best_split(features, label)
            # an empty dict means no usable split; zero gain means no improvement
            if split and split['gain'] > 0:
                left_mask = split['left_mask']
                right_mask = split['right_mask']
                # build tree on left
                left = self.build_tree(
                    features = features[left_mask],
                    label = label[left_mask],
                    depth = depth + 1
                )
                # build tree on right
                right = self.build_tree(
                    features = features[right_mask],
                    label = label[right_mask],
                    depth = depth + 1
                )
                return Node(
                    feature = split['feature'],
                    threshold = split['thresh'],
                    l_child = left,
                    r_child = right,
                    info_gain = split['gain']
                )

        # leaf node: predict the most common label among the rows that reached it
        return Node(
            value = ctr(label.tolist()).most_common(1)[0][0]
        )

    def fit(self, features, label):
        '''Build the tree from a 2-D feature matrix and a matching label vector.

        Raises ValueError when ``features`` is not 2-D, when it is empty, or
        when the number of rows and the number of labels differ.
        '''

        features = np.asarray(features)
        label = np.asarray(label).ravel()

        if features.ndim != 2:
            raise ValueError(
                f"features must be 2-D (n_samples, n_features), got shape {features.shape}"
            )
        if features.shape[0] == 0:
            raise ValueError("cannot fit a tree on zero samples")
        if features.shape[0] != label.shape[0]:
            raise ValueError(
                f"features has {features.shape[0]} rows but label has {label.shape[0]} entries"
            )

        self.root = self.build_tree(features, label)

    def tree_traversal(self, observation, tree):
        '''Walk from ``tree`` down to a leaf for one observation and return the leaf value.'''

        node = tree
        # internal nodes carry value=None; leaves carry the predicted label
        while node.value is None:
            if observation[node.feature] <= node.threshold:
                node = node.l_child
            else:
                # same rule as in best_split: everything not '<=' goes right
                node = node.r_child
        return node.value

    def predict(self, feature):
        '''Classify each row of ``feature``; returns a list with one label per row.

        Raises RuntimeError when called before ``fit``.
        '''

        if self.root is None:
            raise RuntimeError("DTClassifier.predict called before fit")

        return [self.tree_traversal(obs, self.root) for obs in np.asarray(feature)]

## Evaluate Decision Tree Classifier

In [94]:
# Feature matrix of shape (n_samples, 8) and label vector of shape (n_samples,),
# both as plain NumPy arrays. Selecting the columns directly avoids the extra
# axis that wrapping the column list in a second pair of brackets produced.
feature_cols = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
                'BMI', 'DiabetesPedigreeFunction', 'Age']
X = df[feature_cols].to_numpy()
y = df['Outcome'].to_numpy()

In [95]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [96]:
# Train a tree with the default stopping criteria, then classify the held-out rows.
model = DTClassifier()
model.fit(features=X_train, label=y_train)
preds = model.predict(feature=X_test)

In [98]:
# Fraction of held-out rows whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=preds)

0.6582278481012658