In [1]:
import numpy as np
import csv
from sklearn import neighbors, datasets, model_selection, metrics, __version__
from math import log2
import os

### Creating the the different functions.

In [2]:
#Simple tree structure with maximum 2 children
class Tree:
    #Containing a value, either a lable or tuple with information about split. Then each child is a new tree object.
    def __init__(self, value=None):
        self.value = value
        self.left = None
        self.right = None
        
    

In [3]:
def impurity(arr, impurity_measure = 'entropy'):
    #Starting sum
    entropy = 0
    gini = 0
    #Finding the counts for each possible label. Eihter one or two labels. 
    labels, counts = np.unique(arr, return_counts=True)
    #Calculating the entropy and gini for the array
    for count in counts:
        prob = count/len(arr)
        entropy -= prob*log2(prob)
        gini += prob*(1-prob)

    if impurity_measure == 'entropy':
        return entropy
    elif impurity_measure == 'gini':
        return gini

In [152]:
def split_label(feature, labels):
    #Finding feature mean
    feature_mean = feature.mean()
    #Splitting the feature on the mean
    left_labels = labels[np.nonzero(feature < feature_mean)]
    right_labels = labels[np.nonzero(feature >= feature_mean)]
    #np.nonzero function finds the index in an array with a condition. Here the conditions are based on the mean.
    return left_labels, right_labels, feature_mean
    

In [153]:
def cond_impurity(feature, labels, impurity_measure='entropy'):
    #Getting the labels for the split
    left_labels, right_labels, f_mean = split_label(feature, labels)
    
    #Finding the count of labels on each side
    left_len = len(left_labels)/len(feature)
    right_len = len(right_labels)/len(feature)

    #Calculating the impurity measure for the feature and each split, and multiplying with the split ratio
    impurity_left = impurity(left_labels, impurity_measure)*left_len
    impurity_right = impurity(right_labels, impurity_measure)*right_len

    #returning the conditional entropy
    return impurity_left + impurity_right

In [154]:
def investigation_score(feature, labels, impurity_measure='entropy'):

    #Getting the entropies
    entropy_feature = impurity(labels, impurity_measure)
    cond_entropy_feature = cond_impurity(feature, labels, impurity_measure)

    #Returning the investigation score
    return entropy_feature - cond_entropy_feature
    

In [155]:
def investigation_score_old(feature, labels, impurity_measure = 'entropy'):
    #Finding the mean of the feature
    feature_mean = feature.mean()

    #Getting the labels for the split
    left_labels = labels[np.nonzero(feature < feature_mean)]
    right_labels = labels[np.nonzero(feature >= feature_mean)]
    #np.nonzero function finds the index in an array with a condition. Here the conditions are based on the mean.

    #Finding the count of labels on each side
    left_len = len(left_labels)/len(feature)
    right_len = len(right_labels)/len(feature)

    #Calculating the enropy for the feature and each split
    entropy_feature = entropy(labels)
    entropy_left = entropy(left_labels)*left_len
    entropy_right = entropy(right_labels)*right_len

    #Calculating the investigation score
    #print(entropy_feature, entropy_left, entropy_right, entropy_feature - entropy_left - entropy_right, labels, left_labels, right_labels)
    #print(len(labels),labels, left_labels, right_labels)
    return entropy_feature - entropy_left - entropy_right
    

In [156]:
def find_best_feature(data, labels, impurity_measure = 'entropy'):
    #Making a dictionary to store the feature key and the inv_score as value. 
    best_i = {}
    #Iterating through each feature
    for feature in range(data.shape[1]):
        best_i[feature] = investigation_score(data[:, feature], labels, impurity_measure)

    #Finding best feature with max function, where the key is the values. 
    #print(best_i.values())
    #if max(best_i.values()) > 0:
    best_feature_index = max(best_i, key= best_i.get)
    return best_feature_index

In [157]:
def split_data(data, labels, impurity_measure = 'entropy', feature_index='best'):
    
    #If we want to find best_feature or use a predetermined
    if feature_index == 'best':
        feature_index = find_best_feature(data, labels, impurity_measure)

    #Get the feature
    best_feature = data[:, feature_index]

    #Splitting the labels | Could have returned the labels in investigation_score to save compute, but this should be marginal.
    left_labels, right_labels, best_feature_mean = split_label(best_feature, labels)
    
    #Splitting the data based on the indexes of what points is lower or higher than mean, given specific feature
    left_data = data[best_feature < best_feature_mean.mean()]
    right_data = data[best_feature >= best_feature_mean.mean()]
    
    #Returning a 3-tuple consisting of the left side, the right side and information about the split (what feature, the mean)
    #The latter will be stored in each branch when building the tree. 
    return (left_data, left_labels), (right_data, right_labels), (feature_index, best_feature_mean)
    

In [158]:
def identical_features(data, labels):
    count = 0
    for feature in range(data.shape[1]):
        if len(np.unique(data[:,feature])) == 1:
            count += 1
    
    if count == data.shape[1]:
        uniques, counts = np.unique(labels, return_counts=True)
        return uniques[np.argmax(counts)]

In [159]:
def id3(data, labels, tree, impurity_measure = 'entropy'):
    #Finding uniques and checking for identical features
    identical = identical_features(data, labels)
    #If all data points have the same label:
    if impurity(labels) == 0:
        tree.value = labels[0]
        return labels[0]

    #Else if all data points have identical feature values
    
    elif identical != None:
        tree.value = identical
        return


    #Else
    else:
        #Extracting the information from the split
        left, right, root = split_data(data, labels, impurity_measure)

        if root != 0:
            #Setting this root to indicate the split
            tree.value = root
    
            #Making left branch
            new_left = Tree()
            tree.left = new_left
            id3(left[0], left[1], new_left, impurity_measure)
    
            #Making right branch
            new_right = Tree()
            tree.right = new_right 
            id3(right[0], right[1], new_right, impurity_measure)
        
        
        
    

In [160]:
def search_tree(tree,count=0):
    if tree != None:
        counts = count
        counts += 1
        print(tree.value)
    
        search_tree(tree.left,counts)
        search_tree(tree.right,counts)

In [161]:
def total_nodes(tree):
    if tree == None:
        return 0

    l = total_nodes(tree.left)
    r = total_nodes(tree.right)

    return 1 + l + r
    

In [162]:
def predict(data_point, tree):
    if type(tree.value) == tuple:
        feature, split_point = tree.value
        if data_point[feature] < split_point:
            return predict(data_point, tree.left)
        else:
            return predict(data_point, tree.right)
    else:
        return tree.value
    #print(tree.value)
    
    

In [163]:
with open('wine_dataset.csv', 'r') as r:
    reader = csv.reader(r)
    data = list(reader)

feature_names = data[0][:-1]
data_ar = np.array(data[1:], dtype=float)
targets = data_ar[:, -1]
data = data_ar[:, :-1]

In [164]:
seed = 521
X_train, X_val_test, y_train, y_val_test = model_selection.train_test_split(data, targets, test_size=0.3, shuffle=True, random_state=seed)

In [165]:
def accuracy(data, labels, tree):
    labels_len = len(labels)
    if labels_len == 0:
        return 0
    count = 0
    for counts, data_point in enumerate(data):
        if predict(data_point, tree) == labels[counts]:
            count += 1
    return count/labels_len

In [166]:
def predict_flat_label(data, labels, prediction=0):
    labels_len = len(labels)
    if labels_len == 0:
        return 0
    count = 0
    for counts, data_points in enumerate(data):
        if prediction == labels[counts]:
            count += 1
    return count/labels_len

In [167]:
def prune(data, labels, tree):
    #Just want to do the pruning on a node that is splitting, not on a label node.
    if type(tree.value) == tuple:
        feature, mean = tree.value
        left, right, values = split_data(data, labels, feature_index = feature)
        
        #If there is a left 
        if type(tree.left) == Tree:
            prune(left[0], left[1], tree.left)
        if type(tree.right) == Tree:
            prune(right[0], right[1], tree.right)

        #Accuracy of either splitting or giving hard label
        for label in [0,1]:
            if predict_flat_label(data, labels, prediction=label) > accuracy(data, labels, tree):
                #print('true')
                #print(tree.value, left[1], right[1], predict_flat_label(data, labels, prediction=label), accuracy(data, labels, tree))
                tree.value = label
                tree.left, tree.right = None, None
                #print(tree.value)

    

In [168]:
#Using the id3 algorithm to return a decision tree.
def learn(X, y, impurity_measure='entropy', pruning="false", ratio=0.8):
    #Making the root for the tree
    tree = Tree()

    #Checking whether the pruning is true
    if pruning == 'True':
        #Now we need to split the data
        ratio = ratio
        X_train, X_prune = np.split(X, [int(ratio*len(X))])
        y_train, y_prune = np.split(y, [int(ratio*len(y))])

        #Making the tree with training data
        id3(X_train, y_train, tree, impurity_measure)
        total_nodes(tree)

        #Pruning the tree with the pruning data
        prune(X_prune, y_prune, tree)
        total_nodes(tree)

        #Returning the pruned tree
        return tree
    
    #Else if pruning is false | Just make the tree and return it.
    else:
        print('false')
        id3(X, y, tree, impurity_measure)
        return tree

In [169]:
tree = Tree()
tree= learn(X_train, y_train, impurity_measure='gini')

false


In [170]:
tree = Tree()
tree= learn(X_train, y_train, impurity_measure='entropy', pruning="True", ratio=0.9)

  feature_mean = feature.mean()


In [171]:
tree.left.left.value
total_nodes(tree)


519

In [172]:
accuracy(X_train, y_train, tree)

0.9352100089365505

In [173]:
accuracy(X_val_test, y_val_test, tree)


0.865625

In [174]:
tree_2 = Tree()
tree_2 = learn(X_train, y_train, impurity_measure='entropy')

total_nodes(tree_2), accuracy(X_val_test, y_val_test, tree_2)

false


(759, 0.8645833333333334)

In [175]:
prune(X_val_test, y_val_test, tree_2)

  feature_mean = feature.mean()


In [113]:
total_nodes(tree_2)

489

In [62]:
accuracy(X_val_test, y_val_test, tree), accuracy(X_val_test, y_val_test, tree_2)

(0.8677083333333333, 0.9072916666666667)

In [63]:
predict_flat_label(X_val_test, y_val_test, 1)

0.496875