### Eiber1849 obligatory assignment 1
# Make a decision tree classifier

In [1]:
#Important modules

#Modules for the implementation
import numpy as np
import time
from math import log2

#Modules for testing
import csv
from sklearn import neighbors, datasets, model_selection, metrics, __version__
from sklearn import tree as sk_tree
import pandas as pd

#Incase wrong working directory
#import os


# Creating the the different functions.

## Structure of the implementation

* learn(X, y, impurity_measure='entropy', pruning=False, train_ratio=0.8)
  >Takes in training data, labels and tree object. Can choose impurity_measure with default='entropy' or 'gini'. Can also enable pruning, which will split the input data into training and pruning. Returns a trained tree
* predict(data_point, tree)
  >Takes a datapoint and does a search on the tree. Returns predicted label

These are the main functions. Since I am implementing this functionally, the order of functions will be bottom-up. I.e The main function for last. 

#### The learn function's main sub-functions are;

* id3(data, labels, tree, impurity_measure = 'entropy')
  >Takes data, labels, tree and modifies the tree. Void function, but builds on tree object
* prune(data, labels, tree, impurity_measure)
  >Takes data, labels, tree and modifies the tree. Checking if subtrees are necessary.

#### id3 and prune use the same smaller help functions;

**figuring out how to split**
1. find_best_feature(data, labels, impurity_measure = 'entropy')
   >takes data and labels, then returns the best feature and value
2. investigation_score(feature, labels, split_point, impurity_measure='entropy')
   >returns a score on how good it is to split on the feature 
3. cond_impurity(feature, labels, split_point, impurity_measure='entropy')
   >returns impurity of splitting on a feature
4. impurity(labels_arr, impurity_measure = 'entropy')
   >returns a value on how pure the data is. 0 is pure, higher is more unpure.

**Splitting data**
1. split_data(data, labels, split_feature, split_point)
   >Takes data, labels, where to split and on what value. Returns left(data, labels), right(data, labels) and information on how the split was done
2. split_label(feature, labels, split_point)
   >Returns left(labels) and right(labels). Splits labels based on indexes where feature values are lower or higher than split_point

### starting with the tree class
Binary tree structure with left and right child. The value is the differantiatior between root and leaf. Root nodes have tuples, while leaf node has integers. Could rather have implemented self.label, self.split_point, self.feature_split. This is more compact.

In [2]:
#Simple tree structure with maximum 2 children
class Tree:
    
    #self.value is eihter a tuple or an int. If it is a tuple, then it is a root with (feature, split_value). If it is an int, then it a lable. 
    def __init__(self):
        #Value for split and majority label
        self.value = None
        self.major_label = None

        #Children of the tree
        self.left = None
        self.right = None
        
        
    

### Split labels
split labels on a value, split_point. Could sort the array and split on given value with np.split. But that adds sorting complexity, so using np.nonzero to get indexes of datapoints that satisfies requirement. Then index them out of labels and return the two arrays.

In [3]:
#Take feature, lable and split_point. Return two splitted arrays 
def split_label(feature, labels, split_point):
    #Splitting the feature on the split_point
    left_labels = labels[np.nonzero(feature < split_point)]
    right_labels = labels[np.nonzero(feature >= split_point)]
    
    #np.nonzero function finds the index in an array with a condition. Here the conditions are based on the mean.
    return left_labels, right_labels
    

### Impurity and investigation score

##### Vectorized or for loop?

In [4]:
#Take an array of labels, then return a score to its purity. Either with entropy or gini
def impurity(labels_arr, impurity_measure = 'entropy'):
    
    #Finding the counts for each possible label. Eihter one or two labels. 
    labels, counts = np.unique(labels_arr, return_counts=True)
    
    #Making a temp function to find possibility. Not necessary but the code is more readable later.  
    prob = lambda x: x/len(labels_arr)

    #If the measure is entropy use the entropy formula. And return entropy
    if impurity_measure == 'entropy':
        entropy = 0
        for count in counts:
            entropy -= prob(count)*log2(prob(count))
        return entropy

    #If measure is gini, use gini formula and return gini
    #Gini should be faster to compute, since it uses substraction instead of log2
    elif impurity_measure == 'gini':
        gini = 0
        for count in counts:
            gini += prob(count)*(1-prob(count))
        return gini

In [5]:
#Same as above
def impurity_vec(labels_arr, impurity_measure = 'entropy'):
    #Finding the counts for each possible label. Eihter one or two labels. 
    labels, counts = np.unique(labels_arr, return_counts=True)
    
    #Using temp function that implements the entropy formula, -sum(prob*log2(prob)), then vectorizes the counts array so we can take the sum to calculate entropy.
    #Has to return the negative of the sum, since log2 of decimal is negative.
    if impurity_measure == 'entropy':
        entropy = lambda x: x/len(labels_arr)*log2(x/len(labels_arr))
        return -np.vectorize(entropy)(counts).sum()

    #Doing the same but with gini formula. sum(prob*(1-prob)
    elif impurity_measure == 'gini':
        gini = lambda y: y/len(labels_arr)*(1-y/len(labels_arr))
        return np.vectorize(gini)(counts).sum()

In [6]:
#Run this to see which is faster
def impurity_speed(impurity_option=impurity, measure='entropy'):
    start_time = time.time()
    for x in range(0,500):
        impurity_option([1,0], measure)
    end_time = time.time()
    return end_time-start_time

speed_test = np.array([[impurity_speed(impurity, 'entorpy') - impurity_speed(impurity_vec, 'gini')] for x in range(0,100)])

print(f'Doing vectorized is {speed_test.mean()} seconds faster per impurity computation. If it is negative, then it is slower')

impurity([1,1,0]), impurity_vec([1,1,0])

Doing vectorized is -0.012667222023010254 seconds faster per impurity computation. If it is negative, then it is slower


(0.9182958340544896, 0.9182958340544896)

Running code above shows that it is more effective to not vectorize and just use for loops

In [7]:
#Take an array of labels, then split it based on given feature and split point. Then return the impurity if the labels are split.
def cond_impurity(feature, labels, split_point, impurity_measure='entropy'):
    #Finding mean
    
    #Getting the labels for the split
    left_labels, right_labels = split_label(feature, labels, split_point=split_point)
    
    #Finding the count of labels on each side
    left_len = len(left_labels)/len(feature)
    right_len = len(right_labels)/len(feature)

    #Calculating the impurity measure for the feature and each split, and multiplying with the split ratio
    impurity_left = impurity(left_labels, impurity_measure)*left_len
    impurity_right = impurity(right_labels, impurity_measure)*right_len

    #returning the conditional impurity
    return impurity_left + impurity_right

In [8]:
#Take an array of labels, check how good it is to split it given feature and split_point. Return the score.
def investigation_score(feature, labels, split_point, impurity_measure='entropy'):

    #Getting the impurity
    impurity_feature = impurity(labels, impurity_measure)
    cond_impurity_feature = cond_impurity(feature, labels, split_point, impurity_measure)

    #Returning the investigation score
    return impurity_feature - cond_impurity_feature
    

### Finding the best feature to split on, by investigation score

In [9]:
#Find the best feature given data and labels. Check each feature for its investigation score, then return the best feature, and split
def find_best_feature(data, labels, impurity_measure = 'entropy'):
    #Making a dictionary to store the feature key and the inv_score as value. 
    best_i = {}
    #Iterating through each feature
    for feature_index in range(data.shape[1]):
        feature = data[:, feature_index]

        #Making a tuple for each feature with index and value
        best_i[feature_index] = investigation_score(data[:, feature_index], labels, feature.mean(), impurity_measure=impurity_measure), feature.mean()

    #Finding best feature with max function, where the key is the values. 
    best_feature_index = max(best_i, key= lambda x: best_i[x][0])

    return best_feature_index, best_i[best_feature_index][1]

### Splitting the data and labels into subarrays
Split the data and labels

In [10]:
#Split data and labels based on a given feature and value. Return the splitted data, labels and information about the split.
def split_data(data, labels, split_feature, split_point):
    feature = data[:, split_feature]

    #Splitting the labels | Could have returned the labels in investigation_score to save compute, but this should be marginal.
    left_labels, right_labels = split_label(feature, labels, split_point)  
    
    #Splitting the data based on the indexes of what points is lower or higher than mean, given specific feature
    
    #print(f'split feature: {feature__}')
    
    left_data = data[feature < split_point]
    right_data = data[feature >= split_point]
    
    #Returning a 3-tuple consisting of the left side, the right side and information about the split (what feature, the mean)
    #The latter will be stored in each branch when building the tree. 
    return (left_data, left_labels), (right_data, right_labels)
    

### Checking for identical features
If there is only one unique value in every column, then the multi array has identical rows. 

In [11]:
def identical_features(data, labels):
    count = 0
    for feature in range(data.shape[1]):
        if len(np.unique(data[:,feature])) == 1:
            count += 1
    
    if count == data.shape[1]:
        uniques, counts = np.unique(labels, return_counts=True)
        return uniques[np.argmax(counts)]

## Implementing the ID3 function by using the prior built functions

The algorithm takes in data, labels and a tree. It  has the feature impurity_measure, which decides how impurity is computed. It builds on the input tree by creating new subtrees. The function does not return the tree, but modifies it from top-down. 

In [12]:
#Take data and labels, then build on given tree by splitting or setting lables.
def id3(data, labels, tree, impurity_measure = 'entropy'):
    #Finding checking for identical features
    identical = identical_features(data, labels)
    
    #If all data points have the same label:
    if impurity(labels) == 0:
        tree.value = labels[0]
        return labels[0]

    #Else if all data points have identical feature values
    elif identical != None:
        tree.value = identical
        return


    #Else
    else:
        #Finding the best feature to split on
        split_feature, split_point = find_best_feature(data, labels, impurity_measure = impurity_measure)
        
        #Extracting the information from the split
        left, right = split_data(data, labels, split_feature, split_point)
        
        #Setting this root to indicate the split
        tree.value = split_feature, split_point

        #Setting the majority label
        lab, counts = np.unique(labels, return_counts=True)
        maj_index = np.where(counts == max(counts))[0][0]
        tree.majority_label = lab[maj_index]
    
        #Making left branch
        new_left = Tree()
        tree.left = new_left
        id3(left[0], left[1], new_left, impurity_measure)
    
        #Making right branch
        new_right = Tree()
        tree.right = new_right 
        id3(right[0], right[1], new_right, impurity_measure)
        
        
        
    

## Creating some functions to inspect the tree

In [13]:
#Takes a tree and prints the value for each node
def search_tree(tree):
    #As long as it is a tree object
    if tree != None:
        print(tree.value)
        
        #Go left and right
        search_tree(tree.left)
        search_tree(tree.right)

In [14]:
#Returns total amount of nodes in a tree.
def total_nodes(tree):
    #If it is not a tree, then return 0 and go up
    if tree == None:
        return 0

    #Go left most and then right
    l = total_nodes(tree.left)
    r = total_nodes(tree.right)

    #Return 1 and then the value of the left and right children.
    return 1 + l + r
    

## Prediction and accuracy functions

Binary tree properties, searching the tree will be in O(h) time, where h is the height. This makes the pruning efficient. 

In [15]:
#Takes a datapoint and a tree, then returns a label based on the tree
def predict(data_point, tree):
    #Only the leaf node is not a tuple. So go through all the tuples and go leaft or right until you reach a leaf
    while type(tree.value) == tuple: #and tree.value != None | is unecesarry since every split has a child or is a leaf. So it wont trigger non leafs or splits
        feature, split_point = tree.value
        
        #Go left or right based on split_point
        if data_point[feature] < split_point:
            tree = tree.left
        else:
            tree = tree.right

    #When you reach a leaf, return the value/label
    else:
        return tree.value
    #print(tree.value)
    

In [16]:
#Takes in data, labels and tree and returns the trees accuracy based on the data and labels. 
def tree_accuracy(data, true_labels, tree):
    #Make an array of all the predictions for the data_points, with given tree, then return the ratio of correctly labled predictions. 
    predictions = np.array([predict(data_point, tree) for data_point in data])
    
    #Return accuracy
    return len(predictions[predictions == true_labels])/len(true_labels)


In [17]:
#Takes an array of labels, and a prediction label. Then returns the accuracy of labels that match the prediction label.
def majority_accuracy(labels, prediction):
    return len(labels[labels == prediction])/len(labels)

## Pruning algorithm
A Depth first search. Tree structure has a left and right child. Recursively move down by going as much left as possible, then right. On each subtree calculate the accuracy, then compare to the accuracy if it is a leaf with majority label. If it is better or the same, change it to a leaf.

If the pruning labels are empty on a split further down, then it cannot make a certain decision. So it will just go up a level. Replacing the subtree in such condition can make the model underfitted.

In [18]:
#Take a tree, and change it based on some data and labels
def prune(data, labels, tree):
    #Just want to do the pruning on a node that is splitting, not on a label node.
    if type(tree.value) == tuple:
        feature, mean = tree.value

        #If the splitted labels are empty, then there is no reason to continue down
        #Leave it as subtree, because it is uncertain if a split or leaf is optimal.
        if len(labels) == 0:
            return

        #Else continue with the search
        else:
            left, right = split_data(data, labels, feature, mean)

        
        #If there is a left child go left
        if type(tree.left) == Tree:
            prune(left[0], left[1], tree.left)
            
        #If there is a right child go right
        if type(tree.right) == Tree:
            prune(right[0], right[1], tree.right)

        #Accuracy of either splitting or majority label from training 
        #If the accuracy of majority class is greater than the prediction of subtree, then replace the subtree with the majority lable. 
        if majority_accuracy(labels, tree.majority_label) >= tree_accuracy(data, labels, tree):
            #Replacing subtree with label
            tree.value = tree.majority_label
            tree.left, tree.right = None, None

    

# The main learning algorithm

In [19]:
#Using the id3 algorithm to return a decision tree, and perhaps alter it with pruning if turned on.
def learn(X, y, impurity_measure='entropy', pruning=False, train_ratio=0.8):
    #Making the root for the tree
    tree = Tree()

    #Checking whether the pruning is true
    if pruning == True:
        #Now we need to split the data
        train_ratio = train_ratio
        X_train, X_prune = np.split(X, [int(train_ratio*len(X))])
        y_train, y_prune = np.split(y, [int(train_ratio*len(y))])

        #Making the tree with training data
        id3(X_train, y_train, tree, impurity_measure)

        #Pruning the tree with the pruning data
        prune(X_prune, y_prune, tree)

        #Returning the pruned tree
        return tree
    
    #Else if pruning is false | Just make the tree and return it.
    else:
        id3(X, y, tree, impurity_measure)
        return tree

### read csv to numpy
Could also just use pandas and then to_numpy function

In [20]:
#Takes cvs file and splits to data and targets/labels
def cvs_numpy(name=''):
    with open(name, 'r') as r:
        reader = csv.reader(r)
        data = list(reader)
    
    feature_names = data[0][:-1]
    data_ar = np.array(data[1:], dtype=float)
    targets = data_ar[:, -1]
    data = data_ar[:, :-1]
    return data, targets, feature_names

# Testing and sanity checks with the wine dataset

In [21]:
#Loading in the wine dataset, and seperating data and labels
data, labels, target_names = cvs_numpy('wine_dataset.csv')

#Setting a fixed seed for reproducibility, tested with 332, 333, 521
seed = 521

#Splitting into training and val_test
X_train, X_val_test, y_train, y_val_test = model_selection.train_test_split(data, labels, test_size=0.2, shuffle=True, random_state=seed)

In [22]:
#Testing runtime and training acc for non pruning. Sanity check: training acc should be 1.0
start = time.time()
tree_1= learn(X_train, y_train, impurity_measure='entropy')
accu = tree_accuracy(X_train, y_train, tree_1)
tot_nodes = total_nodes(tree_1)
ending = time.time()
print(f'Time: {ending-start}, accuracy: {accu}, total nodes: {tot_nodes}')

Time: 0.2711043357849121, accuracy: 1.0, total nodes: 873


In [23]:
#Testing runtime for pruning. Sanity check: Should have lower training acc than without, and fewer nodes.
start = time.time()
tree_2 = learn(X_train, y_train, impurity_measure='entropy', pruning=True, train_ratio=0.7)
accu = tree_accuracy(X_train, y_train, tree_2)
tot_nodes = total_nodes(tree_2)
ending = time.time()
print(f'Time: {ending-start}, accuracy: {accu}, total nodes: {tot_nodes}')

Time: 0.19340777397155762, accuracy: 0.9159499609069586, total nodes: 219


It is taking shorter with pruning because it is training with less training data. Sanity checks looks good

In [24]:
val_acc_1 = tree_accuracy(X_val_test, y_val_test, tree_1)
val_acc_2 = tree_accuracy(X_val_test, y_val_test, tree_2)

print(f'Testing with val_test data. Tree1: {val_acc_1:.5f} | Tree2: {val_acc_2:.5f}')

Testing with val_test data. Tree1: 0.87344 | Tree2: 0.90000


Looks like pruning is more general


## Evaluating best modfel

Assess the performance of your algorithm using an appropriate performance
measure. Which setting should you select for this data (entropy or Gini,
pruning or no pruning)? What is your estimate for the performance of
the selected model on unseen data points? Report how you arrived at the
conclusi

ons.
Remember to use training, validation, and test sets properly. Note that in the
model selection step you select one out of the four models (settings) based
on performance on validation data, and in the model evaluation step you
evaluate the selected model on test data.

In [25]:
X_val, X_test = np.split(X_val_test, [int(len(X_val_test)/2)])
y_val, y_test = np.split(y_val_test, [int(len(y_val_test)/2)])

In [26]:
#Function to find the best training ratio for pruning. Can also find if no pruning is better
def best_ratio(train_data, train_labels, val_data, val_labels, impurity_measure):
    tree_prunefree = Tree()
    tree_prunefree = learn(train_data, train_labels, impurity_measure=impurity_measure)
    accuracy_prunefree = tree_accuracy(val_data, val_labels, tree_prunefree)
    #print(f'Prune free accuracy: {accuracy_prunefree}')
    ratios = {}
    
    for x in range(1,10):
        tree = Tree()
        tree = learn(train_data, train_labels, impurity_measure=impurity_measure, pruning=True, train_ratio=x/10)
        accuracy_tree = tree_accuracy(val_data, val_labels, tree)
        ratios[x/10] = accuracy_tree
        #print(f'Testing training ratio: {x/10}, with impurity_measure: gini, accuracy: {accuracy_tree:.3f} | Difference in accuracy: {accuracy_tree- accuracy_prunefree:.5f}')

    #Adding no pruning to the ratios with 1 as key
    ratios[1] = accuracy_prunefree

    #Finding the best ratio based on the values with .get
    best_ratio = max(ratios, key=ratios.get)

    #In the case that no pruning is best
    pruning = False

    #If pruning is better
    if best_ratio != 1:
        pruning = True

    #Printing some information
    print(f'Impurity_measure: {impurity_measure}\nAccuracy without pruning: {accuracy_prunefree:.5f}\
            \nAccuracy with best pruning: {ratios[best_ratio]:.5f}, training ratio: {best_ratio}\n')

    #Returning the best model given impurity measure
    return [best_ratio, ratios[best_ratio], impurity_measure, pruning]

In [27]:
best_entropy = best_ratio(X_train, y_train, X_val, y_val, impurity_measure='entropy')
best_gini = best_ratio(X_train, y_train, X_val, y_val, impurity_measure= 'gini')

models = [best_entropy, best_gini]

best_model = max(models, key=lambda y: y[1])

training_ratio, accuracy, impurity_measure, pruning = best_model 
print(f'The best model is {impurity_measure}, with pruning: {pruning}, training ratio: {training_ratio}, validation accuracy: {accuracy:.5f} ')

Impurity_measure: entropy
Accuracy without pruning: 0.89062            
Accuracy with best pruning: 0.91250, training ratio: 0.7

Impurity_measure: gini
Accuracy without pruning: 0.88125            
Accuracy with best pruning: 0.91875, training ratio: 0.8

The best model is gini, with pruning: True, training ratio: 0.8, validation accuracy: 0.91875 


With seed=521. The best performing model on the validation data is using gini and pruning with data split into 80% training and 20% pruning.

Sanity check: Looks like the different impurity measures gives different accuracy. That is good.

Now lets test accuracy with the test data




In [28]:
best_model = learn(X_train, y_train, impurity_measure=impurity_measure, pruning=pruning, train_ratio=training_ratio)

test_accuracy = tree_accuracy(X_test, y_test, best_model)

print(f'The best model has {test_accuracy}')

The best model has 0.896875


### Conclusion on tests

gini seems to be more accurate with pruning, but entropy is more accurate without pruning

gini with pruning is overall the most accurate on this test data and seed

# SKLEARN comparison

The following code using the sklearn library is heavily inspired by the documentation samples: https://scikit-learn.org/stable/modules/tree.html


In [29]:
#Making and training a DecisionTreeClassifier
dtclf = sk_tree.DecisionTreeClassifier(criterion="gini",)
dtclf = dtclf.fit(X_train, y_train)

#sk_tree.plot_tree(clf) #If we want to see the tree

#Training my own tree with the best model 
tree = learn(X_train, y_train, impurity_measure='gini', pruning=True, train_ratio=0.8)


In [30]:
train_pred, val_pred, test_pred = dtclf.predict(X_train), dtclf.predict(X_val), dtclf.predict(X_test)

train_acc_sk = metrics.accuracy_score(y_train, train_pred)
val_acc_sk = metrics.accuracy_score(y_val, val_pred) # Would be same as len(val_pred[val_pred == y_val])/len(y_val)
test_acc_sk =metrics.accuracy_score(y_test, test_pred)

train_acc_my = tree_accuracy(X_train, y_train, tree) 
val_acc_my = tree_accuracy(X_val, y_val, tree)
test_acc_my = tree_accuracy(X_test, y_test, tree)

print(f'Sklearn | train: {train_acc_sk:.5f}, val: {val_acc_sk:.5f}, test: {test_acc_sk:.5f}')
print(f'Mine    | train: {train_acc_my:.5f}, val: {val_acc_my:.5f}, test: {test_acc_my:.5f}')

#Small test to see if predict works like sklearn 
val_pred = [(predict(x, tree)) for x in X_test]

print('\nTesting prediction functions')
print('accuracy_score',metrics.accuracy_score(y_test, val_pred), '\npredict_func',tree_accuracy(X_test, y_test, tree)) 
#cv = model_selection.ShuffleSplit(n_splits=5, test_size=0.3, random_state=0)

#Testing some cross validation from sklearn library on the 
cross_val = model_selection.cross_val_score(dtclf, X_train, y_train, cv=5, scoring='f1_macro').mean()

print(f'\nResults from testing the sklearn tree with cross validation. Mean accuracy: {cross_val:.5f}')

Sklearn | train: 1.00000, val: 0.91563, test: 0.89062
Mine    | train: 0.91087, val: 0.91875, test: 0.89687

Testing prediction functions
accuracy_score 0.896875 
predict_func 0.896875

Results from testing the sklearn tree with cross validation. Mean accuracy: 0.87604


### Experimenting with some cost pruning
https://scikit-learn.org/stable/auto_examples/tree/plot_cost_complexity_pruning.html

In [31]:
def sklearn_pruning(X_train, y_train, X_test, y_test):
    clf = sk_tree.DecisionTreeClassifier()
    path = clf.cost_complexity_pruning_path(X_train, y_train)
    ccp_alphas, impurities = path.ccp_alphas, path.impurities
    clfs = []
    for ccp_alpha in ccp_alphas:
        clf = sk_tree.DecisionTreeClassifier(random_state=0, ccp_alpha=ccp_alpha)
        clf.fit(X_train, y_train)
        clfs.append(clf)

    train_scores = [clf.score(X_train, y_train) for clf in clfs]
    test_scores = [clf.score(X_test, y_test) for clf in clfs]

    return train_scores, test_scores

In [32]:
train_scores, test_scores = sklearn_pruning(X_train, y_train, X_test, y_test)

#Finding the model with highest test score. Then its training score
train_score, test_score = train_scores[np.nonzero(test_scores == max(test_scores))[0][0]], max(test_scores)

print(f'Best training socre: {train_score}, best test score: {test_score}')

Best training socre: 0.9593432369038312, best test score: 0.903125


## Testing with different dataset

The algorithm should be general enough to be used on different datasets. The data has to be clean, and non-categorical however. 

Lets see how it does with multiple labels, that also are strings

found on kaggle: https://www.kaggle.com/datasets/abineshkumark/carsdata

The set has some feature that are continious, and a brand. 

In [33]:
df = pd.read_csv('cars.csv')
df

Unnamed: 0,mpg,cylinders,cubicinches,hp,weightlbs,time-to-60,year,brand
0,14.0,8,350,165,4209,12,1972,US.
1,31.9,4,89,71,1925,14,1980,Europe.
2,17.0,8,302,140,3449,11,1971,US.
3,15.0,8,400,150,3761,10,1971,US.
4,30.5,4,98,63,2051,17,1978,US.
...,...,...,...,...,...,...,...,...
256,17.0,8,305,130,3840,15,1980,US.
257,36.1,4,91,60,1800,16,1979,Japan.
258,22.0,6,232,112,2835,15,1983,US.
259,18.0,6,232,100,3288,16,1972,US.


In [34]:
#Converting the data to numpy
data = df.to_numpy()

#Getting the labels. They were at the end in this dataset
labels = data[:, -1]

#Getting everything except the labels
data = data[:, :-1] 

In [35]:
#The data was not in perfect condition. It had some broken data, and two columns were in the wrong format.
#Cleaning the affected columns.
data[data[:, 2] == ' '] = 0 
for x in data[:,2]:
    x = int(x)

data[:, 2] = data[:, 2].astype(int)

data[data[:, 4] == ' '] = 0 
for x in data[:,4]:
    x = int(x)

data[:, 4] = data[:, 4].astype(int)

In [36]:
#Splitting the data into training and test
train_data, test_data, train_labels, test_labels = model_selection.train_test_split(data, labels, test_size=0.2, shuffle=True)

In [37]:
#Training the tree
car_tree = learn(train_data, train_labels, impurity_measure='entropy')

In [38]:
#Giving a random data_point to predict
predict(test_data[5], car_tree)

' Europe.'

In [39]:
#Checking training accuracy and total nodes
tree_accuracy(train_data, train_labels, car_tree), total_nodes(car_tree)


(0.9951923076923077, 105)

Seems there are some identical data with different label. Lets see if the sklarn tree gets similar results

In [40]:
clf = sk_tree.DecisionTreeClassifier(criterion="entropy")
clf = clf.fit(train_data, train_labels)

In [41]:
predictions = clf.predict(train_data)

In [42]:
print('Sklearn accuracy is:',metrics.accuracy_score(train_labels, predictions), 'with total nodes', clf.tree_.node_count)

Sklearn accuracy is: 0.9951923076923077 with total nodes 65


Seems to be similar, except the sklearn tree has fewer nodes.