In [1]:
import numpy as np
import csv
from sklearn import neighbors, datasets, model_selection, metrics, __version__
from sklearn import tree as sk_tree
from math import log2
import pandas as pd
import os
import time

## Creating the the different functions.

### starting with the tree class

In [2]:
#Simple tree structure with maximum 2 children
class Tree:
    #Containing a value, either a lable or tuple with information about split. Then each child is a new tree object.
    def __init__(self, value=None):
        self.value = value
        self.left = None
        self.right = None
        self.major_label = None
        
    

### Split labels
Making a function to split a list of labels on its mean. The split function is too simple, because it would split it on the datapoint with the mean.
I could sort the array, but I find it more simple and effective to retrieve the indexes instead and slice the points from the labels.

In [3]:
def split_label_mean(feature, labels):
    #Finding feature mean
    feature_mean = np.nanmean(feature)
    #Splitting the feature on the mean
    left_labels = labels[np.nonzero(feature < feature_mean)]
    right_labels = labels[np.nonzero(feature >= feature_mean)]
    #np.nonzero function finds the index in an array with a condition. Here the conditions are based on the mean.
    return left_labels, right_labels, feature_mean
    

In [4]:
def split_label(feature, labels, split_point):
    #Splitting the feature on the split_point
    left_labels = labels[np.nonzero(feature < split_point)]
    right_labels = labels[np.nonzero(feature >= split_point)]
    #np.nonzero function finds the index in an array with a condition. Here the conditions are based on the mean.
    return left_labels, right_labels, split_point
    

### Impurity and investigation score

In [5]:
def impurity(arr, impurity_measure = 'entropy'):
    #Starting sum
    entropy = 0
    gini = 0
    #Finding the counts for each possible label. Eihter one or two labels. 
    labels, counts = np.unique(arr, return_counts=True)
    #Calculating the entropy and gini for the array
    for count in counts:
        prob = count/len(arr)
        entropy -= prob*log2(prob)
        gini += prob*(1-prob)

    if impurity_measure == 'entropy':
        return entropy
    elif impurity_measure == 'gini':
        return gini

In [6]:
def cond_impurity(feature, labels, split_point, impurity_measure='entropy'):
    #Finding mean
    
    #Getting the labels for the split
    left_labels, right_labels, f_mean = split_label(feature, labels, split_point=feature.mean())
    
    #Finding the count of labels on each side
    left_len = len(left_labels)/len(feature)
    right_len = len(right_labels)/len(feature)

    #Calculating the impurity measure for the feature and each split, and multiplying with the split ratio
    impurity_left = impurity(left_labels, impurity_measure)*left_len
    impurity_right = impurity(right_labels, impurity_measure)*right_len

    #returning the conditional entropy
    return impurity_left + impurity_right

In [7]:
def investigation_score(feature, labels, split_point, impurity_measure='entropy'):

    #Getting the entropies
    entropy_feature = impurity(labels, impurity_measure)
    cond_entropy_feature = cond_impurity(feature, labels, impurity_measure)

    #Returning the investigation score
    return entropy_feature - cond_entropy_feature
    

### Finding the best feature to split on, by investigation score

In [8]:
def find_best_feature(data, labels, split_point, impurity_measure = 'entropy'):
    #Making a dictionary to store the feature key and the inv_score as value. 
    best_i = {}
    #Iterating through each feature
    for feature in range(data.shape[1]):
        best_i[feature] = investigation_score(data[:, feature], labels, split_point, impurity_measure=impurity_measure)

    #Finding best feature with max function, where the key is the values. 
    #print(best_i.values())
    #if max(best_i.values()) > 0:
    best_feature_index = max(best_i, key= best_i.get)
    return best_feature_index

### Splitting the data and labels into subarrays
I chose to return a triple containing the left side, right side and value for the node

In [53]:
def split_data(data, labels, impurity_measure = 'entropy', feature_index='best', feature_mean='best'):
    
    #If we want to find best_feature or use a predetermined
    if feature_index == 'best':
        feature_index = find_best_feature(data, labels, impurity_measure)
        best_feature_mean = 'best'

    #Get the feature
    best_feature = data[:, feature_index]

    #Splitting the labels | Could have returned the labels in investigation_score to save compute, but this should be marginal.
    left_labels, right_labels, best_feature_mean = split_label_mean(best_feature, labels)  
    
    #Splitting the data based on the indexes of what points is lower or higher than mean, given specific feature
    left_data = data[best_feature < best_feature_mean]
    right_data = data[best_feature >= best_feature_mean]
    
    #Returning a 3-tuple consisting of the left side, the right side and information about the split (what feature, the mean)
    #The latter will be stored in each branch when building the tree. 
    return (left_data, left_labels), (right_data, right_labels), (feature_index, best_feature_mean)
    

In [78]:
def split_data_new(data, labels, impurity_measure='entropy', split_point='find', feature_index='find'):
    #If we want to find best_feature or use a predetermined
    if feature_index == 'find':
        feature_index = find_best_feature(data, labels, impurity_measure)
        split_point = 'find'
    
    #Get the feature
    split_feature = data[:, feature_index]

    if split_point == 'find':
        split_point = split_feature.mean()

    #Splitting the labels | Could have returned the labels in investigation_score to save compute, but this should be marginal.
    left_labels, right_labels, split_point = split_label(split_feature, labels, split_point)  
    
    #Splitting the data based on the indexes of what points is lower or higher than mean, given specific feature
    left_data = data[split_feature < split_point]
    right_data = data[split_feature >= split_point]
    
    #Returning a 3-tuple consisting of the left side, the right side and information about the split (what feature, the mean)
    #The latter will be stored in each branch when building the tree. 
    return (left_data, left_labels), (right_data, right_labels), (feature_index, split_point)
    

### Checking for identical features
The approach is if there is only one unique value in every column, then the multi array has identical rows. 

In [79]:
def identical_features(data, labels):
    count = 0
    for feature in range(data.shape[1]):
        if len(np.unique(data[:,feature])) == 1:
            count += 1
    
    if count == data.shape[1]:
        uniques, counts = np.unique(labels, return_counts=True)
        return uniques[np.argmax(counts)]

## Implementing the ID3 function by using the prior built functions


In [80]:
def id3(data, labels, tree, impurity_measure = 'entropy', split='mean'):
    #Finding checking for identical features
    identical = identical_features(data, labels)
    
    #If all data points have the same label:
    if impurity(labels) == 0:
        tree.value = labels[0]
        return labels[0]

    #Else if all data points have identical feature values
    elif identical != None:
        tree.value = identical
        return


    #Else
    else:
        #Extracting the information from the split
        left, right, root = split_data(data, labels, impurity_measure=impurity_measure)

        if root != 0:
            #Setting this root to indicate the split
            tree.value = root

            #Setting the majority label
            lab, counts = np.unique(labels, return_counts=True)
            maj_index = np.where(counts == max(counts))[0][0]
            tree.majority_label = lab[maj_index]
    
            #Making left branch
            new_left = Tree()
            tree.left = new_left
            id3(left[0], left[1], new_left, impurity_measure)
    
            #Making right branch
            new_right = Tree()
            tree.right = new_right 
            id3(right[0], right[1], new_right, impurity_measure)
        
        
        
    

## Creating some functions to inspect the tree

In [81]:
def search_tree(tree,count=0):
    if tree != None:
        counts = count
        counts += 1
        print(tree.value)
    
        search_tree(tree.left,counts)
        search_tree(tree.right,counts)

In [82]:
def total_nodes(tree):
    if tree == None:
        return 0

    l = total_nodes(tree.left)
    r = total_nodes(tree.right)

    return 1 + l + r
    

## Prediction and accuracy functions

Because I implemented the tree structure with binary tree properties, then searching the tree will be in O(logn) time. This makes the pruning really efficient. The predict function works like a binary search. It goes under or over a value in a given feature untill it has found a leaf node. 

In [83]:
def predict(data_point, tree):
    #Only the leaf node is not a tuple. So go through all the tuples and go leaft or right until you reach a leaf
    while type(tree.value) == tuple: #and tree.value != None | is unecesarry since every split has a child or is a leaf. So it wont trigger non leafs or splits
        feature, split_point = tree.value
        if data_point[feature] < split_point:
            tree = tree.left
        else:
            tree = tree.right

    #When you reach a leaf, return the value/label
    else:
        return tree.value
    #print(tree.value)
    

In [84]:
def tree_accuracy(data, labels, tree):
    #Make an array of all the predictions for the data_points, with given tree, then return the ratio of correctly labled predictions. 
    predictions = np.array([predict(data_point, tree) for data_point in data])
    return len(predictions[predictions == labels])/len(labels)

#I was contemplating wheter it was possible to optimize with vectorization, but since data is a multi-dim array, then it would be more tricky.
#I concluded that using list comprehension was more pythonic. And the check with extracting correctely labled predictions with np is quite efficient.

In [85]:
def majority_accuracy(labels, prediction):
    return len(labels[labels == prediction])/len(labels)

## Pruning algorithm
In essence a Depth first search. Since I implemented the tree structure to have a left and right child, I dont have a child list. But it works the same since when using a for loop on a list of children, you start with the first one and its first one etc. So I'm doing left side first, then calling right side when the left is searched. Instead of for loop im just using running it recusively as long as the child is a tree object. Then on each child im calculating if the accuracy is higher with a lable instead of a split, and if so, the new value is a label.

If there is an empty array, then I'm not changing its branch. This is because the pruning data does not have the same data points as the training data. But when you calculate the accuracy and predictions with labels, they are all 0%. So the pruning data is not really suitable to determine wheter this split is necesarry or not. I have done some observations with different seeds and on average the accuracy declines when pruning these empty arrays. Hence im leaving the branches untouched.  

In [86]:
def prune(data, labels, tree, impurity_measure = 'entropy'):
    #Just want to do the pruning on a node that is splitting, not on a label node.
    if type(tree.value) == tuple:
        feature, mean = tree.value

        #Incase there are empty branches. If the the data and labels are empty, then there is no reason to split it, so the datapoints can stay as they were.
        #Since the accuracy is 0 with given data, but also 0 for either 0 or 1, so it would be unfair to choose one, because there is not enough data in pruning, to make a good decision.
        if len(labels) == 0:
            return
        else:
            left, right, values = split_data_new(data, labels, impurity_measure=impurity_measure, feature_index = feature, split_point=mean)

        
        #If there is a left 
        if type(tree.left) == Tree:
            prune(left[0], left[1], tree.left)
        if type(tree.right) == Tree:
            prune(right[0], right[1], tree.right)

        #Accuracy of either splitting or giving hard label
        #If the accuracy of majority class is greater than the prediction of subtree, then replace the subtree with the majority lable. Also remove children.
        if majority_accuracy(labels, tree.majority_label) >= tree_accuracy(data, labels, tree):
            #print('true')
            #print(tree.value, left[1], right[1], predict_flat_label(data, labels, prediction=label), accuracy(data, labels, tree))
            tree.value = tree.majority_label
            tree.left, tree.right = None, None
            #print(tree.value)

    

# The main learning algorithm

In [87]:
#Using the id3 algorithm to return a decision tree.
def learn(X, y, impurity_measure='entropy', pruning=False, train_ratio=0.8):
    #Making the root for the tree
    tree = Tree()

    #Checking whether the pruning is true
    if pruning == True:
        #Now we need to split the data
        train_ratio = train_ratio
        X_train, X_prune = np.split(X, [int(train_ratio*len(X))])
        y_train, y_prune = np.split(y, [int(train_ratio*len(y))])

        #Making the tree with training data
        id3(X_train, y_train, tree, impurity_measure)
        total_nodes(tree)

        #Pruning the tree with the pruning data
        prune(X_prune, y_prune, tree)
        total_nodes(tree)

        #Returning the pruned tree
        return tree
    
    #Else if pruning is false | Just make the tree and return it.
    else:
        id3(X, y, tree, impurity_measure)
        return tree

### read csv to numpy
Nice to have if we want to load more datasets

In [88]:
def cvs_numpy(name=''):
    with open(name, 'r') as r:
        reader = csv.reader(r)
        data = list(reader)
    
    feature_names = data[0][:-1]
    data_ar = np.array(data[1:], dtype=float)
    targets = data_ar[:, -1]
    data = data_ar[:, :-1]
    return data, targets, feature_names

# Testing withthe wine dataset

In [89]:
data, labels, target_names = cvs_numpy('wine_dataset.csv')#
seed = 333#332#333#521
X_train, X_val_test, y_train, y_val_test = model_selection.train_test_split(data, labels, test_size=0.3, shuffle=True, random_state=seed)

In [90]:
data, labels, target_names

(array([[ 0.13,  1.6 ,  3.34,  0.59,  9.2 ],
        [ 0.1 ,  2.8 ,  3.6 ,  0.66, 10.2 ],
        [ 0.32,  1.9 ,  3.2 ,  0.55,  9.5 ],
        ...,
        [ 0.44,  1.6 ,  3.38,  0.86,  9.9 ],
        [ 0.36,  4.5 ,  3.4 ,  0.57, 10.4 ],
        [ 0.34,  6.4 ,  2.99,  0.4 , 10.8 ]]),
 array([1., 1., 1., ..., 1., 0., 0.]),
 ['citric acid', 'residual sugar', 'pH', 'sulphates', 'alcohol'])

In [124]:
start = time.time()
tree= learn(X_train, y_train, impurity_measure='gini')
accu = tree_accuracy(X_train, y_train, tree)
ending = time.time()
print(f'Time: {ending-start}, accuracy: {accu}')

Time: 0.7377147674560547, accuracy: 1.0


In [125]:
start = time.time()
tree= learn(X_train, y_train, impurity_measure='gini', pruning=True, train_ratio=0.7)
accu = tree_accuracy(X_train, y_train, tree)
ending = time.time()
print(f'Time: {ending-start}, accuracy: {accu}')

Time: 0.3968312740325928, accuracy: 0.9311885612153709


It is taking shorter with pruning because it is training with less training data

### prior output: pruning takes 0.017923593521118164s
prune_start = time.time()
prune_end = time.time()
print(f'pruning takes {prune_end-prune_start}s')

In [93]:
tree.left.left.value
total_nodes(tree)


223

In [94]:
tree_accuracy(X_train, y_train, tree)

0.9311885612153709

In [95]:
tree_accuracy(X_val_test, y_val_test, tree)


0.8770833333333333

In [96]:
tree_2 = Tree()
tree_2 = learn(X_train, y_train, impurity_measure='gini')

total_nodes(tree_2), tree_accuracy(X_val_test, y_val_test, tree_2)

(757, 0.865625)

In [97]:
#prune(X_val_test, y_val_test, tree_2)

In [98]:
total_nodes(tree_2)

757

In [99]:
(tree_accuracy(X_val_test, y_val_test, tree), total_nodes(tree)), (tree_accuracy(X_val_test, y_val_test, tree_2), total_nodes(tree_2))

((0.8770833333333333, 223), (0.865625, 757))

In [100]:
#predict_flat_label(X_val_test, y_val_test, 1)

In [101]:
search_tree(tree)

(1, 4.440644955300128)
(3, 0.6008839779005525)
(3, 0.4994230769230769)
(3, 0.4223715415019763)
(0, 0.31336206896551727)
(2, 3.1584375)
0.0
(1, 2.043548387096774)
0.0
(1, 2.8714285714285714)
(4, 11.299999999999999)
0.0
(0, 0.24200000000000005)
1.0
0.0
0.0
0.0
(0, 0.26328467153284674)
(0, 0.13734375000000001)
1.0
(4, 10.308796296296297)
(2, 3.2412499999999995)
(4, 9.536363636363637)
(1, 2.5)
(1, 1.7)
0.0
1.0
0.0
(0, 0.202)
1.0
(0, 0.225)
0.0
1.0
1.0
(1, 1.8749999999999998)
0.0
(2, 3.403333333333333)
(2, 3.3274999999999997)
0.0
1.0
0.0
(1, 1.86986301369863)
0.0
(4, 10.574074074074073)
(4, 9.760000000000002)
1.0
(2, 3.271666666666667)
1.0
0.0
0.0
(0, 0.2374932614555256)
(3, 0.5551479289940828)
(3, 0.529277108433735)
(3, 0.5113333333333333)
(0, 0.13444444444444448)
1.0
(2, 3.191)
0.0
1.0
1.0
1.0
1.0
(4, 10.536386138613864)
(4, 9.785833333333334)
1.0
(4, 10.191379310344827)
(1, 2.228)
(0, 0.30062500000000003)
(2, 3.3266666666666667)
1.0
0.0
0.0
1.0
(1, 1.8424242424242423)
0.0
(0, 0.362)
(2, 

In [34]:
v_func = np.vectorize(predict)

In [35]:
#len(predictions[predictions == y_test])/len(y_test)

In [36]:

predictions = np.array([predict(data_point, tree) for data_point in X_val_test]) 
tree_accuracy(X_val_test, y_val_test, tree), metrics.accuracy_score(y_val_test, predictions)

(0.8614583333333333, 0.8614583333333333)

In [126]:
#predict_flat_label(X_test, y_test, 0)

In [127]:
len(y_test[y_test == 0])/len(y_test)

0.48541666666666666

## Evaluating algorithm

Assess the performance of your algorithm using an appropriate performance
measure. Which setting should you select for this data (entropy or Gini,
pruning or no pruning)? What is your estimate for the performance of
the selected model on unseen data points? Report how you arrived at the
conclusi

ons.
Remember to use training, validation, and test sets properly. Note that in the
model selection step you select one out of the four models (settings) based
on performance on validation data, and in the model evaluation step you
evaluate the selected model on test data.

In [128]:
X_val, X_test = np.split(X_val_test, [int(len(X_val_test)/2)])
y_val, y_test = np.split(y_val_test, [int(len(y_val_test)/2)])

In [129]:
def best_ratio(train_data, train_labels, val_data, val_labels, impurity_measure):
    tree_prunefree = Tree()
    tree_prunefree = learn(train_data, train_labels, impurity_measure=impurity_measure)
    accuracy_prunefree = tree_accuracy(val_data, val_labels, tree_prunefree)
    #print(f'Prune free accuracy: {accuracy_prunefree}')
    ratios = {}
    
    for x in range(1,10):
        tree = Tree()
        tree = learn(train_data, train_labels, impurity_measure=impurity_measure, pruning=True, train_ratio=x/10)
        accuracy_tree = tree_accuracy(val_data, val_labels, tree)
        ratios[x/10] = accuracy_tree
        #print(f'Testing training ratio: {x/10}, with impurity_measure: gini, accuracy: {accuracy_tree:.3f} | Difference in accuracy: {accuracy_tree- accuracy_prunefree:.5f}')
    ratios[1] = accuracy_prunefree
    best_ratio = max(ratios, key=ratios.get)

    pruning = False
    if best_ratio != 1:
        pruning = True
    
    return [best_ratio, ratios[best_ratio], impurity_measure, pruning]

In [130]:
testing = np.array([best_ratio(X_train, y_train, X_val, y_val, impurity_measure='entropy'),
                    best_ratio(X_train, y_train, X_val, y_val, impurity_measure= 'gini')])
train_ratio, accuracy, impurity_measure, pruning = testing[np.argmax(testing[:, 1])]
print(f'The best model uses impurity measure: {impurity_measure}, with pruning: {pruning}, training ratio of: {train_ratio}, and an accuracy of: {accuracy.astype(float):.5f}')
print((pruning))
best_tree = learn(X_train, y_train, impurity_measure=impurity_measure, pruning=(pruning=='True'), train_ratio=train_ratio.astype(float))
print(f'The test accuracy on the best model is: {tree_accuracy(X_test, y_test, best_tree):.5f}')

The best model uses impurity measure: entropy, with pruning: True, training ratio of: 0.7, and an accuracy of: 0.88542
True
The test accuracy on the best model is: 0.86875


The best model uses impurity measure: entropy, with pruning: True, training ratio of: 0.7, and an accuracy of: 0.87292
True
The test accuracy on the best model is: 0.87500


## SKLEARN test

path = clf.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas, impurities = path.ccp_alphas, path.impurities
clfs = []
for ccp_alpha in ccp_alphas:
    clf = sk_tree.DecisionTreeClassifier(random_state=0, ccp_alpha=ccp_alpha)
    clf.fit(X_train, y_train)
    clfs.append(clf)

In [105]:
clf = sk_tree.DecisionTreeClassifier(criterion="gini",)

clf = clf.fit(X_train, y_train)
#sk_tree.plot_tree(clf)
tree = learn(X_train, y_train, impurity_measure='gini')


In [None]:
clfs

In [106]:
train_pred = clf.predict(X_train)
val_pred = clf.predict(X_val)
test_pred = clf.predict(X_test)

train_acc = len(train_pred[train_pred == y_train])/len(y_train)
val_acc = len(val_pred[val_pred == y_val])/len(y_val)
test_acc =len(test_pred[test_pred == y_test])/len(y_test)
print(f'train: {train_acc}, {val_acc}, {test_acc}')
print("Accuracy sklearn:",metrics.accuracy_score(y_test, test_pred))
val_pred = []
for x in X_val:
    val_pred.append(predict(x, tree))

print("Accuracy my implementation:",metrics.accuracy_score(y_val, val_pred))


train: 1.0, 0.86875, 0.8770833333333333
Accuracy sklearn: 0.8770833333333333
Accuracy my implementation: 0.8666666666666667


In [None]:
train_scores = [clf.score(X_train, y_train) for clf in clfs]
test_scores = [clf.score(X_test, y_test) for clf in clfs]

In [None]:

train_scores[np.nonzero(test_scores == max(test_scores))[0][0]], max(test_scores)

## Testing with different dataset
found on kaggle: https://www.kaggle.com/datasets/abineshkumark/carsdata

In [107]:
df = pd.read_csv('cars.csv')
df

Unnamed: 0,mpg,cylinders,cubicinches,hp,weightlbs,time-to-60,year,brand
0,14.0,8,350,165,4209,12,1972,US.
1,31.9,4,89,71,1925,14,1980,Europe.
2,17.0,8,302,140,3449,11,1971,US.
3,15.0,8,400,150,3761,10,1971,US.
4,30.5,4,98,63,2051,17,1978,US.
...,...,...,...,...,...,...,...,...
256,17.0,8,305,130,3840,15,1980,US.
257,36.1,4,91,60,1800,16,1979,Japan.
258,22.0,6,232,112,2835,15,1983,US.
259,18.0,6,232,100,3288,16,1972,US.


In [108]:
data = df.to_numpy()
data

array([[14.0, 8, '350', ..., 12, 1972, ' US.'],
       [31.9, 4, '89', ..., 14, 1980, ' Europe.'],
       [17.0, 8, '302', ..., 11, 1971, ' US.'],
       ...,
       [22.0, 6, '232', ..., 15, 1983, ' US.'],
       [18.0, 6, '232', ..., 16, 1972, ' US.'],
       [22.0, 6, '250', ..., 15, 1977, ' US.']], dtype=object)

In [109]:
labels = data[:, -1]
data = data[:, :-1] 

In [110]:
data

array([[14.0, 8, '350', ..., '4209', 12, 1972],
       [31.9, 4, '89', ..., '1925', 14, 1980],
       [17.0, 8, '302', ..., '3449', 11, 1971],
       ...,
       [22.0, 6, '232', ..., '2835', 15, 1983],
       [18.0, 6, '232', ..., '3288', 16, 1972],
       [22.0, 6, '250', ..., '3353', 15, 1977]], dtype=object)

In [111]:
data[data[:, 2] == ' '] = 0 
for x in data[:,2]:
    x = int(x)

data[:, 2] = data[:, 2].astype(int)

data[data[:, 4] == ' '] = 0 
for x in data[:,4]:
    x = int(x)

data[:, 4] = data[:, 4].astype(int)

In [136]:
train_data, test_data, train_labels, test_label = model_selection.train_test_split(data, labels, test_size=0.3, shuffle=True)

In [137]:
train_labels

array([' US.', ' US.', ' US.', ' US.', ' US.', ' US.', ' US.', ' US.',
       ' US.', ' Japan.', ' Europe.', ' Japan.', ' US.', ' US.', ' US.',
       ' Europe.', ' Europe.', ' Japan.', ' Europe.', ' Japan.', ' US.',
       ' US.', ' US.', ' US.', ' US.', ' Japan.', ' US.', ' US.',
       ' Europe.', ' US.', ' Japan.', ' US.', ' Europe.', ' Japan.',
       ' US.', ' US.', ' Japan.', ' US.', ' US.', ' US.', ' Europe.',
       ' US.', ' Europe.', ' Japan.', ' US.', ' US.', ' US.', ' Japan.',
       ' Japan.', ' Europe.', ' US.', ' US.', ' Europe.', ' Japan.',
       ' US.', ' US.', ' US.', ' Japan.', ' US.', ' US.', ' US.', ' US.',
       ' US.', ' Japan.', ' US.', ' Europe.', ' US.', ' Europe.',
       ' Europe.', ' US.', ' Japan.', ' US.', ' Europe.', ' US.',
       ' Europe.', ' Europe.', ' Europe.', ' Europe.', ' US.', ' US.',
       ' Japan.', ' Japan.', ' US.', ' US.', ' Europe.', ' US.',
       ' Europe.', ' Europe.', ' Europe.', ' US.', ' US.', ' Japan.',
       ' US.', ' US.', '

In [139]:
np.unique(test_label, return_counts=True)

(array([' Europe.', ' Japan.', ' US.'], dtype=object),
 array([ 9, 20, 50], dtype=int64))

In [140]:
tree_3 = Tree()
tree_3 = learn(train_data, train_labels, impurity_measure='entropy', pruning='True')

In [141]:
impurity(labels[:70])

1.4013498193719667

In [148]:
tree_accuracy(train_data, train_labels, tree_3)

0.9945054945054945

In [143]:
predict(data[0], tree_3)

' US.'

In [144]:
labels[0]

' US.'

In [145]:
total_nodes(tree_3)

91

In [146]:
clf = sk_tree.DecisionTreeClassifier(criterion="gini")

clf = clf.fit(train_data, train_labels)

In [151]:
predictions = clf.predict(train_data)

In [152]:
predictions

array([' US.', ' US.', ' US.', ' US.', ' US.', ' US.', ' US.', ' US.',
       ' US.', ' Japan.', ' Europe.', ' Japan.', ' US.', ' US.', ' US.',
       ' Europe.', ' Europe.', ' Japan.', ' Europe.', ' Japan.', ' US.',
       ' US.', ' US.', ' US.', ' US.', ' Japan.', ' US.', ' US.',
       ' Europe.', ' US.', ' Japan.', ' US.', ' Europe.', ' Japan.',
       ' US.', ' Europe.', ' Japan.', ' US.', ' US.', ' US.', ' Europe.',
       ' US.', ' Europe.', ' Japan.', ' US.', ' US.', ' US.', ' Japan.',
       ' Japan.', ' Europe.', ' US.', ' US.', ' Europe.', ' Japan.',
       ' US.', ' US.', ' US.', ' Japan.', ' US.', ' US.', ' US.', ' US.',
       ' US.', ' Japan.', ' US.', ' Europe.', ' US.', ' Europe.',
       ' Europe.', ' US.', ' Japan.', ' US.', ' Europe.', ' US.',
       ' Europe.', ' Europe.', ' Europe.', ' Europe.', ' US.', ' US.',
       ' Japan.', ' Japan.', ' US.', ' US.', ' Europe.', ' US.',
       ' Europe.', ' Europe.', ' Europe.', ' US.', ' US.', ' Japan.',
       ' US.', ' US.

In [153]:
print("Accuracy sklearn:",metrics.accuracy_score(train_labels, predictions))

Accuracy sklearn: 0.9945054945054945


In [154]:
clf.tree_.node_count

61