# Exercise 4
## Fundamentals of Machine Learning - WiSe 20/21

#### Authors: Catherine Knobloch, Elias Olofsson, Julia Siegl

#### Version information:
        2020-12-22: v.1.0. First public release.

# Preliminaries

In [1]:
# import modules
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
pd.options.display.float_format = '{:,.2f}'.format

import sklearn
from sklearn.datasets import load_digits

In [2]:
# base classes

class Node:
    pass

class Tree:
    def __init__(self):
        self.root = Node()
    
    def find_leaf(self, x):
        node = self.root
        while hasattr(node, "feature"):
            j = node.feature
            if x[j] <= node.threshold:
                node = node.left
            else:
                node = node.right
        return node

<span style="color:green;font-weight:bold">Base tree structure untouched from the given code template.</span>

# Density Tree

In [3]:
class DensityTree(Tree):
    def __init__(self):
        super(DensityTree, self).__init__()
        
    def train(self, data, prior, n_min=20):
        '''
        data: the feature matrix for the digit under consideration
        prior: the prior probability of this digit
        n_min: termination criterion (don't split if a node contains fewer instances)
        '''
        self.prior = prior
        N, D = data.shape
        D_try = int(np.sqrt(D)) # number of features to consider for each split decision

        # find and remember the tree's bounding box, 
        # i.e. the lower and upper limits of the training feature set
        m, M = np.min(data, axis=0), np.max(data, axis=0)
        self.box = m.copy(), M.copy()
        
        # identify invalid features and adjust the bounding box
        # (If m[j] == M[j] for some j, the bounding box has zero volume, 
        #  causing divide-by-zero errors later on. We must exclude these
        #  features from splitting and adjust the bounding box limits 
        #  such that invalid features have no effect on the volume.)
        valid_features   = np.where(m != M)[0]
        invalid_features = np.where(m == M)[0]
        M[invalid_features] = m[invalid_features] + 1

        # initialize the root node
        self.root.data = data
        self.root.box = m.copy(), M.copy()

        # build the tree
        stack = [self.root]
        while len(stack):
            node = stack.pop()
            n = node.data.shape[0] # number of instances in present node
            if n >= n_min:
                # Call 'make_density_split_node()' with 'D_try' randomly selected 
                # indices from 'valid_features'. This turns 'node' into a split node
                # and returns the two children, which must be placed on the 'stack'.
                rand_indices = np.random.permutation(valid_features)[:D_try]
                left, right = make_density_split_node(node, N, rand_indices)
                stack.append(left)
                stack.append(right)
            else:
                # Call 'make_density_leaf_node()' to turn 'node' into a leaf node.
                make_density_leaf_node(node, N)

    def predict(self, x):
        leaf = self.find_leaf(x)
        # return p(x | y) * p(y) if x is within the tree's bounding box 
        # and return 0 otherwise
        if np.all(x >= self.box[0]) and np.all(x <= self.box[1]): 
            return leaf.response * self.prior
        else: 
            return 0 

<span style="color:green;font-weight:bold">Our code is essentially identical to the sample solution.</span>

In [4]:
def make_density_split_node(node, N, feature_indices):
    '''
    node: the node to be split
    N:    the total number of training instances for the current class
    feature_indices: a numpy array of length 'D_try', containing the feature 
                     indices to be considered in the present split
    '''
    n, D = node.data.shape
    m, M = node.box

    # find best feature j (among 'feature_indices') and best threshold t for the split
    e_min = float("inf")
    j_min, t_min = None, None
    
    for j in feature_indices:
        # Hint: For each feature considered, first remove duplicate feature values using 
        # 'np.unique()'. This is necessary since placing a threshold between two identical feature values
        # would create ambiguity as to which node a value is belong to.
        data_unique = np.unique(node.data[:, j])
        # Compute candidate thresholds
        tj = (data_unique[1:] + data_unique[:-1]) / 2
        
        # Illustration: for loop - hint: vectorized version is possible
        for t in tj:
            # Number of instances in left and right nodes
            n_left  = (node.data[:,j] < t).sum() 
            n_right = n - n_left

            # Bounding boxes for left and right node
            m_left = m
            M_left = M.copy()
            M_left[j] = t  
            
            m_right = m.copy()
            m_right[j] = t
            M_right = M

            v_left  = np.prod(M_left-m_left) 
            v_right = np.prod(M_right-m_right)

            # Compute the errors
            loo_error_left  = n_left/(N*v_left)*(n_left/N - 2*(n_left-1)/(N-1))
            loo_error_right = n_right/(N*v_right)*(n_right/N - 2*(n_right-1)/(N-1))

            loo_error = loo_error_left + loo_error_right
            
            # choose the best threshold that
            if loo_error < e_min:
                e_min = loo_error
                j_min = j
                t_min = t

    # create children
    left = Node()
    right = Node()
    
    # initialize 'left' and 'right' with the data subsets and bounding boxes
    # according to the optimal split found above

    m_left         = m.copy()
    M_left         = M.copy()
    M_left[j_min]  = t_min

    m_right        = m.copy()
    m_right[j_min] = t_min
    M_right        = M.copy()

    # store bounding box in left and right nodes
    left.box  = m_left, M_left   
    right.box = m_right, M_right

    # store data in left node -- for subsequent splits
    left.data  = node.data[(node.data[:,j_min] < t_min), :]
    right.data = node.data[(node.data[:,j_min] > t_min), :]

    # turn the current 'node' into a split node
    # (store children and split condition)
    node.left      = left
    node.right     = right
    node.feature   = j_min
    node.threshold = t_min

    # return the children (to be placed on the stack)
    return left, right

<span style="color:green;font-weight:bold">Our code is in its approach essentially identical to the sample solution. However, the sample solution is a bit less cluttered and easier to read and understand than to our implementation. Furthermore, the volume computation of the left and right nodes are done in a much more clever and efficient way in the sample solution, compared to our brute-force way of doing it.</span>

In [5]:
def make_density_leaf_node(node, N):
    '''
    node: the node to become a leaf
    N:    the total number of training instances for the current class
    '''
    # compute and store leaf response
    n = node.data.shape[0]
    m, M = node.box
    v = np.prod(M-m)
    node.response = n/(N*v)

<span style="color:green;font-weight:bold">Identical to the sample solution.</span>

# Decision Tree

In [6]:
class DecisionTree(Tree):
    def __init__(self):
        super(DecisionTree, self).__init__()
        
    def train(self, data, labels, n_min=20):
        '''
        data: the feature matrix for all digits
        labels: the corresponding ground-truth responses
        n_min: termination criterion (don't split if a node contains fewer instances)
        '''
        N, D = data.shape
        D_try = int(np.sqrt(D)) # how many features to consider for each split decision

        # initialize the root node
        self.root.data = data
        self.root.labels = labels
        
        stack = [self.root]
        while len(stack):
            node = stack.pop()
            n = node.data.shape[0] # number of instances in present node
            if n >= n_min and not node_is_pure(node):
                # Call 'make_decision_split_node()' with 'D_try' randomly selected 
                # feature indices. This turns 'node' into a split node
                # and returns the two children, which must be placed on the 'stack'.
                rand_indices = np.random.permutation(D)[:D_try]
                left, right = make_decision_split_node(node, rand_indices)
                stack.append(left)
                stack.append(right)
            else:
                # Call 'make_decision_leaf_node()' to turn 'node' into a leaf node.
                make_decision_leaf_node(node)
                
    def predict(self, x):
        leaf = self.find_leaf(x)
        # compute p(y | x)
        return leaf.response

<span style="color:green;font-weight:bold">Essentially identical to the sample solution.</span>

In [7]:
def make_decision_split_node(node, feature_indices):
    '''
    node: the node to be split
    feature_indices: a numpy array of length 'D_try', containing the feature 
                     indices to be considered in the present split
    '''
    n, D = node.data.shape

    e_min = float("inf")
    j_min, t_min = None, None

    # find best feature j (among 'feature_indices') and best threshold t for the split
    for j in feature_indices:
        data_unique = np.unique(node.data[:, j])
        
        # Compute candidate thresholds
        tj = (data_unique[1:] + data_unique[:-1]) / 2

        for t in tj:
            # Create masks for each node.
            mask_left  = (node.data[:,j] < t)
            mask_right = (node.data[:,j] > t)

            # Total number of instances in each node 
            n_left  = mask_left.sum()
            n_right = n - n_left

            # Number of instances per class in each node
            c_left, n_left_k   = np.unique(node.labels[mask_left], return_counts=True)
            c_right, n_right_k = np.unique(node.labels[mask_right], return_counts=True)

            # Calculating the Gini impurities for each node
            gini_left  = n_left  * (1 - np.sum(n_left_k**2)/n_left**2)
            gini_right = n_right * (1 - np.sum(n_right_k**2)/n_right**2)

            gini = gini_left + gini_right

            if gini < e_min:
                e_min = gini
                j_min = j
                t_min = t


    # create children
    left  = Node()
    right = Node()
    
    # initialize 'left' and 'right' with the data subsets and labels
    # according to the optimal split found above
    mask_left   = (node.data[:,j_min] < t_min)
    mask_right  = (node.data[:,j_min] > t_min)

    # data in left and right node
    left.data   = node.data[mask_left,:] 
    right.data  = node.data[mask_right,:]

    # corresponding labels in left and right node
    left.labels  = node.labels[mask_left] 
    right.labels = node.labels[mask_right] 

    # turn the current 'node' into a split node
    # (store children and split condition)
    node.left       = left
    node.right      = right
    node.feature    = j_min
    node.threshold  = t_min

    # return the children (to be placed on the stack)
    return left, right    

<span style="color:green;font-weight:bold">Same as above, our code is in the approach used esssentially the same as the sample solution. But also similarly to above, I perceive the sample solution as more concise and less cluttered, and thus eaiser to understand.</span>

In [8]:
def make_decision_leaf_node(node):
    '''
    node: the node to become a leaf
    '''
    # Compute and store leaf response.
    # Choose the majority class label as the response of the node.
    node.N = node.data.shape[0]
    node.response = np.bincount(node.labels).argmax()

<span style="color:green;font-weight:bold">Here we assign the node response as the majority label of the current node, while the sample solution assignes the repsonse to be an array of posterior probabilities for each of the 10 digits, calculated from the relative frequencies of labels within the node. I'm not sure which approach is the better way of doing it, the only reason we chose our current approach is since this is how the classifier was fomulated in the lecture. I can however see that the sample solution's way may be beneficial for later analysis, since you don't "throw away" any information from within the nodes.</span>

In [9]:
def node_is_pure(node):
    '''
    check if 'node' ontains only instances of the same digit
    '''
    # Compare all labels in the node to the first label within the node.
    return np.all(node.labels[0] == node.labels)

<span style="color:green;font-weight:bold">Slightly different method, but same result as the sample solution.</span>

# Evaluation of Density and Decision Tree

In [10]:
# Import the digits dataset
digits = load_digits()

print(digits.keys())

data         = digits["data"]
images       = digits["images"]
target       = digits["target"]
target_names = digits["target_names"]

dict_keys(['data', 'target', 'frame', 'feature_names', 'target_names', 'images', 'DESCR'])


### Density tree evaluation

In [11]:
n_min = 20
density_trees = []
for digit in range(10):
    # Append a new density tree to list.
    density_trees.append(DensityTree()) 
    
    # Only consider a single digit
    mask = (target == digit)
    
    # Calculate the prior for the current digit
    prior = np.sum(mask)/target.shape[0]

    # Train the current density tree
    density_trees[digit].train(data[mask], prior, n_min)


<span style="color:green;font-weight:bold">I believe that the sample solution had a much sleeker looking implementation of the generative classifier due to creation of a new class for it. However, our solution is essenially the same, just a bit less fancy way of doing it. Dividing the dataset into subsets is a thing we could have done though to make life a bit simpler for ourselves.</span>

In [12]:
# Get predictions on the training data
posterior = np.zeros(10)
y_pred    = np.zeros_like(target)

for i in range(data.shape[0]):
    # Extract training instance
    x = data[i,:]

    # For each digit, get the posterior probability for x
    for digit in range(10):
        posterior[digit] = density_trees[digit].predict(x)

    # Choose the label that has the largest probability
    y_pred[i] = target_names[np.argmax(posterior)] 

# Get training error
density_tree_train_err = np.sum(target != y_pred)/data.shape[0]
print(f'Density tree training error: {density_tree_train_err:.4}')

Density tree training error: 0.1953


<span style="color:green;font-weight:bold">Same as above, our solution accomplishes essentially the same as the sample solution, however not as stylishly.</span>

In [13]:
def confuse(groundtruth, predictions, normalize=False):
    '''
    Create a confusion matrix.

    Parameters:
    ----------
    groundtruth: np.array
        vector of shape (N,) containing the actual labels.
    predictions: np.array
        vector of shape (N,) containing the predicted labels, corresponding to
        the ground thruth vector.
    normalize: bool
        Normalize each row.
    
    Returns:
    --------
    np.array shape=(N,N)
        Confusion matrix with row numbering corresponding to the ground truth,
        and columns corresponding to predicted labels.
    '''
    # Make sure sizes of input vectors are valid.
    assert groundtruth.shape == predictions.shape 
    N = groundtruth.shape[0]

    # Get the unique labels. 
    unique_labels = np.unique(groundtruth)
    C = unique_labels.shape[0]
    
    # Pre-allocation.
    conf_mat = np.zeros((C,C))

    # For each class in the ground truth
    for i in range(C):
        mask_gr = (groundtruth == unique_labels[i])

        # For each class of the predictions
        for j in range(C):
            mask_pred = (predictions == unique_labels[j])
            conf_mat[i,j] = np.sum(mask_gr & mask_pred)
        
        #Normalize row
        if normalize:
            conf_mat[i,:] = conf_mat[i,:]/np.sum(conf_mat[i,:])
    
    return conf_mat

<span style="color:green;font-weight:bold">We created a dedicated function for getting the confusion matricies, which is nice and all, but the sample solution had a much simpler way of arriving at the same place. A single line of code for the confusion matrix, quite nice I have to say.</span>

In [14]:
# Confusion matrix for the density tree
confusion_mat = confuse(target, y_pred)

def fade_zeros(s):
    return ['color: lightgray' if (v == 0) else 'color: black' for v in s]

display(
    pd.DataFrame(data=confusion_mat, index=target_names, columns=target_names)
    .rename_axis('Actual Label', axis = 'rows')
    .rename_axis('Predicted Label', axis = 'columns')
    .style.apply(fade_zeros)
    .format('{:3.0f}')
)

Predicted Label,0,1,2,3,4,5,6,7,8,9
Actual Label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,178,0,0,0,0,0,0,0,0,0
1,0,139,13,1,4,1,0,2,22,0
2,0,10,105,19,0,4,0,0,38,1
3,0,2,0,127,0,2,0,5,43,4
4,0,2,0,0,144,3,0,28,1,3
5,0,0,0,23,0,147,0,1,7,4
6,0,0,0,0,0,0,180,0,1,0
7,0,0,0,3,0,0,0,173,3,0
8,0,5,0,5,0,2,0,4,158,0
9,0,7,2,45,4,1,0,6,20,95


<span style="color:green;font-weight:bold">Results are very similar to the sample solution, off-diagonal elements are the quantitatively similar for our confusion matrix as the one in the sample solution. However, or matrix is not normalized, while the sample solution has their nomalized to 100.</span>

### Decision tree evaluation

In [15]:
n_min = 20

# Create decision tree
decision_tree = DecisionTree()

# Train
decision_tree.train(data, target, n_min)

# Get predictions on training data
y_pred = np.zeros_like(target)
for i in range(data.shape[0]):
    y_pred[i] = decision_tree.predict(data[i,:])

# Calculate training error
decision_tree_train_err = np.sum(target != y_pred)/data.shape[0]
print(f'Decision tree training error: {decision_tree_train_err}')

Decision tree training error: 0.11018363939899833


In [16]:
# Get confusion matrix for the decision tree
confusion_mat = confuse(target, y_pred)

display(
    pd.DataFrame(data=confusion_mat, index=target_names, columns=target_names)
    .rename_axis('Actual Label', axis = 'rows')
    .rename_axis('Predicted Label', axis = 'columns')
    .style.apply(fade_zeros)
    .format('{:3.0f}')
)

Predicted Label,0,1,2,3,4,5,6,7,8,9
Actual Label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,165,0,0,1,1,2,2,3,0,4
1,2,158,1,4,1,5,1,1,6,3
2,3,0,162,2,0,6,0,0,4,0
3,1,4,4,163,0,2,0,2,3,4
4,0,4,0,0,171,1,0,4,0,1
5,0,1,0,7,3,163,0,3,4,1
6,0,3,1,2,1,2,172,0,0,0
7,0,0,1,1,1,0,0,169,3,4
8,3,8,5,7,0,9,2,0,125,15
9,1,3,2,2,2,9,0,6,4,151


<span style="color:green;font-weight:bold">Same as above, quantitatively similar to the sample solution. We did however not play around with the parameter n_min, so we have no comparision or discussion for other values than n_min = 20.</span>

# Density and Decision Forest

In [28]:
class DensityForest():
    def __init__(self, n_trees):
        # create ensemble
        self.trees = [DensityTree() for i in range(n_trees)]
    
    def train(self, data, prior, n_min=20):
        for tree in self.trees:
            # train each tree, using a bootstrap sample of the data
            # what is meant by same size as original data? and replacing?
            
            sample_size = 300 
            
            index_data = np.linspace(0,len(data)-1,len(data))
            index_rand_data = np.random.choice( index_data, size=sample_size)
            
            sample_n = data[np.array(index_rand_data).astype(int)] 
            
            #sample = resample(data, replace=True, n_samples=50, random_state=1)
            tree.train(np.array(sample_n), prior)


    def predict(self, x):
        # compute the ensemble prediction
        def most_frequent(List): 
            return max(set(List), key = List.count) 

        predict=[]
        for tree in self.trees:
            predict.append(tree.predict(x))
            
        # return most frequent prediction 
        return  most_frequent(predict)# your code here

<span style="color:green;font-weight:bold">Here we missunderstood what a bootstrap data sample actually means, and thought it was only a random choice smaller than the original sample size. This is not the case however, since the bootstrap sample should have the same size as the original sample. Without having tested the code, I think that our code given above should work correctly if we instead set sample_size = len(data).</span>

In [29]:
class DecisionForest():
    def __init__(self, n_trees):
        # create ensemble
        self.trees = [DecisionTree() for i in range(n_trees)]
    
    def train(self, data, labels, n_min=0):
        for tree in self.trees:
            # train each tree, using a bootstrap sample of the data                       
            sample_size = 300
            
            index_data = np.linspace(0,len(data)-1,len(data))
            index_rand_data = np.random.choice( index_data, size=sample_size)
            
            sample_n = data[np.array(index_rand_data).astype(int)]
            y_n = labels[np.array(index_rand_data).astype(int)]         

            
            #sample = resample(data, replace=True, n_samples=50, random_state=1)
            tree.train(np.array(sample_n),np.array(y_n))

    def predict(self, x):
        # compute the ensemble prediction

        predict=[]
        for tree in self.trees:
            predict.append(tree.predict(x))
 
        return  np.bincount(predict).argmax() #stats.mode(predict, axis=0)[0][0]    # your code here


<span style="color:green;font-weight:bold">Same as above, we missunderstood the bootstrap sampling process. But also, similarly to above, I belive our code might work corretly by simply changing the parameter sample_size to correspond to the full original sample size.</span>

# Evaluation of Density and Decision Forest

### Density tree evaluation

In [30]:
n_min = 20
density_forests = []
for digit in range(10):
    # Append a new density tree to list.
    density_forests.append(DensityForest(20)) 
    
    # Only consider a single digit
    mask = (target == digit)
    
    # Calculate the prior for the current digit
    prior = np.sum(mask)/target.shape[0]

    # Train the current density tree
    density_forests[digit].train(data[mask], prior, n_min)
    
# Get predictions on the training data
posterior = np.zeros(10)
y_pred    = np.zeros_like(target)

for i in range(data.shape[0]):
    # Extract training instance
    x = data[i,:]

    # For each digit, get the posterior probability for x
    for digit in range(10):
        posterior[digit] = density_forests[digit].predict(x)

    # Choose the label that has the largest probability
    y_pred[i] = target_names[np.argmax(posterior)] 

# Get training error
density_forests_train_err = np.sum(target != y_pred)/data.shape[0]
print(f'Density tree training error: {density_forests_train_err:.4}')

Density tree training error: 0.2009


<span style="color:green;font-weight:bold">Similarly to above, the sample solution has a much sleeker solution by creating a new class GenerativeClassifierDensityForest, compared to our more manual way of performing the same task. Otherwise, we essentially perform the same thing.</span>

In [23]:
# Confusion matrix for the density tree
confusion_mat = confuse(target, y_pred)

def fade_zeros(s):
    return ['color: lightgray' if (v == 0) else 'color: black' for v in s]

display(
    pd.DataFrame(data=confusion_mat, index=target_names, columns=target_names)
    .rename_axis('Actual Label', axis = 'rows')
    .rename_axis('Predicted Label', axis = 'columns')
    .style.apply(fade_zeros)
    .format('{:3.0f}')
)

Predicted Label,0,1,2,3,4,5,6,7,8,9
Actual Label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,174,0,0,0,4,0,0,0,0,0
1,14,116,4,0,1,0,0,0,41,6
2,19,5,123,2,0,0,0,0,28,0
3,15,0,4,116,0,1,0,2,36,9
4,18,1,0,0,155,1,0,6,0,0
5,20,4,0,6,0,139,0,0,11,2
6,23,0,0,0,0,0,157,0,1,0
7,21,0,0,0,1,1,0,156,0,0
8,19,9,2,6,0,2,0,4,132,0
9,20,0,0,10,0,3,0,9,12,126


<span style="color:green;font-weight:bold">This confusion matrix has a lot of differences compared to the equivalent one obatained in the sample solution. I believe however that this is due to our incorrect way of getting the bootstrap samples, based on our missunderstanding of the process. </span>

### Decision forst evaluation

In [24]:
n_min = 20

# Create decision forest
decision_forest = DecisionForest(20)

# Train
decision_forest.train(data, target, n_min)

# Get predictions on training data
y_pred = np.zeros_like(target)
for i in range(data.shape[0]):
    y_pred[i] = decision_forest.predict(data[i,:])

# Calculate training error
decision_forest_train_err = np.sum(target != y_pred)/data.shape[0]
print(f'Decision tree training error: {decision_forest_train_err}')

Decision tree training error: 0.06622148024485253


<span style="color:green;font-weight:bold">Essentially the same in method as the sample solution.</span>

In [25]:
# Get confusion matrix for the decision tree
confusion_mat = confuse(target, y_pred)

display(
    pd.DataFrame(data=confusion_mat, index=target_names, columns=target_names)
    .rename_axis('Actual Label', axis = 'rows')
    .rename_axis('Predicted Label', axis = 'columns')
    .style.apply(fade_zeros)
    .format('{:3.0f}')
)

Predicted Label,0,1,2,3,4,5,6,7,8,9
Actual Label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,177,0,0,0,1,0,0,0,0,0
1,0,167,0,0,0,0,3,0,0,12
2,0,2,171,1,0,0,0,0,1,2
3,1,4,4,164,0,4,0,1,4,1
4,0,3,0,0,173,1,0,4,0,0
5,0,5,0,1,1,167,2,0,1,5
6,1,1,0,0,2,0,176,0,1,0
7,0,1,0,0,3,1,0,173,1,0
8,0,10,2,0,1,2,0,2,156,1
9,2,2,0,4,3,6,0,5,4,154


The Training error of the Density forest is not really better than the one from the Density tree. The reason for that is, that we still train with the same data. Maybe it is even a bit worse. But probablby we achieve a larger test set precision since our model does not overfit the trainingsdata any longer. 
In the Decision tree we seem to have a smaller error. But it is hard to jufge which model performs best as we do not have any evalation ond unseen data.

<span style="color:green;font-weight:bold">Here we also have a discrepancy compared to the confusion matrix obtained in the sample solution. Similarly to above, I do also believe here as well that this is due to our misunderstanding of the bootstrap sampling process. Furthermore we only performed the evaluation with parameter n_min = 20, while the sample solution tested additional values.</span>