In [30]:
import numpy as np


In [31]:
def read_dataset(filepath):
    """ Read in the dataset from the specified filepath

    Args:
        filepath (str): The filepath to the dataset file

    Returns:
        tuple: returns a tuple of (x, y, classes), each being a numpy array. 
               - x is a numpy array with shape (N, K), 
                   where N is the number of instances
                   K is the number of features/attributes
               - y is a numpy array with shape (N, ), and each element should be 
                   an integer from 0 to C-1 where C is the number of classes 
               - classes : a numpy array with shape (C, ), which contains the 
                   unique class labels corresponding to the integers in y
    """

    x = []
    y_labels = []
    for line in open(filepath):
        if line.strip() != "": # handle empty rows in file
            row = line.strip().split(",")
            x.append(list(map(float, row[:-1]))) 
            y_labels.append(row[-1])
    

    x = np.array(x, dtype=int)
    y = np.array(y_labels)
    return (x, y)


In [32]:
def proportion(y_label): 
    label_set = np.unique(y_label)
    for label in label_set: 
        print(f"{label}: {np.sum(label == y_label)*100/len(y_label)}") 

In [33]:
def column_range(x):
    max_range = np.max(x, axis = 0)
    min_range = np.min(x, axis = 0)
    range = max_range-min_range
    range = np.array(range, dtype=int)
    print(range)

In [34]:
(x_full, y_full_label) = read_dataset("data/train_full.txt")
# print(x_full)
print(y_full_label)
proportion(y_full_label)
print(x_full.shape)
column_range(x_full)

['Q' 'Q' 'G' ... 'C' 'E' 'O']
A: 17.102564102564102
C: 15.35897435897436
E: 16.897435897435898
G: 17.205128205128204
O: 16.333333333333332
Q: 17.102564102564102
(3900, 16)
[10 15 10 12 14 12 14 10 12 10 11 11 12 14 11 13]


In [35]:
(x_noisy, y_noisy_label) = read_dataset("data/train_noisy.txt")
# print(x_noisy)
print(y_noisy_label)
proportion(y_noisy_label)
print(x_noisy.shape)

['Q' 'Q' 'G' ... 'O' 'E' 'A']
A: 17.46153846153846
C: 14.64102564102564
E: 17.384615384615383
G: 15.948717948717949
O: 17.076923076923077
Q: 17.487179487179485
(3900, 16)


In [36]:
(x_sub, y_sub_label) = read_dataset("data/train_sub.txt")
# print(x_sub)
# print(y_sub_label)
proportion(y_sub_label)
print(x_sub.shape)


A: 15.666666666666666
C: 31.166666666666668
E: 21.5
G: 3.5
O: 18.833333333333332
Q: 9.333333333333334
(600, 16)


In [37]:
def noise_cal2(x_full, x_noisy, y_full, y_noisy):
    #count = 0
    for i, row in enumerate(x_full):
        noisy_indexes = np.where((x_noisy == row).all(axis=1))
        for index in noisy_indexes[0]:
            if y_noisy[index] == y_full[i]:
                # count += 1
                x_noisy = np.delete(x_noisy, index, 0)
                y_noisy = np.delete(y_noisy, index, 0)

    print(100 - (x_noisy.shape[0] * 100/x_full.shape[0]))
#noise_cal(x_full, x_noisy, y_full_label, y_noisy_label)
noise_cal2(x_full, x_noisy, y_full_label, y_noisy_label)


84.71794871794872


In [38]:
class Node:
    def __init__(self, split_val=-1, column=-1, label=None):
        self.left = None
        self.right = None
        self.split_val = split_val
        self.column = column
        self.label = label
        

    def add_child(self, child): 
        if self.left == None:
            self.left = child
        elif self.right == None:
            self.right = child
        else:
            print("no")

    def print_tree(self, d = 0):
        print("Depth = ", d)
        if self.left == None and self.right == None:
            if self.label != None: 
                print(self.label)
            return
        
        # else: 
        print(self.split_val,' ',self.column)
        if self.right != None: 
            print("right")
            d += 1
            self.right.print_tree(d)
        if self.left != None:
            print("left")
            self.left.print_tree(d)
        
        

In [114]:
#############################################################################
# Introduction to Machine Learning
# Coursework 1 Skeleton code
# Prepared by: Josiah Wang
#
# Your tasks: Complete the fit() and predict() methods of DecisionTreeClassifier.
# You are free to add any other methods as needed. 
##############################################################################


class DecisionTreeClassifier(object):
    """ Basic decision tree classifier
    
    Attributes:
    is_trained (bool): Keeps track of whether the classifier has been trained
    
    Methods:
    fit(x, y): Constructs a decision tree from data X and label y
    predict(x): Predicts the class label of samples X
    prune(x_val, y_val): Post-prunes the decision tree
    """

    def __init__(self):
        self.is_trained = False
        self.root = None

    def calculate_entropy(self, y):
        # Get unique labels
        label_set = np.unique(y)
        entropy = 0
        for label in label_set: 
            entropy += -np.sum(label == y)/len(y) * np.log2(np.sum(label == y)/len(y))
     
        return entropy

    def info_gain_calc(self, x, y, x_val, sort_col):

        # Calculate total entropy for overall data
        total_entropy = self.calculate_entropy(y)

        # Calculate entropy for left and right of split
        left_entropy = self.calculate_entropy(y[x[:, sort_col] < x_val])
        right_entropy = self.calculate_entropy(y[x[:, sort_col] >= x_val])

        # Calculate info gain
        info_gain = total_entropy - ((len(y[x[:, sort_col] < x_val])/len(y) * left_entropy) 
                                    + (len(y[x[:, sort_col] >= x_val])/len(y) * right_entropy))
        
        return info_gain

    def find_best_node(self, x, y):
        y = y.reshape(-1, 1)

        # To keep track of info gain
        max_gain = -1
        value_to_split_on = -1
        column_to_split_on = -1
                
        # loop through each column 
        for i in range(x.shape[1]):
            # sort by that column
            index_list = x[:, i].argsort()
            x = x[index_list]
            y = y[index_list]

            starting_label = y[0]
            starting_val = x[:, i][0]

            # loop through the column
            for x_val, y_val in zip(x[:, i], y):
                if (y_val != starting_label) and (x_val != starting_val):

                    # calculate information gain
                    info_gain = self.info_gain_calc(x, y, x_val, i)
                    
                    # update the max information gain
                    if info_gain > max_gain:
                        max_gain = info_gain
                        value_to_split_on = x_val
                        column_to_split_on = i
                    # Update starting label
                    starting_label = y_val
                    starting_val = x_val
                    
        return Node(value_to_split_on, column_to_split_on)


    def split_dataset(self, x, y, parent):
        child_data = []
        index = np.where(x[:, parent.column] == parent.split_val)[0][0]
        left_x = x[:index]
        left_y = y[:index]
        right_x = x[index:]
        right_y = y[index:]
        child_data.append((left_x, left_y))
        child_data.append((right_x, right_y))

        
        return child_data

    def induce_decision_tree(self, x, y):
        # check y count is 1 or node column returns -1
        if (len(np.unique(y)) == 1 or x.shape[0] == 1):
            leaf_node = Node(label=y[0])
            return leaf_node

        else:
            # find best node
            parent = self.find_best_node(x, y)
            index_list = x[:, parent.column].argsort()
            x = x[index_list]
            y = y[index_list]
            
            # get left and right datasets
            if parent.split_val < 0:
                label_set, count = np.unique(y, return_counts=True)
                label = label_set[np.argmax(count)]

                leaf_node = Node(label=label)
                return leaf_node
            child_data = self.split_dataset(x, y, parent)
            
            for i in child_data: 
                child_node = self.induce_decision_tree(i[0], i[1])
                parent.add_child(child_node)
            return parent 
            
    
    def fit(self, x, y):
        """ Constructs a decision tree classifier from data
        
        Args:
        x (numpy.ndarray): Instances, numpy array of shape (N, K) 
                           N is the number of instances
                           K is the number of attributes
        y (numpy.ndarray): Class labels, numpy array of shape (N, )
                           Each element in y is a str 
        """
        
        # Make sure that x and y have the same number of instances
        assert x.shape[0] == len(y), \
            "Training failed. x and y must have the same number of instances."
        
        #######################################################################
        #                 ** TASK 2.1: COMPLETE THIS METHOD **
        #######################################################################

        self.root = self.induce_decision_tree(x, y)
        #self.root.print_tree()


        
        # set a flag so that we know that the classr axis 0 with size ifier has been trained
        self.is_trained = True

    def traverseTree(self, instance, node):

        if node.label != None:
            return node.label

        if instance[node.column] >= node.split_val:
            return self.traverseTree(instance, node.right)
        
        return self.traverseTree(instance, node.left)
    
    def predict(self, x):
        """ Predicts a set of samples using the trained DecisionTreeClassifier.
        
        Assumes that the DecisionTreeClassifier has already been trained.
        
        Args:
        x (numpy.ndarray): Instances, numpy array of shape (M, K) 
                           M is the number of test instances
                           K is the number of attributes
        
        Returns:
        numpy.ndarray: A numpy array of shape (M, ) containing the predicted
                       class label for each instance in x
        """
        
        # make sure that the classifier has been trained before predicting
        if not self.is_trained:
            raise Exception("DecisionTreeClassifier has not yet been trained.")
        
        # set up an empty (M, ) numpy array to store the predicted labels 
        # feel free to change this if needed
        predictions = np.zeros((x.shape[0],), dtype=object)

        for i in range(len(x)):
            label = self.traverseTree(x[i], self.root)
            predictions[i] = label
        
        
        #######################################################################
        #                 ** TASK 2.2: COMPLETE THIS METHOD **
        #######################################################################
        
    
        # remember to change this if you rename the variable
        return predictions



In [40]:
(x_toy, y_toy) = read_dataset("data/train_full.txt")
foo = DecisionTreeClassifier()
foo.fit(x_toy, y_toy)

Depth =  0
3   10
right
Depth =  1
7   5
right
Depth =  2
4   14
right
Depth =  3
4   7
right
Depth =  4
3   12
right
Depth =  5
6   14
right
Depth =  6
8   10
right
Depth =  7
10   1
right
Depth =  8
6   7
right
Depth =  9
5   0
right
Depth =  10
Q
left
Depth =  10
E
left
Depth =  9
C
left
Depth =  8
6   3
right
Depth =  9
6   8
right
Depth =  10
9   1
right
Depth =  11
E
left
Depth =  11
Q
left
Depth =  10
Q
left
Depth =  9
9   1
right
Depth =  10
C
left
Depth =  10
5   0
right
Depth =  11
Q
left
Depth =  11
E
left
Depth =  7
6   15
right
Depth =  8
9   5
right
Depth =  9
6   13
right
Depth =  10
9   15
right
Depth =  11
3   4
right
Depth =  12
13   1
right
Depth =  13
6   8
right
Depth =  14
Q
left
Depth =  14
E
left
Depth =  13
Q
left
Depth =  12
10   5
right
Depth =  13
Q
left
Depth =  13
E
left
Depth =  11
11   1
right
Depth =  12
8   0
right
Depth =  13
G
left
Depth =  13
O
left
Depth =  12
Q
left
Depth =  10
A
left
Depth =  9
7   7
right
Depth =  10
11   11
right
Depth =  11
8 

In [41]:
def accuracy(x, y):
    return np.sum(x==y)*100/len(x)

In [42]:
from numpy.random import default_rng

def split_dataset(x, y, test_proportion, random_generator=default_rng()):
    """ Split dataset into training and test sets, according to the given
        test set proportion.

    Args:
        x (np.ndarray): Instances, numpy array with shape (N,K)
        y (np.ndarray): Class labels, numpy array with shape (N,)
        test_proportion (float): the desired proportion of test examples
                                 (0.0-1.0)
        random_generator (np.random.Generator): A random generator

    Returns:
        tuple: returns a tuple of (x_train, x_test, y_train, y_test)
               - x_train (np.ndarray): Training instances shape (N_train, K)
               - x_test (np.ndarray): Test instances shape (N_test, K)
               - y_train (np.ndarray): Training labels, shape (N_train, )
               - y_test (np.ndarray): Test labels, shape (N_train, )
    """

    # TODO: Complete this function
    indices = np.arange(len(x))
    random_generator.shuffle(indices)
    x = x[indices]
    y = y[indices]

    x_row, x_col = x.shape
    partition = 1- int(x_row * test_proportion)
    x_train = x[:partition, :]
    x_test = x[partition:, :]
    y_train = y[:partition]
    y_test = y[partition:]
    print(x_test.shape)

    return (x_train, x_test, y_train, y_test)

In [50]:
seed = 6434242
rg = default_rng(seed)

x_train, x_test, y_train, y_test = split_dataset(x_toy, y_toy,
                                                 test_proportion=0.2,
                                                 random_generator=rg)

(779, 16)


In [51]:
foo = DecisionTreeClassifier()
foo.fit(x_train, y_train)

Depth =  0
3   10
right
Depth =  1
7   5
right
Depth =  2
4   14
right
Depth =  3
3   12
right
Depth =  4
8   7
right
Depth =  5
7   9
right
Depth =  6
7   14
right
Depth =  7
7   10
right
Depth =  8
Q
left
Depth =  8
A
left
Depth =  7
4   12
right
Depth =  8
6   10
right
Depth =  9
8   11
right
Depth =  10
10   9
right
Depth =  11
O
left
Depth =  11
C
left
Depth =  10
O
left
Depth =  9
O
left
Depth =  8
7   6
right
Depth =  9
12   11
right
Depth =  10
G
left
Depth =  10
O
left
Depth =  9
3   4
right
Depth =  10
8   5
right
Depth =  11
8   8
right
Depth =  12
9   5
right
Depth =  13
10   1
right
Depth =  14
O
left
Depth =  14
Q
left
Depth =  13
O
left
Depth =  12
Q
left
Depth =  11
O
left
Depth =  10
O
left
Depth =  6
8   5
right
Depth =  7
7   15
right
Depth =  8
5   14
right
Depth =  9
6   11
right
Depth =  10
Q
left
Depth =  10
O
left
Depth =  9
7   10
right
Depth =  10
9   3
right
Depth =  11
Q
left
Depth =  11
O
left
Depth =  10
8   11
right
Depth =  11
4   1
right
Depth =  12
11 

In [45]:
x = (foo.predict(x_test))
print(accuracy(x, y_test))

92.6829268292683


In [61]:
def confusion_matrix(y_gold, y_prediction, class_labels=None):
    
    # if no class_labels are given, we obtain the set of unique class labels from
    # the union of the ground truth annotation and the prediction
    if not class_labels:
        class_labels = np.unique(np.concatenate((y_gold, y_prediction)))

    confusion = np.zeros((len(class_labels), len(class_labels)), dtype=np.int64)

    # for each correct class (row), 
    # compute how many instances are predicted for each class (columns)
    for (i, label) in enumerate(class_labels):
        # get predictions where the ground truth is the current class label
        indices = (y_gold == label)
        gold = y_gold[indices]
        predictions = y_prediction[indices]

        # quick way to get the counts per label
        (unique_labels, counts) = np.unique(predictions, return_counts=True)

        # convert the counts to a dictionary
        frequency_dict = dict(zip(unique_labels, counts))

        # fill up the confusion matrix for the current row
        for (j, class_label) in enumerate(class_labels):
            confusion[i, j] = frequency_dict.get(class_label, 0)

    return confusion

In [82]:
def recall(y_gold, y_prediction):

    confusion = confusion_matrix(y_gold, y_prediction)
    r = np.zeros((len(confusion), ))
    for c in range(confusion.shape[0]):
        if np.sum(confusion[c, :]) > 0:
            r[c] = confusion[c, c] / np.sum(confusion[c, :])

    macro_r = 0.
    if len(r) > 0:
        macro_r = np.mean(r)
    return (r, macro_r)

In [90]:
def precision(y_gold, y_prediction):

    confusion = confusion_matrix(y_gold, y_prediction)
    p = np.zeros((len(confusion), ))
    for c in range(confusion.shape[0]):
        if np.sum(confusion[:, c]) > 0:
            p[c] = confusion[c, c] / np.sum(confusion[:, c])

    macro_p = 0.
    if len(p) > 0:
        macro_p = np.mean(p)
    
    return (p, macro_p)


In [98]:
def f1_score(y_gold, y_prediction):

    (precisions, macro_p) = precision(y_gold, y_prediction)
    (recalls, macro_r) = recall(y_gold, y_prediction)

    # just to make sure they are of the same length
    assert len(precisions) == len(recalls)

    f = np.zeros((len(precisions), ))
    for c, (p, r) in enumerate(zip(precisions, recalls)):
        if p + r > 0:
            f[c] = 2 * p * r / (p + r)

    macro_f = 0.
    if len(f) > 0:
        macro_f = np.mean(f)
    
    return (f, macro_f)

In [106]:
def k_fold_split(n_splits, n_instances, random_generator=default_rng()):

    # generate a random permutation of indices from 0 to n_instances
    shuffled_indices = random_generator.permutation(n_instances)

    # split shuffled indices into almost equal sized splits
    split_indices = np.array_split(shuffled_indices, n_splits)

    return split_indices

In [108]:
def train_test_k_fold(n_folds, n_instances, random_generator=default_rng()):

    # split the dataset into k splits
    split_indices = k_fold_split(n_folds, n_instances, random_generator)

    folds = []
    for k in range(n_folds):
        # pick k as test
        test_indices = split_indices[k]

        # combine remaining splits as train
        # this solution is fancy and worked for me
        # feel free to use a more verbose solution that's more readable
        train_indices = np.hstack(split_indices[:k] + split_indices[k+1:])

        folds.append([train_indices, test_indices])

    return folds


In [110]:
#FULL CROSS-VALIDATION TRAINING

n_folds = 10
models = []
accuracies = np.zeros((n_folds, ))
for i, (train_indices, test_indices) in enumerate(train_test_k_fold(n_folds, len(x_full), rg)):
    # get the dataset from the correct splits
    x_train = x_full[train_indices, :]
    y_train = y_full_label[train_indices]
    x_test = x_full[test_indices, :]
    y_test = y_full_label[test_indices]

    fullModel = DecisionTreeClassifier()
    fullModel.fit(x_train, y_train)
    models.append(fullModel)
    fullModelPredictions = (fullModel.predict(x_test))
    acc = accuracy(y_test, fullModelPredictions)
    accuracies[i] = acc
    

print(accuracies)
print(accuracies.mean())
print(accuracies.std())

Depth =  0
3   10
right
Depth =  1
7   5
right
Depth =  2
4   14
right
Depth =  3
4   7
right
Depth =  4
3   12
right
Depth =  5
6   14
right
Depth =  6
8   10
right
Depth =  7
10   1
right
Depth =  8
6   7
right
Depth =  9
5   0
right
Depth =  10
Q
left
Depth =  10
E
left
Depth =  9
C
left
Depth =  8
8   8
right
Depth =  9
E
left
Depth =  9
4   4
right
Depth =  10
8   2
right
Depth =  11
E
left
Depth =  11
Q
left
Depth =  10
9   1
right
Depth =  11
C
left
Depth =  11
Q
left
Depth =  7
6   15
right
Depth =  8
4   12
right
Depth =  9
7   13
right
Depth =  10
7   7
right
Depth =  11
11   11
right
Depth =  12
7   10
right
Depth =  13
Q
left
Depth =  13
11   3
right
Depth =  14
Q
left
Depth =  14
E
left
Depth =  12
8   4
right
Depth =  13
G
left
Depth =  13
Q
left
Depth =  11
9   12
right
Depth =  12
3   8
right
Depth =  13
11   1
right
Depth =  14
O
left
Depth =  14
10   11
right
Depth =  15
E
left
Depth =  15
G
left
Depth =  13
7   9
right
Depth =  14
O
left
Depth =  14
11   1
right
Dept

In [111]:
print(accuracies)
print(accuracies.mean())
print(accuracies.std())

[88.46153846 92.82051282 92.82051282 94.1025641  91.79487179 92.30769231
 91.79487179 91.79487179 92.30769231 93.07692308]
92.12820512820511
1.3999624302275568


In [145]:
# RANDOM FOREST
n_folds = 10
models = []
for i, (train_indices, test_indices) in enumerate(train_test_k_fold(n_folds, len(x_full), rg)):
    # get the dataset from the correct splits
    x_train = x_full[train_indices, :]
    y_train = y_full_label[train_indices]
    x_test = x_full[test_indices, :]
    y_test = y_full_label[test_indices]

    fullModel = DecisionTreeClassifier()
    fullModel.fit(x_train, y_train)
    models.append(fullModel)

In [146]:
print(models)

[<__main__.DecisionTreeClassifier object at 0x7f2e9aa2a9b0>, <__main__.DecisionTreeClassifier object at 0x7f2e9a9ac910>, <__main__.DecisionTreeClassifier object at 0x7f2eb0e3ad40>, <__main__.DecisionTreeClassifier object at 0x7f2e9aa96440>, <__main__.DecisionTreeClassifier object at 0x7f2e9aa650c0>, <__main__.DecisionTreeClassifier object at 0x7f2e9a8cb250>, <__main__.DecisionTreeClassifier object at 0x7f2e9a8d92d0>, <__main__.DecisionTreeClassifier object at 0x7f2e9a8f4760>, <__main__.DecisionTreeClassifier object at 0x7f2e9a9004c0>, <__main__.DecisionTreeClassifier object at 0x7f2e9a920130>]


In [132]:
#Random Forest Predictions

mode, count = np.unique(predictions, return_counts=True, axis=0)
modePredictions = mode[np.argmax(count)]
print(modePredictions)

['C' 'E' 'E' 'C' 'G' 'G' 'C' 'O' 'C' 'E' 'A' 'A' 'Q' 'G' 'C' 'C' 'C' 'O'
 'O' 'A' 'C' 'G' 'G' 'E' 'C' 'A' 'Q' 'Q' 'G' 'A' 'E' 'Q' 'A' 'O' 'A' 'G'
 'G' 'Q' 'O' 'C' 'Q' 'A' 'A' 'Q' 'G' 'E' 'Q' 'G' 'Q' 'A' 'C' 'C' 'C' 'C'
 'E' 'A' 'C' 'O' 'O' 'C' 'G' 'C' 'A' 'G' 'O' 'O' 'Q' 'G' 'O' 'G' 'A' 'O'
 'G' 'C' 'O' 'C' 'G' 'O' 'A' 'A' 'O' 'E' 'E' 'O' 'A' 'O' 'O' 'C' 'G' 'A'
 'A' 'O' 'G' 'E' 'O' 'G' 'A' 'E' 'G' 'A' 'G' 'G' 'Q' 'G' 'Q' 'C' 'Q' 'Q'
 'C' 'C' 'E' 'G' 'Q' 'G' 'G' 'C' 'C' 'G' 'E' 'E' 'E' 'E' 'Q' 'C' 'A' 'E'
 'O' 'E' 'C' 'C' 'C' 'A' 'C' 'A' 'G' 'O' 'O' 'Q' 'A' 'C' 'Q' 'O' 'O' 'G'
 'G' 'E' 'O' 'A' 'C' 'C' 'C' 'O' 'E' 'Q' 'Q' 'Q' 'E' 'G' 'O' 'G' 'A' 'G'
 'A' 'E' 'E' 'G' 'Q' 'G' 'Q' 'E' 'E' 'G' 'E' 'G' 'O' 'E' 'C' 'E' 'A' 'C'
 'O' 'O' 'E' 'G' 'C' 'Q' 'A' 'C' 'G' 'Q' 'A' 'Q' 'G' 'A' 'Q' 'O' 'G' 'C'
 'A' 'A' 'C' 'E' 'C' 'O' 'C' 'E' 'O' 'E' 'Q' 'A' 'A' 'C' 'O' 'E' 'G' 'E'
 'A' 'G' 'Q' 'A' 'E' 'A' 'O' 'E' 'C' 'E' 'O' 'C' 'A' 'C' 'G' 'O' 'G' 'Q'
 'E' 'C' 'G' 'Q' 'O' 'Q' 'Q' 'O' 'O' 'Q' 'G' 'O' 'E

In [130]:
# RF ACCURACY
print(accuracy(modePredictions, y_full_label))

0.0


  return np.sum(x==y)*100/len(x)


In [99]:
# TRAINING FULL DATASET

seed = 6434242
rg = default_rng(seed)

x_train, x_test, y_train, y_test = split_dataset(x_full, y_full_label,
                                                 test_proportion=0.2,
                                                 random_generator=rg)

fullModel = DecisionTreeClassifier()
fullModel.fit(x_train, y_train)

(779, 16)
Depth =  0
3   10
right
Depth =  1
7   5
right
Depth =  2
4   14
right
Depth =  3
3   12
right
Depth =  4
8   7
right
Depth =  5
7   9
right
Depth =  6
7   14
right
Depth =  7
7   10
right
Depth =  8
Q
left
Depth =  8
A
left
Depth =  7
4   12
right
Depth =  8
6   10
right
Depth =  9
8   11
right
Depth =  10
10   9
right
Depth =  11
O
left
Depth =  11
C
left
Depth =  10
O
left
Depth =  9
O
left
Depth =  8
7   6
right
Depth =  9
12   11
right
Depth =  10
G
left
Depth =  10
O
left
Depth =  9
3   4
right
Depth =  10
8   5
right
Depth =  11
8   8
right
Depth =  12
9   5
right
Depth =  13
10   1
right
Depth =  14
O
left
Depth =  14
Q
left
Depth =  13
O
left
Depth =  12
Q
left
Depth =  11
O
left
Depth =  10
O
left
Depth =  6
8   5
right
Depth =  7
7   15
right
Depth =  8
5   14
right
Depth =  9
6   11
right
Depth =  10
Q
left
Depth =  10
O
left
Depth =  9
7   10
right
Depth =  10
9   3
right
Depth =  11
Q
left
Depth =  11
O
left
Depth =  10
8   11
right
Depth =  11
4   1
right
Depth

In [67]:
# FULL ACCURACY
fullModelPredict = (fullModel.predict(x_test))
print(accuracy(fullModelPredict, y_test))

92.6829268292683


In [80]:
#FULL CONFUSION MATRIX
confusion_full = confusion_matrix(y_test, fullModelPredict)
print(confusion_full)

[[130   3   0   3   0   1]
 [  2 106   0   2   3   0]
 [  0   3 133   1   0   2]
 [  3   2   8 116   1   1]
 [  0   3   0   2 116   6]
 [  3   2   1   0   5 121]]


In [83]:
#FULL RECALL
(r_full, macro_r_full) = recall(y_test, fullModelPredict)
print(r_full)
print(macro_r_full)

[0.94890511 0.9380531  0.95683453 0.88549618 0.91338583 0.91666667]
0.9265569026421187


In [93]:
#FULL PRECISION
(p_full, macro_p_full) = precision(y_test, fullModelPredict)
print(p_full)
print(macro_p_full)

[0.94202899 0.8907563  0.93661972 0.93548387 0.928      0.92366412]
0.9260921665738767


In [100]:
#FULL F1
(f1_full, macro_f1_full) = f1_score(y_test, fullModelPredict)
print(f1_full)
print(macro_f1_full)

[0.94545455 0.9137931  0.94661922 0.90980392 0.92063492 0.92015209]
0.9260762999071622


In [None]:
#FULL CROSS-VALIDATION


In [101]:
# TRAINING SUB DATASET

seed = 6434242
rg = default_rng(seed)

x_train, x_test, y_train, y_test = split_dataset(x_sub, y_sub_label,
                                                 test_proportion=0.2,
                                                 random_generator=rg)

subModel = DecisionTreeClassifier()
subModel.fit(x_train, y_train)

(119, 16)
Depth =  0
4   10
right
Depth =  1
7   5
right
Depth =  2
6   7
right
Depth =  3
5   14
right
Depth =  4
10   1
right
Depth =  5
4   12
right
Depth =  6
7   9
right
Depth =  7
10   14
right
Depth =  8
A
left
Depth =  8
O
left
Depth =  7
C
left
Depth =  6
8   0
right
Depth =  7
C
left
Depth =  7
G
left
Depth =  5
9   8
right
Depth =  6
E
left
Depth =  6
7   14
right
Depth =  7
9   15
right
Depth =  8
E
left
Depth =  8
7   0
right
Depth =  9
Q
left
Depth =  9
G
left
Depth =  7
3   4
right
Depth =  8
Q
left
Depth =  8
G
left
Depth =  4
11   11
right
Depth =  5
5   0
right
Depth =  6
C
left
Depth =  6
3   0
right
Depth =  7
O
left
Depth =  7
C
left
Depth =  5
7   9
right
Depth =  6
8   13
right
Depth =  7
6   6
right
Depth =  8
O
left
Depth =  8
8   1
right
Depth =  9
O
left
Depth =  9
Q
left
Depth =  7
C
left
Depth =  6
8   6
right
Depth =  7
O
left
Depth =  7
8   5
right
Depth =  8
Q
left
Depth =  8
O
left
Depth =  3
4   12
right
Depth =  4
4   7
right
Depth =  5
7   10
right
D

In [86]:
# SUB MODEL ACCURACY
subModelPredict = (subModel.predict(x_test))
print(accuracy(subModelPredict, y_test))

81.5126050420168


In [72]:
#SUB CONFUSION MATRIX
confusion_sub = confusion_matrix(y_test, subModelPredict)
print(confusion_sub)

[[16  0  1  0  0  0]
 [ 0 38  0  2  0  1]
 [ 0  3 19  0  1  0]
 [ 0  0  2  3  1  1]
 [ 0  3  2  2 15  1]
 [ 0  0  0  0  2  6]]


In [87]:
#SUB RECALL
(r_sub, macro_r_sub) = recall(y_test, subModelPredict)
print(r_sub)
print(macro_r_sub)

[0.94117647 0.92682927 0.82608696 0.42857143 0.65217391 0.75      ]
0.7541396728362607


In [95]:
#SUB PRECISION
(p_sub, macro_p_sub) = precision(y_test, subModelPredict)
print(p_sub)
print(macro_p_sub)

[1.         0.86363636 0.79166667 0.42857143 0.78947368 0.66666667]
0.7566691349586087


In [102]:
#SUB F1
(f1_sub, macro_f1_sub) = f1_score(y_test, subModelPredict)
print(f1_sub)
print(macro_f1_sub)

[0.96969697 0.89411765 0.80851064 0.42857143 0.71428571 0.70588235]
0.7535107918086642


In [103]:
# TRAINING NOISY DATASET

seed = 6434242
rg = default_rng(seed)

x_train, x_test, y_train, y_test = split_dataset(x_noisy, y_noisy_label,
                                                 test_proportion=0.2,
                                                 random_generator=rg)

noisyModel = DecisionTreeClassifier()
noisyModel.fit(x_train, y_train)

(779, 16)
Depth =  0
3   10
right
Depth =  1
7   5
right
Depth =  2
6   7
right
Depth =  3
5   14
right
Depth =  4
8   5
right
Depth =  5
10   1
right
Depth =  6
9   3
right
Depth =  7
7   9
right
Depth =  8
8   2
right
Depth =  9
Q
left
Depth =  9
O
left
Depth =  8
Q
left
Depth =  7
7   14
right
Depth =  8
8   15
right
Depth =  9
9   15
right
Depth =  10
10   2
right
Depth =  11
O
left
Depth =  11
5   12
right
Depth =  12
G
left
Depth =  12
7   4
right
Depth =  13
Q
left
Depth =  13
G
left
Depth =  10
6   6
right
Depth =  11
6   0
right
Depth =  12
O
left
Depth =  12
Q
left
Depth =  11
Q
left
Depth =  9
A
left
Depth =  8
4   12
right
Depth =  9
5   6
right
Depth =  10
O
left
Depth =  10
Q
left
Depth =  9
8   0
right
Depth =  10
O
left
Depth =  10
G
left
Depth =  6
8   3
right
Depth =  7
2   8
right
Depth =  8
Q
left
Depth =  8
11   15
right
Depth =  9
Q
left
Depth =  9
6   0
right
Depth =  10
A
left
Depth =  10
O
left
Depth =  7
6   1
right
Depth =  8
8   15
right
Depth =  9
9   5
rig

In [76]:
#NOISY ACCURACY
noisyModelPredict = (noisyModel.predict(x_test))
print(accuracy(noisyModelPredict, y_test))

85.2374839537869


In [77]:
#NOISY CONFUSION MATRIX
confusion_noisy = confusion_matrix(y_test, noisyModelPredict)
print(confusion_noisy)

[[128   1   1   1   3   3]
 [  0  97   6  14   1   0]
 [  1   1 134   5   1   3]
 [  2   8   3  85   6   7]
 [  0   4   2   5 114   9]
 [  2   0   2  16   8 106]]


In [89]:
#SUB RECALL
(r_noisy, macro_r_noisy) = recall(y_test, noisyModelPredict)
print(r_noisy)
print(macro_r_noisy)

[0.93430657 0.8220339  0.92413793 0.76576577 0.85074627 0.79104478]
0.8480058682040864


In [97]:
#NOISY PRECISION
(p_noisy, macro_p_noisy) = precision(y_test, noisyModelPredict)
print(p_noisy)
print(macro_p_noisy)

[0.96240602 0.87387387 0.90540541 0.67460317 0.85714286 0.828125  ]
0.8502593876771507


In [104]:
#NOISY F1
(f1_noisy, macro_f1_noisy) = f1_score(y_test, noisyModelPredict)
print(f1_noisy)
print(macro_f1_noisy)

[0.94814815 0.84716157 0.91467577 0.71729958 0.85393258 0.80916031]
0.8483963259651475
