In [1]:
import numpy as np

In [2]:
def read_dataset(filepath):
    """ Read in the dataset from the specified filepath

    Args:
        filepath (str): The filepath to the dataset file

    Returns:
        tuple: returns a tuple of (x, y, classes), each being a numpy array. 
               - x is a numpy array with shape (N, K), 
                   where N is the number of instances
                   K is the number of features/attributes
               - y is a numpy array with shape (N, ), and each element should be 
                   an integer from 0 to C-1 where C is the number of classes 
               - classes : a numpy array with shape (C, ), which contains the 
                   unique class labels corresponding to the integers in y
    """

    x = []
    y_labels = []
    for line in open(filepath):
        if line.strip() != "": # handle empty rows in file
            row = line.strip().split(",")
            x.append(list(map(float, row[:-1]))) 
            y_labels.append(row[-1])
    

    x = np.array(x, dtype=int)
    y = np.array(y_labels)
    return (x, y)

In [3]:
from numpy.random import default_rng

def split_dataset(x, y, test_proportion, random_generator=default_rng()):
    """ Split dataset into training and test sets, according to the given
        test set proportion.

    Args:
        x (np.ndarray): Instances, numpy array with shape (N,K)
        y (np.ndarray): Class labels, numpy array with shape (N,)
        test_proportion (float): the desired proportion of test examples
                                 (0.0-1.0)
        random_generator (np.random.Generator): A random generator

    Returns:
        tuple: returns a tuple of (x_train, x_test, y_train, y_test)
               - x_train (np.ndarray): Training instances shape (N_train, K)
               - x_test (np.ndarray): Test instances shape (N_test, K)
               - y_train (np.ndarray): Training labels, shape (N_train, )
               - y_test (np.ndarray): Test labels, shape (N_train, )
    """

    # TODO: Complete this function
    indices = np.arange(len(x))
    random_generator.shuffle(indices)
    x = x[indices]
    y = y[indices]

    x_row, x_col = x.shape
    partition = 1- int(x_row * test_proportion)
    x_train = x[:partition, :]
    x_test = x[partition:, :]
    y_train = y[:partition]
    y_test = y[partition:]
    print(x_test.shape)

    return (x_train, x_test, y_train, y_test)

In [4]:
(x_test, y_test) = read_dataset("data/test.txt")
print(y_test)

['Q' 'C' 'Q' 'Q' 'G' 'C' 'E' 'C' 'O' 'E' 'A' 'A' 'G' 'A' 'A' 'A' 'G' 'G'
 'E' 'E' 'E' 'E' 'O' 'C' 'Q' 'Q' 'Q' 'C' 'O' 'Q' 'G' 'Q' 'G' 'O' 'Q' 'Q'
 'G' 'O' 'Q' 'A' 'C' 'Q' 'O' 'Q' 'A' 'G' 'O' 'E' 'E' 'A' 'Q' 'G' 'A' 'C'
 'A' 'G' 'A' 'C' 'O' 'C' 'O' 'E' 'G' 'Q' 'O' 'E' 'C' 'A' 'A' 'G' 'A' 'C'
 'C' 'Q' 'O' 'O' 'O' 'A' 'E' 'C' 'A' 'Q' 'O' 'Q' 'C' 'A' 'A' 'Q' 'C' 'E'
 'A' 'A' 'C' 'Q' 'A' 'Q' 'O' 'Q' 'O' 'G' 'C' 'G' 'C' 'C' 'O' 'Q' 'A' 'Q'
 'Q' 'C' 'G' 'Q' 'C' 'Q' 'O' 'Q' 'O' 'G' 'Q' 'E' 'O' 'Q' 'E' 'Q' 'O' 'E'
 'O' 'C' 'O' 'A' 'G' 'Q' 'E' 'G' 'O' 'Q' 'E' 'O' 'C' 'O' 'G' 'G' 'Q' 'Q'
 'E' 'E' 'A' 'C' 'A' 'A' 'A' 'A' 'C' 'O' 'Q' 'G' 'O' 'O' 'E' 'C' 'C' 'Q'
 'Q' 'C' 'C' 'O' 'C' 'A' 'G' 'A' 'C' 'A' 'C' 'G' 'E' 'Q' 'E' 'C' 'O' 'A'
 'C' 'E' 'C' 'G' 'G' 'Q' 'G' 'Q' 'A' 'C' 'C' 'E' 'G' 'E' 'A' 'E' 'Q' 'O'
 'O' 'O']


In [5]:
(x_val, y_val) = read_dataset("data/validation.txt")

In [6]:
(x_full, y_full) = read_dataset("data/train_full.txt")

In [179]:
(x_noisy, y_noisy) = read_dataset("data/train_noisy.txt")

In [7]:
(x_toy, y_toy) = read_dataset("data/toy.txt")

In [8]:
 def confusion_matrix(y_gold, y_prediction, class_labels=None):
    
    # if no class_labels are given, we obtain the set of unique class labels from
    # the union of the ground truth annotation and the prediction
    if not class_labels:
        class_labels = np.unique(np.concatenate((y_gold, y_prediction)))

    confusion = np.zeros((len(class_labels), len(class_labels)), dtype=np.int64)

    # for each correct class (row), 
    # compute how many instances are predicted for each class (columns)
    for (i, label) in enumerate(class_labels):
        # get predictions where the ground truth is the current class label
        indices = (y_gold == label)
        gold = y_gold[indices]
        predictions = y_prediction[indices]

        # quick way to get the counts per label
        (unique_labels, counts) = np.unique(predictions, return_counts=True)

        # convert the counts to a dictionary
        frequency_dict = dict(zip(unique_labels, counts))

        # fill up the confusion matrix for the current row
        for (j, class_label) in enumerate(class_labels):
            confusion[i, j] = frequency_dict.get(class_label, 0)

    return confusion

In [9]:
 def recall(y_gold, y_prediction):

    confusion = confusion_matrix(y_gold, y_prediction)
    r = np.zeros((len(confusion), ))
    for c in range(confusion.shape[0]):
        if np.sum(confusion[c, :]) > 0:
            r[c] = confusion[c, c] / np.sum(confusion[c, :])

    macro_r = 0.
    if len(r) > 0:
        macro_r = np.mean(r)
    return (r, macro_r)

In [10]:
def precision(y_gold, y_prediction):

    confusion = confusion_matrix(y_gold, y_prediction)
    p = np.zeros((len(confusion), ))
    for c in range(confusion.shape[0]):
        if np.sum(confusion[:, c]) > 0:
            p[c] = confusion[c, c] / np.sum(confusion[:, c])

    macro_p = 0.
    if len(p) > 0:
        macro_p = np.mean(p)
    
    return (p, macro_p)

In [11]:
def f1_score(y_gold, y_prediction):

    (precisions, macro_p) = precision(y_gold, y_prediction)
    (recalls, macro_r) = recall(y_gold, y_prediction)

    # just to make sure they are of the same length
    assert len(precisions) == len(recalls)

    f = np.zeros((len(precisions), ))
    for c, (p, r) in enumerate(zip(precisions, recalls)):
        if p + r > 0:
            f[c] = 2 * p * r / (p + r)

    macro_f = 0.
    if len(f) > 0:
        macro_f = np.mean(f)
    
    return (f, macro_f)

In [117]:
def fb_score(y_gold, y_prediction, b):

    (precisions, macro_p) = precision(y_gold, y_prediction)
    (recalls, macro_r) = recall(y_gold, y_prediction)

    # just to make sure they are of the same length
    assert len(precisions) == len(recalls)

    f = np.zeros((len(precisions), ))
    for c, (p, r) in enumerate(zip(precisions, recalls)):
        if p + r > 0:
            f[c] = (1+b**2) * p * r / ((b**2) * p + r)

    macro_f = 0.
    if len(f) > 0:
        macro_f = np.mean(f)
    
    return (f, macro_f)

In [94]:
import random

class RandomForestClassifier(object):

    def __init__(self, treesNum, num_layers_to_prune=0, feature_proportion=1):
        self.is_trained = False
        self.roots = []
        self.treesNum = treesNum
        self.prune_layer_num = num_layers_to_prune
        self.feature_proportion = feature_proportion

    def fit(self, x, y):

        features_num = int(x.shape[1] * self.feature_proportion)
        
        # for treesNum
        for i in range(self.treesNum):
            # bagged samples from x
            bagged_ints = np.random.randint(0, len(x), len(x), dtype=int)
            x_bagged = x[bagged_ints, :]
            y_bagged = y[bagged_ints]

            features = np.arange(0,x.shape[1])
            np.random.shuffle(features)
            feature_list = features[:features_num]
            x_bagged = x_bagged[:, feature_list]
            
            # train a tree on this subset
            tree = DecisionTreeClassifier(self.prune_layer_num)
            tree.fit(x_bagged, y_bagged)
            
            # add to roots
            self.roots.append((tree, feature_list))

    def predict(self, x, num_of_trees):

        predictions = np.zeros((num_of_trees, len(x)), dtype='str')
        # for each tree in roots
        new_forest = random.sample(self.roots, k=num_of_trees)
        for i, (tree, feature_list) in enumerate(new_forest):
            predictions[i] = tree.predict(x[:, feature_list])
            
        # then find mode of predictions
        mode, count = np.unique(predictions, return_counts=True, axis=0)
        modePredictions = mode[np.argmax(count)]
        return modePredictions
       

In [94]:
import random

class RandomForestClassifier(object):

    def __init__(self, treesNum, num_layers_to_prune=0, feature_proportion=1):
        self.is_trained = False
        self.roots = []
        self.treesNum = treesNum
        self.prune_layer_num = num_layers_to_prune
        self.feature_proportion = feature_proportion

    def fit(self, x, y):

        features_num = int(x.shape[1] * self.feature_proportion)
        
        # for treesNum
        for i in range(self.treesNum):
            # bagged samples from x
            bagged_ints = np.random.randint(0, len(x), len(x), dtype=int)
            x_bagged = x[bagged_ints, :]
            y_bagged = y[bagged_ints]

            features = np.arange(0,x.shape[1])
            np.random.shuffle(features)
            feature_list = features[:features_num]
            x_bagged = x_bagged[:, feature_list]
            
            # train a tree on this subset
            tree = DecisionTreeClassifier(self.prune_layer_num)
            tree.fit(x_bagged, y_bagged)
            
            # add to roots
            self.roots.append((tree, feature_list))

    def predict(self, x, num_of_trees):

        predictions = np.zeros((num_of_trees, len(x)), dtype='str')
        # for each tree in roots
        new_forest = random.sample(self.roots, k=num_of_trees)
        for i, (tree, feature_list) in enumerate(new_forest):
            predictions[i] = tree.predict(x[:, feature_list])
            
        # then find mode of predictions
        mode, count = np.unique(predictions, return_counts=True, axis=0)
        modePredictions = mode[np.argmax(count)]
        return modePredictions
       

In [108]:
def accuracy(x, y):
    return np.sum(x==y)*100/len(x)

In [13]:
class Node:
    def __init__(self, split_val=None, column=None, label=None):
        self.left = self.right = None
        self.split_val = split_val
        self.column = column
        self.label = label
        
    def add_child(self, child): 
        if self.left == None:
            self.left = child
        elif self.right == None:
            self.right = child
        else:
            print("no children node free")
            exit()

    def is_leaf(self):
        if self.label != None:
            return True
        return False

    def max_depth(self, d=0):

        if self.left == None and self.right == None:
            return 0
        
        left_depth = self.left.max_depth(d) + 1
        right_depth = self.right.max_depth(d) + 1

        if right_depth >= left_depth:
            return right_depth
        else:
            return left_depth


In [193]:
class DecisionTreeClassifier(object):
    """ Basic decision tree classifier
    
    Attributes:
    is_trained (bool): Keeps track of whether the classifier has been trained
    
    Methods:
    fit(x, y): Constructs a decision tree from data X and label y
    predict(x): Predicts the class label of samples X
    prune(x_val, y_val): Post-prunes the decision tree
    """

    def __init__(self):
        self.is_trained = False
        self.root = None

    
    def calculate_entropy(self, y):
        values, counts = np.unique(y, return_counts=True)
        probabilities = counts / counts.sum()
        entropy = -np.sum(probabilities * np.log2(probabilities))
        return entropy

    
    def calculate_info_gain(self, x, y, x_val, sort_col):

        # Calculate total entropy for overall data
        total_entropy = self.calculate_entropy(y)

        # Calculate entropy for left and right of split
        left_entropy = self.calculate_entropy(y[x[:, sort_col] < x_val])
        right_entropy = self.calculate_entropy(y[x[:, sort_col] >= x_val])

        # Calculate info gain
        info_gain = total_entropy - ((len(y[x[:, sort_col] < x_val])/len(y) * left_entropy) 
                                    + (len(y[x[:, sort_col] >= x_val])/len(y) * right_entropy))
        
        return info_gain

        
    def find_best_node(self, x, y):
        y = y.reshape(-1, 1)

        # To keep track of info gain
        max_gain = value_to_split_on = column_to_split_on = None
                
        # loop through each column 
        for i in range(x.shape[1]):
            # sort by that column
            index_list = x[:, i].argsort()
            x = x[index_list]
            y = y[index_list]

            starting_label = y[0]
            starting_val = x[:, i][0]

            # loop through the column
            for x_val, y_val in zip(x[:, i], y):
                if (y_val != starting_label) and (x_val != starting_val):

                    # calculate information gain
                    info_gain = self.calculate_info_gain(x, y, x_val, i)
                    
                    # update the max information gain
                    if max_gain is None or info_gain > max_gain:
                        max_gain = info_gain
                        value_to_split_on = x_val
                        column_to_split_on = i

                    # Update starting label
                    starting_label = y_val
                    starting_val = x_val
                    
        return Node(value_to_split_on, column_to_split_on)


    def split_dataset(self, x, y, node):
        # Simplified dataset splitting
        left_mask = x[:, node.column] < node.split_val
        right_mask = ~left_mask  # Inverse of left_mask
        return (x[left_mask], y[left_mask]), (x[right_mask], y[right_mask])


    
    def induce_decision_tree(self, x, y):
        # check y count is 1 or node column returns -1
        if (len(np.unique(y)) <= 1 or x.shape[0] == 1):
            leaf_node = Node(label=y[0])
            return leaf_node

        else:
            # find best node
            parent = self.find_best_node(x, y)
            
            # get left and right datasets
            if parent.split_val is None:
                label_set, count = np.unique(y, return_counts=True)
                label = label_set[np.argmax(count)]
                leaf_node = Node(label=label)
                return leaf_node
                 
            child_data = self.split_dataset(x, y, parent)
            
            for i in child_data: 
                child_node = self.induce_decision_tree(i[0], i[1])
                parent.add_child(child_node)
        
            return parent
    

    def fit(self, x, y):
        """ Constructs a decision tree classifier from data
        
        Args:
        x (numpy.ndarray): Instances, numpy array of shape (N, K) 
                           N is the number of instances
                           K is the number of attributes
        y (numpy.ndarray): Class labels, numpy array of shape (N, )
                           Each element in y is a str 
        """
        
        # Make sure that x and y have the same number of instances
        assert x.shape[0] == len(y), \
            "Training failed. x and y must have the same number of instances."
        
        #######################################################################
        #                 ** TASK 2.1: COMPLETE THIS METHOD **
        #######################################################################    
        self.root = self.induce_decision_tree(x, y)
        
        # set a flag so that we know that the classifier has been trained
        self.is_trained = True

    
    def classify_instance(self, instance, node):

        if node.label != None:
            return node.label

        if instance[node.column] >= node.split_val:
            return self.classify_instance(instance, node.right)
        
        return self.classify_instance(instance, node.left)
        
    
    def predict(self, x):
        """ Predicts a set of samples using the trained DecisionTreeClassifier.
        
        Assumes that the DecisionTreeClassifier has already been trained.
        
        Args:
        x (numpy.ndarray): Instances, numpy array of shape (M, K) 
                           M is the number of test instances
                           K is the number of attributes
        
        Returns:
        numpy.ndarray: A numpy array of shape (M, ) containing the predicted
                       class label for each instance in x
        """
        
        # make sure that the classifier has been trained before predicting
        if not self.is_trained:
            raise Exception("DecisionTreeClassifier has not yet been trained.")
        
        # set up an empty (M, ) numpy array to store the predicted labels 
        # feel free to change this if needed
        predictions = np.zeros((x.shape[0],), dtype=object)
        #######################################################################
        #                 ** TASK 2.2: COMPLETE THIS METHOD **
        #######################################################################
        for i in range(len(x)):
            label = self.classify_instance(x[i], self.root)
            predictions[i] = label
    
        return predictions


In [192]:
class NewDecisionTreeClassifier(object):
    """ Basic decision tree classifier
    
    Attributes:
    is_trained (bool): Keeps track of whether the classifier has been trained
    
    Methods:
    fit(x, y): Constructs a decision tree from data X and label y
    predict(x): Predicts the class label of samples X
    prune(x_val, y_val): Post-prunes the decision tree
    """

    def __init__(self, num_layers_to_prune=None, min_samples_split=2, min_samples_leaf=1):
        self.is_trained = False
        self.root = None
        self.num_layers_to_prune = num_layers_to_prune
        self.max_depth = 0
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf

    
    def calculate_entropy(self, y):
        values, counts = np.unique(y, return_counts=True)
        probabilities = counts / counts.sum()
        entropy = -np.sum(probabilities * np.log2(probabilities))
        return entropy

    
    def calculate_info_gain(self, x, y, x_val, sort_col):

        # Calculate total entropy for overall data
        total_entropy = self.calculate_entropy(y)

        # Calculate entropy for left and right of split
        left_entropy = self.calculate_entropy(y[x[:, sort_col] < x_val])
        right_entropy = self.calculate_entropy(y[x[:, sort_col] >= x_val])

        # Calculate info gain
        info_gain = total_entropy - ((len(y[x[:, sort_col] < x_val])/len(y) * left_entropy) 
                                    + (len(y[x[:, sort_col] >= x_val])/len(y) * right_entropy))
        
        return info_gain

        
    def find_best_node(self, x, y):
        y = y.reshape(-1, 1)

        # To keep track of info gain
        max_gain = value_to_split_on = column_to_split_on = None
                
        # loop through each column 
        for i in range(x.shape[1]):
            # sort by that column
            index_list = x[:, i].argsort()
            x = x[index_list]
            y = y[index_list]

            starting_label = y[0]
            starting_val = x[:, i][0]

            # loop through the column
            for x_val, y_val in zip(x[:, i], y):
                if (y_val != starting_label) and (x_val != starting_val):

                    # calculate information gain
                    info_gain = self.calculate_info_gain(x, y, x_val, i)
                    
                    # update the max information gain
                    if max_gain is None or info_gain > max_gain:
                        max_gain = info_gain
                        value_to_split_on = x_val
                        column_to_split_on = i

                    # Update starting label
                    starting_label = y_val
                    starting_val = x_val
                    
        return Node(value_to_split_on, column_to_split_on)


    def split_dataset(self, x, y, node):
        # Simplified dataset splitting
        left_mask = x[:, node.column] < node.split_val
        right_mask = ~left_mask  # Inverse of left_mask
        return (x[left_mask], y[left_mask]), (x[right_mask], y[right_mask])


        
    def induce_decision_tree(self, x, y, depth=0):
        if len(np.unique(y)) <= 1 or x.shape[0] < self.min_samples_split or depth == self.max_depth:
            label_set, count = np.unique(y, return_counts=True)
            leaf_node = Node(label=label_set[np.argmax(count)])
            return leaf_node
        else:
            parent = self.find_best_node(x, y)
            if parent.split_val is None or x.shape[0] <= self.min_samples_leaf:
                label_set, count = np.unique(y, return_counts=True)
                leaf_node = Node(label=label_set[np.argmax(count)])
                return leaf_node
            
            (x_left, y_left), (x_right, y_right) = self.split_dataset(x, y, parent)
            
            if len(y_left) >= self.min_samples_leaf and len(y_right) >= self.min_samples_leaf:
                parent.left = self.induce_decision_tree(x_left, y_left, depth + 1)
                parent.right = self.induce_decision_tree(x_right, y_right, depth + 1)
            else:
                label_set, count = np.unique(y, return_counts=True)
                return Node(label=label_set[np.argmax(count)])
            
            return parent



    def fit(self, x, y):

        
        # Make sure that x and y have the same number of instances
        assert x.shape[0] == len(y), \
            "Training failed. x and y must have the same number of instances."
        
        #######################################################################
        #                 ** TASK 2.1: COMPLETE THIS METHOD **
        #######################################################################
        self.max_depth = np.inf
        self.root = self.induce_decision_tree(x, y)
        self.max_depth = self.get_max_depth_from_tree() - self.num_layers_to_prune

        self.root = self.induce_decision_tree(x, y)
        
        # set a flag so that we know that the classifier has been trained
        self.is_trained = True

    
    def classify_instance(self, instance, node):

        if node.label != None:
            return node.label

        if instance[node.column] >= node.split_val:
            return self.classify_instance(instance, node.right)
        
        return self.classify_instance(instance, node.left)
        
    
    def predict(self, x):

        
        # make sure that the classifier has been trained before predicting
        if not self.is_trained:
            raise Exception("DecisionTreeClassifier has not yet been trained.")
        
        # set up an empty (M, ) numpy array to store the predicted labels 
        # feel free to change this if needed
        predictions = np.zeros((x.shape[0],), dtype=object)
        #######################################################################
        #                 ** TASK 2.2: COMPLETE THIS METHOD **
        #######################################################################
        for i in range(len(x)):
            label = self.classify_instance(x[i], self.root)
            predictions[i] = label
    
        return predictions

    def get_max_depth_from_tree(self):
        return self.root.max_depth()

In [20]:
t = DecisionTreeClassifier(8);
t.fit(x_full, y_full)

In [21]:
print(t.get_max_depth_from_tree())

predict = t.predict(x_val)

print(accuracy(predict, y_val))

13
93.0


In [110]:
import random

class RandomForestClassifier(object):

    def __init__(self, treesNum, num_layers_to_prune=0, feature_proportion=1):
        self.is_trained = False
        self.roots = []
        self.treesNum = treesNum
        self.prune_layer_num = num_layers_to_prune
        self.feature_proportion = feature_proportion

    def fit(self, x, y):

        features_num = int(x.shape[1] * self.feature_proportion)
        
        # for treesNum
        for i in range(self.treesNum):
            # bagged samples from x
            bagged_ints = np.random.randint(0, len(x), len(x), dtype=int)
            x_bagged = x[bagged_ints, :]
            y_bagged = y[bagged_ints]

            features = np.arange(0,x.shape[1])
            np.random.shuffle(features)
            feature_list = features[:features_num]
            x_bagged = x_bagged[:, feature_list]
            
            # train a tree on this subset
            tree = DecisionTreeClassifier(self.prune_layer_num)
            tree.fit(x_bagged, y_bagged)
            
            # add to roots
            self.roots.append((tree, feature_list))

    def predict(self, x, num_of_trees):

        predictions = np.zeros((num_of_trees, len(x)), dtype='str')
        # for each tree in roots
        new_forest = random.sample(self.roots, k=num_of_trees)
        for i, (tree, feature_list) in enumerate(new_forest):
            predictions[i] = tree.predict(x[:, feature_list])
            
        # then find mode of predictions
        mode, count = np.unique(predictions, return_counts=True, axis=0)
        modePredictions = mode[np.argmax(count)]
        return modePredictions
       

In [95]:
test = RandomForestClassifier(10, 8, 0.4);
test.fit(x_full, y_full)

In [96]:
thirty_predict = test.predict(x_val, 8)
print(thirty_predict)

<__main__.DecisionTreeClassifier object at 0x106bb2690>
<__main__.DecisionTreeClassifier object at 0x37d3ad790>
<__main__.DecisionTreeClassifier object at 0x37d171b50>
<__main__.DecisionTreeClassifier object at 0x17c1b36d0>
<__main__.DecisionTreeClassifier object at 0x37d0e8050>
<__main__.DecisionTreeClassifier object at 0x37d077150>
<__main__.DecisionTreeClassifier object at 0x37c9c3e50>
<__main__.DecisionTreeClassifier object at 0x37d0dc290>
['A' 'A' 'A' 'E' 'Q' 'C' 'Q' 'Q' 'O' 'A' 'O' 'Q' 'C' 'O' 'A' 'O' 'C' 'C'
 'C' 'A' 'A' 'C' 'A' 'E' 'O' 'G' 'A' 'C' 'E' 'E' 'O' 'A' 'C' 'E' 'E' 'Q'
 'E' 'C' 'A' 'O' 'Q' 'O' 'O' 'Q' 'G' 'E' 'G' 'A' 'C' 'E' 'Q' 'O' 'O' 'O'
 'C' 'C' 'C' 'A' 'O' 'E' 'E' 'E' 'O' 'E' 'Q' 'O' 'G' 'C' 'O' 'E' 'O' 'Q'
 'Q' 'A' 'C' 'G' 'E' 'E' 'C' 'O' 'O' 'C' 'Q' 'A' 'E' 'C' 'O' 'G' 'A' 'A'
 'G' 'E' 'Q' 'O' 'A' 'A' 'A' 'A' 'O' 'G']


In [28]:
confusion_thirty = confusion_matrix(y_val, thirty_predict)
print(confusion_thirty)

[[16  0  1  0  1  1]
 [ 0 13  0  0  0  0]
 [ 0  2 12  3  0  1]
 [ 1  0  1 11  0  1]
 [ 3  1  1  0 14  1]
 [ 1  0  0  0  1 14]]


In [29]:
print(accuracy(thirty_predict, y_val))

80.0


In [30]:
test_predict = test.predict(x_test)
print(test_predict)

['A' 'C' 'Q' 'A' 'G' 'C' 'C' 'C' 'Q' 'E' 'A' 'A' 'G' 'A' 'A' 'A' 'G' 'Q'
 'E' 'E' 'E' 'G' 'O' 'C' 'Q' 'Q' 'Q' 'C' 'G' 'O' 'O' 'G' 'G' 'O' 'Q' 'A'
 'O' 'O' 'A' 'A' 'C' 'G' 'O' 'G' 'A' 'Q' 'E' 'E' 'E' 'G' 'Q' 'G' 'A' 'C'
 'G' 'G' 'G' 'C' 'G' 'G' 'O' 'E' 'G' 'Q' 'O' 'E' 'G' 'A' 'A' 'G' 'Q' 'C'
 'O' 'C' 'O' 'O' 'O' 'A' 'G' 'C' 'C' 'Q' 'O' 'C' 'C' 'A' 'A' 'Q' 'C' 'E'
 'A' 'A' 'C' 'Q' 'A' 'Q' 'A' 'Q' 'O' 'G' 'C' 'Q' 'G' 'C' 'O' 'Q' 'A' 'Q'
 'Q' 'C' 'G' 'Q' 'C' 'Q' 'G' 'Q' 'O' 'G' 'Q' 'E' 'O' 'Q' 'E' 'Q' 'O' 'E'
 'O' 'G' 'O' 'Q' 'G' 'Q' 'E' 'Q' 'E' 'G' 'E' 'O' 'G' 'O' 'G' 'G' 'Q' 'Q'
 'G' 'E' 'A' 'C' 'A' 'G' 'A' 'Q' 'C' 'O' 'Q' 'G' 'O' 'O' 'E' 'C' 'C' 'Q'
 'G' 'C' 'C' 'O' 'C' 'A' 'A' 'A' 'O' 'C' 'C' 'G' 'E' 'Q' 'E' 'G' 'O' 'A'
 'C' 'E' 'G' 'G' 'G' 'Q' 'G' 'Q' 'A' 'C' 'C' 'O' 'G' 'E' 'A' 'E' 'Q' 'O'
 'A' 'O']


In [31]:
print(accuracy(test_predict, y_test))

75.0


In [81]:
predictions = np.zeros((10, len(x_val)), dtype='str')

for i in range(1,101, 10):
    test = RandomForestClassifier(i)
    test.fit(x_full, y_full)

    predictions[i//10] = (test.predict(x_val))
    
    

In [82]:
for i, list in enumerate(predictions):
    print(f"{i}: {accuracy(list, y_val)}")

0: 88.0
1: 82.0
2: 85.0
3: 86.0
4: 84.0
5: 82.0
6: 91.0
7: 85.0
8: 92.0
9: 89.0


In [153]:
# save each configuration
    # 6x? array
    # 6th index = forest

# ranges of configurations
    # num of trees
    # pruning layers (0-10)
    # min split sample size (2-7)
    # min leaf sample size (1-6)
    # proportions of features (0.3 - 0.9, steps of 0.05)

# create array for configs
configurations = []

# pruning range
for j in range(5,9): #3,4,5,6
    
    # feature range
    for k in np.arange(0.33, 0.67, 0.33): #0.33, 0.66
        print(100,j,k)

        # create random forest
        tempForest = RandomForestClassifier(100, j, k)
        tempForest.fit(x_full, y_full)
        
        # save to list
        config = (j, k, tempForest)
        configurations.append(config)
            


100 5 0.33


KeyboardInterrupt: 

In [124]:
# 100
    # 1-7 0.33
    # 1-7 0.66

# 1 0.33

for (j, k, tempForest) in configurations:
    for i in range(25,101,25):
        accuracies = np.zeros(10)
        f1_scores = np.zeros(10)
        recalls = np.zeros(10)
        precisions = np.zeros(10)
        for m in range(0,10):
            temp_prediction = tempForest.predict(x_val, i)
            accuracies[m] = accuracy(temp_prediction, y_val)
            recalls[m] = fb_score(y_val, temp_prediction, 0.2)[1]
            precisions[m] = fb_score(y_val, temp_prediction, 0.2)[1]
            f1_scores[m] = fb_score(y_val, temp_prediction, 0.2)[1]

        ave_accuracy = np.sum(accuracies)/10
        ave_f1 = np.sum(f1_scores)/10
        ave_recall = np.sum(recalls)/10
        ave_precision = np.sum(precisions)/10
            
        print(f"({j}, {k}, {i}) (accuracy is: {ave_accuracy}) and (recall is: {ave_recall}) and (precision is: {ave_precision})")
    

(1, 0.33, 25) (accuracy is: 74.7) and (recall is: 0.7385150176307415) and (precision is: 0.7385150176307415)
(1, 0.33, 50) (accuracy is: 76.8) and (recall is: 0.7665191736108101) and (precision is: 0.7665191736108101)
(1, 0.33, 75) (accuracy is: 74.8) and (recall is: 0.7485071938191374) and (precision is: 0.7485071938191374)
(1, 0.33, 100) (accuracy is: 74.0) and (recall is: 0.7444608707698157) and (precision is: 0.7444608707698157)
(1, 0.66, 25) (accuracy is: 86.5) and (recall is: 0.8654944817213852) and (precision is: 0.8654944817213852)
(1, 0.66, 50) (accuracy is: 86.3) and (recall is: 0.8655792394368851) and (precision is: 0.8655792394368851)
(1, 0.66, 75) (accuracy is: 86.4) and (recall is: 0.8672571898737447) and (precision is: 0.8672571898737447)
(1, 0.66, 100) (accuracy is: 86.0) and (recall is: 0.86361429196184) and (precision is: 0.86361429196184)
(2, 0.33, 25) (accuracy is: 72.7) and (recall is: 0.7255831197683572) and (precision is: 0.7255831197683572)
(2, 0.33, 50) (accura

0
67.0
79.0
79.0
83.0
83.0
90.0
90.0
90.0
90.0
90.0
90.0
90.0
90.0
90.0
90.0
90.0
90.0
90.0
90.0
90.0
90.0
90.0
90.0
90.0
90.0
90.0
90.0
90.0
90.0
90.0
90.0
['O' 'C' 'E' 'Q' 'G' 'C' 'E' 'C' 'O' 'E' 'A' 'A' 'G' 'A' 'A' 'A' 'G' 'G'
 'E' 'E' 'E' 'E' 'O' 'C' 'C' 'Q' 'Q' 'C' 'O' 'Q' 'G' 'G' 'G' 'O' 'Q' 'G'
 'G' 'O' 'Q' 'C' 'C' 'Q' 'O' 'E' 'A' 'E' 'O' 'E' 'E' 'A' 'G' 'G' 'A' 'C'
 'A' 'G' 'A' 'C' 'Q' 'G' 'O' 'E' 'G' 'Q' 'O' 'E' 'G' 'A' 'A' 'A' 'A' 'C'
 'C' 'G' 'G' 'O' 'O' 'A' 'E' 'C' 'A' 'Q' 'E' 'Q' 'C' 'A' 'A' 'Q' 'C' 'E'
 'A' 'A' 'C' 'Q' 'A' 'Q' 'O' 'Q' 'O' 'Q' 'C' 'G' 'C' 'C' 'O' 'Q' 'A' 'G'
 'Q' 'C' 'G' 'Q' 'C' 'Q' 'G' 'Q' 'O' 'G' 'Q' 'E' 'O' 'Q' 'E' 'Q' 'O' 'E'
 'O' 'G' 'O' 'A' 'G' 'O' 'E' 'G' 'Q' 'Q' 'E' 'E' 'E' 'O' 'G' 'C' 'Q' 'Q'
 'E' 'G' 'A' 'C' 'A' 'Q' 'A' 'A' 'C' 'G' 'Q' 'O' 'O' 'O' 'E' 'C' 'C' 'Q'
 'Q' 'C' 'C' 'O' 'C' 'A' 'G' 'A' 'C' 'A' 'C' 'G' 'E' 'Q' 'E' 'C' 'O' 'A'
 'C' 'C' 'C' 'G' 'G' 'Q' 'G' 'Q' 'A' 'C' 'C' 'E' 'G' 'E' 'A' 'E' 'Q' 'C'
 'O' 'O']
84.5


In [170]:
# save each configuration
    # 6x? array
    # 6th index = forest

# ranges of configurations
    # num of trees
    # pruning layers (0-10)
    # min split sample size (2-7)
    # min leaf sample size (1-6)
    # proportions of features (0.3 - 0.9, steps of 0.05)

# create array for configs
configurations = []
tempTree = None

# pruning range
for i in range(0,10):
    for j in range(2,10):
        for k in range(1,5):

            print (i, j, k)
            overall_acc = 0
            
            for n in range(0,5):
                tempTree = DecisionTreeClassifier(i,j,k)
                tempTree.fit(x_full, y_full)

                temp_prediction = tempTree.predict(x_val)
                temp_acc = accuracy(temp_prediction, y_val)
                overall_acc += temp_acc

            overall_acc /= 5
            
            # save to list
            config = (i, j, k, tempTree, overall_acc)
            configurations.append(config)
        

0 2 1
0 2 2
0 2 3
0 2 4
0 3 1
0 3 2
0 3 3
0 3 4
0 4 1
0 4 2
0 4 3
0 4 4
0 5 1
0 5 2
0 5 3
0 5 4
0 6 1
0 6 2
0 6 3
0 6 4
0 7 1
0 7 2
0 7 3
0 7 4
0 8 1
0 8 2
0 8 3
0 8 4
0 9 1
0 9 2
0 9 3
0 9 4
1 2 1
1 2 2
1 2 3
1 2 4
1 3 1
1 3 2
1 3 3
1 3 4
1 4 1
1 4 2
1 4 3
1 4 4
1 5 1
1 5 2
1 5 3
1 5 4
1 6 1
1 6 2
1 6 3
1 6 4
1 7 1
1 7 2
1 7 3
1 7 4
1 8 1
1 8 2
1 8 3
1 8 4
1 9 1
1 9 2
1 9 3
1 9 4
2 2 1
2 2 2
2 2 3
2 2 4
2 3 1
2 3 2
2 3 3
2 3 4
2 4 1
2 4 2
2 4 3
2 4 4
2 5 1
2 5 2
2 5 3
2 5 4
2 6 1
2 6 2
2 6 3
2 6 4
2 7 1
2 7 2
2 7 3
2 7 4
2 8 1
2 8 2
2 8 3
2 8 4
2 9 1
2 9 2
2 9 3
2 9 4
3 2 1
3 2 2
3 2 3
3 2 4
3 3 1
3 3 2
3 3 3
3 3 4
3 4 1
3 4 2
3 4 3
3 4 4
3 5 1
3 5 2
3 5 3
3 5 4
3 6 1
3 6 2
3 6 3
3 6 4
3 7 1
3 7 2
3 7 3
3 7 4
3 8 1
3 8 2
3 8 3
3 8 4
3 9 1
3 9 2
3 9 3
3 9 4
4 2 1
4 2 2
4 2 3
4 2 4
4 3 1
4 3 2
4 3 3
4 3 4
4 4 1
4 4 2
4 4 3
4 4 4
4 5 1
4 5 2
4 5 3
4 5 4
4 6 1
4 6 2
4 6 3
4 6 4
4 7 1
4 7 2
4 7 3
4 7 4
4 8 1
4 8 2
4 8 3
4 8 4
4 9 1
4 9 2
4 9 3
4 9 4
5 2 1
5 2 2
5 2 3
5 2 4
5 3 1
5 3 2
5 3 

In [174]:
for (i,j,k, tree, acc) in configurations:
    
    # accuracies = np.zeros(10)
    # f1_scores = np.zeros(10)
    
    # temp_prediction = tree.predict(x_test)
    
    # ave_accuracy = accuracy(temp_prediction, y_test)

    # ave_f1 = fb_score(y_test, temp_prediction, 1)[1]

    if acc > 92:
        print(f"({i, j, k}) accuracy is: {acc}")
    

((6, 2, 2)) accuracy is: 93.0
((6, 3, 2)) accuracy is: 93.0
((6, 4, 2)) accuracy is: 93.0
((6, 5, 2)) accuracy is: 93.0
((7, 2, 2)) accuracy is: 93.0
((7, 3, 2)) accuracy is: 93.0
((7, 4, 2)) accuracy is: 93.0
((7, 5, 2)) accuracy is: 93.0
((8, 2, 1)) accuracy is: 93.0
((8, 3, 1)) accuracy is: 93.0
((8, 4, 1)) accuracy is: 93.0
((8, 5, 1)) accuracy is: 93.0


In [199]:
def train_and_predict(x_train, y_train, x_test, x_val, y_val):
    """ Interface to train and test the new/improved decision tree.
    
    This function is an interface for training and testing the new/improved
    decision tree classifier. 

    x_train and y_train should be used to train your classifier, while 
    x_test should be used to test your classifier. 
    x_val and y_val may optionally be used as the validation dataset. 
    You can just ignore x_val and y_val if you do not need a validation dataset.

    Args:
    x_train (numpy.ndarray): Training instances, numpy array of shape (N, K) 
                       N is the number of instances
                       K is the number of attributes
    y_train (numpy.ndarray): Class labels, numpy array of shape (N, )
                       Each element in y is a str 
    x_test (numpy.ndarray): Test instances, numpy array of shape (M, K) 
                            M is the number of test instances
                            K is the number of attributes
    x_val (numpy.ndarray): Validation instances, numpy array of shape (L, K) 
                       L is the number of validation instances
                       K is the number of attributes
    y_val (numpy.ndarray): Class labels of validation set, numpy array of shape (L, )
    
    Returns:
    numpy.ndarray: A numpy array of shape (M, ) containing the predicted class label for each instance in x_test
    """

    #######################################################################
    #                 ** TASK 4.1: COMPLETE THIS FUNCTION **
    #######################################################################
       

    # TODO: Train new classifier

    # set up an empty (M, ) numpy array to store the predicted labels 
    # feel free to change this if needed
    predictions = np.zeros((x_test.shape[0],), dtype=object)
        
    # TODO: Make predictions on x_test using new classifier        

    prev_accuracy = 0
    prev_std = 100
    accuracies = np.zeros(3, dtype=np.float64)
    best_model = None

    # pruning range
    for i in range(7,9):
        
        # min split sample size
        for j in range(4,6):

            # min leaf sample size
            for k in range(1,3):

                # average accuracy
                for n in range(0,1):
                    
                    tempTree = NewDecisionTreeClassifier(i,j,k)
                    tempTree.fit(x_train, y_train)

                    temp_prediction = tempTree.predict(x_val)
                    accuracies[n] = accuracy(temp_prediction, y_val)
                
                overall_acc = np.mean(accuracies)
                overall_std = np.std(accuracies)
                
                if overall_acc > prev_accuracy:
                    prev_accuracy = overall_acc
                    best_model = tempTree
                    prev_std = overall_std

                if np.isclose(overall_acc, prev_accuracy, atol=0.5) and overall_std < prev_std:
                    prev_accuracy = overall_acc
                    prev_std = overall_std
                    best_model = tempTree

    print(f" val std: {prev_std}")
    print(f" val acc: {prev_accuracy}")
    best_prediction = best_model.predict(x_test)

    return best_model

    print(confusion_matrix(y_test, best_prediction))
    print(recall(y_test, best_prediction))
    print(precision(y_test, best_prediction))
    print(f1_score(y_test, best_prediction))
    print(accuracy(y_test, best_prediction))

In [190]:
train_and_predict(x_full, y_full, x_test, x_val, y_val)

 val std: 0.0
 val acc: 93.0
[[33  0  0  1  0  0]
 [ 0 34  2  1  0  0]
 [ 0  1 23  0  1  1]
 [ 1  1  0 21  1  3]
 [ 0  1  0  1 32  0]
 [ 0  1  2  0  6 33]]
(array([0.97058824, 0.91891892, 0.88461538, 0.77777778, 0.94117647,
       0.78571429]), 0.8797985121514533)
(array([0.97058824, 0.89473684, 0.85185185, 0.875     , 0.8       ,
       0.89189189]), 0.8806781368571874)
(array([0.97058824, 0.90666667, 0.86792453, 0.82352941, 0.86486486,
       0.83544304]), 0.8781694574778208)
88.0


In [191]:
train_and_predict(x_noisy, y_noisy, x_test, x_val, y_val)

 val std: 0.0
 val acc: 87.0
[[31  0  0  0  1  2]
 [ 1 31  1  3  1  0]
 [ 0  0 25  1  0  0]
 [ 0  1  2 16  1  7]
 [ 0  2  0  1 28  3]
 [ 1  0  3  3  7 28]]
(array([0.91176471, 0.83783784, 0.96153846, 0.59259259, 0.82352941,
       0.66666667]), 0.7989882793804363)
(array([0.93939394, 0.91176471, 0.80645161, 0.66666667, 0.73684211,
       0.7       ]), 0.793519838351557)
(array([0.92537313, 0.87323944, 0.87719298, 0.62745098, 0.77777778,
       0.68292683]), 0.7939935234737407)
79.5


In [195]:
base_tree = DecisionTreeClassifier()
base_tree.fit(x_full, y_full)

In [197]:
base_tree_noisy = DecisionTreeClassifier()
base_tree_noisy.fit(x_noisy, y_noisy)

In [200]:
new_tree = train_and_predict(x_full, y_full, x_test, x_val, y_val)

 val std: 0.0
 val acc: 93.0


In [201]:
new_tree_noisy = train_and_predict(x_noisy, y_noisy, x_test, x_val, y_val)

 val std: 0.0
 val acc: 87.0


In [212]:
def paired_t_test(model1, model2, x_test):

    paired_accuracies = []

    # split the test set into 10
    
    # generate a random permutation of indices from 0 to n_instances
    seed = 12312
    rg = default_rng(seed)

    shuffled_indices = rg.permutation(len(x_test))

    # split shuffled indices into almost equal sized splits
    split_indices = np.array_split(shuffled_indices, 10)

    # per split
    for k in split_indices:
        
        # predict for both models
        model1_predictions = model1.predict(x_test[k]) 
        model2_predictions = model2.predict(x_test[k]) 
        
        # obtain accuracy
        model1_acc = accuracy(model1_predictions, y_test[k])
        model2_acc = accuracy(model2_predictions, y_test[k])
        
        # pair in a tuple, and add to list
        paired_accuracies.append((model1_acc, model2_acc))

    # t test formulas here
    diff = np.zeros(10, dtype=np.float64)
    for i, (a, b) in enumerate(paired_accuracies):
        diff[i] = b-a
    mean = np.mean(diff)
    sd = np.std(diff)
    t = (np.sqrt(10) * mean) / sd
    print(t)


In [213]:
paired_t_test(base_tree, new_tree, x_test)
paired_t_test(base_tree_noisy, new_tree_noisy, x_test)

1.3801311186847085
0.8624393618641034
