In [1]:
from scipy.io import loadmat
from collections import Counter
from random import randint
import numpy as np

$N = $ number of observations

$C = $ set of classes

`X_clean`, `X_noisy` $ \in \{0, 1\}^{(N \times 45)}$

`Y_clean`, `Y_noisy` $ \in C^{(N \times 0)}$

In [3]:
clean_data = loadmat('Data/cleandata_students.mat')
noisy_data = loadmat('Data/noisydata_students.mat')

X_clean = clean_data['x']
X_noisy = noisy_data['x']

attributes = [i for i in range(len(X_clean[0]) - 1)]

y_clean = [y[0] for y in clean_data['y']]
y_noisy = [y[0] for y in noisy_data['y']]

In [4]:
def unique_attr_vals(X):
    X = np.array(X)
    X = X.reshape(X.shape[1], X.shape[0])
    return [list(set(list(attr_vec))) for attr_vec in list(X)]

def format_train_targets(y, target_class):
    return [1 if c == target_class else 0 for c in y]

class Node:
    def __init__(self, op, value=None):
        self.op = op
        self.value = value
        self.kids = [None, None]
            
    def add_child(self, value, node):
        self.kids[value] = node

In [5]:
def majoirty_value(y):
    return Counter(y).most_common(1)[0][0]

def filter_examples(X, y, attribute, value):
    X_new = []
    y_new = []
    
    for (index, x) in enumerate(X):
        if (x[attribute] == value):
            X_new.append(x)
            y_new.append(y[index])
            
    return (X_new, y_new)

In [6]:
def choose_best_attribute(X, attributes, y):
    return attributes[randint(0, len(attributes) - 1)]

# attributes = list of indexes of attributes \in [0, 44]
def build_decision_tree(X, attributes, y, attr_vals):
    unique_y = set(y)
    
    if len(unique_y) == 1:
        return Node(-1, unique_y.pop())
    elif len(attributes) == 0:
        return Node(-1, majoirty_value(y))
    
    best_attr = choose_best_attribute(X, attributes, y)

    node = Node(best_attr)
    
    for val in attr_vals[best_attr]:
        (X_v, y_v) = filter_examples(X, y, best_attr, val)
        
        if len(X_v) == 0:
            node.add_child(val, Node(-1, majoirty_value(y)))
        else:
            new_attr = [a for a in attributes if a != best_attr]
            child_node = build_decision_tree(X_v, new_attr, y_v, attr_vals)
            node.add_child(val, child_node)
    
    return node

def predict(tree, x):
    if tree == None:
        print("bug")
        return 0
    
    if tree.op == -1:
        return tree.value
    
    return predict(tree.kids[x[tree.op]], x)

In [7]:
import numpy as np
from sklearn.model_selection import train_test_split

attr_vals = unique_attr_vals(X_clean)

y = format_train_targets(y_clean, 2)

X_train, X_test, y_train, y_test = train_test_split(X_clean, y, test_size=0.33, random_state=42)

tree = build_decision_tree(X_train, attributes, y_train, attr_vals)

def test(X_test, actual_y, tree):
    predicted = [predict(tree, x) for x in X_test]
    
    correct = 0
    
    for i in range(len(actual_y)):
        if actual_y[i] == predicted[i]:
            correct += 1
    
    return "{}%".format(round(correct/len(actual_y), 4) * 100)

print(test(X_test, y_test, tree))

79.52%
