# Mustererkennung/Machine Learning - Assignment 6



In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split

### Load the spam dataset:

In [2]:
data = np.array(pd.read_csv('spambase.data', header=None))

X = data[:,:-1] # features
y = data[:,-1] # Last column is label

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, shuffle=True, stratify=y)

In [3]:
def accuracy(y_true, y_pred):
    return np.mean(y_true == y_pred)

def gini(y_true):
    """
    For simplicity reasons this assumes that there are only 2 classes
    """
    p_m0 = np.mean(y_true == 0)
    p_m1 = np.mean(y_true == 1)
    return 1 - p_m0 ** 2 + p_m1 ** 2

class LeafNode():
    def fit(self, c):
        self.c = c
        
    def predict(self, x):
        return self.c
    
class InternalNode():
    def fit(self, x, y, depth, max_depth):
        m, n = x.shape
        self.is_leaf = False
        # columns are j, z, c1, c2, split_index, loss_total
        split_infos = []
        for j in range(n):
            # sort rows by feature j in ascending order
            x = x[x[:,j].argsort()]
            for split_index in range(1, m - 1):
                this_feat_val = x[split_index, j]
                next_feat_val = x[split_index + 1, j]
                z = this_feat_val + (next_feat_val - this_feat_val)
                y_top_split = y[:split_index]
                y_bottom_split = y[split_index:]
                # c is the class which has the most entries in the split
                c1 = self.find_c(y_top_split)
                c2 = self.find_c(y_bottom_split)
                
                loss_1 = gini(y_top_split)
                loss_2 = gini(y_bottom_split)
                
                loss_total = loss_1 + loss_2
                
                row = np.array([j, z, c1, c2, split_index, loss_total])
                split_infos.append(row)
                
        split_infos = np.array(split_infos)
        best_split_idx = np.argmin(split_infos[:,-1], axis=0)
        best_split = split_infos[best_split_idx]
        self.j = int(best_split[0])
        self.z = best_split[1]
        c1 = best_split[2]
        c2 = best_split[3]
        split_index = int(best_split[4])
        x = x[x[:, self.j].argsort()]
        x_top_split, y_top_split = x[:split_index], y[:split_index]
        x_bottom_split, y_bottom_split = x[split_index:], y[split_index:]
        
        if x_top_split.shape[0] <= 2 or depth >= max_depth:
            self.left_child = LeafNode()
            self.left_child.fit(c1)
        else:
            self.left_child = InternalNode()
            self.left_child.fit(x_top_split, y_top_split, depth + 1, max_depth)
            
        if x_bottom_split.shape[0] <= 2 or depth >= max_depth:
            self.right_child = LeafNode()
            self.right_child.fit(c2)
        else:
            self.right_child = InternalNode()
            self.right_child.fit(x_bottom_split, y_bottom_split, depth + 1, max_depth)
        
    def predict(self, x):
        if x[self.j] <= self.z:
            return self.left_child.predict(x)
        return self.right_child.predict(x)
    
    def find_c(self, y):
        """
        For simplicity reasons this assumes that there are only 2 classes
        """
        zeros = np.sum(y == 0)
        ones = np.sum(y == 1)
        if zeros > ones:
            return 1
        return 0
        
    
class DecisionTreeClassifier():
    """
    Basically just holds the root node of the tree which starts the recursion
    """
    def __init__(self, max_depth):
        self.max_depth = max_depth
        
    def fit(self, x, y):
        self.root = InternalNode()
        x = np.copy(x)
        y = np.copy(y)
        self.root.fit(x, y, 1, self.max_depth)
    
    def predict(self, x):
        y_preds = []
        for sample in x:
            y_pred = self.root.predict(sample)
            y_preds.append(y_pred)
        return np.array(y_preds)


This takes about two minutes

1min 43s and test acc of 0.639 for max_depth = 5 

In [4]:
%%time

clf = DecisionTreeClassifier(max_depth = 6)
clf.fit(X_train, y_train)
print()


CPU times: user 2min 1s, sys: 171 ms, total: 2min 1s
Wall time: 2min 1s


In [5]:
y_pred = clf.predict(X_test)
acc = accuracy(y_test, y_pred)
print(acc)

0.6394439617723718
