In [None]:
import numpy as np

class Node:
    """One node of a binary decision tree.

    A node carries the split it represents (``feature_id`` / ``threshold``),
    an identifier, the labels of the samples it was configured with, and
    optional ``left`` / ``right`` children.
    """

    def __init__(self):
        """Build an unconfigured node without children."""
        # Placeholder values; they are overwritten by set_all().
        self.threshold = -1
        self.feature_id = -1
        self.node_id = -1
        self.labels = np.array([])
        self.left = None
        self.right = None
        self.prune_flag = True

    def __repr__(self):
        """Return a short textual form that shows the node id."""
        return 'Node %d' % self.node_id

    def set_all(self, threshold, feature_id, node_id, data):
        """Configure the node's split and remember the labels of ``data``.

        Args:
            threshold: Split value stored on the node.
            feature_id: Index of the feature the split refers to.
            node_id: Identifier stored on the node (shown by ``repr``).
            data: Indexable collection of rows; the last element of every
                row is stored as that row's label.
        """
        self.threshold = threshold
        self.feature_id = feature_id
        self.node_id = node_id
        # Collect the last entry of each row into a plain Python list.
        last_column = []
        for row_idx in range(len(data)):
            last_column.append(data[row_idx][-1])
        self.labels = last_column

    def create_left(self):
        """Attach a fresh, unconfigured node as the left child."""
        self.left = Node()

    def create_right(self):
        """Attach a fresh, unconfigured node as the right child."""
        self.right = Node()

In [None]:
#%%script pypy3

import csv, random, math as m, queue, numpy as np

class DecisiveTreeFeatureBagging:
    """Binary decision tree whose splits are chosen from random feature subsets.

    Every dataset handled by this class is indexed as a 2-D NumPy array whose
    last column is the class label; the other columns are numeric features.
    At each node a random subset of feature indices is drawn and the
    (feature, threshold) pair with the highest information gain is used.
    """

    # Feature count used when it cannot be derived from a 2-D training set.
    DEFAULT_FEATURE_COUNT = 11

    def __init__(self, file):
        """Create an empty, untrained tree.

        Args:
            file: Dataset path; it is stored but never read by this class.
        """
        self.file = file
        self.train, self.valid, self.test = np.array([]), np.array([]), np.array([])
        self.features, self.nodesCnt, self.depth_threshold = 0, 0, 13
        self.to_preserve, self.to_remove = 0, 1
        self.root = None

    def set_dataset(self, train, valid, test):
        """Store the three data splits and derive the number of features.

        The feature count is the number of columns of ``train`` minus the
        label column; ``DEFAULT_FEATURE_COUNT`` is used when ``train`` is not
        a 2-D array with at least two columns.
        """
        self.train, self.valid, self.test = train, valid, test
        shape = np.shape(train)
        if len(shape) == 2 and shape[1] > 1:
            self.features = shape[1] - 1
        else:
            self.features = self.DEFAULT_FEATURE_COUNT

    def get_distro(self, data):
        """Return the relative frequency of each distinct label in ``data``."""
        labels = data[:, -1]
        counts = np.unique(labels, return_counts=True)[1]
        return np.divide(counts, float(labels.size))

    def compute_entropy(self, data):
        """Return the base-2 entropy of the label column of ``data``.

        Returns 0 when at most one distinct label is present (this includes
        empty input).
        """
        distro = self.get_distro(data)
        if np.count_nonzero(distro) <= 1:
            return 0
        return (-1 * distro * np.log2(distro)).sum()

    def information_gain(self, val, fea_id, entropy, data):
        """Evaluate the split ``data[:, fea_id] <= val``.

        Args:
            val: Candidate threshold.
            fea_id: Column index of the feature being split.
            entropy: Entropy of ``data`` before the split.
            data: 2-D array of samples.

        Returns:
            Tuple ``(gain, smaller, greater)`` where ``smaller`` holds the rows
            with feature value ``<= val`` and ``greater`` the rows ``> val``.
        """
        column = data[:, fea_id]
        smaller = data[column <= val]
        greater = data[column > val]
        total = smaller.shape[0] + greater.shape[0]
        if total == 0:
            # Nothing to split (e.g. empty input): avoid dividing by zero.
            return (0.0, smaller, greater)
        prob_smaller = float(smaller.shape[0]) / total
        prob_greater = float(greater.shape[0]) / total
        s_entropy = self.compute_entropy(smaller)
        g_entropy = self.compute_entropy(greater)
        return (entropy - (prob_smaller * s_entropy + prob_greater * g_entropy), smaller, greater)

    def feature_choice_feature_bagging(self, data, avail_features):
        """Find the best split of ``data`` among the given feature indices.

        Every value occurring in a candidate feature column is tried as a
        threshold. On ties the first candidate encountered wins.

        Returns:
            Tuple ``(split_id, fea_id, smaller, greater)``: the row index whose
            feature value is the chosen threshold, the chosen feature index and
            the two resulting partitions. ``(0, 0, [], [])`` is returned when
            no candidate was evaluated.
        """
        best_gain, best_split, best_feature = -1, 0, 0
        best_smaller, best_greater = [], []
        entropy = self.compute_entropy(data)
        for fea_id in avail_features:
            # A repeated threshold produces exactly the same gain as its first
            # occurrence and therefore can never replace the current best
            # (strict comparison below), so it is safe to skip it.
            tried = set()
            for split_id, val in enumerate(data[:, fea_id]):
                if val in tried:
                    continue
                tried.add(val)
                gain, smaller, greater = self.information_gain(val, fea_id, entropy, data)
                if best_gain < gain:
                    best_gain, best_split, best_feature = gain, split_id, fea_id
                    best_smaller, best_greater = smaller, greater
        return (best_split, best_feature, best_smaller, best_greater)

    def set_root_node(self, split_id, fea_id):
        """Create the root node from the training set and the chosen split."""
        self.root = Node()
        self.root.set_all(self.train[split_id][fea_id], fea_id, self.nodesCnt, self.train)
        self.nodesCnt += 1

    def setup_node(self, data, node, split_id, fea_id):
        """Configure ``node`` with the chosen split of ``data`` and the next node id."""
        node.set_all(data[split_id][fea_id], fea_id, self.nodesCnt, data)
        self.nodesCnt += 1

    def get_available_features(self):
        """Draw ``ceil(sqrt(feature count))`` distinct feature indices at random.

        The feature count is ``self.features`` when it is positive, otherwise
        ``DEFAULT_FEATURE_COUNT``.
        """
        total = self.features if self.features > 0 else self.DEFAULT_FEATURE_COUNT
        sample_to_choose = max(1, int(m.ceil(m.sqrt(total))))
        return np.random.choice(total, sample_to_choose, replace=False)

    def create_tree_feature_bagging(self):
        """Grow the tree breadth-first from ``self.train``.

        Queue entries are ``(parent, side, data, depth)``. A child node is
        only created when its partition is non-empty and its depth does not
        exceed ``self.depth_threshold``.
        """
        # Start from a clean state so that rebuilding the tree is repeatable.
        self.root, self.nodesCnt = None, 0
        avail_features = self.get_available_features()
        split_id, fea_id, s, g = self.feature_choice_feature_bagging(self.train, avail_features)
        self.set_root_node(split_id, fea_id)
        que = queue.Queue()
        que.put((self.root, 'l', s, 1))
        que.put((self.root, 'r', g, 1))
        while not que.empty():
            parent, side, data, depth = que.get()
            if depth > self.depth_threshold or len(data) == 0:
                continue
            if side == 'l':
                parent.create_left()
                node = parent.left
            else:
                parent.create_right()
                node = parent.right
            # A fresh random feature subset is drawn for every node.
            avail_features = self.get_available_features()
            split_id, fea_id, s, g = self.feature_choice_feature_bagging(data, avail_features)
            self.setup_node(data, node, split_id, fea_id)
            que.put((node, 'l', s, depth + 1))
            que.put((node, 'r', g, depth + 1))

    def calculate_accuracy(self, result, predicted_result):
        """Return the fraction of positions where both label arrays agree."""
        return np.sum(result == predicted_result) / float(result.size)

    def set_next_node(self, t, val_fea_id):
        """Return the child of ``t`` to visit for feature value ``val_fea_id``.

        When only one child exists that child is returned regardless of the
        threshold; otherwise values greater than the threshold go right.
        """
        if t.right is None:
            return t.left
        if t.left is None:
            return t.right
        return t.right if val_fea_id > t.threshold else t.left

    def evaluate_one_sample(self, node, sample):
        """Walk from ``node`` down to a leaf and predict the label of ``sample``.

        Returns:
            Tuple ``(true_label, predicted_label)`` as Python ints; the
            prediction is the most frequent label stored in the reached leaf.
        """
        while node.left is not None or node.right is not None:
            node = self.set_next_node(node, sample[node.feature_id])
        # Labels are copied from float data rows, but np.bincount only accepts
        # integers, so cast explicitly before counting.
        leaf_labels = np.asarray(node.labels).astype(int)
        label = np.argmax(np.bincount(leaf_labels))
        return int(sample[-1]), int(label)

    def evaluate_tree(self, test_data):
        """Predict every row of ``test_data`` with this tree.

        Returns:
            Tuple ``(message, accuracy, true_labels, predicted_labels)`` where
            ``accuracy`` is a fraction and both label collections are arrays.
        """
        predicted_result, result = [], []
        for sample in test_data:
            sample_label, pred_label = self.evaluate_one_sample(self.root, sample)
            result.append(sample_label)
            predicted_result.append(pred_label)
        result, predicted_result = np.asarray(result), np.asarray(predicted_result)
        accuracy = self.calculate_accuracy(result, predicted_result)
        return ("Accuracy obtained on test data is %f" % accuracy, accuracy, result, predicted_result)

In [None]:
#%%script pypy3

import csv, random, math as m, queue, numpy as np

class RandomForest:
    """Ensemble of ``DecisiveTreeFeatureBagging`` trees combined by majority vote.

    Each tree is trained on a bootstrap sample (rows drawn with replacement)
    of the training split; predictions are the most frequent label among the
    trees' individual predictions.
    """

    def __init__(self, file):
        """Create an untrained forest.

        Args:
            file: Path of the ``;``-separated dataset file read by
                ``create_dataset``.
        """
        self.file = file
        self.card_forest, self.samples_per_tree = 40, 2200
        # feature_bagging and bags are kept for compatibility; nothing in this
        # class reads them.
        self.feature_bagging = np.array((self.samples_per_tree, self.card_forest))
        self.values, self.train, self.valid, self.test = np.array([]), np.array([]), np.array([]), np.array([])
        self.columns, self.features, self.used_in_tree = [], 0, []
        self.forest = [DecisiveTreeFeatureBagging(file) for _ in range(self.card_forest)]
        self.bags = np.empty([self.card_forest, self.samples_per_tree])

    def create_dataset(self, train_ratio=5, valid_ratio=1, test_ratio=1):
        """Load the dataset file and split it into train/valid/test parts.

        The first row is taken as the header; every other non-empty row is
        converted to floats. Previously loaded values are replaced, so calling
        this method again does not accumulate rows.

        Args:
            train_ratio, valid_ratio, test_ratio: Relative sizes of the splits.

        Raises:
            ValueError: If the file has no header row or a field is not numeric.
        """
        with open(self.file, newline='') as csv_file:
            # The file is ';'-separated; blank lines are ignored.
            rows = [row for row in csv.reader(csv_file, delimiter=';') if row]
        if not rows:
            raise ValueError("Dataset file %r contains no header row" % (self.file,))
        header, records = rows[0], rows[1:]
        self.columns = [str(el.replace('"', '')) for el in header]
        self.features = len(header) - 1
        self.used_in_tree = [False] * self.features
        if records:
            # Build the array once instead of stacking row by row.
            self.values = np.array([[float(el) for el in line] for line in records])
        else:
            self.values = np.empty((0, len(header)))
        self.extract_datasets(train_ratio, valid_ratio, test_ratio)

    def extract_datasets(self, train_ratio, valid_ratio, test_ratio):
        """Shuffle ``self.values`` in place and slice it into the three splits."""
        n = self.values.shape[0]
        all_ratio = train_ratio + test_ratio + valid_ratio
        train_end = int((train_ratio * n) / all_ratio)
        valid_end = int(((train_ratio + valid_ratio) * n) / all_ratio)
        np.random.shuffle(self.values)
        self.train = np.asarray(self.values[:train_end])
        self.valid = np.asarray(self.values[train_end:valid_end])
        self.test = np.asarray(self.values[valid_end:])

    def create_tree(self, tree):
        """Train ``tree`` on ``samples_per_tree`` rows drawn with replacement."""
        row_ids = np.random.choice(self.train.shape[0], self.samples_per_tree, replace=True)
        smaller_train = self.train[row_ids]
        tree.set_dataset(smaller_train, self.valid, self.test)
        tree.create_tree_feature_bagging()

    def create_random_forest(self):
        """Train every tree, reporting the forest's test accuracy after each one."""
        for i in range(self.card_forest):
            print("Creating tree {}".format(i))
            self.create_tree(self.forest[i])
            self.evaluate_random_forest(i + 1)

    def calculate_accuracy(self, result, predicted_result):
        """Return the fraction of positions where both label arrays agree."""
        return np.sum(result == predicted_result) / float(result.size)

    def evaluate_random_forest(self, nr_trees):
        """Evaluate the first ``nr_trees`` trees on the test split and print a report.

        Args:
            nr_trees: Number of trees (from the start of ``self.forest``) that vote.

        Returns:
            Tuple ``(message, accuracy, true_labels, predicted_labels)`` where
            ``accuracy`` is a percentage.

        Raises:
            ValueError: If ``nr_trees`` is not between 1 and the forest size.
        """
        if not 1 <= nr_trees <= len(self.forest):
            raise ValueError("nr_trees must be between 1 and %d, got %r" % (len(self.forest), nr_trees))
        result, pred_result = [], []
        labels = self.test[:, -1]
        unique, counts = np.unique(labels, return_counts=True)
        pred_good, pred_bad = {k: 0 for k in unique}, {k: 0 for k in unique}
        print('\n')
        print("All labels ", dict(zip(unique, counts)))
        voters = self.forest[:nr_trees]
        for sample in self.test:
            # Take the true label from the sample itself rather than from
            # whatever the last tree happened to return.
            real = int(sample[-1])
            votes = [tree.evaluate_one_sample(tree.root, sample)[1] for tree in voters]
            majority = np.argmax(np.bincount(np.asarray(votes)))
            result.append(real)
            pred_result.append(majority)
            if real == majority:
                pred_good[real] += 1
            else:
                pred_bad[real] += 1
        print("Good predictions ", pred_good)
        print("Bad predictions ", pred_bad)
        result, predicted_result = np.asarray(result), np.asarray(pred_result)
        accuracy = self.calculate_accuracy(result, predicted_result) * 100
        print("Accuracy obtained on test data is %f" % accuracy)
        return ("Accuracy obtained on test data is %f" % accuracy, accuracy, result, predicted_result)

In [None]:
#%%script pypy3

import sys

# Entry point: load the dataset, train the forest and print its final accuracy.
if __name__ == '__main__':
    input_file = "dataset/winequality-white.csv"
    # NOTE(review): the next four values are not used in this cell; they are
    # kept in case other cells rely on them.
    results_sum, required = 0.0, 0.500000
    samples, better_than_required = 10, 0
    custom_split = False
    # Split ratios applied only when custom_split is True. They were previously
    # undefined, so enabling custom_split raised NameError.
    train_ratio, valid_ratio, test_ratio = 5, 1, 1

    rf = RandomForest(input_file)
    if custom_split:
        rf.create_dataset(train_ratio, valid_ratio, test_ratio)
    else:
        rf.create_dataset()
    rf.create_random_forest()
    info, accuracy, res_labels, predicted_labels = rf.evaluate_random_forest(rf.card_forest)
    print(info)

