1. Сформировать с помощью sklearn.make_classification датасет из 1000 объектов с двумя признаками, обучить случайный лес из 1, 3, 10 и 50, 100, 200 деревьев и визуализировать их разделяющие гиперплоскости на графиках (по подобию визуализации деревьев из предыдущего урока, необходимо только заменить вызов функции predict на tree_vote). Сделать выводы о получаемой сложности гиперплоскости и недообучении или переобучении случайного леса в зависимости от количества деревьев в нем.
2. (опция). Заменить в реализованном алгоритме проверку с помощью отложенной выборки на Out-of-Bag.

In [3]:
import matplotlib.pyplot as plt
import random

from matplotlib.colors import ListedColormap
from sklearn import datasets
from sklearn.model_selection import train_test_split

import numpy as np

In [4]:
class Node:

    def __init__(self, index, border, true_branch, false_branch):
        self.index = index  
        self.border = border
        self.true_branch = true_branch
        self.false_branch = false_branch


class Leaf:

    def __init__(self, data, labels):
        self.data = np.array(data)
        self.labels = np.array(labels)
        self.prediction = self.predict()
        
    def predict(self):
        classes, labels_cnt = np.unique(self.labels, return_counts=True)
        prediction = classes[labels_cnt == labels_cnt.max()][0]
        return prediction
    
class MyTree:
    
    def __init__(self,
                 max_depth: int=None,
                 min_leaves: int=1,
                 max_leaves: int=None,
                 gini: bool=True,
                 entropy: bool=False
                 ):
        self.max_depth = max_depth # N nodes on tree
        self.min_leaves = max_leaves # at least N leaves on tree
        self.max_leaves = max_leaves
        self.gini = gini
        self.entropy = entropy
        self.n_leaves = 0
        
    def make_split(self, data, labels, index, t):
        left = np.where(data[:, index] <= t)
        right = np.where(data[:, index] > t)
        
        true_data = data[left]
        false_data = data[right]
        true_labels = labels[left]
        false_labels = labels[right]
        return true_data, false_data, true_labels, false_labels
    
    def get_quality(self, left_labels, right_labels, base_crit):
        p = float(left_labels.shape[0]) / (left_labels.shape[0] + right_labels.shape[0])
        if self.gini:
            _criterion = self.get_gini
        else:
            _criterion = self.get_entropy
        return base_crit - p * _criterion(left_labels) - (1 - p) * _criterion(right_labels)
        
    def get_gini(self, labels: np.array):       
        labels = np.array(labels)
        classes, size = np.unique(labels, return_counts=True)
        impurity = 1 - ((size / labels.shape) ** 2).sum()
        return impurity
        
    def get_entropy(self, labels: np.array):
        labels = np.array(labels)
        classes, size = np.unique(labels, return_counts=True)
        p = size / labels.shape
        impurity = - (p * np.log2(p)).sum()
        return impurity
    
    def fit(self, data, labels):
        self.tree = self.build_tree(data, labels)
    
    def build_tree(self, data, labels, depth=0):
        quality, t, index = self.find_best_split(data, labels)

        if quality == 0 or (self.max_depth is not None and depth >= self.max_depth)\
        or (self.max_leaves is not None and self.n_leaves >= self.max_leaves - 1):
            self.n_leaves += 1
            return Leaf(data, labels)

        true_data, false_data, true_labels, false_labels = self.make_split(data, labels, index, t)        
        true_branch = self.build_tree(true_data, true_labels, depth + 1)
        false_branch = self.build_tree(false_data, false_labels, depth + 1)

        return Node(index, t, true_branch, false_branch)
            
    def find_best_split(self, data, labels):
        if self.gini:
            criterion = self.get_gini(labels)
        else:
            criterion = self.get_entropy(labels)
        best_quality = 0
        best_t = None
        best_index = None
        n_features = data.shape[1]

        for index in range(n_features):
            t_values = np.unique(data[:, index])
            for t in t_values:
                true_data, false_data, true_labels, false_labels = self.make_split(data, labels, index, t)
                if min(len(true_data), len(false_data)) < self.min_leaves:
                    continue

                current_quality = self.get_quality(true_labels, false_labels, criterion)

                if current_quality > best_quality:
                    best_quality, best_t, best_index = current_quality, t, index

        return best_quality, best_t, best_index
        
    def find_leves(self, example, node):
        if isinstance(node, Leaf):
            return node.prediction            

        if example[node.index] <= node.border:
            return self.find_leves(example, node.true_branch)
        
        else:
            return self.find_leves(example, node.false_branch)
    
    def predict(self, X):        
        predictions = []
        for example in X:
            prediction = self.find_leves(example, self.tree)
            predictions.append(prediction)
        return predictions    

In [5]:
def accuracy(actual, predicted):
    return (actual == predicted).sum() / actual.shape[0]

In [216]:
class MyForest:
    
    def __init__(self, 
                 n_estimators: int=10,
                 max_depth: int=5,
                 min_leaves: int=1,
                 max_leaves: int=6,
                 gini: bool=True,
                 entropy: bool=False):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.min_leaves = min_leaves
        self.max_leaves = max_leaves
        self.gini = gini
        self.entropy = entropy
        
    def get_subsample(self, len_sample):
    # будем сохранять не сами признаки, а их индексы
        sample_indexes = [i for i in range(len_sample)]

        len_subsample = int(np.sqrt(len_sample))
        subsample = []

        random.shuffle(sample_indexes)
        for _ in range(len_subsample):
            subsample.append(sample_indexes.pop())

        return subsample
        
    def get_bootstrap(self, data, labels, N):
        n_samples = data.shape[0]
        bootstrap = []

        for i in range(N):
            b_data = np.zeros(data.shape)
            b_labels = np.zeros(labels.shape)
            for j in range(n_samples):
                sample_index = random.randint(0, n_samples-1)
                b_data[j] = data[sample_index]
                b_labels[j] = labels[sample_index]            
            bootstrap.append((b_data, b_labels))

        return bootstrap
        
    def fit(self, data, labels):
        forest = []
        bootstrap = self.get_bootstrap(data, labels, self.n_estimators)
        
        for b_data, b_labels in bootstrap:

            tree = MyTree(max_depth=self.max_depth, max_leaves=self.max_leaves, gini=True, entropy=False)
            tree.fit(b_data, b_labels)
            
            forest.append(tree)
        
        self.forest = forest
    
    def predict(self, X):
        prediction = []
        for tree in self.forest:
            sub_result = tree.predict(X)
            prediction.append(sub_result)
               
        return np.round(np.nanmean(np.array(prediction).T, axis=1))

In [215]:
data, labels = datasets.make_classification(
    n_samples = 1000,
    n_features = 3,
    n_informative = 2, 
    n_classes = 2,
    n_redundant=0,
    n_clusters_per_class=1,
    random_state=5
)

X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.3)
n_trees = [1, 3, 10, 50, 100, 200]

for forest in n_trees:
    model = MyForest(n_estimators=forest)
    model.fit(X_train, y_train)
    pred_train = model.predict(X_train)
    pred_test = model.predict(X_test)
    print(f'FOREST of <{forest}> trees accuracy:\ntrain: {accuracy(y_train, pred_train):.2f}\ntest: {accuracy(y_test, pred_test):.2f}\n')

FOREST of <1> trees accuracy:
train: 0.90
test: 0.89

FOREST of <3> trees accuracy:
train: 0.92
test: 0.92

FOREST of <10> trees accuracy:
train: 0.91
test: 0.91

FOREST of <50> trees accuracy:
train: 0.91
test: 0.91

FOREST of <100> trees accuracy:
train: 0.92
test: 0.91

FOREST of <200> trees accuracy:
train: 0.91
test: 0.91



Вывод: лес получился не переобученный, с рочтом количества деревьев растет качество классификации.