# Random Forest - Classification (from Scratch)

**Import Libraries**

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.datasets import load_iris, load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, auc, roc_curve

**Code**

In [2]:
class Node:
    """Internal tree node holding a split rule and its two child subtrees.

    Attributes are stored exactly as passed in:
    ``feature_index`` and ``threshold`` describe the split, ``left`` and
    ``right`` are the child nodes.
    """

    def __init__(self, feature_index, threshold, left, right):
        # Split rule: which feature to test and the value it is compared with.
        self.feature_index, self.threshold = feature_index, threshold
        # Child subtrees reached from this node.
        self.left, self.right = left, right

In [3]:
class LeafNode:
    """Terminal tree node that remembers the label distribution it was built from.

    ``labels`` holds the sorted distinct values of ``y`` and ``counts`` the
    number of occurrences of each, in the same order.
    """

    def __init__(self, y):
        distinct, tallies = np.unique(y, return_counts=True)
        self.labels = distinct
        self.counts = tallies

    def predicted_class(self):
        """Return the most frequent label (the first one in sorted order on ties)."""
        winner = self.counts.argmax()
        return self.labels[winner]

In [4]:
class DecisionTree:
    """Binary classification tree grown by exhaustive threshold search.

    At every node each observed value of each feature is tried as a
    ``feature <= threshold`` split, and the split with the largest impurity
    reduction is kept.

    Parameters
    ----------
    max_depth : int or None, default None
        Maximum depth of the tree. ``None`` means the depth is unlimited.
    min_samples_split : int, default 2
        A node holding fewer samples than this becomes a leaf.
    criterion : {"gini", "entropy"}, default "gini"
        Impurity measure used to score splits.

    Attributes
    ----------
    tree : Node or LeafNode or None
        Root of the fitted tree; ``None`` before ``fit`` is called.
    classes_ : numpy.ndarray or None
        Sorted distinct labels seen by ``fit``; they define the column order
        of ``predict_proba``.
    """

    def __init__(self, max_depth=None, min_samples_split=2, criterion="gini"):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.criterion = criterion
        self.tree = None
        self.classes_ = None

    def fit(self, X, y):
        """Grow the tree on feature matrix ``X`` (2-D) and label vector ``y``."""
        X = np.asarray(X)
        y = np.asarray(y)
        # Remember the label set so predict_proba does not have to look at
        # any variable outside the model.
        self.classes_ = np.unique(y)
        self.tree = self._build_tree(X, y)

    def _build_tree(self, X, y, depth=0):
        """Recursively build the subtree for the samples ``X``/``y`` at ``depth``."""
        num_samples, num_features = X.shape
        unique_classes = np.unique(y)

        # max_depth=None means "no limit"; comparing an int with None would
        # raise TypeError, so the depth test is only made when a limit is set.
        depth_exhausted = self.max_depth is not None and depth >= self.max_depth

        # Check stopping criteria
        if len(unique_classes) == 1 or depth_exhausted or num_samples < self.min_samples_split:
            return LeafNode(y)

        # Find the best split
        best_split = self._best_split(X, y, num_features)
        if best_split is None:
            # No threshold separates the samples (all rows identical).
            return LeafNode(y)

        left_indices = best_split['indices_left']
        right_indices = best_split['indices_right']

        left_subtree = self._build_tree(X[left_indices], y[left_indices], depth + 1)
        right_subtree = self._build_tree(X[right_indices], y[right_indices], depth + 1)

        return Node(best_split['feature_index'], best_split['threshold'], left_subtree, right_subtree)

    def _best_split(self, X, y, num_features):
        """Return the split with the highest gain as a dict, or ``None``.

        The dict has the keys ``feature_index``, ``threshold``,
        ``indices_left`` and ``indices_right``. Only splits that leave both
        sides non-empty are considered.
        """
        best_gain = -1
        best_split = None

        for feature_index in range(num_features):
            column = X[:, feature_index]  # hoisted: reused for every threshold
            # The largest observed value can never be a useful threshold
            # (nothing is strictly greater than it), so it is skipped.
            thresholds = np.unique(column)[:-1]
            for threshold in thresholds:
                indices_left = np.where(column <= threshold)[0]
                indices_right = np.where(column > threshold)[0]

                if len(indices_left) > 0 and len(indices_right) > 0:
                    gain = self._information_gain(y, indices_left, indices_right)
                    if gain > best_gain:
                        best_gain = gain
                        best_split = {
                            'feature_index': feature_index,
                            'threshold': threshold,
                            'indices_left': indices_left,
                            'indices_right': indices_right
                        }
        return best_split

    def _information_gain(self, y, left_indices, right_indices):
        """Impurity of ``y`` minus the size-weighted impurity of the two children."""
        impurity_before = self._impurity(y)
        impurity_left = self._impurity(y[left_indices])
        impurity_right = self._impurity(y[right_indices])

        weighted_impurity = (len(left_indices) / len(y)) * impurity_left + (len(right_indices) / len(y)) * impurity_right
        return impurity_before - weighted_impurity

    def _impurity(self, y):
        """Dispatch to the impurity measure named by ``self.criterion``.

        Raises
        ------
        ValueError
            If ``self.criterion`` is neither ``"gini"`` nor ``"entropy"``.
        """
        if self.criterion == "gini":
            return self._gini_impurity(y)
        elif self.criterion == "entropy":
            return self._entropy_impurity(y)
        else:
            raise ValueError(f"Unknown criterion: {self.criterion}")

    def _gini_impurity(self, y):
        """Gini impurity: one minus the sum of squared class frequencies."""
        _, counts = np.unique(y, return_counts=True)
        probabilities = counts / len(y)
        return 1 - np.sum(probabilities ** 2)

    def _entropy_impurity(self, y):
        """Shannon entropy (base 2) of the class frequencies in ``y``."""
        _, counts = np.unique(y, return_counts=True)
        probabilities = counts / len(y)
        return -np.sum(probabilities * np.log2(probabilities + 1e-15))  # Adding a small value to prevent log(0)

    def _require_fitted(self):
        """Raise a clear error when the model is used before ``fit``."""
        if self.tree is None:
            raise ValueError("DecisionTree is not fitted yet; call fit() first.")

    def predict(self, X):
        """Return an array with the predicted label for every row of ``X``."""
        self._require_fitted()
        return np.array([self._traverse_tree(x, self.tree) for x in np.asarray(X)])

    def _traverse_tree(self, x, tree):
        """Follow the split rules from ``tree`` down to a leaf; return its majority label."""
        if isinstance(tree, LeafNode):
            return tree.predicted_class()
        else:
            if x[tree.feature_index] <= tree.threshold:
                return self._traverse_tree(x, tree.left)
            else:
                return self._traverse_tree(x, tree.right)

    def predict_proba(self, X):
        """Return per-class probabilities for every row of ``X``.

        The result has shape ``(n_samples, len(self.classes_))``. Column ``j``
        is the share of training samples with label ``self.classes_[j]`` in
        the leaf that the row falls into.
        """
        self._require_fitted()
        X = np.asarray(X)
        # Labels are mapped to columns explicitly, so they need not be the
        # integers 0..k-1 and no outside variable is consulted.
        column_of = {label: column for column, label in enumerate(self.classes_)}
        probabilities = np.zeros((X.shape[0], len(self.classes_)))
        for i, x in enumerate(X):
            class_count = self._traverse_tree_proba(x, self.tree)
            total = sum(class_count.values())
            for label, count in class_count.items():
                probabilities[i, column_of[label]] = count / total  # Probability for each class
        return probabilities

    def _traverse_tree_proba(self, x, tree):
        """Follow the split rules down to a leaf; return its ``{label: count}`` mapping."""
        if isinstance(tree, LeafNode):
            return {label: count for label, count in zip(tree.labels, tree.counts)}
        else:
            if x[tree.feature_index] <= tree.threshold:
                return self._traverse_tree_proba(x, tree.left)
            else:
                return self._traverse_tree_proba(x, tree.right)

In [5]:
class RandomForest:
    """Bagged ensemble of ``DecisionTree`` classifiers.

    Each tree is trained on a bootstrap sample of the training rows (drawn
    with replacement, same size as the training set). Class predictions are a
    majority vote over the trees; class probabilities are the mean of the
    per-tree probabilities.

    Parameters
    ----------
    n_trees : int, default 5
        Number of trees to train.
    max_depth, min_samples_split, criterion
        Forwarded unchanged to every ``DecisionTree``.
    loss_function : {"binary_cross_entropy", "categorical_cross_entropy"}
        Loss reported by ``evaluate``.
    random_state : int or None, default None
        Seed for the bootstrap sampling. With ``None`` the global
        ``np.random`` state is used.
    """

    def __init__(self, n_trees=5, max_depth=10, min_samples_split=2, criterion="gini",
                 loss_function="categorical_cross_entropy", random_state=None):
        self.n_trees = n_trees
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.criterion = criterion
        self.loss_function = loss_function
        self.random_state = random_state
        self.trees = []

    def fit(self, X, y):
        """Train ``n_trees`` trees, each on its own bootstrap sample of ``X``/``y``."""
        X = np.asarray(X)
        y = np.asarray(y)
        # np.random and RandomState expose the same `choice`, so unseeded
        # callers keep drawing from the global stream.
        rng = np.random if self.random_state is None else np.random.RandomState(self.random_state)
        n_samples = len(X)

        # Build into a fresh list so that calling fit again replaces the
        # previous ensemble instead of silently growing it.
        trees = []
        for _ in range(self.n_trees):
            tree = DecisionTree(max_depth=self.max_depth, min_samples_split=self.min_samples_split, criterion=self.criterion)
            # Bootstrap sampling
            sample_indices = rng.choice(n_samples, n_samples, replace=True)
            tree.fit(X[sample_indices], y[sample_indices])
            trees.append(tree)
        self.trees = trees

    def predict(self, X):
        """Return a list with the majority-vote label for every row of ``X``."""
        # Rows are trees and columns are samples; transpose to vote per sample.
        tree_predictions = np.array([tree.predict(X) for tree in self.trees])
        return [Counter(tree_preds).most_common(1)[0][0] for tree_preds in tree_predictions.T]

    def predict_proba(self, X):
        """Return the per-class probabilities averaged over all trees."""
        probabilities = np.array([tree.predict_proba(X) for tree in self.trees])
        return np.mean(probabilities, axis=0)

    def evaluate(self, X, y):
        """Return ``(accuracy, loss)`` of the forest on ``X`` with true labels ``y``."""
        # Compare as arrays: a list-to-list `==` would yield one bool, not an
        # element-wise comparison.
        predictions = np.asarray(self.predict(X))
        y = np.asarray(y)
        accuracy = np.mean(predictions == y)
        loss = self._loss_function(y, self.predict_proba(X))
        return accuracy, loss

    def _loss_function(self, y_true, y_pred):
        """Dispatch to the loss named by ``self.loss_function``.

        Raises
        ------
        ValueError
            If ``self.loss_function`` is not a supported name.
        """
        if self.loss_function == "binary_cross_entropy":
            return self._binary_cross_entropy(y_true, y_pred)
        elif self.loss_function == "categorical_cross_entropy":
            return self._categorical_cross_entropy(y_true, y_pred)
        else:
            raise ValueError(f"Unknown loss function: {self.loss_function}")

    def _binary_cross_entropy(self, y_true, y_pred):
        """Mean binary cross-entropy for labels in {0, 1}.

        ``y_pred`` may be a vector of positive-class probabilities or the
        two-column matrix produced by ``predict_proba``; in the latter case
        column 1 is used as the positive-class probability.

        Raises
        ------
        ValueError
            If ``y_pred`` is 2-D but does not have exactly two columns.
        """
        epsilon = 1e-15
        y_true = np.asarray(y_true)
        y_pred = np.asarray(y_pred)
        if y_pred.ndim == 2:
            if y_pred.shape[1] != 2:
                raise ValueError("binary_cross_entropy expects probabilities for exactly two classes")
            y_pred = y_pred[:, 1]
        # Clip predictions to prevent log(0)
        y_pred = np.clip(y_pred, epsilon, 1 - epsilon)
        return -np.mean(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))

    def _categorical_cross_entropy(self, y_true, y_pred):
        """Mean negative log-probability assigned to the true class.

        ``y_true`` must contain integer labels that index the columns of
        ``y_pred``.
        """
        epsilon = 1e-15
        y_true = np.asarray(y_true)
        y_pred = np.clip(y_pred, epsilon, 1 - epsilon)
        # Select the true-class probability directly. This equals the one-hot
        # formulation but does not derive the class count from max(y_true),
        # which fails when the evaluated labels lack the highest class.
        true_class_proba = y_pred[np.arange(len(y_true)), y_true]
        return -np.mean(np.log(true_class_proba))

**Load Dataset**

In [6]:
# Load Iris dataset: feature matrix X and class-label vector y
data = load_iris()
X, y = data.data, data.target

# One print per statement: `print(a), print(b)` is a tuple expression, so as
# the last line of a cell it echoes a stray `(None, None)` result.
print(type(X))
print(type(y))
print(X.shape)
print(y.shape)

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(150, 4)
(150,)


(None, None)

In [7]:
# Peek at the first five feature rows, then the first five labels
print(X[:5], y[:5], sep="\n")

[[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]]
[0 0 0 0 0]


In [8]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
# Show the shape of every part as the cell result
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((120, 4), (30, 4), (120,), (30,))

In [9]:
# Train
# Seed NumPy's global generator first: the forest draws its bootstrap samples
# from np.random, so without a seed every run trains different trees and the
# metrics below cannot be reproduced.
np.random.seed(42)
forest = RandomForest(n_trees=5, max_depth=10, min_samples_split=2, criterion="gini", loss_function="categorical_cross_entropy")
forest.fit(X_train, y_train)

In [10]:
# Majority-vote class label for every held-out sample
predictions = forest.predict(X=X_test)
print("Predicted classes:", predictions)

Predicted classes: [1, 0, 2, 1, 1, 0, 1, 2, 1, 1, 2, 0, 0, 0, 0, 1, 2, 1, 1, 2, 0, 2, 0, 2, 2, 2, 2, 2, 0, 0]


In [11]:
# Accuracy and cross-entropy loss of the Gini-based forest on the test split
accuracy, loss = forest.evaluate(X=X_test, y=y_test)
print(
    f"Gini Accuracy: {accuracy * 100:.2f}%",
    f"Gini Loss: {loss:.4f}",
    sep="\n",
)

Gini Accuracy: 100.00%
Gini Loss: 0.0223


In [12]:
# Cross-check the accuracy with scikit-learn
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
print(f"Random Forest Accuracy: {accuracy}")

# Confusion matrix followed by per-class precision / recall / F1
print("Confusion Matrix:\n", confusion_matrix(y_true=y_test, y_pred=predictions))
print("\nClassification Report:\n", classification_report(y_true=y_test, y_pred=predictions))

Random Forest Accuracy: 1.0
Confusion Matrix:
 [[10  0  0]
 [ 0  9  0]
 [ 0  0 11]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00         9
           2       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30

