# **AdaBoost Random Forest - Classification (Scratch)**

**Import Libraries**

In [1]:
import numpy as np
import pandas as pd
from pandas import DataFrame
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from sklearn.datasets import load_diabetes, load_iris, fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.preprocessing import MaxAbsScaler, PowerTransformer
from sklearn.linear_model import LinearRegression, LogisticRegression, RidgeClassifier, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier, AdaBoostRegressor
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.ensemble import BaggingClassifier, BaggingRegressor
from sklearn.ensemble import VotingClassifier, VotingRegressor
from sklearn.ensemble import StackingClassifier, StackingRegressor
from sklearn.ensemble import HistGradientBoostingClassifier, HistGradientBoostingRegressor
from sklearn.ensemble import ExtraTreesClassifier, ExtraTreesRegressor
from sklearn.ensemble import IsolationForest
from sklearn.svm import SVC, SVR
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.metrics import precision_score, recall_score, f1_score, explained_variance_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc

**Code**

In [2]:
class Node:
    """Internal (decision) node of a binary tree.

    Holds the split rule (``feature_index``, ``threshold``) together with the
    two child subtrees. The constructor stores its arguments unchanged.
    """

    def __init__(self, feature_index, threshold, left, right):
        # Split rule: which feature column is tested and the cut-off value.
        self.feature_index, self.threshold = feature_index, threshold
        # Child subtrees reached on either side of the split.
        self.left, self.right = left, right

In [3]:
class LeafNode:
    """Terminal node that records the class distribution of its samples."""

    def __init__(self, y):
        # np.unique returns the distinct labels in sorted order together
        # with how many times each one occurs in y.
        distinct, tallies = np.unique(y, return_counts=True)
        self.labels = distinct
        self.counts = tallies

    def predicted_class(self):
        """Return the most frequent label (first maximum wins on ties)."""
        winner = self.counts.argmax()
        return self.labels[winner]

In [4]:
class DecisionTree:
    """Classification tree grown greedily by exhaustive threshold search.

    At every node each observed value of each feature is tried as a
    ``<= threshold`` split, and the split with the largest impurity
    reduction is kept.

    Parameters
    ----------
    max_depth : int or None, default=None
        Maximum depth of the tree. ``None`` means no depth limit: the tree
        grows until a node is pure, has fewer than ``min_samples_split``
        samples, or cannot be split any further.
    min_samples_split : int, default=2
        A node holding fewer samples than this becomes a leaf.
    criterion : {"gini", "entropy"}, default="gini"
        Impurity measure used to score candidate splits.

    Attributes
    ----------
    tree : Node or LeafNode or None
        Root of the fitted tree; ``None`` until ``fit`` has been called.
    """

    def __init__(self, max_depth=None, min_samples_split=2, criterion="gini"):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.criterion = criterion
        self.tree = None

    def fit(self, X, y):
        """Grow the tree from a 2-D feature matrix ``X`` and 1-D labels ``y``."""
        # Positional/fancy indexing is used below, so coerce lists and
        # DataFrames/Series to plain ndarrays first.
        X = np.asarray(X)
        y = np.asarray(y)
        self.tree = self._build_tree(X, y)

    def _build_tree(self, X, y, depth=0):
        """Recursively build the subtree for the samples ``(X, y)``."""
        num_samples, num_features = X.shape

        # max_depth=None (the default) means "unlimited". Comparing an int
        # with None raises TypeError, so the None case is handled explicitly.
        depth_exhausted = self.max_depth is not None and depth >= self.max_depth
        if len(np.unique(y)) == 1 or depth_exhausted or num_samples < self.min_samples_split:
            return LeafNode(y)

        best_gain = -1
        best_feature_index = None
        best_threshold = None
        best_left_indices = None
        best_right_indices = None

        for feature_index in range(num_features):
            column = X[:, feature_index]
            # np.unique is sorted; the largest value can never split the node
            # (nothing is strictly greater than it), so it is skipped.
            for threshold in np.unique(column)[:-1]:
                indices_left = np.where(column <= threshold)[0]
                indices_right = np.where(column > threshold)[0]

                # Guard kept for values such as NaN that fall on neither side.
                if len(indices_left) > 0 and len(indices_right) > 0:
                    gain = self._information_gain(y, indices_left, indices_right)
                    if gain > best_gain:
                        best_gain = gain
                        best_feature_index = feature_index
                        best_threshold = threshold
                        best_left_indices = indices_left
                        best_right_indices = indices_right

        # No feature offered a valid split (e.g. all rows are identical).
        if best_feature_index is None:
            return LeafNode(y)

        left_subtree = self._build_tree(X[best_left_indices], y[best_left_indices], depth + 1)
        right_subtree = self._build_tree(X[best_right_indices], y[best_right_indices], depth + 1)

        return Node(best_feature_index, best_threshold, left_subtree, right_subtree)

    def _information_gain(self, y, left_indices, right_indices):
        """Impurity of ``y`` minus the size-weighted impurity of the two children."""
        impurity_before = self._impurity(y)
        impurity_left = self._impurity(y[left_indices])
        impurity_right = self._impurity(y[right_indices])

        weighted_impurity = (len(left_indices) / len(y)) * impurity_left + (len(right_indices) / len(y)) * impurity_right
        return impurity_before - weighted_impurity

    def _impurity(self, y):
        """Dispatch to the impurity measure selected by ``self.criterion``.

        Raises
        ------
        ValueError
            If ``self.criterion`` is neither "gini" nor "entropy".
        """
        if self.criterion == "gini":
            return self._gini_impurity(y)
        elif self.criterion == "entropy":
            return self._entropy_impurity(y)
        else:
            raise ValueError(f"Unknown criterion: {self.criterion}")

    def _gini_impurity(self, y):
        """Gini impurity: 1 - sum(p_k ** 2) over the class frequencies of ``y``."""
        classes, counts = np.unique(y, return_counts=True)
        probabilities = counts / np.sum(counts)
        return 1 - np.sum(probabilities ** 2)

    def _entropy_impurity(self, y):
        """Shannon entropy (base 2) of the class frequencies of ``y``."""
        classes, counts = np.unique(y, return_counts=True)
        probabilities = counts / np.sum(counts)
        # The small constant keeps log2 away from an exact zero argument.
        return -np.sum(probabilities * np.log2(probabilities + 1e-15))

    def predict(self, X):
        """Return an array with one predicted label per row of ``X``."""
        return np.array([self._traverse_tree(x, self.tree) for x in np.asarray(X)])

    def _traverse_tree(self, x, tree):
        """Route one sample ``x`` from ``tree`` down to a leaf and return its label."""
        # Iterative descent: an unlimited-depth tree cannot exhaust the
        # Python recursion limit at prediction time.
        node = tree
        while not isinstance(node, LeafNode):
            if x[node.feature_index] <= node.threshold:
                node = node.left
            else:
                node = node.right
        return node.predicted_class()

In [5]:
class RandomForest:
    """Bagged ensemble of ``DecisionTree`` classifiers with majority voting.

    Every tree is trained on a bootstrap sample (same size as the training
    set, drawn with replacement) and the forest predicts the label chosen by
    most trees.

    Parameters
    ----------
    n_trees : int, default=10
        Number of trees in the forest.
    max_depth : int or None, default=3
        Passed to every ``DecisionTree``.
    min_samples_split : int, default=2
        Passed to every ``DecisionTree``.
    criterion : {"gini", "entropy"}, default="gini"
        Passed to every ``DecisionTree``.
    random_state : int or None, default=None
        Seed for the bootstrap sampling. ``None`` draws from NumPy's global
        random state, so ``np.random.seed`` still controls the result.

    Attributes
    ----------
    trees : list
        The fitted trees; replaced on every call to ``fit``.
    """

    def __init__(self, n_trees=10, max_depth=3, min_samples_split=2, criterion="gini", random_state=None):
        self.n_trees = n_trees
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.criterion = criterion
        self.random_state = random_state
        self.trees = []

    def fit(self, X, y):
        """Train ``n_trees`` trees, each on its own bootstrap sample of ``(X, y)``."""
        X = np.asarray(X)
        y = np.asarray(y)
        n_samples = X.shape[0]

        # A dedicated generator when seeded; otherwise the global state.
        rng = np.random if self.random_state is None else np.random.RandomState(self.random_state)

        # Build into a fresh list so refitting replaces the ensemble instead
        # of silently growing it beyond n_trees.
        trees = []
        for _ in range(self.n_trees):
            # Bootstrap sampling
            indices = rng.choice(n_samples, size=n_samples, replace=True)

            tree = DecisionTree(max_depth=self.max_depth, min_samples_split=self.min_samples_split, criterion=self.criterion)
            tree.fit(X[indices], y[indices])
            trees.append(tree)
        self.trees = trees

    @staticmethod
    def _majority_vote(votes):
        """Return the most common value in ``votes`` (smallest label on ties)."""
        # np.unique works for any label type, including negative integers and
        # strings, which np.bincount cannot handle.
        labels, counts = np.unique(votes, return_counts=True)
        return labels[np.argmax(counts)]

    def predict(self, X):
        """Return the majority-vote label for every row of ``X``.

        Raises
        ------
        ValueError
            If the forest has not been fitted.
        """
        if not self.trees:
            raise ValueError("RandomForest is not fitted yet; call fit() before predict().")
        # Shape (n_trees, n_samples); each column holds the votes for one sample.
        tree_predictions = np.array([tree.predict(X) for tree in self.trees])
        return np.array([self._majority_vote(votes) for votes in tree_predictions.T])

In [6]:
class AdaBoost:
    """Multi-class AdaBoost (SAMME) using ``RandomForest`` as the base learner.

    Forests are trained one after another. Each training sample carries a
    weight; after every round the samples the latest forest got wrong are
    up-weighted so the next forest concentrates on them. Because
    ``RandomForest`` takes no sample weights, the weights are applied by
    weighted resampling of the training set. The final prediction is a vote
    in which each forest counts with its own weight (``forest_weights``).

    Parameters
    ----------
    n_forests : int, default=5
        Maximum number of boosting rounds (forests).
    n_trees : int, default=10
        Number of trees in every forest.

    Attributes
    ----------
    forests : list
        The fitted forests. May hold fewer than ``n_forests`` entries when
        boosting stops early (perfect fit, or a forest no better than chance).
    forest_weights : list of float
        Vote weight (alpha) of each forest, aligned with ``forests``.
    classes_ : ndarray or None
        Sorted distinct training labels; ``None`` until ``fit`` is called.

    Notes
    -----
    Resampling uses NumPy's global random state; call ``np.random.seed``
    beforehand for reproducible results.
    """

    def __init__(self, n_forests=5, n_trees=10):
        self.n_forests = n_forests
        self.n_trees = n_trees
        self.forests = []
        self.forest_weights = []
        self.classes_ = None

    def fit(self, X, y):
        """Run up to ``n_forests`` boosting rounds on ``(X, y)``."""
        X = np.asarray(X)
        y = np.asarray(y)
        n_samples = X.shape[0]

        self.classes_ = np.unique(y)
        n_classes = len(self.classes_)

        eps = 1e-10  # floor for the error so the log below stays finite
        # SAMME's multi-class correction; max() keeps it finite for one class.
        class_bonus = np.log(max(n_classes - 1, 1))
        # Weighted error of random guessing among n_classes labels.
        chance_error = 1.0 - 1.0 / n_classes

        # Build into fresh lists so refitting replaces the previous ensemble.
        forests, alphas = [], []
        sample_weights = np.full(n_samples, 1.0 / n_samples)

        for _ in range(self.n_forests):
            # Emphasise hard samples by drawing them more often.
            indices = np.random.choice(n_samples, size=n_samples, replace=True, p=sample_weights)
            forest = RandomForest(n_trees=self.n_trees)
            forest.fit(X[indices], y[indices])

            # Weighted training error on the full (un-resampled) training set.
            missed = np.asarray(forest.predict(X)) != y
            error = float(np.sum(sample_weights[missed]))

            if n_classes > 1 and error >= chance_error:
                # No better than guessing: its alpha would be <= 0, so stop.
                # Keep it only if it is the sole model, so predict() works.
                if not forests:
                    forests.append(forest)
                    alphas.append(1.0)
                break

            alpha = np.log((1.0 - error) / max(error, eps)) + class_bonus
            forests.append(forest)
            alphas.append(float(alpha))

            if error <= 0:
                # Perfect on the training set: no weights would change.
                break

            # Up-weight the misclassified samples and renormalise.
            sample_weights = sample_weights * np.exp(alpha * missed)
            sample_weights = sample_weights / sample_weights.sum()

        self.forests = forests
        self.forest_weights = alphas

    def predict(self, X):
        """Return the alpha-weighted vote of all forests for every row of ``X``.

        Raises
        ------
        ValueError
            If the model has not been fitted.
        """
        if not self.forests or self.classes_ is None:
            raise ValueError("AdaBoost is not fitted yet; call fit() before predict().")

        X = np.asarray(X)
        # scores[i, k] accumulates the weight of forests voting class k for sample i.
        scores = np.zeros((X.shape[0], len(self.classes_)))
        for alpha, forest in zip(self.forest_weights, self.forests):
            votes = np.asarray(forest.predict(X))
            scores += alpha * (votes[:, None] == self.classes_[None, :])
        # argmax returns the first maximum, so ties go to the smallest label.
        return self.classes_[np.argmax(scores, axis=1)]

**Load Dataset**

In [7]:
# Load the Iris dataset bundled with scikit-learn.
# X is the feature matrix (data.data) and y the class labels (data.target).
data = load_iris()
X, y = data.data, data.target

In [8]:
# Split the dataset into training and testing sets:
# 20% of the samples are held out for testing (test_size=0.2), and
# random_state=42 fixes the shuffle so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
# Train AdaBoost with Random Forests on the data.
# The ensemble's sampling draws from NumPy's global random state, so seed it
# here: re-running this cell (or Restart & Run All) then reproduces the same
# model and the same metrics below.
np.random.seed(42)
adaboost = AdaBoost(n_forests=5, n_trees=10)
adaboost.fit(X_train, y_train)

In [10]:
# Predict class labels for the held-out test set and preview the first five.
predictions = adaboost.predict(X_test)
print(predictions[:5])

[1 0 2 1 1]


In [11]:
# Evaluate: accuracy of the predictions against the held-out test labels.
accuracy = accuracy_score(y_test, predictions)
print(f"AdaBoost with Random Forest Accuracy: {accuracy}")

AdaBoost with Random Forest Accuracy: 1.0


In [12]:
# Confusion matrix and per-class classification report for the test-set
# predictions (true labels first, predicted labels second).
print("Confusion Matrix:\n", confusion_matrix(y_test, predictions))
print("\nClassification Report:\n", classification_report(y_test, predictions))

Confusion Matrix:
 [[10  0  0]
 [ 0  9  0]
 [ 0  0 11]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00         9
           2       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30



In [13]:
# Print the configured ensemble size. Both values are the constructor
# arguments stored on the object (n_forests, n_trees), not counts taken
# from the fitted models.
print(f"Total number of forests: {adaboost.n_forests}")
print(f"Number of trees in each forest: {adaboost.n_trees}")

Total number of forests: 5
Number of trees in each forest: 10
