#**AdaBoost Random Forest - Classification (Scratch)**

**Import Libraries**

In [1]:
import numpy as np
import pandas as pd
from pandas import DataFrame
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from sklearn.datasets import load_diabetes, load_iris, fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.preprocessing import MaxAbsScaler, PowerTransformer
from sklearn.linear_model import LinearRegression, LogisticRegression, RidgeClassifier, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier, AdaBoostRegressor
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.ensemble import BaggingClassifier, BaggingRegressor
from sklearn.ensemble import VotingClassifier, VotingRegressor
from sklearn.ensemble import StackingClassifier, StackingRegressor
from sklearn.ensemble import HistGradientBoostingClassifier, HistGradientBoostingRegressor
from sklearn.ensemble import ExtraTreesClassifier, ExtraTreesRegressor
from sklearn.ensemble import IsolationForest
from sklearn.svm import SVC, SVR
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.metrics import precision_score, recall_score, f1_score, explained_variance_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc

**Code**

In [2]:
class Node:
    def __init__(self, feature_index, threshold, left, right):
        self.feature_index = feature_index
        self.threshold = threshold
        self.left = left
        self.right = right

In [3]:
class LeafNode:
    def __init__(self, y):
        self.labels, self.counts = np.unique(y, return_counts=True)

    def predicted_class(self):
        return self.labels[np.argmax(self.counts)]

In [4]:
class DecisionTree:
    def __init__(self, max_depth=None, min_samples_split=2, criterion="gini"):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.criterion = criterion
        self.tree = None

    def fit(self, X, y, sample_weights=None):
        self.tree = self._build_tree(X, y, sample_weights=sample_weights)

    def _build_tree(self, X, y, depth=0, sample_weights=None):
        num_samples, num_features = X.shape
        unique_classes = np.unique(y)

        if len(unique_classes) == 1 or depth >= self.max_depth or num_samples < self.min_samples_split:
            return LeafNode(y)

        best_split = self._best_split(X, y, num_features, sample_weights)
        if best_split is None:
            return LeafNode(y)

        left_indices = best_split['indices_left']
        right_indices = best_split['indices_right']

        left_subtree = self._build_tree(X[left_indices], y[left_indices], depth + 1, sample_weights[left_indices])
        right_subtree = self._build_tree(X[right_indices], y[right_indices], depth + 1, sample_weights[right_indices])

        return Node(best_split['feature_index'], best_split['threshold'], left_subtree, right_subtree)

    def _best_split(self, X, y, num_features, sample_weights=None):
        best_gain = -1
        best_split = None

        for feature_index in range(num_features):
            thresholds = np.unique(X[:, feature_index])
            for threshold in thresholds:
                indices_left = np.where(X[:, feature_index] <= threshold)[0]
                indices_right = np.where(X[:, feature_index] > threshold)[0]

                if len(indices_left) > 0 and len(indices_right) > 0:
                    gain = self._information_gain(y, indices_left, indices_right, sample_weights)
                    if gain > best_gain:
                        best_gain = gain
                        best_split = {
                            'feature_index': feature_index,
                            'threshold': threshold,
                            'indices_left': indices_left,
                            'indices_right': indices_right
                        }
        return best_split

    def _information_gain(self, y, left_indices, right_indices, sample_weights=None):
        impurity_before = self._impurity(y, sample_weights)
        impurity_left = self._impurity(y[left_indices], sample_weights[left_indices])
        impurity_right = self._impurity(y[right_indices], sample_weights[right_indices])

        weighted_impurity = (len(left_indices) / len(y)) * impurity_left + (len(right_indices) / len(y)) * impurity_right
        return impurity_before - weighted_impurity

    def _impurity(self, y, sample_weights=None):
        if self.criterion == "gini":
            return self._gini_impurity(y, sample_weights)
        elif self.criterion == "entropy":
            return self._entropy_impurity(y, sample_weights)
        else:
            raise ValueError(f"Unknown criterion: {self.criterion}")

    def _gini_impurity(self, y, sample_weights=None):
        classes, counts = np.unique(y, return_counts=True)
        if sample_weights is not None:
            counts = np.bincount(y, weights=sample_weights)
        probabilities = counts / np.sum(counts)
        return 1 - np.sum(probabilities ** 2)


    def _entropy_impurity(self, y, sample_weights=None):
        classes, counts = np.unique(y, return_counts=True)
        if sample_weights is not None:
            counts = np.bincount(y, weights=sample_weights)
        probabilities = counts / np.sum(counts)
        return -np.sum(probabilities * np.log2(probabilities + 1e-15))

    def predict(self, X):
        return np.array([self._traverse_tree(x, self.tree) for x in X])

    def _traverse_tree(self, x, tree):
        if isinstance(tree, LeafNode):
            return tree.predicted_class()
        else:
            if x[tree.feature_index] <= tree.threshold:
                return self._traverse_tree(x, tree.left)
            else:
                return self._traverse_tree(x, tree.right)

In [5]:
class AdaBoostForest:
    def __init__(self, n_estimators=5, max_depth=3, min_samples_split=2, criterion="gini"):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.criterion = criterion
        self.trees = []
        self.tree_weights = []

    def fit(self, X, y):
        n_samples = X.shape[0]
        sample_weights = np.full(n_samples, (1 / n_samples))
        self.trees = []
        self.tree_weights = []

        for _ in range(self.n_estimators):
            tree = DecisionTree(max_depth=self.max_depth, min_samples_split=self.min_samples_split, criterion=self.criterion)
            tree.fit(X, y, sample_weights=sample_weights)

            predictions = tree.predict(X)
            error = np.sum(sample_weights * (predictions != y)) / np.sum(sample_weights)

            tree_weight = 0.5 * np.log((1 - error) / (error + 1e-10))  # Avoid division by zero
            self.trees.append(tree)
            self.tree_weights.append(tree_weight)

            # Update sample weights
            sample_weights *= np.exp(-tree_weight * y * (2 * (predictions == y) - 1))
            sample_weights /= np.sum(sample_weights)

    def predict(self, X):
        weighted_predictions = np.zeros(X.shape[0])
        for tree, tree_weight in zip(self.trees, self.tree_weights):
            predictions = tree.predict(X)
            weighted_predictions += tree_weight * predictions

        return np.sign(weighted_predictions).astype(int)

**Load Dataset**

In [6]:
# Load Iris dataset
data = load_iris()
X, y = data.data, data.target
y[y == 0] = -1  # Adjust labels to {-1, 1} for AdaBoost binary classification

In [7]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
# Replace -1 with a valid class label (e.g., 0)
y_train_replaced = np.where(y_train == -1, 0, y_train)

In [9]:
# Train AdaBoostForest on the updated data
ada_boost_forest = AdaBoostForest(n_estimators=5, max_depth=3)
ada_boost_forest.fit(X_train, y_train_replaced)

In [10]:
# Predictions
predictions = ada_boost_forest.predict(X_test)
print(predictions[:5])

[1 0 1 1 1]


In [11]:
# Evaluate
accuracy = accuracy_score(y_test, predictions)
print(f"AdaBoost Forest Accuracy: {accuracy}")

AdaBoost Forest Accuracy: 0.3


In [12]:
# Confusion Matrix
print("Confusion Matrix:\n", confusion_matrix(y_test, predictions))
print("\nClassification Report:\n", classification_report(y_test, predictions))

Confusion Matrix:
 [[ 0 10  0  0]
 [ 0  0  0  0]
 [ 0  0  9  0]
 [ 0  0 11  0]]

Classification Report:
               precision    recall  f1-score   support

          -1       0.00      0.00      0.00        10
           0       0.00      0.00      0.00         0
           1       0.45      1.00      0.62         9
           2       0.00      0.00      0.00        11

    accuracy                           0.30        30
   macro avg       0.11      0.25      0.16        30
weighted avg       0.13      0.30      0.19        30



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
