#**AdaBoost Random Forest - Regression (Scratch)**

**Import Libraries**

In [1]:
import numpy as np
import pandas as pd
from pandas import DataFrame
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from sklearn.datasets import load_diabetes, load_iris, fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.preprocessing import MaxAbsScaler, PowerTransformer
from sklearn.linear_model import LinearRegression, LogisticRegression, RidgeClassifier, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier, AdaBoostRegressor
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.ensemble import BaggingClassifier, BaggingRegressor
from sklearn.ensemble import VotingClassifier, VotingRegressor
from sklearn.ensemble import StackingClassifier, StackingRegressor
from sklearn.ensemble import HistGradientBoostingClassifier, HistGradientBoostingRegressor
from sklearn.ensemble import ExtraTreesClassifier, ExtraTreesRegressor
from sklearn.ensemble import IsolationForest
from sklearn.svm import SVC, SVR
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.metrics import precision_score, recall_score, f1_score, explained_variance_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc

**Code**

In [2]:
class Node:
    def __init__(self, feature_index, threshold, left, right):
        self.feature_index = feature_index
        self.threshold = threshold
        self.left = left
        self.right = right

In [3]:
class LeafNodeRegression:
    def __init__(self, y):
        self.value = np.mean(y)

    def predicted_value(self):
        return self.value

In [4]:
class DecisionTreeRegression:
    def __init__(self, max_depth=None, min_samples_split=2, criterion="mse"):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.criterion = criterion
        self.tree = None

    def fit(self, X, y):
        self.tree = self._build_tree(X, y)

    def _build_tree(self, X, y, depth=0):
        num_samples, num_features = X.shape

        # Check stopping criteria
        if depth >= self.max_depth or num_samples < self.min_samples_split:
            return LeafNodeRegression(y)

        # Find the best split
        best_feature_index, best_threshold, indices_left, indices_right = self._best_split(X, y, num_features)
        if best_feature_index is None:
            return LeafNodeRegression(y)

        # Recursively build left and right subtrees
        left_subtree = self._build_tree(X[indices_left], y[indices_left], depth + 1)
        right_subtree = self._build_tree(X[indices_right], y[indices_right], depth + 1)

        return Node(best_feature_index, best_threshold, left_subtree, right_subtree)

    def _best_split(self, X, y, num_features):
        best_gain = -1
        best_feature_index = None
        best_threshold = None
        best_indices_left = None
        best_indices_right = None

        for feature_index in range(num_features):
            thresholds = np.unique(X[:, feature_index])
            for threshold in thresholds:
                indices_left = np.where(X[:, feature_index] <= threshold)[0]
                indices_right = np.where(X[:, feature_index] > threshold)[0]

                if len(indices_left) > 0 and len(indices_right) > 0:
                    gain = self._information_gain(y, indices_left, indices_right)
                    if gain > best_gain:
                        best_gain = gain
                        best_feature_index = feature_index
                        best_threshold = threshold
                        best_indices_left = indices_left
                        best_indices_right = indices_right

        return best_feature_index, best_threshold, best_indices_left, best_indices_right

    def _information_gain(self, y, left_indices, right_indices):
        impurity_before = self._impurity(y)
        impurity_left = self._impurity(y[left_indices])
        impurity_right = self._impurity(y[right_indices])

        weighted_impurity = (len(left_indices) / len(y)) * impurity_left + (len(right_indices) / len(y)) * impurity_right
        return impurity_before - weighted_impurity

    def _impurity(self, y):
        if self.criterion == "mse":
            return np.mean((y - np.mean(y)) ** 2)
        elif self.criterion == "mae":
            return np.mean(np.abs(y - np.mean(y)))
        else:
            raise ValueError(f"Unknown criterion: {self.criterion}")

    def predict(self, X):
        return np.array([self._traverse_tree(x, self.tree) for x in X])

    def _traverse_tree(self, x, tree):
        if isinstance(tree, LeafNodeRegression):
            return tree.predicted_value()
        else:
            if x[tree.feature_index] <= tree.threshold:
                return self._traverse_tree(x, tree.left)
            else:
                return self._traverse_tree(x, tree.right)

In [5]:
class RandomForestRegression:
    def __init__(self, n_trees=5, max_depth=10, min_samples_split=2, criterion="mse"):
        self.n_trees = n_trees
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.criterion = criterion
        self.trees = []

    def fit(self, X, y):
        for _ in range(self.n_trees):
            tree = DecisionTreeRegression(max_depth=self.max_depth, min_samples_split=self.min_samples_split, criterion=self.criterion)
            # Bootstrap sampling
            sample_indices = np.random.choice(len(X), len(X), replace=True)
            X_sample = X[sample_indices]
            y_sample = y[sample_indices]
            tree.fit(X_sample, y_sample)
            self.trees.append(tree)

    def predict(self, X):
        # Average the predictions from all trees
        tree_predictions = np.array([tree.predict(X) for tree in self.trees])
        return np.mean(tree_predictions, axis=0)

In [6]:
class AdaBoost:
    def __init__(self, n_estimators=5, learning_rate=1):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.models = []
        self.alphas = []

    def fit(self, X, y):
        n_samples = len(y)
        # Initialize weights
        weights = np.ones(n_samples) / n_samples

        for _ in range(self.n_estimators):
            model = DecisionTreeRegression(max_depth=1, min_samples_split=1)  # Weak learner (stump)
            model.fit(X, y)
            y_pred = model.predict(X)

            # Compute error
            error = np.sum(weights * (y_pred != y)) / np.sum(weights)
            alpha = self.learning_rate * 0.5 * np.log((1 - error) / (error + 1e-10))  # Adding small value to avoid division by zero

            # Update weights
            weights *= np.exp(-alpha * y * y_pred)
            weights /= np.sum(weights)  # Normalize weights

            self.models.append(model)
            self.alphas.append(alpha)

    def predict(self, X):
        # Compute the weighted sum of the predictions from all models
        final_prediction = sum(alpha * model.predict(X) for alpha, model in zip(self.alphas, self.models))
        return np.sign(final_prediction)

    def evaluate(self, X, y):
        predictions = self.predict(X)
        mse = np.mean((y - predictions) ** 2)
        return mse

**Load Dataset**

In [7]:
# Example: Load the diabetes dataset for regression
data = load_diabetes()
X, y = data.data, data.target

# Convert targets to 1 for regression (this is just for the sake of AdaBoost)
y_binary = np.where(y > np.mean(y), 1, -1)  # Converting to binary for AdaBoost compatibility

In [8]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_binary, test_size=0.2, random_state=42)

In [9]:
# Train multiple Random Forest regressors
num_forests = 3
forests = []

for i in range(num_forests):
    print(f"Training Forest {i + 1} with {5} trees...")
    forest_regressor = RandomForestRegression(n_trees=5, max_depth=10, min_samples_split=2, criterion="mse")
    forest_regressor.fit(X_train, y_train)
    forests.append(forest_regressor)

Training Forest 1 with 5 trees...
Training Forest 2 with 5 trees...
Training Forest 3 with 5 trees...


In [10]:
# Train AdaBoost using the Random Forests as base estimators
for i in range(num_forests):
    print(f"\nTraining AdaBoost with Forest {i + 1}...")
    adaboost = AdaBoost(n_estimators=5, learning_rate=1)
    adaboost.fit(X_train, y_train)
    y_pred = adaboost.predict(X_test)
    mse = adaboost.evaluate(X_test, y_test)
    print(f"AdaBoost with Forest {i + 1} MSE: {mse:.2f}")


Training AdaBoost with Forest 1...


  alpha = self.learning_rate * 0.5 * np.log((1 - error) / (error + 1e-10))  # Adding small value to avoid division by zero
  weights /= np.sum(weights)  # Normalize weights


AdaBoost with Forest 1 MSE: nan

Training AdaBoost with Forest 2...
AdaBoost with Forest 2 MSE: nan

Training AdaBoost with Forest 3...
AdaBoost with Forest 3 MSE: nan


In [11]:
# Summary of the number of forests and trees
print(f"\nTotal number of forests used: {num_forests}")
for i in range(num_forests):
    print(f"Forest {i + 1} contains {5} trees.")


Total number of forests used: 3
Forest 1 contains 5 trees.
Forest 2 contains 5 trees.
Forest 3 contains 5 trees.
