#**AdaBoost Decision Tree - Regression (Scratch)**

**Import Libraries**

In [2]:
import numpy as np
import pandas as pd
from pandas import DataFrame
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from sklearn.datasets import load_diabetes, load_iris, fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.preprocessing import MaxAbsScaler, PowerTransformer
from sklearn.linear_model import LinearRegression, LogisticRegression, RidgeClassifier, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier, AdaBoostRegressor
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.ensemble import BaggingClassifier, BaggingRegressor
from sklearn.ensemble import VotingClassifier, VotingRegressor
from sklearn.ensemble import StackingClassifier, StackingRegressor
from sklearn.ensemble import HistGradientBoostingClassifier, HistGradientBoostingRegressor
from sklearn.ensemble import ExtraTreesClassifier, ExtraTreesRegressor
from sklearn.ensemble import IsolationForest
from sklearn.svm import SVC, SVR
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.metrics import precision_score, recall_score, f1_score, explained_variance_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc

**Code**

In [3]:
class Node:
    def __init__(self, feature=None, threshold=None, left=None, right=None, leaf=False, value=None):
        self.feature = feature
        self.threshold = threshold
        self.left = left
        self.right = right
        self.leaf = leaf
        self.value = value

In [4]:
class DecisionTreeRegressor:
    def __init__(self, criterion='mse', max_depth=None, min_samples_split=2):
        self.criterion = criterion
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split

    def fit(self, X, y):
        self.root = self._build_tree(X, y)

    def _build_tree(self, X, y, depth=0):
        n_samples, n_features = X.shape

        if (self.max_depth is not None and depth >= self.max_depth) or len(np.unique(y)) == 1 or n_samples < self.min_samples_split:
            return self._create_leaf_node(y)

        best_feature, best_threshold = self._find_best_split(X, y)
        if best_feature is None:
            return self._create_leaf_node(y)

        left_indices = X[:, best_feature] < best_threshold
        right_indices = X[:, best_feature] >= best_threshold

        left_subtree = self._build_tree(X[left_indices], y[left_indices], depth + 1)
        right_subtree = self._build_tree(X[right_indices], y[right_indices], depth + 1)

        return Node(feature=best_feature, threshold=best_threshold, left=left_subtree, right=right_subtree)

    def _find_best_split(self, X, y):
        n_samples, n_features = X.shape
        best_feature, best_threshold, best_gain = None, None, -np.inf

        for feature in range(n_features):
            thresholds = np.unique(X[:, feature])
            for threshold in thresholds:
                left_indices = X[:, feature] < threshold
                right_indices = X[:, feature] >= threshold
                if len(y[left_indices]) == 0 or len(y[right_indices]) == 0:
                    continue

                gain = self._calculate_gain(y, y[left_indices], y[right_indices])
                if gain > best_gain:
                    best_gain = gain
                    best_feature = feature
                    best_threshold = threshold

        return best_feature, best_threshold

    def _calculate_gain(self, y, left_y, right_y):
        parent_loss = np.var(y)
        n = len(y)
        n_left, n_right = len(left_y), len(right_y)

        if n_left == 0 or n_right == 0:
            return 0

        child_loss = (n_left / n * np.var(left_y)) + (n_right / n * np.var(right_y))
        return parent_loss - child_loss

    def _create_leaf_node(self, y):
        leaf_value = np.mean(y)  # Average value for regression
        return Node(leaf=True, value=leaf_value)

    def predict(self, X):
        return np.array([self._predict(sample, self.root) for sample in X])

    def _predict(self, sample, tree):
        if tree.leaf:
            return tree.value
        if sample[tree.feature] < tree.threshold:
            return self._predict(sample, tree.left)
        else:
            return self._predict(sample, tree.right)

In [5]:
class AdaBoost:
    def __init__(self, n_estimators=50):
        self.n_estimators = n_estimators

    def fit(self, X, y, model):
        n_samples = len(y)
        self.models = []
        self.alphas = []
        w = np.ones(n_samples) / n_samples  # Initialize weights

        for _ in range(self.n_estimators):
            model.fit(X, y)  # Fit the model
            y_pred = model.predict(X)

            # Calculate the error
            error = np.abs(y - y_pred)
            error_rate = np.dot(w, error) / np.sum(w)  # Weighted error rate

            if error_rate <= 0:
                print("Perfect prediction encountered. Stopping training.")
                break

            alpha = 0.5 * np.log((1 - error_rate) / (error_rate + 1e-10))  # Avoid division by zero
            self.models.append(model)
            self.alphas.append(alpha)

            # Update weights
            w *= np.exp(-alpha * (y - y_pred))
            w /= np.sum(w)  # Normalize weights

            if np.all(error < 1e-10):  # Check if all errors are very small
                print("All errors are very small. Stopping training.")
                break

        print(f"Number of trees used: {len(self.models)}")

    def predict(self, X):
        y_pred = np.zeros(X.shape[0])
        for model, alpha in zip(self.models, self.alphas):
            y_pred += alpha * model.predict(X)
        return y_pred

**Load Dataset**

In [6]:
# Simple dataset
X = np.array([[1], [2], [3], [4], [5], [6], [7], [8], [9], [10]])  # Features
y = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])  # Target values

In [7]:
# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((8, 1), (2, 1), (8,), (2,))

In [8]:
# Create a new decision tree regressor
model = DecisionTreeRegressor(criterion='mse', max_depth=5, min_samples_split=3)

In [9]:
# Train the AdaBoost regressor
adaboost_model = AdaBoost(n_estimators=50)
adaboost_model.fit(X_train, y_train, model)

Number of trees used: 50


In [10]:
# Make predictions
y_pred = adaboost_model.predict(X_test)
print(y_pred)

[56.89362619  7.58581683]


In [11]:
# Evaluate the regression performance
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"Mean Absolute Error: {mae}")
print(f"R^2 Score: {r2}")

Mean Squared Error: 1162.5003896661963
Mean Absolute Error: 26.739721508241985
R^2 Score: -93.89799099315888
