#**AdaBoost Decision Tree - Classification (Scratch)**

**Import Libraries**

In [1]:
import numpy as np
import pandas as pd
from pandas import DataFrame
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from sklearn.datasets import load_diabetes, load_iris, fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.preprocessing import MaxAbsScaler, PowerTransformer
from sklearn.linear_model import LinearRegression, LogisticRegression, RidgeClassifier, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier, AdaBoostRegressor
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.ensemble import BaggingClassifier, BaggingRegressor
from sklearn.ensemble import VotingClassifier, VotingRegressor
from sklearn.ensemble import StackingClassifier, StackingRegressor
from sklearn.ensemble import HistGradientBoostingClassifier, HistGradientBoostingRegressor
from sklearn.ensemble import ExtraTreesClassifier, ExtraTreesRegressor
from sklearn.ensemble import IsolationForest
from sklearn.svm import SVC, SVR
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.metrics import precision_score, recall_score, f1_score, explained_variance_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc

**Code**

In [2]:
class Node:
    def __init__(self, feature=None, threshold=None, left=None, right=None, value=None):
        self.feature = feature        # Feature index to split on
        self.threshold = threshold    # Threshold value for the split
        self.left = left              # Left subtree
        self.right = right            # Right subtree
        self.value = value            # Class label for leaf nodes

In [3]:
class DecisionTree:
    def __init__(self, criterion='gini', max_depth=None, min_samples_split=2):
        self.criterion = criterion
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.root = None

    def fit(self, X, y):
        self.root = self._build_tree(X, y)

    def _build_tree(self, X, y, depth=0):
        n_samples, n_features = X.shape
        unique_classes, counts = np.unique(y, return_counts=True)
        most_common_class = unique_classes[np.argmax(counts)]

        # Stopping criteria
        if (self.max_depth is not None and depth >= self.max_depth) or len(unique_classes) == 1 or n_samples < self.min_samples_split:
            return Node(value=most_common_class)

        best_feature, best_threshold = self._best_split(X, y)
        if best_feature is None:
            return Node(value=most_common_class)

        left_indices = X[:, best_feature] < best_threshold
        right_indices = X[:, best_feature] >= best_threshold

        left_tree = self._build_tree(X[left_indices], y[left_indices], depth + 1)
        right_tree = self._build_tree(X[right_indices], y[right_indices], depth + 1)

        return Node(feature=best_feature, threshold=best_threshold, left=left_tree, right=right_tree)

    def _best_split(self, X, y):
        best_feature = None
        best_threshold = None
        best_gain = -np.inf

        for feature in range(X.shape[1]):
            thresholds, classes = zip(*sorted(zip(X[:, feature], y)))
            num_left = np.zeros(len(np.unique(y)), dtype=int)
            num_right = np.bincount(classes)

            for i in range(1, len(y)):
                c = classes[i - 1]
                num_left[c] += 1
                num_right[c] -= 1

                if thresholds[i] == thresholds[i - 1]:
                    continue

                gain = self._information_gain(y, num_left, num_right)

                if gain > best_gain:
                    best_gain = gain
                    best_feature = feature
                    best_threshold = (thresholds[i] + thresholds[i - 1]) / 2

        return best_feature, best_threshold

    def _information_gain(self, y, num_left, num_right):
        p_left = num_left.sum() / y.size
        p_right = num_right.sum() / y.size

        if p_left == 0 or p_right == 0:
            return 0

        return self._gini_impurity(y) - (p_left * self._gini_impurity(num_left) + p_right * self._gini_impurity(num_right))

    def _gini_impurity(self, y):
        classes, counts = np.unique(y, return_counts=True)
        return 1 - sum((counts / counts.sum()) ** 2)

    def predict(self, X):
        return np.array([self._predict_sample(sample, self.root) for sample in X])

    def _predict_sample(self, sample, node):
        if node.value is not None:  # Leaf node
            return node.value

        if sample[node.feature] < node.threshold:
            return self._predict_sample(sample, node.left)
        else:
            return self._predict_sample(sample, node.right)

In [4]:
class AdaBoost:
    def __init__(self, n_estimators=50):
        self.n_estimators = n_estimators
        self.models = []  # List to hold weak learners
        self.alphas = []  # List to hold the weights of the weak learners

    def fit(self, X, y):
        n_samples, n_features = X.shape
        # Initialize weights uniformly
        w = np.ones(n_samples) / n_samples

        for _ in range(self.n_estimators):
            # Train a weak learner
            model = DecisionTree(criterion='gini', max_depth=1, min_samples_split=2)
            model.fit(X, y)

            # Make predictions
            y_pred = model.predict(X)

            # Calculate error
            error = np.sum(w * (y_pred != y)) / np.sum(w)

            # Calculate alpha (weight of the weak learner)
            alpha = 0.5 * np.log((1 - error) / (error + 1e-10))

            # Update sample weights
            w *= np.exp(-alpha * (y_pred == y) * 2 - 1)  # Adjusted for binary labels
            w /= np.sum(w)  # Normalize weights

            # Store the model and its alpha
            self.models.append(model)
            self.alphas.append(alpha)

    def predict(self, X):
        # Initialize predictions to 0
        y_pred = np.zeros(X.shape[0])

        # Aggregate predictions from each weak learner
        for alpha, model in zip(self.alphas, self.models):
            y_pred += alpha * model.predict(X)

        # Return the sign of predictions (0 or 1)
        return (y_pred > 0).astype(int)

    def get_n_estimators(self):
        return len(self.models)

**Load Dataset**

In [5]:
# Load dataset
iris = load_iris()
X, y = iris.data, iris.target

In [6]:
# Convert to binary classification for AdaBoost
y = np.where(y == 2, 1, 0)  # Convert to 0 and 1 for AdaBoost

In [7]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((120, 4), (30, 4), (120,), (30,))

In [8]:
# Train AdaBoost
adaboost = AdaBoost(n_estimators=50)
adaboost.fit(X_train, y_train)

In [9]:
# Get number of decision trees used
num_trees = adaboost.get_n_estimators()
print(f"Number of Decision Trees Used: {num_trees}")

Number of Decision Trees Used: 50


In [10]:
# Predictions
predictions = adaboost.predict(X_test)
print(predictions[:5])

[0 0 0 0 0]


In [11]:
# Accuracy
accuracy = accuracy_score(y_test, predictions)
print(f"AdaBoost Accuracy: {accuracy:.2f}")

AdaBoost Accuracy: 0.63


In [12]:
# Confusion Matrix
conf_matrix = confusion_matrix(y_test, predictions)
print("Confusion Matrix:\n", conf_matrix)

Confusion Matrix:
 [[19  0]
 [11  0]]
