In [1]:
import numpy as np
import matplotlib.pyplot as plt
from copy import deepcopy


In [2]:
# Toy regression data: 10 samples, 2 integer features.
# The second feature is twice the first for every row except the second one,
# which is (2, 3); the target y roughly follows the second feature.
_feature_0 = np.arange(1, 11)
_feature_1 = [2, 3, 6, 8, 10, 12, 14, 16, 18, 20]
X = np.column_stack((_feature_0, _feature_1))

y = np.asarray(
    [2.1, 3.0, 5.8, 7.9, 10.1, 12.1, 13.8, 16.3, 18.2, 20.1],
    dtype=float,
)


In [5]:
def variance(y):
    """Return the population variance of ``y``.

    Computed as the mean of the squared deviations from the mean (divides by
    ``n``, not ``n - 1``).
    """
    center = np.mean(y)
    squared_deviation = (y - center) ** 2
    return np.mean(squared_deviation)

def variance_reduction(y, y_left, y_right):
    """Return how much a split lowers the variance of ``y``.

    The result is ``variance(y)`` minus the size-weighted average of the
    variances of the two child groups, where each child's weight is its
    length divided by ``len(y)``.

    Parameters
    ----------
    y : targets of the parent node.
    y_left, y_right : targets routed to the left and right child.
    """
    n_parent = len(y)
    w_left = len(y_left) / n_parent
    w_right = len(y_right) / n_parent

    parent_variance = variance(y)
    children_variance = w_left * variance(y_left) + w_right * variance(y_right)
    return parent_variance - children_variance

def best_split_for_feature(X_col, y, feature_idx=None, verbose=True):
    """Find the threshold on one feature that maximises variance reduction.

    Every distinct value of ``X_col`` is tried as a threshold ``t``; samples
    with ``X_col <= t`` go left and the rest go right. Thresholds that leave
    either side empty are skipped.

    Parameters
    ----------
    X_col : values of a single feature, one per sample.
    y : targets aligned with ``X_col``.
    feature_idx : label used only in the printed log.
    verbose : when true, print every candidate threshold and its gain.

    Returns
    -------
    (threshold, gain) of the best split, or ``(None, -np.inf)`` when no
    threshold separates the samples into two non-empty groups.
    """
    candidates = np.unique(X_col)

    if verbose:
        print(f"\nðŸ“˜ Evaluating feature {feature_idx} with values: {X_col}")
        print("Thresholds to try:", candidates)

    winner, top_gain = None, -np.inf
    for cut in candidates:
        goes_left = X_col <= cut
        goes_right = X_col > cut

        # A one-sided partition is not a split; ignore it.
        if not np.any(goes_left) or not np.any(goes_right):
            continue

        score = variance_reduction(y, y[goes_left], y[goes_right])

        if verbose:
            print(f"  Threshold: {cut}, Variance Gain: {score:.4f}")

        # Strict comparison: on ties the earliest (smallest) threshold wins.
        if score > top_gain:
            winner, top_gain = cut, score

    if verbose:
        print(f"âœ… Best threshold for feature {feature_idx}: {winner} (gain={top_gain:.4f})\n")

    return winner, top_gain


# Demo: search feature 0 (the first column of X) for its best threshold,
# printing the gain of every candidate. The returned (threshold, gain) tuple
# is the last expression, so it is shown as the cell's output.
X_col = X[:, 0]
best_split_for_feature(X_col, y, feature_idx=0, verbose=True)



ðŸ“˜ Evaluating feature 0 with values: [ 1  2  3  4  5  6  7  8  9 10]
Thresholds to try: [ 1  2  3  4  5  6  7  8  9 10]
  Threshold: 1, Variance Gain: 8.6828
  Threshold: 2, Variance Gain: 17.5980
  Threshold: 3, Variance Gain: 22.8803
  Threshold: 4, Variance Gain: 25.9584
  Threshold: 5, Variance Gain: 26.6256
  Threshold: 6, Variance Gain: 25.2971
  Threshold: 7, Variance Gain: 22.5890
  Threshold: 8, Variance Gain: 16.8510
  Threshold: 9, Variance Gain: 9.3228
âœ… Best threshold for feature 0: 5 (gain=26.6256)



(np.int64(5), np.float64(26.625600000000002))

In [6]:
class DecisionStump2D:
    """Depth-one regression tree restricted to a given set of feature columns.

    ``fit`` searches each candidate feature with ``best_split_for_feature``
    and keeps the (feature, threshold) pair with the largest variance gain.
    ``predict`` returns the mean training target of the side a sample falls on.

    Parameters
    ----------
    feature_indices : iterable of column indices of ``X`` the stump may split on.
    verbose : when true (the default), print the split search and the final
        split while fitting.

    Attributes
    ----------
    best_feature, threshold : chosen split, or ``None`` when no valid split
        was found (the stump then predicts a constant).
    left_value, right_value : predictions for ``<= threshold`` / ``> threshold``.
    """

    def __init__(self, feature_indices, verbose=True):
        self.feature_indices = feature_indices
        self.verbose = verbose
        self.best_feature = None
        self.threshold = None
        self.left_value = None
        self.right_value = None

    def fit(self, X, y):
        """Choose the best split among the candidate features and store it.

        If no candidate feature can be split into two non-empty groups (no
        candidates at all, or every candidate column is constant in ``X``,
        which can happen on a bootstrap sample), the stump falls back to
        predicting ``mean(y)`` for every input instead of crashing.
        """
        # Forget any split learned by a previous call to fit().
        self.best_feature = None
        self.threshold = None
        best_gain = -np.inf

        for f in self.feature_indices:
            threshold, gain = best_split_for_feature(
                X[:, f], y, feature_idx=f, verbose=self.verbose
            )
            # An unsplittable feature reports gain == -inf and never wins here.
            if gain > best_gain:
                best_gain = gain
                self.best_feature = f
                self.threshold = threshold

        if self.best_feature is None:
            # No usable split: degrade to a constant predictor.
            self.left_value = self.right_value = np.mean(y)
            if self.verbose:
                print(f"No valid split found; predicting constant {self.left_value:.2f}\n")
            return

        # Final split
        left_idx = X[:, self.best_feature] <= self.threshold
        right_idx = X[:, self.best_feature] > self.threshold
        self.left_value = np.mean(y[left_idx])
        self.right_value = np.mean(y[right_idx])

        if self.verbose:
            print(f"ðŸŽ¯ Final split: Feature {self.best_feature}, Threshold {self.threshold}")
            print(f"   Left prediction: {self.left_value:.2f}, Right prediction: {self.right_value:.2f}\n")

    def predict(self, X):
        """Return one prediction per row of ``X``.

        Raises
        ------
        RuntimeError : if called before ``fit``.
        """
        if self.left_value is None:
            raise RuntimeError("DecisionStump2D.predict() called before fit()")

        if self.best_feature is None:
            # Constant fallback chosen by fit(): same value for every row.
            return np.full(X.shape[0], self.left_value)

        condition = X[:, self.best_feature] <= self.threshold
        return np.where(condition, self.left_value, self.right_value)


In [7]:
def bootstrap_sample(X, y, rng=None):
    """Draw a bootstrap resample of ``(X, y)``.

    ``n = X.shape[0]`` row indices are drawn uniformly with replacement and
    applied to both ``X`` and ``y``, so rows stay aligned with their targets.

    Parameters
    ----------
    X : array indexed by sample along its first axis.
    y : array of targets aligned with the rows of ``X``.
    rng : optional object with a NumPy-style ``choice`` method (for example
        ``np.random.default_rng(seed)``). When omitted, the global
        ``np.random`` state is used, exactly as before.

    Returns
    -------
    (X_sample, y_sample) with the same number of rows as ``X``.
    """
    sampler = np.random if rng is None else rng
    n = X.shape[0]
    indices = sampler.choice(n, size=n, replace=True)
    return X[indices], y[indices]

def random_feature_indices(n_features, max_features, rng=None):
    """Pick ``max_features`` distinct feature indices from ``range(n_features)``.

    Parameters
    ----------
    n_features : number of available features.
    max_features : how many distinct indices to draw (sampling is without
        replacement).
    rng : optional object with a NumPy-style ``choice`` method (for example
        ``np.random.default_rng(seed)``). When omitted, the global
        ``np.random`` state is used, exactly as before.

    Returns
    -------
    Array of ``max_features`` distinct indices.
    """
    sampler = np.random if rng is None else rng
    return sampler.choice(n_features, size=max_features, replace=False)


In [8]:
class ManualRandomForest2D:
    """Hand-rolled random forest regressor built from ``DecisionStump2D`` trees.

    Each tree is trained on a bootstrap resample of the data and may split
    only on a random subset of ``max_features`` feature columns. Predictions
    are the plain average of the individual trees' predictions.

    Parameters
    ----------
    n_trees : number of stumps to train.
    max_features : number of feature columns offered to each stump.
    """

    def __init__(self, n_trees=5, max_features=1):
        self.n_trees = n_trees
        self.max_features = max_features
        self.trees = []

    def fit(self, X, y):
        """Train ``n_trees`` stumps, replacing any previously fitted trees."""
        n_features = X.shape[1]

        # Start from an empty ensemble so that calling fit() again retrains
        # the forest instead of appending to the trees of the previous fit.
        self.trees = []

        for i in range(self.n_trees):
            print(f"\nðŸŒ² Training Tree {i+1}/{self.n_trees}")
            X_sample, y_sample = bootstrap_sample(X, y)
            features = random_feature_indices(n_features, self.max_features)
            stump = DecisionStump2D(features)
            stump.fit(X_sample, y_sample)
            # The stump is created fresh on every iteration, so it can be
            # stored directly; no copy is needed.
            self.trees.append(stump)

    def predict(self, X):
        """Return the mean of all trees' predictions, one value per row of ``X``.

        Raises
        ------
        RuntimeError : if the forest holds no trees (``fit`` not called, or
            ``n_trees`` is 0).
        """
        if not self.trees:
            raise RuntimeError("ManualRandomForest2D.predict() called with no fitted trees")

        predictions = np.array([tree.predict(X) for tree in self.trees])
        return np.mean(predictions, axis=0)


In [15]:
# Build forest: 5 stumps, each allowed to split on 1 randomly chosen feature.
# NOTE(review): no random seed is set in the visible cells, so the bootstrap
# samples, the chosen features and the printed training log differ between
# runs -- consider seeding NumPy before this cell.
forest = ManualRandomForest2D(n_trees=5, max_features=1)
forest.fit(X, y)

# Predict on the training inputs (average of the stumps' predictions).
y_pred = forest.predict(X)





ðŸŒ² Training Tree 1/5

ðŸ“˜ Evaluating feature 0 with values: [10  8  2  4  7  3  6  9  9  8]
Thresholds to try: [ 2  3  4  6  7  8  9 10]
  Threshold: 2, Variance Gain: 11.4921
  Threshold: 3, Variance Gain: 19.2282
  Threshold: 4, Variance Gain: 24.7760
  Threshold: 6, Variance Gain: 23.7606
  Threshold: 7, Variance Gain: 21.6225
  Threshold: 8, Variance Gain: 13.7457
  Threshold: 9, Variance Gain: 5.3361
âœ… Best threshold for feature 0: 4 (gain=24.7760)

ðŸŽ¯ Final split: Feature 0, Threshold 4
   Left prediction: 5.57, Right prediction: 16.43


ðŸŒ² Training Tree 2/5

ðŸ“˜ Evaluating feature 1 with values: [12 12 12  8  3  6 18 20  6 18]
Thresholds to try: [ 3  6  8 12 18 20]
  Threshold: 3, Variance Gain: 8.0845
  Threshold: 6, Variance Gain: 19.0286
  Threshold: 8, Variance Gain: 23.2460
  Threshold: 12, Variance Gain: 22.8594
  Threshold: 18, Variance Gain: 8.1605
âœ… Best threshold for feature 1: 8 (gain=23.2460)

ðŸŽ¯ Final split: Feature 1, Threshold 8
   Left prediction: 