## RandomForestRegressor

In [1]:
import numpy as np

In [9]:
class RandomForest:
    """Random forest regressor: the mean prediction of bootstrap-trained trees.

    Parameters
    ----------
    n_estimator : int
        Number of trees trained by ``fit``.
    max_depth : int or None
        Forwarded unchanged to every ``DecisionTree``.
    min_samples_split : int
        Forwarded unchanged to every ``DecisionTree``.
    """

    def __init__(self, n_estimator=100, max_depth=None, min_samples_split=2):
        self.n_estimator = n_estimator
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.trees = []

    def fit(self, X, y):
        """Train ``n_estimator`` trees, each on its own bootstrap sample.

        ``X`` is the 2-D feature matrix and ``y`` the matching targets; both
        may be any array-like (they are converted with ``np.asarray``).
        Calling ``fit`` again replaces the previously trained trees.
        """
        # Fancy indexing below needs real arrays; plain lists would raise.
        X = np.asarray(X)
        y = np.asarray(y)

        # Drop trees from any earlier fit so the ensemble never mixes models
        # trained on different data (and never grows past n_estimator).
        self.trees = []

        n_samples = len(X)
        for _ in range(self.n_estimator):
            tree = DecisionTree(max_depth=self.max_depth, min_samples_split=self.min_samples_split)
            # Bootstrap: draw n_samples row indices with replacement.
            indices = np.random.choice(n_samples, n_samples, replace=True)
            tree.fit(X[indices], y[indices])
            self.trees.append(tree)

    def predict(self, X):
        """Return, for every row of ``X``, the mean of all trees' predictions."""
        X = np.asarray(X)
        # One column per tree, one row per sample.
        predictions = np.zeros((len(X), len(self.trees)))
        for i, tree in enumerate(self.trees):
            predictions[:, i] = tree.predict(X)
        return np.mean(predictions, axis=1)

class Node:
    """One node of a binary decision tree.

    An internal node carries ``feature``/``threshold`` plus its ``left`` and
    ``right`` children; a leaf carries only ``value``. Every field defaults
    to ``None``.
    """

    def __init__(
        self,
        feature=None,
        threshold=None,
        left=None,
        right=None,
        value=None,
    ):
        # Split rule (internal nodes).
        self.feature, self.threshold = feature, threshold
        # Child subtrees (internal nodes).
        self.left, self.right = left, right
        # Stored prediction (leaves).
        self.value = value
        
class DecisionTree:
    """Regression tree grown by greedy, variance-minimising binary splits.

    Parameters
    ----------
    max_depth : int or None
        Maximum depth of the tree (the root is at depth 0). ``None`` means
        the depth is unbounded.
    min_samples_split : int
        Minimum number of samples a node must hold to be considered for a
        split; smaller nodes become leaves.
    """

    def __init__(self, max_depth=None, min_samples_split=2):
        self.root = None
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split

    def fit(self, X, y):
        """Build the tree from the 2-D feature matrix ``X`` and targets ``y``."""
        self.root = self._grow_tree(np.asarray(X), np.asarray(y))

    def _grow_tree(self, X, y, depth=0):
        """Recursively build and return the subtree for the samples ``X``/``y``.

        Returns a leaf ``Node`` holding ``mean(y)`` when a stopping rule fires
        or no split separates the samples; otherwise an internal ``Node``.
        """
        n_samples, n_features = X.shape

        # max_depth=None means "no limit"; comparing an int against None
        # would raise TypeError, so test for it explicitly.
        depth_exhausted = self.max_depth is not None and depth >= self.max_depth
        if depth_exhausted or n_samples < self.min_samples_split:
            return Node(value=np.mean(y))

        best_variance = float('inf')
        best_feature, best_threshold = None, None

        for feature in range(n_features):
            column = X[:, feature]

            # Every distinct observed value is a candidate threshold.
            for threshold in np.unique(column):
                left_indices = column <= threshold
                right_indices = column > threshold

                # A split that leaves one side empty separates nothing.
                if not left_indices.any() or not right_indices.any():
                    continue

                variance = self._calculate_variance(y[left_indices], y[right_indices])

                if variance < best_variance:
                    best_variance = variance
                    best_feature = feature
                    best_threshold = threshold

        # All rows identical on every feature: nothing to split on.
        if best_feature is None or best_threshold is None:
            return Node(value=np.mean(y))

        left_indices = X[:, best_feature] <= best_threshold
        right_indices = X[:, best_feature] > best_threshold

        left = self._grow_tree(X[left_indices], y[left_indices], depth + 1)
        right = self._grow_tree(X[right_indices], y[right_indices], depth + 1)

        return Node(feature=best_feature, threshold=best_threshold, left=left, right=right)

    def _calculate_variance(self, left_value, right_value):
        """Return the size-weighted mean of the two groups' variances."""
        n_left, n_right = len(left_value), len(right_value)
        weighted = n_left * np.var(left_value) + n_right * np.var(right_value)
        return weighted / (n_left + n_right)

    def predict(self, X):
        """Return a list with one predicted value per row of ``X``."""
        return [self._traverse_tree(x, self.root) for x in X]

    def _traverse_tree(self, x, node):
        """Follow the split rules from ``node`` down to a leaf; return its value."""
        # A node is a leaf exactly when it stores a value.
        while node.value is None:
            node = node.left if x[node.feature] <= node.threshold else node.right
        return node.value

In [3]:
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [4]:
# Fetch the diabetes dataset, then pull out its feature matrix and targets
data = load_diabetes()
X = data.data
y = data.target

In [5]:
# Hold out 20% of the samples for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [10]:
# Build the hand-written random forest regressor and train it on the training split
random_forest = RandomForest(
    n_estimator=100,
    max_depth=5,
    min_samples_split=2,
)
random_forest.fit(X_train, y_train)

# Predict the held-out test samples
predictions = random_forest.predict(X_test)

In [11]:
# Score the forest: mean squared error between true and predicted targets
mse = mean_squared_error(y_true=y_test, y_pred=predictions)
print(f"Mean squred error: {mse}")

Mean squred error: 2793.126923469427


## RandomForestClassifier

In [27]:
class RandomForestClassifier:
    """Random forest classifier: majority vote over bootstrap-trained trees.

    Class labels are expected to be non-negative integers, because the vote
    is counted with ``np.bincount`` after casting predictions to ``int``.

    Parameters
    ----------
    n_estimator : int
        Number of trees trained by ``fit``.
    max_depth : int or None
        Forwarded unchanged to every ``DecisionTreeClassifier``.
    min_samples_split : int
        Forwarded unchanged to every ``DecisionTreeClassifier``.
    """

    def __init__(self, n_estimator=100, max_depth=None, min_samples_split=2):
        self.n_estimator = n_estimator
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.trees = []

    def fit(self, X, y):
        """Train ``n_estimator`` trees, each on its own bootstrap sample.

        ``X`` is the 2-D feature matrix and ``y`` the matching labels; both
        may be any array-like (they are converted with ``np.asarray``).
        Calling ``fit`` again replaces the previously trained trees.
        """
        # Fancy indexing below needs real arrays; plain lists would raise.
        X = np.asarray(X)
        y = np.asarray(y)

        # Drop trees from any earlier fit so old models never take part in
        # the vote (and the ensemble never grows past n_estimator).
        self.trees = []

        n_samples = len(X)
        for _ in range(self.n_estimator):
            tree = DecisionTreeClassifier(max_depth=self.max_depth, min_samples_split=self.min_samples_split)
            # Bootstrap: draw n_samples row indices with replacement.
            indices = np.random.choice(n_samples, n_samples, replace=True)
            tree.fit(X[indices], y[indices])
            self.trees.append(tree)

    def predict(self, X):
        """Return, for every row of ``X``, the label most trees voted for.

        Ties go to the smallest label, since ``argmax`` keeps the first
        maximum of the per-label counts.
        """
        X = np.asarray(X)
        # One column per tree, one row per sample.
        predictions = np.zeros((len(X), len(self.trees)))
        for i, tree in enumerate(self.trees):
            predictions[:, i] = tree.predict(X)
        # bincount only accepts integer input.
        predictions = predictions.astype(int)
        return np.array([np.bincount(row).argmax() for row in predictions])

class Node:
    """One node of a binary decision tree.

    An internal node carries ``feature``/``threshold`` plus its ``left`` and
    ``right`` children; a leaf carries only ``value``. Every field defaults
    to ``None``.
    """

    def __init__(
        self,
        feature=None,
        threshold=None,
        left=None,
        right=None,
        value=None,
    ):
        # Split rule (internal nodes).
        self.feature, self.threshold = feature, threshold
        # Child subtrees (internal nodes).
        self.left, self.right = left, right
        # Stored prediction (leaves).
        self.value = value

class DecisionTreeClassifier:
    """Classification tree grown by greedy, Gini-minimising binary splits.

    Parameters
    ----------
    max_depth : int or None
        Maximum depth of the tree (the root is at depth 0). ``None`` means
        the depth is unbounded.
    min_samples_split : int
        Minimum number of samples a node must hold to be considered for a
        split; smaller nodes become leaves.
    """

    def __init__(self, max_depth=None, min_samples_split=2):
        self.root = None
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split

    def fit(self, X, y):
        """Build the tree from the 2-D feature matrix ``X`` and labels ``y``."""
        self.root = self._grow_tree(np.asarray(X), np.asarray(y))

    @staticmethod
    def _majority_class(y):
        """Return the most frequent label in ``y`` (smallest label on ties)."""
        labels, counts = np.unique(y, return_counts=True)
        return labels[np.argmax(counts)]

    @staticmethod
    def _gini(labels):
        """Return the Gini impurity ``1 - sum(p_c ** 2)`` of one label group."""
        _, counts = np.unique(labels, return_counts=True)
        proportions = counts / len(labels)
        return 1.0 - np.sum(proportions ** 2)

    def _grow_tree(self, X, y, depth=0):
        """Recursively build and return the subtree for the samples ``X``/``y``.

        Returns a leaf ``Node`` holding the majority label when a stopping
        rule fires or no split separates the samples; otherwise an internal
        ``Node``.
        """
        n_samples, n_features = X.shape
        n_classes = len(np.unique(y))

        # max_depth=None means "no limit"; comparing an int against None
        # would raise TypeError, so test for it explicitly.
        depth_exhausted = self.max_depth is not None and depth >= self.max_depth
        if depth_exhausted or n_classes == 1 or n_samples < self.min_samples_split:
            # A leaf predicts the most common label, not the largest one.
            return Node(value=self._majority_class(y))

        best_gini = float('inf')
        best_feature, best_threshold = None, None

        for feature in range(n_features):
            column = X[:, feature]

            # Every distinct observed value is a candidate threshold.
            for threshold in np.unique(column):
                left_indices = column <= threshold
                right_indices = column > threshold

                # A split that leaves one side empty separates nothing.
                if not left_indices.any() or not right_indices.any():
                    continue

                gini = self._calculate_gini(y[left_indices], y[right_indices])

                if gini < best_gini:
                    best_gini = gini
                    best_feature = feature
                    best_threshold = threshold

        # Rows identical on every feature but with mixed labels cannot be
        # separated; fall back to a leaf instead of indexing with None.
        if best_feature is None or best_threshold is None:
            return Node(value=self._majority_class(y))

        left_indices = X[:, best_feature] <= best_threshold
        right_indices = X[:, best_feature] > best_threshold

        left = self._grow_tree(X[left_indices], y[left_indices], depth + 1)
        right = self._grow_tree(X[right_indices], y[right_indices], depth + 1)

        return Node(feature=best_feature, threshold=best_threshold, left=left, right=right)

    def _calculate_gini(self, left_labels, right_labels):
        """Return the size-weighted mean Gini impurity of the two groups."""
        n_left, n_right = len(left_labels), len(right_labels)
        total = n_left + n_right
        return (n_left / total) * self._gini(left_labels) + (n_right / total) * self._gini(right_labels)

    def predict(self, X):
        """Return an array with one predicted label per row of ``X``."""
        return np.array([self._traverse_tree(x, self.root) for x in X])

    def _traverse_tree(self, x, node):
        """Follow the split rules from ``node`` down to a leaf; return its value."""
        # A node is a leaf exactly when it stores a value.
        while node.value is None:
            node = node.left if x[node.feature] <= node.threshold else node.right
        return node.value

In [14]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [15]:
# Fetch the iris dataset, then pull out its feature matrix and class labels
iris = load_iris()
X = iris.data
y = iris.target

In [16]:
# Hold out 20% of the samples for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [28]:
# Build the hand-written random forest classifier and train it on the training split
clf = RandomForestClassifier(
    n_estimator=100,
    max_depth=4,
    min_samples_split=2,
)
clf.fit(X_train, y_train)

# Predict labels for the held-out test samples
y_pred = clf.predict(X_test)

# Report the fraction of test samples classified correctly
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 1.0


## RandomForestClassifier using Sklearn

In [29]:
from sklearn.ensemble import RandomForestClassifier

In [30]:
# Fetch the iris dataset, then pull out its feature matrix and class labels
iris = load_iris()
X = iris.data
y = iris.target

In [31]:
# Hold out 20% of the samples for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [33]:
# Create scikit-learn's random forest classifier: 100 trees, fixed seed
clf = RandomForestClassifier(
    n_estimators=100,
    random_state=44,
)

In [34]:
# Train the scikit-learn forest on the training split
clf.fit(X=X_train, y=y_train)

In [35]:
# Predict labels for the held-out test samples
y_pred = clf.predict(X_test)

# Report the fraction of test samples classified correctly
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 1.0
