<a href="https://colab.research.google.com/github/farahelmashad/ML_Models/blob/main/Random_Forest/Random_Forest_from_scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Random Forest from scratch


In [4]:
import numpy as np
import pandas as pd
from collections import Counter
from scipy.stats import mode



## Decision tree implementation for our random forest:


In [2]:
class Node:
    def __init__(self,feature=None, threshold=None, left=None,right=None, value=None):
        self.feature=feature
        self.threshold=threshold
        self.right=right
        self.left=left
        self.value=value # if it's a leaf node (majority for classification, mean for regression)
    def is_leaf_node(self,node):
      return node.value is not None


In [3]:
class DecisionTree:
    def __init__(self, task='classification', criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1, n_features=None):
        self.task=task
        if self.task=="regression":
            self.criterion="mse"
        else:
            self.criterion=criterion
        self.max_depth=max_depth
        self.min_samples_split=min_samples_split
        self.min_samples_leaf=min_samples_leaf
        self.n_features= len(X[0]) if n_features is None else min(len(X[0]),n_features) # used for random forests
    def fit(self,X,y):
        self.X=X
        self.y=y
        self.n_features=len(X[0])
        self.root=self.build_tree(self.X,self.y,depth=0)
    def build_tree(self, X, y, depth):
        #check for the base case or stopping criteria
        if ( self.max_depth is not None and depth>=self.max_depth
            or len(y)<= self.min_samples_split
            or len(y)<= self.min_samples_leaf
            or np.unique(y).size == 1):
            if self.task=="classification":
                values_counter=Counter(y)
                majority_class=values_counter.most_common(1)[0][0]
                return Node(value=majority_class)
            elif self.task=="regression":
                return Node(value=np.mean(y))


        best_feature, best_threshold=self.find_best_split(X,y)
        X_left,y_left,X_right,y_right=self.split_data(X,y,best_feature,best_threshold)

        left_subtree=self.build_tree(X_left,y_left, depth+1)
        right_subtree=self.build_tree(X_right,y_right, depth+1)

        return Node(feature=best_feature, threshold=best_threshold,left=left_subtree, right=right_subtree)

    def find_best_split(self, X, y):
        best_gini=float('inf')
        best_ig=-float('inf')
        best_mse=float('inf')
        best_feature=None
        best_threshold=None
        no_features=len(X[0]) # actual number of feeatures we have
        features= np.random.choice(no_features, self.n_features,replace=False)
        for feature in features:
            # unique_values=np.array(sorted(np.unique(X[:,feature])))
            # midpoints=(unique_values[:-1]+unique_values[1:])/2
            thresholds = np.percentile(X[:, feature], np.linspace(0, 100, 10))
            thresholds = np.unique(thresholds)

            for threshold in thresholds:
                X_left,y_left,X_right,y_right=self.split_data(X,y,feature,threshold)
                if len(y_left) == 0 or len(y_right) == 0:
                 continue

                if self.criterion=="gini":
                 weighted_gini=((len(y_left)/len(y))*self.Gini(y_left))+((len(y_right)/len(y))*self.Gini(y_right))
                 if weighted_gini<best_gini:
                    best_gini=weighted_gini
                    best_feature=feature
                    best_threshold=threshold

                elif self.criterion=="entropy":
                    weighted_entropy=((len(y_left)/len(y))*self.entropy(y_left))+((len(y_right)/len(y))*self.entropy(y_right))
                    information_gain=self.entropy(y)-weighted_entropy
                    if information_gain> best_ig:
                        best_ig=information_gain
                        best_feature=feature
                        best_threshold=threshold
                elif self.task=="regression":
                    weighted_mse=(((len(y_left)/len(y))*self.mse(y_left)))+(((len(y_right)/len(y))*self.mse(y_right)))
                    if weighted_mse<best_mse:
                        best_mse=weighted_mse
                        best_feature=feature
                        best_threshold=threshold

        return best_feature, best_threshold

    def split_data(self, X, y, feature, threshold):
     left = X[:, feature] <= threshold
     right = X[:, feature] > threshold
     X_left = X[left]
     y_left = y[left]
     X_right = X[right]
     y_right = y[right]
     return X_left, y_left, X_right, y_right

    def Gini(self,y):
        if len(y)==0:
            return 0.0
        values, counts=np.unique(y, return_counts=True)
        probabilites= counts/counts.sum()
        return 1-np.sum(probabilites**2)
    def entropy(self,y):
        if len(y)==0:
            return 0.0
        values, counts=np.unique(y, return_counts=True)
        probabilities=counts/counts.sum()
        entropies= -probabilities*np.log2(probabilities+1e-9) # aashan el log 0
        return entropies.sum()
    def mse(self, y):
     if len(y) == 0:
        return 0
     mean = np.mean(y)
     return np.mean((y - mean) ** 2)

    def traverse_tree(self,x,node):
      if node.is_leaf_node(node):
        return node.value
      if x[node.feature]<=node.threshold:
        return self.traverse_tree(x,node.left)
      elif x[node.feature]>node.threshold:
        return self.traverse_tree(x,node.right)


    def predict(self, X):
     y = [self.traverse_tree(x, self.root) for x in X]
     return np.array(y)


In [5]:
class RandomForest:
  def __init__(self,task="classification",n_estimators=100, *, criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1, max_features='sqrt', bootstrap=True):
    self.task=task
    self.n_estimators=n_estimators
    self.criterion= criterion if task=="classification" else "mse"
    self.max_depth=max_depth
    self.min_samples_split=min_samples_split
    self.min_samples_leaf=min_samples_leaf
    self.max_features=max_features if task=="classification" else "n/3"
    self.bootstrap=bootstrap
    self.trees=[]
  def fit(self, X, y):
    self.X=X
    self.y=y
    #set the number of features for the trees
    if self.max_features=="sqrt":
      self.n_features=int(np.sqrt(X.shape[1]))
    elif self.max_features=="log2":
      self.n_features=int(np.log2(X.shape[1]))
    elif self.task=="regression":
      self.n_features=int(X.shape[1]/3)
    #create the trees
    self.trees = [
     DecisionTree(
        task=self.task,
        criterion=self.criterion,
        max_depth=self.max_depth,
        min_samples_split=self.min_samples_split,
        min_samples_leaf=self.min_samples_leaf,
        n_features=self.n_features
    )
    for _ in range(self.n_estimators)]
    for tree in self.trees:
      if self.bootstrap:
        X_sample, y_sample=self._bootstrap(X,y)
        tree.fit(X_sample,y_sample)
      else:
        tree.fit(X,y)


  def _bootstrap(self,X,y):
    random_indices=np.random.choice(len(X),len(X), replace=True)
    return X[random_indices], y[random_indices]

  def predict(self, X):
    final=[]
    predictions=np.array([tree.predict(X) for tree in self.trees])

    if self.task=="classification":
      majority_votes, _ = mode(predictions, axis=0)
      return majority_votes.flatten()

    elif self.task=="regression":
      return np.mean(predictions,axis=0)







# Testing the Random Forest for classification against sklearn's implementation

In [8]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

iris = load_iris()
X = iris.data
y = iris.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

rf=RandomForest(n_estimators=100, criterion="gini", max_depth=10, min_samples_split=2, min_samples_leaf=1, max_features="sqrt", bootstrap=True)
rf.fit(X_train,y_train)

y_pred = rf.predict(X_test)

train_accuracy=accuracy_score(y_train, rf.predict(X_train))
accuracy = accuracy_score(y_test, y_pred)
print(f" Train Accuracy: {train_accuracy:.2f} \n Test Accuracy: {accuracy:.2f}")

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

rf_sklearn = RandomForestClassifier(n_estimators=100, criterion="gini", max_depth=10)
rf_sklearn.fit(X_train, y_train)

print("Sklearn train accuracy: ", accuracy_score(y_train, rf_sklearn.predict(X_train)))
print("Sklearn test accuracy:", accuracy_score(y_test, rf_sklearn.predict(X_test)))


 Train Accuracy: 1.00 
 Test Accuracy: 1.00
Sklearn train accuracy:  1.0
Sklearn test accuracy: 1.0


In [18]:
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import fetch_california_housing
from sklearn.metrics import r2_score
data = fetch_california_housing()

X, y = data.data, data.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
rf=RandomForest(n_estimators=100,task="regression", criterion="mse",max_depth=10, min_samples_split=10, min_samples_leaf=5)
rf.fit(X_train,y_train)
y_pred=rf.predict(X_test)
print("Random forest implementation: ")
print("MSE of our model (train): ", mean_squared_error(y_train, rf.predict(X_train)))
print("MSE of our model (test): ", mean_squared_error(y_test, y_pred))
print("r2 score of our model (train): ", r2_score(y_train,rf.predict(X_train)))
print("r2 score of our model (train): ", r2_score(y_test,y_pred))
print("")
rff=RandomForestRegressor(n_estimators=100,max_depth=10, min_samples_split=10, min_samples_leaf=5)
rff.fit(X_train,y_train)
y_pred_sk=rff.predict(X_test)
print("Scikit learn's implementation: ")
print("MSE of our model (train): ", mean_squared_error(y_train, rff.predict(X_train)))
print("MSE of our model (test): ", mean_squared_error(y_test, y_pred_sk))
print("r2 score of our model (train): ", r2_score(y_train,rff.predict(X_train)))
print("r2 score of our model (train): ", r2_score(y_test,y_pred_sk))



Random forest implementation: 
MSE of our model (train):  0.20448307352773165
MSE of our model (test):  0.3181240317854817
r2 score of our model (train):  0.8470329096297821
r2 score of our model (train):  0.7572330187995711

Scikit learn's implementation: 
MSE of our model (train):  0.19346647916572288
MSE of our model (test):  0.2964034068148744
r2 score of our model (train):  0.8552740630723279
r2 score of our model (train):  0.7738084737386582
