# Random Forests

In [1]:
import numpy as np
import matplotlib.pyplot as plt

## Out of Bag Evaluation

Each tree only see a subset of the dataset. Any data that a particular tree did not see is called **out of bag** (oob).  We can use them as validation set.

## Random subset of features

The $B$ bootstrapped dataset can be correlated.

A **random forest** is constructed by bagging + only a random" subset of $q \leq n$ features

Rule of thumb: $q = \sqrt{n}$ for classification trees and $q = \frac{n}{3}$ for regression trees

## 1. Scratch

In [2]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

iris = load_iris()
X = iris.data
y = iris.target

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                test_size=0.3, shuffle=True, random_state=42)

In [3]:
#import
import random, math
from sklearn.tree import DecisionTreeClassifier
from scipy import stats
from sklearn.metrics import accuracy_score

#class
class RandomForest:

    #init
    def __init__(self, B, bootstrap_ratio, with_no_replacement=True):
        self.B = B
        self.bootstrap_ratio = bootstrap_ratio
        self.with_no_replacement = with_no_replacement
        self.tree_params = {'max_depth': 2, 'max_features': 'sqrt'} 
        self.models = [DecisionTreeClassifier(**self.tree_params) for _ in range(B)]
    
    #fit
    def fit(self, X, y):
        m, n = X.shape
        sample_size = int(self.bootstrap_ratio * len(X))
        xsamples = np.zeros((self.B, sample_size, n))
        ysamples = np.zeros((self.B, sample_size))
        xsamples_oob = []
        ysamples_oob = []
        
        #bootstrapping
        for i in range(self.B):
            oob_idx = []
            idxes   = []
            for j in range(sample_size):
                idx = random.randrange(m)
                if (self.with_no_replacement):
                    while idx in idxes:
                        idx = random.randrange(m)
                idxes.append(idx)
                oob_idx.append(idx)
                xsamples[i, j, :] = X[idx]
                ysamples[i, j] = y[idx]
            mask = np.zeros((m), dtype=bool)
            mask[oob_idx] = True
            xsamples_oob.append(X[~mask])
            ysamples_oob.append(y[~mask])
        
        #fitting
        oob_score = 0
        print("-----Out of bag score for each tree------")
        for i, model in enumerate(self.models):
            _X = xsamples[i]
            _y = ysamples[i]
            model.fit(_X, _y)
            
            _X_test = np.asarray(xsamples_oob[i])
            _y_test = np.asarray(ysamples_oob[i])
            yhat = model.predict(_X_test)
            oob_score += accuracy_score(_y_test, yhat)
            print(f"Tree {i}", accuracy_score(_y_test, yhat))
        
        self.avg_oob_score = oob_score / len(self.models)
        print('========Average oob score========')
        print(self.avg_oob_score)
        
            
    #predict
    def predict(self, X): #<---X_test
        predictions = np.zeros((self.B, X.shape[0]))
        for i, model in enumerate(self.models):
            yhat = model.predict(X)
            predictions[i, :] = yhat
        return stats.mode(predictions)[0][0]
    

In [4]:
model = RandomForest(B=5, bootstrap_ratio=0.8)

In [5]:
model.fit(X_train, y_train)

-----Out of bag score for each tree------
Tree 0 0.9047619047619048
Tree 1 1.0
Tree 2 1.0
Tree 3 0.9047619047619048
Tree 4 0.8571428571428571
0.9333333333333332


In [6]:
yhat  = model.predict(X_test)

  return stats.mode(predictions)[0][0]


In [7]:
print(classification_report(y_test, yhat))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        19
           1       1.00      1.00      1.00        13
           2       1.00      1.00      1.00        13

    accuracy                           1.00        45
   macro avg       1.00      1.00      1.00        45
weighted avg       1.00      1.00      1.00        45

