In [1]:
from rpart.DecisionTreeClassifier import DecisionTreeClassifier
import numpy as np
import pandas as pd 
from collections import Counter

class RandomForestClassifier:
    """Bagged ensemble of ``DecisionTreeClassifier`` trees combined by majority vote.

    Every tree is fitted on the training rows (bootstrapped by default)
    restricted to a random subset of the columns. ``predict`` returns, for each
    sample, the label most trees voted for.

    Parameters
    ----------
    n_estimators : int
        Number of trees built by ``fit``.
    max_depth, metric, split_method, chimerge_threshold, chimerge_max_intervals
        Forwarded unchanged to every ``DecisionTreeClassifier``.
    max_features : {'sqrt', 'log2'} or int
        Number of columns each tree is trained on.
    bootstrap : bool
        If True, rows are resampled with replacement for each tree; otherwise
        each tree receives a copy of all rows.
    random_state : int or None
        If not None, ``fit`` seeds NumPy's *global* random generator with it.

    Attributes
    ----------
    trees : list
        The fitted trees, in the order they were built.
    classes_ : list or None
        Distinct labels seen by the last ``fit`` call (first-seen order).
    """

    def __init__(self, n_estimators=100, max_depth=None, metric='gini', split_method=None, chimerge_threshold=0.05, chimerge_max_intervals=None, max_features='sqrt', bootstrap=True, random_state=None):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.metric = metric
        self.split_method = split_method
        self.chimerge_threshold = chimerge_threshold
        self.chimerge_max_intervals = chimerge_max_intervals
        self.max_features = max_features
        self.bootstrap = bootstrap
        self.random_state = random_state
        self.trees = []
        # Fitted state, (re)built by every fit() call.
        self.classes_ = None
        self._feature_names = None
        self._tree_features = []

    def _make_tree(self):
        """Build one unfitted base tree configured with the forest's hyperparameters."""
        return DecisionTreeClassifier(max_depth=self.max_depth, metric=self.metric, split_method=self.split_method, chimerge_threshold=self.chimerge_threshold, chimerge_max_intervals=self.chimerge_max_intervals)

    def _sample_features(self, X):
        """Return ``X`` restricted to a random subset of its columns.

        Raises
        ------
        ValueError
            If ``max_features`` is not 'sqrt', 'log2' or an integer.
        """
        n_columns = X.shape[1]
        if self.max_features == 'sqrt':
            n_features = max(1, int(np.sqrt(n_columns)))
        elif self.max_features == 'log2':
            # log2(1) == 0 would select no columns at all, and log2(0) is
            # -inf; always keep at least one feature.
            n_features = max(1, int(np.log2(max(n_columns, 1))))
        elif isinstance(self.max_features, (int, np.integer)):
            n_features = int(self.max_features)
        else:
            raise ValueError("Invalid max_features. Supported values are 'sqrt', 'log2', or an integer.")

        return X.sample(n=n_features, axis=1)

    def _bootstrap_sample(self, X, y):
        """Draw ``len(X)`` rows with replacement and return the matching ``(X, y)`` pair."""
        n_samples = X.shape[0]
        indices = np.random.choice(n_samples, size=n_samples, replace=True)
        return X.iloc[indices], y.iloc[indices]

    def fit(self, X, y):
        """Train ``n_estimators`` trees on ``X`` (a DataFrame) and labels ``y``.

        Any previously fitted trees are discarded, so calling ``fit`` again
        retrains the forest instead of appending to it.
        """
        if self.random_state is not None:
            # Seeds the global NumPy generator used by the row and column draws.
            np.random.seed(self.random_state)

        # Start from a clean slate: without this, a second fit() would leave
        # more trees than n_estimators in the forest.
        self.trees = []
        self._tree_features = []
        self._feature_names = list(X.columns)
        # dict.fromkeys keeps first-seen order and needs no sortable labels.
        self.classes_ = list(dict.fromkeys(np.asarray(y).ravel().tolist()))

        for _ in range(self.n_estimators):
            tree = self._make_tree()

            if self.bootstrap:
                X_sample, y_sample = self._bootstrap_sample(X, y)
            else:
                X_sample, y_sample = X.copy(), y.copy()

            X_sample = self._sample_features(X_sample)
            tree.fit(X_sample, y_sample)
            self.trees.append(tree)
            # Remember which columns this tree saw so predict can pass the same ones.
            self._tree_features.append(list(X_sample.columns))

    def _as_frame(self, X):
        """Wrap ndarray input in a DataFrame, reusing the training column names when the width matches."""
        if isinstance(X, np.ndarray):
            X = pd.DataFrame(X)
            if self._feature_names is not None and X.shape[1] == len(self._feature_names):
                X.columns = self._feature_names
        return X

    def _collect_votes(self, X):
        """Return an object array of shape ``(n_trees, n_samples)`` holding every tree's prediction."""
        if not self.trees:
            raise ValueError("This RandomForestClassifier has no fitted trees; call fit() first.")

        X = self._as_frame(X)
        # Column bookkeeping is only trusted when it lines up with the trees
        # (it will not if `trees` was populated by hand).
        if len(self._tree_features) == len(self.trees):
            per_tree_columns = self._tree_features
        else:
            per_tree_columns = [None] * len(self.trees)

        # Sized by the actual number of trees, not n_estimators, so no row is
        # left as None and no index runs past the end.
        votes = np.empty((len(self.trees), X.shape[0]), dtype=object)
        for i, (tree, columns) in enumerate(zip(self.trees, per_tree_columns)):
            votes[i] = tree.predict(X if columns is None else X[columns])
        return votes

    def _proba_labels(self, votes):
        """Return the ordered labels that index the rows of ``predict_proba``."""
        labels = set(votes.ravel().tolist())
        if self.classes_ is not None:
            labels.update(self.classes_)
        if labels and all(isinstance(v, (int, np.integer)) and v >= 0 for v in labels):
            # Non-negative integer labels: row k is label k.
            return list(range(int(max(labels)) + 1))
        try:
            return sorted(labels)
        except TypeError:
            # Mixed, non-comparable label types: fall back to a stable textual order.
            return sorted(labels, key=repr)

    def predict(self, X):
        """Return the majority-vote label for every row of ``X``.

        Ties go to the label encountered first among the trees. Numeric
        results are cast to ``int``.

        Raises
        ------
        ValueError
            If the forest has no fitted trees.
        """
        votes = self._collect_votes(X)
        most_common = np.array([Counter(votes[:, i]).most_common(1)[0][0] for i in range(votes.shape[1])])
        return most_common.astype(int) if np.issubdtype(most_common.dtype, np.number) else most_common

    def predict_proba(self, X):
        """Return the fraction of trees voting for each label.

        Returns
        -------
        numpy.ndarray of shape ``(n_classes, n_samples)``
            Entry ``[k, j]`` is the share of trees that predicted class ``k``
            for sample ``j``. For non-negative integer labels, row ``k`` is
            label ``k``; for any other labels the rows follow the sorted
            labels. Note the orientation is classes-by-samples.

        Raises
        ------
        ValueError
            If the forest has no fitted trees.
        """
        votes = self._collect_votes(X)
        labels = self._proba_labels(votes)
        n_trees, n_samples = votes.shape

        row_of = {label: row for row, label in enumerate(labels)}
        proba = np.zeros((len(labels), n_samples), dtype=float)
        for col in range(n_samples):
            for label, count in Counter(votes[:, col]).items():
                proba[row_of[label], col] = count / n_trees
        return proba

In [2]:
from sklearn.model_selection import train_test_split
import pandas as pd 

# Read the dataset; the 'income' column is the label, every other column is a feature.
adult = pd.read_csv('data/adult.csv')
y = adult['income']
X = adult.drop(columns='income')

# Hold out 30% of the rows for evaluation, with a fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [3]:
# Create and train the RandomForestClassifier.
# random_state is fixed so repeated runs draw the same bootstrap rows and
# feature subsets (fit seeds NumPy's global generator with it).
clf = RandomForestClassifier(n_estimators=50, max_depth=3, metric='gini', random_state=42)
clf.fit(X_train, y_train)

In [4]:
from sklearn.metrics import accuracy_score, classification_report
# Score the held-out rows with the trained forest.
y_pred = clf.predict(X_test)

# Compute both metrics first, then print them.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
report = classification_report(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)
print("Classification Report:", report, sep="\n")

Accuracy: 0.7633333333333333
Classification Report:
              precision    recall  f1-score   support

       <=50K       0.76      1.00      0.86       227
        >50K       1.00      0.03      0.05        73

    accuracy                           0.76       300
   macro avg       0.88      0.51      0.46       300
weighted avg       0.82      0.76      0.67       300

