In [1]:
import pandas as pd

train = pd.read_csv("p2_data/data_train.csv", header=None)
target = train.shape[1]-1
y = train[target].to_numpy()
X = train.drop(columns=[target]).to_numpy()

test = pd.read_csv("p2_data/data_test.csv", header=None)
y_test = test[target]
X_test = test.drop(columns=[target])


Implement Random Forest:<br/>
1. It gets the number of trees (n) and their height
2. Samples the dataset with replacement(bootstrap) n times
3. For each tree randomly selects a few features
4. Train all trees
5. sum the predictions of all trees and use argmax

In [15]:
from sklearn import tree
import random
from sklearn.utils import resample
import numpy as np

class RandomForest:
    def __init__(self, n, depth):
        self.n = n
        self.depth = depth
        self.trees = []
        self.features = []
        self.samplesX = []
        self.samplesy = []
        self.n_classes = 2
        self.build()
        
    def build(self):
        for i in range(self.n):
            clf = tree.DecisionTreeClassifier(max_depth=self.depth)
            self.trees.append(clf)
            
    def fit(self, X, y, n):
        self.n_classes = len(np.unique(y))
        for i in range(self.n):
            self.bootstrap(X, y)
            self.random_features(n, X.shape[1]-1)
            self.trees[i] = self.trees[i].fit(self.samplesX[i][:, self.features[i]], self.samplesy[i])
            
    def predict(self, X):
        predictions = np.zeros(shape=(len(X), self.n_classes))
        for i in range(self.n):
            predictions = np.add(predictions,self.trees[i].predict_proba(X[:, self.features[i]]))
            
        return np.argmax(predictions, axis=1)

    def bootstrap(self, X, y):
        r = random.randint(0, 1000)
        bootX = resample(X, replace=True, n_samples=len(X), random_state=r)
        booty = resample(y, replace=True, n_samples=len(X), random_state=r)
        self.samplesX.append(bootX)
        self.samplesy.append(booty)
        
    def random_features(self, n, n_features):
        features = []
        for i in range(n):
            features.append(random.randint(0, n_features))
        self.features.append(features)


In [16]:
clf = RandomForest(15, 3)
clf.fit(X, y, 3)

In [18]:
print(y)
predictions = clf.predict(X)
print(predictions)

[8 2 1 ... 5 1 7]
[8 2 1 ... 3 2 7]


In [19]:
from sklearn.metrics import accuracy_score
accuracy_score(y, predictions)

0.8291966906858821

In [20]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y, predictions)

array([[742,   6,   0,   0,   4,   0,  11,   0,  17,   0],
       [  0, 542, 160,  55,  12,   0,   4,   0,   0,   6],
       [  0,   8, 753,   1,   1,   0,   3,  13,   1,   0],
       [  0,  15,   1, 697,   1,   0,   0,   0,   0,   5],
       [  1,   2,   0,   1, 751,   1,  20,   0,   0,   4],
       [  1,   0,   0, 179,   1, 456,   4,   0,   2,  77],
       [  0,   0,   2,   3,   2,   1, 703,   9,   0,   0],
       [  0,  90,   4,   5,   1,   1,   7, 658,  11,   1],
       [129,  12,   6,  11,   5,   4,  10, 108, 379,  55],
       [  1,  74,   0,  87,  20,   2,   0,   0,   2, 533]])