In [39]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [40]:
# Load the dataset from disk and keep it as a plain NumPy array for slicing below.
# NOTE(review): read_csv treats the first row as a header by default — confirm that
# 'data/bm.csv' really has a header row, otherwise the first sample is dropped.
bm_frame = pd.read_csv('data/bm.csv')
data = bm_frame.to_numpy()

In [41]:
# A fresh seed in [1, 1000] is drawn on every run; it is kept in `random_state`
# so the value that produced a given split can be inspected afterwards.
random_state = np.random.randint(1, 1001)

# First two columns are the features, the third column is the label.
features = data[:, :2]
labels = data[:, 2]

# Hold out 10% of the rows as the test set.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.1,
    random_state=random_state,
)

In [42]:
# Bootstrap sampling.
# `r[:, i]` holds the row indices into X_train (and y_train) that make up bootstrap
# sample i; `XX[:, :, i]` holds the corresponding feature rows of X_train.
from sklearn.tree import DecisionTreeClassifier
from numpy.random import default_rng
from sklearn.metrics import accuracy_score
rng = default_rng()

n = 5000            # number of rows drawn (with replacement) per bootstrap sample
n_bootstraps = 100  # number of bootstrap samples (the later cells use one tree per sample)
n_train = X_train.shape[0]

r = np.zeros([n, n_bootstraps], dtype=int)
XX = np.zeros([n, X_train.shape[1], n_bootstraps])

# bootstrap sampling
for i in range(n_bootstraps):
    # Draw the indices from *all* training rows. Drawing from range(n) would only
    # ever reach the first n rows of X_train (and would raise IndexError if the
    # training set had fewer than n rows).
    r[:, i] = rng.choice(n_train, size=n, replace=True)
    XX[:, :, i] = X_train[r[:, i], :]

In [43]:
# Fit one decision tree per bootstrap sample. No max_depth is passed, so the
# classifier's defaults decide how deep each tree grows; the intent is to let the
# trees overfit so that a pruning approach can be applied afterwards.
trees = []
for b in range(100):
    rows = r[:, b]  # row indices of bootstrap sample b
    fitted_tree = DecisionTreeClassifier().fit(X_train[rows], y_train[rows])
    trees.append(fitted_tree)

I think I will use the 'pruning' method (stopping when the decrease in impurity is no longer significant).

Note: classification error should only be used to measure performance, not as the training criterion.
Note: probably the best way to decide on `max_depth` is to use Gini impurity measures and see after how many depth levels the change starts being insignificant. That is the place to stop.

In [44]:
# Collect every tree's predictions on the test set, both as a list of arrays
# (`pred_list`) and as an integer matrix with one row per test sample and one
# column per tree (`predictions`).
pred_list = [tree.predict(X_test) for tree in trees]

predictions = np.zeros([X_test.shape[0], 100], dtype=int)
for col, tree_pred in enumerate(pred_list):
    predictions[:, col] = tree_pred

In [45]:
# Test-set accuracy of each individual tree, in the same order as `pred_list`.
accuracies = [accuracy_score(y_test, tree_pred) for tree_pred in pred_list]

In [54]:
# Majority-vote prediction: each row of `predictions` holds the votes of all trees
# for one test sample.
def _majority_label(votes):
    """Return the most frequent value in `votes`.

    np.unique returns the labels sorted, and argmax returns the first maximum,
    so a tie is resolved in favour of the smallest label.
    """
    labels, tallies = np.unique(votes, return_counts=True)
    return labels[np.argmax(tallies)]


random_forest_predictions = np.array([_majority_label(votes) for votes in predictions])

# Fraction of test samples on which the ensemble's vote matches the true label.
n_correct = np.sum(y_test == random_forest_predictions)
random_forest_accuracy = n_correct / len(y_test)

In [55]:
# Error rate of the ensemble on the held-out test set (complement of its accuracy).
generalization_error = 1.0 - random_forest_accuracy