In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

In [11]:
class AdaBoost:
    """Discrete AdaBoost for binary labels encoded as -1 / +1.

    Each boosting round fits a fresh base learner on the re-weighted training
    set and stores it together with ``beta = eps / (1 - eps)``, where ``eps``
    is the learner's weighted training error. At prediction time every learner
    votes for its predicted label with weight ``log(1 / beta)``.

    Parameters
    ----------
    B : int
        Maximum number of boosting rounds.
    base_learner_factory : callable, optional
        Zero-argument callable returning a new, unfitted learner exposing
        ``fit(X, y, sample_weight)`` and ``predict(X)``. Defaults to a
        depth-1 ``DecisionTreeClassifier`` (a decision stump).

    Attributes
    ----------
    ensemble : list of (learner, beta) tuples, or None before ``fit``.
    """

    # The weighted error is clipped into [_EPS_FLOOR, 1 - _EPS_FLOOR] so that
    # beta is always finite and strictly positive; otherwise log(1 / beta) is
    # +/-inf and the weighted vote degenerates to NaN (0 * inf).
    _EPS_FLOOR = 1e-10

    def __init__(self, B, base_learner_factory=None):
        self.B = B
        self.base_learner_factory = base_learner_factory
        if base_learner_factory is None:
            def create_stump():
                model = DecisionTreeClassifier(max_depth=1)
                return model
            self.base_learner_factory = create_stump
        self.ensemble = None

    def predict(self, X, num_ensembles=None):
        """Predict -1 / +1 labels by weighted majority vote.

        Parameters
        ----------
        X : array-like
            Samples to classify, in whatever form the base learners accept.
        num_ensembles : int, optional
            Use only the first ``num_ensembles`` learners. Values are clamped
            to ``[0, len(self.ensemble)]``; ``None`` uses all of them.

        Returns
        -------
        numpy.ndarray of int
            +1 where the weighted vote for +1 strictly exceeds the vote for
            -1, otherwise -1 (ties, including an empty ensemble, give -1).

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if self.ensemble is None:
            raise RuntimeError("AdaBoost.predict() called before fit()")
        if num_ensembles is None:
            num_ensembles = len(self.ensemble)
        num_ensembles = min(max(0, num_ensembles), len(self.ensemble))
        ensemble = self.ensemble[:num_ensembles]

        if not ensemble:
            # No learners means both vote totals are zero, i.e. a tie -> -1.
            # Returned directly because np.vstack([]) would raise.
            return np.full(len(X), -1, dtype=int)

        # Column j holds the predictions of learner j for every sample.
        votes = np.vstack([learner.predict(X) for learner, _ in ensemble]).T
        betas = np.array([beta for _, beta in ensemble])
        vote_weights = np.log(1 / betas)
        y_1_val = np.sum((votes == 1) * vote_weights, axis=1)
        y_minus1_val = np.sum((votes == -1) * vote_weights, axis=1)
        return (y_1_val > y_minus1_val).astype(int) * 2 - 1

    def fit(self, X, y):
        """Fit up to ``B`` base learners on ``(X, y)``.

        Parameters
        ----------
        X : array-like
            Training samples, passed through unchanged to the base learners.
        y : array-like
            Training labels; ``predict`` votes on the values -1 and +1.

        Notes
        -----
        Boosting stops early when a learner classifies every training sample
        correctly (or every sample incorrectly): the re-weighting step would
        then leave the sample distribution unchanged, and the unclipped beta
        would be 0 (or infinite).
        """
        y = np.asarray(y)
        self.ensemble = []
        w = np.ones(len(y)) / len(y)
        for _ in range(self.B):
            bl = self.base_learner_factory()
            bl.fit(X, y, w)
            fail_idx = y != bl.predict(X)
            # Weighted training error, kept away from 0 and 1 (see _EPS_FLOOR).
            eps = float(np.clip(w[fail_idx].sum(),
                                self._EPS_FLOOR, 1 - self._EPS_FLOOR))
            beta = eps / (1 - eps)
            self.ensemble.append((bl, beta))
            if not fail_idx.any() or fail_idx.all():
                break
            # Shrink the weight of correctly classified samples, renormalize.
            w[~fail_idx] *= beta
            w /= w.sum()

In [12]:
# Synthetic binary classification problem: each feature is a squared standard
# normal, and the label is +1 when the row sum exceeds the median row sum
# (so the classes are balanced), otherwise -1.
SEED = 42  # fixed so that Restart & Run All reproduces the same data and split
rng = np.random.default_rng(SEED)

df = 10     # number of features per sample
n = 10000   # number of samples

# np.square instead of np.pow(..., 2): np.pow only exists in NumPy >= 2.0.
X = np.square(rng.normal(size=(n, df)))
row_sums = X.sum(axis=1)
y = (row_sums > np.median(row_sums)).astype(int) * 2 - 1
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED
)