Loading Dataset

In [65]:
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split

In [66]:
data = datasets.load_breast_cancer()
X, y = data.data, data.target
y[ y==0 ] = -1
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=5)


Model

In [67]:
class DecisionStump:
    def __init__(self):
        self.polarity = 1
        self.feature_idx = None
        self.threshold = None
        self.alpha = None
        
    def predict(self, X):
        n_samples = X.shape[0]
        X_column = X[:, self.feature_idx]
        predictions = np.ones(n_samples)
        
        if self.polarity == 1:
            predictions[X_column < self.threshold] = -1
        else:
            predictions[X_column > self.threshold] = -1
            
        return predictions

In [68]:
class Adaboost:
    def __init__(self, n_clf=5):
        self.n_clf = n_clf
        
    def fit(self, X, y):
        n_samples, n_features = X.shape
        
        w = np.full(n_samples, (1/n_samples))
        
        self.clf = list()
        for _ in range(self.n_clf):
            clf = DecisionStump()
            
            min_err = float('inf')
            
            for feature_i in range(n_features):
                X_column = X[:, feature_i]
                thresholds = np.unique(X_column)
                for threshold in thresholds:
                    p = 1
                    predictions = np.ones(n_samples)
                    predictions[X_column < threshold] = -1
                    
                    missclassified = w[y != predictions]
                    error = np.sum(missclassified)
                    
                    if error > .5:
                        error = 1 - error
                        p = -1
                        
                    if error < min_err:
                        min_err = error
                        clf.polarity = p
                        clf.threshold = threshold
                        clf.feature_idx = feature_i
                        
            EPS = 1e-10
            clf.alpha = .5 * np.log((1-error) / (error+EPS))
            
            predictions = clf.predict(X)
            w *= np.exp(-clf.alpha * y * predictions)
            w /= np.sum(w)
            
            self.clf.append(clf)
            
    def predict(self, X):
        clf_preds = [clf.alpha * clf.predict(X) for clf in self.clf]
        y_preds = np.sum(clf_preds, axis=0)
        return np.sign(y_preds)

Testing

In [69]:
def accuracy(y_true, y_pred):
    acc = np.sum(y_pred == y_true) / len(y_pred)
    return acc

In [70]:
clf = Adaboost(n_clf=4)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

accuracy(y_test, y_pred)

0.956140350877193