In [1]:
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
import numpy as np
import math

In [2]:
def load_iris(ratio=0.8, seed=None):
    """Load the iris dataset and split it randomly into train and validation parts.

    Args:
        ratio: Fraction of the samples placed in the training split, in
            ``[0, 1]``. The training size is ``ceil(num_samples * ratio)``.
        seed: Optional seed for the shuffle. With ``None`` (the default) the
            permutation is drawn from NumPy's global random state, so the
            split differs between runs unless that state is seeded elsewhere.

    Returns:
        ``((X_train, y_train), (X_valid, y_valid))`` where each ``X`` holds
        feature rows and each ``y`` the matching targets.

    Raises:
        ValueError: If ``ratio`` is outside ``[0, 1]``.
    """
    if not 0.0 <= ratio <= 1.0:
        raise ValueError(f"ratio must be within [0, 1], got {ratio!r}")

    # return_X_y has to be passed by keyword: it is keyword-only in current
    # scikit-learn releases, so the positional form raises TypeError there.
    features, target = datasets.load_iris(return_X_y=True)
    num_samples = len(target)
    num_train = math.ceil(num_samples * ratio)

    # Shuffle the sample indices so the split is random.
    rng = np.random if seed is None else np.random.RandomState(seed)
    idx = rng.permutation(np.arange(num_samples))
    train_idx, valid_idx = idx[:num_train], idx[num_train:]

    traindata = features[train_idx], target[train_idx]
    validdata = features[valid_idx], target[valid_idx]

    return traindata, validdata

In [3]:
# Draw one random train/validation split of iris with the default ratio.
# The shuffle inside load_iris is random, so the split changes between runs
# unless NumPy's random state is seeded beforehand.
(X_train, Y_train), (X_valid, Y_valid) = load_iris()

# 作业四

## 四、AdaBoost算法

利用算法8.1对iris数据集进行分类

* 利用sklearn提供的`DecisionTreeClassifier`构造单层决策树作为基本分类器
* 调整式(8.7)以适用于多分类的情况

ETA：0.5-3 hours

In [4]:
class AdaBoostClassifier:
    """Multi-class AdaBoost built on depth-1 decision trees (decision stumps).

    Each boosting round fits a stump on the weighted training set, gives it a
    vote based on its weighted error, and re-weights the samples so that the
    ones it got wrong count more in the next round. Prediction is a weighted
    majority vote over the stumps.

    The vote uses the SAMME multi-class form
    ``alpha = 1/2 * (ln((1 - e) / e) + ln(K - 1))`` with ``K`` classes. For
    ``K == 2`` the extra term is zero, which gives the usual binary formula.

    Attributes set by ``fit``:
        classes_: Sorted unique labels seen in ``y``.
        estimators_: Fitted stumps that were kept, in boosting order.
        estimator_weights_: Array of length ``n_estimators``; entry ``m`` is
            the vote of stump ``m`` (0 for rounds that never ran).
        estimator_errors_: Array of length ``n_estimators``; entry ``m`` is
            the weighted error of stump ``m`` (1 for rounds that never ran).
    """

    def __init__(self, n_estimators=50):
        self.n_estimators = n_estimators  # M: maximum number of boosting rounds

    def fit(self, X, y):
        """Fit the boosted ensemble.

        Inputs:
          X: array of shape (N, C)
          y: array of shape (N, )

        Boosting stops early when a stump fits the weighted data perfectly
        (nothing left to correct) or when a stump is no better than random
        guessing (non-positive vote); the latter stump is discarded.

        Returns:
          self
        """
        X = np.asarray(X)
        y = np.asarray(y)

        self.classes_ = np.unique(y)
        self.estimators_ = []
        self.estimator_weights_ = np.zeros(self.n_estimators)
        self.estimator_errors_ = np.ones(self.n_estimators)

        num_samples = X.shape[0]    # N
        sample_weight = np.full(num_samples, 1.0 / num_samples)

        for iboost in range(self.n_estimators):
            sample_weight, estimator, estimator_weight, estimator_error = self._boost(
                X, y, sample_weight)

            if estimator_weight <= 0:
                # No better than chance: a non-positive vote would only hurt
                # the ensemble, so drop this stump and stop boosting.
                break

            self.estimators_.append(estimator)
            self.estimator_weights_[iboost] = estimator_weight
            self.estimator_errors_[iboost] = estimator_error

            if estimator_error <= 0:
                # Perfect stump: re-weighting changes nothing, so every later
                # round would reproduce the same fit.
                break

        return self

    def _boost(self, X, y, sample_weight):
        """Run one boosting round.

        Reads ``self.classes_`` (set by ``fit``) for the number of classes.

        Inputs:
          X: array of shape (N, C)
          y: array of shape (N, )
          sample_weight: array of shape (N, ), expected to sum to 1

        Returns:
          (new_sample_weight, estimator, estimator_weight, estimator_error).
          ``new_sample_weight`` is a new array; the input is not modified.
        """
        estimator = DecisionTreeClassifier(max_depth=1)
        estimator.fit(X, y, sample_weight=sample_weight)

        # G_m(x_i) != y_i
        incorrect = estimator.predict(X) != y

        # e_m: total weight of the misclassified samples
        estimator_error = float(np.dot(sample_weight, incorrect))

        # alpha_m. The error is clipped away from 0 and 1 so the logarithm
        # stays finite; an unclipped error of 0 would give an infinite vote
        # and turn every sample weight into NaN.
        n_classes = len(self.classes_)
        eps = np.finfo(float).eps
        clipped_error = min(max(estimator_error, eps), 1.0 - eps)
        estimator_weight = 0.5 * (
            np.log((1.0 - clipped_error) / clipped_error)
            + np.log(max(n_classes - 1, 1))
        )

        # +1 where the stump is right, -1 where it is wrong
        agreement = np.where(incorrect, -1.0, 1.0)

        # w_{m+1,i} = w_{m,i} * exp(-alpha_m * agreement_i) / Z_m
        new_sample_weight = sample_weight * np.exp(-estimator_weight * agreement)
        new_sample_weight /= new_sample_weight.sum()  # Z_m

        return new_sample_weight, estimator, estimator_weight, estimator_error

    def predict(self, X):
        """Predict a class label for every row of ``X`` by weighted vote.

        Inputs:
          X: array of shape (N, C)

        Returns:
          Array of shape (N, ) holding labels taken from ``self.classes_``.
          Ties go to the class that comes first in ``self.classes_``. If no
          stump was kept during ``fit``, every row gets ``self.classes_[0]``.
        """
        X = np.asarray(X)

        # scores[k, i]: summed vote of all stumps predicting classes_[k] for row i
        scores = np.zeros((len(self.classes_), X.shape[0]))
        class_column = self.classes_[:, None]

        for estimator, weight in zip(self.estimators_, self.estimator_weights_):
            votes = np.asarray(estimator.predict(X))
            scores += weight * (votes == class_column)

        # Map the winning row index back to the actual label so that labels
        # other than 0..K-1 (including strings) are returned correctly.
        return self.classes_[np.argmax(scores, axis=0)]

In [5]:
def accuracy(Y_real, Y_pred):
    """Return the fraction of predictions that equal the true labels.

    Args:
        Y_real: Array-like of ground-truth labels.
        Y_pred: Array-like of predicted labels with the same shape as ``Y_real``.

    Returns:
        Number of matching positions divided by ``len(Y_real)``; ``nan`` when
        the inputs are empty.

    Raises:
        ValueError: If the two inputs do not have the same shape.
    """
    # Convert first: `==` on plain Python lists yields one bool for the whole
    # list instead of an element-wise comparison.
    real = np.asarray(Y_real)
    pred = np.asarray(Y_pred)

    if real.shape != pred.shape:
        raise ValueError(
            f"shape mismatch: Y_real has shape {real.shape}, Y_pred has shape {pred.shape}"
        )
    if len(real) == 0:
        return float("nan")

    return np.sum(real == pred) / len(real)

In [6]:
# Train the boosted ensemble on the training split.
model = AdaBoostClassifier()

model.fit(X_train, Y_train)

# Score on the held-out validation split. Ground truth goes first to match
# the accuracy(Y_real, Y_pred) signature.
accu = accuracy(Y_valid, model.predict(X_valid))

print(f"Accuracy: {accu:.4f}")

Accuracy: 1.0000
