In [1]:
import numpy as np
from sklearn.datasets import make_classification, load_iris, load_digits
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

In [2]:
# make_classification?
# DecisionTreeClassifier?
# train_test_split?
# load_iris?

In [3]:
# Synthetic classification data set: 10000 samples with 20 features, 18 of
# them informative, generated with flip_y=0.2 and a fixed seed.
dataset = make_classification(
    n_samples=10000,
    n_features=20,
    n_informative=18,
    flip_y=0.2,
    random_state=217,
)
X, y = dataset
# Relabel class 0 as -1 (in place) so the two classes are encoded as -1 / +1.
y[y == 0] = -1
# Alternative data set (disabled):
# dataset = load_digits()
# X = dataset.data
# y = dataset.target

In [4]:
# Hold out 33% of the samples for testing; the split is seeded.
train_X, test_X, train_y, test_y = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [5]:
# Fit one decision tree per depth (1 to 4) and report its test accuracy.
# The fitted trees are collected in `clfs`.
clfs = []
for tree_depth in range(1, 5):
    # Seed the tree: scikit-learn permutes the features at every split, so
    # ties between equally good splits are otherwise broken differently from
    # run to run.
    clf = DecisionTreeClassifier(max_depth=tree_depth, random_state=42)
    clf.fit(train_X, train_y)
    score = clf.score(test_X, test_y)
    clfs.append(clf)
    # Format into a single string so the line reads "score: <value>" on both
    # Python 2 and Python 3 (print with two arguments shows a tuple on 2).
    print('score: {}'.format(score))

('score:', 0.6581818181818182)
('score:', 0.6836363636363636)
('score:', 0.71787878787878789)
('score:', 0.71060606060606057)


In [6]:
class Adaboost:
    """Weighted-vote ensemble over already-fitted binary classifiers.

    Each rule passed to ``set_rule`` must expose ``predict(X)``. The weight
    update and the final ``np.sign`` vote only make sense when labels and
    predictions are encoded as -1 / +1.
    """

    def __init__(self, training_X, training_y):
        """Store the training set and start from uniform sample weights.

        Parameters
        ----------
        training_X : sequence of samples handed to each rule's ``predict``.
        training_y : labels aligned with ``training_X``.
        """
        self.training_X = training_X
        self.training_y = training_y
        self.N = len(self.training_X)
        # Sample weights start out uniform (1/N each).
        self.weights = np.ones(self.N)/self.N
        # The ensemble's member classifiers ("rules").
        self.RULES = []
        # Vote weight of each rule, aligned index-by-index with RULES.
        self.ALPHAS = []

    def set_rule(self, rule_model):
        """
        Add one rule: compute its vote weight and re-weight the samples.

        The rule's weighted training error ``e`` gives
        ``alpha = 0.5 * log((1 - e) / e)``; every sample weight is then
        multiplied by ``exp(-alpha * prediction * label)`` and the weights
        are renormalised to sum to 1.
        """
        train_y_pred = np.asarray(rule_model.predict(self.training_X))
        y_true = np.asarray(self.training_y)

        error = np.sum(self.weights * (train_y_pred != y_true))
        # A rule that is perfectly right (error 0) or perfectly wrong
        # (error 1) would make alpha +/-inf, after which the renormalised
        # weights become NaN. Keep the error strictly inside (0, 1).
        eps = np.finfo(float).eps
        error = np.clip(error, eps, 1 - eps)

        alpha = 0.5*np.log((1-error)/error)
        w = self.weights * np.exp(-alpha*train_y_pred*y_true)
        self.weights = w/w.sum()
        self.RULES.append(rule_model)
        self.ALPHAS.append(alpha)

    def evalute(self, testing_X):
        """Return the sign of the alpha-weighted vote for every test sample.

        The result is a float array of length ``len(testing_X)`` holding
        -1.0, 1.0, or 0.0 (tied vote, or no rules registered). Each rule
        predicts the whole batch once instead of once per sample.
        """
        test_num = len(testing_X)
        testing_y_pred = np.zeros(test_num)
        if test_num == 0:
            # Nothing to score; do not call predict on an empty batch.
            return testing_y_pred
        for alpha, rule in zip(self.ALPHAS, self.RULES):
            votes = np.asarray(rule.predict(testing_X), dtype=float)
            testing_y_pred += alpha * votes.reshape(test_num)
        return np.sign(testing_y_pred)

    # Correctly spelled alias; ``evalute`` is kept for existing callers.
    evaluate = evalute
            

In [7]:
# Build the ensemble on the training split, registering the fitted trees
# one at a time in the order they appear in `clfs`.
model = Adaboost(train_X, train_y)
for weak_learner in clfs:
    model.set_rule(weak_learner)

In [8]:
# Test accuracy of the ensemble: fraction of predictions equal to the labels.
np.mean(model.evalute(test_X) == test_y)

0.69818181818181824