In [1]:
# Nạp các gói thư viện cần thiết
import pandas as pd
from sklearn import tree
from sklearn.ensemble import AdaBoostClassifier 
import numpy as np
# Đọc dữ liệu iris từ UCI (https://archive.ics.uci.edu/ml/datasets/Iris) 
# # hoặc từ thư viện scikit-learn
# Tham khảo https://scikit-learn.org/stable/auto_examples/datasets/plot_iris_dataset.html
from sklearn import datasets
from sklearn.model_selection import train_test_split, cross_val_score
iris = datasets.load_iris()
columns=["Petal Length","Petal Width","Sepal Length","Sepal Width"]
X = pd.DataFrame(iris.data, columns=columns)
y = iris.target 
print(X.describe())
# Sử dụng nghi thức kiểm tra hold-out
# Chia dữ liệu ngẫu nhiên thành 2 tập dữ liệu con:
# training set và test set theo tỷ lệ 70/30
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)


       Petal Length  Petal Width  Sepal Length  Sepal Width
count    150.000000   150.000000    150.000000   150.000000
mean       5.843333     3.057333      3.758000     1.199333
std        0.828066     0.435866      1.765298     0.762238
min        4.300000     2.000000      1.000000     0.100000
25%        5.100000     2.800000      1.600000     0.300000
50%        5.800000     3.000000      4.350000     1.300000
75%        6.400000     3.300000      5.100000     1.800000
max        7.900000     4.400000      6.900000     2.500000


In [2]:
# Xây dựng boosting của 50 cây quyết định, cây có độ sâu tối đa là 3
model = AdaBoostClassifier(base_estimator=tree.DecisionTreeClassifier(max_depth=1), n_estimators=50)
model.fit(X_train, y_train)
# Dự đoán nhãn tập kiểm tra 
y_pred = model.predict(X_test)
# Đánh giá mô hình bằng chỉ số RMSE (Root Mean Squared Error)
# Tham khảo: https://en.wikipedia.org/wiki/Root-mean-square_deviation
from sklearn.metrics import mean_absolute_error, mean_squared_error
print('Gia tri Mean Absolute Error: %.3f' % mean_absolute_error(y_test, y_pred))
print('Gia tri Mean Squared Error: %.3f' % mean_squared_error(y_test, y_pred))
print('Gia tri Root Mean Squared Error: %.3f' % np.sqrt(mean_squared_error(y_test, y_pred)))

Gia tri Mean Absolute Error: 0.067
Gia tri Mean Squared Error: 0.067
Gia tri Root Mean Squared Error: 0.258


In [3]:
def load_datasets(name):
    if name == 'Iris':
        data = datasets.load_iris()
    elif name == 'Breast Cancer':
        data = datasets.load_breast_cancer()
    elif name == 'Wine':
        data = datasets.load_wine()
    elif name == 'Handwritten Digits':
        data = datasets.load_digits()
    df = pd.DataFrame(data.data)
    y = data.target
    return df, y

In [21]:
s_MAE = s_MSE = s_RMSE = 0

def scorer(model, X_test, y_test):
    global s_MAE, s_MSE, s_RMSE
    y_pred = model.predict(X_test)
    MAE = mean_absolute_error(y_test, y_pred)
    MSE = mean_squared_error(y_test, y_pred)
    RMSE = np.sqrt(MSE)
    s_MAE += MAE
    s_MSE += MSE
    s_RMSE += RMSE
    return MAE + MSE + RMSE

def score_dataset(dataset_name, model, X, y, cv=None):
    global s_MAE, s_MSE, s_RMSE
    if cv:
        s_MAE = s_MSE = s_RMSE = 0
        scores = cross_val_score(model, df, y, cv=cv, scoring=scorer)
        print('MAE = %.3f' % (s_MAE/cv))
        print('MSE = %.3f' % (s_MSE/cv))
        print('RMSE = %.3f' % (s_RMSE/cv))
    else:
        for nFold in range(2, 10+1):
            s_MAE = s_MSE = s_RMSE = 0
            scores = cross_val_score(model, df, y, cv=nFold, scoring=scorer)
            print('%s %2d-folds: MAE = %.3f, MSE = %.3f, RMSE = %.3f' % (dataset_name, nFold, s_MAE/nFold, s_MSE/nFold, s_RMSE/nFold))

In [22]:
dataset_name = 'Breast Cancer'
df, y = load_datasets(dataset_name)
model = AdaBoostClassifier(base_estimator=tree.DecisionTreeClassifier(max_depth=1), n_estimators=50)
score_dataset(dataset_name, model, df, y)

Breast Cancer  2-folds: MAE = 0.042, MSE = 0.042, RMSE = 0.201
Breast Cancer  3-folds: MAE = 0.042, MSE = 0.042, RMSE = 0.198
Breast Cancer  4-folds: MAE = 0.033, MSE = 0.033, RMSE = 0.177
Breast Cancer  5-folds: MAE = 0.030, MSE = 0.030, RMSE = 0.162
Breast Cancer  6-folds: MAE = 0.035, MSE = 0.035, RMSE = 0.186
Breast Cancer  7-folds: MAE = 0.035, MSE = 0.035, RMSE = 0.180
Breast Cancer  8-folds: MAE = 0.032, MSE = 0.032, RMSE = 0.171
Breast Cancer  9-folds: MAE = 0.039, MSE = 0.039, RMSE = 0.180
Breast Cancer 10-folds: MAE = 0.039, MSE = 0.039, RMSE = 0.179


In [23]:
dataset_name = 'Wine'
df, y = load_datasets(dataset_name)
model = AdaBoostClassifier(base_estimator=tree.DecisionTreeClassifier(max_depth=1), n_estimators=50)
score_dataset(dataset_name, model, df, y)

Wine  2-folds: MAE = 0.140, MSE = 0.152, RMSE = 0.389
Wine  3-folds: MAE = 0.157, MSE = 0.168, RMSE = 0.409
Wine  4-folds: MAE = 0.207, MSE = 0.241, RMSE = 0.468
Wine  5-folds: MAE = 0.203, MSE = 0.237, RMSE = 0.393
Wine  6-folds: MAE = 0.117, MSE = 0.117, RMSE = 0.300
Wine  7-folds: MAE = 0.167, MSE = 0.167, RMSE = 0.362
Wine  8-folds: MAE = 0.212, MSE = 0.246, RMSE = 0.416
Wine  9-folds: MAE = 0.156, MSE = 0.156, RMSE = 0.352
Wine 10-folds: MAE = 0.117, MSE = 0.117, RMSE = 0.250


In [24]:
dataset_name = 'Handwritten Digits'
df, y = load_datasets(dataset_name)
model = AdaBoostClassifier(base_estimator=tree.DecisionTreeClassifier(max_depth=1), n_estimators=50)
score_dataset(dataset_name, model, df, y)

Handwritten Digits  2-folds: MAE = 3.398, MSE = 20.189, RMSE = 4.479
Handwritten Digits  3-folds: MAE = 2.561, MSE = 12.013, RMSE = 3.435
Handwritten Digits  4-folds: MAE = 2.728, MSE = 13.583, RMSE = 3.655
Handwritten Digits  5-folds: MAE = 2.856, MSE = 14.895, RMSE = 3.835
Handwritten Digits  6-folds: MAE = 2.896, MSE = 15.198, RMSE = 3.869
Handwritten Digits  7-folds: MAE = 2.532, MSE = 11.766, RMSE = 3.385
Handwritten Digits  8-folds: MAE = 2.663, MSE = 13.164, RMSE = 3.585
Handwritten Digits  9-folds: MAE = 2.680, MSE = 13.185, RMSE = 3.582
Handwritten Digits 10-folds: MAE = 2.772, MSE = 14.070, RMSE = 3.701
