<a href="https://colab.research.google.com/github/duonghiepit/AdaBoost_Classifier/blob/main/AdaBoost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# Download a file from Google Drive by id with gdown; per the cell output it is saved as /content/advertising.csv.
# NOTE(review): no later cell in this notebook appears to read advertising.csv - confirm this download is still needed.
!gdown 1vXWqMxf0YQ31IGmMZpMOiuPNtgM17BZV

Downloading...
From: https://drive.google.com/uc?id=1vXWqMxf0YQ31IGmMZpMOiuPNtgM17BZV
To: /content/advertising.csv
  0% 0.00/4.06k [00:00<?, ?B/s]100% 4.06k/4.06k [00:00<00:00, 13.3MB/s]


In [4]:
# Download the Spambase archive from Google Drive by id with gdown; per the cell output it is saved as /content/spambase.zip.
!gdown 1WMX6Tr69cqwLuRUbdF9Va38bufkzLyMQ

Downloading...
From: https://drive.google.com/uc?id=1WMX6Tr69cqwLuRUbdF9Va38bufkzLyMQ
To: /content/spambase.zip
  0% 0.00/126k [00:00<?, ?B/s]100% 126k/126k [00:00<00:00, 102MB/s]


In [5]:
!unzip /content/spambase.zip

Archive:  /content/spambase.zip
  inflating: spambase.DOCUMENTATION  
  inflating: spambase.data           
  inflating: spambase.names          


In [25]:
# Imports
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier

## Helper functions

In [26]:
def compute_error(y, y_pred, w_i):
    '''
    Calculate the weighted error rate of a weak classifier m.

    Arguments:
    y: actual target values
    y_pred: values predicted by the weak classifier
    w_i: individual weight of each observation

    Returns:
    The summed weight of the misclassified observations divided by the
    total weight.

    Note that all arrays should be the same length. Inputs are compared
    strictly by position: pandas objects are converted to plain arrays
    first, so differing index labels never influence the result.
    '''

    y = np.asarray(y)
    y_pred = np.asarray(y_pred)
    w_i = np.asarray(w_i, dtype = float)

    # Boolean mask, True where the weak classifier got the label wrong
    misclassified = np.not_equal(y, y_pred)

    # Vectorised reductions instead of the Python-level built-in sum()
    return np.sum(w_i * misclassified) / np.sum(w_i)

In [27]:
def compute_alpla(error, epsilon = 0.01):
    '''
    Calculate the weight of a weak classifier m in the majority vote of the final classifier
    (its "amount of say"). This is called alpha in chapter 10.1 of The Elements of
    Statistical Learning.

    Arguments:
    error: error rate from weak classifier m
    epsilon: smoothing constant added to both the numerator and the denominator of the
             ratio, which keeps the result finite when error is exactly 0 or 1.
             Default is 0.01. Pass 0 for the unsmoothed log((1 - error) / error).

    Returns:
    log((1 - error + epsilon) / (error + epsilon))

    The spelling of the function name is kept as-is because other code calls it by this name.
    '''

    return np.log((1 - error + epsilon) / (error + epsilon))

In [28]:
def update_weights_formular1(w_i, alpha, y, y_pred):
    '''
    Re-weight the observations after a boosting round (product form).

    Every weight is multiplied by exp(-alpha * y * y_pred) and the outcome is
    divided by its total so that the returned weights sum to 1.

    Arguments:
    w_i: current weight of each observation
    alpha: vote weight of the weak classifier that was just fitted
    y: actual target values
    y_pred: values predicted by the weak classifier
    '''
    exponent = -alpha * y * y_pred
    unnormalised = w_i * np.exp(exponent)

    return unnormalised / np.sum(unnormalised)

def update_weights_formular2(w_i, alpha, y, y_pred):
    '''
    Re-weight the observations after a boosting round (indicator form).

    The weight of every misclassified observation (y != y_pred) is multiplied
    by exp(alpha); the other weights are multiplied by exp(0) = 1. The outcome
    is divided by its total so that the returned weights sum to 1.

    Arguments:
    w_i: current weight of each observation
    alpha: vote weight of the weak classifier that was just fitted
    y: actual target values
    y_pred: values predicted by the weak classifier
    '''
    missed = np.not_equal(y, y_pred).astype(int)  # 1 where wrong, 0 where right
    unnormalised = w_i * np.exp(alpha * missed)

    return unnormalised / np.sum(unnormalised)

## Define AdaBoost class

In [29]:
class MyAdaBoost:
    '''
    AdaBoost for binary classification built from decision stumps.

    The final prediction is the sign of the alpha-weighted sum of the weak
    classifiers' predictions, so the target labels are expected to be encoded
    as -1 / +1.

    Attributes:
    alphas: vote weight (alpha) of each weak classifier, one entry per boosting round
    G_M: fitted weak classifiers, one entry per boosting round
    M: number of boosting rounds of the last fit call (None before fitting)
    training_errors: weighted training error of each weak classifier
    prediction_errors: unweighted error of each weak classifier on the data
                       last passed to error_rates
    '''

    def __init__(self):
        self.alphas = []
        self.G_M = []
        self.M = None
        self.training_errors = []
        self.prediction_errors = []

    def _check_is_fitted(self):
        '''Raise a descriptive error when the model is used before fit was called.'''
        if self.M is None:
            raise ValueError('This MyAdaBoost instance is not fitted yet. Call fit() first.')

    def fit(self, X, y, M = 100):
        '''
        Fit model. Arguments:
        X: independent variables
        y: target variable
        M: number of boosting rounds. Default is 100

        Any state left over from a previous fit call is discarded first.
        '''

        # Reset ALL fitted state. The list of weak classifiers must be cleared
        # together with the alphas, otherwise a second fit would keep the stale
        # stumps of the first one and the two lists would get out of step.
        self.alphas = []
        self.G_M = []
        self.training_errors = []
        self.M = M

        # Work on a plain array so every weight/label operation is positional
        y = np.asarray(y)

        # Before the first round all weights are the same and equal to 1 / N
        w_i = np.ones(len(y)) / len(y)

        # Iterate over M weak classifiers
        for _ in range(M):

            # (a) Fit weak classifier and predict labels
            G_m = DecisionTreeClassifier(max_depth = 1) # Stump: Two terminal-node classification tree
            G_m.fit(X, y, sample_weight = w_i)
            y_pred = G_m.predict(X)
            self.G_M.append(G_m) # Save to list of weak classifiers

            # (b) Compute the weighted error of this round
            error_m = compute_error(y, y_pred, w_i)
            self.training_errors.append(error_m)

            # (c) Compute alpha, the vote weight of this weak classifier
            alpha_m = compute_alpla(error_m)
            self.alphas.append(alpha_m)

            # (d) Increase the weight of the misclassified observations for the next round
            w_i = update_weights_formular2(w_i, alpha_m, y, y_pred)

    def predict(self, X):
        '''
        Predict using fitted model. Arguments:
        X: independent variables

        Returns a pandas Series of integer labels indexed 0..len(X)-1: the sign
        of the alpha-weighted sum of the weak predictions (0 if the votes tie exactly).
        '''

        self._check_is_fitted()

        # Accumulate the alpha-weighted vote of every weak classifier
        scores = np.zeros(len(X))
        for G_m, alpha_m in zip(self.G_M, self.alphas):
            scores += alpha_m * np.asarray(G_m.predict(X), dtype = float)

        # Final prediction is the sign of the weighted vote
        return pd.Series(np.sign(scores).astype(int))

    def error_rates(self, X, y):
        '''
        Get the error rates of each weak classifier. Arguments:
        X: independent variables
        y: target variables associated to X

        The unweighted error rate of every weak classifier is stored in
        self.prediction_errors, replacing any previous content.
        '''

        self._check_is_fitted()

        # Every observation counts the same when scoring the individual stumps
        uniform_weights = np.ones(len(y))

        self.prediction_errors = [
            compute_error(y = y, y_pred = G_m.predict(X), w_i = uniform_weights)
            for G_m in self.G_M
        ]

In [30]:
# Worked example of the normalisation step used by the weight updates:
# dividing every un-normalised weight by the total makes the weights sum to 1
result = [0.07, 0.07, 0.07, 0.22, 0.07, 0.07, 0.07, 0.22]
w_norm = np.divide(result, np.sum(result))
print(w_norm)

[0.08139535 0.08139535 0.08139535 0.25581395 0.08139535 0.08139535
 0.08139535 0.25581395]


In [31]:
# Imports
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier

In [32]:
# Synthetic dataset: 1000 samples with 20 features, generated with a fixed seed
X, y = make_classification(n_samples = 1000, n_features = 20, random_state = 42)
# Original AdaBoost uses {1, -1} as class labels: this maps 0 -> -1 and 1 -> 1
y = 2 * y - 1

# Hold out 20% of the samples as a test set
# NOTE(review): X_train / X_test / y_train / y_test are reassigned from the spam
# data in a later cell before any model is fitted, so this split looks unused.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [33]:
# Load the spam dataset and give its columns readable names

# Column names: the text before ':' on each row of spambase.names, with the
# first 33 lines of that file skipped; 'Spam' is appended as the last column name
names = pd.read_csv('/content/spambase.names', sep = ':', skiprows = range(0, 33), header = None)
col_names = [*names[0], 'Spam']

# The data file is read without a header row, then labelled with the names above
df = pd.read_csv('/content/spambase.data', header = None)
df.columns = col_names
df.head()

Unnamed: 0,word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,...,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total,Spam
0,0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.778,0.0,0.0,3.756,61,278,1
1,0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,...,0.0,0.132,0.0,0.372,0.18,0.048,5.114,101,1028,1
2,0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.01,0.143,0.0,0.276,0.184,0.01,9.821,485,2259,1
3,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.137,0.0,0.137,0.0,0.0,3.537,40,191,1
4,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.135,0.0,0.135,0.0,0.0,3.537,40,191,1


In [34]:
# Re-encode the label column to {-1, 1}, the encoding the AdaBoost implementation works with.
# np.where keeps this cell idempotent: an arithmetic `* 2 - 1` applied in place would turn
# the labels into {-3, 1} if the cell were executed a second time.
df['Spam'] = np.where(df['Spam'] > 0, 1, -1)

# 80/20 train/test split with a fixed seed; the features are passed as a plain array
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=['Spam']).values, df['Spam'], test_size=0.2, random_state=42)

In [36]:
# Fit the from-scratch AdaBoost model with 50 boosting rounds on the training split
ab = MyAdaBoost()
ab.fit(X_train, y_train, M=50)

# Predict on the test set and report the accuracy rounded to 4 decimals
y_pred = ab.predict(X_test)
print('The accuracy_score of the model is: ', round(accuracy_score(y_test, y_pred), 4))

The accuracy_score of the model is:  0.9349


## Using the scikit-learn implementation of AdaBoost

In [37]:
# NOTE(review): AdaBoostClassifier is already imported in the imports cell earlier in the notebook, so this repeated import is redundant.
from sklearn.ensemble import AdaBoostClassifier

# Reference model: scikit-learn's AdaBoostClassifier with 50 estimators, fitted and
# scored on the same X_train / y_train / X_test / y_test variables as the cell above.
# NOTE(review): no random_state is passed - presumably the score can vary between runs; confirm.
ab_sk = AdaBoostClassifier(n_estimators=50)
ab_sk.fit(X_train, y_train)
y_pred_sk = ab_sk.predict(X_test)
print('The accuracy_score of the model is:', round(accuracy_score(y_test, y_pred_sk), 4))

The accuracy_score of the model is: 0.9359
