In [9]:
import numpy as np


class SimpleXGBoostRegressor:
    """Single-stump XGBoost-style regressor for one numeric feature.

    The model starts from the mean of the targets (``f0``), fits one depth-1
    tree to the residuals ``y - f0`` and predicts
    ``f0 + learning_rate * leaf_output``.

    Parameters
    ----------
    learning_rate : float
        Shrinkage applied to the leaf output at prediction time.
    max_depth : int
        Stored on the instance only; ``fit`` always builds a depth-1 tree.
    lambda_reg : float
        L2 regularisation term added to the denominator of both the
        similarity score and the leaf output.
    """

    def __init__(self, learning_rate=0.3, max_depth=1, lambda_reg=0):
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.lambda_reg = lambda_reg
        self.f0 = None
        self.tree = None

    def _calculate_similarity_score(self, residuals):
        """Return ``sum(residuals)**2 / (n + lambda_reg)`` for one node."""
        denominator = len(residuals) + self.lambda_reg
        if denominator == 0:
            # Empty node without regularisation: contributes nothing.
            return 0.0
        return (np.sum(residuals) ** 2) / denominator

    def _find_best_split(self, X, residuals):
        """Find the threshold with the highest gain.

        Candidate thresholds are the midpoints between consecutive *sorted
        unique* feature values, so the result does not depend on the order of
        the samples and duplicated values never yield an empty branch.

        Returns
        -------
        tuple
            ``(split_point, left_mask, right_mask)``, or
            ``(None, None, None)`` when fewer than two distinct feature
            values exist.
        """
        unique_values = np.unique(X)  # sorted, duplicates removed
        if unique_values.size < 2:
            return None, None, None

        candidates = (unique_values[:-1] + unique_values[1:]) / 2
        root_score = self._calculate_similarity_score(residuals)

        best_gain = -np.inf
        best_split_point = None
        best_left_indices = None
        best_right_indices = None

        for split_point in candidates:
            left_mask = X <= split_point
            right_mask = X > split_point

            # Guard against a midpoint that rounds onto a data value.
            if not left_mask.any() or not right_mask.any():
                continue

            left_score = self._calculate_similarity_score(residuals[left_mask])
            right_score = self._calculate_similarity_score(
                residuals[right_mask])
            gain = left_score + right_score - root_score

            # Strict '>' keeps the smallest threshold on ties.
            if gain > best_gain:
                best_gain = gain
                best_split_point = split_point
                best_left_indices = left_mask
                best_right_indices = right_mask

        return best_split_point, best_left_indices, best_right_indices

    def _calculate_output(self, residuals):
        """Return the leaf value ``sum(residuals) / (n + lambda_reg)``."""
        denominator = len(residuals) + self.lambda_reg
        if denominator == 0:
            return 0.0
        return np.sum(residuals) / denominator

    def fit(self, X, y):
        """Fit the initial prediction and a single depth-1 tree.

        Parameters
        ----------
        X : array-like, 1-D
            Feature values.
        y : array-like, 1-D
            Targets, same shape as ``X``.

        Returns
        -------
        SimpleXGBoostRegressor
            ``self``, to allow chaining.

        Raises
        ------
        ValueError
            If ``X`` is empty or ``X`` and ``y`` differ in shape.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if X.size == 0:
            raise ValueError("X must contain at least one sample")
        if X.shape != y.shape:
            raise ValueError("X and y must have the same shape")

        # Step 1: the initial prediction is the mean of the targets.
        self.f0 = np.mean(y)
        residuals = y - self.f0

        # Build a depth-1 tree on the residuals.
        split_point, left_indices, right_indices = self._find_best_split(
            X, residuals)

        if split_point is None:
            # No usable split: a single leaf covers every sample.
            self.tree = {"split_point": None,
                         "output": self._calculate_output(residuals)}
        else:
            self.tree = {
                "split_point": split_point,
                "left": {"output": self._calculate_output(
                    residuals[left_indices])},
                "right": {"output": self._calculate_output(
                    residuals[right_indices])},
            }

        return self

    def predict(self, X):
        """Predict ``f0 + learning_rate * leaf_output`` for each value in X.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if self.tree is None:
            raise RuntimeError("fit() must be called before predict()")

        X = np.asarray(X)
        predictions = np.full(X.shape, self.f0, dtype=float)

        if self.tree["split_point"] is None:
            # Single-leaf tree: same correction everywhere.
            predictions += self.learning_rate * self.tree["output"]
        else:
            left_mask = X <= self.tree["split_point"]
            right_mask = X > self.tree["split_point"]

            predictions[left_mask] += (
                self.learning_rate * self.tree["left"]["output"])
            predictions[right_mask] += (
                self.learning_rate * self.tree["right"]["output"])

        return predictions


# Fit the stump on the four-sample toy dataset.
X = np.array([23, 24, 26, 27])
y = np.array([50, 70, 80, 85])

# fit() returns the estimator itself, so construction and training chain.
model = SimpleXGBoostRegressor(
    learning_rate=0.3,
    max_depth=1,
    lambda_reg=0,
).fit(X, y)

# Detailed report: initial prediction, learned tree, fitted vs. actual values.
report = (
    ("Giá trị f0 (mean của Y):", model.f0),
    ("Cây quyết định:", model.tree),
    ("Dự đoán trên tập huấn luyện:", model.predict(X)),
    ("Giá trị thực tế:", y),
)
for label, value in report:
    print(label, value)

Giá trị f0 (mean của Y): 71.25
Cây quyết định: {'split_point': np.float64(23.5), 'left': {'output': np.float64(-21.25)}, 'right': {'output': np.float64(7.083333333333333)}}
Dự đoán trên tập huấn luyện: [64.875 73.375 73.375 73.375]
Giá trị thực tế: [50 70 80 85]


In [3]:
import numpy as np

class SimpleXGBoostRegressor:
    """Verbose single-stump XGBoost-style regressor for one numeric feature.

    Prints every intermediate quantity (residuals, similarity scores, gains,
    leaf outputs) while fitting one depth-1 tree on ``y - mean(y)``.

    Parameters
    ----------
    lr : float
        Learning rate applied to the leaf output at prediction time.
    reg_lambda : float
        L2 regularisation term added to the denominator of the similarity
        score and of the leaf output.
    max_depth : int
        Stored on the instance only; ``fit`` always builds a depth-1 tree.
        Added last so existing positional calls keep working.
    """

    def __init__(self, lr=0.3, reg_lambda=0, max_depth=1):
        self.lr = lr
        self.reg_lambda = reg_lambda
        self.max_depth = max_depth
        self.f0 = None
        self.tree = None

    def _calc_similarity_score(self, residuals):
        """Return ``sum(residuals)**2 / (n + reg_lambda)`` for one node."""
        denominator = len(residuals) + self.reg_lambda
        if denominator == 0:
            # Empty node without regularisation: contributes nothing.
            return 0.0
        return (np.sum(residuals) ** 2) / denominator

    @staticmethod
    def _candidate_split_points(X):
        """Midpoints between consecutive sorted unique feature values."""
        unique_values = np.unique(X)
        return (unique_values[:-1] + unique_values[1:]) / 2

    def _evaluate_specific_splits(self, X, residuals, split_points):
        """Print and score each candidate threshold; return the best one.

        Candidates that leave one branch empty are skipped.

        Returns
        -------
        dict or None
            Record with keys ``split_point``, ``left_mask``, ``right_mask``,
            ``left_score``, ``right_score`` and ``gain`` for the highest-gain
            candidate (first one on ties), or ``None`` when no candidate
            separates the data.
        """
        root_score = self._calc_similarity_score(residuals)
        print(f"Root Similarity Score: {root_score:.4f}")
        print("Đánh giá các điểm split cụ thể:")

        results = []

        for split_point in split_points:
            left_mask = X <= split_point
            right_mask = X > split_point

            if not left_mask.any() or not right_mask.any():
                continue

            left_score = self._calc_similarity_score(residuals[left_mask])
            right_score = self._calc_similarity_score(residuals[right_mask])
            gain = self._calc_gain(left_score, right_score, root_score)

            print(f"Split Point: X < {split_point}")
            print(f"  Left branch samples: {np.sum(left_mask)}")
            print(f"  Right branch samples: {np.sum(right_mask)}")
            print(f"  Left Similarity Score: {left_score:.4f}")
            print(f"  Right Similarity Score: {right_score:.4f}")
            print(f"  Gain: {gain:.4f}")

            results.append({
                "split_point": split_point,
                "left_mask": left_mask,
                "right_mask": right_mask,
                "left_score": left_score,
                "right_score": right_score,
                "gain": gain
            })

        if not results:
            return None

        # Pick the candidate with the largest gain.
        best_result = max(results, key=lambda x: x["gain"])
        print(
            f"\nBest Split Point: X < {best_result['split_point']} with Gain: {best_result['gain']:.4f}")

        return best_result

    def _calc_gain(self, left_score, right_score, root_score):
        """Gain of a split: children similarity minus parent similarity."""
        return left_score + right_score - root_score

    def _calc_output(self, residuals):
        """Return the leaf value ``sum(residuals) / (n + reg_lambda)``."""
        denominator = len(residuals) + self.reg_lambda
        if denominator == 0:
            return 0.0
        return np.sum(residuals) / denominator

    def fit(self, X, y, split_points=None):
        """Fit the initial prediction and one depth-1 tree, printing each step.

        Parameters
        ----------
        X : array-like, 1-D
            Feature values.
        y : array-like, 1-D
            Targets, same length as ``X``.
        split_points : sequence of float, optional
            Thresholds to evaluate. Defaults to the midpoints between
            consecutive sorted unique values of ``X``.

        Returns
        -------
        SimpleXGBoostRegressor
            ``self``, to allow chaining.
        """
        X = np.asarray(X)
        y = np.asarray(y)

        # Step 1: the initial prediction is the mean of the targets.
        self.f0 = np.mean(y)
        print(f"Step 1: Khởi tạo f0 = mean(Y) = {self.f0}")

        residuals = y - self.f0
        print(f"Residuals = Y - f0 = {residuals}")

        if split_points is None:
            split_points = self._candidate_split_points(X)

        # Steps 2-4: score every candidate threshold.
        print("\nStep 2-4: Đánh giá các điểm split cụ thể và tính Gain")
        best_split = self._evaluate_specific_splits(
            X, residuals, split_points)

        # Step 5: leaf outputs.
        print("\nStep 5: Tính output cho mỗi nhánh")
        if best_split is None:
            # No threshold separates the data: single-leaf tree.
            output = self._calc_output(residuals)
            print(f"Output = {output:.4f}")
            self.tree = {"split_point": None, "output": output}
            return self

        left_output = self._calc_output(residuals[best_split["left_mask"]])
        right_output = self._calc_output(residuals[best_split["right_mask"]])

        print(f"Left Output = {left_output:.4f}")
        print(f"Right Output = {right_output:.4f}")

        self.tree = {
            "split_point": best_split["split_point"],
            "left": {"output": left_output},
            "right": {"output": right_output}
        }

        return self

    def predict(self, X):
        """Predict ``f0 + lr * leaf_output`` for each value in X.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if self.tree is None:
            raise RuntimeError("fit() must be called before predict()")

        X = np.asarray(X)
        predictions = np.full(X.shape, self.f0, dtype=float)

        if self.tree["split_point"] is None:
            predictions += self.lr * self.tree["output"]
            return predictions

        left_mask = X <= self.tree["split_point"]
        right_mask = X > self.tree["split_point"]

        predictions[left_mask] += self.lr * self.tree["left"]["output"]
        predictions[right_mask] += self.lr * self.tree["right"]["output"]

        return predictions

In [7]:
X = np.array([23, 24, 26, 27])
y = np.array([50, 70, 80, 85])

print("Dataset:")
print(f"X = {X}")
print(f"y = {y}")
print("\nXGBoost Regression with λ = 0, depth = 1, lr = 0.3")
print("-" * 60)

# The verbose regressor only ever builds a depth-1 stump (as the banner above
# states), so no depth argument is passed; the previous `max_depth=10` both
# contradicted the banner and is not a parameter of the class defined above.
model = SimpleXGBoostRegressor(lr=0.3, reg_lambda=0)
model.fit(X, y)

print("Step 6: Dự đoán và đánh giá")
predictions = model.predict(X)
print(f"Dự đoán: {predictions}")
print(f"Giá trị thực tế: {y}")
print(f"Sai số (Predictions - Actual): {predictions - y}")

Dataset:
X = [23 24 26 27]
y = [50 70 80 85]

XGBoost Regression with λ = 0, depth = 1, lr = 0.3
------------------------------------------------------------
Step 1: Khởi tạo f0 = mean(Y) = 71.25
Residuals = Y - f0 = [-21.25  -1.25   8.75  13.75]

Step 2-4: Đánh giá các điểm split cụ thể và tính Gain
Root Similarity Score: 0.0000
Đánh giá các điểm split cụ thể:
Split Point: X < 23.5
  Left branch samples: 1
  Right branch samples: 3
  Left Similarity Score: 451.5625
  Right Similarity Score: 150.5208
  Gain: 602.0833
Split Point: X < 25
  Left branch samples: 2
  Right branch samples: 2
  Left Similarity Score: 253.1250
  Right Similarity Score: 253.1250
  Gain: 506.2500
Split Point: X < 26.5
  Left branch samples: 3
  Right branch samples: 1
  Left Similarity Score: 63.0208
  Right Similarity Score: 189.0625
  Gain: 252.0833

Best Split Point: X < 23.5 with Gain: 602.0833

Step 5: Tính output cho mỗi nhánh
Left Output = -21.2500
Right Output = 7.0833
Step 6: Dự đoán và đánh giá
Dự đoá

In [10]:
import numpy as np


class SimpleXGBoostClassifier:
    """Verbose single-stump XGBoost-style binary classifier (one feature).

    Starts from a prior probability of 0.5, fits one depth-1 tree to the
    residuals ``y - 0.5`` and updates the log-odds by ``lr * leaf_output``.

    Parameters
    ----------
    lr : float
        Learning rate applied to the leaf output.
    max_depth : int
        Stored on the instance only; ``fit`` always builds a depth-1 tree.
    reg_lambda : float
        L2 regularisation term added to the denominator of the similarity
        score and of the leaf output.
    """

    def __init__(self, lr=0.3, max_depth=1, reg_lambda=0):
        self.lr = lr
        self.max_depth = max_depth
        self.reg_lambda = reg_lambda
        self.initial_prob = 0.5
        self.tree = None

    def _denominator(self, probabilities):
        """Return ``sum(p * (1 - p)) + reg_lambda`` for one node."""
        return np.sum(probabilities * (1 - probabilities)) + self.reg_lambda

    def _calc_similarity_score(self, residuals, probabilities):
        """Return ``sum(residuals)**2 / (sum(p * (1 - p)) + reg_lambda)``."""
        denominator = self._denominator(probabilities)
        if denominator == 0:
            # Empty node (or p in {0, 1}) without regularisation.
            return 0.0
        return (np.sum(residuals) ** 2) / denominator

    @staticmethod
    def _candidate_split_points(X):
        """Midpoints between consecutive sorted unique feature values."""
        unique_values = np.unique(X)
        return (unique_values[:-1] + unique_values[1:]) / 2

    def _evaluate_specific_splits(self, X, residuals, probabilities, split_points):
        """Print and score each candidate threshold; return the best one.

        Candidates that leave one branch empty are skipped.

        Returns
        -------
        dict or None
            Record with keys ``split_point``, ``left_mask``, ``right_mask``,
            ``left_score``, ``right_score`` and ``gain`` for the highest-gain
            candidate (first one on ties), or ``None`` when no candidate
            separates the data.
        """
        root_score = self._calc_similarity_score(residuals, probabilities)
        print(f"Root Similarity Score: {root_score:.4f}")
        print("Đánh giá các điểm split cụ thể:")

        results = []

        for split_point in split_points:
            left_mask = X <= split_point
            right_mask = X > split_point

            if not left_mask.any() or not right_mask.any():
                continue

            left_score = self._calc_similarity_score(
                residuals[left_mask], probabilities[left_mask])
            right_score = self._calc_similarity_score(
                residuals[right_mask], probabilities[right_mask])

            gain = left_score + right_score - root_score

            print(f"Split Point: X < {split_point}")
            print(f"  Left branch samples: {np.sum(left_mask)}")
            print(f"  Right branch samples: {np.sum(right_mask)}")
            print(f"  Left Similarity Score: {left_score:.4f}")
            print(f"  Right Similarity Score: {right_score:.4f}")
            print(f"  Gain: {gain:.4f}")

            results.append({
                "split_point": split_point,
                "left_mask": left_mask,
                "right_mask": right_mask,
                "left_score": left_score,
                "right_score": right_score,
                "gain": gain
            })

        if not results:
            return None

        # Pick the candidate with the largest gain.
        best_result = max(results, key=lambda x: x["gain"])
        print(
            f"\nBest Split Point: X < {best_result['split_point']} with Gain: {best_result['gain']:.4f}")

        return best_result

    def _calc_output(self, residuals, probabilities):
        """Return the leaf value.

        ``sum(residuals) / (sum(p * (1 - p)) + reg_lambda)`` where ``p`` are
        the previous-round probabilities of the samples in the leaf.
        """
        denominator = self._denominator(probabilities)
        if denominator == 0:
            return 0.0
        return np.sum(residuals) / denominator

    def fit(self, X, y, split_points=None):
        """Fit one depth-1 tree on the residuals, printing each step.

        Parameters
        ----------
        X : array-like, 1-D
            Feature values.
        y : array-like, 1-D
            Binary labels (0/1), same length as ``X``.
        split_points : sequence of float, optional
            Thresholds to evaluate. Defaults to the midpoints between
            consecutive sorted unique values of ``X``.

        Returns
        -------
        SimpleXGBoostClassifier
            ``self``, to allow chaining.
        """
        X = np.asarray(X)
        y = np.asarray(y)

        # Step 1: every sample starts at the prior probability 0.5.
        self.initial_prob = 0.5
        print(f"Step 1: Khởi tạo xác suất ban đầu = {self.initial_prob}")

        # Initial residuals: label minus predicted probability.
        residuals = y - self.initial_prob
        print(f"Residuals = Y - initial_prob = {residuals}")

        initial_probs = np.full(len(X), self.initial_prob)

        if split_points is None:
            split_points = self._candidate_split_points(X)

        # Steps 2-4: score every candidate threshold.
        print("\nStep 2-4: Đánh giá các điểm split cụ thể và tính Gain")
        best_split = self._evaluate_specific_splits(
            X, residuals, initial_probs, split_points)

        # Step 5: leaf outputs.
        print("\nStep 5: Tính output cho mỗi nhánh")
        if best_split is None:
            # No threshold separates the data: single-leaf tree.
            output = self._calc_output(residuals, initial_probs)
            print(f"Output = {output:.4f}")
            self.tree = {"split_point": None, "output": output}
            return self

        left_mask = best_split["left_mask"]
        right_mask = best_split["right_mask"]
        left_output = self._calc_output(
            residuals[left_mask], initial_probs[left_mask])
        right_output = self._calc_output(
            residuals[right_mask], initial_probs[right_mask])

        print(f"Left Output = {left_output:.4f}")
        print(f"Right Output = {right_output:.4f}")

        self.tree = {
            "split_point": best_split["split_point"],
            "left": {"output": left_output},
            "right": {"output": right_output}
        }

        return self

    def predict_proba(self, X):
        """Return the predicted probability of class 1 for each value in X.

        The log-odds are ``log(p0 / (1 - p0)) + lr * leaf_output`` with
        ``p0 = initial_prob``; the probability is their logistic transform.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if self.tree is None:
            raise RuntimeError("fit() must be called before predict_proba()")

        X = np.asarray(X)
        prior_log_odds = np.log(self.initial_prob / (1 - self.initial_prob))

        if self.tree["split_point"] is None:
            leaf_outputs = np.full(X.shape, self.tree["output"], dtype=float)
        else:
            # Vectorised leaf lookup instead of a per-sample Python loop.
            leaf_outputs = np.where(
                X <= self.tree["split_point"],
                self.tree["left"]["output"],
                self.tree["right"]["output"],
            )

        log_predictions = prior_log_odds + self.lr * leaf_outputs
        # 1 / (1 + e^-z) equals e^z / (1 + e^z) and cannot overflow to inf/inf.
        return 1.0 / (1.0 + np.exp(-log_predictions))

    def predict(self, X):
        """Predict labels (0/1) by thresholding the probability at 0.5."""
        return (self.predict_proba(X) >= 0.5).astype(int)


# Toy classification data: one numeric feature, binary 0/1 labels.
X = np.array([23, 24, 26, 27])
y = np.array([0, 0, 1, 1])

print("Dataset:", f"X = {X}", f"y = {y}", sep="\n")
print(
    "\nXGBoost Classification with λ = 0, depth = 1, lr = 0.3",
    "-" * 60,
    sep="\n",
)

# fit() returns the estimator itself, so construction and training chain.
model = SimpleXGBoostClassifier(lr=0.3, max_depth=1, reg_lambda=0).fit(X, y)

print("\nStep 6: Dự đoán xác suất")
probabilities = model.predict_proba(X)
predictions = model.predict(X)
print(
    f"Xác suất dự đoán: {probabilities}",
    f"Nhãn dự đoán: {predictions}",
    f"Nhãn thực tế: {y}",
    sep="\n",
)

Dataset:
X = [23 24 26 27]
y = [0 0 1 1]

XGBoost Classification with λ = 0, depth = 1, lr = 0.3
------------------------------------------------------------
Step 1: Khởi tạo xác suất ban đầu = 0.5
Residuals = Y - initial_prob = [-0.5 -0.5  0.5  0.5]

Step 2-4: Đánh giá các điểm split cụ thể và tính Gain
Root Similarity Score: 0.0000
Đánh giá các điểm split cụ thể:
Split Point: X < 23.5
  Left branch samples: 1
  Right branch samples: 3
  Left Similarity Score: 1.0000
  Right Similarity Score: 0.3333
  Gain: 1.3333
Split Point: X < 25
  Left branch samples: 2
  Right branch samples: 2
  Left Similarity Score: 2.0000
  Right Similarity Score: 2.0000
  Gain: 4.0000
Split Point: X < 26.5
  Left branch samples: 3
  Right branch samples: 1
  Left Similarity Score: 0.3333
  Right Similarity Score: 1.0000
  Gain: 1.3333

Best Split Point: X < 25 with Gain: 4.0000

Step 5: Tính output cho mỗi nhánh
Left Output = -2.0000
Right Output = 2.0000

Step 6: Dự đoán xác suất
Xác suất dự đoán: [0.35434

In [11]:
# Create ./data and switch into it so both downloads land there.
# NOTE(review): presumably `mkdir` complains when `data` already exists, so
# re-running this cell is not clean — confirm on the target shell.
!mkdir data
%cd data

# Fetch the two exercise datasets by Google Drive file ID; per the recorded
# cell output they are saved as Problem3.csv and Problem4.csv.
!gdown 1xwJmYJxEia06sxUdJyGO7JFx4DNK1fbp
!gdown 1pVdH-2b_odeuEPdXbLQYDcHXxgqqBK4i

# Return to the notebook directory; later cells read from ./data/.
%cd ..


d:\AIVietNam\2024\aio-2024-hw\module-3\13_09_2024_M03EX05\data


Downloading...
From: https://drive.google.com/uc?id=1xwJmYJxEia06sxUdJyGO7JFx4DNK1fbp
To: d:\AIVietNam\2024\aio-2024-hw\module-3\13_09_2024_M03EX05\data\Problem3.csv

  0%|          | 0.00/37.4k [00:00<?, ?B/s]
100%|██████████| 37.4k/37.4k [00:00<00:00, 845kB/s]


d:\AIVietNam\2024\aio-2024-hw\module-3\13_09_2024_M03EX05


Downloading...
From: https://drive.google.com/uc?id=1pVdH-2b_odeuEPdXbLQYDcHXxgqqBK4i
To: d:\AIVietNam\2024\aio-2024-hw\module-3\13_09_2024_M03EX05\data\Problem4.csv

  0%|          | 0.00/12.3k [00:00<?, ?B/s]
100%|██████████| 12.3k/12.3k [00:00<?, ?B/s]


# XGBoost regression using the `xgboost` library

In [12]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import OrdinalEncoder

# Regression dataset fetched by the download cell.
regression_csv = './data/Problem3.csv'
data_df = pd.read_csv(regression_csv)
data_df.head()


Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
0,7,5,mar,fri,4.468204,26.2,94.3,1.808289,8.2,51,6.7,False,0.0
1,7,4,oct,tue,4.517431,35.4,669.1,2.04122,18.0,33,0.9,False,0.0
2,7,4,oct,sat,4.517431,43.7,686.9,2.04122,14.6,33,1.3,False,0.0
3,8,6,mar,fri,4.529368,33.3,77.5,2.302585,8.3,97,4.0,True,0.0
4,8,6,mar,sun,4.503137,51.3,102.2,2.360854,11.4,99,1.8,False,0.0


In [14]:
# Columns holding strings or booleans are treated as categorical.
categorical_features = data_df.select_dtypes(include=['object', 'bool']).columns.to_list()

for col_name in categorical_features:
    n_categories = data_df[col_name].nunique()
    print(f'Number of categories in {col_name}: {n_categories}')

# Replace each category with a numeric code.
ordinal_encoder = OrdinalEncoder()
encoded_categorical_cols = pd.DataFrame(
    ordinal_encoder.fit_transform(data_df[categorical_features]),
    columns=categorical_features,
    # Reuse the source index: pd.concat(axis=1) aligns on index labels, so a
    # fresh RangeIndex would misalign rows (and introduce NaNs) whenever
    # data_df is filtered, shuffled or otherwise re-indexed.
    index=data_df.index,
)

# Everything that is not categorical is kept as-is.
numerical_features = data_df.drop(columns=categorical_features)
encoded_df = pd.concat([numerical_features, encoded_categorical_cols], axis=1)
encoded_df.head()



Number of categories in month: 12
Number of categories in day: 7
Number of categories in rain: 2


Unnamed: 0,X,Y,FFMC,DMC,DC,ISI,temp,RH,wind,area,month,day,rain
0,7,5,4.468204,26.2,94.3,1.808289,8.2,51,6.7,0.0,7.0,0.0,0.0
1,7,4,4.517431,35.4,669.1,2.04122,18.0,33,0.9,0.0,10.0,5.0,0.0
2,7,4,4.517431,43.7,686.9,2.04122,14.6,33,1.3,0.0,10.0,2.0,0.0
3,8,6,4.529368,33.3,77.5,2.302585,8.3,97,4.0,0.0,7.0,0.0,1.0
4,8,6,4.503137,51.3,102.2,2.360854,11.4,99,1.8,0.0,7.0,3.0,0.0


In [15]:
# Features are every column except the regression target `area`.
X = encoded_df.drop(columns=['area'])
y = encoded_df['area']

# 70/30 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

xg_reg = xgb.XGBRegressor(
    # `random_state` is the documented seed parameter of the scikit-learn
    # wrapper; `seed` is only accepted as a pass-through booster keyword.
    random_state=42,
    learning_rate=0.01,
    n_estimators=102,
    max_depth=3,
)

xg_reg.fit(X_train, y_train)


In [16]:
# Score the fitted regressor on the held-out split.
preds = xg_reg.predict(X_test)

mse = mean_squared_error(y_true=y_test, y_pred=preds)
mae = mean_absolute_error(y_true=y_test, y_pred=preds)

print(f"MSE: {mse}", f"MAE: {mae}", sep="\n")


MSE: 1.740456362147205
MAE: 1.1184518818795126


# XGBoost classification using the `xgboost` library


In [17]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

# Classification dataset fetched by the download cell.
classification_csv = './data/Problem4.csv'
data_df = pd.read_csv(classification_csv)
data_df.head()


Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,Target
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0,0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0,0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0,0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0


In [18]:
# The last column is the label; every column before it is a feature.
X = data_df.iloc[:, :-1]
y = data_df.iloc[:, -1]

# 70/30 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [19]:
xg_class = xgb.XGBClassifier(
    # `random_state` is the documented seed parameter of the scikit-learn
    # wrapper; `seed` is only accepted as a pass-through booster keyword.
    random_state=42
)

xg_class.fit(X_train, y_train)


In [20]:
# Accuracy on the held-out split and, for comparison, on the training split.
preds = xg_class.predict(X_test)
test_acc = accuracy_score(y_test, preds)
train_acc = accuracy_score(y_train, xg_class.predict(X_train))

print(f"Train Accuracy: {train_acc}", f"Test Accuracy: {test_acc}", sep="\n")




Train Accuracy: 1.0
Test Accuracy: 0.9629629629629629
