In [110]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier

In [111]:
cancer = load_breast_cancer()
data, target = cancer.data, cancer.target

In [112]:
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=42)

print(f"Shape of X_train: {X_train.shape}, y_train: {y_train.shape}")
print(f"Shape of X_test: {X_test.shape}, y_test: {y_test.shape}")

scale = StandardScaler()
X_train = scale.fit_transform(X_train)
X_test = scale.transform(X_test)

Shape of X_train: (455, 30), y_train: (455,)
Shape of X_test: (114, 30), y_test: (114,)


In [113]:
class AdaBoost:
    def __init__(self, M):
        self.M = M
        self.alphas = []
        self.models = []

    def fit(self, X, y):
        w = np.full(X.shape[0], 1 / X.shape[0])
        y = np.where(y == 0, -1, 1)
        for m in range(self.M):
            tree = DecisionTreeClassifier(max_depth=1)
            tree.fit(X, y, sample_weight=w)
            self.models.append(tree)
            predictions = tree.predict(X)

            miss_indexes = np.where(predictions != y)
            eps = np.sum(w[miss_indexes]) / np.sum(w)
            alpha = np.log((1 - eps) / (eps + 1e-10))
            self.alphas.append(alpha)
            w = w * np.exp(-self.alphas[-1] * y * predictions)
            w = w / np.sum(w)

    def predict(self, x):
        final = np.zeros(x.shape[0])
        for alpha, model in zip(self.alphas, self.models):
            preds = model.predict(x)
            final += alpha * preds

        return np.where(final >= 0, 1, -1)


In [114]:
scores = []
for i in range(1, 101):
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=True)
    ada = AdaBoostClassifier(n_estimators=i, random_state=0)
    score = cross_val_score(ada, X_train, y_train, cv=cv, scoring='accuracy')
    scores.append(score.mean())

best_estimator = np.argmax(scores)

In [115]:
y_test = np.where(y_test == 0, -1, 1)

In [116]:
ada = AdaBoost(best_estimator)
ada.fit(X_train, y_train)
preds = ada.predict(X_test)

print(f"Sklearn Adaboost Accuracy: {accuracy_score(y_test, preds):.3f}")
print(f"Sklearn Adaboost F1-Score: {f1_score(y_test, preds, average='weighted'):.3f}")

Sklearn Adaboost Accuracy: 0.912
Sklearn Adaboost F1-Score: 0.913


In [139]:
from sklearn.metrics import classification_report


def compare_models(model_name, data, target):
    X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=42)

    scale = StandardScaler()
    X_train_scale = scale.fit_transform(X_train)
    X_test_scale = scale.transform(X_test)

    print(f"Shape of X_train: {X_train_scale.shape}, y_train: {y_train.shape}")
    print(f"Shape of X_test: {X_test_scale.shape}, y_test: {y_test.shape}")

    adaboost = AdaBoostClassifier(n_estimators=best_estimator, random_state=0)
    adaboost.fit(X_train_scale, y_train)

    adaboost_prediction = adaboost.predict(X_test_scale)

    print(f"adaboost classification report for {model_name}")
    print(classification_report(y_test, adaboost_prediction))


    xgb_model = XGBClassifier(eval_metric='logloss', random_state=42)
    xgb_model.fit(X_train_scale, y_train)

    xgb_predictions = xgb_model.predict(X_test_scale)

    print(f"xgboost classification report for {model_name}")
    print(classification_report(y_test, xgb_predictions))

In [140]:
# implement on balance dataset
print(np.unique(cancer.target,return_counts=True))
compare_models("breast cancer", cancer.data, cancer.target)

(array([0, 1]), array([212, 357], dtype=int64))
Shape of X_train: (455, 30), y_train: (455,)
Shape of X_test: (114, 30), y_test: (114,)
adaboost classification report for breast cancer
              precision    recall  f1-score   support

           0       0.98      0.95      0.96        43
           1       0.97      0.99      0.98        71

    accuracy                           0.97       114
   macro avg       0.97      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114

xgboost classification report for breast cancer
              precision    recall  f1-score   support

           0       0.95      0.93      0.94        43
           1       0.96      0.97      0.97        71

    accuracy                           0.96       114
   macro avg       0.96      0.95      0.95       114
weighted avg       0.96      0.96      0.96       114



In [141]:
df = pd.read_csv("./assets/1.csv")
target = df["target"]
print(target.value_counts())
data = df.drop("target", axis=1)
compare_models("first dataset", data, target)

target
0    499
1    226
Name: count, dtype: int64
Shape of X_train: (580, 13), y_train: (580,)
Shape of X_test: (145, 13), y_test: (145,)
adaboost classification report for first dataset
              precision    recall  f1-score   support

           0       0.91      0.94      0.92        98
           1       0.86      0.81      0.84        47

    accuracy                           0.90       145
   macro avg       0.89      0.87      0.88       145
weighted avg       0.90      0.90      0.90       145

xgboost classification report for first dataset
              precision    recall  f1-score   support

           0       0.99      1.00      0.99        98
           1       1.00      0.98      0.99        47

    accuracy                           0.99       145
   macro avg       0.99      0.99      0.99       145
weighted avg       0.99      0.99      0.99       145



In [142]:
# implement oh imbalance data
df = pd.read_csv("./assets/2.csv")
target = df["target"]
print(target.value_counts())
data = df.drop("target", axis=1)
compare_models("first dataset", data, target)

target
0    499
1    126
Name: count, dtype: int64
Shape of X_train: (500, 13), y_train: (500,)
Shape of X_test: (125, 13), y_test: (125,)
adaboost classification report for first dataset
              precision    recall  f1-score   support

           0       0.89      0.97      0.93        96
           1       0.86      0.62      0.72        29

    accuracy                           0.89       125
   macro avg       0.88      0.79      0.82       125
weighted avg       0.89      0.89      0.88       125

xgboost classification report for first dataset
              precision    recall  f1-score   support

           0       0.94      1.00      0.97        96
           1       1.00      0.79      0.88        29

    accuracy                           0.95       125
   macro avg       0.97      0.90      0.93       125
weighted avg       0.95      0.95      0.95       125



In [143]:
# implement oh imbalance data
df = pd.read_csv("./assets/3.csv")
target = df["target"]
print(target.value_counts())
data = df.drop("target", axis=1)
compare_models("third dataset", data, target)

target
0    499
1     26
Name: count, dtype: int64
Shape of X_train: (420, 13), y_train: (420,)
Shape of X_test: (105, 13), y_test: (105,)
adaboost classification report for third dataset
              precision    recall  f1-score   support

           0       0.99      0.98      0.99       102
           1       0.50      0.67      0.57         3

    accuracy                           0.97       105
   macro avg       0.75      0.82      0.78       105
weighted avg       0.98      0.97      0.97       105

xgboost classification report for third dataset
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       102
           1       1.00      0.67      0.80         3

    accuracy                           0.99       105
   macro avg       1.00      0.83      0.90       105
weighted avg       0.99      0.99      0.99       105

