In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import xgboost as xgb
import lightgbm as lgb
import catboost as cb


In [2]:
def prepare_data(path):
    data = pd.read_csv(path)
    data.drop(columns=data.columns[0], axis=1, inplace=True)
    data.set_index("X", inplace=True)
    data = pd.get_dummies(data,columns=['alcohol_level'], drop_first=True)
    data.index.name = None
    y = data['quality']
    X = data.drop(columns='quality')
    y = np.where(y == 'high', 1, 0)
    return X, y

In [3]:
class MyGradientBoostingClassifier:
    def __init__(self, n_estimators=100, learning_rate=0.1, max_depth=3, epsilon=0.001):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.epsilon = epsilon
        self.models = []
        self.mse = []

    def calc_probability(self, y):
        log = np.log(sum(y)/(len(y)-sum(y)))
        return np.exp(log)/(1+np.exp(log))

    def fit(self, X, y):
        # Initialize with the log(odds)
        initial_prediction = self.calc_probability(y)
        # Make initial prediction
        predictions = np.full_like(y, initial_prediction, dtype=float)

        for i in range(self.n_estimators):
            error = mean_squared_error(y, predictions)
            if len(self.mse) > 5 and np.abs(self.mse[len(self.mse)-2]-error) < self.epsilon:
                print(f"Overfitting, stopping the fit at {len(self.mse)} trees!")
                break
            self.mse.append(error)
            residuals = y - predictions
            # Fit a weak learner to the negative gradient (residuals)
            model = DecisionTreeRegressor(max_depth=self.max_depth)
            model.fit(X, residuals)
            # Make predictions with the weak learner
            weak_learner_predictions = model.predict(X)
            # Update the ensemble's predictions with a fraction of the weak learner's predictions
            predictions += self.learning_rate * weak_learner_predictions
            # Save the weak learner in the ensemble
            self.models.append(model)

    def predict(self, X):
        # For classification problems, initialize with the log(odds) or probability
        #predictions = np.full(X.shape[0], np.mean([model.tree_.value.max() for model in self.models]))
        predictions = np.full(X.shape[0], self.calc_probability([model.tree_.value.max() for model in self.models]))
        #print(predictions)
        # Accumulate predictions from each weak learner
        for model in self.models:
            weak_learner_predictions = model.predict(X)
            predictions += self.learning_rate * weak_learner_predictions
        # Convert to binary predictions for classification problems
        #print(predictions)
        binary_predictions = np.where(predictions >= 0.5, 1, 0)
        return binary_predictions

In [4]:
#Prepare the data
X, y = prepare_data("wine_quality.csv")
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [7]:
print("Trying out different learning rates")
alphas = [0.00001, 0.0001, 0.001, 0.1, 0.3, 0.5, 0.7, 0.85, 1, 10]
for alpha in alphas:
    model = MyGradientBoostingClassifier(n_estimators=100, learning_rate=alpha, epsilon=0.00001)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Learning rate: {alpha}, Accuracy: {accuracy}")
    #plt.plot(range(len(model.mse)), model.mse)
print("A good learning rate is 0.5")

Trying out different learning rates
Overfitting, stopping the fit at 6 trees!
Learning rate: 1e-05, Accuracy: 0.7761904761904762
Learning rate: 0.0001, Accuracy: 0.7761904761904762
Learning rate: 0.001, Accuracy: 0.7761904761904762
Learning rate: 0.1, Accuracy: 0.7925170068027211
Learning rate: 0.3, Accuracy: 0.7870748299319728
Learning rate: 0.5, Accuracy: 0.827891156462585
Learning rate: 0.7, Accuracy: 0.8258503401360544
Learning rate: 0.85, Accuracy: 0.8054421768707483
Learning rate: 1, Accuracy: 0.7843537414965986
Learning rate: 10, Accuracy: 0.7761904761904762
A good learning rate is 0.5


  log = np.log(sum(y)/(len(y)-sum(y)))


In [9]:
print("Testing different numbers of trees")
alphas = [1, 20, 50, 100, 200, 400, 600, 800, 1000, 1500, 3000]
for alpha in alphas:
    model = MyGradientBoostingClassifier(n_estimators=alpha, learning_rate=0.5, epsilon=0.00001)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Number of trees: {alpha}, Accuracy: {accuracy}")
    #plt.plot(range(len(model.mse)), model.mse)

print("The model started overfitting at around 700 trees. I prevented it by implementing an epsilon. if the loss function change in two iterations is below this epsilon, the fitting stops.")

Testing different numbers of trees
Number of trees: 1, Accuracy: 0.7952380952380952
Number of trees: 20, Accuracy: 0.7183673469387755
Number of trees: 50, Accuracy: 0.791156462585034
Number of trees: 100, Accuracy: 0.827891156462585
Number of trees: 200, Accuracy: 0.8517006802721089
Number of trees: 400, Accuracy: 0.8612244897959184
Overfitting, stopping the fit at 588 trees!
Number of trees: 600, Accuracy: 0.8646258503401361
Overfitting, stopping the fit at 588 trees!
Number of trees: 800, Accuracy: 0.863265306122449
Overfitting, stopping the fit at 588 trees!
Number of trees: 1000, Accuracy: 0.863265306122449
Overfitting, stopping the fit at 588 trees!
Number of trees: 1500, Accuracy: 0.8659863945578231
Overfitting, stopping the fit at 588 trees!
Number of trees: 3000, Accuracy: 0.8659863945578231
The model started overfitting at around 700 trees. I prevented it by implementing an epsilon. if the loss function change in two iterations is below this epsilon, the fitting stops.


In [10]:
print("Comparing my classifier and scikit-learn's")
my_model = MyGradientBoostingClassifier(n_estimators=400, max_depth=3, learning_rate=0.5, epsilon=0.00001)
my_model.fit(X_train, y_train)
y_pred = my_model.predict(X_test)
my_accuracy = accuracy_score(y_test, y_pred)
print(f"My accuracy: {accuracy}")
sk_gb = GradientBoostingClassifier(n_estimators=400, max_depth=3, learning_rate=0.5)
sk_gb.fit(X_train, y_train)
y_pred = sk_gb.predict(X_test)
sk_acc = accuracy_score(y_test, y_pred)
print(f"SK accuracy: {sk_acc}")


Comparing my classifier and scikit-learn's
My accuracy: 0.8659863945578231
SK accuracy: 0.863265306122449


In [18]:
print("Comparing XGBoost, LightGBM and CatBoost")
# XGBoost
xgb_classifier = xgb.XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=3)
xgb_classifier.fit(X_train, y_train)
y_pred_xgb = xgb_classifier.predict(X_test)
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)

# LightGBM
params = {'verbose': -1}
lgb_classifier = lgb.LGBMClassifier(n_estimators=100, learning_rate=0.1, max_depth=3)
lgb_classifier.fit(X_train, y_train)
y_pred_lgb = lgb_classifier.predict(X_test)
accuracy_lgb = accuracy_score(y_test, y_pred_lgb)

# CatBoost
cb_classifier = cb.CatBoostClassifier(iterations=100, learning_rate=0.1, depth=3)
cb_classifier.fit(X_train, y_train)
y_pred_cb = cb_classifier.predict(X_test)
accuracy_cb = accuracy_score(y_test, y_pred_cb)

print(f"My accuracy: {accuracy}, SK accuracy: {sk_acc}")
print(f"XGBoost Accuracy: {accuracy_xgb}")
print(f"LightGBM Accuracy: {accuracy_lgb}")
print(f"CatBoost Accuracy: {accuracy_cb}")

Comparing XGBoost, LightGBM and CatBoost
[LightGBM] [Info] Number of positive: 731, number of negative: 2697
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000131 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1230
[LightGBM] [Info] Number of data points in the train set: 3428, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.213244 -> initscore=-1.305482
[LightGBM] [Info] Start training from score -1.305482
0:	learn: 0.6559831	total: 1.53ms	remaining: 152ms
1:	learn: 0.6259296	total: 2.91ms	remaining: 143ms
2:	learn: 0.6013461	total: 4.21ms	remaining: 136ms
3:	learn: 0.5791206	total: 5.49ms	remaining: 132ms
4:	learn: 0.5595701	total: 6.83ms	remaining: 130ms
5:	learn: 0.5429802	total: 8.16ms	remaining: 128ms
6:	learn: 0.5289734	total: 9.45ms	remaining: 126ms
7:	learn: 0.5167539	total: 10.8ms	remaining: 124ms
8:	learn: 0.5061933	total: 12.1ms	remaining: 123ms
9:	learn: 