In [93]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import xgboost as xgb
import lightgbm as lgb
import catboost as cb


Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "C:\Users\gregorzadnik\AppData\Roaming\Python\Python39\site-packages\IPython\core\interactiveshell.py", line 3433, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "C:\Users\gregorzadnik\AppData\Local\Temp\ipykernel_14500\3577771154.py", line 10, in <module>
    import lightgbm as lgb
  File "c:\Python39\lib\site-packages\lightgbm\__init__.py", line 8, in <module>
    from .basic import Booster, Dataset, Sequence, register_logger
  File "c:\Python39\lib\site-packages\lightgbm\basic.py", line 21, in <module>
    from .compat import PANDAS_INSTALLED, concat, dt_DataTable, pd_CategoricalDtype, pd_DataFrame, pd_Series
  File "c:\Python39\lib\site-packages\lightgbm\compat.py", line 141, in <module>
    from dask import delayed
  File "C:\Users\gregorzadnik\AppData\Roaming\Python\Python39\site-packages\dask\__init__.py", line 8, in <module>
    from .delayed import delayed
  File "C:\Users\gregorzadnik\AppData\Roaming\Python\P

In [52]:
def prepare_data(path):
    data = pd.read_csv(path)
    data.drop(columns=data.columns[0], axis=1, inplace=True)
    data.set_index("X", inplace=True)
    data = pd.get_dummies(data,columns=['alcohol_level'], drop_first=True)
    data.index.name = None
    y = data['quality']
    X = data.drop(columns='quality')
    y = np.where(y == 'high', 1, 0)
    return X, y

In [81]:
class MyGradientBoostingClassifier:
    def __init__(self, n_estimators=100, learning_rate=0.1, max_depth=3, epsilon=0.001):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.epsilon = epsilon
        self.models = []
        self.mse = []

    def calc_probability(self, y):
        log = np.log(sum(y)/(len(y)-sum(y)))
        return np.exp(log)/(1+np.exp(log))

    def fit(self, X, y):
        # Initialize with the log(odds)
        initial_prediction = self.calc_probability(y)
        # Make initial prediction
        predictions = np.full_like(y, initial_prediction, dtype=float)

        for i in range(self.n_estimators):
            error = mean_squared_error(y, predictions)
            if len(self.mse) > 5 and np.abs(self.mse[len(self.mse)-2]-error) < self.epsilon:
                print(f"Overfitting, stopping the fit at {len(self.mse)} trees!")
                break
            self.mse.append(error)
            residuals = y - predictions
            # Fit a weak learner to the negative gradient (residuals)
            model = DecisionTreeRegressor(max_depth=self.max_depth)
            model.fit(X, residuals)
            # Make predictions with the weak learner
            weak_learner_predictions = model.predict(X)
            # Update the ensemble's predictions with a fraction of the weak learner's predictions
            predictions += self.learning_rate * weak_learner_predictions
            # Save the weak learner in the ensemble
            self.models.append(model)

    def predict(self, X):
        # For classification problems, initialize with the log(odds) or probability
        #predictions = np.full(X.shape[0], np.mean([model.tree_.value.max() for model in self.models]))
        predictions = np.full(X.shape[0], self.calc_probability([model.tree_.value.max() for model in self.models]))
        #print(predictions)
        # Accumulate predictions from each weak learner
        for model in self.models:
            weak_learner_predictions = model.predict(X)
            predictions += self.learning_rate * weak_learner_predictions
        # Convert to binary predictions for classification problems
        #print(predictions)
        binary_predictions = np.where(predictions >= 0.5, 1, 0)
        return binary_predictions

In [64]:
#Prepare the data
X, y = prepare_data("wine_quality.csv")
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [74]:
print("Trying out different learning rates")
alphas = [0.00001, 0.0001, 0.001, 0.1, 0.3, 0.5, 0.7, 0.85, 1, 10]
for alpha in alphas:
    model = MyGradientBoostingClassifier(n_estimators=100, learning_rate=alpha)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Learning rate: {alpha}, Accuracy: {accuracy}")
    #plt.plot(range(len(model.mse)), model.mse)
print("A good learning rate is 0.5")

Trying out different learning rates
Learning rate: 1e-05, Accuracy: 0.7761904761904762
Learning rate: 0.0001, Accuracy: 0.7761904761904762
Learning rate: 0.001, Accuracy: 0.7761904761904762
Learning rate: 0.1, Accuracy: 0.7925170068027211
Learning rate: 0.3, Accuracy: 0.789795918367347
Learning rate: 0.5, Accuracy: 0.8285714285714286
Learning rate: 0.7, Accuracy: 0.827891156462585
Learning rate: 0.85, Accuracy: 0.8068027210884354
Learning rate: 1, Accuracy: 0.7836734693877551
Learning rate: 10, Accuracy: 0.7761904761904762


  log = np.log(sum(y)/(len(y)-sum(y)))


In [87]:
print("Testing different numbers of trees")
alphas = [1, 20, 50, 100, 200, 400, 600, 800, 1000, 1500, 3000]
for alpha in alphas:
    model = MyGradientBoostingClassifier(n_estimators=alpha, learning_rate=0.5, epsilon=0.00001)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Number of trees: {alpha}, Accuracy: {accuracy}")
    #plt.plot(range(len(model.mse)), model.mse)

print("The model started overfitting at around 700 trees. I prevented it by implementing an epsilon. When the loss function change in two iterations is below this epsilon, the fitting stops.")

Testing different numbers of trees
Number of trees: 1, Accuracy: 0.7952380952380952
Number of trees: 20, Accuracy: 0.717687074829932
Number of trees: 50, Accuracy: 0.791156462585034
Number of trees: 100, Accuracy: 0.8285714285714286
Number of trees: 200, Accuracy: 0.8496598639455782
Number of trees: 400, Accuracy: 0.8612244897959184
Overfitting, stopping the fit at 588 trees!
Number of trees: 600, Accuracy: 0.8646258503401361
Overfitting, stopping the fit at 588 trees!
Number of trees: 800, Accuracy: 0.8659863945578231
Overfitting, stopping the fit at 588 trees!
Number of trees: 1000, Accuracy: 0.8653061224489796
Overfitting, stopping the fit at 588 trees!
Number of trees: 1500, Accuracy: 0.8659863945578231
Overfitting, stopping the fit at 588 trees!
Number of trees: 3000, Accuracy: 0.8646258503401361
The model started overfitting at around 700 trees. I prevented it by implementing an epsilon. When the loss function change in two iterations is below this epsilon, the fitting stops.


In [89]:
print("Comparing my classifier and scikit-learn's")
my_model = MyGradientBoostingClassifier(n_estimators=400, max_depth=3, learning_rate=0.5, epsilon=0.00001)
my_model.fit(X_train, y_train)
y_pred = my_model.predict(X_test)
my_accuracy = accuracy_score(y_test, y_pred)
print(f"My accuracy: {accuracy}")
sk_gb = GradientBoostingClassifier(n_estimators=400, max_depth=3, learning_rate=0.5)
sk_gb.fit(X_train, y_train)
y_pred = sk_gb.predict(X_test)
sk_acc = accuracy_score(y_test, y_pred)
print(f"SK accuracy: {sk_acc}")


Comparing my classifier and scikit-learn's
My accuracy: 0.8646258503401361
SK accuracy: 0.8625850340136054


In [46]:
print("Comparing XGBoost, LightGBM and CatBoost")
# XGBoost
xgb_classifier = xgb.XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=3)
xgb_classifier.fit(X_train, y_train)
y_pred_xgb = xgb_classifier.predict(X_test)
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
print(f"XGBoost Accuracy: {accuracy_xgb}")

# LightGBM
lgb_classifier = lgb.LGBMClassifier(n_estimators=100, learning_rate=0.1, max_depth=3)
lgb_classifier.fit(X_train, y_train)
y_pred_lgb = lgb_classifier.predict(X_test)
accuracy_lgb = accuracy_score(y_test, y_pred_lgb)
print(f"LightGBM Accuracy: {accuracy_lgb}")

# CatBoost
cb_classifier = cb.CatBoostClassifier(iterations=100, learning_rate=0.1, depth=3)
cb_classifier.fit(X_train, y_train)
y_pred_cb = cb_classifier.predict(X_test)
accuracy_cb = accuracy_score(y_test, y_pred_cb)
print(f"CatBoost Accuracy: {accuracy_cb}")

0.37405771332259113