In [1]:
import numpy as np
from sklearn.datasets import load_iris, load_wine
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [16]:
class SimpleLightGBM:
    def __init__(self, n_estimators=100, learning_rate=0.1, max_depth=3):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.estimators = []
        self.histograms = []

    def _calculate_gradient(self, y_true, y_pred):
        return y_true - y_pred
    
    def _build_histogram(self, feature_value, gradients):
        unique_values = np.unique(feature_value)
        histogram = []
        for value in unique_values:
            mask = feature_value == value
            bin_mean = np.mean(gradients[mask])
            histogram.append(bin_mean)
        return histogram
    
    def split(self, histogram):
        return np.mean(histogram)
    
    def fit(self, X, y):
        y_pred = np.zeros(len(y))
        
        for _ in range(self.n_estimators):
            gradients = self._calculate_gradient(y, y_pred)
            histograms = []
            for feature in range(X.shape[1]):
                histogram = self._build_histogram(X[:, feature], gradients)
                histograms.append(histogram)
            self.histograms.append(histogram)
            
            leaf_values = []
            for hist in histograms:
                leaf_value = self.split(hist)
                leaf_values.append(leaf_value)

            y_pred += self.learning_rate * np.sum(leaf_values, axis=0)
            self.estimators.append(leaf_values)
        
    def predict(self, X):
        y_pred = np.zeros(len(X))
        for leaf_values in self.estimators:
            y_pred += self.learning_rate * np.sum(leaf_values, axis=0)
        return y_pred

In [4]:
#Load iris dataset
iris = load_iris()
X  = iris.data
y = iris.target

In [5]:
#Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
#Initialize and train LightGBM Model
lgbm = SimpleLightGBM(n_estimators=200, learning_rate=0.1, max_depth=3)
lgbm.fit(X_train, y_train)

In [18]:
#Make prediction
y_pred = lgbm.predict(X_test)

In [19]:
#Calculate the accuracy
accuracy = accuracy_score(y_test, np.round(y_pred))
print(f"Accuracy: {accuracy}")

Accuracy: 0.3


## LGBM

In [2]:
import lightgbm as lgb

In [3]:
#Load wine dataset
wine = load_wine()
X = wine.data
y = wine.target

In [4]:
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2])

In [5]:
#Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=43)

In [6]:
#Create lgbm dataset
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test)

In [7]:
type(train_data)

lightgbm.basic.Dataset

In [11]:
#Set parameters for lightgbm
param = {
    'objective': 'multiclass',
    'num_class': 3,
    'metric' : 'multi_logloss',
    'num_leaves' : 31,
    'learning_rate': 0.1,
    'max_depth': 5
}

In [13]:
#Train the model
num_round = 100
lgb_model = lgb.train(param, train_data, num_round, valid_sets=[test_data], callbacks = [lgb.early_stopping(10)])

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000215 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 501
[LightGBM] [Info] Number of data points in the train set: 142, number of used features: 13
[LightGBM] [Info] Start training from score -1.149165
[LightGBM] [Info] Start training from score -0.912776
[LightGBM] [Info] Start training from score -1.266948
Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's multi_logloss: 0.0033833


In [16]:
#Predict on test set
y_pred = lgb_model.predict(X_test)
y_pred_class = [np.argmax(pred) for pred in y_pred]

#Calculate accuracy
accuracy = accuracy_score(y_test, y_pred_class)
print(f"Accuracy: {accuracy}")

Accuracy: 1.0


### GOSS & EFB

In [18]:
#Set parameters for lightgbm
param = {
    'objective': 'multiclass',
    'num_class': 3,
    'metric' : 'multi_logloss',
    'num_leaves' : 31,
    'learning_rate': 0.1,
    'max_depth': 5,
    'boosting_type' :'gbdt',
    'feature_fraction' : 0.8,
    'bagging_fraction' : 1.0,
    'bagging_freq':0,
    'top_rate': 0.2,
    'other_rate':0.1,
    'enable_bundle': True
}

In [20]:
#Train the model
num_round = 100
lgb_model = lgb.train(param, train_data, num_round, valid_sets=[test_data], callbacks = [lgb.early_stopping(10)])

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000227 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 501
[LightGBM] [Info] Number of data points in the train set: 142, number of used features: 13
[LightGBM] [Info] Start training from score -1.149165
[LightGBM] [Info] Start training from score -0.912776
[LightGBM] [Info] Start training from score -1.266948
Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's multi_logloss: 0.00270705


In [21]:
#Predict on test set
y_pred = lgb_model.predict(X_test)
y_pred_class = [max(enumerate(pred), key=lambda x: x[1])[0] for pred in y_pred]

#Calculate accuracy
accuracy = (y_pred_class == y_test).sum()/len(y_test)
print(f"Accuracy: {accuracy}")

Accuracy: 1.0
