# Lightgbm classification and regression

https://machinelearningmastery.com/light-gradient-boosted-machine-lightgbm-ensemble/

## Setup libraries

In [24]:
import pandas as pd
import seaborn as sns

In [25]:
import numpy as np
from sklearn.datasets import make_classification
from sklearn.datasets import make_regression
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score

In [26]:
import lightgbm
from lightgbm import LGBMClassifier
from lightgbm import LGBMRegressor

In [27]:
import matplotlib.pyplot as plt

In [28]:
lightgbm.__version__

'3.3.1'

## Reusable functions

In [29]:
def get_accuracy(n_scores):
    return 'Accuracy: {:.3f} Stdev: {:.3f}'.format(
        np.mean(n_scores), np.std(n_scores))

## Get datasets

Get classification dataset

In [30]:
def get_clf_dataset():
    X, y = make_classification(
        n_samples=1000, n_features=20, n_informative=15, n_redundant=5,
        random_state=1)
    return X, y

Get regresson dataset

In [31]:
def get_reg_dataset():
    X, y = make_regression(
        n_samples=1000, n_features=20, n_informative=15, noise=0.1,
        random_state=7)
    return X, y

## LightGBM for classification

Size of classification data

In [32]:
X, y = get_clf_dataset()
print(X.shape, y.shape)

(1000, 20) (1000,)


Values of the target variable

In [33]:
pd.Series(y).value_counts()

0    501
1    499
dtype: int64

Train model

In [34]:
model = LGBMClassifier()
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3)
n_scores = cross_val_score(
    model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)

Display results

In [35]:
print("Accuracy: {:.3f} ({:.3f})".format(
    np.mean(n_scores), np.std(n_scores)))

Accuracy: 0.929 (0.026)


## LightGBM for regression

Size of regression data

In [36]:
X, y = get_reg_dataset()
print(X.shape, y.shape)

(1000, 20) (1000,)


Description of the target variable

In [38]:
pd.Series(y).describe()

count    1000.000000
mean        3.477325
std       220.375737
min      -663.293131
25%      -143.686509
50%         4.344965
75%       160.826633
max       661.461884
dtype: float64

Train model

In [39]:
model = LGBMRegressor()
cv = RepeatedKFold(n_splits=10, n_repeats=3)
n_scores = cross_val_score(
    model, X, y, scoring='neg_mean_absolute_error', cv=cv,
    n_jobs=-1, error_score='raise')

Display results

In [41]:
print("Mean absolute error (MAE): {:.3f} ({:.3f})".format(
    np.mean(n_scores), np.std(n_scores)))

Mean absolute error (MAE): -59.897 (4.350)
