In [19]:
import pandas as pd
import lightgbm as lgbm
from flaml import AutoML
from flaml.ml import sklearn_metric_loss_score

In [20]:
# Read the labelled song table and keep only the columns whose header does not
# contain "unnamed" (case-insensitive match).
df = (
    pd.read_csv("278k_song_labelled.csv")
    .loc[:, lambda frame: ~frame.columns.str.contains("unnamed", case=False)]
)
df.head()

Unnamed: 0,duration (ms),danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,spec_rate,labels
0,195000.0,0.611,0.614,-8.815,0.0672,0.0169,0.000794,0.753,0.52,128.05,3.446154e-07,2
1,194641.0,0.638,0.781,-6.848,0.0285,0.0118,0.00953,0.349,0.25,122.985,1.464234e-07,1
2,217573.0,0.56,0.81,-8.029,0.0872,0.0071,8e-06,0.241,0.247,170.044,4.00785e-07,1
3,443478.0,0.525,0.699,-4.571,0.0353,0.0178,8.8e-05,0.0888,0.199,92.011,7.959809e-08,0
4,225862.0,0.367,0.771,-5.863,0.106,0.365,1e-06,0.0965,0.163,115.917,4.693131e-07,1


In [21]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column.
df.describe()

Unnamed: 0,duration (ms),danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,spec_rate,labels
count,277938.0,277938.0,277938.0,277938.0,277938.0,277938.0,277938.0,277938.0,277938.0,277938.0,277938.0,277938.0
mean,232496.1,0.552583,0.556866,-10.363654,0.087913,0.386583,0.255044,0.189217,0.449602,119.196002,4.754654e-07,1.179101
std,117183.0,0.188905,0.279681,6.672049,0.1125,0.364504,0.373745,0.163596,0.267471,30.462256,9.190229e-07,1.021033
min,6706.0,0.0,0.0,-60.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,172013.0,0.431,0.342,-12.747,0.0359,0.0338,0.0,0.0962,0.22,95.07225,1.531461e-07,0.0
50%,213105.5,0.571,0.591,-8.397,0.0471,0.262,0.00109,0.121,0.434,119.94,2.345459e-07,1.0
75%,264866.0,0.693,0.792,-5.842,0.0822,0.754,0.645,0.227,0.665,138.86975,4.449937e-07,2.0
max,3919895.0,0.989,1.0,4.882,0.965,0.996,1.0,1.0,1.0,244.947,5.97186e-05,3.0


In [22]:
# Class balance: number of rows for each value of the `labels` column.
df.labels.value_counts()

labels
1    106429
0     82058
2     47065
3     42386
Name: count, dtype: int64

In [23]:
# Positional (unshuffled) 80/20 split: the first 80% of rows become the
# training sample, the remaining rows are held out.
sample_cnt = len(df)
train_eval_split = int(0.8 * sample_cnt)

train_sample, test_sample = df.iloc[:train_eval_split], df.iloc[train_eval_split:]

In [24]:
def split_xy(df:pd.DataFrame):
    """Separate a frame into its feature columns and its label column.

    The label column is the single column whose name starts with "labels";
    an AssertionError is raised when there is not exactly one such column.

    Returns:
        (x, y): ``x`` is the frame without the label column, ``y`` is the
        ``labels`` column as a Series.
    """
    is_label = df.columns.str.find("labels") == 0
    assert sum(is_label) == 1
    features = df.loc[:, ~is_label]
    target = df.loc[:, is_label]
    # Attribute access turns the one-column frame into a Series.
    return features, target.labels

In [25]:
# Separate features from labels for both partitions, then report their shapes.
train_x, train_y = split_xy(train_sample)
eval_x, eval_y = split_xy(test_sample)
print("training:", train_x.shape, train_y.shape)
print("eval:", eval_x.shape, eval_y.shape)

training: (222350, 11) (222350,)
eval: (55588, 11) (55588,)


In [26]:
# Baseline: a hand-configured LightGBM multiclass model.
# `verbose=1` replaces `silent=False`: the `silent` constructor flag is
# deprecated (LightGBM 3.3 asks for `verbose` to be passed as a keyword instead).
lgbm_classifier = lgbm.LGBMClassifier(
    max_depth=5,
    learning_rate=0.01,
    min_split_gain=0.1,
    min_child_samples=100,
    num_leaves=47,
    objective='multiclass',
    verbose=1,
)
# `eval_set` is documented as a list of (X, y) pairs; the held-out rows are
# only monitored here (no early stopping is configured).
lgbm_classifier.fit(train_x, train_y, eval_set=[(eval_x, eval_y)])



You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2803
[LightGBM] [Info] Number of data points in the train set: 222350, number of used features: 11
[LightGBM] [Info] Start training from score -1.213356
[LightGBM] [Info] Start training from score -0.966460
[LightGBM] [Info] Start training from score -1.738693
[LightGBM] [Info] Start training from score -1.919879
[1]	valid_0's multi_logloss: 1.29469
[2]	valid_0's multi_logloss: 1.27402
[3]	valid_0's multi_logloss: 1.25405
[4]	valid_0's multi_logloss: 1.2348
[5]	valid_0's multi_logloss: 1.21615
[6]	valid_0's multi_logloss: 1.19808
[7]	valid_0's multi_logloss: 1.18062
[8]	valid_0's multi_logloss: 1.16369
[9]	valid_0's multi_logloss: 1.14725
[10]	valid_0's multi_logloss: 1.13127
[11]	valid_0's multi_logloss: 1.11572
[12]	valid_0's multi_logloss: 1.10062
[13]	valid_0's multi_logloss: 1.08594
[14]	valid_0's multi_logloss: 1.07162
[15]	valid_0

In [27]:
# Predicted class label for every held-out row, using the baseline LightGBM model.
pred_y = lgbm_classifier.predict(eval_x)

In [11]:
# How often each class is predicted (to compare with the true counts in eval_y).
pd.Series(pred_y).value_counts()

1    24684
0    15342
3     9646
2     5916
Name: count, dtype: int64

In [12]:
# How often each class actually occurs in the held-out labels.
eval_y.value_counts()

labels
1    21841
0    15976
3     9784
2     7987
Name: count, dtype: int64

In [28]:

# sklearn_metric_loss_score("accuracy", ...) yields the error rate (1 - accuracy),
# so subtracting it from 1 gives the accuracy itself; the label now says so
# instead of calling the value a "loss".
print("default accuracy:", 1 - sklearn_metric_loss_score("accuracy", pred_y, eval_y))


default loss: 0.8935741526948262


## FLAML 

In [29]:
# First FLAML search: tune LightGBM on the training split only.
lgbm_flaml = AutoML()
settings = dict(
    time_budget=60,  # total search time, in seconds
    metric='accuracy',  # optimisation target (FLAML minimises 1 - accuracy)
    estimator_list=['lgbm'],  # restrict the search to LightGBM
    task='multiclass',  # multi-class classification
    log_file_name='spotify_song_catigory_flaml.log',  # FLAML search log
    seed=7654321,  # random seed
)
lgbm_flaml.fit(X_train=train_x, y_train=train_y, **settings)

[flaml.automl.logger: 06-29 09:09:03] {1693} INFO - task = multiclass
[flaml.automl.logger: 06-29 09:09:03] {1700} INFO - Data split method: stratified
[flaml.automl.logger: 06-29 09:09:03] {1703} INFO - Evaluation method: holdout
[flaml.automl.logger: 06-29 09:09:03] {1801} INFO - Minimizing error metric: 1-accuracy
[flaml.automl.logger: 06-29 09:09:03] {1911} INFO - List of ML learners in AutoML Run: ['lgbm']
[flaml.automl.logger: 06-29 09:09:03] {2221} INFO - iteration 0, current learner lgbm
[flaml.automl.logger: 06-29 09:09:03] {2347} INFO - Estimated sufficient time budget=5463s. Estimated necessary time budget=5s.
[flaml.automl.logger: 06-29 09:09:03] {2394} INFO -  at 0.4s,	estimator lgbm's best error=0.2242,	best estimator lgbm's best error=0.2242
[flaml.automl.logger: 06-29 09:09:03] {2221} INFO - iteration 1, current learner lgbm
[flaml.automl.logger: 06-29 09:09:03] {2394} INFO -  at 0.4s,	estimator lgbm's best error=0.2242,	best estimator lgbm's best error=0.2242
[flaml.au

In [30]:
# Second FLAML search, warm-started from the best configuration of the first run.
# `starting_points` has to be keyed by estimator name ({'lgbm': {...}}), which is
# what `best_config_per_estimator` provides. The flat `best_config` dict used
# before is keyed by hyperparameter name, so it contains no 'lgbm' entry and
# cannot act as a starting point for the search.
start_point = lgbm_flaml.best_config_per_estimator
lgbm_flaml2 = AutoML()
settings = {
    "time_budget": 60,  # total running time in seconds
    "metric": 'accuracy',  # optimisation target (FLAML minimises 1 - accuracy)
    "estimator_list": ['lgbm'],  # list of ML learners; we tune lightgbm in this example
    "task": 'multiclass',  # task type
    "log_file_name": 'spotify_song_catigory_flaml.log',  # flaml log file (same file as the first search)
    "seed": 7654321,    # random seed
}
# NOTE(review): eval_x/eval_y serve as the validation data that steers the search,
# so any score later computed on the same rows is optimistic.
lgbm_flaml2.fit(X_train=train_x, y_train=train_y, starting_points=start_point, X_val=eval_x, y_val=eval_y, **settings)

[flaml.automl.logger: 06-29 09:10:08] {1693} INFO - task = multiclass
[flaml.automl.logger: 06-29 09:10:08] {1700} INFO - Data split method: stratified
[flaml.automl.logger: 06-29 09:10:08] {1703} INFO - Evaluation method: holdout
[flaml.automl.logger: 06-29 09:10:08] {1801} INFO - Minimizing error metric: 1-accuracy
[flaml.automl.logger: 06-29 09:10:08] {1911} INFO - List of ML learners in AutoML Run: ['lgbm']
[flaml.automl.logger: 06-29 09:10:08] {2221} INFO - iteration 0, current learner lgbm
[flaml.automl.logger: 06-29 09:10:08] {2347} INFO - Estimated sufficient time budget=5914s. Estimated necessary time budget=6s.
[flaml.automl.logger: 06-29 09:10:08] {2394} INFO -  at 0.3s,	estimator lgbm's best error=0.2117,	best estimator lgbm's best error=0.2117
[flaml.automl.logger: 06-29 09:10:08] {2221} INFO - iteration 1, current learner lgbm
[flaml.automl.logger: 06-29 09:10:08] {2394} INFO -  at 0.4s,	estimator lgbm's best error=0.2117,	best estimator lgbm's best error=0.2117
[flaml.au

In [31]:
# Score the tuned FLAML model on the held-out rows. The helper returns
# 1 - accuracy, so `1 - helper(...)` is the accuracy; the label now says so
# instead of calling the value a "loss".
pred_y = lgbm_flaml2.predict(eval_x)
print("flaml accuracy:", 1 - sklearn_metric_loss_score("accuracy", pred_y, eval_y))

flaml loss: 0.957113045981147


## Bayesian optimization

In [5]:
import optuna
from optuna.trial import Trial

In [36]:
import logging
import sys 

def get_suggestion(func, param, kwargs):
    """Build a deferred sampler for one hyperparameter.

    Args:
        func: name of the trial method to invoke (e.g. "suggest_int").
        param: hyperparameter name handed to that method as first argument.
        kwargs: dict of keyword arguments forwarded to that method.

    Returns:
        A one-argument callable; given a trial it returns
        ``trial.<func>(param, **kwargs)``.
    """
    def suggest(trial):
        sampler = getattr(trial, func)
        return sampler(param, **kwargs)
    return suggest

# Search space for the Optuna study: maps each LGBMClassifier keyword to a
# callable that, given a trial, samples a value for that keyword.
# (The "reg_alpha" entry previously called the misspelled `get_suggesion`,
# which raises NameError when this dict is built.)
lgmb_param_set = {
    "num_leaves": get_suggestion("suggest_int", "num_leaves", {"low": 63, "high":1023}), 
    "learning_rate": get_suggestion("suggest_float", "learning_rate", {"low": 0.001, "high": 1, "log": True}), 
    "n_estimators": get_suggestion("suggest_int", "n_estimators", {"low": 10,  "high": 1000, "log": True}), 
    "subsample_for_bin": get_suggestion("suggest_int", "subsample_for_bin", {"low": 10000, "high": 50000, "log": True}), 
    "min_split_gain": get_suggestion("suggest_float", "min_split_gain", {"low": 0.0, "high": 0.1}), 
    "min_child_weight": get_suggestion("suggest_float","min_child_weight", {"low": .0001, "high": 0.01, "log": True}),
    "min_child_samples": get_suggestion("suggest_int", "min_child_samples", {"low": 10, "high": 100}),
    "colsample_bytree": get_suggestion("suggest_float", "colsample_bytree", {"low": 0.1, "high": 1.0}), 
    "reg_alpha": get_suggestion("suggest_float", "reg_alpha", {"low": 0.0, "high": 0.99}), 
    "reg_lambda": get_suggestion("suggest_float", "reg_lambda", {"low": 0.0, "high": 0.99}),
    "boosting_type": get_suggestion("suggest_categorical", "boosting_type", {"choices": ['gbdt', 'dart', 'goss']})
}

def load_dataset(csv_path, split_ratio=0.8):
    """Read the labelled CSV and split it positionally into train and eval parts.

    Columns whose header contains "unnamed" (case-insensitive) are dropped.
    The first ``int(split_ratio * n_rows)`` rows form the training part and
    the remaining rows the evaluation part; rows are not shuffled.

    Args:
        csv_path: path (or file-like object) handed to ``pd.read_csv``.
        split_ratio: fraction of rows assigned to the training part.

    Returns:
        ``(train_x, train_y, eval_x, eval_y)`` where the ``*_x`` values are
        feature frames and the ``*_y`` values are the ``labels`` Series.

    Raises:
        AssertionError: if there is not exactly one column whose name starts
            with "labels".
    """
    frame = pd.read_csv(csv_path)
    frame = frame.loc[:, ~frame.columns.str.contains("unnamed", case=False)]
    cut = int(split_ratio * frame.shape[0])

    # Exactly one column may start with "labels"; it is the target.
    is_label = frame.columns.str.find("labels") == 0
    assert sum(is_label) == 1
    features = frame.loc[:, ~is_label]
    target = frame.loc[:, is_label].labels

    return features.iloc[:cut, :], target.iloc[:cut], features.iloc[cut:, :], target.iloc[cut:]

# Train/eval splits keyed by CSV path, so the file is parsed once per kernel
# session instead of once per trial.
_DATASET_CACHE = {}


def _load_dataset_once(csv_path):
    """Return the train/eval split for ``csv_path``, reading the CSV only on first use."""
    if csv_path not in _DATASET_CACHE:
        _DATASET_CACHE[csv_path] = load_dataset(csv_path)
    return _DATASET_CACHE[csv_path]


def objective(trial):
    """Optuna objective: fit LightGBM with sampled hyperparameters, return its error rate.

    Args:
        trial: the Optuna trial used to sample one value per entry of
            ``lgmb_param_set``.

    Returns:
        The value of ``sklearn_metric_loss_score("accuracy", ...)`` on the
        evaluation split (lower is better).
    """
    train_x, train_y, eval_x, eval_y = _load_dataset_once("278k_song_labelled.csv")
    param_set = {name: suggest(trial) for name, suggest in lgmb_param_set.items()}

    lgbm_classifier = lgbm.LGBMClassifier(objective='multiclass', **param_set)
    lgbm_classifier.fit(train_x, train_y)
    pred_y = lgbm_classifier.predict(eval_x)
    # No trial.report()/should_prune() here: the only value available is the
    # final one, computed after the full fit. Pruning at that point saves no
    # work and would mark an already finished trial as pruned, removing it
    # from the completed trials that `study.best_params` is chosen from.
    return sklearn_metric_loss_score("accuracy", pred_y, eval_y)


# Mirror Optuna's log messages to stdout. The handler is attached only if no
# stdout handler is present yet: adding one unconditionally stacks a new
# handler on every re-run of this cell, and each message is then printed once
# per handler.
optuna_logger = optuna.logging.get_logger("optuna")
if not any(
    isinstance(handler, logging.StreamHandler) and getattr(handler, "stream", None) is sys.stdout
    for handler in optuna_logger.handlers
):
    optuna_logger.addHandler(logging.StreamHandler(sys.stdout))

# Minimise the objective (the default direction) over 5 trials.
study = optuna.create_study(pruner=optuna.pruners.HyperbandPruner())
study.optimize(objective, n_trials=5)

[I 2023-06-29 09:16:13,780] A new study created in memory with name: no-name-33fe284d-e69c-402e-b83c-b6c1376954f0


A new study created in memory with name: no-name-33fe284d-e69c-402e-b83c-b6c1376954f0
A new study created in memory with name: no-name-33fe284d-e69c-402e-b83c-b6c1376954f0
A new study created in memory with name: no-name-33fe284d-e69c-402e-b83c-b6c1376954f0
A new study created in memory with name: no-name-33fe284d-e69c-402e-b83c-b6c1376954f0
A new study created in memory with name: no-name-33fe284d-e69c-402e-b83c-b6c1376954f0
A new study created in memory with name: no-name-33fe284d-e69c-402e-b83c-b6c1376954f0
A new study created in memory with name: no-name-33fe284d-e69c-402e-b83c-b6c1376954f0
A new study created in memory with name: no-name-33fe284d-e69c-402e-b83c-b6c1376954f0
A new study created in memory with name: no-name-33fe284d-e69c-402e-b83c-b6c1376954f0
A new study created in memory with name: no-name-33fe284d-e69c-402e-b83c-b6c1376954f0


[I 2023-06-29 09:19:06,003] Trial 0 finished with value: 0.20646542419227176 and parameters: {'num_leaves': 731, 'learning_rate': 0.0012150794866611176, 'n_estimators': 353, 'subsample_for_bin': 19730, 'min_split_gain': 0.09177185212607926, 'min_child_weight': 0.001088091277809029, 'min_child_samples': 36, 'colsample_bytree': 0.378499385758178, 'reg_alpha': 0.18264034315092803, 'reg_lambda': 0.1344564735495806, 'boosting_type': 'dart'}. Best is trial 0 with value: 0.20646542419227176.


Trial 0 finished with value: 0.20646542419227176 and parameters: {'num_leaves': 731, 'learning_rate': 0.0012150794866611176, 'n_estimators': 353, 'subsample_for_bin': 19730, 'min_split_gain': 0.09177185212607926, 'min_child_weight': 0.001088091277809029, 'min_child_samples': 36, 'colsample_bytree': 0.378499385758178, 'reg_alpha': 0.18264034315092803, 'reg_lambda': 0.1344564735495806, 'boosting_type': 'dart'}. Best is trial 0 with value: 0.20646542419227176.
Trial 0 finished with value: 0.20646542419227176 and parameters: {'num_leaves': 731, 'learning_rate': 0.0012150794866611176, 'n_estimators': 353, 'subsample_for_bin': 19730, 'min_split_gain': 0.09177185212607926, 'min_child_weight': 0.001088091277809029, 'min_child_samples': 36, 'colsample_bytree': 0.378499385758178, 'reg_alpha': 0.18264034315092803, 'reg_lambda': 0.1344564735495806, 'boosting_type': 'dart'}. Best is trial 0 with value: 0.20646542419227176.
Trial 0 finished with value: 0.20646542419227176 and parameters: {'num_leave

[I 2023-06-29 09:19:47,837] Trial 1 finished with value: 0.030294308124055536 and parameters: {'num_leaves': 484, 'learning_rate': 0.05398174657392375, 'n_estimators': 245, 'subsample_for_bin': 33246, 'min_split_gain': 0.008247842164138808, 'min_child_weight': 0.00226070404942164, 'min_child_samples': 98, 'colsample_bytree': 0.6267341611507831, 'reg_alpha': 0.15822591691356322, 'reg_lambda': 0.19825096325778524, 'boosting_type': 'goss'}. Best is trial 1 with value: 0.030294308124055536.


Trial 1 finished with value: 0.030294308124055536 and parameters: {'num_leaves': 484, 'learning_rate': 0.05398174657392375, 'n_estimators': 245, 'subsample_for_bin': 33246, 'min_split_gain': 0.008247842164138808, 'min_child_weight': 0.00226070404942164, 'min_child_samples': 98, 'colsample_bytree': 0.6267341611507831, 'reg_alpha': 0.15822591691356322, 'reg_lambda': 0.19825096325778524, 'boosting_type': 'goss'}. Best is trial 1 with value: 0.030294308124055536.
Trial 1 finished with value: 0.030294308124055536 and parameters: {'num_leaves': 484, 'learning_rate': 0.05398174657392375, 'n_estimators': 245, 'subsample_for_bin': 33246, 'min_split_gain': 0.008247842164138808, 'min_child_weight': 0.00226070404942164, 'min_child_samples': 98, 'colsample_bytree': 0.6267341611507831, 'reg_alpha': 0.15822591691356322, 'reg_lambda': 0.19825096325778524, 'boosting_type': 'goss'}. Best is trial 1 with value: 0.030294308124055536.
Trial 1 finished with value: 0.030294308124055536 and parameters: {'num_

[I 2023-06-29 09:20:06,528] Trial 2 finished with value: 0.47200834712527884 and parameters: {'num_leaves': 364, 'learning_rate': 0.0015364739270441077, 'n_estimators': 125, 'subsample_for_bin': 14656, 'min_split_gain': 0.08210890778892284, 'min_child_weight': 0.001012426666302464, 'min_child_samples': 42, 'colsample_bytree': 0.27785193060276014, 'reg_alpha': 0.02120669071442607, 'reg_lambda': 0.2572639529013939, 'boosting_type': 'gbdt'}. Best is trial 1 with value: 0.030294308124055536.


Trial 2 finished with value: 0.47200834712527884 and parameters: {'num_leaves': 364, 'learning_rate': 0.0015364739270441077, 'n_estimators': 125, 'subsample_for_bin': 14656, 'min_split_gain': 0.08210890778892284, 'min_child_weight': 0.001012426666302464, 'min_child_samples': 42, 'colsample_bytree': 0.27785193060276014, 'reg_alpha': 0.02120669071442607, 'reg_lambda': 0.2572639529013939, 'boosting_type': 'gbdt'}. Best is trial 1 with value: 0.030294308124055536.
Trial 2 finished with value: 0.47200834712527884 and parameters: {'num_leaves': 364, 'learning_rate': 0.0015364739270441077, 'n_estimators': 125, 'subsample_for_bin': 14656, 'min_split_gain': 0.08210890778892284, 'min_child_weight': 0.001012426666302464, 'min_child_samples': 42, 'colsample_bytree': 0.27785193060276014, 'reg_alpha': 0.02120669071442607, 'reg_lambda': 0.2572639529013939, 'boosting_type': 'gbdt'}. Best is trial 1 with value: 0.030294308124055536.
Trial 2 finished with value: 0.47200834712527884 and parameters: {'num

[I 2023-06-29 09:21:32,228] Trial 3 finished with value: 0.030258329135784745 and parameters: {'num_leaves': 531, 'learning_rate': 0.08461439300203369, 'n_estimators': 488, 'subsample_for_bin': 24375, 'min_split_gain': 0.00793471150457945, 'min_child_weight': 0.000972639359594038, 'min_child_samples': 23, 'colsample_bytree': 0.41843497258937146, 'reg_alpha': 0.8173042783935529, 'reg_lambda': 0.352022477842215, 'boosting_type': 'gbdt'}. Best is trial 3 with value: 0.030258329135784745.


Trial 3 finished with value: 0.030258329135784745 and parameters: {'num_leaves': 531, 'learning_rate': 0.08461439300203369, 'n_estimators': 488, 'subsample_for_bin': 24375, 'min_split_gain': 0.00793471150457945, 'min_child_weight': 0.000972639359594038, 'min_child_samples': 23, 'colsample_bytree': 0.41843497258937146, 'reg_alpha': 0.8173042783935529, 'reg_lambda': 0.352022477842215, 'boosting_type': 'gbdt'}. Best is trial 3 with value: 0.030258329135784745.
Trial 3 finished with value: 0.030258329135784745 and parameters: {'num_leaves': 531, 'learning_rate': 0.08461439300203369, 'n_estimators': 488, 'subsample_for_bin': 24375, 'min_split_gain': 0.00793471150457945, 'min_child_weight': 0.000972639359594038, 'min_child_samples': 23, 'colsample_bytree': 0.41843497258937146, 'reg_alpha': 0.8173042783935529, 'reg_lambda': 0.352022477842215, 'boosting_type': 'gbdt'}. Best is trial 3 with value: 0.030258329135784745.
Trial 3 finished with value: 0.030258329135784745 and parameters: {'num_leav

[I 2023-06-29 09:24:26,943] Trial 4 finished with value: 0.054094408865222765 and parameters: {'num_leaves': 673, 'learning_rate': 0.0021334151746530508, 'n_estimators': 726, 'subsample_for_bin': 20099, 'min_split_gain': 0.025434599459215758, 'min_child_weight': 0.00017017889295294726, 'min_child_samples': 53, 'colsample_bytree': 0.8383564042010848, 'reg_alpha': 0.9033506632164597, 'reg_lambda': 0.7456918975242166, 'boosting_type': 'gbdt'}. Best is trial 3 with value: 0.030258329135784745.


Trial 4 finished with value: 0.054094408865222765 and parameters: {'num_leaves': 673, 'learning_rate': 0.0021334151746530508, 'n_estimators': 726, 'subsample_for_bin': 20099, 'min_split_gain': 0.025434599459215758, 'min_child_weight': 0.00017017889295294726, 'min_child_samples': 53, 'colsample_bytree': 0.8383564042010848, 'reg_alpha': 0.9033506632164597, 'reg_lambda': 0.7456918975242166, 'boosting_type': 'gbdt'}. Best is trial 3 with value: 0.030258329135784745.
Trial 4 finished with value: 0.054094408865222765 and parameters: {'num_leaves': 673, 'learning_rate': 0.0021334151746530508, 'n_estimators': 726, 'subsample_for_bin': 20099, 'min_split_gain': 0.025434599459215758, 'min_child_weight': 0.00017017889295294726, 'min_child_samples': 53, 'colsample_bytree': 0.8383564042010848, 'reg_alpha': 0.9033506632164597, 'reg_lambda': 0.7456918975242166, 'boosting_type': 'gbdt'}. Best is trial 3 with value: 0.030258329135784745.
Trial 4 finished with value: 0.054094408865222765 and parameters: 

In [37]:
# Best hyperparameters found by the Optuna study, and the matching accuracy
# (the objective value is 1 - accuracy, so it is subtracted from 1 here).
study.best_params, 1-study.best_value

({'num_leaves': 531,
  'learning_rate': 0.08461439300203369,
  'n_estimators': 488,
  'subsample_for_bin': 24375,
  'min_split_gain': 0.00793471150457945,
  'min_child_weight': 0.000972639359594038,
  'min_child_samples': 23,
  'colsample_bytree': 0.41843497258937146,
  'reg_alpha': 0.8173042783935529,
  'reg_lambda': 0.352022477842215,
  'boosting_type': 'gbdt'},
 0.9697416708642153)

In [35]:
# Best configuration found by the second FLAML search and 1 - its best loss
# (FLAML minimises 1 - accuracy), for comparison with the Optuna result.
lgbm_flaml2.best_config, 1-lgbm_flaml2.best_loss

({'n_estimators': 136,
  'num_leaves': 690,
  'min_child_samples': 31,
  'learning_rate': 0.7557849801382729,
  'log_max_bin': 10,
  'colsample_bytree': 1.0,
  'reg_alpha': 0.0009765625,
  'reg_lambda': 0.007756568049822362},
 0.957113045981147)

'lgbm'