In [3]:
import pandas as pd
from flaml import AutoML
from flaml.ml import sklearn_metric_loss_score

In [16]:
df = pd.read_csv("278k_song_labelled.csv")
df = df.loc[:, ~df.columns.str.contains("unnamed", case=False)]
df.head()

Unnamed: 0,duration (ms),danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,spec_rate,labels
0,195000.0,0.611,0.614,-8.815,0.0672,0.0169,0.000794,0.753,0.52,128.05,3.446154e-07,2
1,194641.0,0.638,0.781,-6.848,0.0285,0.0118,0.00953,0.349,0.25,122.985,1.464234e-07,1
2,217573.0,0.56,0.81,-8.029,0.0872,0.0071,8e-06,0.241,0.247,170.044,4.00785e-07,1
3,443478.0,0.525,0.699,-4.571,0.0353,0.0178,8.8e-05,0.0888,0.199,92.011,7.959809e-08,0
4,225862.0,0.367,0.771,-5.863,0.106,0.365,1e-06,0.0965,0.163,115.917,4.693131e-07,1


In [18]:
df.describe()

Unnamed: 0,duration (ms),danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,spec_rate,labels
count,277938.0,277938.0,277938.0,277938.0,277938.0,277938.0,277938.0,277938.0,277938.0,277938.0,277938.0,277938.0
mean,232496.1,0.552583,0.556866,-10.363654,0.087913,0.386583,0.255044,0.189217,0.449602,119.196002,4.754654e-07,1.179101
std,117183.0,0.188905,0.279681,6.672049,0.1125,0.364504,0.373745,0.163596,0.267471,30.462256,9.190229e-07,1.021033
min,6706.0,0.0,0.0,-60.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,172013.0,0.431,0.342,-12.747,0.0359,0.0338,0.0,0.0962,0.22,95.07225,1.531461e-07,0.0
50%,213105.5,0.571,0.591,-8.397,0.0471,0.262,0.00109,0.121,0.434,119.94,2.345459e-07,1.0
75%,264866.0,0.693,0.792,-5.842,0.0822,0.754,0.645,0.227,0.665,138.86975,4.449937e-07,2.0
max,3919895.0,0.989,1.0,4.882,0.965,0.996,1.0,1.0,1.0,244.947,5.97186e-05,3.0


In [19]:
df.labels.value_counts()

labels
1    106429
0     82058
2     47065
3     42386
Name: count, dtype: int64

In [21]:
import lightgbm as lgbm

In [23]:
sample_cnt = df.shape[0]
train_eval_split = int(0.8 * sample_cnt)

train_sample = df.iloc[:train_eval_split, :]
test_sample = df.iloc[train_eval_split:, :]

In [42]:
def split_xy(df:pd.DataFrame):
    label_col_flags = df.columns.str.find("labels") == 0
    assert sum(label_col_flags) == 1
    x = df.loc[:, ~label_col_flags]
    y = df.loc[:, label_col_flags]
    return x, y.labels

In [43]:
train_x, train_y = split_xy(train_sample)
print("training:", train_x.shape, train_y.shape)
eval_x, eval_y = split_xy(test_sample)
print("eval:", eval_x.shape, eval_y.shape)

training: (222350, 11) (222350,)
eval: (55588, 11) (55588,)


In [48]:
lgbm_classifier = lgbm.LGBMClassifier(max_depth=5, learning_rate=0.01, min_split_gain=0.1, min_child_samples=100, num_leaves=47, objective='multiclass', silent=False)
lgbm_classifier.fit(train_x, train_y, eval_set=(eval_x, eval_y))



You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2803
[LightGBM] [Info] Number of data points in the train set: 222350, number of used features: 11
[LightGBM] [Info] Start training from score -1.213356
[LightGBM] [Info] Start training from score -0.966460
[LightGBM] [Info] Start training from score -1.738693
[LightGBM] [Info] Start training from score -1.919879
[1]	valid_0's multi_logloss: 1.29469
[2]	valid_0's multi_logloss: 1.27402
[3]	valid_0's multi_logloss: 1.25405
[4]	valid_0's multi_logloss: 1.2348
[5]	valid_0's multi_logloss: 1.21615
[6]	valid_0's multi_logloss: 1.19808
[7]	valid_0's multi_logloss: 1.18062
[8]	valid_0's multi_logloss: 1.16369
[9]	valid_0's multi_logloss: 1.14725
[10]	valid_0's multi_logloss: 1.13127
[11]	valid_0's multi_logloss: 1.11572
[12]	valid_0's multi_logloss: 1.10062
[13]	valid_0's multi_logloss: 1.08594
[14]	valid_0's multi_logloss: 1.07162
[15]	valid_0

In [65]:
pred_y = lgbm_classifier.predict(eval_x)

In [52]:
pd.Series(pred_y).value_counts()

1    24684
0    15342
3     9646
2     5916
Name: count, dtype: int64

In [53]:
eval_y.value_counts()

labels
1    21841
0    15976
3     9784
2     7987
Name: count, dtype: int64

In [66]:

print("default loss:", 1-sklearn_metric_loss_score("accuracy", pred_y, eval_y))


default loss: 0.8935741526948262


## FLAML 

In [57]:
lgbm_flaml = AutoML()
settings = {
    "time_budget": 60,  # total running time in seconds
    "metric": 'accuracy',  # primary metrics for regression can be chosen from: ['mae','mse','r2','rmse','mape']
    "estimator_list": ['lgbm'],  # list of ML learners; we tune lightgbm in this example
    "task": 'multiclass',  # task type    
    "log_file_name": 'spotify_song_catigory_flaml.log',  # flaml log file
    "seed": 7654321,    # random seed
}
lgbm_flaml.fit(X_train=train_x, y_train=train_y, **settings)

[flaml.automl.logger: 06-28 22:27:42] {1693} INFO - task = multiclass
[flaml.automl.logger: 06-28 22:27:42] {1700} INFO - Data split method: stratified
[flaml.automl.logger: 06-28 22:27:42] {1703} INFO - Evaluation method: holdout
[flaml.automl.logger: 06-28 22:27:43] {1801} INFO - Minimizing error metric: 1-accuracy
[flaml.automl.logger: 06-28 22:27:43] {1911} INFO - List of ML learners in AutoML Run: ['lgbm']
[flaml.automl.logger: 06-28 22:27:43] {2221} INFO - iteration 0, current learner lgbm
[flaml.automl.logger: 06-28 22:27:43] {2347} INFO - Estimated sufficient time budget=6222s. Estimated necessary time budget=6s.
[flaml.automl.logger: 06-28 22:27:43] {2394} INFO -  at 0.5s,	estimator lgbm's best error=0.2242,	best estimator lgbm's best error=0.2242
[flaml.automl.logger: 06-28 22:27:43] {2221} INFO - iteration 1, current learner lgbm
[flaml.automl.logger: 06-28 22:27:43] {2394} INFO -  at 0.5s,	estimator lgbm's best error=0.2242,	best estimator lgbm's best error=0.2242
[flaml.au

In [61]:
start_point = lgbm_flaml.best_config
lgbm_flaml2 = AutoML()
settings = {
    "time_budget": 60,  # total running time in seconds
    "metric": 'accuracy',  # primary metrics for regression can be chosen from: ['mae','mse','r2','rmse','mape']
    "estimator_list": ['lgbm'],  # list of ML learners; we tune lightgbm in this example
    "task": 'multiclass',  # task type    
    "log_file_name": 'spotify_song_catigory_flaml.log',  # flaml log file
    "seed": 7654321,    # random seed
}
lgbm_flaml2.fit(X_train=train_x, y_train=train_y, starting_points=start_point, X_val=eval_x, y_val=eval_y, **settings)

[flaml.automl.logger: 06-28 22:31:10] {1693} INFO - task = multiclass
[flaml.automl.logger: 06-28 22:31:10] {1700} INFO - Data split method: stratified
[flaml.automl.logger: 06-28 22:31:10] {1703} INFO - Evaluation method: holdout
[flaml.automl.logger: 06-28 22:31:10] {1801} INFO - Minimizing error metric: 1-accuracy
[flaml.automl.logger: 06-28 22:31:10] {1911} INFO - List of ML learners in AutoML Run: ['lgbm']
[flaml.automl.logger: 06-28 22:31:10] {2221} INFO - iteration 0, current learner lgbm
[flaml.automl.logger: 06-28 22:31:10] {2347} INFO - Estimated sufficient time budget=7108s. Estimated necessary time budget=7s.
[flaml.automl.logger: 06-28 22:31:10] {2394} INFO -  at 0.4s,	estimator lgbm's best error=0.2117,	best estimator lgbm's best error=0.2117
[flaml.automl.logger: 06-28 22:31:10] {2221} INFO - iteration 1, current learner lgbm
[flaml.automl.logger: 06-28 22:31:10] {2394} INFO -  at 0.4s,	estimator lgbm's best error=0.2117,	best estimator lgbm's best error=0.2117
[flaml.au

In [67]:
pred_y = lgbm_flaml2.predict(eval_x)
print("flaml loss:", 1-sklearn_metric_loss_score("accuracy", pred_y, eval_y))

flaml loss: 0.9696337338994028


## Bayesian optimization

In [69]:
import optuna.integration.lightgbm as lgb
dtrain = lgb.Dataset(train_x, label=train_y)
dval = lgb.Dataset(eval_x, label=eval_y)
params = {
    "objective": "multiclass",
    "metric": "logloss",
    "verbosity": -1,
}

In [70]:
model = lgb.train(params, dtrain, valid_sets=[dval], verbose_eval=10000)        

[I 2023-06-28 22:42:10,323] A new study created in memory with name: no-name-8bc3c9ea-1943-400a-bc50-bd5c99ca4068
[LightGBM] [Fatal] Number of classes should be specified and greater than 1 for multiclass training
[W 2023-06-28 22:42:10,380] Trial 0 failed with parameters: {'feature_fraction': 0.6} because of the following error: LightGBMError('Number of classes should be specified and greater than 1 for multiclass training').
Traceback (most recent call last):
  File "/Users/flybywindwen/miniconda3/lib/python3.10/site-packages/optuna/study/_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "/Users/flybywindwen/miniconda3/lib/python3.10/site-packages/optuna/integration/_lightgbm_tuner/optimize.py", line 240, in __call__
    booster = lgb.train(self.lgbm_params, train_set, **kwargs)
  File "/Users/flybywindwen/miniconda3/lib/python3.10/site-packages/lightgbm/engine.py", line 271, in train
    booster = Booster(params=params, train_set=train_set)
  File "/Use

LightGBMError: Number of classes should be specified and greater than 1 for multiclass training