In [8]:
import numpy as np
import xgboost as xgb
import lightgbm as lgb
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Build a small, reproducible toy regression problem.
# The global NumPy seed is fixed so every run draws the same features and noise.
np.random.seed(42)
n_samples = 1000

# Feature matrix: n_samples rows, 5 columns, drawn uniformly from [0, 1).
X = np.random.rand(n_samples, 5)

# Standard-normal noise, one value per sample (drawn right after X, as before).
noise = np.random.randn(n_samples)

# Target depends linearly on the first three features only; columns 3 and 4
# do not enter the formula.
y = 2 * X[:, 0] + 3 * X[:, 1] + 0.5 * X[:, 2] + noise

# Hold out 20% of the samples as a test set; the fixed random_state makes
# the shuffle (and therefore the split) repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [9]:
# Wrap the train/test splits in XGBoost's DMatrix container.
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# XGBoost hyper-parameters: squared-error regression, RMSE reported per round.
params = {
    "objective": "reg:squarederror",
    "eval_metric": "rmse",
    "max_depth": 3,
    "learning_rate": 0.1,
    "subsample": 0.8,
}

# Train for at most `num_boost_round` rounds, stopping once the RMSE on the
# monitored set has not improved for 10 consecutive rounds.
# NOTE: the monitored set is the test split itself, which is also used for the
# final score below, so the reported error is optimistically biased. Use a
# separate validation split if an unbiased estimate is needed.
num_boost_round = 100
bst = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=[(dtest, "test")],
    early_stopping_rounds=10,
)

# Predict with the trees up to and including the best early-stopping round.
# `ntree_limit` / `best_ntree_limit` were deprecated in XGBoost 1.4 and removed
# in 2.0; `iteration_range` is the replacement. The range is half-open, hence
# the `+ 1` on the (0-based) best iteration.
y_pred = bst.predict(dtest, iteration_range=(0, bst.best_iteration + 1))

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

[0]	test-rmse:2.54316
[1]	test-rmse:2.34789
[2]	test-rmse:2.18089
[3]	test-rmse:2.03527
[4]	test-rmse:1.90504
[5]	test-rmse:1.78542
[6]	test-rmse:1.69081
[7]	test-rmse:1.60441
[8]	test-rmse:1.52533
[9]	test-rmse:1.46244
[10]	test-rmse:1.40377
[11]	test-rmse:1.35454
[12]	test-rmse:1.31531
[13]	test-rmse:1.27765
[14]	test-rmse:1.24667
[15]	test-rmse:1.21857
[16]	test-rmse:1.19759
[17]	test-rmse:1.18107
[18]	test-rmse:1.16565
[19]	test-rmse:1.15174
[20]	test-rmse:1.14389
[21]	test-rmse:1.13432
[22]	test-rmse:1.12878
[23]	test-rmse:1.12412
[24]	test-rmse:1.11696
[25]	test-rmse:1.11490
[26]	test-rmse:1.11071
[27]	test-rmse:1.10390
[28]	test-rmse:1.09995
[29]	test-rmse:1.09661
[30]	test-rmse:1.09590
[31]	test-rmse:1.09577
[32]	test-rmse:1.09439
[33]	test-rmse:1.09194
[34]	test-rmse:1.09131
[35]	test-rmse:1.09066
[36]	test-rmse:1.09158
[37]	test-rmse:1.09083
[38]	test-rmse:1.08952
[39]	test-rmse:1.09210
[40]	test-rmse:1.09116
[41]	test-rmse:1.09054
[42]	test-rmse:1.08863
[43]	test-rmse:1.0906



In [10]:
# Wrap the splits in LightGBM Datasets; `reference=train_data` ties the
# validation set to the training set it is evaluated against.
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

# LightGBM hyper-parameters: L2 regression, RMSE reported per round.
# NOTE: this cell rebinds `params`, `bst`, `y_pred` and `mse`, which the
# XGBoost cell above also assigns; after it runs they refer to LightGBM.
params = {
    "objective": "regression",
    "metric": "rmse",
    "num_leaves": 31,
    "learning_rate": 0.05,
    "feature_fraction": 0.9,
}

# Train for at most `num_round` rounds with early stopping.
# The `early_stopping_rounds=` keyword of `lgb.train` was removed in
# LightGBM 4.0; early stopping and per-iteration logging are now requested
# through callbacks (available since LightGBM 3.3).
# NOTE: the validation set is the test split itself, which is also used for the
# final score below, so the reported error is optimistically biased.
num_round = 100
bst = lgb.train(
    params,
    train_data,
    num_boost_round=num_round,
    valid_sets=[test_data],
    callbacks=[
        lgb.early_stopping(stopping_rounds=10),  # stop after 10 rounds without improvement
        lgb.log_evaluation(period=1),            # print the validation metric every round
    ],
)

# Predict on the test set using only the iterations up to the best one.
y_pred = bst.predict(X_test, num_iteration=bst.best_iteration)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1275
[LightGBM] [Info] Number of data points in the train set: 800, number of used features: 5
[LightGBM] [Info] Start training from score 2.649517
[1]	valid_0's rmse: 1.438
Training until validation scores don't improve for 10 rounds
[2]	valid_0's rmse: 1.4121
[3]	valid_0's rmse: 1.38802
[4]	valid_0's rmse: 1.36588
[5]	valid_0's rmse: 1.34566
[6]	valid_0's rmse: 1.3248
[7]	valid_0's rmse: 1.3085
[8]	valid_0's rmse: 1.29286
[9]	valid_0's rmse: 1.27917
[10]	valid_0's rmse: 1.26565
[11]	valid_0's rmse: 1.25248
[12]	valid_0's rmse: 1.24001
[13]	valid_0's rmse: 1.22652
[14]	valid_0's rmse: 1.21587
[15]	valid_0's rmse: 1.20728
[16]	valid_0's rmse: 1.19726
[17]	valid_0's rmse: 1.19001
[18]	valid_0's rmse: 1.18147
[19]	valid_0's rmse: 1.17609
[20]	valid_0's rmse: 1.17005
[21]	valid_0's rmse: 1.16555
[22]	valid_0's rmse: 1.16062
[23]	valid_0's rmse: 1.15652
[24]	valid_0's rmse: 1.15291
[25]	valid_0's rmse: 1

