In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.model_selection import KFold
import warnings
warnings.filterwarnings('ignore')

# Load the dataset
df = pd.read_csv('feature_enhanced.csv')  # Adjust path if needed

# Separate features and target
X = df.drop('taxvaluedollarcnt', axis=1)
y = df['taxvaluedollarcnt']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("Train set size:", X_train.shape, "Test set size:", X_test.shape)

# Baseline MAE (mean predictor)
train_mean_value = y_train.mean()
baseline_pred_train = np.full(len(y_train), train_mean_value)
baseline_pred_test = np.full(len(y_test), train_mean_value)
baseline_mae_train = mean_absolute_error(y_train, baseline_pred_train)
baseline_mae_test = mean_absolute_error(y_test, baseline_pred_test)

# Evaluation metric function
def compute_metrics(y_true, y_pred, baseline_mae):
    mae_val = mean_absolute_error(y_true, y_pred)
    mse_val = mean_squared_error(y_true, y_pred)
    rmse_val = np.sqrt(mse_val)
    mask = y_true != 0
    mape_val = np.mean(np.abs((y_pred[mask] - y_true[mask]) / y_true[mask])) * 100
    y_pred_clip = np.where(y_pred < 0, 0, y_pred)
    y_true_clip = np.where(y_true < 0, 0, y_true)
    msle_val = mean_squared_error(np.log1p(y_true_clip), np.log1p(y_pred_clip))
    rmsle_val = np.sqrt(msle_val)
    r2_val = r2_score(y_true, y_pred)
    mase_val = mae_val / baseline_mae if baseline_mae != 0 else np.nan
    return r2_val, mae_val, rmse_val, mape_val, mase_val, rmsle_val

Train set size: (1373352, 32) Test set size: (343339, 32)


In [2]:
#LightGBM parameters
lgbm_reg = LGBMRegressor(
    n_estimators=1200,
    learning_rate=0.02,
    num_leaves=20,
    lambda_l1=2.0,
    lambda_l2=10.0,
    min_child_samples=120,
    subsample=0.7,
    subsample_freq=1,
    colsample_bytree=0.7,
    metric='rmse',
    n_jobs=-1,
    random_state=42
)

#5-fold cross validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

cv_r2 = []
cv_mae = []
cv_rmse = []
cv_mape = []
cv_mase = []
cv_rmsle = []

for train_idx, val_idx in kf.split(X_train):
    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
    lgbm_reg.fit(X_tr, y_tr)
    y_val_pred = lgbm_reg.predict(X_val)
    r2, mae, rmse, mape, mase, rmsle = compute_metrics(y_val, y_val_pred, baseline_mae_train)
    cv_r2.append(r2)
    cv_mae.append(mae)
    cv_rmse.append(rmse)
    cv_mape.append(mape)
    cv_mase.append(mase)
    cv_rmsle.append(rmsle)

cv_results = {
    "R2":    np.mean(cv_r2),
    "MAE":   np.mean(cv_mae),
    "RMSE":  np.mean(cv_rmse),
    "MAPE":  np.mean(cv_mape),
    "MASE":  np.mean(cv_mase),
    "RMSLE": np.mean(cv_rmsle)
}

lgbm_reg.fit(X_train, y_train)

y_test_pred = lgbm_reg.predict(X_test)

r2_te, mae_te, rmse_te, mape_te, mase_te, rmsle_te = compute_metrics(
    y_test, y_test_pred, baseline_mae_test
)

test_results = {
    "R2":    r2_te,
    "MAE":   mae_te,
    "RMSE":  rmse_te,
    "MAPE":  mape_te,
    "MASE":  mase_te,
    "RMSLE": rmsle_te
}

print("\n===== LightGBM CV Results =====")
for k, v in cv_results.items():
    print(f"{k}: {v:.4f}")

print("\n===== LightGBM Test Results =====")
for k, v in test_results.items():
    print(f"{k}: {v:.4f}")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.040253 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4888
[LightGBM] [Info] Number of data points in the train set: 1098681, number of used features: 32
[LightGBM] [Info] Start training from score 385717.968534
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.033973 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4887
[LightGBM] [Info] Number of data points in the train set: 1098681, number of used features: 32
[LightGBM] [Info] Start training from score 385725.474346
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.036811 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4891
[LightGBM] [Info] Number of data points in the train set: 1098682, number of used features: 32
[LightGBM] 

In [5]:
#LightGBM parameters
lgbm_reg = LGBMRegressor(
    n_estimators=1200,
    learning_rate=0.02,
    num_leaves=30,
    lambda_l1=2.0,
    lambda_l2=10.0,
    min_child_samples=120,
    subsample=0.7,
    subsample_freq=1,
    colsample_bytree=0.7,
    metric='rmse',
    n_jobs=-1,
    random_state=42
)

#5-fold cross validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

cv_r2 = []
cv_mae = []
cv_rmse = []
cv_mape = []
cv_mase = []
cv_rmsle = []

for train_idx, val_idx in kf.split(X_train):
    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
    lgbm_reg.fit(X_tr, y_tr)
    y_val_pred = lgbm_reg.predict(X_val)
    r2, mae, rmse, mape, mase, rmsle = compute_metrics(y_val, y_val_pred, baseline_mae_train)
    cv_r2.append(r2)
    cv_mae.append(mae)
    cv_rmse.append(rmse)
    cv_mape.append(mape)
    cv_mase.append(mase)
    cv_rmsle.append(rmsle)

cv_results = {
    "R2":    np.mean(cv_r2),
    "MAE":   np.mean(cv_mae),
    "RMSE":  np.mean(cv_rmse),
    "MAPE":  np.mean(cv_mape),
    "MASE":  np.mean(cv_mase),
    "RMSLE": np.mean(cv_rmsle)
}

lgbm_reg.fit(X_train, y_train)

y_test_pred = lgbm_reg.predict(X_test)

r2_te, mae_te, rmse_te, mape_te, mase_te, rmsle_te = compute_metrics(
    y_test, y_test_pred, baseline_mae_test
)

test_results = {
    "R2":    r2_te,
    "MAE":   mae_te,
    "RMSE":  rmse_te,
    "MAPE":  mape_te,
    "MASE":  mase_te,
    "RMSLE": rmsle_te
}

print("\n===== LightGBM CV Results =====")
for k, v in cv_results.items():
    print(f"{k}: {v:.4f}")

print("\n===== LightGBM Test Results =====")
for k, v in test_results.items():
    print(f"{k}: {v:.4f}")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.039840 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4888
[LightGBM] [Info] Number of data points in the train set: 1098681, number of used features: 32
[LightGBM] [Info] Start training from score 385717.968534
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.033769 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4887
[LightGBM] [Info] Number of data points in the train set: 1098681, number of used features: 32
[LightGBM] [Info] Start training from score 385725.474346
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.017026 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4891
[LightGBM] [Info] Number of data points in

In [6]:
#LightGBM parameters
lgbm_reg = LGBMRegressor(
    n_estimators=1200,
    learning_rate=0.02,
    num_leaves=40,
    lambda_l1=2.0,
    lambda_l2=10.0,
    min_child_samples=120,
    subsample=0.7,
    subsample_freq=1,
    colsample_bytree=0.7,
    metric='rmse',
    n_jobs=-1,
    random_state=42
)

#5-fold cross validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

cv_r2 = []
cv_mae = []
cv_rmse = []
cv_mape = []
cv_mase = []
cv_rmsle = []

for train_idx, val_idx in kf.split(X_train):
    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
    lgbm_reg.fit(X_tr, y_tr)
    y_val_pred = lgbm_reg.predict(X_val)
    r2, mae, rmse, mape, mase, rmsle = compute_metrics(y_val, y_val_pred, baseline_mae_train)
    cv_r2.append(r2)
    cv_mae.append(mae)
    cv_rmse.append(rmse)
    cv_mape.append(mape)
    cv_mase.append(mase)
    cv_rmsle.append(rmsle)

cv_results = {
    "R2":    np.mean(cv_r2),
    "MAE":   np.mean(cv_mae),
    "RMSE":  np.mean(cv_rmse),
    "MAPE":  np.mean(cv_mape),
    "MASE":  np.mean(cv_mase),
    "RMSLE": np.mean(cv_rmsle)
}

lgbm_reg.fit(X_train, y_train)

y_test_pred = lgbm_reg.predict(X_test)

r2_te, mae_te, rmse_te, mape_te, mase_te, rmsle_te = compute_metrics(
    y_test, y_test_pred, baseline_mae_test
)

test_results = {
    "R2":    r2_te,
    "MAE":   mae_te,
    "RMSE":  rmse_te,
    "MAPE":  mape_te,
    "MASE":  mase_te,
    "RMSLE": rmsle_te
}

print("\n===== LightGBM CV Results =====")
for k, v in cv_results.items():
    print(f"{k}: {v:.4f}")

print("\n===== LightGBM Test Results =====")
for k, v in test_results.items():
    print(f"{k}: {v:.4f}")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.035679 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4888
[LightGBM] [Info] Number of data points in the train set: 1098681, number of used features: 32
[LightGBM] [Info] Start training from score 385717.968534
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.035349 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4887
[LightGBM] [Info] Number of data points in the train set: 1098681, number of used features: 32
[LightGBM] [Info] Start training from score 385725.474346
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.035160 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4891
[LightGBM] [Info] Number of data points in the train set: 1098682, number of used features: 32
[LightGBM] 

In [7]:
#LightGBM parameters
lgbm_reg = LGBMRegressor(
    n_estimators=1200,
    learning_rate=0.02,
    num_leaves=50,
    lambda_l1=2.0,
    lambda_l2=10.0,
    min_child_samples=120,
    subsample=0.7,
    subsample_freq=1,
    colsample_bytree=0.7,
    metric='rmse',
    n_jobs=-1,
    random_state=42
)

#5-fold cross validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

cv_r2 = []
cv_mae = []
cv_rmse = []
cv_mape = []
cv_mase = []
cv_rmsle = []

for train_idx, val_idx in kf.split(X_train):
    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
    lgbm_reg.fit(X_tr, y_tr)
    y_val_pred = lgbm_reg.predict(X_val)
    r2, mae, rmse, mape, mase, rmsle = compute_metrics(y_val, y_val_pred, baseline_mae_train)
    cv_r2.append(r2)
    cv_mae.append(mae)
    cv_rmse.append(rmse)
    cv_mape.append(mape)
    cv_mase.append(mase)
    cv_rmsle.append(rmsle)

cv_results = {
    "R2":    np.mean(cv_r2),
    "MAE":   np.mean(cv_mae),
    "RMSE":  np.mean(cv_rmse),
    "MAPE":  np.mean(cv_mape),
    "MASE":  np.mean(cv_mase),
    "RMSLE": np.mean(cv_rmsle)
}

lgbm_reg.fit(X_train, y_train)

y_test_pred = lgbm_reg.predict(X_test)

r2_te, mae_te, rmse_te, mape_te, mase_te, rmsle_te = compute_metrics(
    y_test, y_test_pred, baseline_mae_test
)

test_results = {
    "R2":    r2_te,
    "MAE":   mae_te,
    "RMSE":  rmse_te,
    "MAPE":  mape_te,
    "MASE":  mase_te,
    "RMSLE": rmsle_te
}

print("\n===== LightGBM CV Results =====")
for k, v in cv_results.items():
    print(f"{k}: {v:.4f}")

print("\n===== LightGBM Test Results =====")
for k, v in test_results.items():
    print(f"{k}: {v:.4f}")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.031459 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4888
[LightGBM] [Info] Number of data points in the train set: 1098681, number of used features: 32
[LightGBM] [Info] Start training from score 385717.968534
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.030156 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4887
[LightGBM] [Info] Number of data points in the train set: 1098681, number of used features: 32
[LightGBM] [Info] Start training from score 385725.474346
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.015095 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4891
[LightGBM] [Info] Number of data points in

In [8]:
#LightGBM parameters
lgbm_reg = LGBMRegressor(
    n_estimators=1200,
    learning_rate=0.02,
    num_leaves=60,
    lambda_l1=2.0,
    lambda_l2=10.0,
    min_child_samples=120,
    subsample=0.7,
    subsample_freq=1,
    colsample_bytree=0.7,
    metric='rmse',
    n_jobs=-1,
    random_state=42
)

#5-fold cross validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

cv_r2 = []
cv_mae = []
cv_rmse = []
cv_mape = []
cv_mase = []
cv_rmsle = []

for train_idx, val_idx in kf.split(X_train):
    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
    lgbm_reg.fit(X_tr, y_tr)
    y_val_pred = lgbm_reg.predict(X_val)
    r2, mae, rmse, mape, mase, rmsle = compute_metrics(y_val, y_val_pred, baseline_mae_train)
    cv_r2.append(r2)
    cv_mae.append(mae)
    cv_rmse.append(rmse)
    cv_mape.append(mape)
    cv_mase.append(mase)
    cv_rmsle.append(rmsle)

cv_results = {
    "R2":    np.mean(cv_r2),
    "MAE":   np.mean(cv_mae),
    "RMSE":  np.mean(cv_rmse),
    "MAPE":  np.mean(cv_mape),
    "MASE":  np.mean(cv_mase),
    "RMSLE": np.mean(cv_rmsle)
}

lgbm_reg.fit(X_train, y_train)

y_test_pred = lgbm_reg.predict(X_test)

r2_te, mae_te, rmse_te, mape_te, mase_te, rmsle_te = compute_metrics(
    y_test, y_test_pred, baseline_mae_test
)

test_results = {
    "R2":    r2_te,
    "MAE":   mae_te,
    "RMSE":  rmse_te,
    "MAPE":  mape_te,
    "MASE":  mase_te,
    "RMSLE": rmsle_te
}

print("\n===== LightGBM CV Results =====")
for k, v in cv_results.items():
    print(f"{k}: {v:.4f}")

print("\n===== LightGBM Test Results =====")
for k, v in test_results.items():
    print(f"{k}: {v:.4f}")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.032877 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4888
[LightGBM] [Info] Number of data points in the train set: 1098681, number of used features: 32
[LightGBM] [Info] Start training from score 385717.968534
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.029321 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4887
[LightGBM] [Info] Number of data points in the train set: 1098681, number of used features: 32
[LightGBM] [Info] Start training from score 385725.474346
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.034776 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4891
[LightGBM] [Info] Number of data points in the train set: 1098682, number of used features: 32
[LightGBM] 

In [9]:
#LightGBM parameters
lgbm_reg = LGBMRegressor(
    n_estimators=1200,
    learning_rate=0.02,
    num_leaves=70,
    lambda_l1=2.0,
    lambda_l2=10.0,
    min_child_samples=120,
    subsample=0.7,
    subsample_freq=1,
    colsample_bytree=0.7,
    metric='rmse',
    n_jobs=-1,
    random_state=42
)

#5-fold cross validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

cv_r2 = []
cv_mae = []
cv_rmse = []
cv_mape = []
cv_mase = []
cv_rmsle = []

for train_idx, val_idx in kf.split(X_train):
    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
    lgbm_reg.fit(X_tr, y_tr)
    y_val_pred = lgbm_reg.predict(X_val)
    r2, mae, rmse, mape, mase, rmsle = compute_metrics(y_val, y_val_pred, baseline_mae_train)
    cv_r2.append(r2)
    cv_mae.append(mae)
    cv_rmse.append(rmse)
    cv_mape.append(mape)
    cv_mase.append(mase)
    cv_rmsle.append(rmsle)

cv_results = {
    "R2":    np.mean(cv_r2),
    "MAE":   np.mean(cv_mae),
    "RMSE":  np.mean(cv_rmse),
    "MAPE":  np.mean(cv_mape),
    "MASE":  np.mean(cv_mase),
    "RMSLE": np.mean(cv_rmsle)
}

lgbm_reg.fit(X_train, y_train)

y_test_pred = lgbm_reg.predict(X_test)

r2_te, mae_te, rmse_te, mape_te, mase_te, rmsle_te = compute_metrics(
    y_test, y_test_pred, baseline_mae_test
)

test_results = {
    "R2":    r2_te,
    "MAE":   mae_te,
    "RMSE":  rmse_te,
    "MAPE":  mape_te,
    "MASE":  mase_te,
    "RMSLE": rmsle_te
}

print("\n===== LightGBM CV Results =====")
for k, v in cv_results.items():
    print(f"{k}: {v:.4f}")

print("\n===== LightGBM Test Results =====")
for k, v in test_results.items():
    print(f"{k}: {v:.4f}")


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.019116 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4888
[LightGBM] [Info] Number of data points in the train set: 1098681, number of used features: 32
[LightGBM] [Info] Start training from score 385717.968534
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.032254 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4887
[LightGBM] [Info] Number of data points in the train set: 1098681, number of used features: 32
[LightGBM] [Info] Start training from score 385725.474346
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.017556 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [

In [10]:
#LightGBM parameters
lgbm_reg = LGBMRegressor(
    n_estimators=1200,
    learning_rate=0.02,
    num_leaves=80,
    lambda_l1=2.0,
    lambda_l2=10.0,
    min_child_samples=120,
    subsample=0.7,
    subsample_freq=1,
    colsample_bytree=0.7,
    metric='rmse',
    n_jobs=-1,
    random_state=42
)

#5-fold cross validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

cv_r2 = []
cv_mae = []
cv_rmse = []
cv_mape = []
cv_mase = []
cv_rmsle = []

for train_idx, val_idx in kf.split(X_train):
    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
    lgbm_reg.fit(X_tr, y_tr)
    y_val_pred = lgbm_reg.predict(X_val)
    r2, mae, rmse, mape, mase, rmsle = compute_metrics(y_val, y_val_pred, baseline_mae_train)
    cv_r2.append(r2)
    cv_mae.append(mae)
    cv_rmse.append(rmse)
    cv_mape.append(mape)
    cv_mase.append(mase)
    cv_rmsle.append(rmsle)

cv_results = {
    "R2":    np.mean(cv_r2),
    "MAE":   np.mean(cv_mae),
    "RMSE":  np.mean(cv_rmse),
    "MAPE":  np.mean(cv_mape),
    "MASE":  np.mean(cv_mase),
    "RMSLE": np.mean(cv_rmsle)
}

lgbm_reg.fit(X_train, y_train)

y_test_pred = lgbm_reg.predict(X_test)

r2_te, mae_te, rmse_te, mape_te, mase_te, rmsle_te = compute_metrics(
    y_test, y_test_pred, baseline_mae_test
)

test_results = {
    "R2":    r2_te,
    "MAE":   mae_te,
    "RMSE":  rmse_te,
    "MAPE":  mape_te,
    "MASE":  mase_te,
    "RMSLE": rmsle_te
}

print("\n===== LightGBM CV Results =====")
for k, v in cv_results.items():
    print(f"{k}: {v:.4f}")

print("\n===== LightGBM Test Results =====")
for k, v in test_results.items():
    print(f"{k}: {v:.4f}")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.031896 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4888
[LightGBM] [Info] Number of data points in the train set: 1098681, number of used features: 32
[LightGBM] [Info] Start training from score 385717.968534
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.031295 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4887
[LightGBM] [Info] Number of data points in the train set: 1098681, number of used features: 32
[LightGBM] [Info] Start training from score 385725.474346
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.033338 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4891
[LightGBM] [Info] Number of data points in the train set: 1098682, number of used features: 32
[LightGBM] 

<h1>Find out num_leaves=60 is the best among 20,30,40,50,60,70,80</h1>

In [11]:
#LightGBM parameters
lgbm_reg = LGBMRegressor(
    n_estimators=1200,
    learning_rate=0.02,
    num_leaves=60,
    lambda_l1=2.0,
    lambda_l2=10.0,
    min_child_samples=50,
    subsample=0.7,
    subsample_freq=1,
    colsample_bytree=0.7,
    metric='rmse',
    n_jobs=-1,
    random_state=42
)

#5-fold cross validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

cv_r2 = []
cv_mae = []
cv_rmse = []
cv_mape = []
cv_mase = []
cv_rmsle = []

for train_idx, val_idx in kf.split(X_train):
    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
    lgbm_reg.fit(X_tr, y_tr)
    y_val_pred = lgbm_reg.predict(X_val)
    r2, mae, rmse, mape, mase, rmsle = compute_metrics(y_val, y_val_pred, baseline_mae_train)
    cv_r2.append(r2)
    cv_mae.append(mae)
    cv_rmse.append(rmse)
    cv_mape.append(mape)
    cv_mase.append(mase)
    cv_rmsle.append(rmsle)

cv_results = {
    "R2":    np.mean(cv_r2),
    "MAE":   np.mean(cv_mae),
    "RMSE":  np.mean(cv_rmse),
    "MAPE":  np.mean(cv_mape),
    "MASE":  np.mean(cv_mase),
    "RMSLE": np.mean(cv_rmsle)
}

lgbm_reg.fit(X_train, y_train)

y_test_pred = lgbm_reg.predict(X_test)

r2_te, mae_te, rmse_te, mape_te, mase_te, rmsle_te = compute_metrics(
    y_test, y_test_pred, baseline_mae_test
)

test_results = {
    "R2":    r2_te,
    "MAE":   mae_te,
    "RMSE":  rmse_te,
    "MAPE":  mape_te,
    "MASE":  mase_te,
    "RMSLE": rmsle_te
}

print("\n===== LightGBM CV Results =====")
for k, v in cv_results.items():
    print(f"{k}: {v:.4f}")

print("\n===== LightGBM Test Results =====")
for k, v in test_results.items():
    print(f"{k}: {v:.4f}")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.031912 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4888
[LightGBM] [Info] Number of data points in the train set: 1098681, number of used features: 32
[LightGBM] [Info] Start training from score 385717.968534
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.031711 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4887
[LightGBM] [Info] Number of data points in the train set: 1098681, number of used features: 32
[LightGBM] [Info] Start training from score 385725.474346
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.033350 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4891
[LightGBM] [Info] Number of data points in the train set: 1098682, number of used features: 32
[LightGBM] 

In [12]:
#LightGBM parameters
lgbm_reg = LGBMRegressor(
    n_estimators=1200,
    learning_rate=0.02,
    num_leaves=60,
    lambda_l1=2.0,
    lambda_l2=10.0,
    min_child_samples=100,
    subsample=0.7,
    subsample_freq=1,
    colsample_bytree=0.7,
    metric='rmse',
    n_jobs=-1,
    random_state=42
)

#5-fold cross validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

cv_r2 = []
cv_mae = []
cv_rmse = []
cv_mape = []
cv_mase = []
cv_rmsle = []

for train_idx, val_idx in kf.split(X_train):
    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
    lgbm_reg.fit(X_tr, y_tr)
    y_val_pred = lgbm_reg.predict(X_val)
    r2, mae, rmse, mape, mase, rmsle = compute_metrics(y_val, y_val_pred, baseline_mae_train)
    cv_r2.append(r2)
    cv_mae.append(mae)
    cv_rmse.append(rmse)
    cv_mape.append(mape)
    cv_mase.append(mase)
    cv_rmsle.append(rmsle)

cv_results = {
    "R2":    np.mean(cv_r2),
    "MAE":   np.mean(cv_mae),
    "RMSE":  np.mean(cv_rmse),
    "MAPE":  np.mean(cv_mape),
    "MASE":  np.mean(cv_mase),
    "RMSLE": np.mean(cv_rmsle)
}

lgbm_reg.fit(X_train, y_train)

y_test_pred = lgbm_reg.predict(X_test)

r2_te, mae_te, rmse_te, mape_te, mase_te, rmsle_te = compute_metrics(
    y_test, y_test_pred, baseline_mae_test
)

test_results = {
    "R2":    r2_te,
    "MAE":   mae_te,
    "RMSE":  rmse_te,
    "MAPE":  mape_te,
    "MASE":  mase_te,
    "RMSLE": rmsle_te
}

print("\n===== LightGBM CV Results =====")
for k, v in cv_results.items():
    print(f"{k}: {v:.4f}")

print("\n===== LightGBM Test Results =====")
for k, v in test_results.items():
    print(f"{k}: {v:.4f}")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.032610 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4888
[LightGBM] [Info] Number of data points in the train set: 1098681, number of used features: 32
[LightGBM] [Info] Start training from score 385717.968534
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.032247 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4887
[LightGBM] [Info] Number of data points in the train set: 1098681, number of used features: 32
[LightGBM] [Info] Start training from score 385725.474346
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.016478 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4891
[LightGBM] [Info] Number of data points in

In [13]:
#LightGBM parameters
lgbm_reg = LGBMRegressor(
    n_estimators=1200,
    learning_rate=0.02,
    num_leaves=60,
    lambda_l1=2.0,
    lambda_l2=10.0,
    min_child_samples=150,
    subsample=0.7,
    subsample_freq=1,
    colsample_bytree=0.7,
    metric='rmse',
    n_jobs=-1,
    random_state=42
)

#5-fold cross validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

cv_r2 = []
cv_mae = []
cv_rmse = []
cv_mape = []
cv_mase = []
cv_rmsle = []

for train_idx, val_idx in kf.split(X_train):
    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
    lgbm_reg.fit(X_tr, y_tr)
    y_val_pred = lgbm_reg.predict(X_val)
    r2, mae, rmse, mape, mase, rmsle = compute_metrics(y_val, y_val_pred, baseline_mae_train)
    cv_r2.append(r2)
    cv_mae.append(mae)
    cv_rmse.append(rmse)
    cv_mape.append(mape)
    cv_mase.append(mase)
    cv_rmsle.append(rmsle)

cv_results = {
    "R2":    np.mean(cv_r2),
    "MAE":   np.mean(cv_mae),
    "RMSE":  np.mean(cv_rmse),
    "MAPE":  np.mean(cv_mape),
    "MASE":  np.mean(cv_mase),
    "RMSLE": np.mean(cv_rmsle)
}

lgbm_reg.fit(X_train, y_train)

y_test_pred = lgbm_reg.predict(X_test)

r2_te, mae_te, rmse_te, mape_te, mase_te, rmsle_te = compute_metrics(
    y_test, y_test_pred, baseline_mae_test
)

test_results = {
    "R2":    r2_te,
    "MAE":   mae_te,
    "RMSE":  rmse_te,
    "MAPE":  mape_te,
    "MASE":  mase_te,
    "RMSLE": rmsle_te
}

print("\n===== LightGBM CV Results =====")
for k, v in cv_results.items():
    print(f"{k}: {v:.4f}")

print("\n===== LightGBM Test Results =====")
for k, v in test_results.items():
    print(f"{k}: {v:.4f}")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.033005 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4888
[LightGBM] [Info] Number of data points in the train set: 1098681, number of used features: 32
[LightGBM] [Info] Start training from score 385717.968534
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.037907 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4887
[LightGBM] [Info] Number of data points in the train set: 1098681, number of used features: 32
[LightGBM] [Info] Start training from score 385725.474346
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.016600 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4891
[LightGBM] [Info] Number of data points in

In [14]:
#LightGBM parameters
lgbm_reg = LGBMRegressor(
    n_estimators=1200,
    learning_rate=0.02,
    num_leaves=60,
    lambda_l1=2.0,
    lambda_l2=10.0,
    min_child_samples=200,
    subsample=0.7,
    subsample_freq=1,
    colsample_bytree=0.7,
    metric='rmse',
    n_jobs=-1,
    random_state=42
)

#5-fold cross validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

cv_r2 = []
cv_mae = []
cv_rmse = []
cv_mape = []
cv_mase = []
cv_rmsle = []

for train_idx, val_idx in kf.split(X_train):
    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
    lgbm_reg.fit(X_tr, y_tr)
    y_val_pred = lgbm_reg.predict(X_val)
    r2, mae, rmse, mape, mase, rmsle = compute_metrics(y_val, y_val_pred, baseline_mae_train)
    cv_r2.append(r2)
    cv_mae.append(mae)
    cv_rmse.append(rmse)
    cv_mape.append(mape)
    cv_mase.append(mase)
    cv_rmsle.append(rmsle)

cv_results = {
    "R2":    np.mean(cv_r2),
    "MAE":   np.mean(cv_mae),
    "RMSE":  np.mean(cv_rmse),
    "MAPE":  np.mean(cv_mape),
    "MASE":  np.mean(cv_mase),
    "RMSLE": np.mean(cv_rmsle)
}

lgbm_reg.fit(X_train, y_train)

y_test_pred = lgbm_reg.predict(X_test)

r2_te, mae_te, rmse_te, mape_te, mase_te, rmsle_te = compute_metrics(
    y_test, y_test_pred, baseline_mae_test
)

test_results = {
    "R2":    r2_te,
    "MAE":   mae_te,
    "RMSE":  rmse_te,
    "MAPE":  mape_te,
    "MASE":  mase_te,
    "RMSLE": rmsle_te
}

print("\n===== LightGBM CV Results =====")
for k, v in cv_results.items():
    print(f"{k}: {v:.4f}")

print("\n===== LightGBM Test Results =====")
for k, v in test_results.items():
    print(f"{k}: {v:.4f}")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.035624 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4888
[LightGBM] [Info] Number of data points in the train set: 1098681, number of used features: 32
[LightGBM] [Info] Start training from score 385717.968534
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.030122 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4887
[LightGBM] [Info] Number of data points in the train set: 1098681, number of used features: 32
[LightGBM] [Info] Start training from score 385725.474346
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.017555 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4891
[LightGBM] [Info] Number of data points in

In [15]:
#LightGBM parameters
lgbm_reg = LGBMRegressor(
    n_estimators=1200,
    learning_rate=0.02,
    num_leaves=60,
    lambda_l1=2.0,
    lambda_l2=10.0,
    min_child_samples=250,
    subsample=0.7,
    subsample_freq=1,
    colsample_bytree=0.7,
    metric='rmse',
    n_jobs=-1,
    random_state=42
)

#5-fold cross validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

cv_r2 = []
cv_mae = []
cv_rmse = []
cv_mape = []
cv_mase = []
cv_rmsle = []

for train_idx, val_idx in kf.split(X_train):
    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
    lgbm_reg.fit(X_tr, y_tr)
    y_val_pred = lgbm_reg.predict(X_val)
    r2, mae, rmse, mape, mase, rmsle = compute_metrics(y_val, y_val_pred, baseline_mae_train)
    cv_r2.append(r2)
    cv_mae.append(mae)
    cv_rmse.append(rmse)
    cv_mape.append(mape)
    cv_mase.append(mase)
    cv_rmsle.append(rmsle)

cv_results = {
    "R2":    np.mean(cv_r2),
    "MAE":   np.mean(cv_mae),
    "RMSE":  np.mean(cv_rmse),
    "MAPE":  np.mean(cv_mape),
    "MASE":  np.mean(cv_mase),
    "RMSLE": np.mean(cv_rmsle)
}

lgbm_reg.fit(X_train, y_train)

y_test_pred = lgbm_reg.predict(X_test)

r2_te, mae_te, rmse_te, mape_te, mase_te, rmsle_te = compute_metrics(
    y_test, y_test_pred, baseline_mae_test
)

test_results = {
    "R2":    r2_te,
    "MAE":   mae_te,
    "RMSE":  rmse_te,
    "MAPE":  mape_te,
    "MASE":  mase_te,
    "RMSLE": rmsle_te
}

print("\n===== LightGBM CV Results =====")
for k, v in cv_results.items():
    print(f"{k}: {v:.4f}")

print("\n===== LightGBM Test Results =====")
for k, v in test_results.items():
    print(f"{k}: {v:.4f}")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.035202 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4888
[LightGBM] [Info] Number of data points in the train set: 1098681, number of used features: 32
[LightGBM] [Info] Start training from score 385717.968534
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.031499 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4887
[LightGBM] [Info] Number of data points in the train set: 1098681, number of used features: 32
[LightGBM] [Info] Start training from score 385725.474346
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.029128 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4891
[LightGBM] [Info] Number of data points in the train set: 1098682, number of used features: 32
[LightGBM] 

In [16]:
#LightGBM parameters
lgbm_reg = LGBMRegressor(
    n_estimators=1200,
    learning_rate=0.02,
    num_leaves=60,
    lambda_l1=2.0,
    lambda_l2=10.0,
    min_child_samples=300,
    subsample=0.7,
    subsample_freq=1,
    colsample_bytree=0.7,
    metric='rmse',
    n_jobs=-1,
    random_state=42
)

#5-fold cross validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

cv_r2 = []
cv_mae = []
cv_rmse = []
cv_mape = []
cv_mase = []
cv_rmsle = []

for train_idx, val_idx in kf.split(X_train):
    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
    lgbm_reg.fit(X_tr, y_tr)
    y_val_pred = lgbm_reg.predict(X_val)
    r2, mae, rmse, mape, mase, rmsle = compute_metrics(y_val, y_val_pred, baseline_mae_train)
    cv_r2.append(r2)
    cv_mae.append(mae)
    cv_rmse.append(rmse)
    cv_mape.append(mape)
    cv_mase.append(mase)
    cv_rmsle.append(rmsle)

cv_results = {
    "R2":    np.mean(cv_r2),
    "MAE":   np.mean(cv_mae),
    "RMSE":  np.mean(cv_rmse),
    "MAPE":  np.mean(cv_mape),
    "MASE":  np.mean(cv_mase),
    "RMSLE": np.mean(cv_rmsle)
}

lgbm_reg.fit(X_train, y_train)

y_test_pred = lgbm_reg.predict(X_test)

r2_te, mae_te, rmse_te, mape_te, mase_te, rmsle_te = compute_metrics(
    y_test, y_test_pred, baseline_mae_test
)

test_results = {
    "R2":    r2_te,
    "MAE":   mae_te,
    "RMSE":  rmse_te,
    "MAPE":  mape_te,
    "MASE":  mase_te,
    "RMSLE": rmsle_te
}

print("\n===== LightGBM CV Results =====")
for k, v in cv_results.items():
    print(f"{k}: {v:.4f}")

print("\n===== LightGBM Test Results =====")
for k, v in test_results.items():
    print(f"{k}: {v:.4f}")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.032093 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4888
[LightGBM] [Info] Number of data points in the train set: 1098681, number of used features: 32
[LightGBM] [Info] Start training from score 385717.968534
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.030781 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4887
[LightGBM] [Info] Number of data points in the train set: 1098681, number of used features: 32
[LightGBM] [Info] Start training from score 385725.474346
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.033517 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4891
[LightGBM] [Info] Number of data points in the train set: 1098682, number of used features: 32
[LightGBM] 

In [17]:
#LightGBM parameters
lgbm_reg = LGBMRegressor(
    n_estimators=1200,
    learning_rate=0.02,
    num_leaves=60,
    lambda_l1=2.0,
    lambda_l2=10.0,
    min_child_samples=350,
    subsample=0.7,
    subsample_freq=1,
    colsample_bytree=0.7,
    metric='rmse',
    n_jobs=-1,
    random_state=42
)

#5-fold cross validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

cv_r2 = []
cv_mae = []
cv_rmse = []
cv_mape = []
cv_mase = []
cv_rmsle = []

for train_idx, val_idx in kf.split(X_train):
    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
    lgbm_reg.fit(X_tr, y_tr)
    y_val_pred = lgbm_reg.predict(X_val)
    r2, mae, rmse, mape, mase, rmsle = compute_metrics(y_val, y_val_pred, baseline_mae_train)
    cv_r2.append(r2)
    cv_mae.append(mae)
    cv_rmse.append(rmse)
    cv_mape.append(mape)
    cv_mase.append(mase)
    cv_rmsle.append(rmsle)

cv_results = {
    "R2":    np.mean(cv_r2),
    "MAE":   np.mean(cv_mae),
    "RMSE":  np.mean(cv_rmse),
    "MAPE":  np.mean(cv_mape),
    "MASE":  np.mean(cv_mase),
    "RMSLE": np.mean(cv_rmsle)
}

lgbm_reg.fit(X_train, y_train)

y_test_pred = lgbm_reg.predict(X_test)

r2_te, mae_te, rmse_te, mape_te, mase_te, rmsle_te = compute_metrics(
    y_test, y_test_pred, baseline_mae_test
)

test_results = {
    "R2":    r2_te,
    "MAE":   mae_te,
    "RMSE":  rmse_te,
    "MAPE":  mape_te,
    "MASE":  mase_te,
    "RMSLE": rmsle_te
}

print("\n===== LightGBM CV Results =====")
for k, v in cv_results.items():
    print(f"{k}: {v:.4f}")

print("\n===== LightGBM Test Results =====")
for k, v in test_results.items():
    print(f"{k}: {v:.4f}")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.034888 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4888
[LightGBM] [Info] Number of data points in the train set: 1098681, number of used features: 32
[LightGBM] [Info] Start training from score 385717.968534
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.031437 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4887
[LightGBM] [Info] Number of data points in the train set: 1098681, number of used features: 32
[LightGBM] [Info] Start training from score 385725.474346
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.031750 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4891
[LightGBM] [Info] Number of data points in the train set: 1098682, number of used features: 32
[LightGBM] 

<h1>Find out min_child_samples=350 is the best among 50,100,150,200,250,300,350</h1>

In [18]:
#LightGBM parameters
lgbm_reg = LGBMRegressor(
    n_estimators=1200,
    learning_rate=0.02,
    num_leaves=60,
    lambda_l1=2.0,
    lambda_l2=10.0,
    min_child_samples=350,
    subsample=0.5,
    subsample_freq=1,
    colsample_bytree=0.7,
    metric='rmse',
    n_jobs=-1,
    random_state=42
)

#5-fold cross validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

cv_r2 = []
cv_mae = []
cv_rmse = []
cv_mape = []
cv_mase = []
cv_rmsle = []

for train_idx, val_idx in kf.split(X_train):
    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
    lgbm_reg.fit(X_tr, y_tr)
    y_val_pred = lgbm_reg.predict(X_val)
    r2, mae, rmse, mape, mase, rmsle = compute_metrics(y_val, y_val_pred, baseline_mae_train)
    cv_r2.append(r2)
    cv_mae.append(mae)
    cv_rmse.append(rmse)
    cv_mape.append(mape)
    cv_mase.append(mase)
    cv_rmsle.append(rmsle)

cv_results = {
    "R2":    np.mean(cv_r2),
    "MAE":   np.mean(cv_mae),
    "RMSE":  np.mean(cv_rmse),
    "MAPE":  np.mean(cv_mape),
    "MASE":  np.mean(cv_mase),
    "RMSLE": np.mean(cv_rmsle)
}

lgbm_reg.fit(X_train, y_train)

y_test_pred = lgbm_reg.predict(X_test)

r2_te, mae_te, rmse_te, mape_te, mase_te, rmsle_te = compute_metrics(
    y_test, y_test_pred, baseline_mae_test
)

test_results = {
    "R2":    r2_te,
    "MAE":   mae_te,
    "RMSE":  rmse_te,
    "MAPE":  mape_te,
    "MASE":  mase_te,
    "RMSLE": rmsle_te
}

print("\n===== LightGBM CV Results =====")
for k, v in cv_results.items():
    print(f"{k}: {v:.4f}")

print("\n===== LightGBM Test Results =====")
for k, v in test_results.items():
    print(f"{k}: {v:.4f}")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.034068 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4888
[LightGBM] [Info] Number of data points in the train set: 1098681, number of used features: 32
[LightGBM] [Info] Start training from score 385717.968534
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.032605 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4887
[LightGBM] [Info] Number of data points in the train set: 1098681, number of used features: 32
[LightGBM] [Info] Start training from score 385725.474346
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.033691 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4891
[LightGBM] [Info] Number of data points in the train set: 1098682, number of used features: 32
[LightGBM] 

In [19]:
#LightGBM parameters
lgbm_reg = LGBMRegressor(
    n_estimators=1200,
    learning_rate=0.02,
    num_leaves=60,
    lambda_l1=2.0,
    lambda_l2=10.0,
    min_child_samples=350,
    subsample=0.6,
    subsample_freq=1,
    colsample_bytree=0.7,
    metric='rmse',
    n_jobs=-1,
    random_state=42
)

#5-fold cross validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

cv_r2 = []
cv_mae = []
cv_rmse = []
cv_mape = []
cv_mase = []
cv_rmsle = []

for train_idx, val_idx in kf.split(X_train):
    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
    lgbm_reg.fit(X_tr, y_tr)
    y_val_pred = lgbm_reg.predict(X_val)
    r2, mae, rmse, mape, mase, rmsle = compute_metrics(y_val, y_val_pred, baseline_mae_train)
    cv_r2.append(r2)
    cv_mae.append(mae)
    cv_rmse.append(rmse)
    cv_mape.append(mape)
    cv_mase.append(mase)
    cv_rmsle.append(rmsle)

cv_results = {
    "R2":    np.mean(cv_r2),
    "MAE":   np.mean(cv_mae),
    "RMSE":  np.mean(cv_rmse),
    "MAPE":  np.mean(cv_mape),
    "MASE":  np.mean(cv_mase),
    "RMSLE": np.mean(cv_rmsle)
}

lgbm_reg.fit(X_train, y_train)

y_test_pred = lgbm_reg.predict(X_test)

r2_te, mae_te, rmse_te, mape_te, mase_te, rmsle_te = compute_metrics(
    y_test, y_test_pred, baseline_mae_test
)

test_results = {
    "R2":    r2_te,
    "MAE":   mae_te,
    "RMSE":  rmse_te,
    "MAPE":  mape_te,
    "MASE":  mase_te,
    "RMSLE": rmsle_te
}

print("\n===== LightGBM CV Results =====")
for k, v in cv_results.items():
    print(f"{k}: {v:.4f}")

print("\n===== LightGBM Test Results =====")
for k, v in test_results.items():
    print(f"{k}: {v:.4f}")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.036346 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4888
[LightGBM] [Info] Number of data points in the train set: 1098681, number of used features: 32
[LightGBM] [Info] Start training from score 385717.968534
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.030507 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4887
[LightGBM] [Info] Number of data points in the train set: 1098681, number of used features: 32
[LightGBM] [Info] Start training from score 385725.474346
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.017223 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4891
[LightGBM] [Info] Number of data points in

In [20]:
#LightGBM parameters
lgbm_reg = LGBMRegressor(
    n_estimators=1200,
    learning_rate=0.02,
    num_leaves=60,
    lambda_l1=2.0,
    lambda_l2=10.0,
    min_child_samples=350,
    subsample=0.5,
    subsample_freq=1,
    colsample_bytree=0.8,
    metric='rmse',
    n_jobs=-1,
    random_state=42
)

#5-fold cross validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

cv_r2 = []
cv_mae = []
cv_rmse = []
cv_mape = []
cv_mase = []
cv_rmsle = []

for train_idx, val_idx in kf.split(X_train):
    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
    lgbm_reg.fit(X_tr, y_tr)
    y_val_pred = lgbm_reg.predict(X_val)
    r2, mae, rmse, mape, mase, rmsle = compute_metrics(y_val, y_val_pred, baseline_mae_train)
    cv_r2.append(r2)
    cv_mae.append(mae)
    cv_rmse.append(rmse)
    cv_mape.append(mape)
    cv_mase.append(mase)
    cv_rmsle.append(rmsle)

cv_results = {
    "R2":    np.mean(cv_r2),
    "MAE":   np.mean(cv_mae),
    "RMSE":  np.mean(cv_rmse),
    "MAPE":  np.mean(cv_mape),
    "MASE":  np.mean(cv_mase),
    "RMSLE": np.mean(cv_rmsle)
}

lgbm_reg.fit(X_train, y_train)

y_test_pred = lgbm_reg.predict(X_test)

r2_te, mae_te, rmse_te, mape_te, mase_te, rmsle_te = compute_metrics(
    y_test, y_test_pred, baseline_mae_test
)

test_results = {
    "R2":    r2_te,
    "MAE":   mae_te,
    "RMSE":  rmse_te,
    "MAPE":  mape_te,
    "MASE":  mase_te,
    "RMSLE": rmsle_te
}

print("\n===== LightGBM CV Results =====")
for k, v in cv_results.items():
    print(f"{k}: {v:.4f}")

print("\n===== LightGBM Test Results =====")
for k, v in test_results.items():
    print(f"{k}: {v:.4f}")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.034961 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4888
[LightGBM] [Info] Number of data points in the train set: 1098681, number of used features: 32
[LightGBM] [Info] Start training from score 385717.968534
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.017302 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4887
[LightGBM] [Info] Number of data points in the train set: 1098681, number of used features: 32
[LightGBM] [Info] Start training from score 385725.474346
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.016340 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [

In [21]:
#LightGBM parameters
lgbm_reg = LGBMRegressor(
    n_estimators=1200,
    learning_rate=0.02,
    num_leaves=60,
    lambda_l1=2.0,
    lambda_l2=10.0,
    min_child_samples=350,
    subsample=0.9,
    subsample_freq=1,
    colsample_bytree=0.7,
    metric='rmse',
    n_jobs=-1,
    random_state=42
)

#5-fold cross validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

cv_r2 = []
cv_mae = []
cv_rmse = []
cv_mape = []
cv_mase = []
cv_rmsle = []

for train_idx, val_idx in kf.split(X_train):
    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
    lgbm_reg.fit(X_tr, y_tr)
    y_val_pred = lgbm_reg.predict(X_val)
    r2, mae, rmse, mape, mase, rmsle = compute_metrics(y_val, y_val_pred, baseline_mae_train)
    cv_r2.append(r2)
    cv_mae.append(mae)
    cv_rmse.append(rmse)
    cv_mape.append(mape)
    cv_mase.append(mase)
    cv_rmsle.append(rmsle)

cv_results = {
    "R2":    np.mean(cv_r2),
    "MAE":   np.mean(cv_mae),
    "RMSE":  np.mean(cv_rmse),
    "MAPE":  np.mean(cv_mape),
    "MASE":  np.mean(cv_mase),
    "RMSLE": np.mean(cv_rmsle)
}

lgbm_reg.fit(X_train, y_train)

y_test_pred = lgbm_reg.predict(X_test)

r2_te, mae_te, rmse_te, mape_te, mase_te, rmsle_te = compute_metrics(
    y_test, y_test_pred, baseline_mae_test
)

test_results = {
    "R2":    r2_te,
    "MAE":   mae_te,
    "RMSE":  rmse_te,
    "MAPE":  mape_te,
    "MASE":  mase_te,
    "RMSLE": rmsle_te
}

print("\n===== LightGBM CV Results =====")
for k, v in cv_results.items():
    print(f"{k}: {v:.4f}")

print("\n===== LightGBM Test Results =====")
for k, v in test_results.items():
    print(f"{k}: {v:.4f}")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.029830 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4888
[LightGBM] [Info] Number of data points in the train set: 1098681, number of used features: 32
[LightGBM] [Info] Start training from score 385717.968534
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.031986 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4887
[LightGBM] [Info] Number of data points in the train set: 1098681, number of used features: 32
[LightGBM] [Info] Start training from score 385725.474346
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.031354 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4891
[LightGBM] [Info] Number of data points in the train set: 1098682, number of used features: 32
[LightGBM] 

In [22]:
#LightGBM parameters
lgbm_reg = LGBMRegressor(
    n_estimators=1200,
    learning_rate=0.02,
    num_leaves=60,
    lambda_l1=2.0,
    lambda_l2=10.0,
    min_child_samples=350,
    subsample=1,
    subsample_freq=1,
    colsample_bytree=0.7,
    metric='rmse',
    n_jobs=-1,
    random_state=42
)

#5-fold cross validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

cv_r2 = []
cv_mae = []
cv_rmse = []
cv_mape = []
cv_mase = []
cv_rmsle = []

for train_idx, val_idx in kf.split(X_train):
    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
    lgbm_reg.fit(X_tr, y_tr)
    y_val_pred = lgbm_reg.predict(X_val)
    r2, mae, rmse, mape, mase, rmsle = compute_metrics(y_val, y_val_pred, baseline_mae_train)
    cv_r2.append(r2)
    cv_mae.append(mae)
    cv_rmse.append(rmse)
    cv_mape.append(mape)
    cv_mase.append(mase)
    cv_rmsle.append(rmsle)

cv_results = {
    "R2":    np.mean(cv_r2),
    "MAE":   np.mean(cv_mae),
    "RMSE":  np.mean(cv_rmse),
    "MAPE":  np.mean(cv_mape),
    "MASE":  np.mean(cv_mase),
    "RMSLE": np.mean(cv_rmsle)
}

lgbm_reg.fit(X_train, y_train)

y_test_pred = lgbm_reg.predict(X_test)

r2_te, mae_te, rmse_te, mape_te, mase_te, rmsle_te = compute_metrics(
    y_test, y_test_pred, baseline_mae_test
)

test_results = {
    "R2":    r2_te,
    "MAE":   mae_te,
    "RMSE":  rmse_te,
    "MAPE":  mape_te,
    "MASE":  mase_te,
    "RMSLE": rmsle_te
}

print("\n===== LightGBM CV Results =====")
for k, v in cv_results.items():
    print(f"{k}: {v:.4f}")

print("\n===== LightGBM Test Results =====")
for k, v in test_results.items():
    print(f"{k}: {v:.4f}")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.033160 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4888
[LightGBM] [Info] Number of data points in the train set: 1098681, number of used features: 32
[LightGBM] [Info] Start training from score 385717.968534
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.030100 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4887
[LightGBM] [Info] Number of data points in the train set: 1098681, number of used features: 32
[LightGBM] [Info] Start training from score 385725.474346
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.035483 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4891
[LightGBM] [Info] Number of data points in the train set: 1098682, number of used features: 32
[LightGBM] 

<h1>Find out subsample=1 is the best among 0.5,0.6,0.7,0.8,0.9,1.0</h1>

In [23]:
#LightGBM parameters
lgbm_reg = LGBMRegressor(
    n_estimators=1200,
    learning_rate=0.02,
    num_leaves=60,
    lambda_l1=2.0,
    lambda_l2=10.0,
    min_child_samples=350,
    subsample=1,
    subsample_freq=1,
    colsample_bytree=0.6,
    metric='rmse',
    n_jobs=-1,
    random_state=42
)

#5-fold cross validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

cv_r2 = []
cv_mae = []
cv_rmse = []
cv_mape = []
cv_mase = []
cv_rmsle = []

for train_idx, val_idx in kf.split(X_train):
    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
    lgbm_reg.fit(X_tr, y_tr)
    y_val_pred = lgbm_reg.predict(X_val)
    r2, mae, rmse, mape, mase, rmsle = compute_metrics(y_val, y_val_pred, baseline_mae_train)
    cv_r2.append(r2)
    cv_mae.append(mae)
    cv_rmse.append(rmse)
    cv_mape.append(mape)
    cv_mase.append(mase)
    cv_rmsle.append(rmsle)

cv_results = {
    "R2":    np.mean(cv_r2),
    "MAE":   np.mean(cv_mae),
    "RMSE":  np.mean(cv_rmse),
    "MAPE":  np.mean(cv_mape),
    "MASE":  np.mean(cv_mase),
    "RMSLE": np.mean(cv_rmsle)
}

lgbm_reg.fit(X_train, y_train)

y_test_pred = lgbm_reg.predict(X_test)

r2_te, mae_te, rmse_te, mape_te, mase_te, rmsle_te = compute_metrics(
    y_test, y_test_pred, baseline_mae_test
)

test_results = {
    "R2":    r2_te,
    "MAE":   mae_te,
    "RMSE":  rmse_te,
    "MAPE":  mape_te,
    "MASE":  mase_te,
    "RMSLE": rmsle_te
}

print("\n===== LightGBM CV Results =====")
for k, v in cv_results.items():
    print(f"{k}: {v:.4f}")

print("\n===== LightGBM Test Results =====")
for k, v in test_results.items():
    print(f"{k}: {v:.4f}")


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.015633 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4888
[LightGBM] [Info] Number of data points in the train set: 1098681, number of used features: 32
[LightGBM] [Info] Start training from score 385717.968534
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.014138 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4887
[LightGBM] [Info] Number of data points in the train set: 1098681, number of used features: 32
[LightGBM] [Info] Start training from score 385725.474346
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.014400 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memor

In [24]:
#LightGBM parameters
lgbm_reg = LGBMRegressor(
    n_estimators=1200,
    learning_rate=0.02,
    num_leaves=60,
    lambda_l1=2.0,
    lambda_l2=10.0,
    min_child_samples=350,
    subsample=1,
    subsample_freq=1,
    colsample_bytree=0.8,
    metric='rmse',
    n_jobs=-1,
    random_state=42
)

#5-fold cross validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

cv_r2 = []
cv_mae = []
cv_rmse = []
cv_mape = []
cv_mase = []
cv_rmsle = []

for train_idx, val_idx in kf.split(X_train):
    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
    lgbm_reg.fit(X_tr, y_tr)
    y_val_pred = lgbm_reg.predict(X_val)
    r2, mae, rmse, mape, mase, rmsle = compute_metrics(y_val, y_val_pred, baseline_mae_train)
    cv_r2.append(r2)
    cv_mae.append(mae)
    cv_rmse.append(rmse)
    cv_mape.append(mape)
    cv_mase.append(mase)
    cv_rmsle.append(rmsle)

cv_results = {
    "R2":    np.mean(cv_r2),
    "MAE":   np.mean(cv_mae),
    "RMSE":  np.mean(cv_rmse),
    "MAPE":  np.mean(cv_mape),
    "MASE":  np.mean(cv_mase),
    "RMSLE": np.mean(cv_rmsle)
}

lgbm_reg.fit(X_train, y_train)

y_test_pred = lgbm_reg.predict(X_test)

r2_te, mae_te, rmse_te, mape_te, mase_te, rmsle_te = compute_metrics(
    y_test, y_test_pred, baseline_mae_test
)

test_results = {
    "R2":    r2_te,
    "MAE":   mae_te,
    "RMSE":  rmse_te,
    "MAPE":  mape_te,
    "MASE":  mase_te,
    "RMSLE": rmsle_te
}

print("\n===== LightGBM CV Results =====")
for k, v in cv_results.items():
    print(f"{k}: {v:.4f}")

print("\n===== LightGBM Test Results =====")
for k, v in test_results.items():
    print(f"{k}: {v:.4f}")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.031312 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4888
[LightGBM] [Info] Number of data points in the train set: 1098681, number of used features: 32
[LightGBM] [Info] Start training from score 385717.968534
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.032352 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4887
[LightGBM] [Info] Number of data points in the train set: 1098681, number of used features: 32
[LightGBM] [Info] Start training from score 385725.474346
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.015542 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4891
[LightGBM] [Info] Number of data points in

In [25]:
#LightGBM parameters
lgbm_reg = LGBMRegressor(
    n_estimators=1200,
    learning_rate=0.02,
    num_leaves=60,
    lambda_l1=2.0,
    lambda_l2=10.0,
    min_child_samples=350,
    subsample=1,
    subsample_freq=1,
    colsample_bytree=0.9,
    metric='rmse',
    n_jobs=-1,
    random_state=42
)

#5-fold cross validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

cv_r2 = []
cv_mae = []
cv_rmse = []
cv_mape = []
cv_mase = []
cv_rmsle = []

for train_idx, val_idx in kf.split(X_train):
    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
    lgbm_reg.fit(X_tr, y_tr)
    y_val_pred = lgbm_reg.predict(X_val)
    r2, mae, rmse, mape, mase, rmsle = compute_metrics(y_val, y_val_pred, baseline_mae_train)
    cv_r2.append(r2)
    cv_mae.append(mae)
    cv_rmse.append(rmse)
    cv_mape.append(mape)
    cv_mase.append(mase)
    cv_rmsle.append(rmsle)

cv_results = {
    "R2":    np.mean(cv_r2),
    "MAE":   np.mean(cv_mae),
    "RMSE":  np.mean(cv_rmse),
    "MAPE":  np.mean(cv_mape),
    "MASE":  np.mean(cv_mase),
    "RMSLE": np.mean(cv_rmsle)
}

lgbm_reg.fit(X_train, y_train)

y_test_pred = lgbm_reg.predict(X_test)

r2_te, mae_te, rmse_te, mape_te, mase_te, rmsle_te = compute_metrics(
    y_test, y_test_pred, baseline_mae_test
)

test_results = {
    "R2":    r2_te,
    "MAE":   mae_te,
    "RMSE":  rmse_te,
    "MAPE":  mape_te,
    "MASE":  mase_te,
    "RMSLE": rmsle_te
}

print("\n===== LightGBM CV Results =====")
for k, v in cv_results.items():
    print(f"{k}: {v:.4f}")

print("\n===== LightGBM Test Results =====")
for k, v in test_results.items():
    print(f"{k}: {v:.4f}")


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.017979 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4888
[LightGBM] [Info] Number of data points in the train set: 1098681, number of used features: 32
[LightGBM] [Info] Start training from score 385717.968534
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.033828 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4887
[LightGBM] [Info] Number of data points in the train set: 1098681, number of used features: 32
[LightGBM] [Info] Start training from score 385725.474346
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.037585 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4891
[LightGBM] [Info] Number of data points in

In [26]:
#LightGBM parameters
lgbm_reg = LGBMRegressor(
    n_estimators=1200,
    learning_rate=0.02,
    num_leaves=60,
    lambda_l1=2.0,
    lambda_l2=10.0,
    min_child_samples=350,
    subsample=1,
    subsample_freq=1,
    colsample_bytree=1,
    metric='rmse',
    n_jobs=-1,
    random_state=42
)

#5-fold cross validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

cv_r2 = []
cv_mae = []
cv_rmse = []
cv_mape = []
cv_mase = []
cv_rmsle = []

for train_idx, val_idx in kf.split(X_train):
    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
    lgbm_reg.fit(X_tr, y_tr)
    y_val_pred = lgbm_reg.predict(X_val)
    r2, mae, rmse, mape, mase, rmsle = compute_metrics(y_val, y_val_pred, baseline_mae_train)
    cv_r2.append(r2)
    cv_mae.append(mae)
    cv_rmse.append(rmse)
    cv_mape.append(mape)
    cv_mase.append(mase)
    cv_rmsle.append(rmsle)

cv_results = {
    "R2":    np.mean(cv_r2),
    "MAE":   np.mean(cv_mae),
    "RMSE":  np.mean(cv_rmse),
    "MAPE":  np.mean(cv_mape),
    "MASE":  np.mean(cv_mase),
    "RMSLE": np.mean(cv_rmsle)
}

lgbm_reg.fit(X_train, y_train)

y_test_pred = lgbm_reg.predict(X_test)

r2_te, mae_te, rmse_te, mape_te, mase_te, rmsle_te = compute_metrics(
    y_test, y_test_pred, baseline_mae_test
)

test_results = {
    "R2":    r2_te,
    "MAE":   mae_te,
    "RMSE":  rmse_te,
    "MAPE":  mape_te,
    "MASE":  mase_te,
    "RMSLE": rmsle_te
}

print("\n===== LightGBM CV Results =====")
for k, v in cv_results.items():
    print(f"{k}: {v:.4f}")

print("\n===== LightGBM Test Results =====")
for k, v in test_results.items():
    print(f"{k}: {v:.4f}")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.031575 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4888
[LightGBM] [Info] Number of data points in the train set: 1098681, number of used features: 32
[LightGBM] [Info] Start training from score 385717.968534
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.033587 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4887
[LightGBM] [Info] Number of data points in the train set: 1098681, number of used features: 32
[LightGBM] [Info] Start training from score 385725.474346
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.035440 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4891
[LightGBM] [Info] Number of data points in the train set: 1098682, number of used features: 32
[LightGBM] 

In [27]:
#LightGBM parameters
lgbm_reg = LGBMRegressor(
    n_estimators=1200,
    learning_rate=0.02,
    num_leaves=60,
    lambda_l1=2.0,
    lambda_l2=10.0,
    min_child_samples=350,
    subsample=1,
    subsample_freq=1,
    colsample_bytree=0.5,
    metric='rmse',
    n_jobs=-1,
    random_state=42
)

#5-fold cross validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

cv_r2 = []
cv_mae = []
cv_rmse = []
cv_mape = []
cv_mase = []
cv_rmsle = []

for train_idx, val_idx in kf.split(X_train):
    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
    lgbm_reg.fit(X_tr, y_tr)
    y_val_pred = lgbm_reg.predict(X_val)
    r2, mae, rmse, mape, mase, rmsle = compute_metrics(y_val, y_val_pred, baseline_mae_train)
    cv_r2.append(r2)
    cv_mae.append(mae)
    cv_rmse.append(rmse)
    cv_mape.append(mape)
    cv_mase.append(mase)
    cv_rmsle.append(rmsle)

cv_results = {
    "R2":    np.mean(cv_r2),
    "MAE":   np.mean(cv_mae),
    "RMSE":  np.mean(cv_rmse),
    "MAPE":  np.mean(cv_mape),
    "MASE":  np.mean(cv_mase),
    "RMSLE": np.mean(cv_rmsle)
}

lgbm_reg.fit(X_train, y_train)

y_test_pred = lgbm_reg.predict(X_test)

r2_te, mae_te, rmse_te, mape_te, mase_te, rmsle_te = compute_metrics(
    y_test, y_test_pred, baseline_mae_test
)

test_results = {
    "R2":    r2_te,
    "MAE":   mae_te,
    "RMSE":  rmse_te,
    "MAPE":  mape_te,
    "MASE":  mase_te,
    "RMSLE": rmsle_te
}

print("\n===== LightGBM CV Results =====")
for k, v in cv_results.items():
    print(f"{k}: {v:.4f}")

print("\n===== LightGBM Test Results =====")
for k, v in test_results.items():
    print(f"{k}: {v:.4f}")


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.014654 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4888
[LightGBM] [Info] Number of data points in the train set: 1098681, number of used features: 32
[LightGBM] [Info] Start training from score 385717.968534
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.017329 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4887
[LightGBM] [Info] Number of data points in the train set: 1098681, number of used features: 32
[LightGBM] [Info] Start training from score 385725.474346
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.015773 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memor

<h1>Find out colsample_bytree=0.6 is the best among 0.5,0.6,0.7,0.8,0.9,1.0</h1>