In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import KFold
import warnings
warnings.filterwarnings('ignore')

# Read the feature table (edit the path if the CSV lives elsewhere).
df = pd.read_csv('feature_enhanced.csv')

# Target is the 'taxvaluedollarcnt' column; every remaining column is a feature.
y = df['taxvaluedollarcnt']
X = df.drop(columns='taxvaluedollarcnt')

# Hold out 20% of the rows as a test split, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)
print("Train set size:", X_train.shape, "Test set size:", X_test.shape)

# Mean-predictor baseline: always predict the training-split mean of the target.
train_mean_value = y_train.mean()
baseline_pred_train = np.full(y_train.shape[0], train_mean_value)
baseline_pred_test = np.full(y_test.shape[0], train_mean_value)

# MAE of that constant prediction on each split; passed to compute_metrics
# below as the MASE denominator.
baseline_mae_train = mean_absolute_error(y_train, baseline_pred_train)
baseline_mae_test = mean_absolute_error(y_test, baseline_pred_test)

# Evaluation metric function
def compute_metrics(y_true, y_pred, baseline_mae):
    """Compute a panel of regression metrics for one set of predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted values, in the same positional order as ``y_true``.
    baseline_mae : float
        MAE of a reference predictor; used as the MASE denominator.

    Returns
    -------
    tuple
        ``(r2, mae, rmse, mape, mase, rmsle)``.

        * ``mape`` is a percentage computed only over rows whose true value
          is non-zero, and is NaN when there are no such rows.
        * ``mase`` is ``mae / baseline_mae``, or NaN when ``baseline_mae`` is 0.
        * ``rmsle`` clips negative true and predicted values to 0 before
          applying ``log1p``.
    """
    # Work on plain positional arrays. Mixing a pandas boolean mask with a
    # differently-indexed Series (or a list) would otherwise raise or silently
    # pair predictions with the wrong targets.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    mae_val = mean_absolute_error(y_true, y_pred)
    mse_val = mean_squared_error(y_true, y_pred)
    rmse_val = np.sqrt(mse_val)

    # Rows with a zero target are excluded from MAPE to avoid dividing by zero.
    mask = y_true != 0
    if mask.any():
        mape_val = np.mean(np.abs((y_pred[mask] - y_true[mask]) / y_true[mask])) * 100
    else:
        # No usable rows: report NaN explicitly instead of averaging nothing.
        mape_val = np.nan

    # log1p is undefined below -1, so negative values are floored at 0 first.
    y_pred_clip = np.where(y_pred < 0, 0, y_pred)
    y_true_clip = np.where(y_true < 0, 0, y_true)
    msle_val = mean_squared_error(np.log1p(y_true_clip), np.log1p(y_pred_clip))
    rmsle_val = np.sqrt(msle_val)

    r2_val = r2_score(y_true, y_pred)
    mase_val = mae_val / baseline_mae if baseline_mae != 0 else np.nan
    return r2_val, mae_val, rmse_val, mape_val, mase_val, rmsle_val

Train set size: (1373352, 32) Test set size: (343339, 32)


In [4]:
# XGBoost run: n_estimators=400, max_depth=3, min_child_weight=100.
xgb_reg = XGBRegressor(
    n_estimators=400, learning_rate=0.10,
    max_depth=3, min_child_weight=100,
    subsample=0.6, colsample_bytree=0.6,
    reg_lambda=10, reg_alpha=0.5,
    tree_method='hist', n_jobs=4, random_state=42,
)

# Shuffled 5-fold splitter applied to the training split only.
kf = KFold(shuffle=True, n_splits=5, random_state=42)

# Metric labels, in the order compute_metrics returns its values.
metric_names = ("R2", "MAE", "RMSE", "MAPE", "MASE", "RMSLE")

# One accumulator list per metric; each receives one score per fold.
cv_r2, cv_mae, cv_rmse, cv_mape, cv_mase, cv_rmsle = [], [], [], [], [], []
cv_score_lists = (cv_r2, cv_mae, cv_rmse, cv_mape, cv_mase, cv_rmsle)

for train_idx, val_idx in kf.split(X_train):
    # Slice this fold's fitting and validation rows by position.
    X_tr, y_tr = X_train.iloc[train_idx], y_train.iloc[train_idx]
    X_val, y_val = X_train.iloc[val_idx], y_train.iloc[val_idx]

    xgb_reg.fit(X_tr, y_tr)
    y_val_pred = xgb_reg.predict(X_val)

    # MASE is scaled by the mean-predictor MAE of the whole training split.
    r2, mae, rmse, mape, mase, rmsle = compute_metrics(
        y_val, y_val_pred, baseline_mae_train
    )
    for fold_scores, fold_value in zip(cv_score_lists, (r2, mae, rmse, mape, mase, rmsle)):
        fold_scores.append(fold_value)

# Average every metric over the folds.
cv_results = {
    label: np.mean(scores)
    for label, scores in zip(metric_names, cv_score_lists)
}

# Refit on the complete training split, then score the held-out test split.
xgb_reg.fit(X_train, y_train)
y_test_pred_xgb = xgb_reg.predict(X_test)
r2_te, mae_te, rmse_te, mape_te, mase_te, rmsle_te = compute_metrics(
    y_test, y_test_pred_xgb, baseline_mae_test
)
test_results = dict(zip(
    metric_names,
    (r2_te, mae_te, rmse_te, mape_te, mase_te, rmsle_te),
))

# Print both result tables with four decimals.
for banner, table in (
    ("\n===== XGBoost CV Results =====", cv_results),
    ("\n===== XGBoost Test Results =====", test_results),
):
    print(banner)
    for k, v in table.items():
        print(f"{k}: {v:.4f}")



===== XGBoost CV Results =====
R2: 0.6054
MAE: 154806.6761
RMSE: 268143.2136
MAPE: 74.2822
MASE: 0.6393
RMSLE: 0.6613

===== XGBoost Test Results =====
R2: 0.6080
MAE: 154767.7234
RMSE: 267684.5943
MAPE: 74.0665
MASE: 0.6372
RMSLE: 0.6603


In [5]:
# XGBoost run: n_estimators=400, max_depth=4, min_child_weight=100.
xgb_reg = XGBRegressor(
    n_estimators=400, learning_rate=0.10,
    max_depth=4, min_child_weight=100,
    subsample=0.6, colsample_bytree=0.6,
    reg_lambda=10, reg_alpha=0.5,
    tree_method='hist', n_jobs=4, random_state=42,
)

# Shuffled 5-fold splitter applied to the training split only.
kf = KFold(shuffle=True, n_splits=5, random_state=42)

# Metric labels, in the order compute_metrics returns its values.
metric_names = ("R2", "MAE", "RMSE", "MAPE", "MASE", "RMSLE")

# One accumulator list per metric; each receives one score per fold.
cv_r2, cv_mae, cv_rmse, cv_mape, cv_mase, cv_rmsle = [], [], [], [], [], []
cv_score_lists = (cv_r2, cv_mae, cv_rmse, cv_mape, cv_mase, cv_rmsle)

for train_idx, val_idx in kf.split(X_train):
    # Slice this fold's fitting and validation rows by position.
    X_tr, y_tr = X_train.iloc[train_idx], y_train.iloc[train_idx]
    X_val, y_val = X_train.iloc[val_idx], y_train.iloc[val_idx]

    xgb_reg.fit(X_tr, y_tr)
    y_val_pred = xgb_reg.predict(X_val)

    # MASE is scaled by the mean-predictor MAE of the whole training split.
    r2, mae, rmse, mape, mase, rmsle = compute_metrics(
        y_val, y_val_pred, baseline_mae_train
    )
    for fold_scores, fold_value in zip(cv_score_lists, (r2, mae, rmse, mape, mase, rmsle)):
        fold_scores.append(fold_value)

# Average every metric over the folds.
cv_results = {
    label: np.mean(scores)
    for label, scores in zip(metric_names, cv_score_lists)
}

# Refit on the complete training split, then score the held-out test split.
xgb_reg.fit(X_train, y_train)
y_test_pred_xgb = xgb_reg.predict(X_test)
r2_te, mae_te, rmse_te, mape_te, mase_te, rmsle_te = compute_metrics(
    y_test, y_test_pred_xgb, baseline_mae_test
)
test_results = dict(zip(
    metric_names,
    (r2_te, mae_te, rmse_te, mape_te, mase_te, rmsle_te),
))

# Print both result tables with four decimals.
for banner, table in (
    ("\n===== XGBoost CV Results =====", cv_results),
    ("\n===== XGBoost Test Results =====", test_results),
):
    print(banner)
    for k, v in table.items():
        print(f"{k}: {v:.4f}")



===== XGBoost CV Results =====
R2: 0.6105
MAE: 153975.6794
RMSE: 266421.6272
MAPE: 73.9850
MASE: 0.6359
RMSLE: 0.6571

===== XGBoost Test Results =====
R2: 0.6140
MAE: 153882.5540
RMSE: 265615.0543
MAPE: 73.7619
MASE: 0.6336
RMSLE: 0.6545


In [6]:
# XGBoost run: n_estimators=400, max_depth=5, min_child_weight=100.
xgb_reg = XGBRegressor(
    n_estimators=400, learning_rate=0.10,
    max_depth=5, min_child_weight=100,
    subsample=0.6, colsample_bytree=0.6,
    reg_lambda=10, reg_alpha=0.5,
    tree_method='hist', n_jobs=4, random_state=42,
)

# Shuffled 5-fold splitter applied to the training split only.
kf = KFold(shuffle=True, n_splits=5, random_state=42)

# Metric labels, in the order compute_metrics returns its values.
metric_names = ("R2", "MAE", "RMSE", "MAPE", "MASE", "RMSLE")

# One accumulator list per metric; each receives one score per fold.
cv_r2, cv_mae, cv_rmse, cv_mape, cv_mase, cv_rmsle = [], [], [], [], [], []
cv_score_lists = (cv_r2, cv_mae, cv_rmse, cv_mape, cv_mase, cv_rmsle)

for train_idx, val_idx in kf.split(X_train):
    # Slice this fold's fitting and validation rows by position.
    X_tr, y_tr = X_train.iloc[train_idx], y_train.iloc[train_idx]
    X_val, y_val = X_train.iloc[val_idx], y_train.iloc[val_idx]

    xgb_reg.fit(X_tr, y_tr)
    y_val_pred = xgb_reg.predict(X_val)

    # MASE is scaled by the mean-predictor MAE of the whole training split.
    r2, mae, rmse, mape, mase, rmsle = compute_metrics(
        y_val, y_val_pred, baseline_mae_train
    )
    for fold_scores, fold_value in zip(cv_score_lists, (r2, mae, rmse, mape, mase, rmsle)):
        fold_scores.append(fold_value)

# Average every metric over the folds.
cv_results = {
    label: np.mean(scores)
    for label, scores in zip(metric_names, cv_score_lists)
}

# Refit on the complete training split, then score the held-out test split.
xgb_reg.fit(X_train, y_train)
y_test_pred_xgb = xgb_reg.predict(X_test)
r2_te, mae_te, rmse_te, mape_te, mase_te, rmsle_te = compute_metrics(
    y_test, y_test_pred_xgb, baseline_mae_test
)
test_results = dict(zip(
    metric_names,
    (r2_te, mae_te, rmse_te, mape_te, mase_te, rmsle_te),
))

# Print both result tables with four decimals.
for banner, table in (
    ("\n===== XGBoost CV Results =====", cv_results),
    ("\n===== XGBoost Test Results =====", test_results),
):
    print(banner)
    for k, v in table.items():
        print(f"{k}: {v:.4f}")



===== XGBoost CV Results =====
R2: 0.6122
MAE: 153533.5965
RMSE: 265839.7112
MAPE: 73.7528
MASE: 0.6341
RMSLE: 0.6512

===== XGBoost Test Results =====
R2: 0.6169
MAE: 153417.8342
RMSE: 264637.7618
MAPE: 73.4945
MASE: 0.6317
RMSLE: 0.6517


In [7]:
# XGBoost run: n_estimators=400, max_depth=6, min_child_weight=100.
xgb_reg = XGBRegressor(
    n_estimators=400, learning_rate=0.10,
    max_depth=6, min_child_weight=100,
    subsample=0.6, colsample_bytree=0.6,
    reg_lambda=10, reg_alpha=0.5,
    tree_method='hist', n_jobs=4, random_state=42,
)

# Shuffled 5-fold splitter applied to the training split only.
kf = KFold(shuffle=True, n_splits=5, random_state=42)

# Metric labels, in the order compute_metrics returns its values.
metric_names = ("R2", "MAE", "RMSE", "MAPE", "MASE", "RMSLE")

# One accumulator list per metric; each receives one score per fold.
cv_r2, cv_mae, cv_rmse, cv_mape, cv_mase, cv_rmsle = [], [], [], [], [], []
cv_score_lists = (cv_r2, cv_mae, cv_rmse, cv_mape, cv_mase, cv_rmsle)

for train_idx, val_idx in kf.split(X_train):
    # Slice this fold's fitting and validation rows by position.
    X_tr, y_tr = X_train.iloc[train_idx], y_train.iloc[train_idx]
    X_val, y_val = X_train.iloc[val_idx], y_train.iloc[val_idx]

    xgb_reg.fit(X_tr, y_tr)
    y_val_pred = xgb_reg.predict(X_val)

    # MASE is scaled by the mean-predictor MAE of the whole training split.
    r2, mae, rmse, mape, mase, rmsle = compute_metrics(
        y_val, y_val_pred, baseline_mae_train
    )
    for fold_scores, fold_value in zip(cv_score_lists, (r2, mae, rmse, mape, mase, rmsle)):
        fold_scores.append(fold_value)

# Average every metric over the folds.
cv_results = {
    label: np.mean(scores)
    for label, scores in zip(metric_names, cv_score_lists)
}

# Refit on the complete training split, then score the held-out test split.
xgb_reg.fit(X_train, y_train)
y_test_pred_xgb = xgb_reg.predict(X_test)
r2_te, mae_te, rmse_te, mape_te, mase_te, rmsle_te = compute_metrics(
    y_test, y_test_pred_xgb, baseline_mae_test
)
test_results = dict(zip(
    metric_names,
    (r2_te, mae_te, rmse_te, mape_te, mase_te, rmsle_te),
))

# Print both result tables with four decimals.
for banner, table in (
    ("\n===== XGBoost CV Results =====", cv_results),
    ("\n===== XGBoost Test Results =====", test_results),
):
    print(banner)
    for k, v in table.items():
        print(f"{k}: {v:.4f}")



===== XGBoost CV Results =====
R2: 0.6129
MAE: 153188.9947
RMSE: 265599.0741
MAPE: 73.6000
MASE: 0.6326
RMSLE: 0.6505

===== XGBoost Test Results =====
R2: 0.6163
MAE: 153130.3989
RMSE: 264817.7169
MAPE: 73.3474
MASE: 0.6305
RMSLE: 0.6492


In [8]:
# XGBoost run: n_estimators=400, max_depth=7, min_child_weight=100.
xgb_reg = XGBRegressor(
    n_estimators=400, learning_rate=0.10,
    max_depth=7, min_child_weight=100,
    subsample=0.6, colsample_bytree=0.6,
    reg_lambda=10, reg_alpha=0.5,
    tree_method='hist', n_jobs=4, random_state=42,
)

# Shuffled 5-fold splitter applied to the training split only.
kf = KFold(shuffle=True, n_splits=5, random_state=42)

# Metric labels, in the order compute_metrics returns its values.
metric_names = ("R2", "MAE", "RMSE", "MAPE", "MASE", "RMSLE")

# One accumulator list per metric; each receives one score per fold.
cv_r2, cv_mae, cv_rmse, cv_mape, cv_mase, cv_rmsle = [], [], [], [], [], []
cv_score_lists = (cv_r2, cv_mae, cv_rmse, cv_mape, cv_mase, cv_rmsle)

for train_idx, val_idx in kf.split(X_train):
    # Slice this fold's fitting and validation rows by position.
    X_tr, y_tr = X_train.iloc[train_idx], y_train.iloc[train_idx]
    X_val, y_val = X_train.iloc[val_idx], y_train.iloc[val_idx]

    xgb_reg.fit(X_tr, y_tr)
    y_val_pred = xgb_reg.predict(X_val)

    # MASE is scaled by the mean-predictor MAE of the whole training split.
    r2, mae, rmse, mape, mase, rmsle = compute_metrics(
        y_val, y_val_pred, baseline_mae_train
    )
    for fold_scores, fold_value in zip(cv_score_lists, (r2, mae, rmse, mape, mase, rmsle)):
        fold_scores.append(fold_value)

# Average every metric over the folds.
cv_results = {
    label: np.mean(scores)
    for label, scores in zip(metric_names, cv_score_lists)
}

# Refit on the complete training split, then score the held-out test split.
xgb_reg.fit(X_train, y_train)
y_test_pred_xgb = xgb_reg.predict(X_test)
r2_te, mae_te, rmse_te, mape_te, mase_te, rmsle_te = compute_metrics(
    y_test, y_test_pred_xgb, baseline_mae_test
)
test_results = dict(zip(
    metric_names,
    (r2_te, mae_te, rmse_te, mape_te, mase_te, rmsle_te),
))

# Print both result tables with four decimals.
for banner, table in (
    ("\n===== XGBoost CV Results =====", cv_results),
    ("\n===== XGBoost Test Results =====", test_results),
):
    print(banner)
    for k, v in table.items():
        print(f"{k}: {v:.4f}")



===== XGBoost CV Results =====
R2: 0.6123
MAE: 152928.9154
RMSE: 265796.0971
MAPE: 73.4038
MASE: 0.6316
RMSLE: 0.6501

===== XGBoost Test Results =====
R2: 0.6174
MAE: 152769.4566
RMSE: 264448.5363
MAPE: 73.1736
MASE: 0.6290
RMSLE: 0.6489


In [9]:
# XGBoost run: n_estimators=400, max_depth=8, min_child_weight=100.
xgb_reg = XGBRegressor(
    n_estimators=400, learning_rate=0.10,
    max_depth=8, min_child_weight=100,
    subsample=0.6, colsample_bytree=0.6,
    reg_lambda=10, reg_alpha=0.5,
    tree_method='hist', n_jobs=4, random_state=42,
)

# Shuffled 5-fold splitter applied to the training split only.
kf = KFold(shuffle=True, n_splits=5, random_state=42)

# Metric labels, in the order compute_metrics returns its values.
metric_names = ("R2", "MAE", "RMSE", "MAPE", "MASE", "RMSLE")

# One accumulator list per metric; each receives one score per fold.
cv_r2, cv_mae, cv_rmse, cv_mape, cv_mase, cv_rmsle = [], [], [], [], [], []
cv_score_lists = (cv_r2, cv_mae, cv_rmse, cv_mape, cv_mase, cv_rmsle)

for train_idx, val_idx in kf.split(X_train):
    # Slice this fold's fitting and validation rows by position.
    X_tr, y_tr = X_train.iloc[train_idx], y_train.iloc[train_idx]
    X_val, y_val = X_train.iloc[val_idx], y_train.iloc[val_idx]

    xgb_reg.fit(X_tr, y_tr)
    y_val_pred = xgb_reg.predict(X_val)

    # MASE is scaled by the mean-predictor MAE of the whole training split.
    r2, mae, rmse, mape, mase, rmsle = compute_metrics(
        y_val, y_val_pred, baseline_mae_train
    )
    for fold_scores, fold_value in zip(cv_score_lists, (r2, mae, rmse, mape, mase, rmsle)):
        fold_scores.append(fold_value)

# Average every metric over the folds.
cv_results = {
    label: np.mean(scores)
    for label, scores in zip(metric_names, cv_score_lists)
}

# Refit on the complete training split, then score the held-out test split.
xgb_reg.fit(X_train, y_train)
y_test_pred_xgb = xgb_reg.predict(X_test)
r2_te, mae_te, rmse_te, mape_te, mase_te, rmsle_te = compute_metrics(
    y_test, y_test_pred_xgb, baseline_mae_test
)
test_results = dict(zip(
    metric_names,
    (r2_te, mae_te, rmse_te, mape_te, mase_te, rmsle_te),
))

# Print both result tables with four decimals.
for banner, table in (
    ("\n===== XGBoost CV Results =====", cv_results),
    ("\n===== XGBoost Test Results =====", test_results),
):
    print(banner)
    for k, v in table.items():
        print(f"{k}: {v:.4f}")



===== XGBoost CV Results =====
R2: 0.6122
MAE: 152713.8811
RMSE: 265852.9580
MAPE: 73.2504
MASE: 0.6307
RMSLE: 0.6482

===== XGBoost Test Results =====
R2: 0.6153
MAE: 152704.4090
RMSE: 265184.4481
MAPE: 73.0217
MASE: 0.6287
RMSLE: 0.6467


In [10]:
# XGBoost run: n_estimators=400, max_depth=9, min_child_weight=100.
xgb_reg = XGBRegressor(
    n_estimators=400, learning_rate=0.10,
    max_depth=9, min_child_weight=100,
    subsample=0.6, colsample_bytree=0.6,
    reg_lambda=10, reg_alpha=0.5,
    tree_method='hist', n_jobs=4, random_state=42,
)

# Shuffled 5-fold splitter applied to the training split only.
kf = KFold(shuffle=True, n_splits=5, random_state=42)

# Metric labels, in the order compute_metrics returns its values.
metric_names = ("R2", "MAE", "RMSE", "MAPE", "MASE", "RMSLE")

# One accumulator list per metric; each receives one score per fold.
cv_r2, cv_mae, cv_rmse, cv_mape, cv_mase, cv_rmsle = [], [], [], [], [], []
cv_score_lists = (cv_r2, cv_mae, cv_rmse, cv_mape, cv_mase, cv_rmsle)

for train_idx, val_idx in kf.split(X_train):
    # Slice this fold's fitting and validation rows by position.
    X_tr, y_tr = X_train.iloc[train_idx], y_train.iloc[train_idx]
    X_val, y_val = X_train.iloc[val_idx], y_train.iloc[val_idx]

    xgb_reg.fit(X_tr, y_tr)
    y_val_pred = xgb_reg.predict(X_val)

    # MASE is scaled by the mean-predictor MAE of the whole training split.
    r2, mae, rmse, mape, mase, rmsle = compute_metrics(
        y_val, y_val_pred, baseline_mae_train
    )
    for fold_scores, fold_value in zip(cv_score_lists, (r2, mae, rmse, mape, mase, rmsle)):
        fold_scores.append(fold_value)

# Average every metric over the folds.
cv_results = {
    label: np.mean(scores)
    for label, scores in zip(metric_names, cv_score_lists)
}

# Refit on the complete training split, then score the held-out test split.
xgb_reg.fit(X_train, y_train)
y_test_pred_xgb = xgb_reg.predict(X_test)
r2_te, mae_te, rmse_te, mape_te, mase_te, rmsle_te = compute_metrics(
    y_test, y_test_pred_xgb, baseline_mae_test
)
test_results = dict(zip(
    metric_names,
    (r2_te, mae_te, rmse_te, mape_te, mase_te, rmsle_te),
))

# Print both result tables with four decimals.
for banner, table in (
    ("\n===== XGBoost CV Results =====", cv_results),
    ("\n===== XGBoost Test Results =====", test_results),
):
    print(banner)
    for k, v in table.items():
        print(f"{k}: {v:.4f}")



===== XGBoost CV Results =====
R2: 0.6112
MAE: 152630.4855
RMSE: 266183.8685
MAPE: 73.1443
MASE: 0.6303
RMSLE: 0.6482

===== XGBoost Test Results =====
R2: 0.6147
MAE: 152530.8241
RMSE: 265381.4544
MAPE: 72.8834
MASE: 0.6280
RMSLE: 0.6478


<h1>Result: max_depth=7 gives the best test-set R² (0.6174) and RMSE among the values tried (3, 4, 5, 6, 7, 8, 9)</h1>

In [11]:
# XGBoost run: n_estimators=400, max_depth=7, min_child_weight=50.
xgb_reg = XGBRegressor(
    n_estimators=400, learning_rate=0.10,
    max_depth=7, min_child_weight=50,
    subsample=0.6, colsample_bytree=0.6,
    reg_lambda=10, reg_alpha=0.5,
    tree_method='hist', n_jobs=4, random_state=42,
)

# Shuffled 5-fold splitter applied to the training split only.
kf = KFold(shuffle=True, n_splits=5, random_state=42)

# Metric labels, in the order compute_metrics returns its values.
metric_names = ("R2", "MAE", "RMSE", "MAPE", "MASE", "RMSLE")

# One accumulator list per metric; each receives one score per fold.
cv_r2, cv_mae, cv_rmse, cv_mape, cv_mase, cv_rmsle = [], [], [], [], [], []
cv_score_lists = (cv_r2, cv_mae, cv_rmse, cv_mape, cv_mase, cv_rmsle)

for train_idx, val_idx in kf.split(X_train):
    # Slice this fold's fitting and validation rows by position.
    X_tr, y_tr = X_train.iloc[train_idx], y_train.iloc[train_idx]
    X_val, y_val = X_train.iloc[val_idx], y_train.iloc[val_idx]

    xgb_reg.fit(X_tr, y_tr)
    y_val_pred = xgb_reg.predict(X_val)

    # MASE is scaled by the mean-predictor MAE of the whole training split.
    r2, mae, rmse, mape, mase, rmsle = compute_metrics(
        y_val, y_val_pred, baseline_mae_train
    )
    for fold_scores, fold_value in zip(cv_score_lists, (r2, mae, rmse, mape, mase, rmsle)):
        fold_scores.append(fold_value)

# Average every metric over the folds.
cv_results = {
    label: np.mean(scores)
    for label, scores in zip(metric_names, cv_score_lists)
}

# Refit on the complete training split, then score the held-out test split.
xgb_reg.fit(X_train, y_train)
y_test_pred_xgb = xgb_reg.predict(X_test)
r2_te, mae_te, rmse_te, mape_te, mase_te, rmsle_te = compute_metrics(
    y_test, y_test_pred_xgb, baseline_mae_test
)
test_results = dict(zip(
    metric_names,
    (r2_te, mae_te, rmse_te, mape_te, mase_te, rmsle_te),
))

# Print both result tables with four decimals.
for banner, table in (
    ("\n===== XGBoost CV Results =====", cv_results),
    ("\n===== XGBoost Test Results =====", test_results),
):
    print(banner)
    for k, v in table.items():
        print(f"{k}: {v:.4f}")



===== XGBoost CV Results =====
R2: 0.6115
MAE: 152925.7057
RMSE: 266083.4401
MAPE: 73.3899
MASE: 0.6315
RMSLE: 0.6485

===== XGBoost Test Results =====
R2: 0.6164
MAE: 152841.9559
RMSE: 264799.4724
MAPE: 73.1514
MASE: 0.6293
RMSLE: 0.6479


In [12]:
# XGBoost run: n_estimators=400, max_depth=7, min_child_weight=100.
xgb_reg = XGBRegressor(
    n_estimators=400, learning_rate=0.10,
    max_depth=7, min_child_weight=100,
    subsample=0.6, colsample_bytree=0.6,
    reg_lambda=10, reg_alpha=0.5,
    tree_method='hist', n_jobs=4, random_state=42,
)

# Shuffled 5-fold splitter applied to the training split only.
kf = KFold(shuffle=True, n_splits=5, random_state=42)

# Metric labels, in the order compute_metrics returns its values.
metric_names = ("R2", "MAE", "RMSE", "MAPE", "MASE", "RMSLE")

# One accumulator list per metric; each receives one score per fold.
cv_r2, cv_mae, cv_rmse, cv_mape, cv_mase, cv_rmsle = [], [], [], [], [], []
cv_score_lists = (cv_r2, cv_mae, cv_rmse, cv_mape, cv_mase, cv_rmsle)

for train_idx, val_idx in kf.split(X_train):
    # Slice this fold's fitting and validation rows by position.
    X_tr, y_tr = X_train.iloc[train_idx], y_train.iloc[train_idx]
    X_val, y_val = X_train.iloc[val_idx], y_train.iloc[val_idx]

    xgb_reg.fit(X_tr, y_tr)
    y_val_pred = xgb_reg.predict(X_val)

    # MASE is scaled by the mean-predictor MAE of the whole training split.
    r2, mae, rmse, mape, mase, rmsle = compute_metrics(
        y_val, y_val_pred, baseline_mae_train
    )
    for fold_scores, fold_value in zip(cv_score_lists, (r2, mae, rmse, mape, mase, rmsle)):
        fold_scores.append(fold_value)

# Average every metric over the folds.
cv_results = {
    label: np.mean(scores)
    for label, scores in zip(metric_names, cv_score_lists)
}

# Refit on the complete training split, then score the held-out test split.
xgb_reg.fit(X_train, y_train)
y_test_pred_xgb = xgb_reg.predict(X_test)
r2_te, mae_te, rmse_te, mape_te, mase_te, rmsle_te = compute_metrics(
    y_test, y_test_pred_xgb, baseline_mae_test
)
test_results = dict(zip(
    metric_names,
    (r2_te, mae_te, rmse_te, mape_te, mase_te, rmsle_te),
))

# Print both result tables with four decimals.
for banner, table in (
    ("\n===== XGBoost CV Results =====", cv_results),
    ("\n===== XGBoost Test Results =====", test_results),
):
    print(banner)
    for k, v in table.items():
        print(f"{k}: {v:.4f}")



===== XGBoost CV Results =====
R2: 0.6123
MAE: 152928.9154
RMSE: 265796.0971
MAPE: 73.4038
MASE: 0.6316
RMSLE: 0.6501

===== XGBoost Test Results =====
R2: 0.6174
MAE: 152769.4566
RMSE: 264448.5363
MAPE: 73.1736
MASE: 0.6290
RMSLE: 0.6489


In [13]:
# XGBoost run: n_estimators=400, max_depth=7, min_child_weight=150.
xgb_reg = XGBRegressor(
    n_estimators=400, learning_rate=0.10,
    max_depth=7, min_child_weight=150,
    subsample=0.6, colsample_bytree=0.6,
    reg_lambda=10, reg_alpha=0.5,
    tree_method='hist', n_jobs=4, random_state=42,
)

# Shuffled 5-fold splitter applied to the training split only.
kf = KFold(shuffle=True, n_splits=5, random_state=42)

# Metric labels, in the order compute_metrics returns its values.
metric_names = ("R2", "MAE", "RMSE", "MAPE", "MASE", "RMSLE")

# One accumulator list per metric; each receives one score per fold.
cv_r2, cv_mae, cv_rmse, cv_mape, cv_mase, cv_rmsle = [], [], [], [], [], []
cv_score_lists = (cv_r2, cv_mae, cv_rmse, cv_mape, cv_mase, cv_rmsle)

for train_idx, val_idx in kf.split(X_train):
    # Slice this fold's fitting and validation rows by position.
    X_tr, y_tr = X_train.iloc[train_idx], y_train.iloc[train_idx]
    X_val, y_val = X_train.iloc[val_idx], y_train.iloc[val_idx]

    xgb_reg.fit(X_tr, y_tr)
    y_val_pred = xgb_reg.predict(X_val)

    # MASE is scaled by the mean-predictor MAE of the whole training split.
    r2, mae, rmse, mape, mase, rmsle = compute_metrics(
        y_val, y_val_pred, baseline_mae_train
    )
    for fold_scores, fold_value in zip(cv_score_lists, (r2, mae, rmse, mape, mase, rmsle)):
        fold_scores.append(fold_value)

# Average every metric over the folds.
cv_results = {
    label: np.mean(scores)
    for label, scores in zip(metric_names, cv_score_lists)
}

# Refit on the complete training split, then score the held-out test split.
xgb_reg.fit(X_train, y_train)
y_test_pred_xgb = xgb_reg.predict(X_test)
r2_te, mae_te, rmse_te, mape_te, mase_te, rmsle_te = compute_metrics(
    y_test, y_test_pred_xgb, baseline_mae_test
)
test_results = dict(zip(
    metric_names,
    (r2_te, mae_te, rmse_te, mape_te, mase_te, rmsle_te),
))

# Print both result tables with four decimals.
for banner, table in (
    ("\n===== XGBoost CV Results =====", cv_results),
    ("\n===== XGBoost Test Results =====", test_results),
):
    print(banner)
    for k, v in table.items():
        print(f"{k}: {v:.4f}")



===== XGBoost CV Results =====
R2: 0.6137
MAE: 152867.9376
RMSE: 265334.3269
MAPE: 73.4157
MASE: 0.6313
RMSLE: 0.6492

===== XGBoost Test Results =====
R2: 0.6172
MAE: 152801.4704
RMSE: 264530.3868
MAPE: 73.1794
MASE: 0.6291
RMSLE: 0.6473


In [14]:
# XGBoost run: n_estimators=400, max_depth=7, min_child_weight=200.
xgb_reg = XGBRegressor(
    n_estimators=400, learning_rate=0.10,
    max_depth=7, min_child_weight=200,
    subsample=0.6, colsample_bytree=0.6,
    reg_lambda=10, reg_alpha=0.5,
    tree_method='hist', n_jobs=4, random_state=42,
)

# Shuffled 5-fold splitter applied to the training split only.
kf = KFold(shuffle=True, n_splits=5, random_state=42)

# Metric labels, in the order compute_metrics returns its values.
metric_names = ("R2", "MAE", "RMSE", "MAPE", "MASE", "RMSLE")

# One accumulator list per metric; each receives one score per fold.
cv_r2, cv_mae, cv_rmse, cv_mape, cv_mase, cv_rmsle = [], [], [], [], [], []
cv_score_lists = (cv_r2, cv_mae, cv_rmse, cv_mape, cv_mase, cv_rmsle)

for train_idx, val_idx in kf.split(X_train):
    # Slice this fold's fitting and validation rows by position.
    X_tr, y_tr = X_train.iloc[train_idx], y_train.iloc[train_idx]
    X_val, y_val = X_train.iloc[val_idx], y_train.iloc[val_idx]

    xgb_reg.fit(X_tr, y_tr)
    y_val_pred = xgb_reg.predict(X_val)

    # MASE is scaled by the mean-predictor MAE of the whole training split.
    r2, mae, rmse, mape, mase, rmsle = compute_metrics(
        y_val, y_val_pred, baseline_mae_train
    )
    for fold_scores, fold_value in zip(cv_score_lists, (r2, mae, rmse, mape, mase, rmsle)):
        fold_scores.append(fold_value)

# Average every metric over the folds.
cv_results = {
    label: np.mean(scores)
    for label, scores in zip(metric_names, cv_score_lists)
}

# Refit on the complete training split, then score the held-out test split.
xgb_reg.fit(X_train, y_train)
y_test_pred_xgb = xgb_reg.predict(X_test)
r2_te, mae_te, rmse_te, mape_te, mase_te, rmsle_te = compute_metrics(
    y_test, y_test_pred_xgb, baseline_mae_test
)
test_results = dict(zip(
    metric_names,
    (r2_te, mae_te, rmse_te, mape_te, mase_te, rmsle_te),
))

# Print both result tables with four decimals.
for banner, table in (
    ("\n===== XGBoost CV Results =====", cv_results),
    ("\n===== XGBoost Test Results =====", test_results),
):
    print(banner)
    for k, v in table.items():
        print(f"{k}: {v:.4f}")



===== XGBoost CV Results =====
R2: 0.6137
MAE: 152916.7669
RMSE: 265315.5550
MAPE: 73.4281
MASE: 0.6315
RMSLE: 0.6495

===== XGBoost Test Results =====
R2: 0.6175
MAE: 152802.4631
RMSE: 264422.5190
MAPE: 73.1630
MASE: 0.6291
RMSLE: 0.6480


In [15]:
# XGBoost run: n_estimators=400, max_depth=7, min_child_weight=250.
xgb_reg = XGBRegressor(
    n_estimators=400, learning_rate=0.10,
    max_depth=7, min_child_weight=250,
    subsample=0.6, colsample_bytree=0.6,
    reg_lambda=10, reg_alpha=0.5,
    tree_method='hist', n_jobs=4, random_state=42,
)

# Shuffled 5-fold splitter applied to the training split only.
kf = KFold(shuffle=True, n_splits=5, random_state=42)

# Metric labels, in the order compute_metrics returns its values.
metric_names = ("R2", "MAE", "RMSE", "MAPE", "MASE", "RMSLE")

# One accumulator list per metric; each receives one score per fold.
cv_r2, cv_mae, cv_rmse, cv_mape, cv_mase, cv_rmsle = [], [], [], [], [], []
cv_score_lists = (cv_r2, cv_mae, cv_rmse, cv_mape, cv_mase, cv_rmsle)

for train_idx, val_idx in kf.split(X_train):
    # Slice this fold's fitting and validation rows by position.
    X_tr, y_tr = X_train.iloc[train_idx], y_train.iloc[train_idx]
    X_val, y_val = X_train.iloc[val_idx], y_train.iloc[val_idx]

    xgb_reg.fit(X_tr, y_tr)
    y_val_pred = xgb_reg.predict(X_val)

    # MASE is scaled by the mean-predictor MAE of the whole training split.
    r2, mae, rmse, mape, mase, rmsle = compute_metrics(
        y_val, y_val_pred, baseline_mae_train
    )
    for fold_scores, fold_value in zip(cv_score_lists, (r2, mae, rmse, mape, mase, rmsle)):
        fold_scores.append(fold_value)

# Average every metric over the folds.
cv_results = {
    label: np.mean(scores)
    for label, scores in zip(metric_names, cv_score_lists)
}

# Refit on the complete training split, then score the held-out test split.
xgb_reg.fit(X_train, y_train)
y_test_pred_xgb = xgb_reg.predict(X_test)
r2_te, mae_te, rmse_te, mape_te, mase_te, rmsle_te = compute_metrics(
    y_test, y_test_pred_xgb, baseline_mae_test
)
test_results = dict(zip(
    metric_names,
    (r2_te, mae_te, rmse_te, mape_te, mase_te, rmsle_te),
))

# Print both result tables with four decimals.
for banner, table in (
    ("\n===== XGBoost CV Results =====", cv_results),
    ("\n===== XGBoost Test Results =====", test_results),
):
    print(banner)
    for k, v in table.items():
        print(f"{k}: {v:.4f}")



===== XGBoost CV Results =====
R2: 0.6137
MAE: 152913.7369
RMSE: 265320.1593
MAPE: 73.4200
MASE: 0.6315
RMSLE: 0.6493

===== XGBoost Test Results =====
R2: 0.6178
MAE: 152825.6109
RMSE: 264324.0865
MAPE: 73.1885
MASE: 0.6292
RMSLE: 0.6487


In [16]:
# XGBoost run: n_estimators=400, max_depth=7, min_child_weight=300.
xgb_reg = XGBRegressor(
    n_estimators=400, learning_rate=0.10,
    max_depth=7, min_child_weight=300,
    subsample=0.6, colsample_bytree=0.6,
    reg_lambda=10, reg_alpha=0.5,
    tree_method='hist', n_jobs=4, random_state=42,
)

# Shuffled 5-fold splitter applied to the training split only.
kf = KFold(shuffle=True, n_splits=5, random_state=42)

# Metric labels, in the order compute_metrics returns its values.
metric_names = ("R2", "MAE", "RMSE", "MAPE", "MASE", "RMSLE")

# One accumulator list per metric; each receives one score per fold.
cv_r2, cv_mae, cv_rmse, cv_mape, cv_mase, cv_rmsle = [], [], [], [], [], []
cv_score_lists = (cv_r2, cv_mae, cv_rmse, cv_mape, cv_mase, cv_rmsle)

for train_idx, val_idx in kf.split(X_train):
    # Slice this fold's fitting and validation rows by position.
    X_tr, y_tr = X_train.iloc[train_idx], y_train.iloc[train_idx]
    X_val, y_val = X_train.iloc[val_idx], y_train.iloc[val_idx]

    xgb_reg.fit(X_tr, y_tr)
    y_val_pred = xgb_reg.predict(X_val)

    # MASE is scaled by the mean-predictor MAE of the whole training split.
    r2, mae, rmse, mape, mase, rmsle = compute_metrics(
        y_val, y_val_pred, baseline_mae_train
    )
    for fold_scores, fold_value in zip(cv_score_lists, (r2, mae, rmse, mape, mase, rmsle)):
        fold_scores.append(fold_value)

# Average every metric over the folds.
cv_results = {
    label: np.mean(scores)
    for label, scores in zip(metric_names, cv_score_lists)
}

# Refit on the complete training split, then score the held-out test split.
xgb_reg.fit(X_train, y_train)
y_test_pred_xgb = xgb_reg.predict(X_test)
r2_te, mae_te, rmse_te, mape_te, mase_te, rmsle_te = compute_metrics(
    y_test, y_test_pred_xgb, baseline_mae_test
)
test_results = dict(zip(
    metric_names,
    (r2_te, mae_te, rmse_te, mape_te, mase_te, rmsle_te),
))

# Print both result tables with four decimals.
for banner, table in (
    ("\n===== XGBoost CV Results =====", cv_results),
    ("\n===== XGBoost Test Results =====", test_results),
):
    print(banner)
    for k, v in table.items():
        print(f"{k}: {v:.4f}")



===== XGBoost CV Results =====
R2: 0.6141
MAE: 152930.2390
RMSE: 265187.1685
MAPE: 73.4386
MASE: 0.6316
RMSLE: 0.6494

===== XGBoost Test Results =====
R2: 0.6174
MAE: 152827.4543
RMSE: 264441.9659
MAPE: 73.1782
MASE: 0.6292
RMSLE: 0.6479


<h1>Result: min_child_weight=250 gives the best test-set R² (0.6178) and RMSE among the values tried (50, 100, 150, 200, 250, 300)</h1>

In [17]:
# XGBoost run: n_estimators=100, max_depth=7, min_child_weight=250.
xgb_reg = XGBRegressor(
    n_estimators=100, learning_rate=0.10,
    max_depth=7, min_child_weight=250,
    subsample=0.6, colsample_bytree=0.6,
    reg_lambda=10, reg_alpha=0.5,
    tree_method='hist', n_jobs=4, random_state=42,
)

# Shuffled 5-fold splitter applied to the training split only.
kf = KFold(shuffle=True, n_splits=5, random_state=42)

# Metric labels, in the order compute_metrics returns its values.
metric_names = ("R2", "MAE", "RMSE", "MAPE", "MASE", "RMSLE")

# One accumulator list per metric; each receives one score per fold.
cv_r2, cv_mae, cv_rmse, cv_mape, cv_mase, cv_rmsle = [], [], [], [], [], []
cv_score_lists = (cv_r2, cv_mae, cv_rmse, cv_mape, cv_mase, cv_rmsle)

for train_idx, val_idx in kf.split(X_train):
    # Slice this fold's fitting and validation rows by position.
    X_tr, y_tr = X_train.iloc[train_idx], y_train.iloc[train_idx]
    X_val, y_val = X_train.iloc[val_idx], y_train.iloc[val_idx]

    xgb_reg.fit(X_tr, y_tr)
    y_val_pred = xgb_reg.predict(X_val)

    # MASE is scaled by the mean-predictor MAE of the whole training split.
    r2, mae, rmse, mape, mase, rmsle = compute_metrics(
        y_val, y_val_pred, baseline_mae_train
    )
    for fold_scores, fold_value in zip(cv_score_lists, (r2, mae, rmse, mape, mase, rmsle)):
        fold_scores.append(fold_value)

# Average every metric over the folds.
cv_results = {
    label: np.mean(scores)
    for label, scores in zip(metric_names, cv_score_lists)
}

# Refit on the complete training split, then score the held-out test split.
xgb_reg.fit(X_train, y_train)
y_test_pred_xgb = xgb_reg.predict(X_test)
r2_te, mae_te, rmse_te, mape_te, mase_te, rmsle_te = compute_metrics(
    y_test, y_test_pred_xgb, baseline_mae_test
)
test_results = dict(zip(
    metric_names,
    (r2_te, mae_te, rmse_te, mape_te, mase_te, rmsle_te),
))

# Print both result tables with four decimals.
for banner, table in (
    ("\n===== XGBoost CV Results =====", cv_results),
    ("\n===== XGBoost Test Results =====", test_results),
):
    print(banner)
    for k, v in table.items():
        print(f"{k}: {v:.4f}")



===== XGBoost CV Results =====
R2: 0.6117
MAE: 153392.1084
RMSE: 266005.2621
MAPE: 73.7974
MASE: 0.6335
RMSLE: 0.6486

===== XGBoost Test Results =====
R2: 0.6156
MAE: 153283.6223
RMSE: 265074.9028
MAPE: 73.5366
MASE: 0.6311
RMSLE: 0.6479


In [18]:
# XGBoost run: n_estimators=200, max_depth=7, min_child_weight=250.
xgb_reg = XGBRegressor(
    n_estimators=200, learning_rate=0.10,
    max_depth=7, min_child_weight=250,
    subsample=0.6, colsample_bytree=0.6,
    reg_lambda=10, reg_alpha=0.5,
    tree_method='hist', n_jobs=4, random_state=42,
)

# Shuffled 5-fold splitter applied to the training split only.
kf = KFold(shuffle=True, n_splits=5, random_state=42)

# Metric labels, in the order compute_metrics returns its values.
metric_names = ("R2", "MAE", "RMSE", "MAPE", "MASE", "RMSLE")

# One accumulator list per metric; each receives one score per fold.
cv_r2, cv_mae, cv_rmse, cv_mape, cv_mase, cv_rmsle = [], [], [], [], [], []
cv_score_lists = (cv_r2, cv_mae, cv_rmse, cv_mape, cv_mase, cv_rmsle)

for train_idx, val_idx in kf.split(X_train):
    # Slice this fold's fitting and validation rows by position.
    X_tr, y_tr = X_train.iloc[train_idx], y_train.iloc[train_idx]
    X_val, y_val = X_train.iloc[val_idx], y_train.iloc[val_idx]

    xgb_reg.fit(X_tr, y_tr)
    y_val_pred = xgb_reg.predict(X_val)

    # MASE is scaled by the mean-predictor MAE of the whole training split.
    r2, mae, rmse, mape, mase, rmsle = compute_metrics(
        y_val, y_val_pred, baseline_mae_train
    )
    for fold_scores, fold_value in zip(cv_score_lists, (r2, mae, rmse, mape, mase, rmsle)):
        fold_scores.append(fold_value)

# Average every metric over the folds.
cv_results = {
    label: np.mean(scores)
    for label, scores in zip(metric_names, cv_score_lists)
}

# Refit on the complete training split, then score the held-out test split.
xgb_reg.fit(X_train, y_train)
y_test_pred_xgb = xgb_reg.predict(X_test)
r2_te, mae_te, rmse_te, mape_te, mase_te, rmsle_te = compute_metrics(
    y_test, y_test_pred_xgb, baseline_mae_test
)
test_results = dict(zip(
    metric_names,
    (r2_te, mae_te, rmse_te, mape_te, mase_te, rmsle_te),
))

# Print both result tables with four decimals.
for banner, table in (
    ("\n===== XGBoost CV Results =====", cv_results),
    ("\n===== XGBoost Test Results =====", test_results),
):
    print(banner)
    for k, v in table.items():
        print(f"{k}: {v:.4f}")



===== XGBoost CV Results =====
R2: 0.6136
MAE: 153065.0324
RMSE: 265352.1162
MAPE: 73.5705
MASE: 0.6321
RMSLE: 0.6483

===== XGBoost Test Results =====
R2: 0.6176
MAE: 153009.2346
RMSE: 264385.1940
MAPE: 73.3276
MASE: 0.6300
RMSLE: 0.6490


In [19]:
# XGBoost run: n_estimators=300, max_depth=7, min_child_weight=250.
xgb_reg = XGBRegressor(
    n_estimators=300, learning_rate=0.10,
    max_depth=7, min_child_weight=250,
    subsample=0.6, colsample_bytree=0.6,
    reg_lambda=10, reg_alpha=0.5,
    tree_method='hist', n_jobs=4, random_state=42,
)

# Shuffled 5-fold splitter applied to the training split only.
kf = KFold(shuffle=True, n_splits=5, random_state=42)

# Metric labels, in the order compute_metrics returns its values.
metric_names = ("R2", "MAE", "RMSE", "MAPE", "MASE", "RMSLE")

# One accumulator list per metric; each receives one score per fold.
cv_r2, cv_mae, cv_rmse, cv_mape, cv_mase, cv_rmsle = [], [], [], [], [], []
cv_score_lists = (cv_r2, cv_mae, cv_rmse, cv_mape, cv_mase, cv_rmsle)

for train_idx, val_idx in kf.split(X_train):
    # Slice this fold's fitting and validation rows by position.
    X_tr, y_tr = X_train.iloc[train_idx], y_train.iloc[train_idx]
    X_val, y_val = X_train.iloc[val_idx], y_train.iloc[val_idx]

    xgb_reg.fit(X_tr, y_tr)
    y_val_pred = xgb_reg.predict(X_val)

    # MASE is scaled by the mean-predictor MAE of the whole training split.
    r2, mae, rmse, mape, mase, rmsle = compute_metrics(
        y_val, y_val_pred, baseline_mae_train
    )
    for fold_scores, fold_value in zip(cv_score_lists, (r2, mae, rmse, mape, mase, rmsle)):
        fold_scores.append(fold_value)

# Average every metric over the folds.
cv_results = {
    label: np.mean(scores)
    for label, scores in zip(metric_names, cv_score_lists)
}

# Refit on the complete training split, then score the held-out test split.
xgb_reg.fit(X_train, y_train)
y_test_pred_xgb = xgb_reg.predict(X_test)
r2_te, mae_te, rmse_te, mape_te, mase_te, rmsle_te = compute_metrics(
    y_test, y_test_pred_xgb, baseline_mae_test
)
test_results = dict(zip(
    metric_names,
    (r2_te, mae_te, rmse_te, mape_te, mase_te, rmsle_te),
))

# Print both result tables with four decimals.
for banner, table in (
    ("\n===== XGBoost CV Results =====", cv_results),
    ("\n===== XGBoost Test Results =====", test_results),
):
    print(banner)
    for k, v in table.items():
        print(f"{k}: {v:.4f}")



===== XGBoost CV Results =====
R2: 0.6140
MAE: 152949.0459
RMSE: 265230.7415
MAPE: 73.4781
MASE: 0.6316
RMSLE: 0.6487

===== XGBoost Test Results =====
R2: 0.6178
MAE: 152857.3220
RMSE: 264303.5414
MAPE: 73.2251
MASE: 0.6294
RMSLE: 0.6483


In [20]:
# XGBoost configuration for this run (n_estimators=400).
xgb_reg = XGBRegressor(
    # ensemble size / tree shape / step size
    n_estimators=400,
    max_depth=7,
    learning_rate=0.10,
    min_child_weight=250,
    # row and column subsampling
    subsample=0.6,
    colsample_bytree=0.6,
    # L2 / L1 regularisation
    reg_lambda=10,
    reg_alpha=0.5,
    # training backend and reproducibility
    tree_method='hist',
    n_jobs=4,
    random_state=42
)

# Cross-validation splitter: 5 shuffled folds with a fixed seed so the
# fold assignment is reproducible across runs.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# One independent list per metric; each receives one score per fold.
cv_r2, cv_mae, cv_rmse, cv_mape, cv_mase, cv_rmsle = ([] for _ in range(6))

for train_idx, val_idx in kf.split(X_train):
    # kf.split yields positional indices, hence .iloc rather than .loc.
    X_tr, y_tr = X_train.iloc[train_idx], y_train.iloc[train_idx]
    X_val, y_val = X_train.iloc[val_idx], y_train.iloc[val_idx]

    # The same estimator object is refit on each fold's training part.
    xgb_reg.fit(X_tr, y_tr)
    y_val_pred = xgb_reg.predict(X_val)

    # MASE here is scaled by baseline_mae_train (mean-predictor MAE on the
    # whole training split), not by a per-fold baseline.
    r2, mae, rmse, mape, mase, rmsle = compute_metrics(y_val, y_val_pred, baseline_mae_train)
    for fold_scores, fold_value in zip(
        (cv_r2, cv_mae, cv_rmse, cv_mape, cv_mase, cv_rmsle),
        (r2, mae, rmse, mape, mase, rmsle),
    ):
        fold_scores.append(fold_value)

# Average each metric over the folds.
cv_results = {
    metric_name: np.mean(per_fold)
    for metric_name, per_fold in (
        ("R2", cv_r2),
        ("MAE", cv_mae),
        ("RMSE", cv_rmse),
        ("MAPE", cv_mape),
        ("MASE", cv_mase),
        ("RMSLE", cv_rmsle),
    )
}

# Refit on the complete training split, then score the held-out test split.
# The test MASE is scaled by baseline_mae_test.
xgb_reg.fit(X_train, y_train)
y_test_pred_xgb = xgb_reg.predict(X_test)

r2_te, mae_te, rmse_te, mape_te, mase_te, rmsle_te = compute_metrics(
    y_test, y_test_pred_xgb, baseline_mae_test
)
test_results = dict(
    R2=r2_te,
    MAE=mae_te,
    RMSE=rmse_te,
    MAPE=mape_te,
    MASE=mase_te,
    RMSLE=rmsle_te,
)

# Report both result sets, four decimals per metric.
print("\n===== XGBoost CV Results =====")
for k, v in cv_results.items():
    print(f"{k}: {v:.4f}")

print("\n===== XGBoost Test Results =====")
for k, v in test_results.items():
    print(f"{k}: {v:.4f}")



===== XGBoost CV Results =====
R2: 0.6137
MAE: 152913.7369
RMSE: 265320.1593
MAPE: 73.4200
MASE: 0.6315
RMSLE: 0.6493

===== XGBoost Test Results =====
R2: 0.6178
MAE: 152825.6109
RMSE: 264324.0865
MAPE: 73.1885
MASE: 0.6292
RMSLE: 0.6487


In [21]:
# XGBoost configuration for this run (n_estimators=500).
xgb_reg = XGBRegressor(
    # ensemble size / tree shape / step size
    n_estimators=500,
    max_depth=7,
    learning_rate=0.10,
    min_child_weight=250,
    # row and column subsampling
    subsample=0.6,
    colsample_bytree=0.6,
    # L2 / L1 regularisation
    reg_lambda=10,
    reg_alpha=0.5,
    # training backend and reproducibility
    tree_method='hist',
    n_jobs=4,
    random_state=42
)

# Cross-validation splitter: 5 shuffled folds with a fixed seed so the
# fold assignment is reproducible across runs.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# One independent list per metric; each receives one score per fold.
cv_r2, cv_mae, cv_rmse, cv_mape, cv_mase, cv_rmsle = ([] for _ in range(6))

for train_idx, val_idx in kf.split(X_train):
    # kf.split yields positional indices, hence .iloc rather than .loc.
    X_tr, y_tr = X_train.iloc[train_idx], y_train.iloc[train_idx]
    X_val, y_val = X_train.iloc[val_idx], y_train.iloc[val_idx]

    # The same estimator object is refit on each fold's training part.
    xgb_reg.fit(X_tr, y_tr)
    y_val_pred = xgb_reg.predict(X_val)

    # MASE here is scaled by baseline_mae_train (mean-predictor MAE on the
    # whole training split), not by a per-fold baseline.
    r2, mae, rmse, mape, mase, rmsle = compute_metrics(y_val, y_val_pred, baseline_mae_train)
    for fold_scores, fold_value in zip(
        (cv_r2, cv_mae, cv_rmse, cv_mape, cv_mase, cv_rmsle),
        (r2, mae, rmse, mape, mase, rmsle),
    ):
        fold_scores.append(fold_value)

# Average each metric over the folds.
cv_results = {
    metric_name: np.mean(per_fold)
    for metric_name, per_fold in (
        ("R2", cv_r2),
        ("MAE", cv_mae),
        ("RMSE", cv_rmse),
        ("MAPE", cv_mape),
        ("MASE", cv_mase),
        ("RMSLE", cv_rmsle),
    )
}

# Refit on the complete training split, then score the held-out test split.
# The test MASE is scaled by baseline_mae_test.
xgb_reg.fit(X_train, y_train)
y_test_pred_xgb = xgb_reg.predict(X_test)

r2_te, mae_te, rmse_te, mape_te, mase_te, rmsle_te = compute_metrics(
    y_test, y_test_pred_xgb, baseline_mae_test
)
test_results = dict(
    R2=r2_te,
    MAE=mae_te,
    RMSE=rmse_te,
    MAPE=mape_te,
    MASE=mase_te,
    RMSLE=rmsle_te,
)

# Report both result sets, four decimals per metric.
print("\n===== XGBoost CV Results =====")
for k, v in cv_results.items():
    print(f"{k}: {v:.4f}")

print("\n===== XGBoost Test Results =====")
for k, v in test_results.items():
    print(f"{k}: {v:.4f}")



===== XGBoost CV Results =====
R2: 0.6133
MAE: 152900.7292
RMSE: 265466.1919
MAPE: 73.3891
MASE: 0.6314
RMSLE: 0.6493

===== XGBoost Test Results =====
R2: 0.6178
MAE: 152776.7593
RMSE: 264328.8481
MAPE: 73.1422
MASE: 0.6290
RMSLE: 0.6494


In [22]:
# XGBoost configuration for this run (n_estimators=600).
xgb_reg = XGBRegressor(
    # ensemble size / tree shape / step size
    n_estimators=600,
    max_depth=7,
    learning_rate=0.10,
    min_child_weight=250,
    # row and column subsampling
    subsample=0.6,
    colsample_bytree=0.6,
    # L2 / L1 regularisation
    reg_lambda=10,
    reg_alpha=0.5,
    # training backend and reproducibility
    tree_method='hist',
    n_jobs=4,
    random_state=42
)

# Cross-validation splitter: 5 shuffled folds with a fixed seed so the
# fold assignment is reproducible across runs.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# One independent list per metric; each receives one score per fold.
cv_r2, cv_mae, cv_rmse, cv_mape, cv_mase, cv_rmsle = ([] for _ in range(6))

for train_idx, val_idx in kf.split(X_train):
    # kf.split yields positional indices, hence .iloc rather than .loc.
    X_tr, y_tr = X_train.iloc[train_idx], y_train.iloc[train_idx]
    X_val, y_val = X_train.iloc[val_idx], y_train.iloc[val_idx]

    # The same estimator object is refit on each fold's training part.
    xgb_reg.fit(X_tr, y_tr)
    y_val_pred = xgb_reg.predict(X_val)

    # MASE here is scaled by baseline_mae_train (mean-predictor MAE on the
    # whole training split), not by a per-fold baseline.
    r2, mae, rmse, mape, mase, rmsle = compute_metrics(y_val, y_val_pred, baseline_mae_train)
    for fold_scores, fold_value in zip(
        (cv_r2, cv_mae, cv_rmse, cv_mape, cv_mase, cv_rmsle),
        (r2, mae, rmse, mape, mase, rmsle),
    ):
        fold_scores.append(fold_value)

# Average each metric over the folds.
cv_results = {
    metric_name: np.mean(per_fold)
    for metric_name, per_fold in (
        ("R2", cv_r2),
        ("MAE", cv_mae),
        ("RMSE", cv_rmse),
        ("MAPE", cv_mape),
        ("MASE", cv_mase),
        ("RMSLE", cv_rmsle),
    )
}

# Refit on the complete training split, then score the held-out test split.
# The test MASE is scaled by baseline_mae_test.
xgb_reg.fit(X_train, y_train)
y_test_pred_xgb = xgb_reg.predict(X_test)

r2_te, mae_te, rmse_te, mape_te, mase_te, rmsle_te = compute_metrics(
    y_test, y_test_pred_xgb, baseline_mae_test
)
test_results = dict(
    R2=r2_te,
    MAE=mae_te,
    RMSE=rmse_te,
    MAPE=mape_te,
    MASE=mase_te,
    RMSLE=rmsle_te,
)

# Report both result sets, four decimals per metric.
print("\n===== XGBoost CV Results =====")
for k, v in cv_results.items():
    print(f"{k}: {v:.4f}")

print("\n===== XGBoost Test Results =====")
for k, v in test_results.items():
    print(f"{k}: {v:.4f}")



===== XGBoost CV Results =====
R2: 0.6129
MAE: 152896.6918
RMSE: 265610.1693
MAPE: 73.3583
MASE: 0.6314
RMSLE: 0.6490

===== XGBoost Test Results =====
R2: 0.6172
MAE: 152774.4342
RMSE: 264510.2125
MAPE: 73.1289
MASE: 0.6290
RMSLE: 0.6501


<h1>Finding: n_estimators = 300 performs best among 100, 200, 300, 400, 500, and 600</h1>