In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor
import warnings
warnings.filterwarnings('ignore')

# Read the feature-engineered dataset (adjust the path if the CSV lives elsewhere).
df = pd.read_csv('feature_enhanced.csv')

# 'taxvaluedollarcnt' is the regression target; every other column is a feature.
y = df['taxvaluedollarcnt']
X = df.drop(columns='taxvaluedollarcnt')

# Hold out 20% of the rows as the test set (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
print("Train set size:", X_train.shape, "Test set size:", X_test.shape)

# Baseline predictor: always output the mean of the training target.
# Its MAE on each split is later used to scale the model's MAE.
train_mean_value = y_train.mean()
baseline_pred_train = np.full(y_train.shape[0], train_mean_value)
baseline_pred_test = np.full(y_test.shape[0], train_mean_value)
baseline_mae_train, baseline_mae_test = (
    mean_absolute_error(actual, constant_pred)
    for actual, constant_pred in (
        (y_train, baseline_pred_train),
        (y_test, baseline_pred_test),
    )
)

# Evaluation metric function
def compute_metrics(y_true, y_pred, baseline_mae):
    """Compute a fixed set of regression metrics for one batch of predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values (list, NumPy array or pandas Series).
    y_pred : array-like
        Predicted values, paired with ``y_true`` by position.
    baseline_mae : float
        MAE of a reference predictor; the model MAE is divided by it.

    Returns
    -------
    tuple
        ``(r2, mae, rmse, mape, mase, rmsle)`` where

        * ``mape`` is a percentage computed only over rows with a non-zero
          true value (``nan`` when there are no such rows),
        * ``mase`` is ``mae / baseline_mae`` (``nan`` when ``baseline_mae`` is 0),
        * ``rmsle`` is computed after clipping negative true and predicted
          values to 0 so that ``log1p`` is defined.
    """
    # Coerce once to plain arrays so every metric pairs values by position.
    # Without this, the boolean-mask indexing below follows pandas label
    # alignment when Series are passed and fails outright for Python lists,
    # while the sklearn metrics always compare positionally.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    mae_val = mean_absolute_error(y_true, y_pred)
    mse_val = mean_squared_error(y_true, y_pred)
    rmse_val = np.sqrt(mse_val)

    # MAPE is undefined where the true value is 0, so those rows are excluded.
    mask = y_true != 0
    if mask.any():
        mape_val = np.mean(np.abs((y_pred[mask] - y_true[mask]) / y_true[mask])) * 100
    else:
        # No usable rows: report nan explicitly instead of averaging an empty slice.
        mape_val = np.nan

    # log1p needs values > -1; negative values are clipped to 0 first.
    y_pred_clip = np.where(y_pred < 0, 0, y_pred)
    y_true_clip = np.where(y_true < 0, 0, y_true)
    msle_val = mean_squared_error(np.log1p(y_true_clip), np.log1p(y_pred_clip))
    rmsle_val = np.sqrt(msle_val)

    r2_val = r2_score(y_true, y_pred)
    mase_val = mae_val / baseline_mae if baseline_mae != 0 else np.nan
    return r2_val, mae_val, rmse_val, mape_val, mase_val, rmsle_val

Train set size: (1373352, 32) Test set size: (343339, 32)


In [5]:
# Random forest for this run: max_depth=15 (n_estimators=300, min_samples_leaf=100).
rf_reg = RandomForestRegressor(
    n_estimators=300,
    max_depth=15,
    min_samples_leaf=100,
    max_samples=0.5,
    max_features=8,
    n_jobs=4,
    random_state=42,
)

# Shuffled 5-fold cross-validation over the training split.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# One list of per-fold scores per metric, in the order compute_metrics returns them.
metric_names = ("R2", "MAE", "RMSE", "MAPE", "MASE", "RMSLE")
cv_r2, cv_mae, cv_rmse, cv_mape, cv_mase, cv_rmsle = ([] for _ in range(6))
fold_metric_lists = (cv_r2, cv_mae, cv_rmse, cv_mape, cv_mase, cv_rmsle)

for fit_rows, holdout_rows in kf.split(X_train):
    # Refit on the fold's training rows, then score on its held-out rows.
    rf_reg.fit(X_train.iloc[fit_rows], y_train.iloc[fit_rows])
    holdout_pred = rf_reg.predict(X_train.iloc[holdout_rows])
    fold_scores = compute_metrics(
        y_train.iloc[holdout_rows], holdout_pred, baseline_mae_train
    )
    for bucket, score in zip(fold_metric_lists, fold_scores):
        bucket.append(score)

# Average each metric across the folds.
cv_results = {
    name: np.mean(bucket) for name, bucket in zip(metric_names, fold_metric_lists)
}

# Refit on the full training split and score the held-out test split.
rf_reg.fit(X_train, y_train)
y_test_pred = rf_reg.predict(X_test)

test_scores = compute_metrics(y_test, y_test_pred, baseline_mae_test)
test_r2, test_mae, test_rmse, test_mape, test_mase, test_rmsle = test_scores
test_results = dict(zip(metric_names, test_scores))

# Report both result sets with four decimal places.
for banner, results in (
    ("\n===== CV Results =====", cv_results),
    ("\n===== Test Results =====", test_results),
):
    print(banner)
    for k, v in results.items():
        print(f"{k}: {v:.4f}")


===== CV Results =====
R2: 0.5959
MAE: 154507.9566
RMSE: 271369.7698
MAPE: 74.2269
MASE: 0.6381
RMSLE: 0.6498

===== Test Results =====
R2: 0.6000
MAE: 154210.5057
RMSE: 270394.6034
MAPE: 73.8925
MASE: 0.6349
RMSLE: 0.6482


In [6]:
# Random forest for this run: max_depth=20 (n_estimators=300, min_samples_leaf=100).
rf_reg = RandomForestRegressor(
    n_estimators=300,
    max_depth=20,
    min_samples_leaf=100,
    max_samples=0.5,
    max_features=8,
    n_jobs=4,
    random_state=42,
)

# Shuffled 5-fold cross-validation over the training split.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# One list of per-fold scores per metric, in the order compute_metrics returns them.
metric_names = ("R2", "MAE", "RMSE", "MAPE", "MASE", "RMSLE")
cv_r2, cv_mae, cv_rmse, cv_mape, cv_mase, cv_rmsle = ([] for _ in range(6))
fold_metric_lists = (cv_r2, cv_mae, cv_rmse, cv_mape, cv_mase, cv_rmsle)

for fit_rows, holdout_rows in kf.split(X_train):
    # Refit on the fold's training rows, then score on its held-out rows.
    rf_reg.fit(X_train.iloc[fit_rows], y_train.iloc[fit_rows])
    holdout_pred = rf_reg.predict(X_train.iloc[holdout_rows])
    fold_scores = compute_metrics(
        y_train.iloc[holdout_rows], holdout_pred, baseline_mae_train
    )
    for bucket, score in zip(fold_metric_lists, fold_scores):
        bucket.append(score)

# Average each metric across the folds.
cv_results = {
    name: np.mean(bucket) for name, bucket in zip(metric_names, fold_metric_lists)
}

# Refit on the full training split and score the held-out test split.
rf_reg.fit(X_train, y_train)
y_test_pred = rf_reg.predict(X_test)

test_scores = compute_metrics(y_test, y_test_pred, baseline_mae_test)
test_r2, test_mae, test_rmse, test_mape, test_mase, test_rmsle = test_scores
test_results = dict(zip(metric_names, test_scores))

# Report both result sets with four decimal places.
for banner, results in (
    ("\n===== CV Results =====", cv_results),
    ("\n===== Test Results =====", test_results),
):
    print(banner)
    for k, v in results.items():
        print(f"{k}: {v:.4f}")


===== CV Results =====
R2: 0.5966
MAE: 154281.1377
RMSE: 271124.5756
MAPE: 73.9746
MASE: 0.6371
RMSLE: 0.6485

===== Test Results =====
R2: 0.6008
MAE: 153942.7373
RMSE: 270128.9593
MAPE: 73.5963
MASE: 0.6338
RMSLE: 0.6466


In [7]:
# Random forest for this run: max_depth=25 (n_estimators=300, min_samples_leaf=100).
rf_reg = RandomForestRegressor(
    n_estimators=300,
    max_depth=25,
    min_samples_leaf=100,
    max_samples=0.5,
    max_features=8,
    n_jobs=4,
    random_state=42,
)

# Shuffled 5-fold cross-validation over the training split.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# One list of per-fold scores per metric, in the order compute_metrics returns them.
metric_names = ("R2", "MAE", "RMSE", "MAPE", "MASE", "RMSLE")
cv_r2, cv_mae, cv_rmse, cv_mape, cv_mase, cv_rmsle = ([] for _ in range(6))
fold_metric_lists = (cv_r2, cv_mae, cv_rmse, cv_mape, cv_mase, cv_rmsle)

for fit_rows, holdout_rows in kf.split(X_train):
    # Refit on the fold's training rows, then score on its held-out rows.
    rf_reg.fit(X_train.iloc[fit_rows], y_train.iloc[fit_rows])
    holdout_pred = rf_reg.predict(X_train.iloc[holdout_rows])
    fold_scores = compute_metrics(
        y_train.iloc[holdout_rows], holdout_pred, baseline_mae_train
    )
    for bucket, score in zip(fold_metric_lists, fold_scores):
        bucket.append(score)

# Average each metric across the folds.
cv_results = {
    name: np.mean(bucket) for name, bucket in zip(metric_names, fold_metric_lists)
}

# Refit on the full training split and score the held-out test split.
rf_reg.fit(X_train, y_train)
y_test_pred = rf_reg.predict(X_test)

test_scores = compute_metrics(y_test, y_test_pred, baseline_mae_test)
test_r2, test_mae, test_rmse, test_mape, test_mase, test_rmsle = test_scores
test_results = dict(zip(metric_names, test_scores))

# Report both result sets with four decimal places.
for banner, results in (
    ("\n===== CV Results =====", cv_results),
    ("\n===== Test Results =====", test_results),
):
    print(banner)
    for k, v in results.items():
        print(f"{k}: {v:.4f}")


===== CV Results =====
R2: 0.5965
MAE: 154276.4741
RMSE: 271150.5689
MAPE: 73.9507
MASE: 0.6371
RMSLE: 0.6484

===== Test Results =====
R2: 0.6008
MAE: 153942.8917
RMSE: 270116.5269
MAPE: 73.5737
MASE: 0.6338
RMSLE: 0.6465


In [8]:
# Random forest for this run: max_depth=30 (n_estimators=300, min_samples_leaf=100).
rf_reg = RandomForestRegressor(
    n_estimators=300,
    max_depth=30,
    min_samples_leaf=100,
    max_samples=0.5,
    max_features=8,
    n_jobs=4,
    random_state=42,
)

# Shuffled 5-fold cross-validation over the training split.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# One list of per-fold scores per metric, in the order compute_metrics returns them.
metric_names = ("R2", "MAE", "RMSE", "MAPE", "MASE", "RMSLE")
cv_r2, cv_mae, cv_rmse, cv_mape, cv_mase, cv_rmsle = ([] for _ in range(6))
fold_metric_lists = (cv_r2, cv_mae, cv_rmse, cv_mape, cv_mase, cv_rmsle)

for fit_rows, holdout_rows in kf.split(X_train):
    # Refit on the fold's training rows, then score on its held-out rows.
    rf_reg.fit(X_train.iloc[fit_rows], y_train.iloc[fit_rows])
    holdout_pred = rf_reg.predict(X_train.iloc[holdout_rows])
    fold_scores = compute_metrics(
        y_train.iloc[holdout_rows], holdout_pred, baseline_mae_train
    )
    for bucket, score in zip(fold_metric_lists, fold_scores):
        bucket.append(score)

# Average each metric across the folds.
cv_results = {
    name: np.mean(bucket) for name, bucket in zip(metric_names, fold_metric_lists)
}

# Refit on the full training split and score the held-out test split.
rf_reg.fit(X_train, y_train)
y_test_pred = rf_reg.predict(X_test)

test_scores = compute_metrics(y_test, y_test_pred, baseline_mae_test)
test_r2, test_mae, test_rmse, test_mape, test_mase, test_rmsle = test_scores
test_results = dict(zip(metric_names, test_scores))

# Report both result sets with four decimal places.
for banner, results in (
    ("\n===== CV Results =====", cv_results),
    ("\n===== Test Results =====", test_results),
):
    print(banner)
    for k, v in results.items():
        print(f"{k}: {v:.4f}")


===== CV Results =====
R2: 0.5966
MAE: 154280.9789
RMSE: 271145.7572
MAPE: 73.9475
MASE: 0.6371
RMSLE: 0.6483

===== Test Results =====
R2: 0.6007
MAE: 153943.0423
RMSE: 270163.1789
MAPE: 73.5737
MASE: 0.6338
RMSLE: 0.6465


<h1>Finding: max_depth=25 is the best among 15, 20, 25, and 30</h1>

In [10]:
# Random forest for this run: n_estimators=100 (max_depth=25, min_samples_leaf=100).
rf_reg = RandomForestRegressor(
    n_estimators=100,
    max_depth=25,
    min_samples_leaf=100,
    max_samples=0.5,
    max_features=8,
    n_jobs=-1,
    random_state=42,
)

# Shuffled 5-fold cross-validation over the training split.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# One list of per-fold scores per metric, in the order compute_metrics returns them.
metric_names = ("R2", "MAE", "RMSE", "MAPE", "MASE", "RMSLE")
cv_r2, cv_mae, cv_rmse, cv_mape, cv_mase, cv_rmsle = ([] for _ in range(6))
fold_metric_lists = (cv_r2, cv_mae, cv_rmse, cv_mape, cv_mase, cv_rmsle)

for fit_rows, holdout_rows in kf.split(X_train):
    # Refit on the fold's training rows, then score on its held-out rows.
    rf_reg.fit(X_train.iloc[fit_rows], y_train.iloc[fit_rows])
    holdout_pred = rf_reg.predict(X_train.iloc[holdout_rows])
    fold_scores = compute_metrics(
        y_train.iloc[holdout_rows], holdout_pred, baseline_mae_train
    )
    for bucket, score in zip(fold_metric_lists, fold_scores):
        bucket.append(score)

# Average each metric across the folds.
cv_results = {
    name: np.mean(bucket) for name, bucket in zip(metric_names, fold_metric_lists)
}

# Refit on the full training split and score the held-out test split.
rf_reg.fit(X_train, y_train)
y_test_pred = rf_reg.predict(X_test)

test_scores = compute_metrics(y_test, y_test_pred, baseline_mae_test)
test_r2, test_mae, test_rmse, test_mape, test_mase, test_rmsle = test_scores
test_results = dict(zip(metric_names, test_scores))

# Report both result sets with four decimal places.
for banner, results in (
    ("\n===== CV Results =====", cv_results),
    ("\n===== Test Results =====", test_results),
):
    print(banner)
    for k, v in results.items():
        print(f"{k}: {v:.4f}")


===== CV Results =====
R2: 0.5965
MAE: 154304.1475
RMSE: 271174.6049
MAPE: 73.9607
MASE: 0.6372
RMSLE: 0.6484

===== Test Results =====
R2: 0.6009
MAE: 153974.1739
RMSE: 270094.6593
MAPE: 73.5967
MASE: 0.6340
RMSLE: 0.6466


In [12]:
# Random forest for this run: n_estimators=200 (max_depth=25, min_samples_leaf=100).
rf_reg = RandomForestRegressor(
    n_estimators=200,
    max_depth=25,
    min_samples_leaf=100,
    max_samples=0.5,
    max_features=8,
    n_jobs=-1,
    random_state=42,
)

# Shuffled 5-fold cross-validation over the training split.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# One list of per-fold scores per metric, in the order compute_metrics returns them.
metric_names = ("R2", "MAE", "RMSE", "MAPE", "MASE", "RMSLE")
cv_r2, cv_mae, cv_rmse, cv_mape, cv_mase, cv_rmsle = ([] for _ in range(6))
fold_metric_lists = (cv_r2, cv_mae, cv_rmse, cv_mape, cv_mase, cv_rmsle)

for fit_rows, holdout_rows in kf.split(X_train):
    # Refit on the fold's training rows, then score on its held-out rows.
    rf_reg.fit(X_train.iloc[fit_rows], y_train.iloc[fit_rows])
    holdout_pred = rf_reg.predict(X_train.iloc[holdout_rows])
    fold_scores = compute_metrics(
        y_train.iloc[holdout_rows], holdout_pred, baseline_mae_train
    )
    for bucket, score in zip(fold_metric_lists, fold_scores):
        bucket.append(score)

# Average each metric across the folds.
cv_results = {
    name: np.mean(bucket) for name, bucket in zip(metric_names, fold_metric_lists)
}

# Refit on the full training split and score the held-out test split.
rf_reg.fit(X_train, y_train)
y_test_pred = rf_reg.predict(X_test)

test_scores = compute_metrics(y_test, y_test_pred, baseline_mae_test)
test_r2, test_mae, test_rmse, test_mape, test_mase, test_rmsle = test_scores
test_results = dict(zip(metric_names, test_scores))

# Report both result sets with four decimal places.
for banner, results in (
    ("\n===== CV Results =====", cv_results),
    ("\n===== Test Results =====", test_results),
):
    print(banner)
    for k, v in results.items():
        print(f"{k}: {v:.4f}")


===== CV Results =====
R2: 0.5966
MAE: 154277.2786
RMSE: 271127.0490
MAPE: 73.9520
MASE: 0.6371
RMSLE: 0.6484

===== Test Results =====
R2: 0.6009
MAE: 153950.1631
RMSE: 270094.0844
MAPE: 73.5852
MASE: 0.6339
RMSLE: 0.6466


In [2]:
# Random forest for this run: n_estimators=400 (max_depth=25, min_samples_leaf=100).
rf_reg = RandomForestRegressor(
    n_estimators=400,
    max_depth=25,
    min_samples_leaf=100,
    max_samples=0.5,
    max_features=8,
    n_jobs=-1,
    random_state=42,
)

# Shuffled 5-fold cross-validation over the training split.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# One list of per-fold scores per metric, in the order compute_metrics returns them.
metric_names = ("R2", "MAE", "RMSE", "MAPE", "MASE", "RMSLE")
cv_r2, cv_mae, cv_rmse, cv_mape, cv_mase, cv_rmsle = ([] for _ in range(6))
fold_metric_lists = (cv_r2, cv_mae, cv_rmse, cv_mape, cv_mase, cv_rmsle)

for fit_rows, holdout_rows in kf.split(X_train):
    # Refit on the fold's training rows, then score on its held-out rows.
    rf_reg.fit(X_train.iloc[fit_rows], y_train.iloc[fit_rows])
    holdout_pred = rf_reg.predict(X_train.iloc[holdout_rows])
    fold_scores = compute_metrics(
        y_train.iloc[holdout_rows], holdout_pred, baseline_mae_train
    )
    for bucket, score in zip(fold_metric_lists, fold_scores):
        bucket.append(score)

# Average each metric across the folds.
cv_results = {
    name: np.mean(bucket) for name, bucket in zip(metric_names, fold_metric_lists)
}

# Refit on the full training split and score the held-out test split.
rf_reg.fit(X_train, y_train)
y_test_pred = rf_reg.predict(X_test)

test_scores = compute_metrics(y_test, y_test_pred, baseline_mae_test)
test_r2, test_mae, test_rmse, test_mape, test_mase, test_rmsle = test_scores
test_results = dict(zip(metric_names, test_scores))

# Report both result sets with four decimal places.
for banner, results in (
    ("\n===== CV Results =====", cv_results),
    ("\n===== Test Results =====", test_results),
):
    print(banner)
    for k, v in results.items():
        print(f"{k}: {v:.4f}")


===== CV Results =====
R2: 0.5965
MAE: 154279.9551
RMSE: 271161.8496
MAPE: 73.9520
MASE: 0.6371
RMSLE: 0.6484

===== Test Results =====
R2: 0.6008
MAE: 153940.8525
RMSE: 270138.4997
MAPE: 73.5710
MASE: 0.6338
RMSLE: 0.6465


<h1>Finding: n_estimators=200 is the best among 100, 200, 300, and 400</h1>

In [3]:
# Random forest for this run: min_samples_leaf=50 (n_estimators=200, max_depth=25).
rf_reg = RandomForestRegressor(
    n_estimators=200,
    max_depth=25,
    min_samples_leaf=50,
    max_samples=0.5,
    max_features=8,
    n_jobs=-1,
    random_state=42,
)

# Shuffled 5-fold cross-validation over the training split.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# One list of per-fold scores per metric, in the order compute_metrics returns them.
metric_names = ("R2", "MAE", "RMSE", "MAPE", "MASE", "RMSLE")
cv_r2, cv_mae, cv_rmse, cv_mape, cv_mase, cv_rmsle = ([] for _ in range(6))
fold_metric_lists = (cv_r2, cv_mae, cv_rmse, cv_mape, cv_mase, cv_rmsle)

for fit_rows, holdout_rows in kf.split(X_train):
    # Refit on the fold's training rows, then score on its held-out rows.
    rf_reg.fit(X_train.iloc[fit_rows], y_train.iloc[fit_rows])
    holdout_pred = rf_reg.predict(X_train.iloc[holdout_rows])
    fold_scores = compute_metrics(
        y_train.iloc[holdout_rows], holdout_pred, baseline_mae_train
    )
    for bucket, score in zip(fold_metric_lists, fold_scores):
        bucket.append(score)

# Average each metric across the folds.
cv_results = {
    name: np.mean(bucket) for name, bucket in zip(metric_names, fold_metric_lists)
}

# Refit on the full training split and score the held-out test split.
rf_reg.fit(X_train, y_train)
y_test_pred = rf_reg.predict(X_test)

test_scores = compute_metrics(y_test, y_test_pred, baseline_mae_test)
test_r2, test_mae, test_rmse, test_mape, test_mase, test_rmsle = test_scores
test_results = dict(zip(metric_names, test_scores))

# Report both result sets with four decimal places.
for banner, results in (
    ("\n===== CV Results =====", cv_results),
    ("\n===== Test Results =====", test_results),
):
    print(banner)
    for k, v in results.items():
        print(f"{k}: {v:.4f}")


===== CV Results =====
R2: 0.6035
MAE: 153305.2138
RMSE: 268816.5477
MAPE: 73.4974
MASE: 0.6331
RMSLE: 0.6462

===== Test Results =====
R2: 0.6077
MAE: 153001.8126
RMSE: 267785.3347
MAPE: 73.1570
MASE: 0.6299
RMSLE: 0.6445


In [4]:
# Random forest for this run: min_samples_leaf=150 (n_estimators=200, max_depth=25).
rf_reg = RandomForestRegressor(
    n_estimators=200,
    max_depth=25,
    min_samples_leaf=150,
    max_samples=0.5,
    max_features=8,
    n_jobs=-1,
    random_state=42,
)

# Shuffled 5-fold cross-validation over the training split.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# One list of per-fold scores per metric, in the order compute_metrics returns them.
metric_names = ("R2", "MAE", "RMSE", "MAPE", "MASE", "RMSLE")
cv_r2, cv_mae, cv_rmse, cv_mape, cv_mase, cv_rmsle = ([] for _ in range(6))
fold_metric_lists = (cv_r2, cv_mae, cv_rmse, cv_mape, cv_mase, cv_rmsle)

for fit_rows, holdout_rows in kf.split(X_train):
    # Refit on the fold's training rows, then score on its held-out rows.
    rf_reg.fit(X_train.iloc[fit_rows], y_train.iloc[fit_rows])
    holdout_pred = rf_reg.predict(X_train.iloc[holdout_rows])
    fold_scores = compute_metrics(
        y_train.iloc[holdout_rows], holdout_pred, baseline_mae_train
    )
    for bucket, score in zip(fold_metric_lists, fold_scores):
        bucket.append(score)

# Average each metric across the folds.
cv_results = {
    name: np.mean(bucket) for name, bucket in zip(metric_names, fold_metric_lists)
}

# Refit on the full training split and score the held-out test split.
rf_reg.fit(X_train, y_train)
y_test_pred = rf_reg.predict(X_test)

test_scores = compute_metrics(y_test, y_test_pred, baseline_mae_test)
test_r2, test_mae, test_rmse, test_mape, test_mase, test_rmsle = test_scores
test_results = dict(zip(metric_names, test_scores))

# Report both result sets with four decimal places.
for banner, results in (
    ("\n===== CV Results =====", cv_results),
    ("\n===== Test Results =====", test_results),
):
    print(banner)
    for k, v in results.items():
        print(f"{k}: {v:.4f}")


===== CV Results =====
R2: 0.5919
MAE: 154928.3462
RMSE: 272724.3463
MAPE: 74.2656
MASE: 0.6398
RMSLE: 0.6500

===== Test Results =====
R2: 0.5963
MAE: 154555.9234
RMSE: 271662.4850
MAPE: 73.8639
MASE: 0.6363
RMSLE: 0.6480


In [5]:
# Random forest for this run: min_samples_leaf=200 (n_estimators=200, max_depth=25).
rf_reg = RandomForestRegressor(
    n_estimators=200,
    max_depth=25,
    min_samples_leaf=200,
    max_samples=0.5,
    max_features=8,
    n_jobs=-1,
    random_state=42,
)

# Shuffled 5-fold cross-validation over the training split.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# One list of per-fold scores per metric, in the order compute_metrics returns them.
metric_names = ("R2", "MAE", "RMSE", "MAPE", "MASE", "RMSLE")
cv_r2, cv_mae, cv_rmse, cv_mape, cv_mase, cv_rmsle = ([] for _ in range(6))
fold_metric_lists = (cv_r2, cv_mae, cv_rmse, cv_mape, cv_mase, cv_rmsle)

for fit_rows, holdout_rows in kf.split(X_train):
    # Refit on the fold's training rows, then score on its held-out rows.
    rf_reg.fit(X_train.iloc[fit_rows], y_train.iloc[fit_rows])
    holdout_pred = rf_reg.predict(X_train.iloc[holdout_rows])
    fold_scores = compute_metrics(
        y_train.iloc[holdout_rows], holdout_pred, baseline_mae_train
    )
    for bucket, score in zip(fold_metric_lists, fold_scores):
        bucket.append(score)

# Average each metric across the folds.
cv_results = {
    name: np.mean(bucket) for name, bucket in zip(metric_names, fold_metric_lists)
}

# Refit on the full training split and score the held-out test split.
rf_reg.fit(X_train, y_train)
y_test_pred = rf_reg.predict(X_test)

test_scores = compute_metrics(y_test, y_test_pred, baseline_mae_test)
test_r2, test_mae, test_rmse, test_mape, test_mase, test_rmsle = test_scores
test_results = dict(zip(metric_names, test_scores))

# Report both result sets with four decimal places.
for banner, results in (
    ("\n===== CV Results =====", cv_results),
    ("\n===== Test Results =====", test_results),
):
    print(banner)
    for k, v in results.items():
        print(f"{k}: {v:.4f}")


===== CV Results =====
R2: 0.5883
MAE: 155407.3917
RMSE: 273895.9711
MAPE: 74.4920
MASE: 0.6418
RMSLE: 0.6511

===== Test Results =====
R2: 0.5926
MAE: 155042.8159
RMSE: 272887.1662
MAPE: 74.0788
MASE: 0.6384
RMSLE: 0.6491


In [6]:
# Random forest for this run: min_samples_leaf=250 (n_estimators=200, max_depth=25).
rf_reg = RandomForestRegressor(
    n_estimators=200,
    max_depth=25,
    min_samples_leaf=250,
    max_samples=0.5,
    max_features=8,
    n_jobs=-1,
    random_state=42,
)

# Shuffled 5-fold cross-validation over the training split.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# One list of per-fold scores per metric, in the order compute_metrics returns them.
metric_names = ("R2", "MAE", "RMSE", "MAPE", "MASE", "RMSLE")
cv_r2, cv_mae, cv_rmse, cv_mape, cv_mase, cv_rmsle = ([] for _ in range(6))
fold_metric_lists = (cv_r2, cv_mae, cv_rmse, cv_mape, cv_mase, cv_rmsle)

for fit_rows, holdout_rows in kf.split(X_train):
    # Refit on the fold's training rows, then score on its held-out rows.
    rf_reg.fit(X_train.iloc[fit_rows], y_train.iloc[fit_rows])
    holdout_pred = rf_reg.predict(X_train.iloc[holdout_rows])
    fold_scores = compute_metrics(
        y_train.iloc[holdout_rows], holdout_pred, baseline_mae_train
    )
    for bucket, score in zip(fold_metric_lists, fold_scores):
        bucket.append(score)

# Average each metric across the folds.
cv_results = {
    name: np.mean(bucket) for name, bucket in zip(metric_names, fold_metric_lists)
}

# Refit on the full training split and score the held-out test split.
rf_reg.fit(X_train, y_train)
y_test_pred = rf_reg.predict(X_test)

test_scores = compute_metrics(y_test, y_test_pred, baseline_mae_test)
test_r2, test_mae, test_rmse, test_mape, test_mase, test_rmsle = test_scores
test_results = dict(zip(metric_names, test_scores))

# Report both result sets with four decimal places.
for banner, results in (
    ("\n===== CV Results =====", cv_results),
    ("\n===== Test Results =====", test_results),
):
    print(banner)
    for k, v in results.items():
        print(f"{k}: {v:.4f}")


===== CV Results =====
R2: 0.5855
MAE: 155794.7258
RMSE: 274847.1620
MAPE: 74.6863
MASE: 0.6434
RMSLE: 0.6521

===== Test Results =====
R2: 0.5899
MAE: 155417.3711
RMSE: 273806.4299
MAPE: 74.2638
MASE: 0.6399
RMSLE: 0.6501


<h1>Finding: min_samples_leaf=50 is the best among 50, 100, 150, 200, and 250</h1>