In [20]:
# Import required library dependencies
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

In [21]:
# Read the cleaned, encoded dataset from the outputs folder.
ce_data = pd.read_csv('../outputs/airbnb_cleaned_encoded.csv')

# 'price' is the regression target; every remaining column is a feature.
y = ce_data['price']
X = ce_data.drop(columns='price')

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

In [22]:
# Candidate values for each XGBoost hyperparameter; the randomized search
# samples combinations from these lists.
parameter_grid = {
    'n_estimators': [300, 500, 800, 1000],
    'max_depth': [3, 5, 7, 10, 12],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'gamma': [0, 1, 3, 5],
    'min_child_weight': [1, 3, 5],
}

# Base regressor; the search clones it for every candidate/fold.
xgb = XGBRegressor(random_state=42)

# Randomized search over the space above, scored by R² with 5-fold CV.
random_search = RandomizedSearchCV(
    xgb,
    parameter_grid,
    n_iter=52,        # number of sampled candidates; raise for a deeper search
    cv=5,
    scoring='r2',
    random_state=42,  # makes the sampled candidates reproducible
    n_jobs=-1,        # use all available cores
    verbose=2,        # log every individual fit
)

# Fit on the training split only; the test split stays untouched here.
random_search.fit(X_train, y_train)

# Report the winning configuration and its mean cross-validated score.
print("✅ Best Parameters:", random_search.best_params_)
print("📊 Best CV R² Score:", random_search.best_score_)



Fitting 5 folds for each of 52 candidates, totalling 260 fits
[CV] END colsample_bytree=1.0, gamma=3, learning_rate=0.01, max_depth=5, min_child_weight=5, n_estimators=1000, subsample=0.8; total time=   0.6s
[CV] END colsample_bytree=1.0, gamma=3, learning_rate=0.01, max_depth=5, min_child_weight=5, n_estimators=1000, subsample=0.8; total time=   0.6s
[CV] END colsample_bytree=1.0, gamma=3, learning_rate=0.01, max_depth=5, min_child_weight=5, n_estimators=1000, subsample=0.8; total time=   0.6s
[CV] END colsample_bytree=1.0, gamma=3, learning_rate=0.01, max_depth=5, min_child_weight=5, n_estimators=1000, subsample=0.8; total time=   0.6s
[CV] END colsample_bytree=1.0, gamma=3, learning_rate=0.01, max_depth=5, min_child_weight=5, n_estimators=1000, subsample=0.8; total time=   0.7s
[CV] END colsample_bytree=0.6, gamma=1, learning_rate=0.01, max_depth=10, min_child_weight=5, n_estimators=800, subsample=1.0; total time=   0.7s
[CV] END colsample_bytree=0.6, gamma=1, learning_rate=0.01, ma

In [23]:
# Evaluate the best estimator found by the randomized search on the
# held-out test split.
best_xgb = random_search.best_estimator_
y_pred_best = best_xgb.predict(X_test)

# RMSE is computed as sqrt(MSE) instead of passing `squared=False`: that
# keyword was deprecated in scikit-learn 1.4 and removed in 1.6, where it
# raises a TypeError. Taking the square root works on every version.
rmse_best = np.sqrt(mean_squared_error(y_test, y_pred_best))
mae_best = mean_absolute_error(y_test, y_pred_best)
r2_best = r2_score(y_test, y_pred_best)

# Summarise the test-set metrics.
print("📉 Tuned XGBoost Test Results:")
print(f"RMSE: {rmse_best:.2f}")
print(f"MAE: {mae_best:.2f}")
print(f"R²: {r2_best:.4f}")


# Not convincing: the tuned model's test-set scores are still unsatisfactory.

📉 Tuned XGBoost Test Results:
RMSE: 57.90
MAE: 33.69
R²: 0.6272
