In [91]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV, KFold
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score, make_scorer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.multioutput import MultiOutputRegressor
import joblib

In [93]:
df = pd.read_csv("final_dataset.csv")

In [94]:
df["Hour"] = pd.to_datetime(df["Time"], format="%H:%M:%S").dt.hour
df["Date"] = pd.to_datetime(df["Date"])

In [97]:

df["Month"] = df["Date"].dt.month
df["Day"] = df["Date"].dt.day
 

In [99]:
X = df[["Latitude", "Longitude", "Hour", "Month","Day","Cultural_activity_prefered"]]
y = df[["Total crowd", "Taxi zone crowd score", "Activity Score"]]

In [101]:
categorical_features = ["Cultural_activity_prefered"]
categorical_transformer = OneHotEncoder(handle_unknown="ignore")
preprocessor = ColumnTransformer(
    transformers=[("cat", categorical_transformer, categorical_features)],
    remainder="passthrough"
)

In [103]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [105]:
xgb_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("regressor", MultiOutputRegressor(
        XGBRegressor(objective="reg:squarederror", tree_method="hist")
    ))
])

In [107]:
param_grid = {
    "regressor__estimator__n_estimators": [100, 300, 500],
    "regressor__estimator__learning_rate": [0.01, 0.05, 0.1],
    "regressor__estimator__max_depth": [3, 5, 7],
    "regressor__estimator__subsample": [0.6, 0.8, 1.0],
    "regressor__estimator__colsample_bytree": [0.6, 0.8, 1.0],
    "regressor__estimator__gamma": [0, 0.1, 0.3],
    "regressor__estimator__reg_alpha": [0, 0.01, 0.1],
    "regressor__estimator__reg_lambda": [0.5, 1.0, 1.5],
}

In [109]:
random_search = RandomizedSearchCV(
    estimator=xgb_pipeline,
    param_distributions=param_grid,
    n_iter=30,               
    scoring="neg_mean_squared_error",
    cv=3,
    verbose=2,
    n_jobs=-1,
    random_state=42
)

In [111]:
random_search.fit(X_train, y_train)

print("Best parameters found:")
print(random_search.best_params_)

Fitting 3 folds for each of 30 candidates, totalling 90 fits
Best parameters found:
{'regressor__estimator__subsample': 1.0, 'regressor__estimator__reg_lambda': 0.5, 'regressor__estimator__reg_alpha': 0, 'regressor__estimator__n_estimators': 300, 'regressor__estimator__max_depth': 7, 'regressor__estimator__learning_rate': 0.1, 'regressor__estimator__gamma': 0, 'regressor__estimator__colsample_bytree': 1.0}


In [113]:
best_model = random_search.best_estimator_

In [115]:
y_pred = best_model.predict(X_test)

print("Tuned XGBoost Evaluation:")
r2_scores = []
rmse_scores = []

for i, target in enumerate(y_test.columns):
    r2 = r2_score(y_test.iloc[:, i], y_pred[:, i])
    rmse = mean_squared_error(y_test.iloc[:, i], y_pred[:, i], squared=False)
    r2_scores.append(r2)
    rmse_scores.append(rmse)
    print(f"{target}: R² = {r2:.4f}, RMSE = {rmse:.2f}")

print("\n Average Metrics Across All Targets:")
print(f"Mean R² = {np.mean(r2_scores):.4f}")
print(f"Mean RMSE = {np.mean(rmse_scores):.2f}")

Tuned XGBoost Evaluation:
Total crowd: R² = 0.9391, RMSE = 1436.47
Taxi zone crowd score: R² = 0.9377, RMSE = 0.20
Activity Score: R² = 0.9983, RMSE = 0.09

 Average Metrics Across All Targets:
Mean R² = 0.9584
Mean RMSE = 478.92




In [117]:
def multioutput_r2(y_true, y_pred):
    y_true = np.array(y_true) if isinstance(y_true, pd.DataFrame) else y_true
    y_pred = np.array(y_pred) if isinstance(y_pred, pd.DataFrame) else y_pred
    return np.mean([r2_score(y_true[:, i], y_pred[:, i]) for i in range(y_true.shape[1])])


In [119]:
def multioutput_rmse(y_true, y_pred):
    y_true = np.array(y_true) if isinstance(y_true, pd.DataFrame) else y_true
    y_pred = np.array(y_pred) if isinstance(y_pred, pd.DataFrame) else y_pred
    return np.mean([mean_squared_error(y_true[:, i], y_pred[:, i], squared=False) for i in range(y_true.shape[1])])

In [121]:
cv = KFold(n_splits=5, shuffle=True, random_state=42)


In [123]:
r2_scores = cross_val_score(xgb_pipeline, X, y, cv=cv, scoring=make_scorer(multioutput_r2))
rmse_scores = cross_val_score(xgb_pipeline, X, y, cv=cv, scoring=make_scorer(multioutput_rmse, greater_is_better=False))
print("\nCross-validated R² scores:", r2_scores)
print("Mean R²:", r2_scores.mean())
print("Cross-validated RMSE scores:", -rmse_scores)
print("Mean RMSE:", -rmse_scores.mean())




Cross-validated R² scores: [0.94739889 0.94623216 0.94715355 0.94636767 0.94751081]
Mean R²: 0.9469326156503154
Cross-validated RMSE scores: [539.89487922 543.32241999 534.60939624 540.5182873  536.74132684]
Mean RMSE: 539.0172619175693




In [82]:
joblib.dump(best_model, "xgboost_model.pkl")

['xgboost_model.pkl']