In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV, KFold
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error, r2_score, make_scorer
from sklearn.multioutput import MultiOutputRegressor
import joblib


# Load the dataset and derive the time-based features from its "Time" and
# "Date" columns.
df = pd.read_csv("final_dataset.csv")

# "Time" is parsed as HH:MM:SS; only the hour component is kept.
parsed_time = pd.to_datetime(df["Time"], format="%H:%M:%S")
df["Hour"] = parsed_time.dt.hour

# "Date" is converted to datetime in place (pandas infers the format), then
# its month and day-of-month are exposed as separate columns.
df["Date"] = pd.to_datetime(df["Date"])
date_parts = df["Date"].dt
df["Month"] = date_parts.month
df["Day"] = date_parts.day

# Input features and the three targets that are predicted together.
feature_columns = [
    "Latitude",
    "Longitude",
    "Hour",
    "Month",
    "Day",
    "Cultural_activity_prefered",
]
target_columns = ["Total crowd", "Taxi zone crowd score", "Activity Score"]

X = df[feature_columns]
y = df[target_columns]


# Columns handed to CatBoost as categorical features (referenced by name).
categorical_features = ["Cultural_activity_prefered"]

# Hold out 30% of the rows for the final evaluation; the fixed seed keeps the
# split reproducible between runs.
split_parts = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Base single-target regressor: silent training output and a fixed seed.
catboost_settings = {
    "cat_features": categorical_features,
    "verbose": False,
    "random_state": 42,
}
catboost_regressor = CatBoostRegressor(**catboost_settings)

# MultiOutputRegressor fits one clone of the base regressor per target column.
catboost_pipeline = MultiOutputRegressor(estimator=catboost_regressor)


# Candidate values for each CatBoost hyperparameter explored by the search.
catboost_param_values = {
    "iterations": [100, 300, 500],
    "learning_rate": [0.01, 0.05, 0.1],
    "depth": [3, 6, 9],
    "l2_leaf_reg": [1, 3, 5],
    "bagging_temperature": [0, 1, 2],
    "random_strength": [0, 1, 2],
}

# The "estimator__" prefix addresses the regressor wrapped inside
# MultiOutputRegressor, following scikit-learn's nested-parameter naming.
param_grid = {
    f"estimator__{param_name}": candidates
    for param_name, candidates in catboost_param_values.items()
}

# Randomized search: 20 sampled configurations, 3-fold CV, scored by negated
# mean squared error, with candidate fits run in parallel on all cores.
# NOTE(review): n_jobs=-1 may oversubscribe CPUs if CatBoost also uses all
# cores per fit -- confirm whether a thread limit on the regressor is wanted.
# NOTE(review): the MSE score is averaged over all targets; if the targets are
# on different scales the largest one dominates model selection -- confirm.
search_settings = {
    "estimator": catboost_pipeline,
    "param_distributions": param_grid,
    "n_iter": 20,
    "scoring": "neg_mean_squared_error",
    "cv": 3,
    "verbose": 2,
    "n_jobs": -1,
    "random_state": 42,
}
random_search = RandomizedSearchCV(**search_settings)
random_search.fit(X_train, y_train)

print("Best CatBoost parameters:", random_search.best_params_)


# Take the refitted best configuration from the search and predict on the
# held-out test rows.
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)

# Report R² and RMSE separately for every target; prediction columns are in
# the same order as the columns of y_test.
print("CatBoost Evaluation:")
for target, predicted in zip(y_test.columns, y_pred.T):
    actual = y_test[target]
    r2 = r2_score(actual, predicted)
    mse = mean_squared_error(actual, predicted)
    rmse = np.sqrt(mse)
    print(f"{target}: R² = {r2:.4f}, RMSE = {rmse:.2f}")

# Persist the fitted multi-output model; joblib.dump returns the list of
# written file names, which the notebook echoes as the cell result.
joblib.dump(best_model, "catboost_model.pkl")

Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV] END estimator__bagging_temperature=1, estimator__depth=3, estimator__iterations=500, estimator__l2_leaf_reg=1, estimator__learning_rate=0.05, estimator__random_strength=2; total time=   1.0s
[CV] END estimator__bagging_temperature=1, estimator__depth=3, estimator__iterations=500, estimator__l2_leaf_reg=1, estimator__learning_rate=0.05, estimator__random_strength=2; total time=   1.0s
[CV] END estimator__bagging_temperature=1, estimator__depth=3, estimator__iterations=500, estimator__l2_leaf_reg=1, estimator__learning_rate=0.05, estimator__random_strength=2; total time=   1.3s
[CV] END estimator__bagging_temperature=2, estimator__depth=3, estimator__iterations=300, estimator__l2_leaf_reg=1, estimator__learning_rate=0.01, estimator__random_strength=2; total time=   0.5s
[CV] END estimator__bagging_temperature=2, estimator__depth=3, estimator__iterations=300, estimator__l2_leaf_reg=1, estimator__learning_rate=0.01, estimato

['catboost_model.pkl']