In [158]:
import pandas as pd
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, make_scorer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.multioutput import MultiOutputRegressor
import joblib
import numpy as np
import matplotlib.pyplot as plt

In [160]:
# Load the dataset from a CSV file; the path is relative, so it is resolved
# against the kernel's current working directory.
df = pd.read_csv("final_dataset.csv")

In [161]:
# Convert "Date" to a datetime column, and derive the hour of day from the
# "Time" strings, which are parsed strictly as HH:MM:SS.
df["Date"] = df["Date"].pipe(pd.to_datetime)
df["Hour"] = df["Time"].pipe(pd.to_datetime, format="%H:%M:%S").dt.hour

In [164]:
# Split the parsed "Date" column into calendar month and day-of-month features.
df["Month"], df["Day"] = df["Date"].dt.month, df["Date"].dt.day

In [166]:
# Model inputs: location, time-derived features and the one categorical column.
feature_columns = [
    "Latitude",
    "Longitude",
    "Hour",
    "Month",
    "Day",
    "Cultural_activity_prefered",
]
# Three regression targets that are predicted jointly.
target_columns = ["Total crowd", "Taxi zone crowd score", "Activity Score"]

X = df[feature_columns]
y = df[target_columns]

In [168]:
# Only the categorical column is one-hot encoded; categories not seen during
# fitting are ignored (handle_unknown="ignore") instead of raising an error.
categorical_features = ["Cultural_activity_prefered"]
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

# Every column not listed above is passed through unchanged.
preprocessor = ColumnTransformer(
    [("cat", categorical_transformer, categorical_features)],
    remainder="passthrough",
)

In [170]:
# Hold out 30% of the rows for testing; random_state=42 fixes the split so it
# is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [172]:
# Preprocessing followed by a multi-output regressor that wraps a
# 100-tree random forest with a fixed seed.
rf_pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        (
            "regressor",
            MultiOutputRegressor(
                estimator=RandomForestRegressor(n_estimators=100, random_state=42)
            ),
        ),
    ]
)

In [174]:
# Fit the preprocessing step and the regressors on the training split only.
rf_pipeline.fit(X_train, y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [176]:
# Evaluate the fitted pipeline on the held-out split, one target at a time.
y_pred = rf_pipeline.predict(X_test)

print("Random Forest Evaluation:")
r2_scores = []
rmse_scores = []

for i, target in enumerate(y.columns):
    y_true_i = y_test.iloc[:, i]
    y_pred_i = y_pred[:, i]

    r2 = r2_score(y_true_i, y_pred_i)
    # RMSE is computed as sqrt(MSE) rather than with `squared=False`: that
    # keyword was deprecated in scikit-learn 1.4 and removed in 1.6, where
    # passing it raises a TypeError.
    rmse = np.sqrt(mean_squared_error(y_true_i, y_pred_i))

    r2_scores.append(r2)
    rmse_scores.append(rmse)
    print(f"{target}: R2 = {r2:.4f}, RMSE = {rmse:.2f}")

print("\n[Random Forest Summary]")
print(f"Average R2 Score: {sum(r2_scores)/len(r2_scores):.4f}")
# NOTE(review): each RMSE is in the units of its own target, so this plain
# average is dominated by whichever target has the largest scale.
print(f"Average RMSE: {sum(rmse_scores)/len(rmse_scores):.2f}")

Random Forest Evaluation:
Total crowd: R2 = 0.9558, RMSE = 1223.70
Taxi zone crowd score: R2 = 0.9559, RMSE = 0.17
Activity Score: R2 = 0.9998, RMSE = 0.03

[Random Forest Summary]
Average R2 Score: 0.9705
Average RMSE: 407.97




In [178]:
def multi_r2_score(y_true, y_pred):
    """Return the unweighted mean of the per-target R2 scores.

    Delegates to ``r2_score`` with ``multioutput="uniform_average"``, which
    computes R2 for each output column and averages the results with equal
    weight. Unlike a manual column loop, this also accepts 1-D targets,
    pandas Series/DataFrames and plain array-likes.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,) or (n_samples, n_outputs)
        Ground-truth target values.
    y_pred : array-like of the same shape as ``y_true``
        Predicted target values.

    Returns
    -------
    float
        Mean R2 across all output columns.
    """
    return r2_score(y_true, y_pred, multioutput="uniform_average")


# Scorer wrapper so the metric can be passed as `scoring=`; higher is better.
r2_scorer = make_scorer(multi_r2_score, greater_is_better=True)

In [180]:
# 5-fold cross-validation with shuffled, seeded folds, scored with the mean
# per-target R2.
# NOTE(review): rf_scores is not displayed here and is not referenced again in
# the visible cells.
cv = KFold(n_splits=5, shuffle=True, random_state=42)
rf_scores = cross_val_score(
    estimator=rf_pipeline,
    X=X,
    y=y,
    scoring=r2_scorer,
    cv=cv,
)

In [182]:
# Second cross-validation run, passing an integer fold count.
# NOTE(review): cv=5 is not the shuffled `cv` object defined in the previous
# cell; per the scikit-learn docs an integer selects an unshuffled KFold here,
# so the folds follow row order. Confirm that this is intended.
scores = cross_val_score(
    estimator=rf_pipeline,
    X=X,
    y=y,
    scoring=r2_scorer,
    cv=5,
)
print("R2 per fold:", scores)
print("Mean R2:", scores.mean())

R2 per fold: [0.78953624 0.9472438  0.95220675 0.92684738 0.93018263]
Mean R2: 0.9092033616660877


In [154]:
# Persist the pipeline object to disk with joblib.
# NOTE(review): this cell's execution count (154) is lower than that of the
# preceding cells, so the saved file may predate the latest fit; re-run this
# cell after fitting to be sure the artifact matches the evaluated model.
joblib.dump(rf_pipeline, "random_forest_model.pkl")

['random_forest_model.pkl']