In [19]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV, KFold
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error, r2_score, make_scorer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.multioutput import MultiOutputRegressor
import joblib
import warnings
warnings.filterwarnings('ignore')

In [20]:

df = pd.read_csv("final_dataset.csv")

In [21]:

df["Hour"] = pd.to_datetime(df["Time"], format="%H:%M:%S").dt.hour
df["Date"] = pd.to_datetime(df["Date"])
df["Month"] = df["Date"].dt.month
df["Day"] = df["Date"].dt.day

In [22]:

X = df[["Latitude", "Longitude", "Hour", "Month", "Day", "Cultural_activity_prefered"]]
y = df[["Total crowd", "Taxi zone crowd score", "Activity Score"]]

In [23]:

categorical_features = ["Cultural_activity_prefered"]
categorical_transformer = OneHotEncoder(handle_unknown="ignore")
preprocessor = ColumnTransformer(
    transformers=[("cat", categorical_transformer, categorical_features)],
    remainder="passthrough"
)

In [24]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [25]:

lgbm_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("regressor", MultiOutputRegressor(
        LGBMRegressor(
            objective='regression',
            boosting_type='gbdt',    
            verbose=-1,              
            random_state=42,
            force_col_wise=True     
        )
    ))
])

In [26]:

param_grid = {
   
    "regressor__estimator__n_estimators": [100, 300, 500, 800],
    "regressor__estimator__learning_rate": [0.01, 0.05, 0.1, 0.2],
    
    
    "regressor__estimator__max_depth": [3, 6, 9, 12],
    "regressor__estimator__num_leaves": [31, 50, 100, 150],
    
    
    "regressor__estimator__reg_alpha": [0, 0.01, 0.1, 1.0],     
    "regressor__estimator__reg_lambda": [0, 0.01, 0.1, 1.0],     
    
   
    "regressor__estimator__subsample": [0.6, 0.8, 0.9, 1.0],     
    "regressor__estimator__colsample_bytree": [0.6, 0.8, 0.9, 1.0], 
    
   
    "regressor__estimator__min_child_samples": [10, 20, 30],     
    "regressor__estimator__min_split_gain": [0.0, 0.1, 0.2],     
}

In [27]:

random_search = RandomizedSearchCV(
    estimator=lgbm_pipeline,
    param_distributions=param_grid,
    n_iter=30,                          
    scoring="neg_mean_squared_error",
    cv=3,                           
    verbose=2,
    n_jobs=-1,                         
    random_state=42
)

In [28]:

print("Starting LightGBM parameter tuning...")
random_search.fit(X_train, y_train)

print("Best LightGBM parameters found:")
print(random_search.best_params_)
print(f"Best cross-validation score: {-random_search.best_score_:.4f}")

Starting LightGBM parameter tuning...
Fitting 3 folds for each of 30 candidates, totalling 90 fits
[CV] END regressor__estimator__colsample_bytree=0.8, regressor__estimator__learning_rate=0.2, regressor__estimator__max_depth=3, regressor__estimator__min_child_samples=10, regressor__estimator__min_split_gain=0.1, regressor__estimator__n_estimators=100, regressor__estimator__num_leaves=50, regressor__estimator__reg_alpha=0.1, regressor__estimator__reg_lambda=0.1, regressor__estimator__subsample=0.9; total time=   2.6s
[CV] END regressor__estimator__colsample_bytree=0.8, regressor__estimator__learning_rate=0.2, regressor__estimator__max_depth=3, regressor__estimator__min_child_samples=10, regressor__estimator__min_split_gain=0.1, regressor__estimator__n_estimators=100, regressor__estimator__num_leaves=50, regressor__estimator__reg_alpha=0.1, regressor__estimator__reg_lambda=0.1, regressor__estimator__subsample=0.9; total time=   2.6s
[CV] END regressor__estimator__colsample_bytree=0.8, re

In [29]:

best_model = random_search.best_estimator_

In [None]:

y_pred = best_model.predict(X_test)

print("LightGBM Evaluation:")
r2_scores = []
rmse_scores = []

for i, target in enumerate(y_test.columns):
    r2 = r2_score(y_test.iloc[:, i], y_pred[:, i])
  
    mse = mean_squared_error(y_test.iloc[:, i], y_pred[:, i])
    rmse = np.sqrt(mse)
    r2_scores.append(r2)
    rmse_scores.append(rmse)
    print(f"{target}: R² = {r2:.4f}, RMSE = {rmse:.2f}")

print(f"\nAverage Metrics Across All Targets:")
print(f"Mean R² = {np.mean(r2_scores):.4f}")
print(f"Mean RMSE = {np.mean(rmse_scores):.2f}")

LightGBM Evaluation:
Total crowd: R² = 0.9359, RMSE = 704.11
Taxi zone crowd score: R² = 0.8662, RMSE = 0.14
Activity Score: R² = 0.9953, RMSE = 0.07

Average Metrics Across All Targets:
Mean R² = 0.9325
Mean RMSE = 234.77


In [31]:

def multioutput_r2(y_true, y_pred):
    y_true = np.array(y_true) if isinstance(y_true, pd.DataFrame) else y_true
    y_pred = np.array(y_pred) if isinstance(y_pred, pd.DataFrame) else y_pred
    return np.mean([r2_score(y_true[:, i], y_pred[:, i]) for i in range(y_true.shape[1])])

def multioutput_rmse(y_true, y_pred):
    y_true = np.array(y_true) if isinstance(y_true, pd.DataFrame) else y_true
    y_pred = np.array(y_pred) if isinstance(y_pred, pd.DataFrame) else y_pred
    return np.mean([np.sqrt(mean_squared_error(y_true[:, i], y_pred[:, i])) 
                   for i in range(y_true.shape[1])])

In [None]:

cv = KFold(n_splits=5, shuffle=True, random_state=42)

print("\nPerforming cross-validation...")
r2_scores_cv = cross_val_score(lgbm_pipeline, X, y, cv=cv, scoring=make_scorer(multioutput_r2))
rmse_scores_cv = cross_val_score(lgbm_pipeline, X, y, cv=cv, 
                                scoring=make_scorer(multioutput_rmse, greater_is_better=False))

print("\nCross-validated results:")
print("R² scores per fold:", r2_scores_cv)
print("Mean R²:", r2_scores_cv.mean())
print("R² std:", r2_scores_cv.std())
print("\nRMSE scores per fold:", -rmse_scores_cv)
print("Mean RMSE:", -rmse_scores_cv.mean())
print("RMSE std:", rmse_scores_cv.std())


Performing cross-validation...

Cross-validated results:
R² scores per fold: [0.9542476  0.93986912 0.91969017 0.93085705 0.93949816]
Mean R²: 0.9368324206554351
R² std: 0.01139454290165753

RMSE scores per fold: [241.00840328 286.66948067 318.44505074 270.92015419 264.33752053]
Mean RMSE: 276.2761218809933
RMSE std: 25.695404395374386


In [33]:


fitted_preprocessor = best_model.named_steps['preprocessor']
cat_feature_names = fitted_preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features).tolist()


numerical_feature_names = ['Latitude', 'Longitude', 'Hour', 'Month', 'Day']


all_feature_names = cat_feature_names + numerical_feature_names


importances = []
for estimator in best_model.named_steps['regressor'].estimators_:
    importances.append(estimator.feature_importances_)

mean_importance = np.mean(importances, axis=0)
feature_importance_df = pd.DataFrame({
    'feature': all_feature_names,
    'importance': mean_importance
}).sort_values('importance', ascending=False)

print("\nTop 10 Most Important Features:")
print(feature_importance_df.head(10))


Top 10 Most Important Features:
                                           feature   importance
9                                              Day  6321.666667
5                                         Latitude  5163.000000
7                                             Hour  5064.333333
6                                        Longitude  4452.333333
8                                            Month  3294.666667
2            Cultural_activity_prefered_Filmmaking   201.333333
1               Cultural_activity_prefered_Busking   143.000000
0              Cultural_activity_prefered_Art Sale   120.000000
4    Cultural_activity_prefered_Street photography    89.666667
3  Cultural_activity_prefered_Portrait photography    88.333333


In [None]:

joblib.dump(best_model, "lightgbm_model.pkl")
print("\nModel saved as 'lightgbm_model.pkl'")


feature_importance_df.to_csv("lightgbm_feature_importance.csv", index=False)
print("Feature importance saved as 'lightgbm_feature_importance.csv'")


Model saved as 'lightgbm_model.pkl'
Feature importance saved as 'lightgbm_feature_importance.csv'
