In [None]:
import pandas as pd
import sys
import os

# Make project-local packages importable: the parent of the current working
# directory is treated as the project root and appended to sys.path once.
# NOTE(review): this relies on the kernel's working directory being one level
# below the project root — confirm when running from elsewhere.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)

from src.utils.forecasting import forecast_and_evaluate
from src.utils.cluster_zone import cluster_zones_with_kpi

## Forecast and evaluate
**Objective:** Apply time series forecasting and evaluation to the cleaned January data using the `forecast_and_evaluate` function (from `src.utils.forecasting`).
* **Hourly forecasting:** Forecast trip data at hourly frequency ('H') with 168 test periods and ARIMA order (1, 0, 1), then display metrics and predictions.
* **Daily forecasting:** Forecast at daily frequency ('D') with 7 test periods and the same ARIMA order, then display metrics and predictions.

### Hourly

In [None]:
# Load the cleaned January 2021 trips. `df1` is also the input of the daily
# forecast cell and of the single-month clustering cell further down.
df1 = pd.read_parquet(
    "../processed/cleaned_data/cleaned_yellow_tripdata_2021-01.parquet"
)

# Hourly forecast with ARIMA order (1, 0, 1) and 168 test periods
# (7 days x 24 hours).
df1_forecast_hourly = forecast_and_evaluate(
    df1=df1, freq='H', test_periods=7 * 24, arima_order=(1, 0, 1)
)

In [None]:
# Show the 'metrics' entry of the hourly forecast result.
df1_forecast_hourly['metrics']

In [None]:
# Show the 'predictions' entry of the hourly forecast result.
df1_forecast_hourly['predictions']

### Daily

In [None]:
# Daily forecast on the same January frame: ARIMA order (1, 0, 1) with
# 7 test periods, i.e. 7 test days at daily frequency.
df1_forecast_daily = forecast_and_evaluate(
    df1=df1, freq='D', test_periods=7, arima_order=(1, 0, 1)
)

In [None]:
# Show the 'metrics' entry of the daily forecast result.
df1_forecast_daily['metrics']

In [None]:
# Show the 'predictions' entry of the daily forecast result.
df1_forecast_daily['predictions']

## Cluster Zones by Temporal–KPI Patterns
**Objective:** Group taxi zones into homogeneous clusters based on their temporal demand and performance characteristics using unsupervised learning.

* **Data Aggregation:** Aggregate trip-level data at the *zone–time-bin* level (zone × hour / time window), computing key KPIs such as number of trips, median (p50) and tail (p95) trip duration, and demand index.
* **Feature Construction:** Represent each zone by a KPI vector capturing its typical operational pattern across time (e.g., rush-hour intensity, variability, and extreme congestion behavior).
* **Normalization:** Apply feature scaling (e.g., standardization) to ensure all KPIs contribute comparably to the distance metric used in clustering.
* **Clustering Method:** Use **K-means clustering** to partition zones into groups with similar demand–performance profiles.
* **Cluster Interpretation:** Assign semantic labels to clusters (e.g., *High-demand core*, *Rush-hour dominant*, *Low-activity peripheral*) based on centroid characteristics and temporal patterns.
* **Analytical Purpose:** Enable comparative analysis across zone types, support operational insights, and provide a structured basis for downstream tasks such as forecasting, anomaly detection, or policy evaluation.


In [None]:
# Perform integrated clustering for January 2021: load the month's QA flags
# and pass them, together with the cleaned trips in `df1`, to
# cluster_zones_with_kpi.
qa_flags = pd.read_parquet("../processed/flags_for_analysis/flag_yellow_tripdata_2021-01.parquet")
# NOTE(review): despite its name, `clustered_df` holds the raw return value of
# the call, not a single frame — the next cell indexes element 0 of it, and
# the 12-month cell unpacks the same call into a (frame, centroids) pair.
clustered_df = cluster_zones_with_kpi(df1, qa_flags)

In [None]:
# Preview the first rows of element 0 of the clustering result
# (presumably the clustered frame; the 12-month cell unpacks the same call
# into a (frame, centroids) pair — TODO confirm).
clustered_df[0].head()

## Cluster Zones for All 12 Months of 2021

In [None]:
# Cluster every month of 2021 and cache each month's result as a parquet file.
# Months whose output file already exists are skipped, so the cell is cheap to
# re-run after a partial run.

# Define output directory
output_dir = "../processed/cluster_zone"
os.makedirs(output_dir, exist_ok=True)

# Generate month list from 01 to 12
months = [f"{i:02d}" for i in range(1, 13)]

for month in months:
    output_path = os.path.join(output_dir, f"clustered_yellow_tripdata_2021-{month}.parquet")

    # Skip processing if the output file already exists
    if os.path.exists(output_path):
        continue

    # Only log and process if file is missing
    print(f"Processing month {month}...")

    # Load input datasets
    df_path = f"../processed/cleaned_data/cleaned_yellow_tripdata_2021-{month}.parquet"
    qa_path = f"../processed/flags_for_analysis/flag_yellow_tripdata_2021-{month}.parquet"

    # Loop-local names on purpose: rebinding the notebook-level `df1`,
    # `qa_flags` or `clustered_df` here would silently change what the
    # January forecasting / clustering cells see when they are re-run.
    month_trips = pd.read_parquet(df_path)
    month_flags = pd.read_parquet(qa_path)

    # Execute clustering algorithm; the call yields a (frame, centroids) pair
    # and only the frame is persisted.
    month_clustered, _month_centroids = cluster_zones_with_kpi(month_trips, month_flags)

    # Save results to parquet. Write to a temporary sibling file and rename it
    # into place, so an interrupted write never leaves a partial file that the
    # existence check above would treat as finished on the next run.
    tmp_path = f"{output_path}.tmp"
    month_clustered.to_parquet(tmp_path)
    os.replace(tmp_path, output_path)

    print(f"Successfully processed and saved month {month}")