In [1]:
import pandas as pd
import sys
import os

# Treat the parent of the current working directory as the project root and
# expose it on sys.path (once) so the `src.*` imports below can be resolved.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)

from src.utils.forecasting import forecast_and_evaluate
from src.utils.cluster_zone import cluster_zones_with_kpi

## Forecast and evaluate
**Objective:** Apply time series forecasting and evaluation to the cleaned January data using the `forecast_and_evaluate` function (from `src.utils.forecasting`).
* **Hourly forecasting:** Forecast trip data at hourly frequency ('H') with 168 test periods and ARIMA order (1, 0, 1), then display metrics and predictions.
* **Daily forecasting:** Forecast at daily frequency ('D') with 7 test periods and the same ARIMA order, then display metrics and predictions.

### Hourly forecast

In [2]:
# Load the cleaned January 2021 trips; `df1` is reused by later cells.
df1 = pd.read_parquet(
    "../processed/cleaned_data/cleaned_yellow_tripdata_2021-01.parquet"
)

# Hourly run: 168 test periods (7 x 24 hourly steps), ARIMA order (1, 0, 1).
# The result is indexed by 'metrics' and 'predictions' in the cells below.
df1_forecast_hourly = forecast_and_evaluate(
    df1=df1, freq='H', test_periods=168, arima_order=(1, 0, 1)
)

In [3]:
# Per-model error metrics (MAE, MAPE, RMSE) for the hourly forecast.
df1_forecast_hourly['metrics']

Unnamed: 0,MAE,MAPE,RMSE
Baseline,166.916667,0.114262,287.163903
ARIMA,1267.487062,3.724754,1414.31123
Linear Regression,1281.870999,4.993034,1401.112613


In [4]:
# Actual hourly values next to each model's predictions for the test window.
df1_forecast_hourly['predictions']

Unnamed: 0_level_0,Actual,Baseline,ARIMA,Linear Regression
hour,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2021-01-25 00:00:00-05:00,220,269,364.557096,1915.032717
2021-01-25 01:00:00-05:00,111,160,452.490704,1915.802725
2021-01-25 02:00:00-05:00,65,81,534.602998,1916.572733
2021-01-25 03:00:00-05:00,39,50,611.279355,1917.342741
2021-01-25 04:00:00-05:00,79,104,682.879642,1918.112749
...,...,...,...,...
2021-01-31 19:00:00-05:00,1428,1292,1692.815496,2040.544017
2021-01-31 20:00:00-05:00,887,1093,1692.816741,2041.314025
2021-01-31 21:00:00-05:00,735,872,1692.817905,2042.084033
2021-01-31 22:00:00-05:00,499,778,1692.818991,2042.854041


### Daily forecast

In [5]:
# Daily run on the same January frame: 7 test periods (7 test days) and the
# same ARIMA order as the hourly run.
df1_forecast_daily = forecast_and_evaluate(
    df1=df1, freq='D', test_periods=7, arima_order=(1, 0, 1)
)

In [6]:
# Per-model error metrics (MAE, MAPE, RMSE) for the daily forecast.
df1_forecast_daily['metrics']

Unnamed: 0,MAE,MAPE,RMSE
Baseline,2977.714286,0.06666,4136.511505
ARIMA,9485.773476,0.220426,9906.138448
Linear Regression,6379.14677,0.174013,8562.545157


In [7]:
# Actual daily values next to each model's predictions for the 7 test days.
df1_forecast_daily['predictions']

Unnamed: 0_level_0,Actual,Baseline,ARIMA,Linear Regression
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2021-01-25 00:00:00-05:00,44314,34864,33515.513602,45164.554348
2021-01-26 00:00:00-05:00,44948,47917,37900.422707,45527.378696
2021-01-27 00:00:00-05:00,49339,46685,39582.472579,45890.203043
2021-01-28 00:00:00-05:00,53085,50449,40227.706428,46253.027391
2021-01-29 00:00:00-05:00,51257,50984,40475.217946,46615.851739
2021-01-30 00:00:00-05:00,36799,39552,40570.163294,46978.676087
2021-01-31 00:00:00-05:00,29219,29110,40606.584302,47341.500435


## Cluster Zones by Temporal–KPI Patterns
**Objective:** Group taxi zones into homogeneous clusters based on their temporal demand and performance characteristics using unsupervised learning.

* **Data Aggregation:** Aggregate trip-level data at the *zone–time-bin* level (zone × hour / time window), computing key KPIs such as number of trips, median (p50) and tail (p95) trip duration, and demand index.
* **Feature Construction:** Represent each zone by a KPI vector capturing its typical operational pattern across time (e.g., rush-hour intensity, variability, and extreme congestion behavior).
* **Normalization:** Apply feature scaling (e.g., standardization) to ensure all KPIs contribute comparably to the distance metric used in clustering.
* **Clustering Method:** Use **K-means clustering** to partition zones into groups with similar demand–performance profiles.
* **Cluster Interpretation:** Assign semantic labels to clusters (e.g., *High-demand core*, *Rush-hour dominant*, *Low-activity peripheral*) based on centroid characteristics and temporal patterns.
* **Analytical Purpose:** Enable comparative analysis across zone types, support operational insights, and provide a structured basis for downstream tasks such as forecasting, anomaly detection, or policy evaluation.


In [8]:
# Cluster the January zones: load the matching QA flags, then run the
# integrated clustering on the cleaned trips loaded earlier (`df1`).
qa_flags = pd.read_parquet(
    "../processed/flags_for_analysis/flag_yellow_tripdata_2021-01.parquet"
)

# NOTE: `clustered_df` holds the function's full return value; the next cell
# indexes it with [0] to reach the clustered frame.
clustered_df = cluster_zones_with_kpi(df1, qa_flags)

In [9]:
# `cluster_zones_with_kpi` returns a pair (the monthly loop cell unpacks it into
# two names); element 0 is the clustered frame, previewed here.
clustered_df[0].head()

Unnamed: 0,zone,time_bin,duration_p50,duration_p95,trips_index_100,cluster,cluster_name
0,Allerton/Pelham Gardens,Early Morning,29.0,44.0,0.721475,1,Low Demand – Smooth Flow
1,Allerton/Pelham Gardens,Morning,24.0,36.8,1.082212,1,Low Demand – Smooth Flow
2,Allerton/Pelham Gardens,Morning Rush,14.0,34.5,6.132533,0,Efficient High Volume
3,Allerton/Pelham Gardens,Midday,16.0,44.25,16.353422,0,Efficient High Volume
4,Allerton/Pelham Gardens,Evening Rush,12.0,41.5,3.12639,0,Efficient High Volume


### Cluster zones for all 12 months of 2021

In [10]:
# Cluster every month of 2021 and cache each result as a parquet file.
#
# Loop-local names (`month_trips`, `month_flags`, `month_clustered`) are used on
# purpose: rebinding `df1`, `qa_flags` or `clustered_df` here would silently
# replace the January objects that the cells above display, and would change
# `clustered_df` from a tuple into a DataFrame (breaking `clustered_df[0]`).

# Define output directory
output_dir = "../processed/cluster_zone"
os.makedirs(output_dir, exist_ok=True)

# Zero-padded month labels "01" .. "12", matching the file-name convention
months = [f"{i:02d}" for i in range(1, 13)]

for month in months:
    output_path = os.path.join(output_dir, f"clustered_yellow_tripdata_2021-{month}.parquet")

    # Skip processing if the output file already exists (acts as a cache)
    if os.path.exists(output_path):
        continue

    # Only log and process if file is missing
    print(f"Processing month {month}...")

    # Load input datasets for this month
    df_path = f"../processed/cleaned_data/cleaned_yellow_tripdata_2021-{month}.parquet"
    qa_path = f"../processed/flags_for_analysis/flag_yellow_tripdata_2021-{month}.parquet"

    month_trips = pd.read_parquet(df_path)
    month_flags = pd.read_parquet(qa_path)

    # The function returns (clustered frame, centroids); only the frame is saved
    month_clustered, _month_centroids = cluster_zones_with_kpi(month_trips, month_flags)

    # Write to a temporary file first and rename afterwards, so an interrupted
    # run never leaves a partial file that the existence check would then skip.
    tmp_path = output_path + ".tmp"
    month_clustered.to_parquet(tmp_path)
    os.replace(tmp_path, output_path)

    print(f"Successfully processed and saved month {month}")

Processing month 01...
Successfully processed and saved month 01
Processing month 02...
Successfully processed and saved month 02
Processing month 03...
Successfully processed and saved month 03
Processing month 04...
Successfully processed and saved month 04
Processing month 05...
Successfully processed and saved month 05
Processing month 06...
Successfully processed and saved month 06
Processing month 07...
Successfully processed and saved month 07
Processing month 08...
Successfully processed and saved month 08
Processing month 09...
Successfully processed and saved month 09
Processing month 10...
Successfully processed and saved month 10
Processing month 11...
Successfully processed and saved month 11
Processing month 12...
Successfully processed and saved month 12
