In [None]:
import pandas as pd
import sys
import os

# Make the parent of the current working directory importable so that the
# `src.*` modules used by this notebook can be resolved.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if project_root not in sys.path:
    sys.path.append(project_root)

from src.utils.forecasting import forecast_and_evaluate
from src.utils.cluster_zone import cluster_zones_with_kpi

# Forecast and evaluate

## Hourly

In [None]:
# Read the cleaned yellow trip data file for 2021-01.
df1 = pd.read_parquet(
    "../processed/cleaned_data/cleaned_yellow_tripdata_2021-01.parquet"
)

# Hourly run: test_periods is 168 (= 7 * 24), presumably one week of hourly
# steps held out for evaluation; arima_order is passed through unchanged.
df1_forecast_hourly = forecast_and_evaluate(
    df1=df1, freq="H", test_periods=7 * 24, arima_order=(1, 0, 1)
)

In [None]:
# Display the "metrics" entry of the hourly forecast result.
df1_forecast_hourly["metrics"]

In [None]:
# Display the "predictions" entry of the hourly forecast result.
df1_forecast_hourly["predictions"]

## Daily

In [None]:
# Daily run on the `df1` frame loaded in the hourly section.
# test_periods=7 keeps 7 days for testing.
df1_forecast_daily = forecast_and_evaluate(
    df1=df1, freq="D", test_periods=7, arima_order=(1, 0, 1)
)

In [None]:
# Display the "metrics" entry of the daily forecast result.
df1_forecast_daily["metrics"]

In [None]:
# Display the "predictions" entry of the daily forecast result.
df1_forecast_daily["predictions"]

# Cluster Zone

In [None]:
# Re-read the cleaned 2021-01 file so the clustering section does not depend
# on the forecasting cells having been run first.
df1 = pd.read_parquet(
    "../processed/cleaned_data/cleaned_yellow_tripdata_2021-01.parquet"
)

In [None]:
# Read the 2021-01 QA flags and cluster the zones from `df1` together with
# those flags.
# NOTE: the result of cluster_zones_with_kpi is indexed with [0] in the next
# cell and unpacked into two values elsewhere in this notebook, so
# `clustered_df` holds a pair here rather than a single DataFrame.
qa_flags = pd.read_parquet(
    "../processed/flags_for_analysis/flag_yellow_tripdata_2021-01.parquet"
)
clustered_df = cluster_zones_with_kpi(df1, qa_flags)

In [None]:
# Preview the first element of the clustering result. `clustered_df` is the
# raw return value of cluster_zones_with_kpi, which is unpacked into two
# values elsewhere in this notebook; element 0 is presumably the per-zone
# cluster table (TODO confirm against src.utils.cluster_zone).
clustered_df[0].head()

# Cluster zones for all 12 months of 2021

In [None]:
# Cluster every month of 2021 and write one parquet file per month.

# Create the output folder if it is missing.
os.makedirs("../processed/cluster_zone", exist_ok=True)

# Zero-padded month labels "01" .. "12", matching the file-name suffixes.
months = [f"{i:02d}" for i in range(1, 13)]

for month in months:
    print(f"Đang xử lý tháng {month}")

    # Month-scoped names are used on purpose: rebinding the notebook-level
    # `df1`, `qa_flags` and `clustered_df` here would silently change what the
    # forecasting and preview cells above operate on if they are re-run after
    # this loop.

    # Read the cleaned data for this month.
    df_path = f"../processed/cleaned_data/cleaned_yellow_tripdata_2021-{month}.parquet"
    month_trips = pd.read_parquet(df_path)

    # Read the QA flags for this month.
    qa_path = f"../processed/flags_for_analysis/flag_yellow_tripdata_2021-{month}.parquet"
    month_flags = pd.read_parquet(qa_path)

    # Run the clustering; the call yields two values and only the first one
    # is persisted below (`centroids` is not written anywhere in this cell).
    month_clustered, centroids = cluster_zones_with_kpi(month_trips, month_flags)

    # Save the clustering result for this month.
    output_path = f"../processed/cluster_zone/clustered_yellow_tripdata_2021-{month}.parquet"
    month_clustered.to_parquet(output_path)

    print(f"Đã xử lý xong tháng {month}")