In [1]:
import pandas as pd
import sys
import os

# Make the parent of the current working directory importable so the
# `src.*` imports below resolve when the notebook is run from its own folder.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)

from src.utils.forecasting import forecast_and_evaluate
from src.utils.cluster_zone import cluster_zone_hour_kpi
from src.utils.kpi import kpi

# Forecast and evaluate

# Hourly

In [2]:
# Load the cleaned yellow-taxi trips for 2021-01 and count pickups per hour.
df1 = pd.read_parquet("../processed/cleaned_data/cleaned_yellow_tripdata_2021-01.parquet")
df1_hourly = (
    df1
    .assign(
        hour=lambda x: pd.to_datetime(x['tpep_pickup_datetime']).dt.floor('h')
    )
    .groupby('hour')
    .size()
    .rename('trips')
    .to_frame()
    # groupby().size() only emits hours that have at least one pickup and
    # leaves the index without a frequency. Re-index onto a regular hourly
    # grid (missing hours -> 0 trips) so the series is evenly spaced.
    # NOTE(review): assumes the cleaned file only spans the month of
    # interest; stray timestamps would widen the grid. Confirm upstream.
    .asfreq('h', fill_value=0)
)

# Evaluate the models on a 168-hour (7 x 24) test window.
# NOTE(review): pandas >= 2.2 deprecates the 'H' alias in favour of 'h'.
# It is kept here because how forecast_and_evaluate consumes `freq` is not
# visible from this notebook; switch once the helper is confirmed to accept 'h'.
df1_forecast_hourly = forecast_and_evaluate(
    df=df1_hourly,
    value_col="trips",
    test_periods=168,
    freq='H',
    arima_order=(1, 0, 1)
)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['baseline_pred'] = df.loc[test.index, 'baseline_pred']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['lr_pred'] = lr.predict(test_feat[features])
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
 

In [3]:
# Show the per-model error table (MAE and MAPE) for the hourly forecast.
df1_forecast_hourly['metrics']

Unnamed: 0,MAE,MAPE (%)
Baseline,176.363095,11.352398
Linear Regression,1160.497902,276.122965
ARIMA,1322.146715,360.165871


In [4]:
# Show actual hourly trips next to the baseline, linear-regression and ARIMA predictions.
df1_forecast_hourly['predictions']

Unnamed: 0_level_0,trips,baseline_pred,lr_pred,arima_pred
hour,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2021-01-25 00:00:00,231,286.0,969.264722,379.852381
2021-01-25 01:00:00,119,173.0,1053.915356,471.706925
2021-01-25 02:00:00,67,83.0,1138.565990,557.475946
2021-01-25 03:00:00,43,54.0,1223.216624,637.562619
2021-01-25 04:00:00,88,109.0,1307.867258,712.343410
...,...,...,...,...
2021-01-31 19:00:00,1484,1347.0,2034.393632,1766.280102
2021-01-31 20:00:00,932,1148.0,2119.044266,1766.281392
2021-01-31 21:00:00,770,906.0,2203.694900,1766.282597
2021-01-31 22:00:00,531,812.0,2288.345534,1766.283721


# Daily

In [5]:
# Count pickups per calendar day from the trips frame loaded earlier (df1).
df_daily = (
    df1
    .assign(
        day=lambda x: pd.to_datetime(x['tpep_pickup_datetime']).dt.floor('D')
    )
    .groupby('day')
    .size()
    .rename('trips')
    .to_frame()
    # groupby().size() skips days without pickups and leaves the index
    # without a frequency. Re-index onto a regular daily grid (missing
    # days -> 0 trips) so the series is evenly spaced.
    # NOTE(review): assumes the cleaned file only spans the month of
    # interest; stray timestamps would widen the grid. Confirm upstream.
    .asfreq('D', fill_value=0)
)

df1_forecast_daily = forecast_and_evaluate(
    df=df_daily,
    value_col="trips",
    test_periods=7,        # 7-day test window
    freq='D',
    arima_order=(1, 0, 1)
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['baseline_pred'] = df.loc[test.index, 'baseline_pred']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['lr_pred'] = lr.predict(test_feat[features])
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
 

In [6]:
# Show the per-model error table (MAE and MAPE) for the daily forecast.
df1_forecast_daily['metrics']

Unnamed: 0,MAE,MAPE (%)
Baseline,3186.714286,6.789176
Linear Regression,4082.995754,8.756846
ARIMA,9947.501167,22.101616


In [7]:
# Show actual daily trips next to the baseline, linear-regression and ARIMA predictions.
df1_forecast_daily['predictions']

Unnamed: 0_level_0,trips,baseline_pred,lr_pred,arima_pred
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2021-01-25,46338,36394.0,46625.928322,34981.33523
2021-01-26,47005,49962.0,46715.844406,39585.881917
2021-01-27,51595,48598.0,46805.76049,41329.613834
2021-01-28,55438,52365.0,46895.676573,41989.961356
2021-01-29,53623,53207.0,46985.592657,42240.033541
2021-01-30,38445,41290.0,33498.416958,42334.735339
2021-01-31,30500,30425.0,33588.333042,42370.598706


# Cluster Zone

In [8]:
# Load the analysis flags for 2021-01 and build the KPI tables from the
# trips frame (df1) together with those flags.
df1_flags = pd.read_parquet("../processed/flags_for_analysis/flag_yellow_tripdata_2021-01.parquet")
df1_kpi = kpi(df1, df1_flags)

# Take the 'Daily' KPI table and add each day's trip count relative to the
# mean daily trip count. assign() returns a new frame, so the table held in
# df1_kpi is not modified.
df1_kpi_daily = df1_kpi['Daily'].assign(
    trips_index=lambda daily: daily["trips"] / daily["trips"].mean()
)

# Cluster the daily KPI rows into 4 groups.
# NOTE(review): no seed is passed here; confirm the helper fixes its own
# random state so cluster labels are reproducible across runs.
df1_clustered, cluster_profile, cluster_desc = cluster_zone_hour_kpi(
    df1_kpi_daily, n_clusters=4
)

  df_monthly = df_calc.groupby(pd.Grouper(key='tpep_pickup_datetime', freq='M')).agg(**agg_rules).reset_index()


In [9]:
# Preview the first rows of the daily KPI table with its assigned cluster and cluster name.
df1_clustered.head()

Unnamed: 0,tpep_pickup_datetime,duration_p50,duration_p95,speed_p50,distance_p50,distance_p95,avg_distance,trips,revuenue_per_trip,revenue_per_mile,Early Morning,Morning,Morning Rush,Midday,Evening Rush,Late Night,trips_index,cluster,cluster_name
0,2021-01-01,9.0,30.0,14.58,2.0,14.092,3.625908,24169,18.882795,5.193585,941.25 / 954.5,243.0 / 238.67,548.0 / 536.0,1507.67 / 1456.67,1700.67 / 1741.67,776.6 / 812.4,0.558982,2,Moderate demand – mixed pattern
1,2021-01-02,10.0,32.0,12.69,1.87,14.2,3.477905,33577,19.008309,5.459799,134.5 / 171.0,179.0 / 162.67,797.67 / 748.33,2499.17 / 2417.17,2756.33 / 2808.33,1369.0 / 1446.4,0.776571,0,Moderate demand – mixed pattern
2,2021-01-03,9.0,32.0,14.37,1.91,17.49,9.956894,25667,20.50652,2.05277,240.75 / 272.5,199.33 / 179.0,679.0 / 643.33,1935.0 / 1886.17,2065.0 / 2111.33,852.8 / 891.8,0.593628,2,Moderate demand – mixed pattern
3,2021-01-04,9.0,32.0,12.29,1.78,12.5405,3.949137,43460,18.55542,4.692513,154.0 / 175.25,495.0 / 408.67,2402.0 / 2323.0,3249.17 / 3215.5,3080.0 / 3163.0,1083.6 / 1156.4,1.005146,1,Moderate demand – mixed pattern
4,2021-01-05,9.0,31.0,11.81,1.7,11.15,4.111425,45777,17.733915,4.305597,166.25 / 183.75,461.67 / 389.67,2439.67 / 2329.33,3438.0 / 3407.0,3287.0 / 3385.67,1183.8 / 1257.2,1.058733,1,Moderate demand – mixed pattern


In [10]:
# Preview the per-cluster profile (trips_index, duration_p50, duration_p95).
cluster_profile.head()

Unnamed: 0_level_0,trips_index,duration_p50,duration_p95
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1.12,10.0,31.0
1,1.13,9.0,31.08
2,0.66,8.67,29.67
3,0.98,9.0,28.35


In [11]:
# Show the text description stored under key/index 0 (the helper returns it in Vietnamese).
cluster_desc[0]

'Nhu cầu và thời lượng trung bình, hành vi hỗn hợp.'