This notebook combines all features into one large model and compares PatchTST configurations with 4, 6 and 8 attention heads.

In [1]:
from autogluon.timeseries import TimeSeriesPredictor, TimeSeriesDataFrame
from autogluon.timeseries.utils.forecast import get_forecast_horizon_index_ts_dataframe
import matplotlib.pyplot as plt
import os
import pandas as pd
import sys
# Resolve the shared `utils` directory two levels up and make it importable
# (presumably where `ml_tools` lives -- confirm against the repo layout).
module_path = os.path.abspath(os.path.join('../..', 'utils'))
print(module_path)
# Register the directory only once so re-running this cell does not grow sys.path.
already_registered = module_path in sys.path
if not already_registered:
    sys.path.append(module_path)

from ml_tools import add_weekends_holidays

d:\CS\summer_project\comp47360-group8\ml_pipeline\utils


In [2]:
# --- Target series: cleaned TLC trip data, renamed to the column names used below ---
df = pd.read_parquet('../../data_evaluation/taxi_trip_data/all_tlc_data_cleaned.parquet')
df = df.rename(columns={'passenger_count': 'busyness', 'location': 'item_id', 'datetime': 'timestamp'})

# --- Businesses-open counts, keyed by (day name, hour, zone) ---
# 'day' and 'hour' are temporary join keys derived from the timestamp.
df['day'] = df['timestamp'].dt.day_name().str.lower()
df['hour'] = df['timestamp'].dt.hour
df_open = pd.read_csv("../../data_preparation/taxi_location_num_businesses_open/taxi_location_num_businesses_open.csv")
df_open = df_open.rename(columns={'location': 'item_id'})
df = df.join(df_open.set_index(['day', 'hour', 'item_id']), on=['day', 'hour', 'item_id'])
# Rows without a match in df_open are treated as having zero open businesses.
df = df.fillna({'alcohol': 0, 'food': 0, 'leisure': 0})
df = df.drop(columns=['day', 'hour'])

# --- Weather covariates ---
df_wth = pd.read_csv("../../data_evaluation/weather_meteostat/meteostat_weather.csv")
df_wth = df_wth.rename(columns={'time': 'timestamp'})
# Cast straight to microsecond-resolution datetimes; a further pd.to_datetime
# call on an already-datetime column would be a no-op.
df_wth['timestamp'] = df_wth['timestamp'].astype('datetime64[us]')
# 'coco' is kept as a category so AutoGluon treats it as a categorical covariate.
df_wth['coco'] = df_wth['coco'].astype('category')
# sort_values returns a new frame, so the result has to be assigned for the
# sort to take effect (previously the sorted frame was discarded).
df_wth = df_wth.sort_values(by=['timestamp'])
df = df.join(df_wth.set_index('timestamp'), on='timestamp', how='left')

# --- Static (per-zone) features ---
static_features_df = pd.read_csv("../../data_evaluation/taxi_trip_data/taxi_zone_lookup.csv")
static_features_df = static_features_df.rename(columns={'LocationID': 'item_id'})

data = TimeSeriesDataFrame.from_data_frame(
    df,
    id_column="item_id",
    timestamp_column="timestamp",
    static_features_df=static_features_df,
)
# Called for its side effect only (return value unused). Presumably adds the
# 'weekend' and 'holiday' columns in place, since both are used as known
# covariates later -- TODO confirm in ml_tools.
add_weekends_holidays(data)

# 3 * 30 * 24 = 2160 steps, i.e. a 3 month prediction window
prediction_length = 3 * 30 * 24
# Hold out the last `prediction_length` steps of every series for testing.
train_data, test_data = data.train_test_split(prediction_length)

In [8]:
# Sweep the number of PatchTST attention heads and record each run's test score.
# The frequency is set to 'h' since the data is grouped/collected hourly, and
# every run is saved to its own relative directory so runs do not overwrite
# one another.
mase_values = []
for num_heads in [4, 6, 8]:
    model_name = f"patch_tst_attention_heads_{num_heads}" + "_model_files"

    print("Evaluating number of attention heads:", num_heads)
    predictor = TimeSeriesPredictor(
        freq='h',
        target="busyness",
        eval_metric="MASE",
        prediction_length=prediction_length,
        path=model_name,
        known_covariates_names=['food', 'alcohol', 'leisure', 'weekend', 'holiday', 'temp', 'dwpt', 'rhum', 'prcp', 'pres', 'coco']
    )
    # fit() trains the predictor and returns the predictor itself (not
    # forecasts), so its return value is not captured.
    predictor.fit(
        train_data,
        hyperparameters={
            "PatchTST": {
                "context_length": 192,
                # 8 dimensions per head -> d_model of 32, 48, 64, which keeps
                # d_model an exact multiple of nhead.
                "d_model": 8 * num_heads,
                "nhead": num_heads,
            }
        },
    )
    print(predictor.leaderboard())
    # One {'MASE': score} dict per run, in sweep order.
    mase_values.append(predictor.evaluate(test_data))

Beginning AutoGluon training...
AutoGluon will save models to 'patch_tst_attention_heads_4_model_files'
AutoGluon Version:  1.1.0
Python Version:     3.9.19
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.22631
CPU Count:          12
GPU Count:          1
Memory Avail:       6.29 GB / 15.90 GB (39.5%)
Disk Space Avail:   2105.31 GB / 2794.50 GB (75.3%)

Fitting with arguments:
{'enable_ensemble': True,
 'eval_metric': MASE,
 'freq': 'h',
 'hyperparameters': {'PatchTST': {'context_length': 192,
                                  'd_model': 32,
                                  'nhead': 4}},
 'known_covariates_names': ['food',
                            'alcohol',
                            'leisure',
                            'weekend',
                            'holiday',
                            'temp',
                            'dwpt',
                            'rhum',
                            'prcp',
                            'pres',
 

Evaluating hidden layer size: 4


train_data with frequency 'None' has been resampled to frequency 'h'.
Provided train_data has 6826363 rows (NaN fraction=1.0%), 261 time series. Median time series length is 26303 (min=9291, max=26305). 

Provided data contains following columns:
	target: 'busyness'
	known_covariates:
		categorical:        ['coco']
		continuous (float): ['food', 'alcohol', 'leisure', 'weekend', 'holiday', 'temp', ...]
	static_features:
		categorical:        ['Borough', 'Zone', 'service_zone']
		continuous (float): []

To learn how to fix incorrectly inferred types, please see documentation for TimeSeriesPredictor.fit

AutoGluon will gauge predictive performance using evaluation metric: 'MASE'
	This metric's sign has been flipped to adhere to being higher_is_better. The metric score can be multiplied by -1 to get the metric value.

Starting training. Start time is 2024-07-14 19:38:37
Models that will be trained: ['PatchTST']
Training timeseries model PatchTST. 
	-0.8677       = Validation score (-MASE)


      model  score_val  pred_time_val  fit_time_marginal  fit_order
0  PatchTST  -0.867656      16.940989         292.877352          1


data with frequency 'None' has been resampled to frequency 'h'.
Model not specified in predict, will default to the model with the best validation score: PatchTST
Beginning AutoGluon training...
AutoGluon will save models to 'patch_tst_attention_heads_6_model_files'
AutoGluon Version:  1.1.0
Python Version:     3.9.19
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.22631
CPU Count:          12
GPU Count:          1
Memory Avail:       5.58 GB / 15.90 GB (35.1%)
Disk Space Avail:   2105.90 GB / 2794.50 GB (75.4%)

Fitting with arguments:
{'enable_ensemble': True,
 'eval_metric': MASE,
 'freq': 'h',
 'hyperparameters': {'PatchTST': {'context_length': 192,
                                  'd_model': 48,
                                  'nhead': 6}},
 'known_covariates_names': ['food',
                            'alcohol',
                            'leisure',
                            'weekend',
                            'holiday',
                  

Evaluating hidden layer size: 6


train_data with frequency 'None' has been resampled to frequency 'h'.
Provided train_data has 6826363 rows (NaN fraction=1.0%), 261 time series. Median time series length is 26303 (min=9291, max=26305). 

Provided data contains following columns:
	target: 'busyness'
	known_covariates:
		categorical:        ['coco']
		continuous (float): ['food', 'alcohol', 'leisure', 'weekend', 'holiday', 'temp', ...]
	static_features:
		categorical:        ['Borough', 'Zone', 'service_zone']
		continuous (float): []

To learn how to fix incorrectly inferred types, please see documentation for TimeSeriesPredictor.fit

AutoGluon will gauge predictive performance using evaluation metric: 'MASE'
	This metric's sign has been flipped to adhere to being higher_is_better. The metric score can be multiplied by -1 to get the metric value.

Starting training. Start time is 2024-07-14 19:44:27
Models that will be trained: ['PatchTST']
Training timeseries model PatchTST. 
	-0.8717       = Validation score (-MASE)


      model  score_val  pred_time_val  fit_time_marginal  fit_order
0  PatchTST  -0.871734      23.079335         379.878615          1


data with frequency 'None' has been resampled to frequency 'h'.
Model not specified in predict, will default to the model with the best validation score: PatchTST
Beginning AutoGluon training...
AutoGluon will save models to 'patch_tst_attention_heads_8_model_files'
AutoGluon Version:  1.1.0
Python Version:     3.9.19
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.22631
CPU Count:          12
GPU Count:          1
Memory Avail:       7.83 GB / 15.90 GB (49.2%)
Disk Space Avail:   2104.82 GB / 2794.50 GB (75.3%)

Fitting with arguments:
{'enable_ensemble': True,
 'eval_metric': MASE,
 'freq': 'h',
 'hyperparameters': {'PatchTST': {'context_length': 192,
                                  'd_model': 64,
                                  'nhead': 8}},
 'known_covariates_names': ['food',
                            'alcohol',
                            'leisure',
                            'weekend',
                            'holiday',
                  

Evaluating hidden layer size: 8


train_data with frequency 'None' has been resampled to frequency 'h'.
Provided train_data has 6826363 rows (NaN fraction=1.0%), 261 time series. Median time series length is 26303 (min=9291, max=26305). 

Provided data contains following columns:
	target: 'busyness'
	known_covariates:
		categorical:        ['coco']
		continuous (float): ['food', 'alcohol', 'leisure', 'weekend', 'holiday', 'temp', ...]
	static_features:
		categorical:        ['Borough', 'Zone', 'service_zone']
		continuous (float): []

To learn how to fix incorrectly inferred types, please see documentation for TimeSeriesPredictor.fit

AutoGluon will gauge predictive performance using evaluation metric: 'MASE'
	This metric's sign has been flipped to adhere to being higher_is_better. The metric score can be multiplied by -1 to get the metric value.

Starting training. Start time is 2024-07-14 19:52:11
Models that will be trained: ['PatchTST']
Training timeseries model PatchTST. 
	-0.8700       = Validation score (-MASE)


      model  score_val  pred_time_val  fit_time_marginal  fit_order
0  PatchTST  -0.870026      17.019459         690.063598          1


data with frequency 'None' has been resampled to frequency 'h'.
Model not specified in predict, will default to the model with the best validation score: PatchTST


In [9]:
# Test-set scores from predictor.evaluate(), one dict per run in sweep order
# (4, 6, 8 attention heads). AutoGluon reports MASE with its sign flipped so
# that higher is better; multiply by -1 to get the actual MASE.
mase_values

[{'MASE': -1.3622890570974713},
 {'MASE': -1.389364905997965},
 {'MASE': -1.3318451409826537}]

In [None]:
# Build the known covariates for the forecast horizon that follows train_data.
# Use the lower-case 'h' alias: it matches the predictor's freq and avoids the
# pandas deprecation warning raised for the upper-case 'H'.
future_index = get_forecast_horizon_index_ts_dataframe(train_data, prediction_length=prediction_length, freq='h')
future_timestamps = future_index.get_level_values("timestamp").to_series()
known_covariates = pd.DataFrame(index=future_index)

# Temporary join keys derived from the future timestamps.
known_covariates['day'] = future_timestamps.dt.day_name().str.lower().values
known_covariates['hour'] = future_timestamps.dt.hour.values

# Businesses-open counts: 'item_id' is resolved from the index level of known_covariates.
known_covariates = known_covariates.join(df_open.set_index(['day', 'hour', 'item_id']), on=['day', 'hour', 'item_id'])
# Mirror the training preprocessing: a missing match means zero open businesses.
known_covariates = known_covariates.fillna({'alcohol': 0, 'food': 0, 'leisure': 0})

# Weather covariates for the future timestamps.
known_covariates = known_covariates.join(df_wth.set_index("timestamp"), on='timestamp', how='left')

# Called for its side effect only; presumably adds 'weekend' and 'holiday'
# in place, as it is used the same way on the training data -- TODO confirm in ml_tools.
add_weekends_holidays(known_covariates)

  offset = pd.tseries.frequencies.to_offset(freq)
