In [1]:
import os
import math
import pandas as pd
import numpy as np
import pyarrow.parquet as pq
from tqdm import tqdm


import lightning.pytorch as pl
from lightning.pytorch.callbacks import EarlyStopping, LearningRateMonitor
from lightning.pytorch.loggers import TensorBoardLogger
import numpy as np
import pandas as pd
import torch

from pytorch_forecasting import Baseline, TemporalFusionTransformer, TimeSeriesDataSet
from pytorch_forecasting.data import GroupNormalizer, NaNLabelEncoder
from pytorch_forecasting.metrics import MAE, SMAPE, PoissonLoss, QuantileLoss


# set error params
epsilon = 1e-6

# read files and remove the rows with NaN value in each column
df = pq.read_table('nanda_2years_top500.parquet').to_pandas()
df = df.dropna()
df

# DATA PRE-PROCESS: CHANGE FORMAT
# convert the data of the column 'plan_in_load_time' in pd.datetime format
df['plan_in_load_time'] = pd.to_datetime(df['plan_in_load_time'])
df['plan_in_load_time'] = df['plan_in_load_time'].apply(lambda x: x.date())
df['plan_in_load_time'] = pd.to_datetime(df['plan_in_load_time'])
df


In [3]:
# DATA PRE-PROCESS: GET DATE RANGE
# get MAX_DATE in df
MAX_DATE = df['plan_in_load_time'].max()

# !!! DUE TO WRONG TIMESTAMP !!!: 
# change the date of this row to '2023-11-30' in df but not in df_max_date
df.loc[df['plan_in_load_time'] == MAX_DATE, 'plan_in_load_time'] = '2023-11-30'
MAX_DATE = df['plan_in_load_time'].max()
MAX_DATE


MAX_DATES = []
TEST_DATE = pd.to_datetime('2023-10-16')

for t in range(30):
    MAX_DATES.append(TEST_DATE + pd.Timedelta(days=t))

MAX_DATES


# get MIN_DATE in df
MIN_DATE = df['plan_in_load_time'].min()
MIN_DATE


Timestamp('2023-11-30 00:00:00')

In [11]:
# DATA PRE-PROCESS: DATA GROUPBY
# generate a new dataframe which is groupby df by the columns 'start_group_name' and 'end_group_name'
df_gp = df.groupby(['start_group_name', 'end_group_name'])
df_gp

# CASE STUDY
# transfer the groupby object to a list of small dataframes
df_gp_list = [group for _, group in df_gp]
# # get 0.7-quartile of the length of df_gp_list
# np.quantile([len(i) for i in df_gp_list], 0.7)


# transfer the groupby object to a list of small dataframes
df_gp_list = []
df_gp_dict = dict()

tft_data = pd.DataFrame()

for _, group in df_gp:
    
    # STEP1: get group name and group id
    start_group_name, end_group_name = group['start_group_name'].values[0], group['end_group_name'].values[0]
    print('{} - {}'.format(group['start_group_name'].values[0], group['end_group_name'].values[0]))
    
    # rename the column 'plan_in_load_time' of group to 'y'
    group = group.rename(columns={'plan_in_load_time': 'date', 'is_success': 'count'})
    
    
    # STEP2: get new dataframe
    # get the column 'date' and 'count' of group
    group['count'] = 1
    group = group[['date', 'count']]
    # group by 'date'
    group = group.groupby('date').sum()
    group = group[['count']]
    
    
    # STEP3: adjust date of dataframe
    # fill the missing days in group using the value 0
    group.index = pd.to_datetime(group.index)
    group = group.resample('D').sum()
    group = group.fillna(0)
    
    # extend the data from MIN_DATE to MAX_DATE using the value 0 in column 'y'
    group_max_date, group_min_date = group.index.max(), group.index.min()
    group = pd.concat([group, pd.DataFrame({'count': [0] * (MAX_DATE - group_max_date).days}, index=pd.date_range(group_max_date + pd.Timedelta(days=1), MAX_DATE))])
    group = pd.concat([group, pd.DataFrame({'count': [0] * (group_min_date - MIN_DATE).days}, index=pd.date_range(MIN_DATE, group_min_date - pd.Timedelta(days=1)))])
    
    # get back the date column
    group.reset_index(inplace=True)
    group = group.rename(columns={'index': 'date'})
    
    
    # STEP4: ADD ID AND FEATURES
    # add start_group_name+' '+end_group_name as id
    group['idx'] = start_group_name + ' ' + end_group_name
    # add group name
    group['start_group_name'], group['end_group_name'] = start_group_name, end_group_name
    
    
    # STEP5: add data to data structure
    # add group to df_gp_list
    df_gp_list.append(group)
    # add group to df_gp_dict
    df_gp_dict[start_group_name + ' ' + end_group_name] = group
    # concat group to tft_data
    tft_data = pd.concat([tft_data, group])
    
    # print(group)

    
print(df_gp_list)
print('---')
# number of groups
print(len(df_gp_list))
print('---')
print(df_gp_dict)
print('---')
print(tft_data)


In [13]:
# add time index
tft_data['time_idx'] = (tft_data['date'] - tft_data['date'].min()).dt.days
tft_data.reset_index(inplace=True, drop=True)
tft_data

# add date features
tft_data['year'] = tft_data['date'].dt.year.astype(str)
tft_data['month'] = tft_data['date'].dt.month.astype(str)
tft_data['day'] = tft_data['date'].dt.day.astype(str)
tft_data


# CUT LATEST USELESS DATA
tft_data = tft_data[lambda x: x.date <= (TEST_DATE + pd.Timedelta(days=30))]
tft_data.reset_index(inplace=True)
tft_data


Unnamed: 0,date,count,idx,start_group_name,end_group_name,time_idx,year,month,day
0,2022-01-02,2.0,临沂 广州,临沂,广州,4,2022,1,2
1,2022-01-03,0.0,临沂 广州,临沂,广州,5,2022,1,3
2,2022-01-04,1.0,临沂 广州,临沂,广州,6,2022,1,4
3,2022-01-05,12.0,临沂 广州,临沂,广州,7,2022,1,5
4,2022-01-06,4.0,临沂 广州,临沂,广州,8,2022,1,6
...,...,...,...,...,...,...,...,...,...
350995,2021-12-30,0.0,青岛 长沙,青岛,长沙,1,2021,12,30
350996,2021-12-31,0.0,青岛 长沙,青岛,长沙,2,2021,12,31
350997,2022-01-01,0.0,青岛 长沙,青岛,长沙,3,2022,1,1
350998,2022-01-02,0.0,青岛 长沙,青岛,长沙,4,2022,1,2


In [None]:
# SMAPE_AVG = []

# for t_id in range(7):
    
#     # PART1: PREPARE DATASET
#     # TFT MODEL - Create dataset and dataloaders
#     max_prediction_length = 1
#     max_encoder_length = 14
#     training_cutoff = tft_data["time_idx"].max() - max_prediction_length
    
#     training = TimeSeriesDataSet(
#         tft_data[lambda x: x.time_idx <= training_cutoff],
#         time_idx='time_idx',
#         target='count',
#         group_ids=['idx'],
#         min_encoder_length=max_encoder_length,
#         max_encoder_length=max_encoder_length,
#         min_prediction_length=max_prediction_length,
#         max_prediction_length=max_prediction_length,
#         static_categoricals=['start_group_name', 'end_group_name'],
#         static_reals=[],
#         time_varying_known_categoricals=['year', 'month', 'day'],
#         variable_groups={},  # group of categorical variables can be treated as one variable
#         time_varying_known_reals=['time_idx'],
#         time_varying_unknown_categoricals=[],
#         time_varying_unknown_reals=[
#             'count',
#         ],
#         categorical_encoders={'year': NaNLabelEncoder().fit(tft_data.year), 'month': NaNLabelEncoder().fit(tft_data.month), \
#                               'day': NaNLabelEncoder().fit(tft_data.day)},
#         add_relative_time_idx=True,
#         add_target_scales=True,
#         add_encoder_length=True
#     )
    
#     # create validation set (predict=True) which means to predict the last max_prediction_length points in time
#     # for each series
#     validation = TimeSeriesDataSet.from_dataset(training, tft_data, predict=True, stop_randomization=True)
    
#     # create dataloaders for model
#     batch_size = 50  # set this between 32 to 128
#     train_dataloader = training.to_dataloader(train=True, batch_size=batch_size, num_workers=8)
#     val_dataloader = validation.to_dataloader(train=False, batch_size=batch_size * 10, num_workers=8)
    
#     # PART2: TRAIN TFT MODEL
#     # configure network and trainer
#     early_stop_callback = EarlyStopping(monitor='val_loss', min_delta=1e-4, patience=10, verbose=False, mode='min')
#     lr_logger = LearningRateMonitor()  # log the learning rate
#     logger = TensorBoardLogger('lightning_logs') 
    
#     # build Trainer for TFT model
#     trainer = pl.Trainer(
#         max_epochs=1,
#         min_epochs=1,
#         accelerator='gpu',
#         enable_model_summary=True,
#         gradient_clip_val=0.01,
#         limit_train_batches=1,  # coment in for training, running valiation every 30 batches
#         # fast_dev_run=True,  # comment in to check that networkor dataset has no serious bugs
#         callbacks=[lr_logger, early_stop_callback],
#         logger=logger,
#     )
    
#     # build TFT model
#     tft = TemporalFusionTransformer.from_dataset(
#         training,
#         learning_rate=0.01,
#         hidden_size=8,
#         attention_head_size=2,
#         dropout=0.1,
#         hidden_continuous_size=4,
#         loss=QuantileLoss(quantiles=[0.5]),
#         log_interval=10,  # uncomment for learning rate finder and otherwise, e.g. to 10 for logging every 10 batches
#         optimizer='Ranger',
#         reduce_on_plateau_patience=5,
#     )
#     print(f'Number of parameters in network: {tft.size()/1e3:.1f}k')
    
#     # fit network
#     trainer.fit(
#         tft,
#         train_dataloaders=train_dataloader,
#         val_dataloaders=val_dataloader,
#     )
    
#     # PART3: EVALUATE TFT MODEL & GET SMAPE VALUE
#     # load the best model according to the validation loss
#     # (given that we use early stopping, this is not necessarily the last epoch)
#     best_model_path = trainer.checkpoint_callback.best_model_path
#     best_tft = TemporalFusionTransformer.load_from_checkpoint(best_model_path)
    
#     # calcualte SMAPE on validation set
#     predictions = best_tft.predict(val_dataloader, return_y=True)
#     preds = torch.round(predictions.output)
#     preds[preds < 0] = 0
#     SMAPE_VAL = SMAPE()(preds, predictions.y)
#     print('SMAPE_VALUE FOR DAY {} = {}'.format((7-t_id), SMAPE_VAL))
#     SMAPE_AVG.append(SMAPE_VAL)
    
#     # PART4: CUT OFF THE DATA OF LAST DAY -> TO ITERATE AND TEST THE DAY BEFORE LAST DAY
#     tft_data = tft_data[lambda x: x.time_idx < tft_data['time_idx'].max()]
    
    
# SMAPE_AVG_VAL = sum(SMAPE_AVG)/7.0
# print('FINAL SMAPE VALUE = {}'.format(SMAPE_AVG_VAL))


In [None]:
# TUNING
def write_to_file(file_path, content):
    with open(file_path, 'a') as file:
        file.write(content)

file_path = 'SMAPE_RESULTS_v3.txt'

for tune_max_encoder_length in [21]:
    for tune_epochs in [100]:
        for tune_lr in [0.05]:
            for tune_hidden_size in [32, 16, 8]:
                for tune_hidden_continuous_size in [2, 4, 8, 16]:
                    for tune_quantiles in [[0.5], [0.1, 0.5, 0.9]]:
                        
                        SMAPE_AVG = []
                        
                        for t_id in range(30):
                            
                            # PART1: PREPARE DATASET
                            # TFT MODEL - Create dataset and dataloaders
                            max_prediction_length = 1
                            max_encoder_length = tune_max_encoder_length
                            training_cutoff = tft_data["time_idx"].max() - max_prediction_length
                            
                            training = TimeSeriesDataSet(
                                tft_data[lambda x: x.time_idx <= training_cutoff],
                                time_idx='time_idx',
                                target='count',
                                group_ids=['idx'],
                                min_encoder_length=max_encoder_length,
                                max_encoder_length=max_encoder_length,
                                min_prediction_length=max_prediction_length,
                                max_prediction_length=max_prediction_length,
                                static_categoricals=['start_group_name', 'end_group_name'],
                                static_reals=[],
                                time_varying_known_categoricals=['year', 'month', 'day'],
                                variable_groups={},  # group of categorical variables can be treated as one variable
                                time_varying_known_reals=['time_idx'],
                                time_varying_unknown_categoricals=[],
                                time_varying_unknown_reals=[
                                    'count',
                                ],
                                categorical_encoders={'year': NaNLabelEncoder().fit(tft_data.year), 'month': NaNLabelEncoder().fit(tft_data.month), \
                                                      'day': NaNLabelEncoder().fit(tft_data.day)},
                                add_relative_time_idx=True,
                                add_target_scales=True,
                                add_encoder_length=True
                            )
                            
                            # create validation set (predict=True) which means to predict the last max_prediction_length points in time
                            # for each series
                            validation = TimeSeriesDataSet.from_dataset(training, tft_data, predict=True, stop_randomization=True)
                            
                            # create dataloaders for model
                            batch_size = 50  # set this between 32 to 128
                            train_dataloader = training.to_dataloader(train=True, batch_size=batch_size, num_workers=8)
                            val_dataloader = validation.to_dataloader(train=False, batch_size=batch_size * 10, num_workers=8)
                            
                            # PART2: TRAIN TFT MODEL
                            # configure network and trainer
                            early_stop_callback = EarlyStopping(monitor='val_loss', min_delta=1e-4, patience=10, verbose=False, mode='min')
                            lr_logger = LearningRateMonitor()  # log the learning rate
                            logger = TensorBoardLogger('lightning_logs') 
                            
                            # build Trainer for TFT model
                            trainer = pl.Trainer(
                                max_epochs=tune_epochs,
                                min_epochs=tune_epochs,
                                accelerator='gpu',
                                enable_model_summary=True,
                                gradient_clip_val=0.01,
                                limit_train_batches=tune_epochs,  # coment in for training, running valiation every 30 batches
                                # fast_dev_run=True,  # comment in to check that networkor dataset has no serious bugs
                                callbacks=[lr_logger, early_stop_callback],
                                logger=logger,
                            )
                            
                            # build TFT model
                            tft = TemporalFusionTransformer.from_dataset(
                                training,
                                learning_rate=tune_lr,
                                hidden_size=tune_hidden_size,
                                attention_head_size=2,
                                dropout=0.1,
                                hidden_continuous_size=tune_hidden_continuous_size,
                                loss=QuantileLoss(quantiles=tune_quantiles),
                                log_interval=10,  # uncomment for learning rate finder and otherwise, e.g. to 10 for logging every 10 batches
                                optimizer='Ranger',
                                reduce_on_plateau_patience=5,
                            )
                            print(f'Number of parameters in network: {tft.size()/1e3:.1f}k')
                            
                            # fit network
                            trainer.fit(
                                tft,
                                train_dataloaders=train_dataloader,
                                val_dataloaders=val_dataloader,
                            )
                            
                            # PART3: EVALUATE TFT MODEL & GET SMAPE VALUE
                            # load the best model according to the validation loss
                            # (given that we use early stopping, this is not necessarily the last epoch)
                            best_model_path = trainer.checkpoint_callback.best_model_path
                            best_tft = TemporalFusionTransformer.load_from_checkpoint(best_model_path)
                            
                            # calcualte SMAPE on validation set
                            predictions = best_tft.predict(val_dataloader, return_y=True)
                            preds = torch.round(predictions.output)
                            preds[preds < 0] = 0
                            SMAPE_VAL = SMAPE()(preds, predictions.y)
                            print('SMAPE_VALUE FOR DAY {} = {}'.format((30-t_id), SMAPE_VAL))
                            SMAPE_AVG.append(SMAPE_VAL)
                            
                            # PART4: CUT OFF THE DATA OF LAST DAY -> TO ITERATE AND TEST THE DAY BEFORE LAST DAY
                            tft_data = tft_data[lambda x: x.time_idx < tft_data['time_idx'].max()]
                            
                            
                        SMAPE_AVG_VAL = sum(SMAPE_AVG)/30.0
                        print('FINAL SMAPE VALUE = {}'.format(SMAPE_AVG_VAL))
                        content = 'max_encoder_length = {}, epochs = {}, lr = {}, hidden_size = {}, hidden_continuous_size = {}, quantiles = {}, SMAPE VALUE = {}, SMAPE LIST = {}\n'.format(tune_max_encoder_length, tune_epochs, tune_lr, tune_hidden_size, tune_hidden_continuous_size, tune_quantiles, SMAPE_AVG_VAL, SMAPE_AVG)
                        write_to_file(file_path, content)
                        


In [None]:
# raw predictions are a dictionary from which all kind of information including quantiles can be extracted
raw_predictions = best_tft.predict(val_dataloader, mode='raw', return_x=True)

In [None]:
# FINALLY GENERATE OUTPUT FILES