In [1]:
# -*- coding: utf-8 -*-

import pandas as pd
import numpy as np
import random
random.seed(42)

# import itertools

from datetime import datetime, timedelta
from sklearn import preprocessing

%load_ext autoreload
%autoreload 2
from GBRT_for_TSF.utils import evaluate_with_xgboost

In [2]:
# --- Experiment switches ---------------------------------------------------
include_covariates = True  # True/False: whether to include features or not
# 0: no motif info; 1: Top-1 Motif; 2: Top-K Motifs (Direct); 3: Top-K Motifs (Average)
include_motif_information = 0

# --- Window lengths (number of consecutive records) -------------------------
num_periods_output = 24  # records to predict
num_periods_input = 24  # records used as input

# Empty accumulators for test data / test predictions.
ALL_Test_Data, ALL_Test_Prediction = [], []

# --- Load the dataset and keep only its first 90 rows -----------------------
file_name = "traffic.npy"
data_path = f"../data/{file_name}"
# Alternative, smaller slice: [0:5, :]
data = pd.DataFrame(np.load(data_path)[0:90, :])


In [3]:
# preprocessing
def New_preprocessing(TimeSeries, train_size=10392):
    """Turn one series into windowed train/test arrays with calendar covariates.

    Every observation is paired with six calendar features (month, day, hour,
    day of week, day of year, ISO week of year) taken from a clock that starts
    at 2012-01-01 00:00 and advances one hour per observation. The calendar
    features are min-max scaled to [-0.5, 0.5]; the observed value itself is
    left unscaled.

    The first ``train_size`` rows form the training split. The test split
    starts ``num_periods_input`` rows before ``train_size`` so that the first
    test window has a full input history. Training windows advance one row at
    a time; test windows advance ``num_periods_input`` rows at a time.

    Reads the module-level ``num_periods_input`` and ``num_periods_output``.

    Args:
        TimeSeries: Observations supporting ``len()`` and integer lookup
            ``TimeSeries[i]`` for ``i`` in ``range(len(TimeSeries))``.
        train_size: Number of leading rows used for training. Defaults to
            10392, the split that used to be hard-coded here.

    Returns:
        Tuple ``(x_batches, y_batches, x_testbatches, y_testbatches)`` of
        float32 arrays. Inputs are shaped
        ``(n_windows, num_periods_input, 7)`` and targets
        ``(n_windows, num_periods_output, 1)``.
    """
    headers = [
        "traffic",
        "month",
        "day",
        "hour",
        "day_of_week",
        "day_of_year",
        "week_of_year",
    ]

    # One record per observation: the value followed by its calendar features.
    start_date = datetime(2012, 1, 1, 0, 0, 0)
    records = []
    for i in range(len(TimeSeries)):
        stamp = start_date + timedelta(hours=i)
        records.append(
            [
                TimeSeries[i],
                stamp.month,
                stamp.day,
                stamp.hour,
                stamp.weekday(),
                stamp.timetuple().tm_yday,
                stamp.isocalendar()[1],
            ]
        )
    Data_df = pd.DataFrame(records, columns=headers)

    # Normalize features to be from -0.5 to 0.5 as mentioned in the paper;
    # column 0 (the observed value) is passed through untouched.
    scaled_covariates = preprocessing.minmax_scale(
        Data_df.iloc[:, 1:], feature_range=(-0.5, 0.5)
    )
    Normalized_Data_df = pd.DataFrame(
        np.column_stack([Data_df.iloc[:, 0], scaled_covariates]), columns=headers
    )

    # Train/test split. The test block reaches back num_periods_input rows so
    # its first window can use the tail of the training data as input.
    Train = Normalized_Data_df.iloc[0:train_size, :].values.astype("float32")
    Test = Normalized_Data_df.iloc[train_size - num_periods_input :, :].values.astype(
        "float32"
    )

    x_batches, y_batches = _window_pairs(
        Train, num_periods_input, num_periods_output, stride=1
    )
    x_testbatches, y_testbatches = _window_pairs(
        Test, num_periods_input, num_periods_output, stride=num_periods_input
    )
    return x_batches, y_batches, x_testbatches, y_testbatches


def _window_pairs(values, n_in, n_out, stride):
    """Slice a 2-D array into (input window, target window) pairs.

    An input window is ``n_in`` consecutive rows with all columns; its target
    is column 0 of the ``n_out`` rows that follow. Successive windows start
    ``stride`` rows apart.

    Returns:
        ``(x, y)`` with ``x`` shaped ``(n_windows, n_in, n_columns)`` and
        ``y`` shaped ``(n_windows, n_out, 1)``.
    """
    inputs, targets = [], []
    start = 0
    split = 0  # end of the previous input window; drives the loop bound
    while split + n_in < len(values):
        split = start + n_in
        target = values[split : split + n_out, 0]
        if len(target) < n_out:
            # Not enough rows left for a full horizon. Later windows would be
            # shorter still, and a ragged list cannot be reshaped below.
            break
        inputs.append(values[start:split, :])
        targets.append(target)
        start += stride

    # Explicit dtype and leading dimension keep the result well-formed even
    # when no window fits.
    x = np.asarray(inputs, dtype=values.dtype).reshape(
        len(inputs), n_in, values.shape[1]
    )
    y = np.asarray(targets, dtype=values.dtype).reshape(len(targets), n_out, 1)
    return x, y

In [4]:
# Pool the windows of every series into four flat lists:
# training inputs/targets and test inputs/targets.
x_batches_Full, y_batches_Full = [], []
X_Test_Full, Y_Test_Full = [], []

for series_idx in range(len(data)):
    # Each row of `data` is handled as one independent series.
    TimeSeries = data.iloc[series_idx, :]
    x_batches, y_batches, X_Test, Y_Test = New_preprocessing(TimeSeries)

    # extend() walks the first axis, so every window is appended individually.
    x_batches_Full.extend(x_batches)
    y_batches_Full.extend(y_batches)
    X_Test_Full.extend(X_Test)
    Y_Test_Full.extend(Y_Test)

In [5]:
# Hyper-parameters handed to evaluate_with_xgboost.
xgboost_parameters = dict(
    learning_rate=0.2,
    n_estimators=800,
    max_depth=8,
    min_child_weight=1,
    gamma=0.0,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=1,
    random_state=42,
    verbosity=1,  # 0=Silent, 1=Warning, 2=Info, 3=Debug
)

In [6]:
# Run the evaluation helper imported from GBRT_for_TSF.utils on the pooled
# train/test windows with the hyper-parameters defined above. The captured
# output of this cell lists RMSE, WAPE, MAE and MAPE.
# NOTE(review): how the helper trains and predicts is not visible from this
# notebook; the positional argument order follows its signature in utils.
evaluate_with_xgboost(
    num_periods_output,
    x_batches_Full,
    y_batches_Full,
    X_Test_Full,
    Y_Test_Full,
    xgboost_parameters,
    include_covariates=include_covariates,
)

RMSE:  0.013664559
WAPE:  0.10890117
MAE:  0.0058806404
MAPE:  0.1256635


In [7]:
# Record when the notebook last completed a full top-to-bottom run.
# This uses the `datetime` class imported in the first cell. Importing the
# `datetime` module here would rebind the name to the module and make the
# `datetime(2012, 1, 1, ...)` call in New_preprocessing fail whenever the
# earlier cells are re-run in the same kernel.
print(f"This notebook was last run end-to-end on: {datetime.now()}\n")

This notebook was last run end-to-end on: 2025-12-25 00:21:54.357416

