In [1]:
class Args:
    """Plain attribute container holding the hard-coded settings of this notebook.

    Every entry of the table below becomes an instance attribute with the
    same name, assigned in the order listed.
    """

    def __init__(self):
        settings = {
            # --- experiment identity ---
            'task_name': 'long_term_forecast',
            'is_training': 1,
            'model_id': 'Pretrain',
            'model_comment': 'TimeMSTEST',
            'model': 'TimeLLM',
            'seed': 2021,
            # --- data source ---
            'data': 'pretrain',
            'root_path': './dataset',
            'data_path': 'ETTh1.csv',
            'features': 'M',
            'target': 'OT',
            'loader': 'modal',
            'freq': 'h',
            'checkpoints': './checkpoints/',
            # --- window lengths ---
            'seq_len': 13,
            'label_len': 1,
            'pred_len': 1,
            'seasonal_patterns': 'Monthly',
            # --- model definition ---
            'enc_in': 9,
            'dec_in': 9,
            'c_out': 9,
            'd_model': 32,
            'n_heads': 8,
            'e_layers': 2,
            'd_layers': 1,
            'd_ff': 128,
            'moving_avg': 25,
            'factor': 3,
            'dropout': 0.1,
            'embed': 'timeF',
            'activation': 'gelu',
            'output_attention': False,
            'patch_len': 16,
            'stride': 8,
            'prompt_domain': 0,
            'llm_model': 'GPT2',
            'llm_dim': 768,
            # --- run / optimisation settings ---
            'num_workers': 10,
            'itr': 1,
            'train_epochs': 5,
            'align_epochs': 10,
            'batch_size': 1,
            'eval_batch_size': 1,
            'patience': 10,
            'learning_rate': 0.001,
            'des': 'Exp',
            'loss': 'MSE',
            'lradj': 'type1',
            'pct_start': 0.2,
            'use_amp': False,
            'llm_layers': 32,
            'percent': 100,
        }
        for name, value in settings.items():
            setattr(self, name, value)

# Instantiate the settings object; later cells read values such as args.root_path from it
args = Args()

In [2]:
import os
import numpy as np
import pandas as pd
from torch.utils.data import Dataset
from sklearn.preprocessing import StandardScaler
from utils.timefeatures import time_features


In [3]:
from data_provider.data_factory import data_provider
from data_provider.data_loader import Dataset_MS

In [4]:
class Dataset_MS(Dataset):
    """Sliding-window dataset over the leading rows of a CSV time series.

    The CSV must contain a ``date`` column and the ``target`` column. The
    loaded rows are split chronologically: the first 60% are the training
    range, the last 20% the test range and whatever lies in between the
    validation range. The val and test slices start ``seq_len`` rows early
    so that their first window has a full history.

    Each item is ``(seq_x, seq_y, seq_x_mark, seq_y_mark)`` where ``seq_x``
    holds ``seq_len`` rows, ``seq_y`` holds the last ``label_len`` rows of
    that window followed by the next ``pred_len`` rows, and the ``*_mark``
    arrays are the time features of the same rows.

    Args:
        root_path: directory that contains the CSV file.
        flag: one of ``'train'``, ``'val'``, ``'test'``.
        size: ``[seq_len, label_len, pred_len]``; ``None`` means ``[13, 1, 1]``.
        features: ``'M'`` / ``'MS'`` keep every non-date column, ``'S'`` keeps
            only the target column.
        data_path: CSV file name inside ``root_path``.
        target: name of the target column (moved to the last position).
        scale: standardise the data with a ``StandardScaler`` fitted on the
            training range.
        timeenc: ``0`` builds integer month/day/weekday/hour features, ``1``
            delegates to ``time_features``.
        freq: forwarded to ``time_features`` when ``timeenc == 1``.
        percent: percentage of the training range to keep (train split only).
        seasonal_patterns: accepted for signature compatibility; not used here.
        max_rows: number of leading CSV rows to keep. Defaults to 77, the
            value that used to be hard-coded; ``None`` keeps the whole file.

    Raises:
        ValueError: if ``features`` or ``timeenc`` has an unsupported value.
    """

    def __init__(self, root_path='dataset', flag='train', size=None,
                 features='MS', data_path='ETTh1.csv',
                 target='OT', scale=True, timeenc=0, freq='h', percent=100,
                 seasonal_patterns=None, max_rows=77):
        # `is None` instead of `== None`: the latter is ambiguous for array-like sizes.
        if size is None:
            self.seq_len = 13
            self.label_len = 1
            self.pred_len = 1
        else:
            self.seq_len = size[0]
            self.label_len = size[1]
            self.pred_len = size[2]

        assert flag in ['train', 'test', 'val'], "flag must be 'train', 'val' or 'test'"
        type_map = {'train': 0, 'val': 1, 'test': 2}
        self.set_type = type_map[flag]

        self.features = features
        self.target = target
        self.scale = scale
        self.timeenc = timeenc
        self.freq = freq
        self.percent = percent
        self.max_rows = max_rows

        self.root_path = root_path
        self.data_path = data_path
        self.__read_data__()

        self.enc_in = self.data_x.shape[-1]
        # Number of distinct window start positions in the selected split.
        self.tot_len = len(self.data_x) - self.seq_len - self.pred_len + 1

    def __read_data__(self):
        """Load the CSV, pick the split for ``flag`` and build data / time-feature arrays."""
        self.scaler = StandardScaler()
        df_raw = pd.read_csv(os.path.join(self.root_path, self.data_path))
        if self.max_rows is not None:
            df_raw = df_raw.head(self.max_rows)

        # Reorder columns to: ['date', ...(other features), target feature]
        cols = list(df_raw.columns)
        cols.remove(self.target)
        cols.remove('date')
        df_raw = df_raw[['date'] + cols + [self.target]]

        n_rows = len(df_raw)
        num_train = int(n_rows * 0.6)
        num_test = int(n_rows * 0.2)
        num_vali = n_rows - num_train - num_test
        # val/test start seq_len rows early so their first window has history.
        border1s = [0, num_train - self.seq_len, n_rows - num_test - self.seq_len]
        border2s = [num_train, num_train + num_vali, n_rows]
        border1 = border1s[self.set_type]
        border2 = border2s[self.set_type]

        if self.set_type == 0:
            # Keep only `percent` % of the training range (beyond the first window).
            border2 = (border2 - self.seq_len) * self.percent // 100 + self.seq_len

        if self.features in ('M', 'MS'):
            df_data = df_raw[df_raw.columns[1:]]
        elif self.features == 'S':
            df_data = df_raw[[self.target]]
        else:
            raise ValueError(
                "features must be 'M', 'MS' or 'S', got %r" % (self.features,))

        if self.scale:
            # Fit on the training range only, then transform every row.
            train_data = df_data.iloc[border1s[0]:border2s[0]]
            self.scaler.fit(train_data.values)
            data = self.scaler.transform(df_data.values)
        else:
            data = df_data.values

        # Work on a standalone Series so nothing is assigned into a sliced frame.
        dates = pd.to_datetime(df_raw['date'].iloc[border1:border2])
        if self.timeenc == 0:
            # Columns: month, day, weekday, hour (int64, one row per timestamp).
            data_stamp = np.stack(
                [dates.dt.month, dates.dt.day, dates.dt.weekday, dates.dt.hour],
                axis=1,
            ).astype(np.int64)
        elif self.timeenc == 1:
            data_stamp = time_features(pd.to_datetime(dates.values), freq=self.freq)
            data_stamp = data_stamp.transpose(1, 0)
        else:
            raise ValueError("timeenc must be 0 or 1, got %r" % (self.timeenc,))

        self.data_x = data[border1:border2]
        self.data_y = data[border1:border2]
        self.data_stamp = data_stamp

    def __getitem__(self, index):
        """Return the window starting at ``index`` (taken modulo the window count)."""
        s_begin = index % self.tot_len

        s_end = s_begin + self.seq_len
        r_begin = s_end - self.label_len
        r_end = r_begin + self.label_len + self.pred_len
        seq_x = self.data_x[s_begin:s_end]
        seq_y = self.data_y[r_begin:r_end]
        seq_x_mark = self.data_stamp[s_begin:s_end]
        seq_y_mark = self.data_stamp[r_begin:r_end]

        return seq_x, seq_y, seq_x_mark, seq_y_mark

    def __len__(self):
        """Number of windows available in the selected split."""
        return (len(self.data_x) - self.seq_len - self.pred_len + 1)

    def inverse_transform(self, data):
        """Undo the standardisation applied when ``scale`` is true."""
        return self.scaler.inverse_transform(data)


In [5]:
# Load the full ETTh1 CSV and show 60% of its row count next to its column count
data = pd.read_csv("dataset/ETTh1.csv")
data.shape[0]*0.6, data.shape[1]

(10452.0, 8)

In [6]:
# Build a Dataset_MS with its default arguments; only the data directory comes from args
df = Dataset_MS(root_path = args.root_path)

In [7]:
# Fetch the first sample: input window, target window and the time features of each
seq_x, seq_y, seq_x_mark, seq_y_mark = df[0]

In [8]:
# Shape of the input window fetched above
seq_x.shape

(13, 7)

In [9]:
# Number of rows held by the dataset for its split
len(df.data_x)

46

In [10]:
# NOTE: rebinds `df`. data_provider is project code not shown here; its first return
# value is used below like a dataset (df.data_x) and its second is iterated like a loader.
df, df_loader = data_provider(args=args, flag='test')

In [11]:
# Walk the loader and print the input batch at a few hand-picked positions.
nb = 0
for seq_x, seq_y, seq_x_mark, seq_y_mark in df_loader:
    if nb in (0, 1, 15, 16):
        print(seq_x)
    if nb > 35:  # nothing of interest past position 35
        break
    nb += 1

tensor([[[ 1.4764,  1.8941,  1.5710,  2.0759, -0.7026,  0.6037,  1.5093],
         [ 1.7517,  1.3652,  1.8582,  1.1459, -0.1843,  1.5482,  0.9561],
         [ 1.4764,  0.8353,  1.6219,  0.8364, -0.8914,  0.6037,  1.6476],
         [ 1.9046,  1.4314,  1.8753,  2.0236, -0.7026,  1.2334,  1.4863],
         [ 1.7211,  1.3652,  1.8919,  1.5077, -0.2787,  1.5482,  0.8641],
         [ 2.7606,  1.8941,  2.5006,  1.1459,  0.9466,  2.0179,  1.2789],
         [ 2.0881,  1.2989,  2.1287,  2.0759,  0.8058,  1.8630,  0.7258],
         [ 1.8740,  1.1002,  1.7231,  0.8873,  0.7578,  1.8630,  1.6246],
         [ 1.5987,  0.9015,  1.4359,  0.3714,  0.9002,  2.0179,  1.9474],
         [ 2.3630,  1.7616,  2.1287,  1.6632,  0.7114,  1.5482,  2.1778],
         [ 2.5465,  2.0266,  2.2129,  1.5586,  1.5128,  1.8630,  2.6388],
         [ 1.5682,  1.6954,  1.6385,  1.6632,  0.2875,  1.5482,  1.7629],
         [ 1.2317,  1.6301,  1.3684,  1.7664,  0.0508,  1.7030,  2.8004]]],
       dtype=torch.float64)
tensor([

In [12]:
# Length reported by the loader returned by data_provider
len(df_loader)

15

In [13]:
# Shape of the data array held by the object returned by data_provider
df.data_x.shape

(28, 7)

In [14]:
# Load the Monthly training CSV as a DataFrame
m=pd.read_csv("dataset/Monthly-train.csv")

In [15]:
# Preview every row, restricted to the first 300 columns
m.iloc[:,:300]

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V291,V292,V293,V294,V295,V296,V297,V298,V299,V300
0,M1,8000.0,8350.0,8570.0,7700.0,7080.0,6520.0,6070.0,6650.0,6830.0,...,5450.0,5830.0,4080.0,3910.0,3840.0,3650.0,4630.0,2980.0,3550.0,3690.0
1,M2,2440.0,2490.0,2710.0,2290.0,1960.0,1990.0,1960.0,2530.0,2640.0,...,1960.0,1940.0,1680.0,1250.0,1700.0,1260.0,1340.0,1340.0,1740.0,1810.0
2,M3,2670.0,2590.0,2784.0,2816.0,2888.0,2906.0,2839.0,2860.0,2798.0,...,7369.0,7367.0,7272.0,7358.0,7403.0,7346.0,7496.0,7401.0,7294.0,7466.0
3,M4,7264.0,7215.0,7022.0,7040.0,6966.0,6984.0,7025.0,7082.0,6956.0,...,,,,,,,,,,
4,M5,4210.0,4290.0,4250.0,4320.0,4320.0,4330.0,4320.0,4260.0,4220.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47995,M47996,9964.0,7212.0,7858.0,8493.0,8203.0,8362.0,7238.0,7357.0,8642.0,...,,,,,,,,,,
47996,M47997,1696.6,1627.2,1465.4,1672.8,1760.1,1974.8,1733.7,1898.0,1952.4,...,,,,,,,,,,
47997,M47998,5900.0,6000.0,6280.0,6100.0,5410.0,5320.0,5130.0,5330.0,5810.0,...,,,,,,,,,,
47998,M47999,4660.0,4980.0,5060.0,5090.0,4790.0,4840.0,4640.0,4770.0,4940.0,...,,,,,,,,,,


In [16]:
# Load the Monthly test CSV as a DataFrame
m_test = pd.read_csv('dataset/Monthly-test.csv')

In [17]:
# Display the Monthly test frame
m_test

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19
0,M1,8720.0,7790.0,4770.0,5060.0,4720.0,4450.0,5120.0,5960.0,6560.0,4900.0,4520.0,7370.0,9050.0,7780.0,5380.0,4700.0,4490.0,4010.0
1,M2,2090.0,2180.0,1760.0,2290.0,1500.0,1370.0,1810.0,1450.0,1370.0,1380.0,1710.0,2140.0,1870.0,1670.0,1800.0,1800.0,1790.0,1600.0
2,M3,13314.0,13659.0,13839.0,13693.0,13701.0,13881.0,13862.0,13774.0,13677.0,13944.0,14023.0,13793.0,13752.0,13818.0,13901.0,14057.0,14037.0,14158.0
3,M4,6505.0,6559.0,6526.0,6499.0,6495.0,6450.0,6458.0,6434.0,6445.0,6382.0,6390.0,6390.0,6353.0,6337.0,6280.0,6284.0,6397.0,6433.0
4,M5,4390.0,4470.0,4350.0,4420.0,4480.0,4440.0,4340.0,4280.0,4210.0,4280.0,4350.0,4290.0,4320.0,4370.0,4390.0,4480.0,4390.0,4380.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47995,M47996,4405.9,5119.8,5993.1,6150.6,6845.0,3812.0,4503.0,3898.0,3988.0,3553.0,2514.0,2102.0,2548.0,2601.0,2831.0,3239.0,3825.0,1535.0
47996,M47997,1009.7,902.6,896.5,905.1,884.2,623.0,695.7,707.1,742.3,698.9,656.7,672.0,703.0,619.7,615.1,653.5,663.7,400.5
47997,M47998,5050.0,5010.0,4960.0,4940.0,4970.0,5010.0,5100.0,5090.0,5030.0,4940.0,4860.0,4880.0,4830.0,5030.0,4900.0,4790.0,5260.0,5100.0
47998,M47999,4390.0,4380.0,4400.0,4380.0,4410.0,4380.0,4470.0,4460.0,4400.0,4460.0,4380.0,4440.0,4300.0,4360.0,4340.0,4390.0,4570.0,4480.0


In [19]:
from data_provider.m4 import M4Dataset, M4Meta

In [20]:
# Load the training part of the M4 data through the project's M4Dataset helper;
# the cells below read its .values, .groups and .ids attributes
dataset = M4Dataset.load(training=True, dataset_file='dataset/m4')

In [21]:
# Keep only the 'Monthly' series and drop the NaN entries of each one.
training_values = [
    series[np.logical_not(np.isnan(series))]
    for series in dataset.values[dataset.groups == 'Monthly']
]

In [22]:
# Identifiers of the 'Monthly' series, plus a plain-list copy of their values.
ids = np.array(list(dataset.ids[dataset.groups == 'Monthly']))
timeseries = list(training_values)

In [66]:
# Scratch walk-through of the window sampling done in Dataset_M4.__getitem__,
# applied to a single series with module-level variables.
seq_len = 17
pred_len = 1
label_len = 1
index = 0

# Zero-initialised buffers; the windows are copied into them in the next cells.
insample = np.zeros((seq_len, 1))
insample_mask = np.zeros((seq_len, 1))
outsample = np.zeros((pred_len + label_len, 1))
outsample_mask = np.zeros((pred_len + label_len, 1))  # m4 dataset

# Only the first 17 points of the chosen series are used in this experiment.
sampled_timeseries = timeseries[index][:17]

# Dataset_M4.__init__ truncates history_size * pred_len to an int; do the same
# here so np.random.randint receives integer bounds. 1.5 is the 'Monthly'
# entry of M4Meta.history_size.
window_sampling_limit = int(1.5 * pred_len)
cut_point = np.random.randint(low=max(1, len(sampled_timeseries) - window_sampling_limit),
                              high=len(sampled_timeseries),
                              size=1)[0]

In [67]:
# Series length next to the sampling limit used for the cut point
len(sampled_timeseries), window_sampling_limit

(17, 4.5)

In [68]:
# The randomly drawn cut position
cut_point

16

In [69]:
# History window: at most seq_len points ending right before the cut point.
window_start = max(0, cut_point - seq_len)
insample_window = sampled_timeseries[window_start:cut_point]
insample_window

array([7700., 7080., 6520., 6070., 6650., 6830., 5710., 5260., 5470.,
       7870., 7360., 8470., 7880.])

In [70]:
 # Right-align the history window inside the zero-initialised insample buffer
 insample[-len(insample_window):, 0] = insample_window
 insample

array([[7700.],
       [7080.],
       [6520.],
       [6070.],
       [6650.],
       [6830.],
       [5710.],
       [5260.],
       [5470.],
       [7870.],
       [7360.],
       [8470.],
       [7880.]])

In [71]:
# Target window: label_len points before the cut plus up to pred_len points from
# the cut onwards (clipped at the series end), written to the front of outsample.
out_start = cut_point - label_len
out_stop = min(len(sampled_timeseries), cut_point + pred_len)
outsample_window = sampled_timeseries[out_start:out_stop]
outsample[:len(outsample_window), 0] = outsample_window

In [72]:
# Target buffer after the window has been copied in
outsample

array([[7880.],
       [6750.],
       [   0.],
       [   0.]])

In [73]:
def __getitem__(self, index):
    """Sample one (history, target) pair from series ``index``.

    A cut point is drawn uniformly from the last ``window_sampling_limit``
    positions of the series (never position 0). The history is the up to
    ``seq_len`` points before the cut, right-aligned in ``insample``. The
    target buffer has ``label_len + pred_len`` slots: slot ``label_len``
    corresponds to the cut point itself, the slots before it to the points
    preceding the cut. Slots without data stay 0 and are flagged 0 in the
    matching mask.

    Reads ``seq_len``, ``label_len``, ``pred_len``, ``window_sampling_limit``
    and ``timeseries`` from ``self``.

    Returns:
        ``(insample, outsample, insample_mask, outsample_mask)``, each a
        float array with a trailing dimension of 1.

    Raises:
        ValueError: from ``np.random.randint`` when the series has fewer
            than two points (no valid cut position exists).
    """
    insample = np.zeros((self.seq_len, 1))
    insample_mask = np.zeros((self.seq_len, 1))
    outsample = np.zeros((self.pred_len + self.label_len, 1))
    outsample_mask = np.zeros((self.pred_len + self.label_len, 1))  # m4 dataset

    sampled_timeseries = self.timeseries[index]
    cut_point = np.random.randint(low=max(1, len(sampled_timeseries) - self.window_sampling_limit),
                                  high=len(sampled_timeseries),
                                  size=1)[0]

    insample_window = sampled_timeseries[max(0, cut_point - self.seq_len):cut_point]
    insample[-len(insample_window):, 0] = insample_window
    insample_mask[-len(insample_window):, 0] = 1.0

    # When fewer than label_len points precede the cut, the start would be
    # negative and the slice would wrap around to the end of the series.
    # Clamp it and shift the write position by the number of missing points
    # so every value stays in the slot of its own time step.
    label_start = cut_point - self.label_len
    missing = max(0, -label_start)
    outsample_window = sampled_timeseries[
                       max(0, label_start):min(len(sampled_timeseries), cut_point + self.pred_len)]
    outsample[missing:missing + len(outsample_window), 0] = outsample_window
    outsample_mask[missing:missing + len(outsample_window), 0] = 1.0
    return insample, outsample, insample_mask, outsample_mask

In [61]:
# Per-frequency history-size factors; Dataset_M4 multiplies the chosen one by pred_len
M4Meta.history_size

{'Yearly': 1.5,
 'Quarterly': 1.5,
 'Monthly': 1.5,
 'Weekly': 10,
 'Daily': 10,
 'Hourly': 10}

In [40]:

class Dataset_M4(Dataset):
    """Random-window dataset over the M4 series of one seasonal pattern.

    Each item is drawn from a single series: a cut point is sampled near the
    end of the series, the points before it form the history and the points
    around it form the target (see ``__getitem__``).

    Args:
        root_path: passed to ``M4Dataset.load`` as ``dataset_file``.
        flag: ``'train'`` loads the training data, anything else the
            non-training data.
        size: ``[seq_len, label_len, pred_len]`` (required).
        features, target, scale, inverse, timeenc: stored on the instance;
            not used by the code in this class.
        data_path, freq: accepted for signature compatibility; not used here.
        seasonal_patterns: key into ``M4Meta.history_size`` and the group
            label used to select the series.

    Raises:
        ValueError: if ``size`` is not given.
    """

    def __init__(self, root_path, flag='pred', size=None,
                 features='S', data_path='ETTh1.csv',
                 target='OT', scale=False, inverse=False, timeenc=0, freq='15min',
                 seasonal_patterns='Yearly'):
        if size is None:
            # The default used to fail later with an opaque "'NoneType' is not subscriptable".
            raise ValueError("Dataset_M4 requires size=[seq_len, label_len, pred_len]")

        self.features = features
        self.target = target
        self.scale = scale
        self.inverse = inverse
        self.timeenc = timeenc
        self.root_path = root_path

        self.seq_len = size[0]
        self.label_len = size[1]
        self.pred_len = size[2]

        self.seasonal_patterns = seasonal_patterns
        self.history_size = M4Meta.history_size[seasonal_patterns]
        # Cut points are drawn from the last `window_sampling_limit` positions.
        self.window_sampling_limit = int(self.history_size * self.pred_len)
        self.flag = flag

        self.__read_data__()

    def __read_data__(self):
        """Load the M4 data and keep the NaN-stripped series of the chosen pattern."""
        # M4Dataset.initialize()
        dataset = M4Dataset.load(training=(self.flag == 'train'),
                                 dataset_file=self.root_path)
        group_mask = dataset.groups == self.seasonal_patterns  # split different frequencies
        self.ids = np.array(list(dataset.ids[group_mask]))
        # Keep a plain list: the stripped series differ in length, and wrapping
        # ragged arrays in np.array raises ValueError on NumPy >= 1.24.
        self.timeseries = [v[~np.isnan(v)] for v in dataset.values[group_mask]]

    def __getitem__(self, index):
        """Sample one (history, target) pair from series ``index``.

        The history is the up to ``seq_len`` points before a randomly drawn
        cut point, right-aligned in ``insample``. The target buffer has
        ``label_len + pred_len`` slots; slot ``label_len`` corresponds to the
        cut point itself. Slots without data stay 0 and are flagged 0 in the
        matching mask.

        Returns:
            ``(insample, outsample, insample_mask, outsample_mask)``.

        Raises:
            ValueError: from ``np.random.randint`` when the series has fewer
                than two points (no valid cut position exists).
        """
        insample = np.zeros((self.seq_len, 1))
        insample_mask = np.zeros((self.seq_len, 1))
        outsample = np.zeros((self.pred_len + self.label_len, 1))
        outsample_mask = np.zeros((self.pred_len + self.label_len, 1))  # m4 dataset

        sampled_timeseries = self.timeseries[index]
        cut_point = np.random.randint(low=max(1, len(sampled_timeseries) - self.window_sampling_limit),
                                      high=len(sampled_timeseries),
                                      size=1)[0]

        insample_window = sampled_timeseries[max(0, cut_point - self.seq_len):cut_point]
        insample[-len(insample_window):, 0] = insample_window
        insample_mask[-len(insample_window):, 0] = 1.0

        # When fewer than label_len points precede the cut, the start would be
        # negative and the slice would wrap around to the end of the series.
        # Clamp it and shift the write position by the number of missing
        # points so every value stays in the slot of its own time step.
        label_start = cut_point - self.label_len
        missing = max(0, -label_start)
        outsample_window = sampled_timeseries[
                           max(0, label_start):min(len(sampled_timeseries), cut_point + self.pred_len)]
        outsample[missing:missing + len(outsample_window), 0] = outsample_window
        outsample_mask[missing:missing + len(outsample_window), 0] = 1.0
        return insample, outsample, insample_mask, outsample_mask

    def __len__(self):
        """Number of series of the selected seasonal pattern."""
        return len(self.timeseries)

    def inverse_transform(self, data):
        """Undo scaling; this class never fits a scaler, so data is returned unchanged.

        A ``scaler`` attribute attached from outside is still honoured.
        """
        scaler = getattr(self, 'scaler', None)
        if scaler is None:
            return data
        return scaler.inverse_transform(data)

    def last_insample_window(self):
        """
        The last window of insample size of all timeseries.
        This function does not support batching and does not reshuffle timeseries.

        :return: Last insample window of all timeseries. Shape "timeseries, insample size"
        """
        insample = np.zeros((len(self.timeseries), self.seq_len))
        insample_mask = np.zeros((len(self.timeseries), self.seq_len))
        for i, ts in enumerate(self.timeseries):
            ts_last_window = ts[-self.seq_len:]
            # Index by the window length (at most seq_len), not the full series length.
            insample[i, -len(ts_last_window):] = ts_last_window
            insample_mask[i, -len(ts_last_window):] = 1.0
        return insample, insample_mask



In [None]:
# Dataset_M4 has a required root_path and needs explicit window sizes, so a bare
# call cannot run. Use the M4 location loaded earlier and the sizes from args.
Dataset_M4(
    root_path='dataset/m4',
    flag='train',
    size=[args.seq_len, args.label_len, args.pred_len],
    seasonal_patterns=args.seasonal_patterns,
)