In [1]:
import pandas as pd
from torch.utils.data import Dataset
import numpy as np
import os

In [2]:
data = pd.read_csv("dataset/true_promo/all_product_true_promo_train.csv")

In [3]:
data.head(2)

Unnamed: 0,end_date,ean_global_channel,is_promo,price_range,sub_axis,seasonality_index,sold_units,2 for a price,3 for 2,bogof,bogshp,coupon,listing fee,online,save,site fee,year,month,week
0,2022-10-08,30074576_Offline,True,8.6,Eye Makeup,0.954407,372.0,0,0,0,0,0,0,1,1,1,2022,10,40
1,2022-10-15,30074576_Offline,True,8.6,Eye Makeup,0.986968,355.0,0,0,0,0,0,0,1,1,1,2022,10,41


### We need a way to get one product out of this data:
### We define id: each combination ean/global_channel is a time series

In [4]:
ids = data["ean_global_channel"].unique()
ids.shape

(1537,)

In [5]:
# To get a time series it's simple now:
data[data["ean_global_channel"]==ids[0]].head()

Unnamed: 0,end_date,ean_global_channel,is_promo,price_range,sub_axis,seasonality_index,sold_units,2 for a price,3 for 2,bogof,bogshp,coupon,listing fee,online,save,site fee,year,month,week
0,2022-10-08,30074576_Offline,True,8.6,Eye Makeup,0.954407,372.0,0,0,0,0,0,0,1,1,1,2022,10,40
1,2022-10-15,30074576_Offline,True,8.6,Eye Makeup,0.986968,355.0,0,0,0,0,0,0,1,1,1,2022,10,41
2,2022-10-22,30074576_Offline,True,8.6,Eye Makeup,0.971972,344.0,0,0,0,0,0,0,1,1,1,2022,10,42
3,2022-10-29,30074576_Offline,True,8.6,Eye Makeup,1.051523,334.0,0,0,0,0,0,0,1,1,1,2022,10,43
4,2022-11-05,30074576_Offline,True,8.733333,Eye Makeup,1.057523,320.0,0,0,0,0,0,0,1,1,1,2022,11,44


In [6]:
def preprocess_pipeline(data, id, target):
    """This function is responsible for all the preprocessing """
    data = data.rename(columns={'end_date':'date', id:'id'})
    data = data.drop(['is_promo', 'sub_axis', 'year', 'month', 'week'], axis=1)
    cols = list(data.columns)
    cols.remove(target)
    cols.remove('date')
    data= data[['date'] + cols + [target]] # organize data to date, variables and last is target
    return data

In [7]:
df = preprocess_pipeline(data, 'ean_global_channel', 'sold_units')

In [24]:
result = data.groupby('ean_global_channel')['end_date'].agg(['min', 'max', 'count']).reset_index().sort_values(by='count', ascending=True)

In [25]:
result

Unnamed: 0,ean_global_channel,min,max,count
902,3600531672485_Offline,2023-04-15,2023-11-25,33
888,3600531672317_Offline,2023-04-15,2023-11-25,33
889,3600531672324_Offline,2023-04-15,2023-11-25,33
890,3600531672331_Offline,2023-04-15,2023-11-25,33
892,3600531672362_Offline,2023-04-15,2023-11-25,33
...,...,...,...,...
543,3600523999439_Offline,2022-10-08,2023-11-25,60
542,3600523999422_Offline,2022-10-08,2023-11-25,60
540,3600523993314_Offline,2022-10-08,2023-11-25,60
538,3600523993291_Offline,2022-10-08,2023-11-25,60


In [8]:

class Dataset_Promo_ean_global_channel(Dataset):
    def __init__(self, root_path, flag='train', size=None,
                 features='S', data_path='ETTh1.csv',
                 target='sold_units', scale=False, inverse=False, timeenc=0, freq='15min',
                 seasonal_patterns='Yearly'):
        self.features = features
        self.target = target
        self.scale = scale
        self.inverse = inverse
        self.timeenc = timeenc
        self.root_path = root_path

        self.seq_len = size[0]
        self.label_len = size[1]
        self.pred_len = size[2]

        self.seasonal_patterns = seasonal_patterns
        self.history_size = 1.5
        self.window_sampling_limit = int(self.history_size * self.pred_len)
        self.flag = flag

        self.__read_data__()

    def preprocess_pipeline(self, data, id = 'ean_global_channel'):
        """This function is responsible for all the preprocessing """
        data = data.rename(columns={'end_date':'date', id:'id'})
        data = data.drop(['is_promo', 'sub_axis', 'year', 'month', 'week'], axis=1)
        cols = list(data.columns)
        cols.remove(self.target)
        cols.remove('date')
        data= data[cols + [self.target]] # organize data to date, variables and last is target we're not using date now
        return data

    def __read_data__(self):
        # M4Dataset.initialize()
        if self.flag == 'train':
            dataset = pd.read_csv(os.path.join(root_path,
                                          data_path)) 
        else:
            dataset = pd.read_csv(os.path.join(root_path,
                                          data_path.replace('train', 'test')))
        # Preprocessing dataset:
        df = self.preprocess_pipeline(dataset)
        self.ids = df['id'].unique()
        self.timeseries = [df[df['id']==ids[i]].drop('id', axis=1).values for i in range(len(ids))]
        self.n_var = self.timeseries[0].shape[1]
    def __getitem__(self, index):
        insample = np.zeros((self.seq_len, self.n_var))
        insample_mask = np.zeros((self.seq_len, self.n_var))
        outsample = np.zeros((self.pred_len + self.label_len, self.n_var))
        outsample_mask = np.zeros((self.pred_len + self.label_len, self.n_var))  # m4 dataset

        sampled_timeseries = self.timeseries[index]
        if self.flag=='train':
            cut_point = np.random.randint(low=self.seq_len,
                                      high=len(sampled_timeseries)-self.pred_len+1,
                                      size=1)[0]
        else:
            cut_point = np.random.randint(low=max(1, len(sampled_timeseries)- self.window_sampling_limit),
                                      high=len(sampled_timeseries),
                                      size=1)[0]
        # cut_point = np.random.randint(low=self.seq_len,
        #                               high=len(sampled_timeseries),
        #                               size=1)[0]
        insample_window = sampled_timeseries[max(0, cut_point - self.seq_len):cut_point]
        insample[-len(insample_window):] = insample_window
        insample_mask[-len(insample_window):] = 1.0
        outsample_window = sampled_timeseries[
                           cut_point - self.label_len:min(len(sampled_timeseries), cut_point + self.pred_len)]
        outsample[:len(outsample_window)] = outsample_window
        outsample_mask[:len(outsample_window)] = 1.0
        return insample, outsample, insample_mask, outsample_mask

    def __len__(self):
        return len(self.timeseries)

    def inverse_transform(self, data):
        return self.scaler.inverse_transform(data)

    def last_insample_window(self):
        """
        The last window of insample size of all timeseries.
        This function does not support batching and does not reshuffle timeseries.

        :return: Last insample window of all timeseries. Shape "timeseries, insample size"
        """
        insample = np.zeros((len(self.timeseries), self.seq_len, self.n_var))
        insample_mask = np.zeros((len(self.timeseries), self.seq_len, self.n_var))
        for i, ts in enumerate(self.timeseries):
            ts_last_window = ts[-self.seq_len:]
            insample[i, -len(ts):] = ts_last_window
            insample_mask[i, -len(ts):] = 1.0
        return insample, insample_mask



46

In [9]:
root_path = 'dataset/true_promo/'
data_path = 'all_product_true_promo_train.csv'
dtrain = Dataset_Promo_ean_global_channel(root_path=root_path, data_path=data_path, target='sold_units', flag='train', size=(int(17*1.75),17,17))

In [10]:
dtest = Dataset_Promo_ean_global_channel(root_path=root_path, data_path=data_path, target='sold_units', flag='test', size=(int(17*1.75),17,17))

(29, 12)

In [12]:
from torch.utils.data import DataLoader


In [13]:
dtrain_loader = DataLoader(dtrain, batch_size=10, shuffle=False)
dtest_loader = DataLoader(dtest, batch_size=10, shuffle=False)

In [14]:
x, _ = dtrain_loader.dataset.last_insample_window()

In [15]:
y = dtest_loader.dataset.timeseries

In [19]:
for x,_,y,_ in dtrain_loader:
    print(x.shape)

torch.Size([10, 29, 12])
torch.Size([10, 29, 12])
torch.Size([10, 29, 12])


ValueError: low >= high

In [16]:
from data_provider.m4 import M4Dataset, M4Meta

In [17]:
import torch

In [94]:
x = torch.tensor(x, dtype=torch.float32)
x.shape

torch.Size([1537, 29, 12])

In [95]:
x.shape

torch.Size([1537, 29, 12])

In [96]:
B,_,C = x.shape

In [100]:
id_list = np.arange(0, B, 6)
id_list = np.append(id_list, B)
id_list

array([   0,    6,   12,   18,   24,   30,   36,   42,   48,   54,   60,
         66,   72,   78,   84,   90,   96,  102,  108,  114,  120,  126,
        132,  138,  144,  150,  156,  162,  168,  174,  180,  186,  192,
        198,  204,  210,  216,  222,  228,  234,  240,  246,  252,  258,
        264,  270,  276,  282,  288,  294,  300,  306,  312,  318,  324,
        330,  336,  342,  348,  354,  360,  366,  372,  378,  384,  390,
        396,  402,  408,  414,  420,  426,  432,  438,  444,  450,  456,
        462,  468,  474,  480,  486,  492,  498,  504,  510,  516,  522,
        528,  534,  540,  546,  552,  558,  564,  570,  576,  582,  588,
        594,  600,  606,  612,  618,  624,  630,  636,  642,  648,  654,
        660,  666,  672,  678,  684,  690,  696,  702,  708,  714,  720,
        726,  732,  738,  744,  750,  756,  762,  768,  774,  780,  786,
        792,  798,  804,  810,  816,  822,  828,  834,  840,  846,  852,
        858,  864,  870,  876,  882,  888,  894,  9