In [53]:
import pandas as pd
from torch.utils.data import Dataset
import numpy as np
import os

In [11]:
# Load the training CSV; the path is relative to the notebook's working directory.
data = pd.read_csv("dataset/true_promo/all_product_true_promo_train.csv")

In [24]:
# Peek at the first two rows to check the column layout.
data.head(2)

Unnamed: 0,end_date,ean_global_channel,is_promo,price_range,sub_axis,seasonality_index,sold_units,2 for a price,3 for 2,bogof,bogshp,coupon,listing fee,online,save,site fee,year,month,week
0,2022-10-08,30074576_Offline,True,8.6,Eye Makeup,0.954407,372.0,0,0,0,0,0,0,1,1,1,2022,10,40
1,2022-10-15,30074576_Offline,True,8.6,Eye Makeup,0.986968,355.0,0,0,0,0,0,0,1,1,1,2022,10,41


### We need a way to extract a single product's series from this data.
### We define an id: each (ean, global_channel) combination identifies one time series.

In [43]:
# Distinct series identifiers; the shape shown below is the number of time series.
ids = data["ean_global_channel"].unique()
ids.shape

(1537,)

In [44]:
# Selecting one time series is now a boolean mask on the identifier column.
data[data["ean_global_channel"]==ids[0]].head()

Unnamed: 0,end_date,ean_global_channel,is_promo,price_range,sub_axis,seasonality_index,sold_units,2 for a price,3 for 2,bogof,bogshp,coupon,listing fee,online,save,site fee,year,month,week
0,2022-10-08,30074576_Offline,True,8.6,Eye Makeup,0.954407,372.0,0,0,0,0,0,0,1,1,1,2022,10,40
1,2022-10-15,30074576_Offline,True,8.6,Eye Makeup,0.986968,355.0,0,0,0,0,0,0,1,1,1,2022,10,41
2,2022-10-22,30074576_Offline,True,8.6,Eye Makeup,0.971972,344.0,0,0,0,0,0,0,1,1,1,2022,10,42
3,2022-10-29,30074576_Offline,True,8.6,Eye Makeup,1.051523,334.0,0,0,0,0,0,0,1,1,1,2022,10,43
4,2022-11-05,30074576_Offline,True,8.733333,Eye Makeup,1.057523,320.0,0,0,0,0,0,0,1,1,1,2022,11,44


In [27]:
def preprocess_pipeline(data, id, target):
    """Rename, prune and reorder the raw promo frame.

    The input frame is not modified; a new frame is returned.

    :param data: raw DataFrame containing at least ``end_date``, the ``id``
        column, ``target`` and the calendar/category columns dropped below.
    :param id: name of the column holding the series identifier; it is
        renamed to ``'id'``.
    :param target: name of the column to forecast; it is moved to the last
        position.
    :return: DataFrame ordered as ``date``, remaining variables, ``target``.
    """
    unused_columns = ['is_promo', 'sub_axis', 'year', 'month', 'week']
    renamed = data.rename(columns={'end_date': 'date', id: 'id'})
    pruned = renamed.drop(columns=unused_columns)

    # Everything that is neither the date nor the target keeps its relative order.
    middle = pruned.columns.tolist()
    for fixed_name in (target, 'date'):
        middle.remove(fixed_name)

    return pruned[['date', *middle, target]]

In [31]:
# Apply the preprocessing: the series identifier becomes 'id' and 'sold_units' moves to the last column.
df = preprocess_pipeline(data, 'ean_global_channel', 'sold_units')

In [45]:
# One value array per series id (first-appearance order), with the id column removed.
timeseries = [df[df['id'] == series_id].drop('id', axis=1).values for series_id in df['id'].unique()]

In [47]:
# Sample train file name used to try out the train -> test file-name substitution.
a = 'true_promo_train.csv'

In [48]:
# Derive the test file name from the train file name.
a.replace('train', 'test')

'true_promo_test.csv'

In [69]:

class Dataset_Promo_ean_global_channel(Dataset):
    """Windowed dataset over per-product promo time series.

    Every distinct value of the ``ean_global_channel`` column is one series.
    Indexing the dataset picks a cut point inside the requested series and
    returns zero-padded input/output windows together with masks that mark
    which positions hold real data.
    """

    def __init__(self, root_path, flag='train', size=None,
                 features='S', data_path='ETTh1.csv',
                 target='sold_units', scale=False, inverse=False, timeenc=0, freq='15min',
                 seasonal_patterns='Yearly'):
        """Read the CSV for the requested split and split it into series.

        :param root_path: directory containing the CSV files.
        :param flag: ``'train'`` reads ``data_path``; any other value reads
            ``data_path`` with ``'train'`` replaced by ``'test'``.
        :param size: required ``(seq_len, label_len, pred_len)`` sequence.
        :param features: stored on the instance; not used by this class.
        :param data_path: file name of the train CSV inside ``root_path``.
        :param target: column to forecast; placed last in every series array.
        :param scale: stored on the instance; no scaler is fitted here.
        :param inverse: stored on the instance; not used by this class.
        :param timeenc: stored on the instance; not used by this class.
        :param freq: stored on the instance; not used by this class.
        :param seasonal_patterns: stored on the instance; not used by this class.
        :raises TypeError: if ``size`` is not provided.
        """
        if size is None:
            raise TypeError("size must be a (seq_len, label_len, pred_len) sequence, got None")

        self.features = features
        self.target = target
        self.scale = scale
        self.inverse = inverse
        self.timeenc = timeenc
        self.freq = freq
        self.root_path = root_path
        # Kept on the instance so __read_data__ does not depend on notebook globals.
        self.data_path = data_path

        self.seq_len = size[0]
        self.label_len = size[1]
        self.pred_len = size[2]

        self.seasonal_patterns = seasonal_patterns
        self.history_size = 1.5
        self.window_sampling_limit = int(self.history_size * self.pred_len)
        self.flag = flag

        # No scaling is applied by this class; inverse_transform is a no-op until a scaler is set.
        self.scaler = None

        self.__read_data__()

    def preprocess_pipeline(self, data, id = 'ean_global_channel'):
        """Rename, prune and reorder the raw frame.

        :param data: raw DataFrame as read from the CSV.
        :param id: name of the series-identifier column; renamed to ``'id'``.
        :return: new DataFrame without the date column, with the target last.
        """
        data = data.rename(columns={'end_date': 'date', id: 'id'})
        data = data.drop(['is_promo', 'sub_axis', 'year', 'month', 'week'], axis=1)
        cols = list(data.columns)
        cols.remove(self.target)
        cols.remove('date')
        # Variables first, target last; the date column is intentionally left out.
        return data[cols + [self.target]]

    def __read_data__(self):
        """Load the CSV for ``self.flag`` and build one value array per series id."""
        if self.flag == 'train':
            file_name = self.data_path
        else:
            file_name = self.data_path.replace('train', 'test')
        dataset = pd.read_csv(os.path.join(self.root_path, file_name))

        df = self.preprocess_pipeline(dataset)
        # Ids come from the frame that was just read, so train and test splits
        # each get their own list of series.
        self.ids = df['id'].unique()
        self.timeseries = [df[df['id'] == series_id].drop('id', axis=1).values
                           for series_id in self.ids]
        self.n_var = self.timeseries[0].shape[1]

    def __getitem__(self, index):
        """Sample one (input, output) window pair from series ``index``.

        :return: ``(insample, outsample, insample_mask, outsample_mask)``.
            ``insample`` is ``(seq_len, n_var)`` and right-aligned;
            ``outsample`` is ``(label_len + pred_len, n_var)`` with the cut
            point at row ``label_len``. Masks are 1.0 where real data was
            copied and 0.0 on zero padding.
        """
        insample = np.zeros((self.seq_len, self.n_var))
        insample_mask = np.zeros((self.seq_len, self.n_var))
        outsample = np.zeros((self.pred_len + self.label_len, self.n_var))
        outsample_mask = np.zeros((self.pred_len + self.label_len, self.n_var))

        sampled_timeseries = self.timeseries[index]
        latest_cut = len(sampled_timeseries) - self.pred_len
        if latest_cut >= self.seq_len:
            cut_point = np.random.randint(low=self.seq_len,
                                          high=latest_cut + 1,
                                          size=1)[0]
        else:
            # Series too short for a full window: use the latest usable cut
            # point and let zero padding plus the masks cover the gap.
            cut_point = max(1, latest_cut)

        insample_window = sampled_timeseries[max(0, cut_point - self.seq_len):cut_point]
        insample[self.seq_len - len(insample_window):] = insample_window
        insample_mask[self.seq_len - len(insample_window):] = 1.0

        # Label steps end at the cut point, future steps start there; clamping
        # the start at 0 avoids a negative index wrapping around the series.
        label_window = sampled_timeseries[max(0, cut_point - self.label_len):cut_point]
        future_window = sampled_timeseries[cut_point:cut_point + self.pred_len]
        label_start = self.label_len - len(label_window)
        future_stop = self.label_len + len(future_window)
        outsample[label_start:self.label_len] = label_window
        outsample[self.label_len:future_stop] = future_window
        outsample_mask[label_start:future_stop] = 1.0
        return insample, outsample, insample_mask, outsample_mask

    def __len__(self):
        """Number of time series (one sample is drawn per series)."""
        return len(self.timeseries)

    def inverse_transform(self, data):
        """Undo scaling with ``self.scaler``; return ``data`` unchanged when no scaler is set."""
        scaler = getattr(self, 'scaler', None)
        if scaler is None:
            return data
        return scaler.inverse_transform(data)

    def last_insample_window(self):
        """
        The last window of insample size of all timeseries.
        This function does not support batching and does not reshuffle timeseries.

        :return: ``(insample, insample_mask)``, each of shape
            "timeseries, insample size, variables". Series shorter than the
            insample size are right-aligned and zero-padded on the left.
        """
        n_series = len(self.timeseries)
        insample = np.zeros((n_series, self.seq_len, self.n_var))
        insample_mask = np.zeros((n_series, self.seq_len, self.n_var))
        for i, ts in enumerate(self.timeseries):
            ts_last_window = ts[-self.seq_len:]
            start = self.seq_len - len(ts_last_window)
            insample[i, start:] = ts_last_window
            insample_mask[i, start:] = 1.0
        return insample, insample_mask



In [86]:
# Build the training dataset. size is (seq_len, label_len, pred_len): a 13-step
# input window, 3 label steps and a 1-step prediction horizon.
root_path = 'dataset/true_promo/'
data_path = 'all_product_true_promo_train.csv'
dtrain = Dataset_Promo_ean_global_channel(root_path=root_path, data_path=data_path, target='sold_units', flag='train', size=(13,3,1))

In [83]:
# Input window (first element of the returned tuple) for series 4; the cut point
# is drawn with np.random, so the rows can differ between runs.
dtrain[4][0]

array([[ 6.7       ,  0.81650573,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  1.        ,
         1.        , 27.        ],
       [ 6.7       ,  0.83288112,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  1.        ,
         1.        , 31.        ],
       [ 6.7       ,  0.86386914,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  1.        ,
         1.        , 34.        ],
       [ 7.55      ,  0.86255283,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  1.        ,  1.        ,
         1.        , 24.        ],
       [ 8.4       ,  0.86279302,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  1.        ,  1.        ,
         0.        , 15.        ],
       [ 8.4       ,  0.87171183,  0.        ,  0.        ,  0.        ,
         0.        ,  