In [None]:
!pip install numpy==1.20

In [None]:
import pandas as pd
import numpy as np
import datetime
import json
import pickle
from joblib import Parallel, delayed

# Load the train and test tables.
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

# Read the three 7-day sequence tables and stack them row-wise into one frame.
abs_frames = []
for abs_path in (
    'train_bert_data/train_7days.csv',
    'train_bert_data/val_7days.csv',
    'train_bert_data/test_7days.csv',
):
    abs_frames.append(pd.read_csv(abs_path))
abs_data = pd.concat(abs_frames)

# Cell output: shape of the stacked frame and the NumPy version in use.
(abs_data.shape, np.__version__)

In [None]:
# Length in days of each input window; process_for_dates adds it to a start date
# to obtain the (exclusive) end date of the window.
days_len = 7
# Sorted unique item ids taken from the training table; iterated by the
# per-item processing cells.
items = sorted(train['item'].drop_duplicates())

In [None]:
# Mean sales per (item, store, date) for 2017 onward.
# The filtered rows are bound to a new name instead of overwriting `train`:
# later cells select `train['date'] < '2016-01-01'` (e.g. to fit the scalers),
# which would be an empty selection if `train` itself were truncated here.
train_recent = train[train['date'] >= '2017-01-01']
train_gp = (
    train_recent
    .sort_values('date')
    .groupby(['item', 'store', 'date'], as_index=False)
    .agg({'sales': ['mean']})
)
# Flatten the two-level column index produced by the dict-of-list aggregation.
train_gp.columns = ['item', 'store', 'date', 'sales']
train_gp.head()

In [None]:
def series_to_supervised(data, window=1, lag=1, dropnan=True):
    """Reshape a time-ordered frame into a supervised-learning frame.

    The original cell ran these statements at top level with ``data = train`` and
    ``window = 29``, but it could not execute: the ``lag`` assignment was left
    unfinished and ``dropnan`` was never defined. Both are parameters here.

    Args:
        data: DataFrame whose rows are consecutive time steps.
        window: Number of past steps (t-window ... t-1) to add as input columns.
        lag: Offset of the target step; columns are named ``<col>(t+lag)``.
        dropnan: If true, drop rows that contain NaN introduced by shifting.

    Returns:
        A new DataFrame with columns ``<col>(t-i)``, ``<col>(t)`` and
        ``<col>(t+lag)`` for every column of ``data``. ``data`` is not modified.
    """
    cols, names = [], []
    # Input sequence (t-n, ... t-1)
    for i in range(window, 0, -1):
        cols.append(data.shift(i))
        names += ['%s(t-%d)' % (col, i) for col in data.columns]
    # Current timestep (t=0)
    cols.append(data)
    names += ['%s(t)' % col for col in data.columns]
    # Target timestep (t=lag)
    cols.append(data.shift(-lag))
    names += ['%s(t+%d)' % (col, lag) for col in data.columns]
    # Put it all together
    agg = pd.concat(cols, axis=1)
    agg.columns = names
    # Drop rows with NaN values
    if dropnan:
        agg = agg.dropna()
    return agg

In [None]:
# Sanity check of the lagged-column name format (the opening quote was missing).
'item(t-%d)' % 9

In [None]:
def process_for_dates(date_list):
    """Build per-item window records for every start date in ``date_list``.

    Reads the notebook-level ``days_len``, ``items``, ``abs_data`` and ``train``.

    Args:
        date_list: Iterable of start dates formatted as ``'YYYY-MM-DD'``.

    Returns:
        ``{item: {start_str: record}}`` where each record holds:
          * ``'store_seq'``: one list per day of the window with the ``seq_cnv``
            strings of that day containing ``':item<item>-'``;
          * ``'input'``: sales inside the window, one list per store column;
          * ``'output'``: sales per store on the day right after the window;
          * ``'total'``: per-day sales summed over stores inside the window.
    """
    one_day = datetime.timedelta(days=1)
    result_dict = dict()
    for start_str in date_list:
        start_date = datetime.datetime(*map(int, start_str.split('-')))
        end_date = start_date + datetime.timedelta(days=days_len)
        end_str = end_date.strftime('%Y-%m-%d')

        # The per-day sequence lookup does not depend on the item, so it is done
        # once per window instead of once per (item, day).
        daily_seqs = []
        dd = start_date
        while dd < end_date:
            day_mask = abs_data['date'] == dd.strftime('%Y-%m-%d')
            daily_seqs.append(list(abs_data.loc[day_mask, 'seq_cnv']))
            dd += one_day

        # Date masks are shared by all items of this window as well.
        in_window = (train['date'] >= start_str) & (train['date'] < end_str)
        on_target = train['date'] == end_str

        for item in items:
            item_tag = f':item{item}-'
            abs_parts = [[seq for seq in seqs if item_tag in seq] for seqs in daily_seqs]

            is_item = train['item'] == item
            input_pivot = (
                train.loc[is_item & in_window]
                .sort_values(by=['date'])
                .pivot(index='date', columns='store', values='sales')
            )
            output_parts = train.loc[is_item & on_target].pivot(
                index='date', columns='store', values='sales'
            )

            result_dict.setdefault(item, dict())[start_str] = {
                'store_seq': abs_parts,
                'input': [[int(v) for v in row] for row in input_pivot.values.T],
                # ravel() keeps a 1-D array even with a single store column;
                # squeeze() would collapse that case to a non-iterable 0-d array.
                'output': [int(v) for v in output_parts.values.ravel()],
                'total': [int(v) for v in input_pivot.sum(1).values],
            }
    return result_dict

In [None]:
%%time 

# n_jobs: how many date blocks are processed in parallel per round.
# n_len: approximate number of start dates per block.
n_jobs, n_len = 20, 5

# NOTE(review): train_idx / val_idx / test_idx are not defined in the visible
# cells; they are presumably the start-date lists of each split — confirm.
for idx, filename in zip([train_idx, val_idx, test_idx], ['train', 'val', 'test']):
    # Always ask for at least one block: round() yields 0 for one or two dates,
    # and np.array_split rejects zero sections.
    n_block = max(1, round(len(idx) / n_len))
    splits = np.array_split(idx, n_block)

    data_dict_list = []
    for ii in range(0, len(splits), n_jobs):
        tmp = splits[ii:ii + n_jobs]
        print(filename, [len(x) for x in tmp])
        start_time = datetime.datetime.now()
        data_dict_list += Parallel(n_jobs=len(tmp))(delayed(process_for_dates)(dates) for dates in tmp)
        print(ii, datetime.datetime.now() - start_time)

    # Merge the per-block results into one {item: {date: record}} dict. A fresh
    # dict is used for every split, and setdefault means the merge does not
    # rely on the first block containing every item key.
    merged_dict = dict()
    for data_dict in data_dict_list:
        for k, v in data_dict.items():
            merged_dict.setdefault(k, dict()).update(v)

    with open(f'sales_data/{filename}_7days.json', 'w') as f:
        json.dump(merged_dict, f)

In [None]:
import os

# Split each `<name>_7days.json` into one file per item, dropping the windows
# in which at least one day has no sequence for that item.
for name in ['train', 'val', 'test']:
    path = f'sales_data/{name}_items'
    # Create the directory from Python rather than shelling out to `mkdir`:
    # portable, and no error if it already exists.
    os.makedirs(path, exist_ok=True)
    # Context manager so the input file handle is closed after parsing.
    with open(f'sales_data/{name}_7days.json', 'r') as f:
        data = json.load(f)
    for item in items:
        # JSON object keys are strings, hence str(item).
        item_data = data[str(item)]
        # Collect first, delete afterwards: the dict cannot shrink while iterated.
        remove_dates = [d for d in item_data if [] in item_data[d]['store_seq']]
        print(remove_dates)
        for d in remove_dates:
            del item_data[d]
        with open(f'{path}/item{item}.json', 'w') as f:
            json.dump(item_data, f)

In [None]:
# Convert every per-item {date: record} file into a list of row records
# (one dict per date, with the date stored under 'date').
items = sorted(train['item'].drop_duplicates())
for item in items:
    for name in ['train', 'val', 'test']:
        # Context manager so the input file handle is closed after parsing.
        with open(f'sales_data/{name}_items/item{item}.json', 'r') as f:
            data_dict = json.load(f)
        # Build the records directly. This yields the same rows as going through
        # a DataFrame and to_dict('records'), without assigning a dict_keys view
        # to a column.
        df_json = [
            {
                'store_seq': rec['store_seq'],
                'input': rec['input'],
                'output': rec['output'],
                'total': rec['total'],
                'date': date,
            }
            for date, rec in data_dict.items()
        ]
        # NOTE: the output name differs from the input only by the capital "I";
        # on a case-insensitive filesystem this overwrites the input file.
        with open(f'sales_data/{name}_items/Item{item}.json', 'w') as f:
            json.dump(df_json, f)

In [None]:
import torch
from transformers import ElectraModel, ElectraTokenizer

# Locations passed to `from_pretrained` for the model weights and the tokenizer.
MODEL_PATH = 'sales_seq_electra-small_model'
TOK_PATH = 'koelectra_tokenizer'
tokenizer = ElectraTokenizer.from_pretrained(TOK_PATH)
# NOTE(review): `bert` is loaded but not referenced by the functions defined in
# this cell (`bert_model` only calls the tokenizer) — confirm it is still needed.
bert = ElectraModel.from_pretrained(MODEL_PATH)


def bert_model(seq_data):
    """Tokenize nested sequence lists into one stacked array of token ids.

    Despite the name, only the notebook-level ``tokenizer`` is called here.

    Args:
        seq_data: Iterable of samples; each sample is an iterable of string
            lists, and each string list is encoded as one batch.

    Returns:
        ``np.ndarray`` stacking, per sample, the stacked ``input_ids`` of each
        batch (every sequence padded/truncated to 32 tokens).
    """
    def _encode(seqs):
        encoded = tokenizer.batch_encode_plus(
            seqs,
            padding='max_length',
            truncation=True,
            max_length=32,
            return_tensors='pt'
        )
        return encoded.input_ids.numpy()

    return np.stack([
        np.stack([_encode(seqs) for seqs in seq_list])
        for seq_list in seq_data
    ])

def scale(arr_list, scaler=None):
    """Apply a fitted scaler element-wise and return a float16 array.

    Args:
        arr_list: Array-like of numbers with a regular (non-ragged) shape.
        scaler: Object with a ``transform`` method accepting an ``(n, 1)``
            array. Defaults to the notebook-level ``pow_scaler``, which must
            already be defined when the function is called.

    Returns:
        ``np.ndarray`` of dtype float16. A 2-D input ``(n, m)`` comes back as
        ``(n, m, 1)``; any other input whose last dimension is not 1 gets a new
        axis inserted at position 1 (e.g. ``(n, a, b)`` -> ``(n, 1, a, b)``).
    """
    if scaler is None:
        scaler = pow_scaler

    arr = np.array(arr_list)
    sizes = list(arr.shape)
    if len(sizes) == 2:
        sizes.append(1)

    # The scaler expects one feature column; a C-order reshape flattens every
    # input rank the same way the previous vstack/concatenate chain did.
    flat = arr.reshape(-1, 1)
    transed = scaler.transform(flat).reshape(*sizes)

    if sizes[-1] != 1:
        sizes_ = sizes[:1] + [1] + sizes[1:]
        transed = transed.reshape(*sizes_)
    return np.array(transed, dtype=np.float16)

def process_file(batch):
    """Map a batch to its tokenized form for ``datasets.map``.

    Only ``'bert_input'`` is produced; the scaled ``'input_'``, ``'output_'``
    and ``'total_'`` columns are currently disabled.

    Args:
        batch: Mapping that contains a ``'store_seq'`` entry.

    Returns:
        ``{'bert_input': <result of bert_model on batch['store_seq']>}``.
    """
    token_ids = bert_model(batch['store_seq'])
    return {'bert_input': token_ids}

In [None]:
%%time

from datasets import load_dataset

# NOTE(review): the split is labelled 'test' but the file is read from
# `train_items` — confirm this is intentional.
datasets = load_dataset('json', data_files={'test': 'sales_data/train_items/Item1.json'})
# Run process_file over batches in two worker processes and drop the raw
# 'store_seq' and 'date' columns from the result.
# NOTE(review): with load_from_cache_file=True a cached result may be reused
# after process_file has been edited — verify when changing that function.
testdatasets = datasets.map(
    process_file,
    batched=True,
    num_proc=2,
    load_from_cache_file=True,
    remove_columns=['store_seq', 'date'], # 'input', 'output', 'total' are kept
)

In [None]:
import torch

# Shape check: reducing dimension 1 of a (26, 10, 20) tensor with mean.
a = torch.randn(26, 10, 20)
a.mean(dim=1).shape

In [None]:
# Inspect the 'bert_input' value of the third row of the mapped dataset.
testdatasets['test'].data.to_pandas()['bert_input'].iloc[2]

In [None]:
# Inspect the first two 'input_' values.
# NOTE(review): process_file currently has its 'input_' entry commented out, so
# this column only exists if that line is re-enabled — confirm before running.
testdatasets['test'].data.to_pandas()['input_'].iloc[:2].values

In [None]:
# Random (2, 3, 5) array with values in [0, 1); rebinds the scratch name `a`.
a = np.random.random_sample((2, 3, 5))
a

In [None]:
# Transpose each sample along the first axis, i.e. swap the last two axes.
np.swapaxes(a, 1, 2)

In [None]:
# Shape of the first row's 'output' value after stacking it into a float tensor
# and casting to half precision.
torch.FloatTensor(np.stack(testdatasets['test'].data.to_pandas()['output'].iloc[0])).half().shape

In [None]:
import seaborn as sns
%matplotlib inline

# Last rows of the training table dated before 2016-01-01 (string comparison on 'date').
train.loc[train['date'] < '2016-01-01'].tail()

In [None]:
# Distribution of raw sales for rows dated before 2016-01-01.
# NOTE(review): `distplot` is deprecated in recent seaborn releases; check the
# installed version before relying on it.
sns.distplot(train.loc[train['date'] < '2016-01-01', 'sales'])

In [None]:
from sklearn.preprocessing import StandardScaler, PowerTransformer

# Fit both scalers on the same single-column matrix of sales dated before 2016-01-01.
pre2016_sales = train.loc[train['date'] < '2016-01-01', 'sales'].values.reshape(-1, 1)
std_scaler = StandardScaler().fit(pre2016_sales)
pow_scaler = PowerTransformer().fit(pre2016_sales)

In [None]:
# Standard-scaled sales for the rows dated before 2016-01-01 (same selection the scaler was fitted on).
std_scaler.transform(train.loc[train['date'] < '2016-01-01', 'sales'].values.reshape(-1, 1))

In [None]:
# Distribution of standard-scaled sales for rows dated on or after 2016-01-01,
# i.e. rows outside the selection used to fit std_scaler.
sns.distplot(std_scaler.transform(train.loc[train['date'] >= '2016-01-01', 'sales'].values.reshape(-1, 1)))

In [None]:
# Distribution of power-transformed sales for rows dated before 2016-01-01.
sns.distplot(pow_scaler.transform(train.loc[train['date'] < '2016-01-01', 'sales'].values.reshape(-1, 1)))

In [None]:
# Distribution of power-transformed sales for rows dated on or after 2016-01-01.
sns.distplot(pow_scaler.transform(train.loc[train['date'] >= '2016-01-01', 'sales'].values.reshape(-1, 1)))

In [None]:
import pickle

# To persist a freshly fitted scaler, use pickle.dump with a file opened in 'wb' mode, e.g.
#   with open('sales_data/standard_transformer.pkl', 'wb') as f: pickle.dump(std_scaler, f)
# Replace the in-memory `pow_scaler` with the previously saved transformer. The
# context manager closes the file handle once loading is done.
# NOTE: pickle.load can execute arbitrary code from the file — only load trusted artifacts.
with open('sales_data/power_transformer.pkl', 'rb') as f:
    pow_scaler = pickle.load(f)

In [None]:
import torch.nn as nn

# Seven-layer GRU mapping 10 input features to a hidden size of 20, batch-first.
gru = nn.GRU(
    input_size=10,
    hidden_size=20,
    num_layers=7,
    batch_first=True,
    dropout=0.2,
)

In [None]:
# Move the GRU parameters to the CPU; the module itself is returned and displayed.
gru.cpu()

In [None]:
import torch

# Build the 2x2 tensor directly from its two rows, then transpose it.
torch.tensor([[-1.5941, -1.3562], [-1.1443, -0.7769]]).T