In [8]:
import os
import numpy as np
import pandas as pd
from torch.utils.data import Dataset
from sklearn.preprocessing import StandardScaler
from utils.timefeatures import time_features
from ean_global_channel import check_saved_standardization_data, generate_standardization_dicts, save_standardization_data, load_standardization_data
import warnings
import hashlib
from torch.utils.data import DataLoader
warnings.filterwarnings('ignore')

class Dataset_Promo_ean_global_channel(Dataset):
    """Per-id promo time series served as randomly cut (input, target) windows.

    The CSV is read once, preprocessed (optional embedding of the flag columns,
    optional per-id standardization) and split into one array per unique ``id``.
    ``__getitem__`` draws a random cut point inside the selected series and
    returns zero-padded input/target windows plus their validity masks.
    """

    def __init__(self, root_path, flag='train', size=None,
                 features='S', data_path='ETTh1.csv',
                 target='sold_units', scale=False, inverse=False, timeenc=0, freq='15min',
                 seasonal_patterns='Yearly', scale_path=None, embedding=True, embedding_dimension = 2):
        """Store the configuration and load the data.

        :param root_path: directory that contains the CSV file.
        :param flag: 'train' reads ``data_path``; any other value reads the file
            obtained by replacing 'train' with 'test' in ``data_path``.
        :param size: ``[seq_len, label_len, pred_len]``; required.
        :param features, inverse, timeenc, seasonal_patterns: stored on the
            instance; not otherwise used by the code in this class.
        :param freq: accepted for signature compatibility; not stored.
        :param data_path: CSV file name relative to ``root_path``.
        :param target: name of the target column (moved to the last position).
        :param scale: standardize per id using the statistics at ``scale_path``.
        :param scale_path: location handed to the standardization helpers.
        :param embedding: replace the flag columns by a deterministic embedding.
        :param embedding_dimension: number of embedding columns to generate.
        :raises ValueError: if ``size`` is missing or has fewer than 3 entries.
        """
        if size is None or len(size) < 3:
            raise ValueError("size must be [seq_len, label_len, pred_len]")

        self.features = features
        self.target = target
        self.scale = scale
        self.scale_path = scale_path
        self.inverse = inverse
        self.timeenc = timeenc
        self.root_path = root_path
        self.data_path = data_path
        self.flag = flag

        self.embedding_dict = {}
        self.embedding_dim = embedding_dimension
        self.embedding = embedding

        self.seq_len = size[0]
        self.label_len = size[1]
        self.pred_len = size[2]
        self.seasonal_patterns = seasonal_patterns
        self.history_size = 2
        # Non-train cut points are drawn from the last `history_size * pred_len` steps.
        self.window_sampling_limit = int(self.history_size * self.pred_len)
        self.__read_data__()

    def generate_combinations(self, n):
        """Generates all unique combinations of binary values for n binary columns"""
        return [[(i >> j) & 1 for j in range(n)] for i in range(2**n)]

    def deterministic_embedding(self, comb):
        """Generates a deterministic embedding based on a hash of the combination"""
        hash_object = hashlib.sha256(str(comb).encode())
        hash_digest = hash_object.digest()
        # The first 4 digest bytes seed the generator, so equal combinations
        # always map to the same vector.
        seed = int.from_bytes(hash_digest[:4], 'little')
        rng = np.random.default_rng(seed)
        return rng.random(self.embedding_dim)

    def _embed_binary_columns(self, data):
        """Replace the flag columns by ``embedding_dim`` deterministic embedding columns."""
        # Every column that is not a known numeric feature, the id or the target
        # is treated as a binary flag.
        binary_columns = [col for col in data.columns
                          if col not in ['price_range', 'seasonality_index', 'id', self.target]]

        # NOTE: the table has 2**len(binary_columns) entries.
        unique_combinations = self.generate_combinations(len(binary_columns))
        self.embedding_dict = {tuple(comb): self.deterministic_embedding(comb)
                               for comb in unique_combinations}

        def get_embedding(row):
            # Raises KeyError when a flag column holds a value other than 0/1.
            return self.embedding_dict[tuple(row[binary_columns])]

        embeddings = data[binary_columns].apply(get_embedding, axis=1)
        # Column names follow the configured dimension instead of a fixed pair,
        # and the frame reuses `data`'s index so the concat aligns row by row.
        embedding_columns = [f'embedding_{k + 1}' for k in range(self.embedding_dim)]
        embedding_df = pd.DataFrame(embeddings.tolist(), index=data.index,
                                    columns=embedding_columns)
        data = pd.concat([data, embedding_df], axis=1)
        return data.drop(columns=binary_columns)

    def _standardize_per_id(self, data):
        """Standardize the numeric columns of every id with the saved mean/std.

        Rows whose id has no saved statistics are left out of the result (only a
        message is printed), matching the previous behavior.
        """
        columns_to_standarize = ['price_range', 'sold_units', 'seasonality_index']
        if not check_saved_standardization_data(self.scale_path):
            # NOTE(review): statistics are generated from whichever split is
            # loaded first — presumably this should be the training split; confirm.
            mean_dict, std_dict, ids = generate_standardization_dicts(data)
            save_standardization_data(mean_dict, std_dict, ids, self.scale_path)
            print(f"standarization dictionaries are created in{self.scale_path}")
        print(f"scaling the data of {self.flag}")
        mean_dict, std_dict, ids = load_standardization_data(self.scale_path)

        # Collect the groups and concatenate once (avoids quadratic re-copying).
        standardized_groups = []
        for id_value, group in data.groupby('id'):
            if id_value not in mean_dict:
                print(f"No training data statistics for id: {id_value}. Skipping standardization for this id.")
                continue
            means = pd.Series(mean_dict[id_value])
            stds = pd.Series(std_dict[id_value])
            standardized_group = group.copy()
            for col in columns_to_standarize:
                if col in means and col in stds:
                    std = stds[col]
                    if pd.isna(std) or std == 0:
                        # A constant (or single-sample) series would otherwise
                        # produce inf/NaN; only centre it.
                        std = 1.0
                    # Standardize each column in the group using the saved stats
                    standardized_group[col] = (group[col] - means[col]) / std
                else:
                    print(f"No training data statistics for column: {col} in id: {id_value}. Skipping standardization for this column.")
            standardized_group['id'] = id_value  # Add id column back
            standardized_groups.append(standardized_group)
        print(f"standarization is over of {self.flag}")

        if not standardized_groups:
            # Keep the column layout so the caller can still reorder columns.
            return data.iloc[0:0].copy()
        return pd.concat(standardized_groups)

    def preprocess_pipeline(self, data, id='ean_global_channel'):
        """Run all preprocessing and return a frame whose last column is the target.

        :param data: raw frame; must contain 'end_date', the ``id`` column, the
            target and the columns 'is_promo', 'sub_axis', 'year', 'month', 'week'.
        :param id: name of the column that identifies a series.
        :return: frame with columns ``id``, features..., target (date is dropped).
        """
        data = data.rename(columns={'end_date': 'date', id: 'id'})
        data = data.drop(['is_promo', 'sub_axis', 'year', 'month', 'week'], axis=1)
        cols = list(data.columns)
        cols.remove(self.target)
        cols.remove('date')
        data = data[cols + [self.target]]  # features first, target last; date is not used

        if self.embedding:
            data = self._embed_binary_columns(data)

        if self.scale:
            standardized_data = self._standardize_per_id(data)
        else:
            standardized_data = data.copy()

        # Embedding columns were appended after the target; move the target last again.
        cols = list(standardized_data.columns)
        cols.remove(self.target)
        standardized_data = standardized_data[cols + [self.target]]
        return standardized_data

    def __read_data__(self):
        """Read the CSV for the current flag and split it into one array per id."""
        if self.flag == 'train':
            csv_name = self.data_path
        else:
            csv_name = self.data_path.replace('train', 'test')
        dataset = pd.read_csv(os.path.join(self.root_path, csv_name))

        # Preprocessing dataset:
        df = self.preprocess_pipeline(dataset)
        self.ids = df['id'].unique()
        self.timeseries = [df.loc[df['id'] == id_value].drop('id', axis=1).values
                           for id_value in self.ids]
        self.n_var = self.timeseries[0].shape[1]

    def __getitem__(self, index):
        """Return ``(insample, outsample, insample_mask, outsample_mask)`` for one series.

        ``insample`` is ``(seq_len, n_var)`` and right-aligned; ``outsample`` is
        ``(label_len + pred_len, n_var)`` laid out so that the cut point always
        sits at row ``label_len``. Positions without data are zero with mask 0.
        """
        insample = np.zeros((self.seq_len, self.n_var))
        insample_mask = np.zeros((self.seq_len, self.n_var))
        outsample = np.zeros((self.pred_len + self.label_len, self.n_var))
        outsample_mask = np.zeros((self.pred_len + self.label_len, self.n_var))

        sampled_timeseries = self.timeseries[index]
        n_steps = len(sampled_timeseries)
        if self.flag == 'train':
            # Any cut that leaves a full input window before and a full horizon after.
            low, high = self.seq_len, n_steps - self.pred_len + 1
        else:
            low, high = max(1, n_steps - self.window_sampling_limit), n_steps
        if high <= low:
            # Series too short for the requested range: use the latest feasible
            # cut and rely on zero padding instead of failing in randint.
            low = max(1, high - 1)
            high = low + 1
        cut_point = np.random.randint(low=low, high=high, size=1)[0]

        insample_window = sampled_timeseries[max(0, cut_point - self.seq_len):cut_point]
        if len(insample_window):
            insample[-len(insample_window):] = insample_window
            insample_mask[-len(insample_window):] = 1.0

        # Clamp the start at 0: a negative start would wrap around to the end of
        # the series. `offset` keeps the cut point at row `label_len` when the
        # label part had to be shortened.
        out_start = max(0, cut_point - self.label_len)
        outsample_window = sampled_timeseries[out_start:min(n_steps, cut_point + self.pred_len)]
        offset = self.label_len - (cut_point - out_start)
        outsample[offset:offset + len(outsample_window)] = outsample_window
        outsample_mask[offset:offset + len(outsample_window)] = 1.0
        return insample, outsample, insample_mask, outsample_mask

    def __len__(self):
        """Number of series (one item per id)."""
        return len(self.timeseries)

    def inverse_transform(self, data):
        """Undo scaling with ``self.scaler``.

        This class never creates a scaler (standardization is per id and driven
        by the statistics stored at ``scale_path``), so this only works when a
        ``scaler`` attribute has been attached from outside.

        :raises AttributeError: if no ``scaler`` attribute is available.
        """
        scaler = getattr(self, 'scaler', None)
        if scaler is None:
            raise AttributeError(
                "Dataset_Promo_ean_global_channel has no 'scaler'; per-id statistics "
                "are stored at scale_path and must be applied by the caller")
        return scaler.inverse_transform(data)

    def last_insample_window(self):
        """
        The last window of insample size of all timeseries.
        This function does not support batching and does not reshuffle timeseries.

        :return: Last insample window of all timeseries and its mask, each of
                 shape "timeseries, insample size, variables"
        """
        insample = np.zeros((len(self.timeseries), self.seq_len, self.n_var))
        insample_mask = np.zeros((len(self.timeseries), self.seq_len, self.n_var))
        for i, ts in enumerate(self.timeseries):
            ts_last_window = ts[-self.seq_len:]
            if len(ts_last_window):
                # Right-align; shorter series keep leading zeros with mask 0.
                insample[i, -len(ts_last_window):] = ts_last_window
                insample_mask[i, -len(ts_last_window):] = 1.0
        return insample, insample_mask

def data_provider(args, flag):
    """Build the promo dataset for ``flag`` and a DataLoader over it.

    :param args: configuration object; the fields read here are ``root_path``,
        ``data_path``, ``seq_len``, ``label_len``, ``pred_len``, ``features``,
        ``target``, ``scale``, ``scale_path``, ``embedding``,
        ``embedding_dimension``, ``batch_size`` and ``num_workers``.
    :param flag: split name. 'test' gives an unshuffled loader that drops the
        last incomplete batch; any other value gives a shuffled loader that
        keeps it.
    :return: ``(data_set, data_loader)``.
    """
    is_test = flag == 'test'

    data_set = Dataset_Promo_ean_global_channel(
        root_path=args.root_path,
        data_path=args.data_path,
        flag=flag,
        size=[args.seq_len, args.label_len, args.pred_len],
        features=args.features,
        target=args.target,
        scale=args.scale,
        scale_path=args.scale_path,
        embedding=args.embedding,
        embedding_dimension=args.embedding_dimension
    )
    data_loader = DataLoader(
        data_set,
        batch_size=args.batch_size,
        shuffle=not is_test,
        num_workers=args.num_workers,
        drop_last=is_test)
    return data_set, data_loader


In [2]:
from math import sqrt

import torch
import torch.nn as nn

from transformers import LlamaConfig, LlamaModel, LlamaTokenizer, GPT2Config, GPT2Model, GPT2Tokenizer, BertConfig, \
    BertModel, BertTokenizer
from layers.Embed import PatchEmbedding
import transformers
from layers.StandardNorm import Normalize
from vertexai.preview import VertexModel # VertexModel
import vertexai
from utils.tools import del_files, EarlyStopping, adjust_learning_rate, vali, load_content


from torch import optim
from torch.optim import lr_scheduler
from tqdm import tqdm


import time
import numpy as np
import os



from torch.cuda.amp import autocast, GradScaler

transformers.logging.set_verbosity_error()

def test_MS(args, device, model, train_loader, vali_loader, criterion):
    x, _ = train_loader.dataset.last_insample_window()
    y = vali_loader.dataset.timeseries
    x = torch.tensor(x, dtype=torch.float32).to(device)
    print("Shape of X eval", x.shape)
    model.eval()
    with torch.no_grad():
        B, _, C = x.shape
        dec_inp = torch.zeros((B, args.pred_len, C)).float().to(device)
        dec_inp = torch.cat([x[:, -args.label_len:, :], dec_inp], dim=1)
        outputs = torch.zeros((B, args.pred_len, C)).float().to(device)
        id_list = np.arange(0, B, args.eval_batch_size)
        id_list = np.append(id_list, B)
        with autocast():
            for i in range(len(id_list) - 1):
                outputs[id_list[i]:id_list[i + 1], :, :] = model(
                    x[id_list[i]:id_list[i + 1]],
                    None,
                    dec_inp[id_list[i]:id_list[i + 1]],
                    None
                )
        print("Shape of output eval before choosing", outputs.shape)
        f_dim = -1 if args.features == 'MS' else 0
        outputs = outputs[:, -args.pred_len:, f_dim:]
        pred = outputs
        true = torch.from_numpy(np.array(y)).to(device)
        true = true[:, -args.pred_len:, f_dim:]
        print("Shape of y eval", true.shape)
        batch_y_mark = torch.ones(true.shape).to(device)

        loss = criterion(pred, true)

    model.train()
    return loss

class FlattenHead(nn.Module):
    """Output head: flatten the last two dimensions, project linearly, apply dropout."""

    def __init__(self, n_vars, nf, target_window, head_dropout=0):
        """
        :param n_vars: number of variables; stored only, not used in ``forward``.
        :param nf: size of the flattened last two dimensions (linear input size).
        :param target_window: number of output steps (linear output size).
        :param head_dropout: dropout probability applied after the projection.
        """
        super(FlattenHead, self).__init__()
        self.n_vars = n_vars
        self.flatten = nn.Flatten(start_dim=-2)
        self.linear = nn.Linear(nf, target_window)
        self.dropout = nn.Dropout(head_dropout)

    def forward(self, x):
        """Map ``(..., a, b)`` with ``a * b == nf`` to ``(..., target_window)``."""
        for stage in (self.flatten, self.linear, self.dropout):
            x = stage(x)
        return x


class Model(nn.Module,VertexModel):
    """Forecaster that runs patch embeddings through a frozen pretrained language model.

    The input series is normalized, split into patches, mapped onto a learned
    mixture of the language model's word embeddings (``ReprogrammingLayer``),
    prefixed with a textual prompt describing the input statistics, passed
    through the language model and projected to ``pred_len`` steps.
    """

    def __init__(self, configs, patch_len=16, stride=8):
        """
        :param configs: configuration object. Fields read here: ``task_name``,
            ``pred_len``, ``seq_len``, ``d_ff``, ``llm_dim``, ``patch_len``,
            ``stride``, ``llm_model`` ('LLAMA', 'GPT2' or 'BERT'),
            ``llm_layers``, ``prompt_domain``, ``content`` (only when
            ``prompt_domain`` is truthy), ``dropout``, ``d_model``, ``n_heads``
            and ``enc_in``.
        :param patch_len: unused; the value from ``configs`` is taken instead.
        :param stride: unused; the value from ``configs`` is taken instead.
        :raises Exception: if ``configs.llm_model`` is not a supported name.
        :raises NotImplementedError: if ``configs.task_name`` is not a forecast task.
        """
        nn.Module.__init__(self)
        VertexModel.__init__(self)
        self.task_name = configs.task_name
        self.pred_len = configs.pred_len
        self.seq_len = configs.seq_len
        self.d_ff = configs.d_ff
        self.top_k = 5
        self.d_llm = configs.llm_dim
        self.patch_len = configs.patch_len
        self.stride = configs.stride
        self.args = configs

        # name -> (config class, model class, tokenizer class, hub id,
        #          attribute under which the config is kept on the instance)
        llm_registry = {
            'LLAMA': (LlamaConfig, LlamaModel, LlamaTokenizer,
                      'huggyllama/llama-7b', 'llama_config'),
            'GPT2': (GPT2Config, GPT2Model, GPT2Tokenizer,
                     'openai-community/gpt2', 'gpt2_config'),
            'BERT': (BertConfig, BertModel, BertTokenizer,
                     'google-bert/bert-base-uncased', 'bert_config'),
        }
        if configs.llm_model not in llm_registry:
            raise Exception('LLM model is not defined')
        config_cls, model_cls, tokenizer_cls, hub_id, config_attr = llm_registry[configs.llm_model]

        llm_config = config_cls.from_pretrained(hub_id)
        llm_config.num_hidden_layers = configs.llm_layers
        llm_config.output_attentions = True
        llm_config.output_hidden_states = True
        setattr(self, config_attr, llm_config)

        def load_local_first(loader_cls, not_found_message, **kwargs):
            # Try the local files first; fall back to downloading when they are missing.
            try:
                return loader_cls.from_pretrained(
                    hub_id, trust_remote_code=True, local_files_only=True, **kwargs)
            except EnvironmentError:
                print(not_found_message)
                return loader_cls.from_pretrained(
                    hub_id, trust_remote_code=True, local_files_only=False, **kwargs)

        self.llm_model = load_local_first(
            model_cls, "Local model files not found. Attempting to download...",
            config=llm_config)
        self.tokenizer = load_local_first(
            tokenizer_cls, "Local tokenizer files not found. Atempting to download them..")

        # Prompts are padded to a common length, so the tokenizer needs a pad token.
        if self.tokenizer.eos_token:
            self.tokenizer.pad_token = self.tokenizer.eos_token
        else:
            pad_token = '[PAD]'
            self.tokenizer.add_special_tokens({'pad_token': pad_token})
            self.tokenizer.pad_token = pad_token

        # The language model stays frozen; only the layers created below are trained.
        for param in self.llm_model.parameters():
            param.requires_grad = False

        if configs.prompt_domain:
            self.description = configs.content
        else:
            self.description = 'The Electricity Transformer Temperature (ETT) is a crucial indicator in the electric power long-term deployment.'

        self.dropout = nn.Dropout(configs.dropout)

        self.patch_embedding = PatchEmbedding(
            configs.d_model, self.patch_len, self.stride, configs.dropout)

        self.word_embeddings = self.llm_model.get_input_embeddings().weight
        self.vocab_size = self.word_embeddings.shape[0]
        # The vocabulary is compressed to this many learned prototype embeddings.
        self.num_tokens = 1000
        self.mapping_layer = nn.Linear(self.vocab_size, self.num_tokens)

        self.reprogramming_layer = ReprogrammingLayer(configs.d_model, configs.n_heads, self.d_ff, self.d_llm)

        self.patch_nums = int((configs.seq_len - self.patch_len) / self.stride + 2)
        self.head_nf = self.d_ff * self.patch_nums

        if self.task_name == 'long_term_forecast' or self.task_name == 'short_term_forecast':
            self.output_projection = FlattenHead(configs.enc_in, self.head_nf, self.pred_len,
                                                 head_dropout=configs.dropout)
        else:
            raise NotImplementedError

        self.normalize_layers = Normalize(configs.enc_in, affine=False)

    def forward(self, x_enc, x_mark_enc, x_dec, x_mark_dec, mask=None):
        """Return the last ``pred_len`` forecast steps, or ``None`` for other tasks.

        Only ``x_enc`` influences the result; the remaining arguments are passed
        to ``forecast`` (which does not use them) and ``mask`` is ignored.
        """
        if self.task_name == 'long_term_forecast' or self.task_name == 'short_term_forecast':
            dec_out = self.forecast(x_enc, x_mark_enc, x_dec, x_mark_dec)
            return dec_out[:, -self.pred_len:, :]
        return None

    def forecast(self, x_enc, x_mark_enc, x_dec, x_mark_dec):
        """Forecast from ``x_enc`` of shape (batch, time, variables).

        ``x_mark_enc``, ``x_dec`` and ``x_mark_dec`` are accepted but not used.
        """
        x_enc = self.normalize_layers(x_enc, 'norm')

        B, T, N = x_enc.size()
        # Treat every variable of every sample as its own univariate series.
        x_enc = x_enc.permute(0, 2, 1).contiguous().reshape(B * N, T, 1)

        min_values = torch.min(x_enc, dim=1)[0]
        max_values = torch.max(x_enc, dim=1)[0]
        medians = torch.median(x_enc, dim=1).values
        lags = self.calcute_lags(x_enc)
        # Sum of first differences, i.e. last value minus first value.
        trends = x_enc.diff(dim=1).sum(dim=1)

        prompt = []
        for b in range(x_enc.shape[0]):
            min_values_str = str(min_values[b].tolist()[0])
            max_values_str = str(max_values[b].tolist()[0])
            median_values_str = str(medians[b].tolist()[0])
            lags_values_str = str(lags[b].tolist())
            prompt_ = (
                f"<|start_prompt|>Dataset description: {self.description}"
                f"Task description: forecast the next {str(self.pred_len)} steps given the previous {str(self.seq_len)} steps information; "
                "Input statistics: "
                f"min value {min_values_str}, "
                f"max value {max_values_str}, "
                f"median value {median_values_str}, "
                f"the trend of input is {'upward' if trends[b] > 0 else 'downward'}, "
                f"top 5 lags are : {lags_values_str}<|<end_prompt>|>"
            )

            prompt.append(prompt_)

        x_enc = x_enc.reshape(B, N, T).permute(0, 2, 1).contiguous() # B, T, N

        prompt = self.tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=2048).input_ids
        prompt_embeddings = self.llm_model.get_input_embeddings()(prompt.to(x_enc.device))  # (batch, prompt_token, dim)

        # (vocab, d_llm) -> (num_tokens, d_llm): learned mixture of word embeddings.
        source_embeddings = self.mapping_layer(self.word_embeddings.permute(1, 0)).permute(1, 0)

        x_enc = x_enc.permute(0, 2, 1).contiguous() # B N T
        # NOTE(review): the bfloat16 cast presumably relies on the surrounding
        # autocast context to match the float32 layer weights — confirm.
        enc_out, n_vars = self.patch_embedding(x_enc.to(torch.bfloat16))
        enc_out = self.reprogramming_layer(enc_out, source_embeddings, source_embeddings)
        llama_enc_out = torch.cat([prompt_embeddings, enc_out], dim=1)
        dec_out = self.llm_model(inputs_embeds=llama_enc_out).last_hidden_state
        dec_out = dec_out[:, :, :self.d_ff]

        dec_out = torch.reshape(
            dec_out, (-1, n_vars, dec_out.shape[-2], dec_out.shape[-1]))
        dec_out = dec_out.permute(0, 1, 3, 2).contiguous()

        # Only the trailing patch positions (not the prompt tokens) feed the head.
        dec_out = self.output_projection(dec_out[:, :, :, -self.patch_nums:])
        dec_out = dec_out.permute(0, 2, 1).contiguous()

        dec_out = self.normalize_layers(dec_out, 'denorm')

        return dec_out

    @vertexai.preview.developer.mark.train()
    def train_model(self, train_loader, vali_loader, criterion, path):
        """Train the model with mixed precision and gradient accumulation.

        :param train_loader: yields ``(batch_x, batch_y, batch_x_mark, batch_y_mark)``;
            only ``batch_x`` and ``batch_y`` are used.
        :param vali_loader: passed to ``test_MS`` after every epoch.
        :param criterion: loss called as ``criterion(outputs, batch_y)``.
        :param path: directory handed to ``EarlyStopping`` for model saving;
            created if it does not exist.
        """
        torch.set_printoptions(profile="full")
        import torch.multiprocessing as mp
        mp.set_start_method('spawn', force=True)

        if not os.path.exists(path):
            os.makedirs(path)

        self.args.content = load_content(self.args)
        time_now = time.time()

        train_steps = len(train_loader)
        early_stopping = EarlyStopping(accelerator=None, patience=self.args.patience)

        trained_parameters = [p for p in self.parameters() if p.requires_grad]
        model_optim = optim.Adam(trained_parameters, lr=self.args.learning_rate)

        if self.args.lradj == 'COS':
            scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(model_optim, T_max=20, eta_min=1e-8)
        else:
            scheduler = lr_scheduler.OneCycleLR(optimizer=model_optim,
                                                steps_per_epoch=train_steps,
                                                pct_start=self.args.pct_start,
                                                epochs=self.args.train_epochs,
                                                max_lr=self.args.learning_rate)

        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.to(device)
        print(f'device{device}')
        scaler = GradScaler()
        accumulation_steps = 4

        for epoch in range(self.args.train_epochs):
            iter_count = 0
            train_loss = []

            self.train()
            epoch_time = time.time()
            # Gradients are cleared once here and after every optimizer step.
            # Clearing them on every iteration would discard the accumulation.
            model_optim.zero_grad()

            for i, (batch_x, batch_y, batch_x_mark, batch_y_mark) in enumerate(train_loader):
                iter_count += 1
                batch_x = batch_x.float().to(device)
                batch_y = batch_y.float().to(device)

                dec_inp = torch.zeros_like(batch_y[:, -self.args.pred_len:, :]).float().to(device)
                dec_inp = torch.cat([batch_y[:, :self.args.label_len, :], dec_inp], dim=1).float().to(device)

                with autocast():
                    outputs = self(batch_x, None, dec_inp, None)

                # 'MS' keeps only the last variable (the target); otherwise keep all.
                f_dim = -1 if self.args.features == 'MS' else 0
                outputs = outputs[:, -self.args.pred_len:, f_dim:]
                batch_y = batch_y[:, -self.args.pred_len:, f_dim:]

                loss = criterion(outputs, batch_y)
                # Only the backward pass uses the down-scaled loss, so the logged
                # and averaged values stay on the scale of `criterion`.
                scaler.scale(loss / accumulation_steps).backward()

                # Step every `accumulation_steps` batches and also on the last
                # batch, so a trailing partial group is not thrown away (it is
                # still divided by `accumulation_steps`).
                if (i + 1) % accumulation_steps == 0 or (i + 1) == train_steps:
                    scaler.step(model_optim)
                    scaler.update()
                    model_optim.zero_grad()

                train_loss.append(loss.item())

                if (i + 1) % 100 == 0:
                    print(
                        "\titers: {0}, epoch: {1} | loss: {2:.7f}".format(i + 1, epoch + 1, loss.item())
                    )
                    speed = (time.time() - time_now) / iter_count
                    left_time = speed * ((self.args.train_epochs - epoch) * train_steps - i)
                    print('\tspeed: {:.4f}s/iter; left time: {:.4f}s'.format(speed, left_time))
                    iter_count = 0
                    time_now = time.time()

                if self.args.lradj == 'TST':
                    adjust_learning_rate(None, model_optim, scheduler, epoch + 1, self.args, printout=False)
                    scheduler.step()

            print("Epoch: {} cost time: {}".format(epoch + 1, time.time() - epoch_time))
            train_loss = np.average(train_loss)
            print('########################################################################')
            vali_loss = test_MS(self.args, device, self, train_loader, vali_loader, criterion)
            # No separate test split is evaluated; the validation loss is reported twice.
            test_loss = vali_loss
            print(
                "Epoch: {0}, Steps: {1} | Train Loss: {2:.7f} Vali Loss: {3:.7f} Test Loss: {4:.7f}".format(
                    epoch + 1, train_steps, train_loss, vali_loss, test_loss))

            early_stopping(vali_loss, self, path)  # model saving
            if early_stopping.early_stop:
                print("Early stopping")
                break

            if self.args.lradj != 'TST':
                adjust_learning_rate(None, model_optim, scheduler, epoch + 1, self.args, printout=True)
            else:
                print('Updating learning rate to {}'.format(scheduler.get_last_lr()[0]))

        # NOTE(review): written to the working directory rather than `path` — confirm.
        torch.save(self.state_dict(), './checkpoint')

    def calcute_lags(self, x_enc):
        """Return the ``top_k`` lags with the highest mean autocorrelation.

        The autocorrelation is obtained from the FFT of the series multiplied by
        its own complex conjugate.
        """
        # Query and key are the same series, so one transform serves both.
        series_fft = torch.fft.rfft(x_enc.permute(0, 2, 1).contiguous(), dim=-1)
        res = series_fft * torch.conj(series_fft)
        corr = torch.fft.irfft(res, dim=-1)
        mean_value = torch.mean(corr, dim=1)
        _, lags = torch.topk(mean_value, self.top_k, dim=-1)
        return lags



class ReprogrammingLayer(nn.Module):
    """Multi-head cross-attention from patch embeddings (queries) to a shared
    set of source embeddings (keys/values), projected to ``d_llm`` features."""

    def __init__(self, d_model, n_heads, d_keys=None, d_llm=None, attention_dropout=0.1):
        """
        :param d_model: feature size of the target (query) embeddings.
        :param n_heads: number of attention heads.
        :param d_keys: per-head size; defaults to ``d_model // n_heads``.
        :param d_llm: feature size of the source embeddings and of the output.
        :param attention_dropout: dropout probability on the attention weights.
        """
        super().__init__()

        if not d_keys:
            d_keys = d_model // n_heads
        inner_dim = d_keys * n_heads

        self.query_projection = nn.Linear(d_model, inner_dim)
        self.key_projection = nn.Linear(d_llm, inner_dim)
        self.value_projection = nn.Linear(d_llm, inner_dim)
        self.out_projection = nn.Linear(inner_dim, d_llm)
        self.n_heads = n_heads
        self.dropout = nn.Dropout(attention_dropout)

    def forward(self, target_embedding, source_embedding, value_embedding):
        """Attend from ``target_embedding`` (B, L, d_model) to ``source_embedding``
        / ``value_embedding`` (S, d_llm) and return a (B, L, d_llm) tensor."""
        batch, length, _ = target_embedding.shape
        n_source, _ = source_embedding.shape
        heads = self.n_heads

        # Split every projection into heads.
        queries = self.query_projection(target_embedding).view(batch, length, heads, -1)
        keys = self.key_projection(source_embedding).view(n_source, heads, -1)
        values = self.value_projection(value_embedding).view(n_source, heads, -1)

        attended = self.reprogramming(queries, keys, values)
        merged = attended.reshape(batch, length, -1)
        return self.out_projection(merged)

    def reprogramming(self, target_embedding, source_embedding, value_embedding):
        """Scaled dot-product attention over the source axis, per head."""
        _, _, _, head_dim = target_embedding.shape
        scale = 1. / sqrt(head_dim)

        scores = torch.einsum("blhe,she->bhls", target_embedding, source_embedding)
        attention = self.dropout(torch.softmax(scale * scores, dim=-1))
        return torch.einsum("bhls,she->blhe", attention, value_embedding)


In [3]:
class Args:
    """Plain container for the notebook's run configuration.

    Every setting is an instance attribute so that later cells can override
    individual values on the shared instance.
    """

    def __init__(self):
        # --- run identity ---------------------------------------------------
        self.task_name = 'short_term_forecast'
        self.is_training = 1
        self.model_id = 'promo_ean_channel'
        self.model_comment = 'pretrain-EAN_Channel'
        self.model = 'TimeLLM'
        self.des = 'Exp'
        self.itr = 1
        self.seed = 2021

        # --- data location and loading ---------------------------------------
        self.data = 'promo_ean_channel'
        self.root_path = './dataset'
        self.data_path = 'train.csv'
        self.checkpoints = './checkpoints/'
        self.loader = 'modal'
        self.features = 'MS'
        self.target = 'sold_units'
        self.freq = 'h'
        self.seasonal_patterns = 'Monthly'
        self.percent = 100
        self.num_workers = 10
        self.batch_size = 1
        self.eval_batch_size = 1

        # --- dataset preparation ---------------------------------------------
        # NOTE(review): data_provider also reads `args.scale_path`, which is not
        # set here — presumably assigned in a later cell; confirm.
        self.scale = True
        self.embedding = True
        self.embedding_dimension = 2
        self.zero_percent = 0
        self.interpolation = False
        self.interpolation_method = False
        self.fill_discontinuity = True
        self.keep_non_promo = False
        self.channel = 'Offline'
        self.month = 11
        self.num_weeks = 48

        # --- window sizes ----------------------------------------------------
        self.seq_len = 34
        self.label_len = 17
        self.pred_len = 17
        self.patch_len = 1
        self.stride = 1

        # --- language model backbone -----------------------------------------
        self.llm_model = 'GPT2'
        self.llm_dim = 768
        self.llm_layers = 16
        self.prompt_domain = 0

        # --- model dimensions ------------------------------------------------
        self.enc_in = 7
        self.dec_in = 7
        self.c_out = 7
        self.d_model = 16
        self.n_heads = 8
        self.e_layers = 2
        self.d_layers = 1
        self.d_ff = 32
        self.factor = 1
        self.moving_avg = 1
        self.dropout = 0.1
        self.embed = 'timeF'
        self.activation = 'gelu'
        self.output_attention = False

        # --- optimisation ----------------------------------------------------
        self.pretrain = False
        self.train_epochs = 2
        self.align_epochs = 10
        self.patience = 20
        # NOTE(review): 100 is used both as the Adam learning rate and as the
        # OneCycleLR max_lr in Model.train_model — looks like a debugging value; confirm.
        self.learning_rate = 100
        self.lradj = 'type1'
        self.pct_start = 0.2
        self.loss = 'MSE'
        self.use_amp = False
# Build the shared configuration object; later cells read it and override
# individual fields (e.g. the window lengths) on this instance.
args = Args()

# Train

## Load data from GCP

In [15]:
import torch
from torch import optim
from torch.optim import lr_scheduler



import time
import random
import numpy as np
import pandas as pd

from utils.losses import smape_loss
import os

# NOTE(review): an empty CURL_CA_BUNDLE is commonly used to make HTTP clients
# skip TLS certificate verification (presumably a proxy workaround here).
# Confirm it is still required, because it weakens transport security.
os.environ['CURL_CA_BUNDLE'] = ''
# Configure the PyTorch CUDA caching allocator with max_split_size_mb = 64.
# NOTE(review): presumably meant to reduce memory fragmentation; the setting is
# expected to be in place before CUDA is first used — confirm.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:64"

from utils.tools import del_files, EarlyStopping, adjust_learning_rate, load_content

from ean_global_channel import import_true_promo, import_all, check_saved_standardization_data, delete_saved_standardization_data
from google.cloud import bigquery

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (MAPE), expressed in percent.

    Args:
        y_true: Array of observed values (supports NumPy arithmetic).
        y_pred: Array of predicted values, broadcastable against ``y_true``.

    Returns:
        The mean of ``|y_true - y_pred| / |y_true|`` multiplied by 100.

    Note:
        Every error is divided by the matching ``y_true`` entry, so a zero in
        ``y_true`` produces a non-finite result.
    """
    relative_error = (y_true - y_pred) / y_true
    return np.mean(np.abs(relative_error)) * 100

def symmetric_mean_absolute_percentage_error(y_true, y_pred):
    """Return the symmetric mean absolute percentage error (sMAPE), in percent.

    Each term is ``2 * |y_true - y_pred| / (|y_true| + |y_pred|)``.

    Args:
        y_true: Array-like of observed values.
        y_pred: Array-like of predicted values, broadcastable against ``y_true``.

    Returns:
        The mean of the per-element terms multiplied by 100.

    Note:
        When an observation and its prediction are both zero the term is 0/0.
        That is a perfect prediction, so it is counted as zero error instead of
        NaN (which would otherwise turn the whole mean into NaN).
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    numerator = 2.0 * np.abs(y_true - y_pred)
    denominator = np.abs(y_true) + np.abs(y_pred)

    # Divide only where the denominator is non-zero; the remaining positions
    # keep the pre-filled 0.0 from `out`.
    ratio = np.divide(
        numerator,
        denominator,
        out=np.zeros_like(numerator),
        where=denominator != 0,
    )
    return np.mean(ratio) * 100


# Seed the NumPy, PyTorch and Python random number generators with one shared
# value so that repeated runs of the notebook draw the same random numbers.
fix_seed = 2021
np.random.seed(fix_seed)
torch.manual_seed(fix_seed)
random.seed(fix_seed)



import vertexai

# --- Google Cloud settings -------------------------------------------------
PROJECT_ID = "itg-bpma-gbl-ww-np"  # @param {type:"string"}
REGION = "europe-west1"
BUCKET_URI = f"gs://your-bucket-name-{PROJECT_ID}-unique"  # @param {type:"string"}
REMOTE_JOB_NAME = "timeseriesllm1"
# Staging location for the remote job: <bucket>/<job name>.
REMOTE_JOB_BUCKET = "/".join((BUCKET_URI, REMOTE_JOB_NAME))

# --- Vertex AI SDK ---------------------------------------------------------
vertexai.init(project=PROJECT_ID, location=REGION, staging_bucket=REMOTE_JOB_BUCKET)

# --- BigQuery client (queries are run in PROJECT_ID) -----------------------
bq_client = bigquery.Client(project=PROJECT_ID)

#################################################################################################
# Forecast horizon; every other window size below is derived from it.
pred_len = 17

args.pred_len = args.label_len = pred_len  # horizon and decoder label window
args.seq_len = 2 * pred_len                # input window: twice the horizon
args.num_weeks = 4 * pred_len              # weeks of history: four times the horizon
# print(f"{args.seq_len}")
#print("Let's Load the Data From GCP")
# if args.interpolation:
#     final_data, train_set, test_set, pred_len = import_all(
#         client=bq_client,
#         zero_percent=args.zero_percent,
#         month=args.month,
#         num_weeks=args.num_weeks,
#         channel=args.channel,
#         fill_discontinuity=args.fill_discontinuity,
#         keep_non_promo=args.keep_non_promo,
#         interpolation_method=args.interpolation_method
#     )
# else :
#     final_data, train_set, test_set, pred_len = import_true_promo(
#         client=bq_client,
#         zero_percent=args.zero_percent,
#         month=args.month,
#         num_weeks=args.num_weeks,
#         channel=args.channel,
#         fill_discontinuity=args.fill_discontinuity,
#         keep_non_promo=args.keep_non_promo
#     )

# Experiment identifier: task, model, data set and the main hyper-parameters.
setting = (
    f"{args.task_name}_{args.model_id}_{args.model}_{args.data}"
    f"_ft{args.features}_sl{args.seq_len}_ll{args.label_len}_pl{args.pred_len}"
    f"_dm{args.d_model}_nh{args.n_heads}_el{args.e_layers}_dl{args.d_layers}"
    f"_df{args.d_ff}_fc{args.factor}_eb{args.embed}_{args.des}"
)

# Dataset directory layout:
#   dataset/<source>/<channel, month, weeks + option suffixes>/<setting>
source_dir = (
    f"interpolation_{args.interpolation_method}/" if args.interpolation else "true_promo/"
)
# (flag, suffix) pairs; a suffix is appended, in this order, when its flag is set.
option_suffixes = [
    (args.fill_discontinuity, "_filldiscont"),
    (args.keep_non_promo, "_keepnonpromo"),
    (args.scale, "_scaled"),
    (args.embedding, f"_embedding_{args.embedding_dimension}"),
]
base_dir = (
    "dataset/"
    + source_dir
    + f"{args.channel}Channel_Month{args.month}_{args.num_weeks}Weeks"
    + "".join(suffix for enabled, suffix in option_suffixes if enabled)
    + "/"
    + setting
)
os.makedirs(base_dir, exist_ok=True)

# Read the prepared train/test splits from the dataset directory.
train_path, test_path = (
    os.path.join(base_dir, file_name) for file_name in ("train.csv", "test.csv")
)
train_set = pd.read_csv(train_path)
test_set = pd.read_csv(test_path)

if args.scale:
    # Mirror the dataset layout under 'scale_path/' by dropping the leading
    # "dataset/" prefix from base_dir.
    args.scale_path = 'scale_path/' + base_dir[len("dataset/"):]
    # Remove standardization data left at that location by an earlier run.
    if check_saved_standardization_data(args.scale_path):
        delete_saved_standardization_data(args.scale_path)


########################################################### configuration ####################
# Point the data loader at the prepared dataset directory ...
args.data_path = 'train.csv'
args.root_path = base_dir
# ... and (re)apply the horizon-derived window sizes.
args.seq_len = int(2 * pred_len)
args.label_len = args.pred_len = pred_len
##############################################################################################
#print(f"{args.seq_len}")

# One pass per experiment repetition. `ii`, `setting` and `path` stay defined
# after the loop and are read by later cells.
for ii in range(args.itr):
    # Experiment record: the base setting string plus the repetition index.
    setting = (
        f"{args.task_name}_{args.model_id}_{args.model}_{args.data}"
        f"_ft{args.features}_sl{args.seq_len}_ll{args.label_len}_pl{args.pred_len}"
        f"_dm{args.d_model}_nh{args.n_heads}_el{args.e_layers}_dl{args.d_layers}"
        f"_df{args.d_ff}_fc{args.factor}_eb{args.embed}_{args.des}_{ii}"
    )

    # Unique checkpoint directory: base_dir without its leading "dataset/"
    # prefix, tagged with the repetition index and the model comment.
    path = os.path.join(
        args.checkpoints,
        base_dir[len("dataset/"):] + '_' + str(ii) + '-' + args.model_comment,
    )
    args.content = load_content(args)
    # exist_ok=True replaces the racy "check, then create" pair and matches
    # how base_dir itself is created.
    os.makedirs(path, exist_ok=True)



In [16]:
# Build one (data, loader) pair per split from the shared configuration.
# NOTE(review): data_provider is defined outside this section; presumably it
# returns a Dataset and its DataLoader — confirm.
train_data, train_loader = data_provider(args, 'train')
test_data, test_loader = data_provider(args, 'test')

standarization dictionaries are created inscale_path/true_promo/OfflineChannel_Month11_68Weeks_filldiscont_scaled_embedding_2/short_term_forecast_promo_ean_channel_TimeLLM_promo_ean_channel_ftMS_sl34_ll17_pl17_dm16_nh8_el2_dl1_df32_fc1_ebtimeF_Exp
scaling the data of train
standarization is over of train
scaling the data of test
standarization is over of test


In [17]:
# Length of the training loader (assuming a torch DataLoader, this is the
# number of batches per epoch).
len(train_loader)

10

### Change lr (learning rate) and epochs!

In [18]:
# Override the values set in Args (learning_rate = 100, train_epochs = 2)
# before the model is built and trained.
args.learning_rate = 0.01
args.train_epochs = 5

In [19]:
# Re-seed every RNG right before constructing the model so that its initial
# weights are the same on every run.
fix_seed = 2021
random.seed(fix_seed)
torch.manual_seed(fix_seed)
np.random.seed(fix_seed)
model = Model(args).float()
# Training loss.
criterion = nn.MSELoss()
if args.pretrain:
    # Name of the pretraining run whose checkpoint should be loaded.
    # NOTE: `ii` is the loop variable left over from the experiment loop in an
    # earlier cell, so that cell must have been executed first.
    setting = '{}_{}_{}_{}_ft{}_sl{}_ll{}_pl{}_dm{}_nh{}_el{}_dl{}_df{}_fc{}_eb{}_{}_{}'.format(
        args.task_name,
        'pretrain',
        args.model,
        'pretrain',
        args.features,
        args.seq_len,
        args.label_len,
        args.pred_len,
        args.d_model,
        args.n_heads,
        args.e_layers,
        args.d_layers,
        args.d_ff,
        args.factor,
        args.embed,
        args.des, ii)
    pretrain_path = os.path.join(args.checkpoints,
                                 'pretrain/' + setting+'/checkpoint_pretrain')
    if not os.path.exists(pretrain_path):
        # Raising a plain string is itself a TypeError in Python 3; raise a
        # real exception that also reports the path that was looked up.
        raise FileNotFoundError(f"can't find pretrained path: {pretrain_path}")
    # strict=False: keys that do not match between checkpoint and model are
    # ignored instead of raising.
    model.load_state_dict(torch.load(pretrain_path), strict=False)
    print(f"pretrained model was loaded from{pretrain_path}")

In [20]:
# Switch the Vertex AI preview SDK into remote mode.
# NOTE(review): presumably this is what makes the train_model call below run
# as a remote training job rather than locally — confirm.
vertexai.preview.init(remote=True)
# Remote job settings for train_model: custom training container image,
# CUDA enabled, four accelerators.
model.train_model.vertex.remote_config.container_uri = "europe-west1-docker.pkg.dev/itg-bpma-gbl-ww-np/timeseriesforecasting/torch-train:latest"
model.train_model.vertex.remote_config.enable_cuda = True
model.train_model.vertex.remote_config.accelerator_count = 4
# Launch training with the train/test loaders, the loss and the checkpoint dir.
model.train_model(train_loader, test_loader, criterion, path)
# Save the model's state dict as '<path>/checkpoint'.
# NOTE(review): verify that the local `model` already holds the trained weights
# when this line runs; otherwise the saved checkpoint contains untrained
# parameters.
torch.save(model.state_dict(),  path + '/' + 'checkpoint')

Remote job created. View the job: https://console.cloud.google.com/ai/platform/locations/europe-west1/training/6728740269811826688?project=238069609727


In [37]:
# Print the first row of two selected trainable parameters of `model`.
watched_layers = ('mapping_layer.weight', 'reprogramming_layer.query_projection.weight')
for name, param in model.named_parameters():
    if param.requires_grad and name in watched_layers:
        print(f"Layer: {name} | Size: {param.size()} | Values : {param[:1]} \n")


Layer: mapping_layer.weight | Size: torch.Size([1000, 50257]) | Values : tensor([[0.0032, 0.0033, 0.0024,  ..., 0.0037, 0.0007, 0.0014]],
       grad_fn=<SliceBackward0>) 

Layer: reprogramming_layer.query_projection.weight | Size: torch.Size([256, 16]) | Values : tensor([[ 0.0243,  0.0304,  0.2257, -0.0924, -0.2241, -0.0075,  0.1639, -0.0313,
          0.1483,  0.1348, -0.1936,  0.1491, -0.1729, -0.1193, -0.1242,  0.2027]],
       grad_fn=<SliceBackward0>) 



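Check the model's initial parameters: re-seed, rebuild the model, and print the same layers so they can be compared with the values printed above.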

In [15]:
# Re-seed all RNGs, build a fresh model and print the first row of two of its
# trainable parameters, i.e. their values straight after initialisation.
fix_seed = 2021
np.random.seed(fix_seed)
torch.manual_seed(fix_seed)
random.seed(fix_seed)
inspected_layers = ('output_projection.linear.bias', 'reprogramming_layer.query_projection.weight')
for name, param in Model(args).float().named_parameters():
    if param.requires_grad and name in inspected_layers:
        print(f"Layer: {name} | Size: {param.size()} | Values : {param[:1]} \n")

Layer: reprogramming_layer.query_projection.weight | Size: torch.Size([256, 16]) | Values : tensor([[ 0.0243,  0.0304,  0.2257, -0.0924, -0.2241, -0.0075,  0.1639, -0.0313,
          0.1483,  0.1348, -0.1936,  0.1491, -0.1729, -0.1193, -0.1242,  0.2027]],
       grad_fn=<SliceBackward0>) 

Layer: output_projection.linear.bias | Size: torch.Size([17]) | Values : tensor([-0.0210], grad_fn=<SliceBackward0>) 

