In [21]:

import argparse
import torch
from accelerate import Accelerator, DeepSpeedPlugin
from accelerate import DistributedDataParallelKwargs
from torch import nn, optim
from torch.optim import lr_scheduler

from time_llm.data_provider_pretrain.data_factory import data_provider
from time_llm.models import Autoformer, DLinear
import model_predefines

import time
import random
import numpy as np
import os

from time_llm.utils.tools import del_files, EarlyStopping, adjust_learning_rate, vali, load_content


In [None]:

# os.environ['CURL_CA_BUNDLE'] = ''
# os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:64"


# Kept as a module-level name; the configuration in this notebook is assigned
# attribute-by-attribute rather than parsed from the command line.
parser = argparse.ArgumentParser(description='Time-LLM')

# Seed Python's `random`, PyTorch and NumPy (in that order) with the same fixed
# value so repeated runs draw the same random numbers.
fix_seed = 2021
for _seed_rng in (random.seed, torch.manual_seed, np.random.seed):
    _seed_rng(fix_seed)
del _seed_rng  # do not leave the loop helper behind in the notebook namespace


In [None]:
# Experiment configuration.
#
# A plain dict does not support attribute assignment (`args.x = ...` raises
# AttributeError), so the settings live on an argparse.Namespace, which is also
# what the training cell's `args.<name>` lookups expect.
args = argparse.Namespace()

# basic config
args.task_name = 'long_term_forecast'  # options: long_term_forecast, short_term_forecast, imputation, classification, anomaly_detection
args.is_training = 1  # status
args.model_id = 'test'  # model id
args.model_comment = 'none'  # prefix when saving test results
args.model = 'Autoformer'  # model name, options: Autoformer, DLinear
args.seed = 2021  # random seed

# data loader
args.data_pretrain = 'ETTm1'  # dataset type used for the train/val loaders
args.data = 'ETTm1'  # dataset type used for the test loader
# NOTE(review): 'ETTh1' looks like a data-file name rather than a root directory;
# value kept as found — confirm what data_provider expects here.
args.root_path = 'ETTh1'
# The training cell passes args.data_path to data_provider for the test split,
# but it was never defined; mirrors data_path_pretrain — TODO confirm.
args.data_path = 'ETTh1'  # data file (test split)
args.data_path_pretrain = 'ETTh1'  # data file (train/val splits)
args.features = 'M'  # forecasting task: M = multivariate->multivariate, S = univariate->univariate, MS = multivariate->univariate
args.target = 'OT'  # target feature in S or MS task
args.loader = 'modal'  # dataset type
args.freq = 'h'  # time-feature frequency: s, t, h, d, b, w, m, or finer such as 15min / 3h
# Must be a string: the training cell feeds it to os.path.join(). Matches the
# './checkpoints' directory that the final cleanup step removes.
args.checkpoints = './checkpoints/'  # location of model checkpoints

# forecasting task
args.seq_len = 96  # input sequence length (was mistakenly stored in args.checkpoints)
args.label_len = 48  # start token length
args.pred_len = 96  # prediction sequence length
args.seasonal_patterns = 'Monthly'  # subset for M4

# model define
args.enc_in = 7  # encoder input size
args.dec_in = 7  # decoder input size
args.c_out = 7  # output size
args.d_model = 16  # dimension of model
args.n_heads = 8  # num of heads
args.e_layers = 2  # num of encoder layers
args.d_layers = 1  # num of decoder layers
args.d_ff = 32  # dimension of fcn
args.moving_avg = 25  # window size of moving average
args.factor = 1  # attn factor
# NOTE(review): 0 disables dropout; several float settings in this cell appear to
# have been truncated to 0 — confirm this one is intentional.
args.dropout = 0  # dropout
args.embed = 'timeF'  # time features encoding, options: timeF, fixed, learned
args.activation = 'gelu'  # activation
# Used as a boolean flag by the training cell: when truthy the model output is
# indexed with [0]. It previously held 16, the patch length.
args.output_attention = False
args.patch_len = 16  # patch length
args.stride = 8  # stride
args.prompt_domain = 0
args.llm_model = 'LLAMA'  # LLM model
args.llm_dim = 4096  # LLM model dimension (int, not str) — LLama7b: 4096; GPT2-small: 768; BERT-base: 768

# optimization
args.num_workers = 10  # data loader num workers
args.itr = 1  # experiments times
args.train_epochs = 10  # train epochs
args.align_epochs = 10  # alignment epochs
args.batch_size = 32  # batch size of train input data
args.eval_batch_size = 8  # batch size of model evaluation
args.patience = 5  # early stopping patience
# A learning rate of 0 makes every optimizer step a no-op.
# NOTE(review): 1e-4 is an assumed replacement for the truncated value — confirm.
args.learning_rate = 0.0001  # optimizer learning rate
args.des = 'test'  # exp description
args.loss = 'MSE'  # loss function
args.lradj = 'type1'  # adjust learning rate
# OneCycleLR only accepts a float pct_start in [0, 1]; the integer 0 is rejected.
# NOTE(review): 0.2 is an assumed replacement for the truncated value — confirm.
args.pct_start = 0.2  # pct_start
args.use_amp = False  # use automatic mixed precision training
# NOTE(review): a boolean where a layer count is presumably expected — confirm
# against model_predefines.Model before relying on it.
args.llm_layers = False
args.percent = 100

In [None]:

# Pre-training driver: for each of `args.itr` repetitions build the data loaders
# and model, train for up to `args.train_epochs` epochs with early stopping on
# the validation loss, then remove the checkpoint directory.


def _forward_loss(model, criterion, args, batch_x, batch_x_mark, dec_inp, batch_y, batch_y_mark):
    """Run one forward pass and return the training loss.

    The model output (its first element when `args.output_attention` is truthy)
    and `batch_y` are both cut to the last `args.pred_len` steps; for the 'MS'
    feature mode only the last channel is kept. Shared by the AMP and non-AMP
    paths so the two cannot drift apart.
    """
    outputs = model(batch_x, batch_x_mark, dec_inp, batch_y_mark)
    if args.output_attention:
        outputs = outputs[0]

    f_dim = -1 if args.features == 'MS' else 0
    outputs = outputs[:, -args.pred_len:, f_dim:]
    target = batch_y[:, -args.pred_len:, f_dim:]
    return criterion(outputs, target)


ddp_kwargs = DistributedDataParallelKwargs(find_unused_parameters=True)
# deepspeed_plugin = DeepSpeedPlugin(hf_ds_config='./ds_config_zero2.json')
# accelerator = Accelerator(kwargs_handlers=[ddp_kwargs], deepspeed_plugin=deepspeed_plugin)
accelerator = Accelerator(kwargs_handlers=[ddp_kwargs])

for ii in range(args.itr):
    # setting record of experiments
    setting = '{}_{}_{}_{}_ft{}_sl{}_ll{}_pl{}_dm{}_nh{}_el{}_dl{}_df{}_fc{}_eb{}_{}_{}'.format(
        args.task_name,
        args.model_id,
        args.model,
        args.data,
        args.features,
        args.seq_len,
        args.label_len,
        args.pred_len,
        args.d_model,
        args.n_heads,
        args.e_layers,
        args.d_layers,
        args.d_ff,
        args.factor,
        args.embed,
        args.des, ii)

    # Train/val come from the pre-training dataset, test from the target dataset.
    train_data, train_loader = data_provider(args, args.data_pretrain, args.data_path_pretrain, True, 'train')
    vali_data, vali_loader = data_provider(args, args.data_pretrain, args.data_path_pretrain, True, 'val')
    test_data, test_loader = data_provider(args, args.data, args.data_path, False, 'test')

    model = model_predefines.Model(args).float()

    path = os.path.join(args.checkpoints, setting + '-' + args.model_comment)  # unique checkpoint saving path
    args.content = load_content(args)
    if accelerator.is_local_main_process:
        # exist_ok avoids the check-then-create race when the local main
        # processes of several nodes share one filesystem.
        os.makedirs(path, exist_ok=True)

    time_now = time.time()

    train_steps = len(train_loader)
    early_stopping = EarlyStopping(accelerator=accelerator, patience=args.patience)

    # Only parameters that require gradients are handed to the optimizer.
    trained_parameters = [p for p in model.parameters() if p.requires_grad]

    model_optim = optim.Adam(trained_parameters, lr=args.learning_rate)

    if args.lradj == 'COS':
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(model_optim, T_max=20, eta_min=1e-8)
    else:
        scheduler = lr_scheduler.OneCycleLR(optimizer=model_optim,
                                            steps_per_epoch=train_steps,
                                            pct_start=args.pct_start,
                                            epochs=args.train_epochs,
                                            max_lr=args.learning_rate)

    criterion = nn.MSELoss()
    mae_metric = nn.L1Loss()

    train_loader, vali_loader, test_loader, model, model_optim, scheduler = accelerator.prepare(
        train_loader, vali_loader, test_loader, model, model_optim, scheduler)

    if args.use_amp:
        scaler = torch.cuda.amp.GradScaler()

    for epoch in range(args.train_epochs):
        iter_count = 0
        train_loss = []

        model.train()
        epoch_time = time.time()
        for i, (batch_x, batch_y, batch_x_mark, batch_y_mark) in enumerate(train_loader):
            iter_count += 1
            model_optim.zero_grad()

            batch_x = batch_x.float().to(accelerator.device)
            batch_y = batch_y.float().to(accelerator.device)
            batch_x_mark = batch_x_mark.float().to(accelerator.device)
            batch_y_mark = batch_y_mark.float().to(accelerator.device)

            # decoder input: the first label_len steps of batch_y followed by
            # zeros shaped like its last pred_len steps
            dec_inp = torch.zeros_like(batch_y[:, -args.pred_len:, :]).float().to(
                accelerator.device)
            dec_inp = torch.cat([batch_y[:, :args.label_len, :], dec_inp], dim=1).float().to(
                accelerator.device)

            # encoder - decoder
            if args.use_amp:
                with torch.cuda.amp.autocast():
                    loss = _forward_loss(model, criterion, args,
                                         batch_x, batch_x_mark, dec_inp, batch_y, batch_y_mark)
            else:
                loss = _forward_loss(model, criterion, args,
                                     batch_x, batch_x_mark, dec_inp, batch_y, batch_y_mark)
            train_loss.append(loss.item())

            if (i + 1) % 100 == 0:
                accelerator.print(
                    "\titers: {0}, epoch: {1} | loss: {2:.7f}".format(i + 1, epoch + 1, loss.item()))
                # Average seconds per iteration since the last report, projected
                # over the iterations that remain.
                speed = (time.time() - time_now) / iter_count
                left_time = speed * ((args.train_epochs - epoch) * train_steps - i)
                accelerator.print('\tspeed: {:.4f}s/iter; left time: {:.4f}s'.format(speed, left_time))
                iter_count = 0
                time_now = time.time()

            if args.use_amp:
                scaler.scale(loss).backward()
                scaler.step(model_optim)
                scaler.update()
            else:
                accelerator.backward(loss)
                model_optim.step()

            # The 'TST' policy is the only one that advances the scheduler per batch.
            if args.lradj == 'TST':
                adjust_learning_rate(accelerator, model_optim, scheduler, epoch + 1, args, printout=False)
                scheduler.step()

        accelerator.print("Epoch: {} cost time: {}".format(epoch + 1, time.time() - epoch_time))
        train_loss = np.average(train_loss)
        vali_loss, vali_mae_loss = vali(args, accelerator, model, vali_data, vali_loader, criterion, mae_metric)
        test_loss, test_mae_loss = vali(args, accelerator, model, test_data, test_loader, criterion, mae_metric)
        accelerator.print(
            "Epoch: {0} | Train Loss: {1:.7f} Vali Loss: {2:.7f} Test Loss: {3:.7f} MAE Loss: {4:.7f}".format(
                epoch + 1, train_loss, vali_loss, test_loss, test_mae_loss))

        early_stopping(vali_loss, model, path)
        if early_stopping.early_stop:
            accelerator.print("Early stopping")
            break

        # Per-epoch learning-rate handling for every policy other than 'TST'.
        if args.lradj != 'TST':
            if args.lradj == 'COS':
                scheduler.step()
                accelerator.print("lr = {:.10f}".format(model_optim.param_groups[0]['lr']))
            else:
                if epoch == 0:
                    # Record the optimizer's current rate in args before
                    # adjust_learning_rate() is called with those args.
                    args.learning_rate = model_optim.param_groups[0]['lr']
                    accelerator.print("lr = {:.10f}".format(model_optim.param_groups[0]['lr']))
                adjust_learning_rate(accelerator, model_optim, scheduler, epoch + 1, args, printout=True)

        else:
            accelerator.print('Updating learning rate to {}'.format(scheduler.get_last_lr()[0]))

accelerator.wait_for_everyone()
if accelerator.is_local_main_process:
    # Clean up the directory the checkpoints were written under, rather than a
    # hard-coded './checkpoints' that may not match args.checkpoints.
    path = args.checkpoints
    del_files(path)  # delete checkpoint files
    accelerator.print('success delete checkpoints')