# # TimesNet 教程
# **设置说明：** 本笔记本提供了 `TimesNet` 支持的学习任务的教程。
# 
# `TimesNet` 基本上可以支持5种任务，分别是长期预测、短期预测、插值、异常检测和分类。

# ### 1. 安装 Python 3.8。为方便起见，执行以下命令。

In [None]:
pip install -r requirements.txt

### 2. Package Import

In [None]:
import torch    
import torch.nn as nn
import torch.nn.functional as F
import torch.fft
from layers.Embed import DataEmbedding
from layers.Conv_Blocks import Inception_Block_V1   
            #convolution block used for convoluting the 2D time data, changeable

# ### 3. TimesBlock 构造
#  `TimesNet` 的核心思想在于 `TimesBlock` 的构造。它通常通过对数据实施FFT来获取基本频率，然后分别从主要基本频率将时间序列重塑为2D变化，接着进行2D卷积，其输出被重塑回原形状并加权以形成最终输出。
# 
#  在接下来的部分，我们将详细查看 `TimesBlock`。
# 
#  TimesBlock 有两个成员。

In [None]:
class TimesBlock(nn.Module):
    """Skeleton of the TimesBlock module; its two members are shown in full in the following cells."""
    def __init__(self, configs):
        # Stores seq_len / pred_len / top_k from configs and builds the 2D convolution stack.
        ...
    
    def forward(self, x):
        # Folds x into 2D maps along its dominant periods, convolves them, and aggregates the results.
        ...

First, let's focus on ```__init__(self, configs):```

In [None]:
def __init__(self, configs):
    """Initialise a TimesBlock from ``configs``.

    Reads ``seq_len``, ``pred_len``, ``top_k``, ``d_model``, ``d_ff`` and
    ``num_kernels`` from ``configs``.
    """
    super(TimesBlock, self).__init__()
    # Number of dominant frequencies (periods) examined in forward().
    self.k = configs.top_k
    self.seq_len = configs.seq_len    # input sequence length
    self.pred_len = configs.pred_len  # prediction length

    # Parameter-efficient design: d_model -> d_ff -> d_model through two
    # inception blocks with a GELU in between.
    kernels = configs.num_kernels
    widen = Inception_Block_V1(configs.d_model, configs.d_ff,
                               num_kernels=kernels)
    narrow = Inception_Block_V1(configs.d_ff, configs.d_model,
                                num_kernels=kernels)
    self.conv = nn.Sequential(widen, nn.GELU(), narrow)

Then, have a look at ```forward(self, x)```

In [None]:
def forward(self, x):
    """Run one TimesBlock over ``x`` and return a tensor of the same shape.

    ``x`` is unpacked as (B, T, N): batch size, time steps, features.
    For each of the ``self.k`` dominant periods the series is folded into a
    2D map, convolved, unfolded again, and the k results are combined with
    softmax-normalised amplitude weights before the residual is added.
    """
    B, T, N = x.size()
    total_len = self.seq_len + self.pred_len

    # period_list holds the top-k periods; period_weight ([B, top_k]) holds
    # the matching amplitudes for every sample.
    period_list, period_weight = FFT_for_Period(x, self.k)

    branches = []
    for idx in range(self.k):
        period = period_list[idx]

        if total_len % period == 0:
            length = total_len
            out = x
        else:
            # Zero-pad the time axis up to the next multiple of the period
            # so that it can be folded into a 2D map.
            length = (total_len // period + 1) * period
            pad = torch.zeros(
                [x.shape[0], length - total_len, x.shape[2]],
                device=x.device)
            out = torch.cat([x, pad], dim=1)

        # [B, length, N] -> [B, N, length // period, period]: the two axes
        # to convolve over must come last; contiguous() fixes the layout.
        out = out.reshape(B, length // period, period, N)
        out = out.permute(0, 3, 1, 2).contiguous()

        # 2D convolution captures intra-period and inter-period variation.
        out = self.conv(out)

        # Unfold back to [B, length, N] and drop the padded tail.
        out = out.permute(0, 2, 3, 1).reshape(B, -1, N)
        branches.append(out[:, :total_len, :])

    res = torch.stack(branches, dim=-1)  # [B, total_len, N, top_k]

    # Adaptive aggregation: softmax over the k amplitudes -> [B, top_k],
    # then view it as [B, T, N, top_k] to line up with res.
    weights = F.softmax(period_weight, dim=1)
    weights = weights[:, None, None, :].expand(-1, T, N, -1)
    res = (res * weights).sum(-1)

    # Residual connection.
    return res + x

The ```FFT_for_Period``` above is given by:

In [None]:
def FFT_for_Period(x, k=2):
    """Find the ``k`` dominant periods of ``x`` along dim 1.

    Returns ``(period, weight)``: ``period`` is a NumPy integer array of
    length ``k`` (``x.shape[1] // frequency_index``), and ``weight`` is a
    tensor of shape [B, k] with each sample's feature-averaged amplitude
    at those frequencies.
    """
    # Real FFT along the time axis, then the amplitude of every bin.
    amplitude = torch.fft.rfft(x, dim=1).abs()

    # Periodicity is assumed roughly constant across samples and channels,
    # so average the amplitude over both to get one value per frequency.
    overall = amplitude.mean(0).mean(-1)
    overall[0] = 0  # ignore the zero-frequency (DC) component

    # Indices of the k strongest frequencies, as a plain NumPy array
    # (detached: they never need gradients).
    top_idx = torch.topk(overall, k).indices
    top_idx = top_idx.detach().cpu().numpy()

    period = x.shape[1] // top_idx

    # Per-sample amplitude at the selected frequencies, averaged over features.
    weight = amplitude.mean(-1)[:, top_idx]
    return period, weight

To make it clearer, please see the figures below.

![FFT demonstrator](./fft.png)

![2D Conv demonstrator](./conv.png)

For more details, please read our paper 
(link: https://openreview.net/pdf?id=ju_Uqw384Oq)

### 4. TimesNet

到目前为止，我们已经得到了`TimesBlock`，它擅长检索周期内和周期间的时间信息。我们现在能够构建一个`TimesNet`。`TimesNet`在多项任务中表现出色，包括短期和长期预测、插值、分类和异常检测。

在本节中，我们将详细概述`TimesNet`如何在这些任务中获得其强大能力。

In [None]:
class Model(nn.Module):
    """Skeleton of the TimesNet model: one method per supported task plus a dispatching forward()."""
    def __init__(self, configs):
        # Builds the stacked TimesBlocks, the embedding, and the task-specific layers.
        ...
    
    def forecast(self, x_enc, x_mark_enc, x_dec, x_mark_dec):
        # Long-/short-term forecasting path.
        ...

    def imputation(self, x_enc, x_mark_enc, x_dec, x_mark_dec, mask):
        # Missing-value imputation path; mask marks the observed entries.
        ...

    def anomaly_detection(self, x_enc):
        # Anomaly-detection path (reconstructs the input sequence).
        ...
    
    def classification(self, x_enc, x_mark_enc):
        # Classification path; returns one row of class scores per sample.
        ...

    def forward(self, x_enc, x_mark_enc, x_dec, x_mark_dec, mask=None):
        # Dispatches to one of the methods above according to the task name.
        ...

First of all, let's focus on ```__init__(self, configs):```

In [None]:
def __init__(self, configs):
    """Assemble TimesNet from ``configs``.

    Creates ``configs.e_layers`` stacked TimesBlocks, the data embedding,
    a LayerNorm, and the layers required by ``configs.task_name``.
    """
    super(Model, self).__init__()
    # Basic hyper-parameters.
    self.configs = configs
    self.task_name = configs.task_name
    self.seq_len = configs.seq_len
    self.label_len = configs.label_len
    self.pred_len = configs.pred_len

    # Main body: e_layers TimesBlocks applied one after another.
    blocks = [TimesBlock(configs) for _ in range(configs.e_layers)]
    self.model = nn.ModuleList(blocks)

    # Embedding and normalisation.
    # enc_in: encoder input size (number of features per time step).
    # d_model: embedding dimension.
    self.enc_embedding = DataEmbedding(
        configs.enc_in, configs.d_model, configs.embed, configs.freq,
        configs.dropout)
    self.layer = configs.e_layers  # number of encoder layers
    self.layer_norm = nn.LayerNorm(configs.d_model)

    # Task-specific layers.
    task = self.task_name
    if task in ('long_term_forecast', 'short_term_forecast'):
        # Stretches the time axis from seq_len to seq_len + pred_len.
        self.predict_linear = nn.Linear(
            self.seq_len, self.pred_len + self.seq_len)
        self.projection = nn.Linear(
            configs.d_model, configs.c_out, bias=True)
    elif task in ('imputation', 'anomaly_detection'):
        self.projection = nn.Linear(
            configs.d_model, configs.c_out, bias=True)
    elif task == 'classification':
        self.act = F.gelu
        self.dropout = nn.Dropout(configs.dropout)
        self.projection = nn.Linear(
            configs.d_model * configs.seq_len, configs.num_class)

#### 4.1 Forecast

预测的基本思想是将已知序列延长至(seq_len+pred_len)，这是预测后的总长度。然后通过多个TimesBlock层和层归一化，表示出一些潜在的周期内和周期间信息。利用这些信息，我们可以将其投影到输出空间。之后通过反归一化（如果是非平稳Transformer）得到最终输出。

In [None]:
def forecast(self, x_enc, x_mark_enc, x_dec, x_mark_dec):
    """Forecast path: returns a tensor covering ``seq_len + pred_len`` steps.

    ``x_dec`` and ``x_mark_dec`` are accepted for interface compatibility
    but are not read here.
    """
    out_len = self.pred_len + self.seq_len

    # Normalisation (from Non-stationary Transformer): zero mean and unit
    # variance along the time axis, per sample and feature.
    means = x_enc.mean(1, keepdim=True).detach()
    centered = x_enc - means
    stdev = torch.sqrt(
        torch.var(centered, dim=1, keepdim=True, unbiased=False) + 1e-5)
    normed = centered / stdev

    # Embedding: [B, T, N] -> [B, T, C] with C = d_model.
    enc_out = self.enc_embedding(normed, x_mark_enc)
    # Stretch the time axis: [B, T, C] -> [B, pred_len + seq_len, C].
    enc_out = self.predict_linear(enc_out.permute(0, 2, 1))
    enc_out = enc_out.permute(0, 2, 1)

    # TimesNet body: every TimesBlock is followed by layer normalisation.
    for depth in range(self.layer):
        enc_out = self.layer_norm(self.model[depth](enc_out))

    # Project back: [B, T, d_model] -> [B, T, c_out].
    dec_out = self.projection(enc_out)

    # De-normalisation: broadcast the per-series statistics over time.
    scale = stdev[:, :1, :].expand(-1, out_len, -1)
    shift = means[:, :1, :].expand(-1, out_len, -1)
    dec_out = dec_out * scale
    dec_out = dec_out + shift
    return dec_out

#### 4.2 Imputation

# 插补是一项旨在补全时间序列中缺失值的任务，因此在某种程度上它与预测类似。我们仍然可以使用相似的步骤来处理它。

In [None]:
def imputation(self, x_enc, x_mark_enc, x_dec, x_mark_dec, mask):
    """Imputation path: returns the reconstructed sequence.

    ``mask`` has ones at observed positions and zeros at missing ones; the
    statistics below are computed from the observed entries only.
    ``x_dec`` and ``x_mark_dec`` are not read here.
    """
    out_len = self.pred_len + self.seq_len

    # Normalisation (from Non-stationary Transformer) over observed entries.
    observed = torch.sum(mask == 1, dim=1)
    means = (torch.sum(x_enc, dim=1) / observed).unsqueeze(1).detach()
    centered = (x_enc - means).masked_fill(mask == 0, 0)
    stdev = torch.sqrt(
        torch.sum(centered * centered, dim=1) / observed + 1e-5)
    stdev = stdev.unsqueeze(1).detach()
    normed = centered / stdev

    # Embedding -> [B, T, C].
    enc_out = self.enc_embedding(normed, x_mark_enc)
    # TimesNet body.
    for depth in range(self.layer):
        enc_out = self.layer_norm(self.model[depth](enc_out))
    # Project back to the output features.
    dec_out = self.projection(enc_out)

    # De-normalisation.
    scale = stdev[:, :1, :].expand(-1, out_len, -1)
    shift = means[:, :1, :].expand(-1, out_len, -1)
    dec_out = dec_out * scale
    dec_out = dec_out + shift
    return dec_out

# #### 4.3 异常检测
# 
# 与插补类似。

In [None]:
def anomaly_detection(self, x_enc):
    """Anomaly-detection path: returns the model's reconstruction of ``x_enc``."""
    out_len = self.pred_len + self.seq_len

    # Normalisation (from Non-stationary Transformer).
    means = x_enc.mean(1, keepdim=True).detach()
    centered = x_enc - means
    stdev = torch.sqrt(
        torch.var(centered, dim=1, keepdim=True, unbiased=False) + 1e-5)
    normed = centered / stdev

    # Embedding without time marks -> [B, T, C].
    enc_out = self.enc_embedding(normed, None)
    # TimesNet body.
    for depth in range(self.layer):
        enc_out = self.layer_norm(self.model[depth](enc_out))
    # Project back to the output features.
    dec_out = self.projection(enc_out)

    # De-normalisation.
    scale = stdev[:, :1, :].expand(-1, out_len, -1)
    shift = means[:, :1, :].expand(-1, out_len, -1)
    dec_out = dec_out * scale
    dec_out = dec_out + shift
    return dec_out

#### 4.4 分类

In [None]:
def classification(self, x_enc, x_mark_enc):
    """Classification path: returns one row of class scores per sample.

    ``x_mark_enc`` is multiplied element-wise into the features so that
    padded time steps contribute zeros to the flattened representation.
    """
    # Embedding without time marks -> [B, T, C].
    hidden = self.enc_embedding(x_enc, None)
    # TimesNet body.
    for depth in range(self.layer):
        hidden = self.layer_norm(self.model[depth](hidden))

    # The encoder output carries no non-linearity, so apply one here,
    # followed by dropout.
    activated = self.dropout(self.act(hidden))

    # Zero out the embeddings at padded positions.
    masked = activated * x_mark_enc.unsqueeze(-1)

    # (batch_size, seq_length * d_model) -> (batch_size, num_classes)
    flat = masked.reshape(masked.shape[0], -1)
    return self.projection(flat)

Finally, with all of the task-specific methods above in place, we can complete `forward(self, x_enc, x_mark_enc, x_dec, x_mark_dec, mask=None):`, which simply dispatches to the method matching the configured task. 

In [None]:
def forward(self, x_enc, x_mark_enc, x_dec, x_mark_dec, mask=None):
    """Dispatch to the method for ``self.task_name``.

    Returns the last ``pred_len`` steps for forecasting, the full sequence
    for imputation and anomaly detection, the class scores for
    classification, and ``None`` for any other task name.
    """
    task = self.task_name
    if task in ('long_term_forecast', 'short_term_forecast'):
        full = self.forecast(x_enc, x_mark_enc, x_dec, x_mark_dec)
        # [B, L, D]: keep only the predicted part of the sequence.
        return full[:, -self.pred_len:, :]
    if task == 'imputation':
        # [B, L, D]: the whole sequence with missing values filled in.
        return self.imputation(x_enc, x_mark_enc, x_dec, x_mark_dec, mask)
    if task == 'anomaly_detection':
        # [B, L, D]: the sequence as the model expects it to be.
        return self.anomaly_detection(x_enc)
    if task == 'classification':
        # [B, N]: classification scores.
        return self.classification(x_enc, x_mark_enc)
    return None

### 5. 训练和设置

到目前为止，我们已经成功构建了`TimesNet`。我们现在面临的问题是如何训练和测试这个神经网络。训练、验证和测试的操作在__*exp*__部分实现，其中收集了不同任务的代码。这些实验不仅适用于`TimesNet`的训练，也适用于任何其他时间序列表示模型。但是这里，我们只是使用`TimesNet`进行分析。

`TimesNet`在多个任务中是最先进的，而这里我们只会介绍其在长期预测任务上的训练，因为其他任务的训练过程的骨干与此类似。同样，一旦您了解了训练过程的工作原理，测试和验证代码也可以很容易地理解。因此，我们首先要集中讨论`TimesNet`在长期预测任务上的训练。

我们将讨论许多方面，包括训练过程、训练损失等。

#### 5.1 长期预测任务的训练

下面的代码展示了长期预测任务模型训练的过程。我们将对其进行详细的分析。为了简洁起见，训练部分可以分为几个部分，包括数据准备、创建保存路径、初始化、优化器和损失函数选择、使用混合精度训练、训练循环、验证和早停、学习率调整、加载最佳模型。

详细信息，请见下面的代码。'train' 过程在实验中定义为 <font color=orange>__class Exp_Long_Term_Forecast__</font>。

In [None]:
import os

In [None]:
def train(self, setting):  # setting: run identifier; also used as the checkpoint sub-directory name
    """Train ``self.model`` for long-term forecasting and return it with the best weights loaded.

    Each epoch runs one pass over the training loader, evaluates the
    validation and test loaders through ``self.vali``, hands the validation
    loss to ``EarlyStopping`` (which writes ``checkpoint.pth`` under
    ``<args.checkpoints>/<setting>``) and then calls
    ``adjust_learning_rate``. After the loop the saved checkpoint is loaded
    back into ``self.model``.

    NOTE(review): ``time`` and ``np`` are used below but are not imported in
    the cells shown here — confirm they are imported elsewhere.
    """
    # Build the train / validation / test datasets and loaders.
    train_data, train_loader = self._get_data(flag='train')
    vali_data, vali_loader = self._get_data(flag='val')
    test_data, test_loader = self._get_data(flag='test')

    # Directory in which the model checkpoint is saved and later reloaded.
    path = os.path.join(self.args.checkpoints, setting)
    if not os.path.exists(path):
        os.makedirs(path)
    time_now = time.time()

    train_steps = len(train_loader)

    # EarlyStopping tracks the validation loss, saves the best model and
    # signals when training should stop (a common guard against overfitting).
    early_stopping = EarlyStopping(patience=self.args.patience, verbose=True)

    # Optimizer and loss function.
    model_optim = self._select_optimizer()
    criterion = self._select_criterion()

    # AMP uses lower-precision dtypes (e.g. float16) for some computations
    # to speed up training and reduce memory use.
    if self.args.use_amp:
        scaler = torch.cuda.amp.GradScaler()
    for epoch in range(self.args.train_epochs):
        iter_count = 0
        train_loss = []
        self.model.train()
        epoch_time = time.time()

        # One pass over the training data.
        for i, (batch_x, batch_y, batch_x_mark, batch_y_mark) in enumerate(train_loader):
            iter_count += 1
            model_optim.zero_grad()
            batch_x = batch_x.float().to(self.device)  # input features
            batch_y = batch_y.float().to(self.device)  # target features

            # The *_mark tensors carry the time-related features that
            # accompany batch_x / batch_y.
            batch_x_mark = batch_x_mark.float().to(self.device)
            batch_y_mark = batch_y_mark.float().to(self.device)
            # Decoder input: label_len known steps followed by zeros for the
            # pred_len steps to predict (not used by TimesNet).
            dec_inp = torch.zeros_like(batch_y[:, -self.args.pred_len:, :]).float()
            dec_inp = torch.cat([batch_y[:, :self.args.label_len, :], dec_inp], dim=1).float().to(self.device)
            # Encoder - decoder.
            if self.args.use_amp:  # per the tutorial, use_amp should be False for TimesNet
                with torch.cuda.amp.autocast():
                    # Whether the model also returns attention; not the case for TimesNet.
                    if self.args.output_attention:
                        outputs = self.model(batch_x, batch_x_mark, dec_inp, batch_y_mark)[0]
                    # Plain model call.
                    else:
                        outputs = self.model(batch_x, batch_x_mark, dec_inp, batch_y_mark)

                    # Forecasting mode, options [M, S, MS]: M multivariate -> multivariate,
                    # S univariate -> univariate, MS multivariate -> univariate.
                    # For MS only the last column is kept (f_dim = -1); otherwise all columns.
                    f_dim = -1 if self.args.features == 'MS' else 0
                    outputs = outputs[:, -self.args.pred_len:, f_dim:]
                    batch_y = batch_y[:, -self.args.pred_len:, f_dim:].to(self.device)

                    # Loss on the predicted window.
                    loss = criterion(outputs, batch_y)
                    train_loss.append(loss.item())
            else:  # same steps as the AMP branch, without autocast
                if self.args.output_attention:
                    outputs = self.model(batch_x, batch_x_mark, dec_inp, batch_y_mark)[0]
                else:
                    outputs = self.model(batch_x, batch_x_mark, dec_inp, batch_y_mark)
                f_dim = -1 if self.args.features == 'MS' else 0
                outputs = outputs[:, -self.args.pred_len:, f_dim:]
                batch_y = batch_y[:, -self.args.pred_len:, f_dim:].to(self.device)
                loss = criterion(outputs, batch_y)
                train_loss.append(loss.item())

            # Every 100 iterations, report the loss, the speed and an
            # estimate of the remaining time, then reset the timer.
            if (i + 1) % 100 == 0:
                print("\titers: {0}, epoch: {1} | loss: {2:.7f}".format(i + 1, epoch + 1, loss.item()))
                speed = (time.time() - time_now) / iter_count
                left_time = speed * ((self.args.train_epochs - epoch) * train_steps - i)
                print('\tspeed: {:.4f}s/iter; left time: {:.4f}s'.format(speed, left_time))
                iter_count = 0
                time_now = time.time()

            # Back-propagation; with AMP the loss is scaled through the GradScaler.
            if self.args.use_amp:
                scaler.scale(loss).backward()
                scaler.step(model_optim)
                scaler.update()
            else:
                loss.backward()
                model_optim.step()

        # End of the epoch: report the elapsed time.
        print("Epoch: {} cost time: {}".format(epoch + 1, time.time() - epoch_time))
        train_loss = np.average(train_loss)

        # Evaluate the current model on the validation and test sets.
        vali_loss = self.vali(vali_data, vali_loader, criterion)
        test_loss = self.vali(test_data, test_loader, criterion)

        # Report the train / validation / test losses.
        print("Epoch: {0}, Steps: {1} | Train Loss: {2:.7f} Vali Loss: {3:.7f} Test Loss: {4:.7f}".format(
            epoch + 1, train_steps, train_loss, vali_loss, test_loss))

        # Let EarlyStopping decide; once it sets early_stop the validation
        # loss has stopped improving for `patience` epochs, so stop training.
        early_stopping(vali_loss, self.model, path)
        if early_stopping.early_stop:
            print("早停")
            break

        # Adjust the learning rate for the next epoch.
        adjust_learning_rate(model_optim, epoch + 1, self.args)
    best_model_path = path + '/' + 'checkpoint.pth'

    # Reload the state dict written by EarlyStopping (the best model seen).
    self.model.load_state_dict(torch.load(best_model_path))
    return self.model

# 如果您想了解更多，请查看exp/exp_long_term_forecasting.py

# #### 5.2 早停机制
# 
# <font color=purple>__EarlyStopping__</font>通常是一个自定义的类或函数，它在训练过程中监视模型的性能，通常是通过跟踪某个指标（通常是验证损失或准确性）来实现的。这是深度学习中常用的技术，用于防止训练过程中的过拟合。
# 
# 让我们看一下下面的代码（原始代码在`tools.py`中）。

In [None]:
class EarlyStopping:
    """Stop training once the validation loss has stopped improving.

    Call the instance once per epoch with the validation loss. When the loss
    improves, the model's state dict is saved to ``<path>/checkpoint.pth``;
    after ``patience`` calls without improvement, ``early_stop`` becomes True.
    """

    def __init__(self, patience=7, verbose=False, delta=0):
        """Configure the stopper.

        Args:
            patience: number of non-improving calls tolerated before stopping.
            verbose: whether to print a message each time a checkpoint is saved.
            delta: margin added to the best score when testing for improvement.
        """
        self.patience = patience
        self.verbose = verbose
        self.counter = 0  # non-improving calls since the last improvement
        self.best_score = None
        self.early_stop = False
        # np.inf rather than the np.Inf alias, which NumPy 2.0 removed.
        self.val_loss_min = np.inf
        self.delta = delta

    def __call__(self, val_loss, model, path):
        """Record ``val_loss``; save ``model`` under ``path`` if it improved."""
        score = -val_loss  # higher is better
        if self.best_score is None:
            # First call: always checkpoint.
            self.best_score = score
            self.save_checkpoint(val_loss, model, path)

        # The score is below best_score + delta: no improvement this time.
        elif score < self.best_score + self.delta:
            self.counter += 1
            print(f'EarlyStopping 计数器: {self.counter} out of {self.patience}')
            # Patience exhausted -> ask the caller to stop training.
            if self.counter >= self.patience:
                self.early_stop = True

        else:  # still improving: checkpoint and reset the counter
            self.best_score = score
            self.save_checkpoint(val_loss, model, path)
            self.counter = 0

    def save_checkpoint(self, val_loss, model, path):
        """Save the current best model's state dict to ``<path>/checkpoint.pth``."""
        if self.verbose:
            print(f'验证损失下降 ({self.val_loss_min:.6f} --> {val_loss:.6f}).  保存模型 ...')
        torch.save(model.state_dict(), path + '/' + 'checkpoint.pth')
        self.val_loss_min = val_loss

#### 5.3 优化器和损失函数
优化器和损失函数在<font color=orange>__class Exp_Long_Term_Forecast__</font>中定义，并在训练过程中通过函数`self._select_optimizer()`和`self._select_criterion()`调用。在这里，对于长期预测任务，我们简单地采用Adam优化器和MSELoss来衡量实际数据和预测数据之间的损失。

In [None]:
def _select_optimizer(self):
    """Return an Adam optimizer over the model's parameters with lr ``args.learning_rate``."""
    return optim.Adam(self.model.parameters(), lr=self.args.learning_rate)

def _select_criterion(self):
    """Return the training loss: mean squared error."""
    return nn.MSELoss()

# #### 5.4 自动混合精度（AMP）
# 
# AMP是深度学习中用于提高训练速度和降低内存使用的技术。AMP通过混合半精度（16位浮点数）和单精度（32位浮点数）的计算来实现这一点。
# 
# 让我们更详细地了解这段代码：

In [None]:
# Forward pass: the model call and loss computation go inside the autocast context.
with torch.cuda.amp.autocast():

...

# Backward pass: scale the loss, step the optimizer through the scaler,
# then update the scale factor for the next iteration.
if self.args.use_amp:
    scaler.scale(loss).backward()
    scaler.step(model_optim)
    scaler.update()


# `with torch.cuda.amp.autocast():` : 使用torch.cuda.amp.autocast()的目的是在保持数值稳定性的同时，利用混合精度训练的速度和内存效率优势。一些深度学习模型可以从这种技术中获益匪浅，特别是在支持半精度算术的现代GPU上。它允许您更快地执行某些计算，同时确保关键计算（例如梯度更新）以足够的精度进行，以避免精度损失。
# 
# `scaler.scale(loss).backward()`: 如果启用了AMP，它使用torch.cuda.amp.GradScaler()创建的scaler对象来自动缩放损失并执行反向传播。这是AMP的关键部分，确保数值稳定性。在反向传播之前，损失被缩放到适当的范围，以防止梯度过快发散或引起数值不稳定。
# 
# `scaler.step(model_optim)`: 接下来，scaler调用step方法，该方法将缩放后的梯度应用于模型的优化器（model_optim）。这用于更新模型的权重以最小化损失函数。
# 
# `scaler.update()`: 最后，scaler调用update方法，该方法更新缩放因子，以确保下一轮迭代的损失正确缩放。这一步有助于动态调整梯度的缩放，以适应不同的训练场景。

# #### 5.5 学习率调整
# 
# 虽然优化器负责根据epochs调整学习率，但我们仍然想手动对其进行一些调整，如函数`adjust_learning_rate(model_optim, epoch + 1, self.args)`所示（原始代码在`tools.py`中）： 

In [None]:
def adjust_learning_rate(optimizer, epoch, args):
    """Set the learning rate of every parameter group for ``epoch``.

    Args:
        optimizer: optimizer whose ``param_groups`` are updated in place.
        epoch: 1-based epoch number.
        args: namespace providing ``lradj`` and ``learning_rate``.

    With ``args.lradj == 'type1'`` the rate is halved every epoch, starting
    from ``args.learning_rate``. With ``'type2'`` fixed rates are applied at
    fixed epochs. Any other value leaves the learning rate unchanged.
    """
    if args.lradj == 'type1':
        # Exponential decay: updated on every epoch.
        lr_adjust = {epoch: args.learning_rate * (0.5 ** ((epoch - 1) // 1))}
    elif args.lradj == 'type2':
        # Hand-written schedule: updated only at the listed epochs.
        lr_adjust = {
            2: 5e-5, 4: 1e-5, 6: 5e-6, 8: 1e-6,
            10: 5e-7, 15: 1e-7, 20: 5e-8
        }
    else:
        # Unknown schedule name: previously lr_adjust stayed unbound and the
        # lookup below raised UnboundLocalError; treat it as "no adjustment".
        lr_adjust = {}

    if epoch in lr_adjust:
        lr = lr_adjust[epoch]

        # Apply the new rate to every parameter group of the optimizer.
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr
        print('更新学习率为 {}'.format(lr))

### 6. Validation and Testing

During training, the model continuously adjusts its weights and parameters to minimize training error. However, this may not reflect the model's performance on unseen data. Validation allows us to periodically assess the model's performance on data that is different from the training data, providing insights into the model's generalization ability.

By comparing performance on the validation set, we can identify whether the model is overfitting. Overfitting occurs when a model performs well on training data but poorly on unseen data. Monitoring performance on the validation set helps detect overfitting early and take measures to prevent it, such as early stopping or adjusting hyperparameters.

Here we again take long-term forecasting as an example; the validation loop closely mirrors the training loop:

In [None]:
def vali(self, vali_data, vali_loader, criterion):
    """Return the average ``criterion`` loss of ``self.model`` over ``vali_loader``.

    The model is put in evaluation mode for the pass and switched back to
    training mode before returning. ``vali_data`` is accepted for interface
    compatibility but is not read here.
    """
    total_loss = []

    # Evaluation mode, no gradient tracking.
    self.model.eval()
    with torch.no_grad():
        for batch_x, batch_y, batch_x_mark, batch_y_mark in vali_loader:
            batch_x = batch_x.float().to(self.device)
            batch_y = batch_y.float()

            batch_x_mark = batch_x_mark.float().to(self.device)
            batch_y_mark = batch_y_mark.float().to(self.device)

            # Decoder input: label_len known steps followed by zeros for the
            # pred_len steps to predict.
            dec_inp = torch.zeros_like(batch_y[:, -self.args.pred_len:, :]).float()
            dec_inp = torch.cat([batch_y[:, :self.args.label_len, :], dec_inp], dim=1).float().to(self.device)

            # Encoder - decoder, optionally under autocast.
            if self.args.use_amp:
                with torch.cuda.amp.autocast():
                    outputs = self.model(batch_x, batch_x_mark, dec_inp, batch_y_mark)
            else:
                outputs = self.model(batch_x, batch_x_mark, dec_inp, batch_y_mark)
            if self.args.output_attention:
                # The model returned a tuple; the prediction comes first.
                outputs = outputs[0]

            # 'MS' keeps only the last feature column; otherwise all columns.
            f_dim = -1 if self.args.features == 'MS' else 0
            outputs = outputs[:, -self.args.pred_len:, f_dim:]
            batch_y = batch_y[:, -self.args.pred_len:, f_dim:].to(self.device)

            pred = outputs.detach().cpu()
            true = batch_y.detach().cpu()

            # Store a plain float (as train() does) instead of a tensor, so
            # the average below does not rely on tensor-to-ndarray conversion.
            total_loss.append(criterion(pred, true).item())
    total_loss = np.average(total_loss)
    self.model.train()
    return total_loss

Testing is similar to validation, but its purpose is to examine how well the trained model behaves, so it is common to add some visualization with __matplotlib.pyplot__. 

In [None]:
import matplotlib.pyplot as plt

def visual(true, preds=None, name='./pic/test.pdf'):
    """Plot the ground truth (and optionally the prediction) and save it to ``name``.

    Args:
        true: ground-truth series to plot.
        preds: predicted series to overlay, or None to plot only ``true``.
        name: output file path passed to ``plt.savefig``.
    """
    fig = plt.figure()
    try:
        plt.plot(true, label='GroundTruth', linewidth=2)
        if preds is not None:
            plt.plot(preds, label='Prediction', linewidth=2)
        plt.legend()
        plt.savefig(name, bbox_inches='tight')
    finally:
        # Close the figure: this function is called repeatedly during
        # testing, and figures left open accumulate in memory.
        plt.close(fig)

In [None]:
def test(self, setting, test=0):
    """Evaluate ``self.model`` on the test split and save predictions and metrics.

    Args:
        setting: run identifier; names the checkpoint and result directories.
        test: if truthy, first load ``checkpoint.pth`` from
            ``<args.checkpoints>/<setting>`` (where ``train`` saves it).

    Writes a plot for every 20th batch to ``./test_results/<setting>/``,
    appends MSE/MAE to ``result_long_term_forecast.txt``, and saves
    ``metrics.npy``, ``pred.npy`` and ``true.npy`` to ``./results/<setting>/``.
    """
    test_data, test_loader = self._get_data(flag='test')
    if test:
        print('loading model')
        # Load from the directory train() saves to, not a hard-coded path.
        checkpoint = os.path.join(self.args.checkpoints, setting, 'checkpoint.pth')
        self.model.load_state_dict(torch.load(checkpoint))

    preds = []
    trues = []
    folder_path = './test_results/' + setting + '/'
    os.makedirs(folder_path, exist_ok=True)

    self.model.eval()
    with torch.no_grad():
        for i, (batch_x, batch_y, batch_x_mark, batch_y_mark) in enumerate(test_loader):
            batch_x = batch_x.float().to(self.device)
            batch_y = batch_y.float().to(self.device)

            batch_x_mark = batch_x_mark.float().to(self.device)
            batch_y_mark = batch_y_mark.float().to(self.device)

            # Decoder input: label_len known steps followed by zeros for the
            # pred_len steps to predict.
            dec_inp = torch.zeros_like(batch_y[:, -self.args.pred_len:, :]).float()
            dec_inp = torch.cat([batch_y[:, :self.args.label_len, :], dec_inp], dim=1).float().to(self.device)

            # Encoder - decoder, optionally under autocast.
            if self.args.use_amp:
                with torch.cuda.amp.autocast():
                    outputs = self.model(batch_x, batch_x_mark, dec_inp, batch_y_mark)
            else:
                outputs = self.model(batch_x, batch_x_mark, dec_inp, batch_y_mark)
            if self.args.output_attention:
                # The model returned a tuple; the prediction comes first.
                outputs = outputs[0]

            # 'MS' keeps only the last feature column; otherwise all columns.
            f_dim = -1 if self.args.features == 'MS' else 0
            outputs = outputs[:, -self.args.pred_len:, f_dim:]
            batch_y = batch_y[:, -self.args.pred_len:, f_dim:].to(self.device)
            outputs = outputs.detach().cpu().numpy()
            batch_y = batch_y.detach().cpu().numpy()

            # Undo the dataset scaling when requested.
            if test_data.scale and self.args.inverse:
                outputs = test_data.inverse_transform(outputs)
                batch_y = test_data.inverse_transform(batch_y)

            pred = outputs
            true = batch_y

            preds.append(pred)
            trues.append(true)

            # Visualise the first sample of every 20th batch: the input
            # history followed by the ground truth / the prediction.
            if i % 20 == 0:
                history = batch_x.detach().cpu().numpy()
                gt = np.concatenate((history[0, :, -1], true[0, :, -1]), axis=0)
                forecast_curve = np.concatenate((history[0, :, -1], pred[0, :, -1]), axis=0)
                visual(gt, forecast_curve, os.path.join(folder_path, str(i) + '.pdf'))

    preds = np.array(preds)
    trues = np.array(trues)  # shape [batch_num, batch_size, pred_len, features]
    print('test shape:', preds.shape, trues.shape)
    preds = preds.reshape(-1, preds.shape[-2], preds.shape[-1])
    trues = trues.reshape(-1, trues.shape[-2], trues.shape[-1])
    print('test shape:', preds.shape, trues.shape)

    # Save the results.
    folder_path = './results/' + setting + '/'
    os.makedirs(folder_path, exist_ok=True)

    mae, mse, rmse, mape, mspe = metric(preds, trues)
    print('mse:{}, mae:{}'.format(mse, mae))
    # The context manager closes the file even if a write fails.
    with open("result_long_term_forecast.txt", 'a') as f:
        f.write(setting + "  \n")
        f.write('mse:{}, mae:{}'.format(mse, mae))
        f.write('\n')
        f.write('\n')

    np.save(folder_path + 'metrics.npy', np.array([mae, mse, rmse, mape, mspe]))
    np.save(folder_path + 'pred.npy', preds)
    np.save(folder_path + 'true.npy', trues)

    return


### 7. Dataloader and DataProvider

In the process of training, we simply take the dataloader for granted, by the function `self._get_data(flag='train')`. So how does this line work? Have a look at the definition(in  <font color=orange>__class Exp_Long_Term_Forecast__</font>):

In [None]:
def _get_data(self, flag):
    """Return ``(dataset, data_loader)`` for the split named by ``flag``, built by ``data_provider``."""
    dataset, loader = data_provider(self.args, flag)
    return dataset, loader

One step forward, see `data_provider(self.args, flag)`(in `data_factory.py`):

In [None]:
# Maps the value of args.data to the dataset class used to load it.
# The classes are defined in data_loader.py; to use your own data, write a
# dataset class there and register it under a new key.
data_dict = {
    'ETTh1': Dataset_ETT_hour,
    'ETTh2': Dataset_ETT_hour,
    'ETTm1': Dataset_ETT_minute,
    'ETTm2': Dataset_ETT_minute,
    'custom': Dataset_Custom,
    'm4': Dataset_M4,
    'PSM': PSMSegLoader,
    'MSL': MSLSegLoader,
    'SMAP': SMAPSegLoader,
    'SMD': SMDSegLoader,
    'SWAT': SWATSegLoader,
    'UEA': UEAloader
}


def data_provider(args, flag):
    """Build the dataset and DataLoader for ``args.task_name`` and split ``flag``.

    Args:
        args: run configuration (``data``, ``embed``, ``task_name``,
            ``batch_size``, ``freq``, ``root_path``, ``num_workers``, ...).
        flag: split name; ``'test'`` disables shuffling.

    Returns:
        ``(data_set, data_loader)``.
    """
    dataset_cls = data_dict[args.data]

    # Time-feature encoding, options [timeF, fixed, learned]:
    # 1 for 'timeF', 0 otherwise.
    timeenc = 1 if args.embed == 'timeF' else 0

    evaluating = flag == 'test'
    shuffle_flag = not evaluating
    if evaluating and args.task_name not in ('anomaly_detection', 'classification'):
        # Some tasks evaluate the test samples one at a time.
        batch_size = 1
    else:
        batch_size = args.batch_size  # train / valid, and anomaly / classification tests

    # Frequency for time-feature encoding, options [s: secondly, t: minutely,
    # h: hourly, d: daily, b: business days, w: weekly, m: monthly]; more
    # detailed values such as 15min or 3h are also accepted.
    freq = args.freq

    if args.task_name == 'anomaly_detection':
        data_set = dataset_cls(
            root_path=args.root_path,  # root path of the data file
            win_size=args.seq_len,     # input sequence length
            flag=flag,
        )
        print(flag, len(data_set))
        data_loader = DataLoader(
            data_set,
            batch_size=batch_size,
            shuffle=shuffle_flag,
            num_workers=args.num_workers,  # data loader worker count
            drop_last=False)
        return data_set, data_loader

    if args.task_name == 'classification':
        data_set = dataset_cls(
            root_path=args.root_path,
            flag=flag,
        )
        data_loader = DataLoader(
            data_set,
            batch_size=batch_size,
            shuffle=shuffle_flag,
            num_workers=args.num_workers,
            drop_last=False,
            # Collate samples into a batch, limited to args.seq_len steps.
            collate_fn=lambda batch: collate_fn(batch, max_len=args.seq_len)
        )
        return data_set, data_loader

    # Remaining tasks: the last incomplete batch is dropped, except for 'm4'.
    drop_last = args.data != 'm4'
    data_set = dataset_cls(
        root_path=args.root_path,  # e.g. ./data/ETT/
        data_path=args.data_path,  # e.g. ETTh1.csv
        flag=flag,
        size=[args.seq_len, args.label_len, args.pred_len],
        # Forecasting mode, options [M, S, MS]: M multivariate -> multivariate,
        # S univariate -> univariate, MS multivariate -> univariate.
        features=args.features,
        target=args.target,        # target feature in the S or MS task
        timeenc=timeenc,
        freq=freq,
        seasonal_patterns=args.seasonal_patterns
    )
    print(flag, len(data_set))
    data_loader = DataLoader(
        data_set,
        batch_size=batch_size,
        shuffle=shuffle_flag,
        num_workers=args.num_workers,
        drop_last=drop_last)
    return data_set, data_loader


From the above, it is easy to see that `data_provider` is responsible for collating the dataset into batches according to the task and the running mode. It passes the parameters to the dataset class (`Data`) to tell it how to turn a data file into usable samples. It then builds the final `data_loader` by passing the constructed dataset and a few other parameters to the standard `DataLoader` class. The result is a dataset that fits the needs of the model and an iterable data loader. 

So how is the data file organized into samples that fit the model? Let's look at `data_loader.py`! It contains many dataset classes, and of course you can write your own, but here we focus only on <font color=orange>__class Dataset_ETT_hour(Dataset)__</font> as an example.

In [None]:
class Dataset_ETT_hour(Dataset):
    """Outline of the ETT-hour dataset class.

    Only the method signatures are listed here; every body is elided
    with ``...`` and is walked through one method at a time in the cells
    that follow.
    """

    def __init__(self, root_path, flag='train', size=None,
                 features='S', data_path='ETTh1.csv',
                 target='OT', scale=True, timeenc=0, freq='h', seasonal_patterns=None):
        """Record the dataset configuration (body elided)."""
        ...

    def __read_data__(self):
        """Read the data file and prepare the arrays (body elided)."""
        ...

    def __getitem__(self, index):
        """Return the sample at ``index`` (body elided)."""
        ...

    def __len__(self):
        """Return the number of samples (body elided)."""
        ...

    def inverse_transform(self, data):
        """Undo the scaling applied to ``data`` (body elided)."""
        ...

`__init__()`  is the constructor used to initialize various parameters and attributes of the dataset. It takes a series of arguments, including the path to the data file, the dataset's flag (e.g., train, validate, test), dataset size, feature type, target variable, whether to scale the data, time encoding, time frequency, and more. These parameters are used to configure how the dataset is loaded and processed.

In [None]:
def __init__(self, root_path, flag='train', size=None,
             features='S', data_path='ETTh1.csv',
             target='OT', scale=True, timeenc=0, freq='h', seasonal_patterns=None):
    """Store the dataset configuration, then load the data.

    Args:
        root_path: Directory that is joined with ``data_path`` to locate
            the data file.
        flag: Which split to expose; one of ``'train'``, ``'val'``, ``'test'``.
        size: Sequence ``[seq_len, label_len, pred_len]``. ``None`` selects
            the defaults 384 / 96 / 96.
        features: Forecasting mode, ``'M'``, ``'MS'`` or ``'S'``.
        data_path: File name of the data file inside ``root_path``.
        target: Name of the target column (used when ``features == 'S'``).
        scale: Whether ``__read_data__`` standardizes the data.
        timeenc: Time-feature encoding selector (0 or 1).
        freq: Frequency string handed to the time-feature encoder.
        seasonal_patterns: Accepted for signature compatibility; this
            constructor does not use it.

    Raises:
        AssertionError: If ``flag`` is not one of the three split names.
    """
    # size is [seq_len, label_len, pred_len].
    # Use an identity test: `size == None` compares elementwise when size is
    # a NumPy array, and the resulting `if` then raises ValueError.
    if size is None:
        self.seq_len = 24 * 4 * 4
        self.label_len = 24 * 4
        self.pred_len = 24 * 4
    else:
        self.seq_len = size[0]
        self.label_len = size[1]
        self.pred_len = size[2]

    # Map the split name to the index used to pick the split borders.
    assert flag in ['train', 'test', 'val']
    type_map = {'train': 0, 'val': 1, 'test': 2}
    self.set_type = type_map[flag]

    self.features = features
    self.target = target
    self.scale = scale
    self.timeenc = timeenc
    self.freq = freq
    self.root_path = root_path
    self.data_path = data_path

    # After initialization, call __read_data__() to process the data file.
    self.__read_data__()

The actual work of turning the data file into usable pieces of data happens in `__read_data__()`; see below:

In [None]:
def __read_data__(self):
    """Read the data file, pick the split and columns, scale, and encode time.

    Populates ``self.scaler``, ``self.data_x``, ``self.data_y`` and
    ``self.data_stamp``.

    Raises:
        ValueError: If ``self.features`` is not 'M', 'MS' or 'S', or if
            ``self.timeenc`` is not 0 or 1.
    """
    self.scaler = StandardScaler()

    # Get the raw data from the path.
    df_raw = pd.read_csv(os.path.join(self.root_path,
                                      self.data_path))

    # Split the data set into train, val, test. border1 is the left border
    # and border2 is the right one; the split chosen by the flag decides
    # which pair is used. The val/test left borders are moved back by
    # seq_len rows (presumably so their first window has a full input).
    border1s = [0, 12 * 30 * 24 - self.seq_len, 12 * 30 * 24 + 4 * 30 * 24 - self.seq_len]
    border2s = [12 * 30 * 24, 12 * 30 * 24 + 4 * 30 * 24, 12 * 30 * 24 + 8 * 30 * 24]
    border1 = border1s[self.set_type]
    border2 = border2s[self.set_type]

    # Decide which columns to select.
    if self.features == 'M' or self.features == 'MS':
        cols_data = df_raw.columns[1:]  # every column except the first ('date')
        df_data = df_raw[cols_data]
    elif self.features == 'S':
        df_data = df_raw[[self.target]]  # target column only
    else:
        # Previously this fell through and failed later with an unbound name.
        raise ValueError(f"unsupported features option: {self.features!r}")

    # Scale all rows with a scaler fitted on the training rows only.
    if self.scale:
        train_data = df_data[border1s[0]:border2s[0]]
        # train_data.values: pandas DataFrame -> 2D numpy array
        self.scaler.fit(train_data.values)
        data = self.scaler.transform(df_data.values)
    else:
        data = df_data.values

    # Time stamps: a one-column DataFrame ('date', e.g. 2016-07-01 00:00:00)
    # restricted to the selected split. Copy it so the column assignments
    # below do not write into a slice of df_raw.
    df_stamp = df_raw[['date']][border1:border2].copy()

    # The date format may differ between data files, so standardize it
    # with pd.to_datetime.
    df_stamp['date'] = pd.to_datetime(df_stamp.date)

    if self.timeenc == 0:  # time feature encoding is fixed or learned
        # Vectorized datetime accessors replace the per-row apply(lambda, 1),
        # whose stray positional argument newer pandas versions deprecate.
        # NOTE(review): values are the same, but the integer dtype may be
        # narrower than with apply() -- confirm nothing downstream relies on it.
        dates = df_stamp['date'].dt
        df_stamp['month'] = dates.month
        df_stamp['day'] = dates.day
        df_stamp['weekday'] = dates.weekday
        df_stamp['hour'] = dates.hour
        # df_stamp now has month/day/weekday/hour columns; drop 'date' and
        # convert to an array. `columns=` is used because pandas 2.0 no
        # longer accepts axis as a positional argument.
        data_stamp = df_stamp.drop(columns=['date']).values

    elif self.timeenc == 1:  # time feature encoding is timeF
        # In this branch arg.embed is timeF, meaning the temporal info is
        # encoded. 'freq' should be the smallest time step, with options
        # [s:secondly, t:minutely, h:hourly, d:daily, b:business days,
        # w:weekly, m:monthly]; more detailed values like 15min or 3h also
        # work. Check the time step of your data and set 'freq' accordingly.
        # After the time_features encoding, each date is encoded into a
        # list whose elements denote the relative position of the time point
        # (e.g. Day of Week, Day of Month, Hour of Day), each normalized to
        # [-0.5, 0.5].
        data_stamp = time_features(pd.to_datetime(df_stamp['date'].values), freq=self.freq)
        data_stamp = data_stamp.transpose(1, 0)

    else:
        # Previously this fell through and failed later with an unbound name.
        raise ValueError(f"unsupported timeenc option: {self.timeenc!r}")

    # data_x and data_y both reference the same rows of `data`.
    self.data_x = data[border1:border2]
    self.data_y = data[border1:border2]
    self.data_stamp = data_stamp

`__read_data__()` splits the dataset into 3 parts, selects the needed columns and processes the time stamp info. It produces the prepared data arrays for later use. Next, we have to finish overriding the methods of <font color=orange>__class Dataset__</font>; see `__getitem__(self, index)` and `__len__(self)`:

In [None]:
def __getitem__(self, index):
    """Return ``(seq_x, seq_y, seq_x_mark, seq_y_mark)`` for window ``index``.

    The input window covers ``seq_len`` rows starting at ``index``. The
    output window starts ``label_len`` rows before the end of the input and
    spans ``label_len + pred_len`` rows. The two mark entries are the time
    stamps for the same two ranges.
    """
    # Input range: [index, index + seq_len)
    in_lo = index
    in_hi = in_lo + self.seq_len
    input_rows = slice(in_lo, in_hi)

    # Output range: overlaps the last label_len input rows, then pred_len more
    out_lo = in_hi - self.label_len
    out_hi = out_lo + self.label_len + self.pred_len
    output_rows = slice(out_lo, out_hi)

    return (
        self.data_x[input_rows],
        self.data_y[output_rows],
        self.data_stamp[input_rows],
        self.data_stamp[output_rows],
    )

def __len__(self):
    """Return how many windows of ``seq_len + pred_len`` rows fit in the data."""
    total_rows = len(self.data_x)
    window_span = self.seq_len + self.pred_len
    return total_rows - window_span + 1

You can also add an inverse_transform for scaler if needed.

In [None]:
def inverse_transform(self, data):
    """Map ``data`` back through the dataset's scaler and return the result."""
    scaler = self.scaler
    restored = scaler.inverse_transform(data)
    return restored

By now, we have finished constructing the dataset and dataloader. If you want to build your own dataset and run it on the network, you can find suitable data and implement the functions listed above. Here are some widely used datasets in time series analysis.

![common dataset](./dataset.png)

### 8. Running the Experiment and Visualizing Result

Once the data and the model are ready, we need to write a shell script for the experiment. In the script, we run `run.py` with several arguments, which form part of the configuration. Here, let's take `TimesNet` on the long-term forecasting task with the ETTh1 dataset as an example.

In [None]:
# Long-term forecasting experiment: TimesNet on ETTh1.
# Input length 96, label length 48, prediction length 96; 7 input/output
# channels (enc_in / dec_in / c_out).
model_name=TimesNet

# The variable is quoted so the model name is always passed as one argument.
python -u run.py \
  --task_name long_term_forecast \
  --is_training 1 \
  --root_path ./dataset/ETT-small/ \
  --data_path ETTh1.csv \
  --model_id ETTh1_96_96 \
  --model "$model_name" \
  --data ETTh1 \
  --features M \
  --seq_len 96 \
  --label_len 48 \
  --pred_len 96 \
  --e_layers 2 \
  --d_layers 1 \
  --factor 3 \
  --enc_in 7 \
  --dec_in 7 \
  --c_out 7 \
  --d_model 16 \
  --d_ff 32 \
  --des 'Exp' \
  --itr 1 \
  --top_k 5


After finishing the shell script, you can run it in shell using bash. For example, you can run the following command, for `TimesNet` ETTh1 long-term forecast:

In [None]:
bash ./scripts/long_term_forecast/ETT_script/TimesNet_ETTh1.sh

Here, the bash command may fail to run because some required packages are missing from the environment. If that's the case, simply follow the error messages and install the missing packages one by one until the command succeeds. The sign of a successfully running experiment is that information about the experiment is printed out, such as:

In [None]:
Namespace(task_name='long_term_forecast', is_training=1, model_id='ETTh1_96_96', model='TimesNet', data='ETTh1', root_path='./dataset/ETT-small/', data_path='ETTh1.csv', features='M', target='OT', freq='h', checkpoints='./checkpoints/', seq_len=96, label_len=48, pred_len=96, seasonal_patterns='Monthly', inverse=False, mask_rate=0.25, anomaly_ratio=0.25, top_k=5, num_kernels=6, enc_in=7, dec_in=7, c_out=7, d_model=16, n_heads=8, e_layers=2, d_layers=1, d_ff=32, moving_avg=25, factor=3, distil=True, dropout=0.1, embed='timeF', activation='gelu', output_attention=False, num_workers=10, itr=1, train_epochs=10, batch_size=32, patience=3, learning_rate=0.0001, des='Exp', loss='MSE', lradj='type1', use_amp=False, use_gpu=False, gpu=0, use_multi_gpu=False, devices='0,1,2,3', p_hidden_dims=[128, 128], p_hidden_layers=2)
Use GPU: cuda:0
>>>>>>>start training : long_term_forecast_ETTh1_96_96_TimesNet_ETTh1_ftM_sl96_ll48_pl96_dm16_nh8_el2_dl1_df32_fc3_ebtimeF_dtTrue_Exp_0>>>>>>>>>>>>>>>>>>>>>>>>>>
train 8449
val 2785
test 2785

Then, the model starts training. Once an epoch finishes training, information like the following will be printed out:

In [None]:
        iters: 100, epoch: 1 | loss: 0.4701951
        speed: 0.2108s/iter; left time: 535.7317s
        iters: 200, epoch: 1 | loss: 0.4496171
        speed: 0.0615s/iter; left time: 150.0223s
Epoch: 1 cost time: 30.09317970275879
Epoch: 1, Steps: 264 | Train Loss: 0.4964185 Vali Loss: 0.8412074 Test Loss: 0.4290483
Validation loss decreased (inf --> 0.841207).  Saving model ...
Updating learning rate to 0.0001

When all epochs are done, the model steps into testing. The following information about testing will be printed out, giving the MAE and MSE of test.

In [None]:
>>>>>>>testing : long_term_forecast_ETTh1_96_96_TimesNet_ETTh1_ftM_sl96_ll48_pl96_dm16_nh8_el2_dl1_df32_fc3_ebtimeF_dtTrue_Exp_0<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
test 2785
test shape: (2785, 1, 96, 7) (2785, 1, 96, 7)
test shape: (2785, 96, 7) (2785, 96, 7)
mse:0.3890332877635956, mae:0.41201362013816833

After the test finishes, some visualizations are stored in the test_results folder in PDF format. For example:

![result ETTm1 2440](./result.png)