In [2]:
import math

import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn.functional as F
#import causal_convolution_layer
#import Dataloader
from torch.utils.data import DataLoader, Dataset

In [None]:
class DataProcessor(Dataset):
    """Synthetic time series dataset from section 5.1.

    Each of the ``N`` series has ``t0 + 24`` points: ``t0`` conditioning
    points followed by a 24-step window to forecast. Every item is the tuple
    ``(x, fx, mask)`` where ``x`` holds the time indices, ``fx`` the noisy
    signal values and ``mask`` the shared additive attention mask.
    """

    # Number of steps appended after the t0 conditioning points.
    FORECAST_STEPS = 24

    def __init__(self,t0=96,N=4500,transform=None):
        """
        Args:
            t0: previous t0 data points to predict from; must be >= 24
                because the first 24 steps are reserved for the first two
                sinusoid segments
            N: number of data points (time series rows) to generate
            transform: optional callable applied to every sample tuple
                returned by ``__getitem__``

        Raises:
            ValueError: if ``t0`` is smaller than 24.
        """
        if t0 < 24:
            # With t0 < 24 the segments below would no longer add up to
            # t0 + 24 columns, so fx and x would silently disagree in length.
            raise ValueError("t0 must be at least 24, got {}".format(t0))

        self.t0 = t0
        self.N = N
        self.transform = transform

        horizon = self.FORECAST_STEPS

        # time points: the same 0 .. t0+23 index row repeated for every series
        steps = torch.arange(0, t0 + horizon).type(torch.float)
        self.x = steps.unsqueeze(0).repeat(N, 1)

        # sinusoidal signal
        # To use your own data, replace the block below.
        # The expected layout is (N, number of time points):
        #   N is the number of time series rows; with power-system data, for
        #   example, ten days of readings for one customer form one row.
        #   The number of time points is the length of one row, e.g. ten days
        #   at hourly granularity is 10 * 24 = 240.
        # One amplitude per series for each of the first three segments.
        A1,A2,A3 = 60 * torch.rand(3,N)
        A4 = torch.max(A1,A2)
        self.fx = torch.cat([A1.unsqueeze(1)*torch.sin(np.pi*steps[0:12]/6)+72 ,
                        A2.unsqueeze(1)*torch.sin(np.pi*steps[12:24]/6)+72 ,
                        A3.unsqueeze(1)*torch.sin(np.pi*steps[24:t0]/6)+72,
                        A4.unsqueeze(1)*torch.sin(np.pi*steps[t0:t0+horizon]/12)+72],1)

        # add noise
        self.fx = self.fx + torch.randn(self.fx.shape)

        self.masks = self._generate_square_subsequent_mask(t0)

        # print out shapes to confirm desired output
        print("x: {}*{}".format(*list(self.x.shape)),
              "fx: {}*{}".format(*list(self.fx.shape)))

    def __len__(self):
        """Return the number of generated series."""
        return len(self.fx)

    def __getitem__(self,idx):
        """Return ``(x[idx], fx[idx], masks)``, passed through ``transform`` if one was given."""
        if torch.is_tensor(idx):
            idx = idx.tolist()

        sample = (self.x[idx,:],
                  self.fx[idx,:],
                  self.masks)

        if self.transform:
            sample=self.transform(sample)

        return sample

    def _generate_square_subsequent_mask(self,t0):
        """Build the ``(t0+24, t0+24)`` additive attention mask.

        Entries are 0 where attention is allowed and ``-inf`` where it is
        blocked: the first ``t0`` rows may not look at any forecast column,
        and each forecast row may not look at columns after itself.
        """
        horizon = self.FORECAST_STEPS
        size = t0 + horizon
        neg_inf = float('-inf')

        mask = torch.zeros(size, size)
        # Conditioning rows never see the forecast window.
        mask[:t0, t0:] = neg_inf
        # Forecast rows see all history plus forecast steps up to themselves.
        mask[t0:, t0:] = torch.triu(torch.full((horizon, horizon), neg_inf), diagonal=1)
        return mask

In [None]:
class CausalConv1d(torch.nn.Conv1d):
    """1-D convolution whose output at step t only depends on inputs at steps <= t.

    The parent ``Conv1d`` is built with ``padding=0``; causality comes from
    left-padding the input with ``(kernel_size - 1) * dilation`` zeros in
    ``forward``. With ``stride=1`` the output has the same length as the input.

    Args:
        in_channels: number of channels in the input
        out_channels: number of channels produced by the convolution
        kernel_size: width of the kernel (an int)
        stride: stride of the convolution
        dilation: spacing between kernel taps
        groups: number of blocked connections from input to output channels
        bias: whether to add a learnable bias
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride=1,
                 dilation=1,
                 groups=1,
                 bias=True):

        super(CausalConv1d, self).__init__(
            in_channels,
            out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=0,
            dilation=dilation,
            groups=groups,
            bias=bias)

        # Zeros to prepend so the kernel never reaches past the current step.
        self.__padding = (kernel_size - 1) * dilation

    def forward(self, input):
        """Left-pad the last dimension of ``input`` and apply the convolution."""
        padded = F.pad(input, (self.__padding, 0))
        return super(CausalConv1d, self).forward(padded)


class context_embedding(torch.nn.Module):
    """Causal-convolution embedding followed by a tanh non-linearity.

    Args:
        in_channels: number of input features per time point
        embedding_size: number of output channels of the convolution
        k: kernel width of the causal convolution
    """

    def __init__(self,in_channels=1,embedding_size=256,k=5):
        super(context_embedding,self).__init__()
        self.causal_convolution = CausalConv1d(in_channels,embedding_size,kernel_size=k)

    def forward(self,x):
        """Apply the causal convolution to ``x`` and squash the result with tanh."""
        # torch.tanh replaces the deprecated F.tanh (F was also never imported here).
        return torch.tanh(self.causal_convolution(x))

# model class
class TransformerTimeSeries(torch.nn.Module):
    """
    Time Series application of transformers based on paper

    The input is embedded with a causal convolution, summed with a learned
    positional embedding looked up from the time indices, passed through a
    stack of ``TransformerEncoderLayer`` blocks and projected to one value
    per time step.

    Args:
        in_channels: number of features per time point fed to the causal
            convolution (``forward`` stacks y and x, i.e. 2)
        d_model: size of the embedding vector used throughout the model
        kernel_size: width of the 1-D causal convolution kernel
        nhead: number of attention heads in each encoder layer
        num_layers: number of stacked encoder layers
        max_positions: number of rows in the positional embedding table;
            time indices passed as ``x`` must be integers in
            ``[0, max_positions)``

    The defaults reproduce the previously hard-coded configuration, and the
    sub-module attribute names are unchanged.
    """
    def __init__(self, in_channels=2, d_model=256, kernel_size=9,
                 nhead=8, num_layers=3, max_positions=512):
        super(TransformerTimeSeries,self).__init__()
        # context_embedding is defined alongside this class; the separate
        # causal_convolution_layer module is not imported.
        self.input_embedding = context_embedding(in_channels,d_model,kernel_size)
        self.positional_embedding = torch.nn.Embedding(max_positions,d_model)

        self.decode_layer = torch.nn.TransformerEncoderLayer(d_model=d_model,nhead=nhead)
        self.transformer_decoder = torch.nn.TransformerEncoder(self.decode_layer, num_layers=num_layers)

        self.fc1 = torch.nn.Linear(d_model,1)

    def forward(self,x,y,attention_masks):
        """Run the model on a batch.

        Args:
            x: time indices, one row per series (cast to long for the
                positional lookup)
            y: observed values aligned with ``x``
            attention_masks: mask forwarded as the second argument of the
                transformer encoder

        Returns:
            Output of the final linear layer, one value per time step with a
            trailing dimension of size 1.
        """
        # concatenate observed points and time covariate
        # (B*feature_size*n_time_points)
        z = torch.cat((y.unsqueeze(1),x.unsqueeze(1)),1)

        # input_embedding returns shape (Batch size,embedding size,sequence len) -> need (sequence len,Batch size,embedding_size)
        z_embedding = self.input_embedding(z).permute(2,0,1)

        # get my positional embeddings (Batch size, sequence_len, embedding_size) -> need (sequence len,Batch size,embedding_size)
        positional_embeddings = self.positional_embedding(x.type(torch.long)).permute(1,0,2)

        input_embedding = z_embedding+positional_embeddings

        transformer_embedding = self.transformer_decoder(input_embedding,attention_masks)

        # back to batch-first before projecting each step to a scalar
        output = self.fc1(transformer_embedding.permute(1,0,2))

        return output

In [None]:
def train(train_dataset, test_dataset, t0, future,
          epochs=50, lr=.0005, batch_size=32, eval_batch_size=128,
          model_save_path='ConvTransformer_nologsparse.pth'):
    """Train a ``TransformerTimeSeries`` and checkpoint the best epoch by R_p.

    Args:
        train_dataset: dataset wrapped in a shuffled training ``DataLoader``
        test_dataset: dataset wrapped in the evaluation ``DataLoader``
        t0: number of conditioning points, forwarded to ``train_epoch`` and
            ``test_epoch``
        future: number of points to forecast, forwarded to ``test_epoch``
        epochs: number of training epochs
        lr: Adam learning rate
        batch_size: training batch size
        eval_batch_size: evaluation batch size
        model_save_path: file the best checkpoint is written to

    Returns:
        ``(model, train_epoch_loss, eval_epoch_rp)``: the trained model, the
        per-epoch mean training loss and the per-epoch R_p values.

    NOTE(review): ``train_epoch`` and ``test_epoch`` are defined outside this
    cell; this assumes the former returns the epoch's training loss and the
    latter returns the scalar R_p metric - confirm against their definitions.
    """
    train_dl = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_dl = DataLoader(test_dataset, batch_size=eval_batch_size)

    model = TransformerTimeSeries()

    opt = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = torch.nn.MSELoss()

    train_epoch_loss = []
    eval_epoch_rp = []
    # Start from +inf so the first finite R_p always produces a checkpoint.
    Rp_best = float('inf')
    for epoch in range(epochs):
        epoch_train_loss = np.mean(train_epoch(model, train_dl, opt, criterion, t0))

        Rp = test_epoch(model, test_dl, t0, future)

        if Rp_best > Rp:
            Rp_best = Rp
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': opt.state_dict(),
                'loss': Rp,
            }, model_save_path)

        train_epoch_loss.append(epoch_train_loss)
        eval_epoch_rp.append(Rp)

        # No validation loss is computed anywhere (only R_p), so it is not
        # reported; the old code printed the mean of an empty list (nan).
        print("Epoch {}: Train loss: {} \t R_p={}".format(epoch, epoch_train_loss, Rp))

        print("Rp best={}".format(Rp_best))

    return model, train_epoch_loss, eval_epoch_rp
没有太多需要特别说明的地方。原作者只是写了一份用论文中的构造数据做复现的代码，所以工程性较差，很多地方是写死的或者没有抽象出来，想直接拿来用的同学需要注意修改。另外原代码也没有 prediction 相关的函数，我这边写了一个，仅供参考：

prediction
def prediction(model, dl, t0, future):
    """Forecast every series in ``dl``, plot each one and return the R_p score.

    Load the trained weights into ``model`` before calling this.

    Args:
        model: callable as ``model(x, y, mask)`` returning one value per time
            step with a trailing dimension of size 1
        dl: iterable of ``(x, y, attention_masks)`` batches to predict on
        t0: number of leading time points used as history
        future: number of time points to predict. For example, to use the
            first five days of a week to predict the last two days at hourly
            granularity: t0 = 5 * 24 = 120, future = 48

    Returns:
        ``2 * sum(numerators) / sum(denominators)`` over all series, using
        ``Rp_num_den`` with quantile 0.5; ``nan`` when the denominator is zero
        (for instance when ``dl`` is empty).
    """
    histories = []
    predictions = []
    observations = []

    # Disable dropout for inference, then restore whatever mode the caller had.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for x, y, attention_masks in dl:
                # x: (batch_size, total_ts_length)
                # y: (batch_size, total_ts_length)
                # output: (batch_size, total_ts_length, 1)
                # The batched masks are identical, so the first one is used.
                output = model(x, y, attention_masks[0])

                # Drop only the trailing feature dimension; a bare squeeze()
                # would also drop the batch dimension when batch_size == 1.
                # The output at step t is the forecast for step t + 1, hence
                # the window shifted back by one.
                batch_preds = output.squeeze(-1)[:, (t0 - 1):(t0 + future - 1)]

                # Accumulate per-series rows from every batch so history,
                # prediction and observation stay aligned.
                histories.extend(y[:, :t0].cpu().numpy().tolist())
                predictions.extend(batch_preds.cpu().numpy().tolist())
                observations.extend(y[:, t0:t0 + future].cpu().numpy().tolist())
    finally:
        model.train(was_training)

    num = 0
    den = 0
    for hist, y_preds, y_trues in zip(histories, predictions, observations):
        plot_result(hist, y_preds, y_trues, t0)
        num_i, den_i = Rp_num_den(y_preds, y_trues, .5)
        num += num_i
        den += den_i

    if den == 0:
        return float('nan')
    return (2 * num) / den

In [None]:
def plot_result(history, yhat, ytruth, t0):
    """Plot a forecast against the ground truth, both prefixed with the history.

    Args:
        history: observed values preceding the forecast window
        yhat: predicted values for the forecast window
        ytruth: true values for the forecast window
        t0: x position of the dashed vertical line marking the forecast start
    """
    # Prefix both curves with the history
    predicted = history + yhat
    actual = history + ytruth
    steps = range(len(actual))

    predicted = np.round(predicted, 2)
    actual = np.round(actual, 2)

    # Draw the curves
    plt.figure(facecolor='w')
    plt.plot(steps, actual, 'green', linewidth=1.5, label='ground truth')
    plt.plot(steps, predicted, 'blue', alpha=0.8, linewidth=1.2, label='predict value')

    # Dashed line marking where the forecast starts
    line_bottom = predicted.min() * 0.99
    line_top = predicted.max() * 1.01
    plt.vlines(t0, line_bottom, line_top,
               alpha=0.7, colors="r", linestyles="dashed")
    # plt.text(0.15, 0.01, error_message, size=10, alpha=0.9, transform=plt.gca().transAxes)  # relative position, empirically chosen

    plt.legend(loc='best')  # legend placement
    plt.grid(True)
    plt.show()