# A Time Series is Worth 64 Words: Long-term Forecasting with Transformers.

In [1]:
import os
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
from transformers import PatchTSTForPrediction, PatchTSTConfig

%matplotlib inline

In [2]:
from src.data.make_dataset import get_train_test_datasets

In [3]:
DIR_PATH = os.getcwd()

data_path = os.path.join(DIR_PATH, 'dataset')

## Lecture des données et création des datasets

In [4]:
from torch.utils.data import DataLoader

In [22]:
window_size = 512
sliding_window = 16
forcasting_horizon = 96
batch_size = 64

train_set, test_set = get_train_test_datasets(
    data_path+'/ETTm1.csv', 
    test_size=0.2, 
    window_size=window_size,
    sliding_size=sliding_window,
    forcasting_horizon=forcasting_horizon, 
    kind='sliding_window'
    )

train_loader = DataLoader(train_set, batch_size=10, shuffle=True)
test_loader = DataLoader(test_set, batch_size=10, shuffle=True)

num_channels = next(iter(train_loader))[0].shape[1]

## Initialisation du modèle

In [23]:
device = torch.device('mps' if torch.backends.mps.is_available() else 'cpu')

In [24]:
# Initializing an PatchTST configuration with 12 time steps for prediction
configuration = PatchTSTConfig(
    num_input_channels=num_channels,
    context_length=window_size,
    patch_stride=sliding_window,
    prediction_length=forcasting_horizon,
    random_mask_ratio=0.4,
    d_model=128,
    num_attention_heads=16,
    num_hidden_layers=3,
    ffn_dim=256,
    dropout=0.2,
    head_dropout=0.2,
    pooling_type=None,
    channel_attention=False,
    scaling="std",
    loss="mse",
    pre_norm=True,
    norm_type="batchnorm"
)

# Randomly initializing a model (with random weights) from the configuration
model = PatchTSTForPrediction(configuration)

# Accessing the model configuration
configuration = model.config

configuration

PatchTSTConfig {
  "_attn_implementation_autoset": true,
  "activation_function": "gelu",
  "attention_dropout": 0.0,
  "bias": true,
  "channel_attention": false,
  "channel_consistent_masking": false,
  "context_length": 512,
  "d_model": 128,
  "distribution_output": "student_t",
  "do_mask_input": null,
  "dropout": 0.2,
  "ff_dropout": 0.0,
  "ffn_dim": 256,
  "head_dropout": 0.2,
  "init_std": 0.02,
  "loss": "mse",
  "mask_type": "random",
  "mask_value": 0,
  "model_type": "patchtst",
  "norm_eps": 1e-05,
  "norm_type": "batchnorm",
  "num_attention_heads": 16,
  "num_forecast_mask_patches": [
    2
  ],
  "num_hidden_layers": 3,
  "num_input_channels": 7,
  "num_parallel_samples": 100,
  "num_targets": 1,
  "output_range": null,
  "patch_length": 1,
  "patch_stride": 16,
  "path_dropout": 0.0,
  "pooling_type": null,
  "positional_dropout": 0.0,
  "positional_encoding_type": "sincos",
  "pre_norm": true,
  "prediction_length": 96,
  "random_mask_ratio": 0.4,
  "scaling": "std"

## training

In [27]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.MSELoss()

num_epochs = 10

In [30]:
from tqdm import tqdm

def train(model, optimizer, criterion, batch_size, num_epochs, train_set, device):
    model.to(device)
    model.train()
    train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
    for epoch in range(num_epochs):
        train_loss = 0
        num_samples = 0
        dloader_train = tqdm(train_loader, unit="batches")
        for i, (x, y) in enumerate(dloader_train, 1):
            x, y = x.to(device), y.to(device)
            optimizer.zero_grad()
            y_hat = model(x.permute(0,2,1))["prediction_outputs"]
            loss = criterion(y_hat, y.permute(0,2,1))
            loss.backward()
            optimizer.step()
            train_loss += loss.item() * len(x)
            num_samples += len(x)

            desc = "TRAIN : Epoch [{}/{}] - loss : {:.3f} ".format(epoch+1, num_epochs, train_loss/num_samples/y_hat.shape[2])
            dloader_train.set_description(desc)

In [31]:
train(model, optimizer, criterion, batch_size, num_epochs, train_set, device)

TRAIN : Epoch [1/10] - loss : 0.618 : 100%|██████████| 54/54 [00:04<00:00, 12.03batches/s]
TRAIN : Epoch [2/10] - loss : 0.608 : 100%|██████████| 54/54 [00:04<00:00, 13.30batches/s]
TRAIN : Epoch [3/10] - loss : 0.602 : 100%|██████████| 54/54 [00:04<00:00, 13.30batches/s]
TRAIN : Epoch [4/10] - loss : 0.597 : 100%|██████████| 54/54 [00:04<00:00, 13.31batches/s]
TRAIN : Epoch [5/10] - loss : 0.589 : 100%|██████████| 54/54 [00:04<00:00, 13.29batches/s]
TRAIN : Epoch [6/10] - loss : 0.586 : 100%|██████████| 54/54 [00:04<00:00, 13.22batches/s]
TRAIN : Epoch [7/10] - loss : 0.576 : 100%|██████████| 54/54 [00:04<00:00, 13.33batches/s]
TRAIN : Epoch [8/10] - loss : 0.574 : 100%|██████████| 54/54 [00:04<00:00, 13.34batches/s]
TRAIN : Epoch [9/10] - loss : 0.573 : 100%|██████████| 54/54 [00:04<00:00, 13.32batches/s]
TRAIN : Epoch [10/10] - loss : 0.560 : 100%|██████████| 54/54 [00:04<00:00, 13.34batches/s]
