In [1]:
import math
import torch
import torch.nn as nn


In [None]:
# Positional Encoding module adds a positional signal to each input
class PositionalEncoding(nn.Module):
    """Add a fixed sine/cosine positional signal to a batch-first sequence, then apply dropout.

    The table is computed once in ``__init__`` and registered as a buffer named
    ``pe`` (not a parameter) with shape (1, max_len, d_model).
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        """
        d_model: dimension of the model (e.g., 64); odd values are supported
        dropout: dropout rate applied after adding positional encodings
        max_len: maximum sequence length to support
        """
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        # Table of encodings, one row per position: shape (max_len, d_model)
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        # One frequency per sin/cos column pair: ceil(d_model / 2) entries
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term  # shape (max_len, ceil(d_model / 2))

        # Sine fills the even columns, cosine the odd columns.
        pe[:, 0::2] = torch.sin(angles)
        # When d_model is odd there is one fewer odd column than even column,
        # so the last frequency has no cosine partner and must be dropped;
        # assigning the full-width tensor would raise a shape-mismatch error.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # Add a batch dimension: shape becomes (1, max_len, d_model)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """
        x: Tensor of shape (batch_size, seq_length, d_model)
        Adds positional encoding to the input tensor and applies dropout.

        Raises ValueError if seq_length exceeds the max_len the table was built for.
        """
        seq_len = x.size(1)
        max_len = self.pe.size(1)
        # Fail early with a clear message: slicing past the table would otherwise
        # give an opaque broadcasting error, or (when max_len == 1) silently
        # reuse position 0 for every time step.
        if seq_len > max_len:
            raise ValueError(
                f"Sequence length {seq_len} exceeds the maximum supported length {max_len}"
            )
        x = x + self.pe[:, :seq_len]
        return self.dropout(x)

# Transformer-based Time Series Model
class TransformerTimeSeries(nn.Module):
    """Encoder-only transformer that predicts one scalar from a window of past values.

    Pipeline: linear embedding -> positional encoding -> stack of batch-first
    transformer encoder layers -> linear head applied to the final time step.
    """

    def __init__(self, input_size=1, d_model=64, nhead=8, num_layers=3,
                 dim_feedforward=128, dropout=0.1, seq_length=5):
        """Build the embedding, positional encoder, encoder stack and output head.

        input_size: number of features per time step (1 for a univariate series)
        d_model: width of the embedding used throughout the transformer
        nhead: number of attention heads in each encoder layer
        num_layers: how many encoder layers are stacked
        dim_feedforward: hidden width of the feedforward sub-network in each layer
        dropout: dropout rate shared by the positional encoder and the encoder layers
        seq_length: number of past time steps per sample; stored on the instance
                    but not otherwise used by this class
        """
        super().__init__()
        self.seq_length = seq_length
        self.d_model = d_model

        # Embedding: lift each time step from input_size features to d_model features.
        self.input_projection = nn.Linear(input_size, d_model)

        # Position signal added on top of the embeddings (default max_len is used).
        self.pos_encoder = PositionalEncoding(d_model, dropout)

        # Encoder stack: one layer definition handed to nn.TransformerEncoder
        # together with the requested depth.
        layer_template = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            batch_first=True,
        )
        self.transformer_encoder = nn.TransformerEncoder(layer_template, num_layers=num_layers)

        # Regression head: d_model features -> a single predicted value.
        self.fc_out = nn.Linear(d_model, 1)

    def forward(self, x):
        """Predict one value per sequence in the batch.

        x: tensor of shape (batch_size, seq_length, input_size)

        Shapes along the way:
          embedding + positional encoding -> (batch_size, seq_length, d_model)
          transformer encoder             -> (batch_size, seq_length, d_model)
          final time step                 -> (batch_size, d_model)
          output head                     -> (batch_size, 1)
        """
        # Embed the raw inputs and inject position information.
        embedded = self.pos_encoder(self.input_projection(x))
        # Run the encoder stack over the whole sequence.
        encoded = self.transformer_encoder(embedded)
        # Keep only the representation of the final time step.
        last_step = encoded[:, -1]
        # Project that representation down to the scalar prediction.
        return self.fc_out(last_step)


In [6]:

# Smoke test: push one random batch through a freshly built model and look at the output shape.
# 10 sequences per batch, 5 time steps per sequence, 1 feature per time step.
batch_size, seq_length, input_size = 10, 5, 1
dummy_input = torch.randn((batch_size, seq_length, input_size))

# Build the model with the same feature count and window length as the dummy batch.
model = TransformerTimeSeries(input_size=input_size, seq_length=seq_length)

# Calling the module runs its forward pass; the result should have shape (batch_size, 1).
prediction = model(dummy_input)
print("Prediction shape:", prediction.shape)  # Should output: torch.Size([10, 1])

TypeError: TransformerEncoder.__init__() got an unexpected keyword argument 'batch_first'