In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
import gc
import math

In [None]:
# ---------------------------------------------------------------------------
# Notebook configuration: everything a reader might want to tune lives here.
# ---------------------------------------------------------------------------

# Reproducibility: the notebook draws random numbers in several places
# (ProbSparse key sampling, shuffled DataLoader batches, weight init), so the
# global generators are seeded once up front.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Windowing
FORECAST_HORIZON = 28   # steps predicted per window (passed to the model as out_len)
LOOKBACK_WINDOW = 56    # history steps fed to the encoder (passed as seq_len)

# Optimisation
BATCH_SIZE = 32  # Small batch for memory efficiency
EPOCHS = 30
LEARNING_RATE = 0.0001

# Model size
D_MODEL = 64  # Model dimension (smaller for 8GB RAM)
N_HEADS = 4
E_LAYERS = 2  # Encoder layers
D_LAYERS = 1  # Decoder layers
D_FF = 128  # Feedforward dimension
DROPOUT = 0.1

# Grouping used when aggregating the raw sales rows into series;
# one of 'store_category', 'store' or 'category'.
AGGREGATION_LEVEL = 'store_category'

# Use the GPU when one is visible to PyTorch, otherwise fall back to CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
class ProbAttention(nn.Module):
    """ProbSparse self-attention (Informer's key innovation).

    Only the ``u`` most "active" queries (ranked by a sparsity score computed
    on a random sample of keys) receive a real softmax attention output; every
    other query position gets a cheap averaged-value fallback.

    Args:
        mask_flag: When True, self-attention is causal: a query at position
            ``p`` only attends to keys at positions ``<= p`` and the fallback
            for non-selected queries is the running mean of the values up to
            ``p``. When False, attention is unmasked and the fallback is the
            mean over all values.
        factor: Multiplier applied to ``ceil(ln(L))`` to get the number of
            sampled keys and of selected queries.
        scale: Optional score scale; defaults to ``1 / sqrt(D)``.
        attention_dropout: Dropout probability applied to attention weights.
    """
    def __init__(self, mask_flag=True, factor=5, scale=None, attention_dropout=0.1):
        super().__init__()
        self.factor = factor
        self.scale = scale
        self.mask_flag = mask_flag
        self.dropout = nn.Dropout(attention_dropout)

    def _budget(self, length):
        """Return ``factor * ceil(ln(length))`` clamped to ``[1, length]``.

        The lower clamp matters for ``length == 1`` where ``ln(1) == 0`` would
        otherwise give an empty sample and an empty top-k.
        """
        return min(length, max(1, self.factor * math.ceil(math.log(length))))

    def _prob_QK(self, Q, K, sample_k, n_top):
        """Pick the ``n_top`` queries with the largest sparsity measurement.

        Args:
            Q: Queries, shape (B, H, L_Q, E).
            K: Keys, shape (B, H, L_K, E).
            sample_k: Number of keys randomly sampled for every query.
            n_top: Number of query positions to return.

        Returns:
            Long tensor of shape (B, H, n_top) with the selected query
            positions (unsorted).
        """
        B, H, L_K, E = K.shape
        _, _, L_Q, _ = Q.shape

        # Sample `sample_k` key positions per query position (shared across
        # batch and heads). Indices live on the same device as the keys.
        K_expand = K.unsqueeze(-3).expand(B, H, L_Q, L_K, E)
        index_sample = torch.randint(L_K, (L_Q, sample_k), device=K.device)
        query_rows = torch.arange(L_Q, device=K.device).unsqueeze(1)
        K_sample = K_expand[:, :, query_rows, index_sample, :]

        # Dot product of every query with its own sampled keys: (B, H, L_Q, sample_k)
        Q_K_sample = torch.matmul(Q.unsqueeze(-2), K_sample.transpose(-2, -1)).squeeze(-2)

        # Sparsity measurement: max score minus (sum of sampled scores / L_K).
        M = Q_K_sample.max(-1)[0] - torch.div(Q_K_sample.sum(-1), L_K)
        return M.topk(n_top, sorted=False)[1]

    def forward(self, queries, keys, values):
        """Compute attention.

        Args:
            queries: Tensor of shape (B, L_Q, H, D).
            keys: Tensor of shape (B, L_K, H, D).
            values: Tensor of shape (B, L_K, H, D_v).

        Returns:
            Contiguous tensor of shape (B, L_Q, H, D_v).
        """
        B, L_Q, H, D = queries.shape
        _, L_K, _, _ = keys.shape

        queries = queries.transpose(1, 2)  # (B, H, L_Q, D)
        keys = keys.transpose(1, 2)        # (B, H, L_K, D)
        values = values.transpose(1, 2)    # (B, H, L_K, D_v)

        scale = self.scale or 1. / math.sqrt(D)

        # Differing lengths are treated as cross-attention and use ordinary
        # full softmax attention. No causal mask is applied here: positions
        # of the two sequences are not comparable.
        if L_Q != L_K:
            scores = torch.matmul(queries, keys.transpose(-2, -1)) * scale
            attn = self.dropout(F.softmax(scores, dim=-1))
            context = torch.matmul(attn, values)
            return context.transpose(1, 2).contiguous()

        # Self-attention: ProbSparse selection of the active queries.
        U_part = self._budget(L_K)  # keys sampled per query
        u = self._budget(L_Q)       # queries that get real attention
        top_idx = self._prob_QK(queries, keys, sample_k=U_part, n_top=u)  # (B, H, u)

        # Broadcastable batch/head indices so all (batch, head) pairs are
        # gathered and scattered in one shot instead of a Python double loop.
        b_idx = torch.arange(B, device=queries.device)[:, None, None]
        h_idx = torch.arange(H, device=queries.device)[None, :, None]

        selected_Q = queries[b_idx, h_idx, top_idx, :]                       # (B, H, u, D)
        scores = torch.matmul(selected_Q, keys.transpose(-2, -1)) * scale   # (B, H, u, L_K)

        if self.mask_flag:
            # Causal mask: hide keys that lie after the query's own position.
            key_pos = torch.arange(L_K, device=scores.device).view(1, 1, 1, L_K)
            scores = scores.masked_fill(key_pos > top_idx.unsqueeze(-1), float('-inf'))
            # Fallback for non-selected queries: running mean of the values
            # seen so far (the causal counterpart of the global mean below).
            steps = torch.arange(
                1, L_K + 1, device=values.device, dtype=values.dtype
            ).view(1, 1, L_K, 1)
            context = values.cumsum(dim=-2) / steps
        else:
            # Fallback for non-selected queries: mean over all values.
            context = values.mean(dim=2, keepdim=True).expand(-1, -1, L_Q, -1).clone()

        attn = self.dropout(F.softmax(scores, dim=-1))
        # Overwrite the fallback rows of the selected queries with real attention.
        context[b_idx, h_idx, top_idx, :] = torch.matmul(attn, values).to(context.dtype)

        return context.transpose(1, 2).contiguous()

In [None]:
class AttentionLayer(nn.Module):
    """Multi-head wrapper around an inner attention module.

    Projects queries/keys/values with linear layers, splits the projected
    features into ``n_heads`` heads, delegates to ``attention`` and maps the
    concatenated heads back to ``d_model``.

    Args:
        attention: Module called as ``attention(q, k, v)`` with tensors shaped
            (B, length, n_heads, head_dim); its output is reshaped to
            (B, L, -1).
        d_model: Width of the incoming and outgoing features.
        n_heads: Number of attention heads.
        d_keys: Per-head query/key width; defaults to ``d_model // n_heads``.
        d_values: Per-head value width; defaults to ``d_model // n_heads``.
    """
    def __init__(self, attention, d_model, n_heads, d_keys=None, d_values=None):
        super().__init__()
        default_head_dim = d_model // n_heads
        if not d_keys:
            d_keys = default_head_dim
        if not d_values:
            d_values = default_head_dim

        qk_width = d_keys * n_heads
        v_width = d_values * n_heads

        self.inner_attention = attention
        self.query_projection = nn.Linear(d_model, qk_width)
        self.key_projection = nn.Linear(d_model, qk_width)
        self.value_projection = nn.Linear(d_model, v_width)
        self.out_projection = nn.Linear(v_width, d_model)
        self.n_heads = n_heads

    def forward(self, queries, keys, values):
        """Return the attended features, shape (B, L, d_model).

        ``queries`` is (B, L, d_model); ``keys`` is (B, S, d_model) and
        ``values`` is reshaped using the same S.
        """
        batch, q_len, _ = queries.shape
        _, kv_len, _ = keys.shape
        heads = self.n_heads

        # Project, then split the feature axis into (heads, head_dim).
        q_heads = self.query_projection(queries).view(batch, q_len, heads, -1)
        k_heads = self.key_projection(keys).view(batch, kv_len, heads, -1)
        v_heads = self.value_projection(values).view(batch, kv_len, heads, -1)

        attended = self.inner_attention(q_heads, k_heads, v_heads)

        # Merge the heads back into a single feature axis before projecting out.
        merged = attended.view(batch, q_len, -1)
        return self.out_projection(merged)

In [None]:
class EncoderLayer(nn.Module):
    """Informer encoder block: self-attention followed by a position-wise
    feed-forward network, each wrapped in a residual connection + LayerNorm.

    Args:
        attention: Module called as ``attention(x, x, x)``.
        d_model: Feature width of the block's input and output.
        d_ff: Hidden width of the feed-forward part; defaults to ``4 * d_model``.
        dropout: Dropout probability shared by all dropout sites in the block.
    """
    def __init__(self, attention, d_model, d_ff=None, dropout=0.1):
        super().__init__()
        if not d_ff:
            d_ff = 4 * d_model
        self.attention = attention
        # kernel_size=1 convolutions act as per-position linear layers.
        self.conv1 = nn.Conv1d(in_channels=d_model, out_channels=d_ff, kernel_size=1)
        self.conv2 = nn.Conv1d(in_channels=d_ff, out_channels=d_model, kernel_size=1)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Apply the block to ``x`` of shape (B, L, d_model)."""
        # Attention sub-layer with residual connection.
        attended = self.attention(x, x, x)
        x = self.norm1(x + self.dropout(attended))

        # Feed-forward sub-layer; Conv1d wants channels on axis 1, hence the
        # transpose in and back out.
        hidden = self.conv1(x.transpose(-1, 1))
        hidden = self.dropout(F.relu(hidden))
        ff_out = self.dropout(self.conv2(hidden).transpose(-1, 1))

        return self.norm2(x + ff_out)

In [None]:
class DecoderLayer(nn.Module):
    """Informer decoder block: self-attention, cross-attention over the
    encoder output, then a position-wise feed-forward network. Every
    sub-layer is followed by a residual connection and LayerNorm.

    Args:
        self_attention: Module called as ``self_attention(x, x, x)``.
        cross_attention: Module called as ``cross_attention(x, cross, cross)``.
        d_model: Feature width of the block's input and output.
        d_ff: Hidden width of the feed-forward part; defaults to ``4 * d_model``.
        dropout: Dropout probability shared by all dropout sites in the block.
    """
    def __init__(self, self_attention, cross_attention, d_model, d_ff=None, dropout=0.1):
        super().__init__()
        if not d_ff:
            d_ff = 4 * d_model
        self.self_attention = self_attention
        self.cross_attention = cross_attention
        # kernel_size=1 convolutions act as per-position linear layers.
        self.conv1 = nn.Conv1d(in_channels=d_model, out_channels=d_ff, kernel_size=1)
        self.conv2 = nn.Conv1d(in_channels=d_ff, out_channels=d_model, kernel_size=1)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, cross):
        """Apply the block to decoder states ``x`` using encoder output ``cross``."""
        # 1) Self-attention over the decoder sequence.
        self_out = self.self_attention(x, x, x)
        x = self.norm1(x + self.dropout(self_out))

        # 2) Cross-attention: decoder states query the encoder output.
        cross_out = self.cross_attention(x, cross, cross)
        x = self.norm2(x + self.dropout(cross_out))

        # 3) Feed-forward; Conv1d wants channels on axis 1, hence the
        #    transpose in and back out.
        hidden = self.conv1(x.transpose(-1, 1))
        hidden = self.dropout(F.relu(hidden))
        ff_out = self.dropout(self.conv2(hidden).transpose(-1, 1))

        return self.norm3(x + ff_out)

In [None]:
class Encoder(nn.Module):
    """Informer encoder: a sequential stack of layers with an optional final norm.

    Args:
        attn_layers: Iterable of modules, each called as ``layer(x)``.
        norm_layer: Optional module applied to the output of the last layer.
    """
    def __init__(self, attn_layers, norm_layer=None):
        super().__init__()
        self.attn_layers = nn.ModuleList(attn_layers)
        self.norm = norm_layer

    def forward(self, x):
        """Run ``x`` through every layer in order, then through ``norm`` if set."""
        hidden = x
        for block in self.attn_layers:
            hidden = block(hidden)

        return hidden if self.norm is None else self.norm(hidden)

In [None]:
class Decoder(nn.Module):
    """Informer decoder: a sequential stack of layers with an optional final norm.

    Args:
        layers: Iterable of modules, each called as ``layer(x, cross)``.
        norm_layer: Optional module applied to the output of the last layer.
    """
    def __init__(self, layers, norm_layer=None):
        super().__init__()
        self.layers = nn.ModuleList(layers)
        self.norm = norm_layer

    def forward(self, x, cross):
        """Run ``x`` through every layer (each also receives ``cross``), then
        through ``norm`` if set."""
        hidden = x
        for block in self.layers:
            hidden = block(hidden, cross)

        return hidden if self.norm is None else self.norm(hidden)

In [None]:
class Informer(nn.Module):
    """Complete Informer encoder-decoder model.

    Args:
        enc_in: Number of input features per encoder time step.
        dec_in: Number of input features per decoder time step.
        c_out: Number of output features per predicted time step.
        seq_len: Encoder sequence length (size of its positional table).
        label_len: Number of known steps at the start of the decoder input.
        out_len: Number of steps returned by ``forward``.
        d_model: Hidden width used throughout the model.
        n_heads: Attention heads per attention layer.
        e_layers: Number of encoder layers.
        d_layers: Number of decoder layers.
        d_ff: Feed-forward hidden width inside each layer.
        dropout: Dropout probability for attention and layer dropout.
        attn, embed, freq, activation: Accepted but not used by this
            implementation.
    """
    def __init__(self, enc_in, dec_in, c_out, seq_len, label_len, out_len,
                 d_model=512, n_heads=8, e_layers=2, d_layers=1, d_ff=512,
                 dropout=0.0, attn='prob', embed='fixed', freq='h', activation='gelu'):
        super().__init__()
        self.pred_len = out_len
        self.label_len = label_len
        self.seq_len = seq_len

        def build_attention(mask_flag):
            # One projected ProbAttention layer; `mask_flag` is forwarded as-is.
            return AttentionLayer(
                ProbAttention(mask_flag, attention_dropout=dropout),
                d_model, n_heads
            )

        # Value embeddings: plain linear maps from feature space to d_model.
        self.enc_embedding = nn.Linear(enc_in, d_model)
        self.dec_embedding = nn.Linear(dec_in, d_model)

        # Learned positional tables, one per stream because the encoder and
        # decoder sequences have different lengths.
        decoder_steps = label_len + out_len
        self.enc_position_encoding = nn.Parameter(torch.randn(1, seq_len, d_model))
        self.dec_position_encoding = nn.Parameter(torch.randn(1, decoder_steps, d_model))

        # Encoder stack.
        encoder_blocks = []
        for _ in range(e_layers):
            encoder_blocks.append(
                EncoderLayer(build_attention(False), d_model, d_ff, dropout=dropout)
            )
        self.encoder = Encoder(encoder_blocks, norm_layer=nn.LayerNorm(d_model))

        # Decoder stack: self-attention is built with mask_flag=True,
        # cross-attention with mask_flag=False.
        decoder_blocks = []
        for _ in range(d_layers):
            self_attention = build_attention(True)
            cross_attention = build_attention(False)
            decoder_blocks.append(
                DecoderLayer(self_attention, cross_attention, d_model, d_ff, dropout=dropout)
            )
        self.decoder = Decoder(decoder_blocks, norm_layer=nn.LayerNorm(d_model))

        # Output head.
        self.projection = nn.Linear(d_model, c_out, bias=True)

    def forward(self, x_enc, x_dec):
        """Forecast the last ``pred_len`` steps of the decoder sequence.

        Args:
            x_enc: Encoder input, (B, L_enc, enc_in); L_enc is used to slice
                the encoder positional table.
            x_dec: Decoder input, (B, L_dec, dec_in); L_dec is used to slice
                the decoder positional table.

        Returns:
            Tensor of shape (B, pred_len, c_out).
        """
        # Encoder stream: embed, add positions, encode.
        enc_steps = x_enc.size(1)
        memory = self.enc_embedding(x_enc) + self.enc_position_encoding[:, :enc_steps, :]
        memory = self.encoder(memory)

        # Decoder stream: embed, add positions, decode against the encoder output.
        dec_steps = x_dec.size(1)
        hidden = self.dec_embedding(x_dec) + self.dec_position_encoding[:, :dec_steps, :]
        hidden = self.decoder(hidden, memory)
        forecast = self.projection(hidden)

        # Only the trailing pred_len positions are the actual forecast.
        return forecast[:, -self.pred_len:, :]

In [None]:
class InformerDataset(Dataset):
    """Sliding-window dataset for the Informer encoder-decoder structure.

    Every window ``i`` yields:
      * encoder input  ``data[i : i + seq_len]``
      * decoder input  the last ``label_len`` encoder rows followed by
        ``pred_len`` rows of zeros (placeholders for the forecast)
      * target         ``data[i + seq_len : i + seq_len + pred_len]``
        (only when ``train`` is True)

    Args:
        data: Array-like of shape (T, F), or (T,) which is treated as a
            single-feature series of shape (T, 1).
        seq_len: Encoder window length.
        label_len: Number of trailing encoder rows copied to the decoder
            input; must satisfy ``0 <= label_len <= seq_len``.
        pred_len: Number of future rows to predict.
        train: When True, items include the target tensor.

    Raises:
        ValueError: If ``data`` is not 1-D/2-D, a length is negative, or
            ``label_len`` is outside ``[0, seq_len]``.
    """
    def __init__(self, data, seq_len, label_len, pred_len, train=True):
        data = np.asarray(data)
        if data.ndim == 1:
            data = data.reshape(-1, 1)
        if data.ndim != 2:
            raise ValueError(f"data must be 1-D or 2-D, got {data.ndim} dimensions")
        if seq_len < 0 or pred_len < 0:
            raise ValueError("seq_len and pred_len must be non-negative")
        if not 0 <= label_len <= seq_len:
            # A larger label_len would make the label slice start before the
            # window (negative index) and silently return wrong rows.
            raise ValueError(
                f"label_len must be between 0 and seq_len ({seq_len}), got {label_len}"
            )

        self.data = data
        self.seq_len = seq_len
        self.label_len = label_len
        self.pred_len = pred_len
        self.train = train

        # NOTE(review): a window always needs seq_len + pred_len rows, even
        # with train=False where no target is produced. Confirm this is the
        # intended behaviour for inference-time use.
        n_windows = max(len(data) - (seq_len + pred_len) + 1, 0)
        starts = np.arange(n_windows)[:, None]  # (n_windows, 1), broadcast below

        # Encoder input: historical data, shape (n_windows, seq_len, F).
        self.X_enc = data[starts + np.arange(seq_len)].astype(np.float32)

        # Decoder input: last label_len rows of the encoder window + zeros.
        label_part = data[starts + np.arange(seq_len - label_len, seq_len)].astype(np.float32)
        placeholder = np.zeros((n_windows, pred_len, data.shape[1]), dtype=np.float32)
        self.X_dec = np.concatenate([label_part, placeholder], axis=1)

        # Target: the pred_len rows that follow the encoder window.
        if train:
            self.y = data[starts + np.arange(seq_len, seq_len + pred_len)].astype(np.float32)
        else:
            self.y = []

    def __len__(self):
        """Number of windows."""
        return len(self.X_enc)

    def __getitem__(self, idx):
        """Return ``(x_enc, x_dec, y)`` when training, else ``(x_enc, x_dec)``."""
        x_enc = torch.tensor(self.X_enc[idx])
        x_dec = torch.tensor(self.X_dec[idx])
        if self.train:
            return (x_enc, x_dec, torch.tensor(self.y[idx]))
        return (x_enc, x_dec)

In [None]:
# Read the two input tables from the working directory.
sales = pd.read_csv('sales_train_validation.csv')
calendar = pd.read_csv('calendar.csv')

# Columns holding the daily values are the ones whose name starts with 'd_'.
day_cols = sales.columns[sales.columns.str.startswith('d_')].tolist()

In [None]:
def create_aggregated_series(sales_df, level, value_cols=None):
    """Sum the value columns of ``sales_df`` per group and label each group.

    Args:
        sales_df: DataFrame containing the grouping columns (``store_id``
            and/or ``cat_id``) and the value columns.
        level: One of ``'store_category'`` (group by store and category),
            ``'store'`` or ``'category'``.
        value_cols: Columns to sum. Defaults to the notebook-level
            ``day_cols`` list so existing two-argument calls keep working.

    Returns:
        DataFrame with one row per group: the grouping columns, the summed
        value columns and a ``series_id`` column. For multi-column groupings
        ``series_id`` is the group keys joined with ``'_'``; for a single
        grouping column it is a copy of that column.

    Raises:
        ValueError: If ``level`` is not one of the supported names.
    """
    group_cols_by_level = {
        'store_category': ['store_id', 'cat_id'],
        'store': ['store_id'],
        'category': ['cat_id'],
    }
    if level not in group_cols_by_level:
        raise ValueError(f"Unknown level: {level}")
    group_cols = group_cols_by_level[level]

    if value_cols is None:
        value_cols = day_cols  # notebook-level default
    value_cols = list(value_cols)

    agg_sales = sales_df.groupby(group_cols)[value_cols].sum().reset_index()

    if len(group_cols) > 1:
        # Cast to str first so non-string keys (e.g. integer ids) can be joined.
        keys = agg_sales[group_cols].astype(str)
        series_id = keys[group_cols[0]]
        for col in group_cols[1:]:
            series_id = series_id + '_' + keys[col]
        agg_sales['series_id'] = series_id
    else:
        agg_sales['series_id'] = agg_sales[group_cols[0]]

    return agg_sales

# Aggregate the raw rows at the configured level; each resulting row is one
# series, so the row count is the number of series to model.
agg_sales = create_aggregated_series(sales_df=sales, level=AGGREGATION_LEVEL)
n_series = agg_sales.shape[0]

In [None]:
# Train one Informer per aggregated series and score it on the final
# FORECAST_HORIZON days, which are held out from training.
models = {}           # series_id -> {'model': trained model, 'scaler': fitted scaler}
series_results = []   # one dict per series: metrics, predictions and actuals

LABEL_LEN = FORECAST_HORIZON // 2  # Half of pred_len for label_len

# Rows spanned by one training window (encoder history + forecast target).
window_span = LOOKBACK_WINDOW + FORECAST_HORIZON

for position, (_, row) in enumerate(agg_sales.iterrows(), start=1):
    series_id = row['series_id']
    print(f"\n   [{position}/{n_series}] Training {series_id}...")

    # Get time series
    ts_data = row[day_cols].to_numpy(dtype=np.float32)

    # Split train/hold-out and make sure at least one full batch of training
    # windows exists before doing any further work.
    n_train = len(ts_data) - FORECAST_HORIZON
    if n_train - window_span + 1 < BATCH_SIZE:
        print(f"      Warning: Not enough data for {series_id}, skipping...")
        continue

    # Normalize. The scaler is fit on the training portion only so that the
    # statistics of the hold-out days do not leak into training.
    scaler = StandardScaler()
    train_data = scaler.fit_transform(ts_data[:n_train].reshape(-1, 1))

    # Create datasets
    train_dataset = InformerDataset(
        train_data, LOOKBACK_WINDOW, LABEL_LEN, FORECAST_HORIZON, train=True
    )
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

    # Initialize model
    model = Informer(
        enc_in=1, dec_in=1, c_out=1,
        seq_len=LOOKBACK_WINDOW,
        label_len=LABEL_LEN,
        out_len=FORECAST_HORIZON,
        d_model=D_MODEL,
        n_heads=N_HEADS,
        e_layers=E_LAYERS,
        d_layers=D_LAYERS,
        d_ff=D_FF,
        dropout=DROPOUT
    ).to(device)

    # Loss and optimizer
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

    # Training loop
    model.train()
    best_loss = float('inf')
    patience = 5
    patience_counter = 0

    for epoch in range(EPOCHS):
        epoch_loss = 0
        for batch_x_enc, batch_x_dec, batch_y in train_loader:
            batch_x_enc = batch_x_enc.to(device)
            batch_x_dec = batch_x_dec.to(device)
            batch_y = batch_y.to(device)

            optimizer.zero_grad()
            outputs = model(batch_x_enc, batch_x_dec)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()

        avg_loss = epoch_loss / len(train_loader)

        if (epoch + 1) % 5 == 0:
            print(f"      Epoch {epoch+1}/{EPOCHS}, Loss: {avg_loss:.4f}")

        # Early stopping. NOTE(review): this monitors the *training* loss and
        # keeps the last weights rather than the best ones.
        if avg_loss < best_loss:
            best_loss = avg_loss
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print(f"      Early stopping at epoch {epoch+1}")
                break

    # Validation prediction
    model.eval()
    with torch.no_grad():
        # Encoder sees the last LOOKBACK_WINDOW training rows; the decoder
        # gets the last LABEL_LEN rows followed by zero placeholders.
        val_enc = torch.tensor(
            train_data[-LOOKBACK_WINDOW:], dtype=torch.float32
        ).unsqueeze(0).to(device)
        val_dec = torch.tensor(
            np.concatenate([
                train_data[-LABEL_LEN:],
                np.zeros((FORECAST_HORIZON, 1))
            ], axis=0),
            dtype=torch.float32
        ).unsqueeze(0).to(device)

        prediction_scaled = model(val_enc, val_dec).cpu().numpy().squeeze()

    # Denormalize and clip at zero
    prediction = scaler.inverse_transform(prediction_scaled.reshape(-1, 1)).flatten()
    prediction = np.maximum(prediction, 0)

    # Get actual values
    actual = ts_data[-FORECAST_HORIZON:]

    # Calculate metrics
    rmse = np.sqrt(mean_squared_error(actual, prediction))
    mae = mean_absolute_error(actual, prediction)

    print(f"      ✓ RMSE: {rmse:.2f}, MAE: {mae:.2f}")

    # Store results
    models[series_id] = {'model': model, 'scaler': scaler}
    series_results.append({
        'series_id': series_id,
        'rmse': rmse,
        'mae': mae,
        'predictions': prediction,
        'actuals': actual
    })

    # Clear memory. NOTE(review): `models` still references the model, so
    # only the dataset/loader (and cached allocator blocks) are released here.
    del model, train_dataset, train_loader
    torch.cuda.empty_cache()
    gc.collect()