# Imports

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from datetime import datetime

# Hyperparameters

In [None]:
# Seed both RNGs up front so that weight initialisation and batch shuffling are repeatable.
torch.manual_seed(1)
np.random.seed(1)

# ---- Transformer-only settings ----
model_dim = 512          # embedding width D used throughout the encoder/decoder
num_heads = 8            # attention heads H
forward_expansion = 4    # feed-forward hidden width = forward_expansion * model_dim
n_enc = 15               # size of the encoder position-embedding table (input sequences have seq_len steps)
n_dec = 1                # size of the decoder position-embedding table
output_dim = 1           # one predicted value (sales) per decoder step
# NOTE: the transformer depth is not configured here; the main loop passes num_layers=6 directly.

# ---- LSTM-only settings ----
num_layers = 3
num_features = 3         # sales, onpromotion, dcoilwtico
hidden_dim = 128

# ---- Data settings ----
seq_len = 15             # days of history in each input window
batch_size = 32
num_epochs = 10

# ---- Optimisation settings ----
lr = 1e-3
decay = 0
factor = 0.1
dropout = 0.1
optimizer_name = 'Adam'
scheduler_name = 'MultiStepLR'

# Prefer the GPU whenever one is visible to PyTorch.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [None]:
def make_optimizer(optimizer_name, model, **kwargs):
    '''Builds the optimizer selected by name for the parameters of model

    Args:
        optimizer_name: Either 'Adam' or 'SGD'
        model: Module whose parameters() will be optimized
        **kwargs: Optimizer settings
            lr: learning rate (required)
            weight_decay: L2 penalty, honoured by both optimizers (default 0)
            momentum: SGD momentum (default 0, ignored by Adam)

    Returns:
        optimizer: The constructed torch.optim optimizer

    Raises:
        ValueError: If optimizer_name is not one of the supported names
        KeyError: If lr is not supplied
    '''
    # weight_decay used to be forwarded to SGD only, so asking for decay with
    # Adam silently did nothing; both optimizers now receive it.
    weight_decay = kwargs.get('weight_decay', 0)

    if optimizer_name == 'Adam':
        return torch.optim.Adam(
            model.parameters(),
            lr=kwargs['lr'],
            weight_decay=weight_decay,
        )

    if optimizer_name == 'SGD':
        return torch.optim.SGD(
            model.parameters(),
            lr=kwargs['lr'],
            momentum=kwargs.get('momentum', 0),
            weight_decay=weight_decay,
        )

    raise ValueError('Not valid optimizer name')
    
def make_scheduler(scheduler_name, optimizer, **kwargs):
    '''Builds the learning-rate scheduler selected by name

    Args:
        scheduler_name: Only 'MultiStepLR' is supported
        optimizer: Optimizer whose learning rate will be scheduled
        **kwargs: Scheduler settings
            milestones: epoch indices at which the learning rate is scaled
            factor: multiplicative factor applied at each milestone (passed as gamma)

    Returns:
        scheduler: The constructed torch.optim.lr_scheduler object

    Raises:
        ValueError: If scheduler_name is not a supported name
    '''
    # Guard clause: reject unknown names before touching any keyword argument.
    if scheduler_name != 'MultiStepLR':
        raise ValueError('Not valid scheduler name')

    return torch.optim.lr_scheduler.MultiStepLR(
        optimizer,
        milestones=kwargs['milestones'],
        gamma=kwargs['factor'],
    )

# Data Preprocessing

## Load & Merge Datasets

In [None]:
oil_df = pd.read_csv('data/oil.csv')
test_df = pd.read_csv('data/test.csv')
train_df = pd.read_csv('data/train.csv')

# set date entries to DateTimes
for frame in (train_df, test_df, oil_df):
    frame['date'] = pd.to_datetime(frame['date'])

# Put the oil price on a gap-free daily calendar BEFORE merging.
# Interpolating after the merge walks row by row through the many
# (store_nbr, family) rows that share one date, so rows belonging to the same
# missing day would receive different prices. Interpolating the daily series
# first gives every row of a given date the same value.
first_day = min(frame['date'].min() for frame in (train_df, test_df, oil_df))
last_day = max(frame['date'].max() for frame in (train_df, test_df, oil_df))
oil_df = (
    oil_df.drop_duplicates(subset='date')  # reindex requires unique dates
    .set_index('date')
    .reindex(pd.date_range(start=first_day, end=last_day))
)
oil_df['dcoilwtico'] = oil_df['dcoilwtico'].interpolate(limit_direction='both')
oil_df = oil_df.rename_axis('date').reset_index()

# merge oil df with train and test df based on date values
# (every train/test date is now present in oil_df, so no NaNs are introduced)
train_df = train_df.merge(oil_df, on='date', how='left')
test_df = test_df.merge(oil_df, on='date', how='left')

train_df = train_df.set_index('id')
test_df = test_df.set_index('id')

In [None]:
# Sorted unique values of the two categorical columns; the main loop trains one
# model per (store_nbr, family) pair drawn from these lists.
store_nbrs, families = (
    sorted(train_df[column].unique()) for column in ('store_nbr', 'family')
)

## Helper Functions for Subset Data Creation

In [None]:
def create_subset(store_nbr, family, df):
    '''Creates the daily, gap-free subset of df for one (store_nbr, family) pair

    Args:
        store_nbr: Current store_nbr
        family: Current family
        df: Original training or testing dataframe; must contain the columns
            'date', 'store_nbr', 'family' and 'dcoilwtico'

    Returns:
        dates: Series of the dates actually present in df for this pair
            (taken before missing days are filled in)
        subset_df: DataFrame indexed by every calendar day between the first and
            last date of df, with 'store_nbr' and 'family' removed. Missing days
            hold 0 in every column except 'dcoilwtico', which is interpolated.
            A 'sales' column of zeros is added first when df has none.
    '''

    # Work on an explicit copy: the boolean filter returns a slice of df and we
    # are about to add a column to it.
    subset_df = df[(df['store_nbr'] == store_nbr) & (df['family'] == family)].copy()
    dates = subset_df['date']

    # adds sales column in case using testing data
    if 'sales' not in subset_df:
        subset_df.insert(loc=0, column='sales', value=0.0)

    # The calendar spans the whole input frame, not just this pair, so every
    # subset ends up with the same number of rows. Reading min/max from the
    # column avoids re-indexing the full frame on every call.
    date_range = pd.date_range(start=df['date'].min(), end=df['date'].max())
    subset_df = subset_df.set_index('date').reindex(date_range)

    # The pair is fixed, so these columns carry no information for the model.
    subset_df = subset_df.drop(columns=['store_nbr', 'family'])

    # Fill non-'dcoilwtico' columns with 0. Assign the result back instead of
    # calling fillna(inplace=True) on a column selection, which does not modify
    # the frame when pandas Copy-on-Write is active.
    for col in subset_df.columns:
        if col != 'dcoilwtico':
            subset_df[col] = subset_df[col].fillna(0)

    # Interpolate 'dcoilwtico' column
    subset_df['dcoilwtico'] = subset_df['dcoilwtico'].interpolate(limit_direction='both')

    # dates = (num_entries, ) values are dates
    # subset_df = (num_days, num_features) cols are continuous features: sales, onpromotion, dcoilwtico
    return dates, subset_df

In [None]:
def create_sequences(df, window_len=None):
    '''Creates sliding-window sequence data from a DataFrame of consecutive time steps

    Args:
        df: Subset DataFrame for current (store_nbr, family) combination,
            one row per time step
        window_len: Number of time steps per sequence; when None the
            notebook-wide seq_len hyperparameter is used

    Returns:
        X: float32 numpy array of all of the sequences made from df
            shape = (num_sequences, window_len, num_features)
            where num_sequences = max(len(df) - window_len, 0)
        y: float32 numpy array of the 'sales' value from the row right after
            each sequence ends, shape = (num_sequences, ); empty when df has
            no 'sales' column

    Raises:
        ValueError: If the window length is not positive
    '''
    window = seq_len if window_len is None else int(window_len)
    if window <= 0:
        raise ValueError('window length must be a positive integer')

    # Convert once and slice the array; indexing the DataFrame row by row is far slower.
    values = df.to_numpy()
    num_sequences = max(len(df) - window, 0)

    if num_sequences:
        X = np.stack(
            [values[start:start + window] for start in range(num_sequences)]
        ).astype(np.float32)
    else:
        # Frame is not longer than one window: return a correctly-shaped empty
        # array instead of letting np.stack fail on an empty list.
        X = np.empty((0, window, values.shape[1]), dtype=np.float32)

    if 'sales' in df:
        # The target of the window starting at i is row i + window.
        y = df['sales'].to_numpy()[window:].astype(np.float32)
    else:
        y = np.empty((0,), dtype=np.float32)

    return X, y

# Model Building

## RMSLE Loss Function Implementation

In [None]:
class RMSLELoss(nn.Module):
    '''Root Mean Squared Logarithmic Error: sqrt(mean((log1p(pred) - log1p(actual))^2))'''

    def __init__(self, eps=1e-12):
        '''
        Args:
            eps: Small constant added under the square root. The derivative of
                sqrt is infinite at 0, so without it a perfect batch
                (predicted == actual) yields NaN gradients.
        '''
        super().__init__()
        self.eps = eps

    def forward(self, predicted, actual):
        """Computes the Root Mean Squared Logarithmic Error

        Args:
            predicted: The predicted values in tensor form
            actual: The actual target values in tensor form

        Returns:
            loss: The RMSLE loss as a scalar tensor
        """
        # Same number of elements but different layout, e.g. (B, 1) vs (B, ):
        # align them explicitly. Left alone, the subtraction below would
        # broadcast to a (B, B) matrix and compare every prediction with every
        # target.
        if predicted.shape != actual.shape and predicted.numel() == actual.numel():
            actual = actual.reshape(predicted.shape)

        # log1p is defined for every value >= 0, so only negatives need clamping
        predicted = torch.clamp(predicted, min=0)
        actual = torch.clamp(actual, min=0)

        # Calculate the squared logarithmic error (log1p(x) == log(x + 1), but
        # more accurate for small x)
        log_error = torch.log1p(predicted) - torch.log1p(actual)
        mean_squared_log_error = torch.mean(torch.square(log_error))

        # Return the square root of the mean squared log error
        loss = torch.sqrt(mean_squared_log_error + self.eps)

        return loss

## Transformer Architecture

In [None]:
class SelfAttention(nn.Module):
    '''Multi-head scaled dot-product attention

    Shapes use B = batch, N = sequence length, D = model_dim, H = num_heads.
    '''

    def __init__(self, model_dim, num_heads):
        '''
        Args:
            model_dim: Embedding width D of the inputs and of the output
            num_heads: Number of attention heads H; must divide model_dim

        Raises:
            ValueError: If model_dim is not divisible by num_heads
        '''
        super().__init__()

        if model_dim % num_heads != 0:
            raise ValueError('model_dim must be divisible by num_heads')

        self.model_dim = model_dim # D
        self.num_heads = num_heads # H
        self.head_dim = model_dim // num_heads # D/H

        self.values = nn.Linear(in_features=self.model_dim, out_features=self.model_dim, bias=False)
        self.keys = nn.Linear(in_features=self.model_dim, out_features=self.model_dim, bias=False)
        self.queries = nn.Linear(in_features=self.model_dim, out_features=self.model_dim, bias=False)

        self.fc_out = nn.Linear(self.num_heads*self.head_dim, self.model_dim)

    def _split_heads(self, x):
        '''(B, N, D) -> (B, H, N, D/H) without mixing sequence positions'''
        batch_size, seq_len_x, _ = x.shape
        # Cut the feature axis into H chunks first, THEN move the head axis
        # forward. Reshaping straight to (B, H, N, D/H) would reinterpret memory
        # and put features of different time steps into the same "position".
        return x.reshape(batch_size, seq_len_x, self.num_heads, self.head_dim).transpose(1, 2)

    def forward(self, values, keys, query, mask=None):
        '''Computes multi-head attention

        Args:
            values: (B, N_kv, D)
            keys: (B, N_kv, D)
            query: (B, N_q, D)
            mask: Optional tensor broadcastable to (B, H, N_q, N_kv); positions
                where mask == 0 are excluded from attention

        Returns:
            out: (B, N_q, D)
        '''
        batch_size, query_len = query.shape[0], query.shape[1]

        # Project, then split into heads
        # For encoder: v, k, q = (B, H, N_enc, D/H)
        # For masked mha: v, k, q = (B, H, N_dec, D/H)
        # For enc-dec mha: v, k = (B, H, N_enc, D/H), q = (B, H, N_dec, D/H)
        values = self._split_heads(self.values(values))
        keys = self._split_heads(self.keys(keys))
        # Use the PROJECTED queries; splitting the raw `query` here would leave
        # self.queries unused and untrained.
        queries = self._split_heads(self.queries(query))

        # energy = (Q @ K.T) / sqrt(d_k), with d_k the per-head width
        # energy = (B, H, N_q, N_kv)
        energy = torch.einsum('bhqd,bhkd->bhqk', queries, keys) / (self.head_dim ** 0.5)

        if mask is not None:
            energy = energy.masked_fill(mask == 0, float('-1e20'))

        # Normalize over the key axis
        attention = torch.softmax(energy, dim=-1)

        # Weighted sum of values per head = (B, H, N_q, D/H); bring the head
        # axis back next to the feature axis before concatenating the heads
        # into (B, N_q, D)
        context = torch.einsum('bhqk,bhkd->bhqd', attention, values)
        context = context.transpose(1, 2).reshape(
            batch_size, query_len, self.num_heads*self.head_dim
        )

        # Final linear layer, out = (B, N_q, D)
        out = self.fc_out(context)

        return out
    
class EncoderBlock(nn.Module):
    '''One transformer block: multi-head attention and a feed-forward network,
    each followed by a residual add, LayerNorm and dropout'''

    def __init__(self, model_dim, num_heads, forward_expansion, dropout):
        '''
        Args:
            model_dim: Embedding width D
            num_heads: Number of attention heads
            forward_expansion: Feed-forward hidden width is forward_expansion * model_dim
            dropout: Dropout probability applied after each normalization
        '''
        super().__init__()

        hidden_width = forward_expansion * model_dim

        self.attention = SelfAttention(model_dim=model_dim, num_heads=num_heads)
        # both norms act on the last axis (D)
        self.norm1 = nn.LayerNorm(normalized_shape=model_dim)
        self.norm2 = nn.LayerNorm(normalized_shape=model_dim)
        # D -> forward_expansion*D -> D
        self.feed_forward = nn.Sequential(
            nn.Linear(in_features=model_dim, out_features=hidden_width),
            nn.ReLU(),
            nn.Linear(in_features=hidden_width, out_features=model_dim),
        )
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, value, key, query):
        '''
        Args:
            value: Tensor handed to the attention layer as values
            key: Tensor handed to the attention layer as keys
            query: Tensor handed to the attention layer as queries; it is also
                the residual input, so the result has its shape

        Returns:
            out: Tensor with the same shape as query
        '''
        # Sub-layer 1: attention, skip connection from the query, norm, dropout
        attended = self.attention(value, key, query)
        normed_attention = self.norm1(attended + query)
        hidden = self.dropout(normed_attention)

        # Sub-layer 2: position-wise feed-forward, skip connection, norm, dropout
        transformed = self.feed_forward(hidden)
        normed_output = self.norm2(transformed + hidden)
        return self.dropout(normed_output)
    
class Encoder(nn.Module):
    '''Projects the raw feature sequence to model_dim, adds learned position
    embeddings and runs a stack of EncoderBlocks'''

    def __init__(
            self,
            model_dim,
            num_layers,
            num_heads,
            device,
            forward_expansion,
            dropout,
            n_enc,
            num_features
    ):
        '''
        Args:
            model_dim: Embedding width D
            num_layers: Number of stacked EncoderBlocks
            num_heads: Attention heads per block
            device: Stored on the module as self.device; forward() follows the
                device of its input instead
            forward_expansion: Feed-forward width multiplier inside each block
            dropout: Dropout probability
            n_enc: Number of learned positions, i.e. the longest input sequence
                the encoder can embed
            num_features: Number of features per time step in the input
        '''
        super().__init__()

        self.device = device
        self.linear = nn.Linear(in_features=num_features, out_features=model_dim)
        self.position_embedding = nn.Embedding(num_embeddings=n_enc, embedding_dim=model_dim)

        self.layers = nn.ModuleList(
            [
                EncoderBlock(
                    model_dim=model_dim,
                    num_heads=num_heads,
                    forward_expansion=forward_expansion,
                    dropout=dropout
                )
                for _ in range(num_layers)
            ]
        )

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        '''
        Args:
            x: Input sequences, (B, N_enc, num_features) with N_enc <= n_enc

        Returns:
            out: Output of the last encoder block, (B, N_enc, D)
        '''
        N_enc = x.shape[1]

        # Build the position indices on the input's own device. Using the device
        # handed to the constructor breaks as soon as the module is moved
        # elsewhere (or was constructed with a device that does not match).
        # positions = (N_enc); its embedding (N_enc, D) broadcasts over the batch
        positions = torch.arange(N_enc, dtype=torch.long, device=x.device)

        # input w/ linear = (B, N_enc, D), embedded positions = (N_enc, D)
        out = self.dropout(self.linear(x) + self.position_embedding(positions))

        # each encoder layer block, out = (B, N_enc, D)
        for layer in self.layers:
            # value, key, query in encoder = out, out, out
            out = layer(out, out, out)

        return out

class DecoderBlock(nn.Module):
    '''Masked self-attention over the decoder input followed by an EncoderBlock
    that attends over the encoder output'''

    def __init__(self, model_dim, num_heads, forward_expansion, dropout):
        '''
        Args:
            model_dim: Embedding width D
            num_heads: Number of attention heads
            forward_expansion: Feed-forward width multiplier of the inner EncoderBlock
            dropout: Dropout probability
        '''
        super().__init__()

        self.attention = SelfAttention(model_dim=model_dim, num_heads=num_heads)
        self.norm = nn.LayerNorm(normalized_shape=model_dim)
        # Cross-attention + add & norm + feed-forward + add & norm has exactly
        # the structure of an EncoderBlock, so one is reused for that half.
        self.encoder_block = EncoderBlock(
            model_dim=model_dim,
            num_heads=num_heads,
            forward_expansion=forward_expansion,
            dropout=dropout,
        )
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x, value, key, mask):
        '''
        Args:
            x: Decoder-side input, (B, N_dec, D)
            value: Values for the encoder-decoder attention (the encoder output)
            key: Keys for the encoder-decoder attention (the encoder output)
            mask: Mask applied to the self-attention over x

        Returns:
            out: (B, N_dec, D)
        '''
        # Stage 1: masked self-attention, residual add, norm, dropout
        self_attended = self.attention(x, x, x, mask)
        decoder_query = self.dropout(self.norm(self_attended + x))

        # Stage 2: the decoder state queries the encoder output
        return self.encoder_block(value, key, decoder_query)

class Decoder(nn.Module):
    '''Embeds the decoder input, runs a stack of DecoderBlocks against the
    encoder output and projects the result to output_dim'''

    def __init__(
            self,
            output_dim,
            model_dim,
            num_layers,
            num_heads,
            forward_expansion,
            dropout,
            n_dec,
            device
    ):
        '''
        Args:
            output_dim: Width of each decoder input step and of each output step
            model_dim: Embedding width D
            num_layers: Number of stacked DecoderBlocks
            num_heads: Attention heads per block
            forward_expansion: Feed-forward width multiplier inside each block
            dropout: Dropout probability
            n_dec: Number of learned positions, i.e. the longest decoder sequence
            device: Stored on the module as self.device; forward() follows the
                device of its input instead
        '''
        super().__init__()

        self.device = device
        # The projection acts on the FEATURE axis of the decoder input, whose
        # width is output_dim. It used to be sized by n_dec (the sequence
        # length), which only matched because both happen to be 1.
        self.linear = nn.Linear(in_features=output_dim, out_features=model_dim)
        self.position_embedding = nn.Embedding(num_embeddings=n_dec, embedding_dim=model_dim)

        self.layers = nn.ModuleList(
            [
                DecoderBlock(
                    model_dim=model_dim,
                    num_heads=num_heads,
                    forward_expansion=forward_expansion,
                    dropout=dropout
                )
                for _ in range(num_layers)
            ]
        )

        self.fc_out = nn.Linear(in_features=model_dim, out_features=output_dim)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x, enc_out, mask):
        '''
        Args:
            x: Decoder input, (B, N_dec, output_dim) with N_dec <= n_dec
            enc_out: Encoder output, (B, N_enc, D)
            mask: Self-attention mask for the decoder positions

        Returns:
            out: (B, N_dec, output_dim)
        '''
        N_dec = x.shape[1]

        # Position indices live on the input's device so the module keeps
        # working after being moved with .to(...)
        # positions = (N_dec); its embedding (N_dec, D) broadcasts over the batch
        positions = torch.arange(N_dec, dtype=torch.long, device=x.device)

        # input w/ linear = (B, N_dec, D), embedded positions = (N_dec, D)
        out = self.dropout(self.linear(x) + self.position_embedding(positions))

        for layer in self.layers:
            # queries = input to decoder block, or output from prev decoder block
            # keys, values = both are enc_out, that's why there's two, not a typo
            # out = (B, N_dec, D)
            out = layer(out, enc_out, enc_out, mask)

        # go from out = (B, N_dec, D) -> (B, N_dec, output_dim)
        out = self.fc_out(out)

        return out

class Transformer(nn.Module):
    '''Encoder-decoder transformer that maps a feature sequence plus a decoder
    input to non-negative predictions'''

    def __init__(
            self,
            model_dim=512,
            num_layers=6,
            forward_expansion=4,
            num_heads=8,
            dropout=0,
            device='cuda',
            n_enc=15,
            n_dec=1,
            num_features=3,
            output_dim=1
    ):
        '''
        Args:
            model_dim: Embedding width D
            num_layers: Number of blocks in the encoder and in the decoder
            forward_expansion: Feed-forward width multiplier
            num_heads: Attention heads per block
            dropout: Dropout probability
            device: Passed to the encoder/decoder and stored as self.device
            n_enc: Longest encoder sequence supported
            n_dec: Longest decoder sequence supported
            num_features: Features per encoder time step
            output_dim: Width of each decoder input/output step
        '''
        super().__init__()

        self.encoder = Encoder(
            model_dim=model_dim,
            num_layers=num_layers,
            num_heads=num_heads,
            device=device,
            forward_expansion=forward_expansion,
            dropout=dropout,
            n_enc=n_enc,
            num_features=num_features
        )

        self.decoder = Decoder(
            output_dim=output_dim,
            model_dim=model_dim,
            num_layers=num_layers,
            num_heads=num_heads,
            forward_expansion=forward_expansion,
            dropout=dropout,
            n_dec=n_dec,
            device=device
        )

        self.device = device

        # Final affine map + ReLU keeps the prediction non-negative
        self.fc_out = nn.Sequential(
            nn.Linear(output_dim, output_dim),
            nn.ReLU()
        )

    def make_mask(self, label):
        '''Builds the causal (lower-triangular) mask for the decoder self-attention

        Args:
            label: Decoder input, (B, N_dec, output_dim)

        Returns:
            mask: (B, 1, N_dec, N_dec) tensor of ones on and below the diagonal,
                on the same device as label; the singleton axis broadcasts over heads
        '''
        batch_size, N_dec = label.shape[0], label.shape[1]

        # torch.tril zeroes out everything above the diagonal, so position i can
        # only attend to positions <= i. Created directly on label's device
        # rather than on the device named at construction time.
        mask = torch.tril(torch.ones((N_dec, N_dec), device=label.device)).expand(
            batch_size, 1, N_dec, N_dec
        )

        return mask

    def forward(self, input_seq, label):
        '''
        Args:
            input_seq: Encoder input, (B, N_enc, num_features)
            label: Decoder input, (B, N_dec, output_dim)

        Returns:
            out: (B, N_dec, output_dim), non-negative
        '''
        mask = self.make_mask(label=label)
        enc_out = self.encoder(input_seq)

        # decoder takes
        # label=decoder input, (B, N_dec, output_dim)
        # enc_out=output from encoder, (B, N_enc, D)
        # mask=mask, (B, 1, N_dec, N_dec)
        out = self.decoder(label, enc_out, mask)
        out = self.fc_out(out)
        # out = (B, N_dec, output_dim)
        return out

## LSTM Architecture

In [None]:
class LSTM(nn.Module):
    '''Stacked unidirectional LSTM followed by a Linear + ReLU head that turns
    the last time step into a single non-negative prediction'''

    def __init__(self, num_layers, num_features, hidden_dim, dropout):
        '''
        Args:
            num_layers: Number of stacked LSTM layers
            num_features: Number of features per time step in the input
            hidden_dim: Size of the hidden and cell states
            dropout: Dropout probability between LSTM layers
        '''
        super().__init__()

        self.num_layers = num_layers
        self.num_features = num_features
        self.hidden_dim = hidden_dim

        self.lstm = nn.LSTM(
            input_size=num_features,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout,
            bidirectional=False
        )

        self.fc = nn.Sequential(
            nn.Linear(
            in_features=hidden_dim,
            out_features=1
            ),
            nn.ReLU()
        )

    def forward(self, x, internal=None):
        '''Computes the model forward pass

        Args:
            x: The input sequence with shape (batch_size, seq_len, num_features)
            internal: The initial internal states of the LSTM as a tuple (h_0, c_0)
                both h_0 and c_0 have shape (num_layers, batch_size, hidden_size);
                when None, nn.LSTM starts from zero states

        Returns:
            sales_pred: Prediction for the step after the sequence, shape (batch_size, 1)
            internal: Final hidden state and cell state for each layer as a tuple (h_n, c_n)
                both h_n and c_n have shape (num_layers, batch_size, hidden_size)

        '''

        # 1.) LSTM Layer
        #    lstm_out = output features of the last LSTM layer for each time step
        #        shape (batch_size, seq_len, hidden_dim)
        #    internal = (h_n, c_n), both with shape (num_layers, batch_size, hidden_dim)
        lstm_out, internal = self.lstm(x, internal)

        # 2.) Keep only the final time step, shape (batch_size, hidden_dim)
        lstm_next = lstm_out[:, -1, :]

        # 3.) Linear + ReLU head, shape (batch_size, 1)
        sales_pred = self.fc(lstm_next)

        return sales_pred, internal

    def init_internal_states(self, curr_batch_size):
        '''Initializes hidden and cell states for the LSTM

        Args:
            curr_batch_size: The size of the current batch in case we have uneven batches

        Returns:
            internal = initial internal hidden and cell states for LSTM, tuple of (h_0, c_0)
                both h_0 and c_0 are zero tensors with shape
                (num_layers, curr_batch_size, hidden_dim)

        '''

        # Allocate the states next to the model's own weights (same device and
        # dtype) instead of relying on a module-level `device` variable, so the
        # class works wherever the module has been moved.
        reference = next(self.parameters())
        state_shape = (self.num_layers, curr_batch_size, self.hidden_dim)

        h_0 = reference.new_zeros(state_shape)
        c_0 = reference.new_zeros(state_shape)

        internal = (h_0, c_0)

        return internal

# Training Loop

In [None]:
def train(architecture, model, train_loader, num_epochs, criterion, optimizer, scheduler):
    '''Trains model for num_epochs passes over train_loader

    Args:
        architecture: 'transformer' or 'lstm'; selects how the model is called
        model: The network to train
        train_loader: Iterable of (inputs, sales_target) batches
            inputs = (B, seq_len, num_features), sales_target = (B, )
        num_epochs: Number of passes over train_loader
        criterion: Loss called as criterion(sales_pred, sales_target)
        optimizer: Optimizer stepped once per batch
        scheduler: Learning-rate scheduler stepped once per epoch

    Raises:
        ValueError: If architecture is not a supported name
    '''
    if architecture not in ('transformer', 'lstm'):
        raise ValueError(f'Unknown architecture: {architecture!r}')

    model.train()

    for epoch in range(1, num_epochs + 1):
        train_loss = 0.0
        num_batches = 0

        for inputs, sales_target in train_loader:
            optimizer.zero_grad()

            inputs = inputs.float().to(device) # inputs = (B, seq_len, num_features)
            sales_target = sales_target.float().to(device) # sales_target = (B, )

            if architecture == 'transformer':
                # The decoder gets an all-zero placeholder, the same input that
                # predict_sales feeds at inference. Passing the true target
                # here would let the network copy the answer from its own
                # input and then fail at test time, when the target is unknown.
                decoder_input = torch.zeros(
                    inputs.size(0), 1, 1, dtype=inputs.dtype, device=inputs.device
                )
                out = model(inputs, decoder_input)
                sales_pred = out[:, -1, :].squeeze(-1) # (B, )

            else:
                # fresh zero hidden/cell states sized for the current batch
                internal = model.init_internal_states(inputs.size(0))
                sales_pred, _ = model(inputs, internal)
                # (B, 1) -> (B, ). Without this the loss broadcasts prediction
                # and target against each other into a (B, B) matrix.
                sales_pred = sales_pred.squeeze(-1)

            loss = criterion(sales_pred, sales_target)
            loss.backward()

            optimizer.step()

            train_loss += loss.item()
            num_batches += 1

        # Report the mean batch loss once the epoch is complete
        if num_batches:
            progress = 100. * num_batches / len(train_loader)
            print(f'Train(Epoch {epoch})[{progress:.0f}%]: Loss: {train_loss / num_batches:.4f}')

        scheduler.step()


# Predicting Testing Values Loop

In [None]:
def predict_sales(architecture, model, store_nbr, family, current_seq, norm_values, subset_test_data, test_df):
    '''Forecasts sales one day at a time over the testing dates of one (store_nbr, family) pair

    Each prediction is written into subset_test_data and appended to the input
    window, so later days are predicted from earlier predictions.

    Args:
        architecture: 'transformer' or 'lstm'
        model: Trained model for this pair
        store_nbr: Current store_nbr
        family: Current family
        current_seq: Array of shape (seq_len, num_features) holding the most
            recent normalized days; used as the first input window
        norm_values: Dict mapping column name -> (min, max) used to undo the normalization
        subset_test_data: Normalized testing subset with one row per date and a
            'sales' column to fill in; it is modified in place. Its index is
            expected to be unnamed, since the code relies on reset_index()
            producing a column called 'index'
        test_df: Full testing DataFrame indexed by 'id'

    Returns:
        predicted_values: DataFrame with columns 'ID' and 'Sales' for this pair

    Raises:
        ValueError: If architecture is not a supported name
    '''
    if architecture not in ('transformer', 'lstm'):
        raise ValueError(f'Unknown architecture: {architecture!r}')

    # Predict sales values for testing dates
    model.eval()

    with torch.no_grad():
        # These inputs do not change between days, so build them once.
        if architecture == 'lstm':
            internal = model.init_internal_states(1)
        else:
            label = torch.zeros(1, 1, 1).float().to(device)

        for date in subset_test_data.index:
            # turns (seq_len, num_features) sequence into (1, seq_len, num_features) for the batch dimension
            current_seq_input = torch.as_tensor(
                current_seq, dtype=torch.float32
            ).unsqueeze(0).to(device)

            if architecture == 'transformer':
                out = model(current_seq_input, label)
                sales_pred = out[:, -1, :]
            else:
                sales_pred, _ = model(current_seq_input, internal)

            # Reduce to a plain Python float in both branches so a scalar, not
            # a 1-element array, is written into the DataFrame cell.
            sales_pred = sales_pred.reshape(-1)[0].item()

            # updates the testing date's sales number from 0 to the predicted value
            # takes that date's row as the "next day", a (num_features, ) np array
            subset_test_data.loc[date, 'sales'] = sales_pred
            next_day = subset_test_data.loc[date].values

            # shift the window one day forward: drop the oldest row, append the new one
            current_seq = np.vstack((current_seq, next_day))[1:]

    # Unnormalize the testing data
    for col in subset_test_data.columns:
        mn, mx = norm_values[col]
        subset_test_data[col] = subset_test_data[col] * (mx - mn) + mn

    # Create dataframe of ID mapped to predicted sales values for this store_nbr, family pair
    subset_test_data = subset_test_data.reset_index().rename(columns={'index': 'date'})
    filtered_test_df = test_df[(test_df['store_nbr'] == store_nbr) & (test_df['family'] == family)]
    merged_df = filtered_test_df.reset_index().merge(subset_test_data, on='date', how='left')

    predicted_values = merged_df[['id', 'sales']].copy()
    predicted_values.rename(columns={'id': 'ID', 'sales': 'Sales'}, inplace=True)

    return predicted_values

# Save Submission

In [None]:
def save_submission(architecture, all_preds):
    '''Combines the per-pair predictions and writes them to submission_<architecture>.csv

    Args:
        architecture: Name used in the output file name
        all_preds: List of DataFrames with columns 'ID' and 'Sales'
    '''
    submission = pd.concat(all_preds, ignore_index=True).sort_values(
        by='ID', ignore_index=True
    )

    # Show the combined table before switching to the lower-case submission headers
    print(submission)

    submission.columns = ['id', 'sales']
    output_path = f'submission_{architecture}.csv'
    submission.to_csv(output_path, index=False)

# Main Loop

In [None]:
architectures = ['transformer', 'lstm']

# For every architecture, train one independent model per (store_nbr, family)
# pair, forecast that pair's testing dates, and write a single submission file.
for architecture in architectures:
    all_preds = []

    for store_nbr in store_nbrs:
        for family in families:

            # Create subset of training & testing data and then normalize
            _, subset_train_df = create_subset(store_nbr=store_nbr, family=family, df=train_df)
            testing_dates, subset_test_data = create_subset(store_nbr=store_nbr, family=family, df=test_df)

            # Min-max normalize with statistics from the TRAINING subset only and
            # remember them so the predictions can be unnormalized later.
            # Constant columns (mx == mn) are left untouched to avoid dividing by zero.
            norm_values = {}
            for col in subset_train_df:
                mn, mx = subset_train_df[col].min(), subset_train_df[col].max()
                norm_values[col] = (mn, mx)
                if mx != mn:
                    subset_train_df[col] = (subset_train_df[col] - mn) / (mx - mn)
                    subset_test_data[col] = (subset_test_data[col] - mn) / (mx - mn)

            # Create sequences from training subset and make DataLoader
            X, y = create_sequences(df=subset_train_df)
            train_data = TensorDataset(torch.from_numpy(X), torch.from_numpy(y.astype('float32')))
            train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)

            # Make model
            if architecture == 'transformer':
                model = Transformer(
                    model_dim=model_dim,
                    num_layers=6,
                    forward_expansion=forward_expansion,
                    num_heads=num_heads,
                    dropout=dropout,
                    device=device,
                    n_enc=n_enc,
                    n_dec=n_dec,
                    num_features=num_features,
                    output_dim=output_dim
                ).to(device)

            elif architecture == 'lstm':
                model = LSTM(
                    num_layers=num_layers,
                    num_features=num_features,
                    hidden_dim=hidden_dim,
                    dropout=dropout
                ).to(device)

            print(model)
            criterion = RMSLELoss().to(device)
            # use the hyperparameters declared at the top instead of repeating literals
            optimizer = make_optimizer(optimizer_name, model, lr=lr, momentum=0, weight_decay=decay)
            scheduler = make_scheduler(scheduler_name, optimizer, milestones=[5], factor=factor)

            # Train
            train(
                architecture=architecture,
                model=model,
                train_loader=train_loader,
                num_epochs=num_epochs,
                criterion=criterion,
                optimizer=optimizer,
                scheduler=scheduler
            )

            # Seed the forecast with the LAST seq_len training days. X[-1] is not
            # that window: create_sequences only emits windows that still have a
            # following target day, so X[-1] stops one day before the end of the
            # training data and the first forecast would be a day late.
            last_window = subset_train_df.iloc[-seq_len:].to_numpy(dtype=np.float32)

            # Predict on Testing Data
            id_to_sales = predict_sales(
                architecture=architecture,
                model=model,
                store_nbr=store_nbr,
                family=family,
                current_seq=last_window,
                norm_values=norm_values,
                subset_test_data=subset_test_data,
                test_df=test_df
            )

            all_preds.append(id_to_sales)

    save_submission(architecture=architecture, all_preds=all_preds)