# Import & Setting

### Import

In [1]:
import os

import numpy as np
import pandas as pd
pd.set_option("display.max_columns", None)

import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from IPython.display import clear_output

import torch
from PIL import Image
from torchvision import transforms

import pytorch_forecasting as pf
from pytorch_forecasting.models.base_model import BaseModelWithCovariates

from transformers import SwinModel

# Use the GPU when one is visible to PyTorch; otherwise fall back to the CPU so that
# every later `.to(device)` call still works on a machine without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

  from .autonotebook import tqdm as notebook_tqdm


### Setting

In [2]:
# --- Sampling: how many best-selling articles to keep (None keeps every article)
num_samples = None

# --- Train/validation split: fraction of the articles that goes to the training split
train_test_split_rto = 0.1

# --- Windowing: encoder window length, forecast horizon, loader batch size
window_size, predict_length, batch_size = 30, 7, 32

# --- Transformer size
d_model, nhead, num_layers, d_ff = 128, 4, 4, 512
dropout = 0.3
# Larger alternative kept for reference:
#   d_model=512, dropout=0.3, nhead=8, num_layers=6, d_ff=2048

# Preprocess Data

### Read data

In [3]:
# Load the raw transaction log. article_id is read as a string because later cells
# slice its leading characters to build the image folder name.
df_raw = pd.read_csv(
    "HnM/transactions_train.csv",
    dtype={"article_id": str},
)
df_raw.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.050831,2
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.030492,2
2,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,0.015237,2
3,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687003,0.016932,2
4,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687004,0.016932,2


### Pre-sampling

In [4]:
df_presampling = df_raw.copy()
print(df_presampling["article_id"].nunique())

# Filter out no image articles.
# The image path and the on-disk check depend only on article_id, so resolve them once
# per unique article and map the result back, instead of touching the filesystem once
# per transaction row.
img_path_by_article = {
    article_id: f'./HnM/images/{article_id[:3]}/{article_id}.jpg'
    for article_id in df_presampling["article_id"].unique()
} # article_id -> expected image path
is_valid_by_article = {
    article_id: 1 if os.path.isfile(path) else 0
    for article_id, path in img_path_by_article.items()
} # article_id -> 1 when the image file exists, else 0
df_presampling["img_path"] = df_presampling["article_id"].map(img_path_by_article)
df_presampling["is_valid"] = df_presampling["article_id"].map(is_valid_by_article)
# .copy() so the column assignments below write to an independent frame, not to a filtered slice
df_presampling = df_presampling[df_presampling["is_valid"] == 1].copy()
print(df_presampling["article_id"].nunique())

# Product life: first and last transaction date of each article, and the span between them
df_presampling["t_dat"] = pd.to_datetime(df_presampling["t_dat"])
df_presampling["min_date"]= df_presampling.groupby("article_id")["t_dat"].transform("min")
df_presampling["max_date"]= df_presampling.groupby("article_id")["t_dat"].transform("max")
df_presampling["product_life_length"] = df_presampling["max_date"] - df_presampling["min_date"]
# The short-life filter is currently disabled; enabling it keeps only articles whose life is at least (window_size + predict_length) days
# df_presampling = df_presampling[df_presampling["product_life_length"].dt.days >= (window_size + predict_length)] # Product life length should be greater or equal than (window_size + predict_length)
print(df_presampling["article_id"].nunique())

df_presampling.head()

104547
104106
104106


Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,img_path,is_valid,min_date,max_date,product_life_length
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.050831,2,./HnM/images/066/0663713001.jpg,1,2018-09-20,2019-06-16,269 days
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.030492,2,./HnM/images/054/0541518023.jpg,1,2018-09-20,2019-12-27,463 days
2,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,0.015237,2,./HnM/images/050/0505221004.jpg,1,2018-09-20,2019-05-25,247 days
3,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687003,0.016932,2,./HnM/images/068/0685687003.jpg,1,2018-09-20,2020-02-08,506 days
4,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687004,0.016932,2,./HnM/images/068/0685687004.jpg,1,2018-09-20,2020-03-04,531 days


### Sample data

In [5]:
# Daily sales per article = number of transaction rows for that (article, day) pair
df_sampled = (
    df_presampling.copy()
    .groupby(["article_id", "t_dat"], as_index=False)
    .agg(sales=("price", "count"))
)

# Article ids ordered from the highest to the lowest total sales
sample_id_li = (
    df_sampled.groupby("article_id")
    .agg({"sales": "sum"})
    .sort_values("sales", ascending=False)
    .index
)
if num_samples:
    # Keep only the num_samples best sellers; a falsy num_samples keeps all of them
    sample_id_li = sample_id_li[:num_samples]

df_sampled = df_sampled.loc[df_sampled["article_id"].isin(sample_id_li)].reset_index(drop=True)
print(df_sampled.shape)
df_sampled.head()

(7418149, 3)


Unnamed: 0,article_id,t_dat,sales
0,108775015,2018-09-20,30
1,108775015,2018-09-21,48
2,108775015,2018-09-22,11
3,108775015,2018-09-23,26
4,108775015,2018-09-24,33


### Post-sampling

In [None]:
# Work on a copy so that df_sampled is left as it was when this cell is re-run
df_post = df_sampled.copy()

# Explode dates
def func(x):
    """Return one article's rows re-laid on a gap-free daily calendar.

    Every day between the article's first and last ``t_dat`` gets a row; days that
    had no row in ``x`` receive ``sales`` 0 and the article's id.
    """
    first_day, last_day = x["t_dat"].min(), x["t_dat"].max()
    calendar = pd.date_range(first_day, last_day, freq="d").to_frame(index=False, name="t_dat")
    # Right merge: the result follows the calendar, missing days come back as NaN
    filled = x.merge(calendar, how="right", on="t_dat").reset_index(drop=True)
    filled["article_id"] = filled["article_id"].unique()[0]
    filled["sales"] = filled["sales"].fillna(0)
    return filled
# Fill the calendar gaps of every article, then number each article's days from 0
df_post = (
    df_post
    .groupby("article_id", as_index=False)
    .apply(func)
    .reset_index(drop=True)
)
df_post["time_idx"] = df_post.groupby("article_id").cumcount()

### LabelEncode image path
# The encoder is kept so the integer label can be turned back into a path later on
imgpath_encoder = LabelEncoder()
df_post["img_path"] = imgpath_encoder.fit_transform(
    df_post["article_id"].map(lambda x: f'HnM/images/{x[:3]}/{x}.jpg')
)

# Generate additional features (calendar parts of the transaction date)
df_post = df_post.assign(
    year=df_post["t_dat"].dt.year,
    month=df_post["t_dat"].dt.month,
    day=df_post["t_dat"].dt.day,
)

# Train test split
# NOTE(review): this overwrites the num_samples setting when it was None/0.
num_samples = num_samples or df_post["article_id"].nunique()
num_train = int(np.round(train_test_split_rto * num_samples))
# sample_id_li is ordered by total sales (descending), so the training split is the
# first num_train best-selling articles
sample_id_li_train = sample_id_li[:num_train]

df_train = df_post.loc[df_post["article_id"].isin(sample_id_li_train)].reset_index(drop=True)
df_valid = df_post.loc[~df_post["article_id"].isin(sample_id_li_train)].reset_index(drop=True)
assert len(df_train) + len(df_valid) == len(df_post)

# Make Dataset

In [7]:
from pytorch_forecasting.data import GroupNormalizer, NaNLabelEncoder

# Windowed dataset: each sample is `window_size` encoder steps followed by
# `predict_length` decoder steps of one series (one series per encoded img_path).
# NOTE(review): the model's forward() reads encoder_cont[:, :, 0] as the image label and
# encoder_cont[:, :, 1] as sales, but with static_reals commented out below the only
# real-valued column declared here is "sales" — confirm which configuration is intended.
train_dataset = pf.TimeSeriesDataSet(
    data=df_train,
    time_idx="time_idx",
    target="sales",
    group_ids=["img_path"],
    # static_reals=["img_path"], # image is a static information which does not change by time
    min_encoder_length=window_size,
    max_encoder_length=window_size,
    # Fixed horizon: the shortest and the longest decoder are both predict_length.
    # (This used to be passed as min_prediction_idx, which is a time index, not a length.)
    min_prediction_length=predict_length,
    max_prediction_length=predict_length,
    time_varying_known_reals=["sales"],
    # target_normalizer=None,
    # scalers={"img_path":None}, # Since img_path is a set of labels encoded by external encoder, make it not to treat it as numbers
    categorical_encoders={"img_path":NaNLabelEncoder(add_nan=True)}
)
# NOTE(review): the validation set is built from df_post (all articles), not from df_valid — confirm this is intended.
valid_dataset = pf.TimeSeriesDataSet.from_dataset(train_dataset, df_post, predict=True, stop_randomization=True)

train_dataloader = train_dataset.to_dataloader(batch_size=batch_size, shuffle=True)
valid_dataloader = valid_dataset.to_dataloader(train=False, batch_size=batch_size, shuffle=True, drop_last=True)



In [8]:
import joblib

# Persist both loaders and the training dataset to the working directory;
# the next cells read these files back instead of rebuilding the objects.
joblib.dump(train_dataloader, "train_dataloader.pkl")
joblib.dump(valid_dataloader, "valid_dataloader.pkl")
joblib.dump(train_dataset, "train_dataset.pkl")

['train_dataset.pkl']

# Model

### Architecture

In [9]:
import joblib

# Reload the training dataset and both loaders from the .pkl files in the working directory.
# NOTE(review): joblib.load unpickles its input, which can run arbitrary code — only load
# files that this notebook produced.
train_dataset = joblib.load("train_dataset.pkl")
train_dataloader = joblib.load("train_dataloader.pkl")
valid_dataloader = joblib.load("valid_dataloader.pkl")

In [10]:
class PositionalEncoding(torch.nn.Module):
    """Add fixed sinusoidal position encodings to the input, then apply dropout.

    PE(pos, 2i)   = sin(pos/10000^{2i/d_model})
    PE(pos, 2i+1) = cos(pos/10000^{2i/d_model})

    Args:
        max_len: number of positions for which an encoding is precomputed.
        d_model: size of the last dimension of the inputs.
        dropout: dropout probability applied to the sum.
    """
    def __init__(self, max_len, d_model, dropout):
        super().__init__()
        self.dropout = torch.nn.Dropout(dropout)

        position = torch.arange(max_len, dtype=torch.float32).reshape(-1, 1)
        i = torch.arange(d_model) // 2 # pair index: columns 2i and 2i+1 share a frequency
        exp_term = 2 * i / d_model
        div_term = torch.pow(10000, exp_term).reshape(1, -1)
        pos_encoded = position / div_term # (max_len, d_model)

        pos_encoded[:, 0::2] = torch.sin(pos_encoded[:, 0::2])
        pos_encoded[:, 1::2] = torch.cos(pos_encoded[:, 1::2])

        # Registered as a buffer so the table follows the module through .to()/.cuda()
        # instead of being pinned to a notebook-level device; non-persistent so that
        # state_dict keys are unchanged.
        self.register_buffer("pos_encoded", pos_encoded, persistent=False)

    def forward(self, x):
        """Return dropout(x + PE) where x has the sequence on dim 1 and d_model on the last dim."""
        # .to() is a no-op when the table already lives on the input's device
        output = x + self.pos_encoded[:x.shape[1], :].to(x.device)
        return self.dropout(output)
    
class Mask(torch.nn.Module):
    """Build additive attention masks (0 = keep, -inf = block) for a batch of sequences.

    Both masks are created on the device of the tensor they are derived from.
    """
    def __init__(self):
        super().__init__()

    def get_padding_mask(self, arr):
        """Return a float mask shaped like ``arr``: -inf where ``arr`` equals 0, else 0."""
        res = torch.zeros(arr.shape, dtype=torch.float32, device=arr.device)
        return res.masked_fill(torch.eq(arr, 0), float("-inf"))

    def get_lookahead_mask(self, arr):
        """Return a (seq_len, seq_len) float mask that blocks attention to later positions.

        ``seq_len`` is ``arr.shape[1]``. Entries strictly above the diagonal are -inf and
        the rest are 0. (They were previously scaled by -1e-9, which blocks nothing.)
        """
        seq_len = arr.shape[1]
        blocked = torch.full((seq_len, seq_len), float("-inf"), device=arr.device)
        return torch.triu(blocked, diagonal=1)

    def forward(self, arr):
        """Return ``(padding_mask, lookahead_mask)`` for ``arr``."""
        padding_mask = self.get_padding_mask(arr)
        lookahead_mask = self.get_lookahead_mask(arr)
        return padding_mask, lookahead_mask

In [11]:
class MultimodalTransformer(torch.nn.Module):
    """Forecast a fixed horizon from a sales window and a product image.

    The sales window is embedded and passed through a Transformer encoder. The image is
    turned into a sequence of tokens by ``swin_transformer``; those tokens attend to the
    encoded window with cross attention, and the result is flattened and projected to
    ``predict_length`` outputs (``predict_length`` is read from the notebook settings).

    Args:
        max_seq_len: number of positions prepared by the positional encoding.
        d_model: width of the encoder and of the attention layers.
        dropout: dropout probability used in the positional encoding, encoder and attention.
        nhead: number of attention heads.
        d_ff: width of the encoder feed-forward layers.
        num_layers: number of encoder layers.
        swin_transformer: image backbone; must expose ``config.hidden_size`` and return an
            object with ``last_hidden_state`` when called.
    """
    def __init__(self, max_seq_len, d_model, dropout, nhead, d_ff, num_layers, swin_transformer):
        super().__init__()
        # Encoder
        self.enc_mask = Mask() # NOTE(review): constructed but not used in forward()
        self.linear1 = torch.nn.Linear(1, d_model) # lifts each scalar time step to d_model
        self.enc_pos_encoding = PositionalEncoding(max_seq_len, d_model, dropout)
        self.encoder = torch.nn.TransformerEncoder(torch.nn.TransformerEncoderLayer(d_model, nhead, d_ff, dropout, batch_first=True), num_layers)
        
        # Decoder
        self.swin_transformer = swin_transformer
        self.attn = torch.nn.MultiheadAttention(d_model, nhead, dropout, batch_first=True)
        # Project the backbone's token width down to d_model
        self.linear2 = torch.nn.Linear(self.swin_transformer.config.hidden_size, d_model)
        self.layernorm = torch.nn.LayerNorm(d_model)

        self.fc1 = torch.nn.Linear(d_model, d_model)
        self.relu1 = torch.nn.ReLU()
        self.fc2 = torch.nn.Linear(d_model, d_model)
        self.relu2 = torch.nn.ReLU()

        self.flatten = torch.nn.Flatten()
        # assumes the backbone emits 49 tokens per image (7x7 for swin-tiny at 224x224) — TODO confirm
        self.linear3 = torch.nn.Linear(d_model*49, d_model)
        self.linear4 = torch.nn.Linear(d_model, predict_length)
    
    def forward(self, enc_input, dec_input):
        """Return the forecast for a batch.

        Args:
            enc_input: sales window whose last dimension is 1 (it feeds ``Linear(1, d_model)``).
            dec_input: image batch passed unchanged to ``swin_transformer``.
        """
        # Encoding: embed each step, add positions, run the Transformer encoder
        linear1_ = self.linear1(enc_input)
        enc_pos_encoding_ = self.enc_pos_encoding(linear1_)
        encoder_ = self.encoder(enc_pos_encoding_)
        
        # Decoding
        ### Image tokens from the backbone, projected to d_model
        swin_transformer_ = self.swin_transformer(dec_input).last_hidden_state
        linear2_ = self.linear2(swin_transformer_)

        ### Cross attention: image tokens (query) attend to the encoded sales window (key/value)
        attn_, attn_weight = self.attn(query=linear2_, key=encoder_, value=encoder_)
        layernorm_ = self.layernorm(linear2_ + attn_) # residual connection + layer norm

        ### Feed forward
        relu1_ = self.relu1(self.fc1(layernorm_))
        relu2_ = self.relu2(self.fc2(relu1_))

        # Final: flatten all tokens and project to the forecast horizon
        flatten_ = self.flatten(relu2_)
        linear3_ = self.linear3(flatten_)
        linear4_ = self.linear4(linear3_)
        
        return linear4_

In [12]:
class MultimodalTransformerFromDataset(BaseModelWithCovariates):
    """pytorch-forecasting wrapper that feeds dataloader batches to ``MultimodalTransformer``.

    Args:
        imgpath_encoder: fitted label encoder used to turn the encoded image label back
            into a file path (via ``inverse_transform``).
        predict_length: forecast horizon, stored on the instance.
        swin_transformer, window_size, d_model, dropout, nhead, d_ff, num_layers:
            forwarded to ``MultimodalTransformer``.
        static_categoricals ... embedding_sizes: not used directly in this class; they are
            captured by ``save_hyperparameters()`` (presumably supplied by ``from_dataset`` —
            verify against the base class).
        **kwargs: forwarded to the base class constructor.
    """
    def __init__(self, imgpath_encoder, predict_length, swin_transformer, window_size, d_model, dropout, nhead, d_ff, num_layers, 
                 static_categoricals, time_varying_categoricals_encoder, time_varying_categoricals_decoder, static_reals, 
                 time_varying_reals_encoder,  time_varying_reals_decoder, x_reals, x_categoricals, embedding_labels, embedding_paddings, 
                 categorical_groups, embedding_sizes, **kwargs):
        self.save_hyperparameters()
        super().__init__(**kwargs)

        self.imgpath_encoder = imgpath_encoder
        self.predict_length = predict_length
        self.network = MultimodalTransformer(window_size, d_model, dropout, nhead, d_ff, num_layers, swin_transformer)
        # self.network.to(device)

        # Image preprocessing, built once here instead of on every forward pass
        self.image_transform = transforms.Compose([
            transforms.Resize((224,224)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ]) # Transform image based on ImageNet standard
    
    def forward(self, data):
        """Run the network on one dataloader batch.

        Args:
            data: batch as yielded by the dataloader; ``data[0]`` is the input dict and
                ``data[1][0]`` the target.

        Returns:
            ``(pred, y)``: the rescaled prediction and the target, both on ``device``.
        """
        # Gather time series data
        # NOTE(review): assumes encoder_cont channel 0 is the encoded img_path and channel 1
        # is sales — confirm against the reals declared when the dataset was built.
        x = data[0]["encoder_cont"][:, :, 1].unsqueeze(-1) # shape: (batch_size, window_size, 1)
        y = data[1][0] # shape: (batch_size, predict_length)

        # Gather image data
        # The label is constant over the window, so unique() leaves one column per sample.
        # squeeze(-1) drops only that column, so a batch of one sample stays 1-D.
        img_path = data[0]["encoder_cont"][:, :, 0].type(torch.int).unique(dim=-1).squeeze(-1) # Label encoded image_path → shape: (batch_size, )
        img_path = self.imgpath_encoder.inverse_transform(img_path.cpu().numpy()) # The real image path e.g) 'HnM/images/068/0687169002.jpg' → shape: (batch_size, )

        # Process image data
        img_li = []
        for path in img_path: # Iterate images
            # Context manager closes the file handle as soon as the pixels are converted
            with Image.open(path) as img_file:
                img_li.append(self.image_transform(img_file.convert("RGB"))) # Transform an image
        img_tensor = torch.stack(img_li, dim=0) # Put all the images together
        
        # Prediction
        pred = self.network(x.to(device), img_tensor.to(device))
        pred = self.transform_output(prediction=pred, target_scale=data[0]["target_scale"].to(device)) # Inverse transform the output
        
        return pred, y.to(device)

In [13]:
import gc
# Collect unreachable Python objects first: only after their tensors are gone can the
# CUDA caching allocator actually hand the memory back.
gc.collect()
torch.cuda.empty_cache()

swin_transformer = SwinModel.from_pretrained("microsoft/swin-tiny-patch4-window7-224") # Get pre-trained SwinTransformer
swin_transformer.to(device)

# Build the wrapper from the training dataset; the keyword arguments below are passed
# on to MultimodalTransformerFromDataset.__init__.
model = MultimodalTransformerFromDataset.from_dataset(
    train_dataset,
    predict_length=predict_length,
    swin_transformer=swin_transformer,
    window_size=window_size,
    d_model=d_model,
    dropout=dropout,
    nhead=nhead,
    d_ff=d_ff,
    num_layers=num_layers,
    imgpath_encoder= imgpath_encoder
    )
model.to(device); print() # print() keeps the module repr out of the cell output

  rank_zero_warn(
  rank_zero_warn(
  rank_zero_warn(





### Train

In [18]:
# Adam with its default settings over every parameter of `model`
optimizer = torch.optim.Adam(params=model.parameters())

# Two MSE criteria: the reduced one drives backprop and reporting, the element-wise
# one is used to rank individual samples for the plots
loss_fn, loss_fn_ = torch.nn.MSELoss(), torch.nn.MSELoss(reduction="none")

# Running-mean loss histories; train() appends to both
train_loss_li = []
valid_loss_li = []


def plot_loss(train_loss_li, valid_loss_li):
    """Draw the train and validation loss histories as two labelled lines titled "loss"."""
    for history, name in ((train_loss_li, "train"), (valid_loss_li, "valid")):
        plt.plot(history, label=name)
    plt.title("loss")
    plt.legend()

def plot_bestsample(loss, pred, y, iter, msg, min_peak=10):
    """Plot prediction against target for the best sample of a batch.

    Samples are ranked by their mean loss (lowest first). The first one whose target
    reaches ``min_peak`` is plotted; if no sample qualifies, the lowest-loss sample is
    plotted. Nothing is drawn for an empty batch.

    Args:
        loss: element-wise loss tensor; averaged over axis 1 to give one score per sample.
        pred: prediction tensor, indexed by sample along the first axis.
        y: target tensor, indexed like ``pred``.
        iter: iteration number shown in the title.
        msg: dataset label shown in the title.
        min_peak: smallest target maximum for a sample to be considered worth plotting.
    """
    per_sample_loss = loss.mean(axis=1) # Shape: (batch_size, )
    _, best_idx_li = torch.sort(per_sample_loss)
    if len(best_idx_li) == 0:
        return

    # Fall back to the overall best sample when every target stays below min_peak
    chosen_idx = best_idx_li[0]
    for idx in best_idx_li:
        if np.max(y[idx].cpu().detach().numpy()) >= min_peak:
            chosen_idx = idx
            break

    # Sales never becomes negative. np.maximum returns a new array, so `pred` itself is
    # left untouched even when it lives on the CPU and shares memory with the numpy view.
    best_pred = np.maximum(pred[chosen_idx].cpu().detach().numpy(), 0) # Shape: (predict_length, )
    best_y = y[chosen_idx].cpu().detach().numpy()

    plt.plot(best_pred, label="pred")
    plt.plot(best_y, label="y", color="gray", alpha=0.3)
    plt.title(f"{iter}th iter: Best example amongst {msg} dataset")
    plt.legend()

def train():
    """Run one pass over the paired train/validation loaders and return the mean train loss.

    The two loaders are zipped, so the pass ends when the shorter one is exhausted. For
    every pair of batches the model takes one optimisation step on the train batch, is
    evaluated on the validation batch, and the loss curves plus one example from each
    batch are redrawn. Running means are appended to ``train_loss_li`` / ``valid_loss_li``.

    Returns:
        float: mean training loss over the batches processed (``nan`` if there were none).
    """
    total_train_loss, total_valid_loss = 0, 0
    n_batches = 0
    for n, (train_data, valid_data) in enumerate(zip(train_dataloader, valid_dataloader)):
        clear_output(wait=True)

        # Train
        model.train(True)
        optimizer.zero_grad()
        train_pred, train_y = model(train_data)

        # Get train loss
        train_loss = loss_fn(train_pred, train_y) # Scalar (mean over the batch)
        train_loss.backward()
        optimizer.step()
        # Element-wise loss is only used for plotting, so keep it out of the graph
        train_loss_raw = loss_fn_(train_pred.detach(), train_y) # Shape: (batch_size, predict_length)
        total_train_loss += train_loss.item()
        train_loss_li.append(total_train_loss/(n+1))

        # Validation: no gradients are needed, so skip building the autograd graph
        model.eval()
        with torch.no_grad():
            valid_pred, valid_y = model(valid_data)

            # Get validation loss
            valid_loss = loss_fn(valid_pred, valid_y)
            valid_loss_raw = loss_fn_(valid_pred, valid_y)
        total_valid_loss += valid_loss.item()
        valid_loss_li.append(total_valid_loss/(n+1))
        n_batches = n + 1

        # Plot
        fig = plt.figure(figsize=(18,5))
        plt.subplot(1,3,1); plot_loss(train_loss_li, valid_loss_li)
        plt.subplot(1,3,2); plot_bestsample(train_loss_raw, train_pred, train_y, n, "TRAIN")
        plt.subplot(1,3,3); plot_bestsample(valid_loss_raw, valid_pred, valid_y, n, "VALID")
        plt.show()
        plt.close(fig) # release the figure so they do not pile up over thousands of iterations

        # Report
        print(f"\r {n}/{len(train_dataloader)} → train_loss: {np.mean(train_loss_li)}, valid_loss: {np.mean(valid_loss_li)}", end="")

    return total_train_loss / n_batches if n_batches else float("nan")
            
# Call train() ten times in a row; every call appends to the shared loss histories
for epoch in range(10):
    mean_train_loss = train() # holds whatever the most recent train() call returned

KeyboardInterrupt: 

: 

In [17]:
# Number of batches in the training loader
len(train_dataloader)

123510

In [16]:
# Number of batches in the validation loader
len(valid_dataloader)

2818