# Import & Setting

### Import

In [1]:
import os

import numpy as np
import pandas as pd
pd.set_option("display.max_columns", None)

from sklearn.preprocessing import LabelEncoder

import torch
from torchvision import transforms
from PIL import Image
import pytorch_forecasting as pf

from datasets import load_dataset
from transformers import AutoImageProcessor, SwinModel
import timm

device = torch.device("cuda:0")

  from .autonotebook import tqdm as notebook_tqdm


### Setting

In [2]:
# Params for sampling 
num_samples = None

# Params for Train_test_split 
train_test_split_rto = 0.3

# Dataset
window_size = 30
predict_length = 7
batch_size = 64

# Model
d_model = 128
dropout = 0.3
nhead = 4
num_layers = 4
d_ff = 512

# Preprocess Data

### Read data

In [3]:
df_raw = pd.read_csv("HnM/transactions_train.csv", dtype={"article_id":str})
df_raw["img_path"] = df_raw["article_id"].apply(lambda x: f'./HnM/images/{x[:3]}/{x}.jpg')
df_raw["is_valid"] = df_raw["img_path"].apply(lambda x: 1 if os.path.isfile(x) else 0)
df_raw.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,img_path,is_valid
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.050831,2,./HnM/images/066/0663713001.jpg,1
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.030492,2,./HnM/images/054/0541518023.jpg,1
2,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,0.015237,2,./HnM/images/050/0505221004.jpg,1
3,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687003,0.016932,2,./HnM/images/068/0685687003.jpg,1
4,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687004,0.016932,2,./HnM/images/068/0685687004.jpg,1


In [4]:
# More than quantile 0.9
print(df_raw["article_id"].nunique())
q_valid_prod = df_raw.groupby(["article_id", "t_dat"], as_index=False).size().drop("size", axis=1)
q_valid_prod = q_valid_prod.groupby("article_id").size().sort_values(ascending=False)
q_valid_prod = q_valid_prod[q_valid_prod>q_valid_prod.quantile(0.8)].index
print(len(q_valid_prod))

104547
20804


### Sample data

In [5]:
# Get valid data only
df_sample = df_raw[
    (df_raw["is_valid"] == 1)
    &(df_raw["article_id"].isin(q_valid_prod))
    ]

# Get sample articld_id
num_samples = num_samples if num_samples else df_sample["article_id"].nunique()
sample_id_li = df_sample.groupby(["article_id", "t_dat"], as_index=False).agg(_=("price", "count")) # Get uinque dates per each article
sample_id_li = sample_id_li.groupby("article_id").size().sort_values(ascending=False)[:num_samples].index

# Sample
df_sample = df_sample[df_sample["article_id"].isin(sample_id_li)].reset_index(drop=True)
df_sample.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,img_path,is_valid
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.050831,2,./HnM/images/066/0663713001.jpg,1
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.030492,2,./HnM/images/054/0541518023.jpg,1
2,2018-09-20,00083cda041544b2fbb0e0d2905ad17da7cf1007526fb4...,688873012,0.030492,1,./HnM/images/068/0688873012.jpg,1
3,2018-09-20,00083cda041544b2fbb0e0d2905ad17da7cf1007526fb4...,501323011,0.053373,1,./HnM/images/050/0501323011.jpg,1
4,2018-09-20,00083cda041544b2fbb0e0d2905ad17da7cf1007526fb4...,598859003,0.045746,2,./HnM/images/059/0598859003.jpg,1


### Preprocess

In [6]:
# To datetime
df_prep = df_sample.copy()
df_prep["t_dat"] = pd.to_datetime(df_prep["t_dat"])

# Preprocess
### Explode dates
def func(x):
    full_date = pd.DataFrame(pd.date_range(x["t_dat"].min(), x["t_dat"].max(), freq="d"), columns=["t_dat"])
    x = x.merge(full_date, on="t_dat", how="right").reset_index(drop=True)
    x["article_id"] = x["article_id"].unique()[0]
    x["sales"] = x["sales"].fillna(0)
    return x

df_prep = df_prep.groupby(["article_id", "t_dat"], as_index=False).agg(sales=("price", "count"))
df_prep = df_prep.groupby("article_id", as_index=False).apply(lambda x: func(x)).reset_index(drop=True)
df_prep["time_idx"] = df_prep.groupby("article_id").cumcount()

### LabelEncode image path
imgpath_encoder = LabelEncoder()
df_prep["img_path"] = df_prep["article_id"].apply(lambda x: f'HnM/images/{x[:3]}/{x}.jpg')
df_prep["img_path"] = imgpath_encoder.fit_transform(df_prep["img_path"])

# Train test split
num_train = int(np.round(num_samples * train_test_split_rto))
sample_id_li_train = sample_id_li[:num_train]

df_train = df_prep[df_prep["article_id"].isin(sample_id_li_train)].reset_index(drop=True)
df_valid = df_prep[~df_prep["article_id"].isin(sample_id_li_train)].reset_index(drop=True)
assert df_train.shape[0] + df_valid.shape[0] == df_prep.shape[0]

# Make Dataset

In [7]:
class MultimodalDataset(torch.utils.data.Dataset):
    def __init__(self, dataset, imgpath_encoder, predict_length):
        self.dataset = dataset
        self.imgpath_encoder = imgpath_encoder
        self.predict_length = predict_length

    def __getitem__(self, idx):
        dataset = self.dataset[idx][0]["x_cont"]
        seq = dataset[:, -1]
        x = seq[:-self.predict_length].unsqueeze(-1)
        y = seq[-self.predict_length:].unsqueeze(-1)

        img_path = dataset[:,0][0].type(torch.int).unsqueeze(0)
        img_path = self.imgpath_encoder.inverse_transform(img_path)
        print(img_path)
        raise
        transform = transforms.Compose([
            transforms.Resize((224,224)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])
        for n, path in enumerate(img_path):
            img = transform(Image.open(path).convert("RGB"))
            img_tensor = img if n == 0 else torch.vstack([img_tensor, img])
        
        return {"x":x, "y":y, "img":img_tensor}
    
    def __len__(self):
        return len(self.dataset)

In [10]:
train_dataset_raw = pf.TimeSeriesDataSet(
    data=df_train,
    time_idx="time_idx",
    target="sales",
    group_ids=["article_id"],
    static_reals=["img_path"],
    min_encoder_length=window_size,
    max_encoder_length=window_size,
    min_prediction_idx=predict_length,
    max_prediction_length=predict_length,
    time_varying_unknown_reals=["sales"],
    # target_normalizer=None,
    categorical_encoders={
        "article_id": None,
    },
    scalers={"img_path":None}
)
valid_dataset_raw = pf.TimeSeriesDataSet.from_dataset(train_dataset_raw, df_valid)

train_dataset = MultimodalDataset(train_dataset_raw, imgpath_encoder, predict_length)
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

valid_dataset = MultimodalDataset(valid_dataset_raw, imgpath_encoder, predict_length)
valid_dataloader = torch.utils.data.DataLoader(valid_dataset, batch_size=batch_size, shuffle=True)
next(iter(train_dataset))

['HnM/images/010/0108775015.jpg']


RuntimeError: No active exception to reraise

# Model

### Architecture

In [None]:
class PositionalEncoding(torch.nn.Module):
    # PE(pos, 2i) = sin(pos/10000^{2i/d_model}), 
    # PE(pos, 2i+1) = cos(pos/10000^{2i/d_model})
    def __init__(self, max_len, d_model, dropout):
        super().__init__()
        self.dropout = torch.nn.Dropout(dropout)

        position = torch.arange(max_len).reshape(-1,1).to(device)
        i = torch.arange(d_model).to(device)//2
        exp_term = 2*i/d_model
        div_term = torch.pow(10000, exp_term).reshape(1, -1)
        self.pos_encoded = position / div_term

        self.pos_encoded[:, 0::2] = torch.sin(self.pos_encoded[:, 0::2])
        self.pos_encoded[:, 1::2] = torch.cos(self.pos_encoded[:, 1::2])

    def forward(self, x):
        output = x + self.pos_encoded[:x.shape[1], :]
        return self.dropout(output)
    
class Mask(torch.nn.Module):
    def __init__(self):
        super().__init__()

    def get_padding_mask(self, arr):
        res = torch.eq(arr, 0).type(torch.FloatTensor).to(device)
        res = torch.where(res==1, -torch.inf, 0)
        return res
    
    def get_lookahead_mask(self, arr):
        seq_len = arr.shape[1]
        mask = torch.triu(torch.ones((seq_len, seq_len))*-1e-9, 1).to(device)
        return mask

    def forward(self, arr):
        padding_mask = self.get_padding_mask(arr)
        lookahead_mask = self.get_lookahead_mask(arr)
        return padding_mask, lookahead_mask

In [None]:
class MultimodalTransformer(torch.nn.Module):
    def __init__(self, max_seq_len, d_model, dropout, nhead, d_ff, num_layers, swin_transformer):
        super().__init__()
        # Encoder
        self.enc_mask = Mask()
        self.linear1 = torch.nn.Linear(1, d_model)
        self.enc_pos_encoding = PositionalEncoding(max_seq_len, d_model, dropout)
        self.encoder = torch.nn.TransformerEncoder(torch.nn.TransformerEncoderLayer(d_model, nhead, d_ff, dropout, batch_first=True), num_layers)
        
        # Decoder
        self.swin_transformer = swin_transformer
        self.attn = torch.nn.MultiheadAttention(d_model, nhead, dropout, batch_first=True)
        self.linear2 = torch.nn.Linear(self.swin_transformer.config.hidden_size, d_model)
        self.layernorm = torch.nn.LayerNorm(d_model)

        self.fc1 = torch.nn.Linear(d_model, d_model)
        self.relu1 = torch.nn.ReLU()
        self.fc2 = torch.nn.Linear(d_model, d_model)
        self.relu2 = torch.nn.ReLU()

        self.flatten = torch.nn.Flatten()
        self.linear3 = torch.nn.Linear(d_model*49, d_model)
        self.linear4 = torch.nn.Linear(d_model, predict_length)
    
    def forward(self, enc_input, dec_input):
        # Encoding
        # enc_padding_mask, _ = self.enc_mask(enc_input.squeeze())
        linear1_ = self.linear1(enc_input)
        enc_pos_encoding_ = self.enc_pos_encoding(linear1_)
        # encoder_ = self.encoder(enc_pos_encoding_, src_key_padding_mask=enc_padding_mask)
        encoder_ = self.encoder(enc_pos_encoding_)
        
        # Decoding
        ### Self attention
        swin_transformer_ = self.swin_transformer(dec_input).last_hidden_state
        linear2_ = self.linear2(swin_transformer_)

        ### Cross attention
        attn_, attn_weight = self.attn(query=linear2_, key=encoder_, value=encoder_)
        layernorm_ = self.layernorm(linear2_ + attn_)

        ### Feed forward
        relu1_ = self.relu1(self.fc1(layernorm_))
        relu2_ = self.relu2(self.fc2(relu1_))

        # Final
        flatten_ = self.flatten(relu2_)
        linear3_ = self.linear3(flatten_)
        linear4_ = self.linear4(linear3_)
        

        return linear4_

In [None]:
import gc
from transformers import SwinModel
torch.cuda.empty_cache()
gc.collect()

swin_transformer = SwinModel.from_pretrained("microsoft/swin-tiny-patch4-window7-224")
swin_transformer.to(device)

model = MultimodalTransformer(window_size, d_model, dropout, nhead, d_ff, num_layers, swin_transformer)
# model = torch.nn.DataParallel(model, output_device=1)
model.to(device)
print()




### Train

In [None]:
import matplotlib.pyplot as plt
from IPython.display import clear_output

optimizer = torch.optim.Adam(model.parameters())
loss_fn = torch.nn.MSELoss()
temp = None
loss_li = []

def train():
    global temp
    total_loss = 0
    for n, (data, valid) in enumerate(zip(train_dataloader, valid_dataloader)):
        model.train(True)
        clear_output(wait=True)
        x, y, img = data["x"], data["y"].squeeze(), data["img"]

        # Train
        optimizer.zero_grad()
        pred = model(x.to(device), img.to(device))
        pred = train_dataset_raw.target_normalizer.inverse_transform(pred)
        
        loss = loss_fn(pred, y.to(device))
        loss.backward()
        optimizer.step()

        # Report
        total_loss += loss.item()
        mean_loss = total_loss / (n+1)

        # # Plot loss
        # loss_li.append(mean_loss)
        # plt.plot(loss_li)

        # Plot valid prediction
        model.eval()
        x, y, img = valid["x"], valid["y"].squeeze(), valid["img"]
        pred = model(x.to(device), img.to(device))
        pred = train_dataset_raw.target_normalizer.inverse_transform(pred)
        loss = torch.nn.MSELoss(reduction="none")(pred, y.to(device))
        loss = loss.mean(axis=1)
        best_idx = loss.argmin()
        plt.plot(pred[best_idx].cpu().detach().numpy(), label="pred")
        plt.plot(y[best_idx].cpu().detach().numpy(), label="y")
        plt.legend()
        plt.show()

        print(f"\r{mean_loss}", end="")

    return mean_loss
    
for epoch in range(1):
    mean_train_loss = train()


torch.Size([64, 3, 224, 224])
torch.Size([64, 30, 1])
torch.Size([64, 3, 224, 224])
tensor([[-8.8018e-02, -5.8104e-02, -5.2780e-03,  5.5865e-02, -9.9650e-02,
          9.1390e-02,  2.4734e-02],
        [-2.0067e-02, -8.1871e-02, -4.9101e-02,  4.2356e-03, -1.1786e-01,
          8.3833e-02,  3.1956e-03],
        [-3.0159e-02, -2.9640e-02, -9.3836e-03,  1.2075e-02, -4.2337e-02,
          5.9468e-02,  1.2219e-02],
        [-3.4092e-02, -7.5626e-02, -4.7786e-02, -4.8521e-02, -6.4422e-02,
          9.6413e-02,  1.6364e-02],
        [-6.8928e-02, -6.4887e-02, -3.0160e-02,  7.4031e-03, -8.7608e-02,
          1.3236e-01,  3.2139e-02],
        [-2.8419e-02, -8.6400e-02, -6.4012e-02, -1.7384e-02, -1.0226e-01,
          1.3433e-01, -1.1174e-01],
        [-5.7114e-02, -8.5768e-02, -3.5214e-02, -8.7600e-02, -4.2149e-02,
          1.0672e-01, -1.7981e-02],
        [-6.6314e-02, -3.4071e-02, -2.0043e-02,  1.8899e-02, -4.9322e-02,
          5.7989e-02,  1.6925e-02],
        [-3.5268e-02, -1.5112e-02,  

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu!