In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
from torch.utils.data import DataLoader, Dataset
from transformer import Decoder, PositionalEncoding, Encoder
import torch.nn.functional as F

In [2]:
import numpy as np
import pandas as pd
from tqdm import tqdm


data = pd.read_csv("./transaction_sample.csv")

del data["sales_channel_id"]
data.head(2)

data["t_dat"] = pd.to_datetime(data["t_dat"])
data.sort_values(by=["customer_id", "t_dat"], ascending=True, inplace=True)

entity2idx = lambda data, increment: {item: idx + increment for idx, item in enumerate(data)}
idx2entity = lambda entity2idx: {idx: entity for entity, idx in entity2idx.items()}

user2idx = entity2idx(data.customer_id.unique(), 0)
idx2user = idx2entity(user2idx)

pid2idx = entity2idx(data.article_id.unique(), 2)
idx2pid = idx2entity(pid2idx)

data.customer_id = data.customer_id.map(user2idx)
data.article_id = data.article_id.map(pid2idx)

data.head(2)

Unnamed: 0,t_dat,customer_id,article_id,price
40669,2018-09-27,0,2,0.033881
96086,2018-10-04,0,3,0.033881


In [3]:
train = data.groupby(["customer_id", "t_dat"], sort=False)["article_id"].apply(list).reset_index()
train = train[train['article_id'].apply(len) > 1].reset_index(drop=True)

del train["t_dat"]
train.head(2)

Unnamed: 0,customer_id,article_id
0,0,"[4, 5]"
1,0,"[7, 8, 9, 10, 11]"


In [4]:
# max_session_len = train['article_id'].apply(len).max() + 1
max_session_len = 32

In [5]:
pid2idx["<PAD>"] = 0
pid2idx["<EOS>"] = 1

In [6]:
from typing import List, Dict


class CDataset(Dataset):
    def __init__(self, data: List, max_session_len: int, pid2idx: Dict):
        self.data = data
        self.max_session_len = max_session_len
        self.pid2idx = pid2idx

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        data = self.data[idx]
        user = torch.LongTensor([data[0]])
        items = torch.LongTensor(data[1][:self.max_session_len-1])

        pad = torch.LongTensor([pid2idx["<EOS>"]] + [pid2idx["<PAD>"]] * (self.max_session_len - 1 - len(items)))
        return user, torch.cat((items, pad))

In [7]:
# Train-Test Split
split_ratio = 0.8
msk = np.random.rand(len(train)) < split_ratio

train_data = train[msk]
test_data = train[~msk]

print(len(train_data), len(test_data))

390619 98406


In [8]:
batch_size = 64

train_dataset = CDataset(data=train_data.values.tolist(), max_session_len=max_session_len, pid2idx=pid2idx)
test_dataset = CDataset(data=test_data.values.tolist(), max_session_len=max_session_len, pid2idx=pid2idx)

train_loader = DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True, num_workers=0
)
test_loader = DataLoader(
    test_dataset, batch_size=batch_size, shuffle=False, num_workers=0
)

In [9]:
train_dataset[0]

(tensor([0]),
 tensor([ 7,  8,  9, 10, 11,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]))

In [10]:
class GPT(nn.Module):
    def __init__(self, number_of_layers, head, dimension, nusers, nitems, max_seq_len, dropout):
        super(GPT, self).__init__()
        self.users = nn.Embedding(nusers, dimension)
        self.items = nn.Embedding(nitems, dimension, padding_idx=0)
        self.penc = PositionalEncoding(max_seq_len=max_seq_len, dimension=dimension, dropout=dropout)
        
        # self.decoder = Decoder(number_of_layers=number_of_layers, head=head, dimension=dimension, dropout=dropout)
        self.decoder = Encoder(number_of_layers=number_of_layers, 
                               head=head, 
                               dimension=dimension, 
                               dropout=dropout, 
                               hidden_dimension=dimension)
        self.ffnn = nn.Linear(dimension, nitems)

    def forward(self, users, items, mask=None):
        """
        users = [batch, 1]
        items = [batch, max_seq_len-1]
        """
        # batch, dimension
        users = self.users(users)

        # batch, max_seq_len, dimension
        items = self.items(items)
        
        # user item session
        input_vec = torch.cat((users, items), dim=1)
        input_vec = self.penc(input_vec)
        
        # output = self.decoder(input_vec=input_vec, 
        #                       encoder_output=input_vec, 
        #                       encmask=mask, 
        #                       decmask=mask)
        output = self.decoder(input_vec=input_vec, mask=mask)
        # return self.ffnn(output)
        return  F.log_softmax(self.ffnn(output), dim=-1)

def init_weights(m):
    if hasattr(m, 'weight') and m.weight.dim() > 1:
        nn.init.xavier_uniform_(m.weight.data)

In [11]:
class Batch:
    def __init__(self, users, items, device, pad_idx=0):
        """
        users = [batch]
        items = [batch, max_session_length+1] i.e. [item1, item2, item3, <EOS>]
        """
        self.device = device
        self.pad_idx = pad_idx
        self.target = items
        self.ntokens = (self.target != pad_idx).data.sum()
        
        self.item_model_input = items[:, :-1]
        self.user_model_input = users

        self.mask = self.mask().to(device)

    def mask(self):
        # <PAD> Mask for decoder
        trg_pad_mask = (self.item_model_input != self.pad_idx)
        trg_pad_mask = torch.cat((torch.ones((trg_pad_mask.size(0), 1), dtype=torch.long, device=self.device), trg_pad_mask), dim=1)
        trg_pad_mask = trg_pad_mask.unsqueeze(1)

        # Future Masking
        dimension = self.item_model_input.size(-1) + 1
        future_mask = torch.tril(torch.ones(1, dimension, dimension)).to(self.device)
        future_mask[future_mask != 0] = 1
        future_mask = Variable(future_mask > 0)

        # Final Decoder Mask
        # "AND" condition on <PAD> Mask and Future Mask
        return trg_pad_mask & future_mask

In [12]:
# max_seq_len = max_session_length + 1 (for user emb)
device = torch.device("mps")
number_of_layers = 3
head = 8
dimension = 128
nusers = len(user2idx)
nitems = len(pid2idx)
max_seq_len = max_session_len
dropout = 0.1

model = GPT(number_of_layers=number_of_layers,
            head=head,
            dimension=dimension,
            nusers=nusers,
            nitems=nitems,
            max_seq_len=max_seq_len,
            dropout=dropout).to(device)

model.apply(init_weights)

8 heads 128 dimension in MultiHeadedAttention
8 heads 128 dimension in MultiHeadedAttention
8 heads 128 dimension in MultiHeadedAttention


GPT(
  (users): Embedding(150208, 128)
  (items): Embedding(61544, 128, padding_idx=0)
  (penc): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (decoder): Encoder(
    (enclays): ModuleList(
      (0-2): 3 x EncoderLayer(
        (attn): MultiHeadedAttention(
          (wq): Linear(in_features=128, out_features=128, bias=True)
          (wk): Linear(in_features=128, out_features=128, bias=True)
          (wv): Linear(in_features=128, out_features=128, bias=True)
          (out): Linear(in_features=128, out_features=128, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ffnn): FeedForwardNet(
          (net): Sequential(
            (0): Linear(in_features=128, out_features=128, bias=True)
            (1): GELU(approximate='none')
            (2): Dropout(p=0.1, inplace=False)
            (3): Linear(in_features=128, out_features=128, bias=True)
          )
        )
        (resconn1): ResidualConnection(
          (norm): LayerNor

In [13]:
class Criterion(nn.Module):
    def __init__(self, pad_idx=0):
        super().__init__()
        self.pad_idx = pad_idx
        self.criterion = nn.CrossEntropyLoss(reduction="sum")
        
    def forward(self, prediction, target):
        prediction = prediction.contiguous().view(-1, prediction.size(-1))
        target = target.contiguous().view(-1)
        
        one_hot_target = torch.nn.functional.one_hot(
            target, num_classes=prediction.size(-1)
        ).float()
        one_hot_target[:, self.pad_idx] = 0.
        return self.criterion(prediction, one_hot_target)

# class Criterion(nn.Module):
#     def __init__(self, vocab_size, pad_index, alpha):
#         super().__init__()
#         self.alpha = alpha
#         self.vocab_size = vocab_size
#         self.pad_index = pad_index

#     def forward(self, prediction, target):
#         prediction = prediction.contiguous().view(-1, prediction.size(-1))
#         target = target.contiguous().view(-1)

#         one_hot_target = torch.nn.functional.one_hot(target, num_classes=prediction.size(-1))
#         one_hot_target[:, self.pad_index] = 0
#         one_hot_target = (one_hot_target * (1 - self.alpha)) + (
#             self.alpha / (self.vocab_size - 2)
#         )
#         one_hot_target.masked_fill_((target == self.pad_index).unsqueeze(1), 0)

#         return F.kl_div(prediction, one_hot_target, reduction="sum")


class CustomAdam:
    def __init__(self, dimension, optimizer, warmup_steps=4000, step_num=0):
        self.optimizer = optimizer
        self.step_num = step_num
        self.dimension = dimension
        self.warmup_steps = warmup_steps

    def step(self):
        self.step_num += 1
        lr = self.rate()

        for pg in self.optimizer.param_groups:
            pg["lr"] = lr

        self.optimizer.step()

    def rate(self):
        return self.dimension ** (-0.5) * min(
            self.step_num ** (-0.5), self.step_num * self.warmup_steps ** (-1.5)
        )

In [14]:
# criterion = nn.CrossEntropyLoss(reduction="sum")
criterion = Criterion()# (vocab_size=nitems, pad_index=0, alpha=0.1)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, betas=(0.9, 0.98), eps=1e-9)
# optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-2)

In [15]:
# optim = torch.optim.Adam(
#             model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9
#         )

# optimizer = CustomAdam(
#         dimension=512,
#         warmup_steps=400,
#         optimizer=optim,
#         step_num=0,
#     )

In [16]:
def train(epoch, model, optimizer, criterion, device, dataloader):
    model.train()

    total_loss = 0
    total_tokens = 0
    
    correct = 0
    processed = 0
    running_accuracy = 0
    
    pbar = tqdm(dataloader)

    for minibatch, (users, items) in enumerate(pbar):
        users = users.to(device)
        items = items.to(device)

        batch = Batch(users, items, device, pad_idx=0)
        optimizer.zero_grad()
        
        output = model(batch.user_model_input, batch.item_model_input, batch.mask)
        loss = criterion(output, batch.target) / batch.ntokens
        
        loss.backward()
        optimizer.step()

        total_loss += loss.detach().item()
        # total_tokens += batch.ntokens

        pred = output.argmax(-1)
        labels = batch.target

        correct += pred.eq(labels.view_as(pred)).sum().item()
        processed += torch.count_nonzero(labels).item()
        running_accuracy += round((correct / processed) * 100, 14)
        
        pbar.set_description(f"Train Epoch: {epoch} Loss: {loss}, Accuracy: {round((correct / processed) * 100, 14)}")

    return total_loss / total_tokens

In [17]:
def test(epoch, model, criterion, device, dataloader):
    model.eval()

    total_loss = 0
    total_tokens = 0
    
    pbar = tqdm(dataloader)

    for minibatch, (users, items) in enumerate(pbar):
        users = users.to(device)
        items = items.to(device)

        batch = Batch(users, items, device, pad_idx=0)
        output = model(batch.user_model_input, batch.item_model_input, batch.mask)
        
        # output = output.contiguous().view(-1, output.size(-1))
        loss = criterion(output, batch.target)

        total_loss += loss.detach().item()
        total_tokens += batch.ntokens
        pbar.set_description(f"Test Epoch: {epoch} Loss: {total_loss / total_tokens}")

    return total_loss / total_tokens

In [None]:
EPOCHS = 10

for epoch in range(EPOCHS):
    train(epoch, model, optimizer, criterion, device, train_loader)
    test(epoch, model, criterion, device, test_loader)

  processed += torch.count_nonzero(labels).item()
Train Epoch: 0 Loss: 85.95155334472656, Accuracy: 12.56801917659615:   3%| | 169/6104 [02:04<1:12:07