# SR-SAN 
This is the code for the paper [Session-based Recommendation with Self-Attention Networks](https://arxiv.org/abs/2102.01922) by Jun Fang. 

I have taken the original code and converted it into a Jupyter notebook.

The following libraries need to be imported.

In [1]:
import datetime
import math
import numpy as np
import torch
from torch import nn
from torch.nn import Module, Parameter
import torch.nn.functional as F
from torch.nn import TransformerEncoder
from torch.nn import TransformerEncoderLayer
import argparse
import pickle
import time
from utils import  Data, split_validation

The following is the Self-Attention Network, which is implemented with a [Transformer Encoder](https://pytorch.org/docs/stable/generated/torch.nn.TransformerEncoder.html). I have added comments to the module below to highlight the important components described in the paper. The rest of the code consists of fairly standard components of a PyTorch module.

In [2]:
class SelfAttentionNetwork(Module):
    """Item embedding followed by a Transformer encoder, with its own training state.

    Besides the layers, the module stores the loss function, an Adam
    optimizer and a StepLR scheduler as attributes so that training code can
    reach them through the model object.

    Args:
        opt: options object; the attributes read here are ``hiddenSize``,
            ``batchSize``, ``nhead``, ``feedforward``, ``layer``, ``lr``,
            ``l2``, ``lr_dc_step`` and ``lr_dc``.
        n_node: number of rows in the item embedding table.
    """

    def __init__(self, opt, n_node):
        super().__init__()
        self.hidden_size = opt.hiddenSize
        self.n_node = n_node
        self.batch_size = opt.batchSize

        # Embedding layer: one vector of size hidden_size per item id.
        self.embedding = nn.Embedding(self.n_node, self.hidden_size)
        # One self-attention block; the feed-forward width is a multiple of
        # the hidden size.
        self.transformerEncoderLayer = TransformerEncoderLayer(
            d_model=self.hidden_size,
            nhead=opt.nhead,
            dim_feedforward=self.hidden_size * opt.feedforward,
        )
        # The encoder stacks ``opt.layer`` such blocks.
        self.transformerEncoder = TransformerEncoder(
            self.transformerEncoderLayer, opt.layer
        )
        # Training state. The optimizer must be created after every layer is
        # registered so that self.parameters() already covers all of them.
        self.loss_function = nn.CrossEntropyLoss()
        self.optimizer = torch.optim.Adam(
            self.parameters(), lr=opt.lr, weight_decay=opt.l2
        )
        self.scheduler = torch.optim.lr_scheduler.StepLR(
            self.optimizer, step_size=opt.lr_dc_step, gamma=opt.lr_dc
        )
        self.reset_parameters()

    def reset_parameters(self):
        """Redraw every parameter uniformly from ``[-1/sqrt(hidden_size), 1/sqrt(hidden_size))``."""
        bound = 1.0 / math.sqrt(self.hidden_size)
        for param in self.parameters():
            param.data.uniform_(-bound, bound)

    def compute_scores(self, hidden, mask):
        """Score every candidate item against the last valid position of each session.

        Args:
            hidden: per-position hidden states, indexed as ``hidden[row, position]``.
            mask: 2-D integer tensor; the sum of each row minus one is used as
                the index of that row's last valid position.

        Returns:
            Tensor with one row per session and one column per embedding row
            except row 0, holding the dot-product scores.
        """
        last_position = mask.sum(dim=1) - 1
        row_index = torch.arange(mask.shape[0]).long()
        ht = hidden[row_index, last_position]  # batch_size x latent_size
        # Row 0 of the embedding table is excluded from the candidates
        # (presumably the padding id -- confirm against the data pipeline).
        candidates = self.embedding.weight[1:]  # n_nodes x latent_size
        return torch.matmul(ht, candidates.t())

    def forward(self, inputs, A):
        """Embed ``inputs`` and run them through the Transformer encoder.

        Args:
            inputs: integer tensor of item ids.
            A: accepted for interface compatibility; not used by this method.

        Returns:
            Hidden states with the same leading axes as ``inputs`` plus a
            trailing axis of size ``hidden_size``.
        """
        embedded = self.embedding(inputs)
        # The first two axes are swapped around the encoder call; the encoder
        # is built without batch_first, so it presumably expects
        # sequence-first input.
        encoded = self.transformerEncoder(embedded.transpose(0, 1).contiguous())
        return encoded.transpose(0, 1).contiguous()

This function moves a tensor (or module) to the GPU if CUDA is available; otherwise it returns the object unchanged.

In [3]:
def trans_to_cuda(variable):
    """Return ``variable.cuda()`` when CUDA is available, otherwise ``variable`` itself."""
    return variable.cuda() if torch.cuda.is_available() else variable

This function moves a tensor back to the CPU when CUDA is in use; otherwise it returns the object unchanged.

In [4]:
def trans_to_cpu(variable):
    """Return ``variable.cpu()`` when CUDA is available, otherwise ``variable`` itself."""
    if not torch.cuda.is_available():
        return variable
    return variable.cpu()

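This `forward` function fetches one batch from the dataset, runs the model to obtain the hidden states, and returns the targets together with the score of every candidate item.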

In [5]:
def forward(model, i, data):
    """Run the model on one batch and score every candidate item.

    Args:
        model: callable as ``model(items, A)`` and exposing
            ``compute_scores(hidden, mask)``.
        i: batch selector handed unchanged to ``data.get_slice``.
        data: dataset object whose ``get_slice(i)`` returns the tuple
            ``(alias_inputs, A, items, mask, targets)``.

    Returns:
        ``(targets, scores)``: ``targets`` exactly as returned by
        ``get_slice`` and ``scores`` as returned by ``model.compute_scores``.
    """
    alias_inputs, A, items, mask, targets = data.get_slice(i)
    # Build integer tensors directly as int64. The previous
    # torch.Tensor(...).long() detour went through float32, which represents
    # integers exactly only up to 2**24 and would silently corrupt larger ids.
    alias_inputs = trans_to_cuda(torch.as_tensor(np.asarray(alias_inputs), dtype=torch.long))
    items = trans_to_cuda(torch.as_tensor(np.asarray(items), dtype=torch.long))
    A = trans_to_cuda(torch.as_tensor(np.asarray(A), dtype=torch.float))
    mask = trans_to_cuda(torch.as_tensor(np.asarray(mask), dtype=torch.long))
    hidden = model(items, A)
    # For each row r, pick hidden[r, alias_inputs[r, t]] for every position t.
    # One advanced-indexing gather replaces the per-row Python loop + stack.
    rows = torch.arange(alias_inputs.shape[0], device=alias_inputs.device).unsqueeze(1)
    seq_hidden = hidden[rows, alias_inputs]
    return targets, model.compute_scores(seq_hidden, mask)

The `train_test` function trains the model for one epoch and then evaluates it. First, it iterates over the training batches and uses the optimizer to minimize the loss. After that, it computes HR@20 and MRR@20 on the test data and returns them.

In [6]:
def train_test(model, train_data, test_data):
    """Train ``model`` for one epoch on ``train_data``, then evaluate on ``test_data``.

    Args:
        model: network exposing ``batch_size``, ``optimizer``, ``scheduler``
            and ``loss_function`` in addition to the usual module methods.
        train_data: dataset object providing ``generate_batch`` (and
            ``get_slice``, used through ``forward``).
        test_data: dataset object with the same interface.

    Returns:
        ``(hit, mrr)``: HR@20 and MRR@20 over the test batches, as percentages.
    """
    print('start training: ', datetime.datetime.now())
    model.train()
    total_loss = 0.0
    slices = train_data.generate_batch(model.batch_size)
    log_every = int(len(slices) / 5 + 1)  # about five progress lines per epoch
    for j, i in enumerate(slices):
        model.optimizer.zero_grad()
        targets, scores = forward(model, i, train_data)
        targets = trans_to_cuda(torch.as_tensor(np.asarray(targets), dtype=torch.long))
        # Targets are shifted by one because compute_scores drops embedding row 0.
        loss = model.loss_function(scores, targets - 1)
        loss.backward()
        model.optimizer.step()
        # Accumulate a plain float: adding the tensor itself would keep
        # autograd history alive across the whole epoch.
        total_loss += loss.item()
        if j % log_every == 0:
            print('[%d/%d] Loss: %.4f' % (j, len(slices), loss.item()))
    print('\tLoss:\t%.3f' % total_loss)

    print('start predicting: ', datetime.datetime.now())
    model.eval()
    hit, mrr = [], []
    slices = test_data.generate_batch(model.batch_size)
    # No gradients are needed for evaluation; skipping them saves memory and time.
    with torch.no_grad():
        for i in slices:
            targets, scores = forward(model, i, test_data)
            top_items = trans_to_cpu(scores.topk(20)[1]).numpy()
            for ranked, target in zip(top_items, targets):
                # Positions (0-based) at which the true item appears in the top 20.
                positions = np.where(ranked == target - 1)[0]
                found = positions.size > 0
                hit.append(found)
                mrr.append(1 / (positions[0] + 1) if found else 0)
    hit = np.mean(hit) * 100
    mrr = np.mean(mrr) * 100
    # The learning-rate schedule advances once per call, i.e. once per epoch.
    model.scheduler.step()
    return hit, mrr

The following are the options (hyperparameters) used in the code. I have preserved the default values.

In [7]:
class Option:
    """Hyperparameters for the run, kept as plain class attributes."""

    # --- data ---------------------------------------------------------
    dataset = "yoochoose1_64"  # dataset name: diginetica/yoochoose1_64
    validation = False         # evaluate on a split of the training set instead of the test set
    valid_portion = 0.1        # portion of the training set used as validation set

    # --- model --------------------------------------------------------
    hiddenSize = 96            # hidden state size
    nhead = 2                  # number of heads of multi-head attention
    layer = 1                  # number of SAN layers
    feedforward = 4            # multiplier of the hidden state size for the feed-forward width

    # --- optimisation -------------------------------------------------
    epoch = 12                 # number of epochs to train for
    batchSize = 100            # input batch size
    lr = 0.001                 # learning rate
    l2 = 1e-5                  # l2 penalty
    lr_dc_step = 3             # number of scheduler steps between learning-rate decays
    lr_dc = 0.1                # learning-rate decay rate
    patience = 10              # number of non-improving epochs tolerated before early stop


opt = Option()

This code loads the pre-processed train and test data, which are stored as pickle files.

In [8]:

# Load the pre-processed sessions and wrap them in Data objects.
# NOTE: pickle.load can execute arbitrary code while unpickling -- only
# point this at dataset files you produced yourself or otherwise trust.
dataset_dir = './datasets/' + opt.dataset
# Context managers close the files even if unpickling fails.
with open(dataset_dir + '/train.txt', 'rb') as train_file:
    train_data = pickle.load(train_file)
if opt.validation:
    # Hold out part of the training set and evaluate on it instead of the test set.
    train_data, valid_data = split_validation(train_data, opt.valid_portion)
    test_data = valid_data
else:
    with open(dataset_dir + '/test.txt', 'rb') as test_file:
        test_data = pickle.load(test_file)

train_data = Data(train_data, shuffle=True)
test_data = Data(test_data, shuffle=False)

# The number of nodes based on the dataset.
# Any name other than 'diginetica' falls back to the second value.
if opt.dataset == 'diginetica':
    n_node = 43098
else:
    n_node = 37484

Here the SAN model is created. If CUDA is enabled, it will use the CUDA version of the module.

In [9]:
# Build the network from the options, then move it to the GPU when one is available.
model = SelfAttentionNetwork(opt, n_node)
model = trans_to_cuda(model)

The following code trains the model. After each epoch, the best HR@20 and MRR@20 obtained so far are reported, together with the epochs in which they were reached.

In [10]:
# Train for up to opt.epoch epochs, tracking the best metrics seen so far.
start = time.time()
best_result = [0, 0]  # best [HR@20, MRR@20] so far
best_epoch = [0, 0]   # epochs in which those best values were reached
bad_counter = 0       # total number of epochs without any improvement (never reset)
for epoch in range(opt.epoch):
    print('-------------------------------------------------------')
    print('epoch: ', epoch)
    hit, mrr = train_test(model, train_data, test_data)
    improved = False
    if hit >= best_result[0]:
        best_result[0] = hit
        best_epoch[0] = epoch
        improved = True
    if mrr >= best_result[1]:
        best_result[1] = mrr
        best_epoch[1] = epoch
        improved = True
    print('Best Result:')
    # The label previously read "MMR@20"; the metric is MRR (mean reciprocal rank).
    print('\tRecall@20:\t%.4f\tMRR@20:\t%.4f\tEpoch:\t%d,\t%d'% (best_result[0], best_result[1], best_epoch[0], best_epoch[1]))
    if not improved:
        bad_counter += 1
    # Early stop once enough non-improving epochs have accumulated.
    if bad_counter >= opt.patience:
        break
print('-------------------------------------------------------')
end = time.time()
print("Run time: %f s" % (end - start))

-------------------------------------------------------
epoch:  0
start training:  2022-05-01 18:31:22.966508


  """


[0/3699] Loss: 10.5344
[740/3699] Loss: 6.2977
[1480/3699] Loss: 5.7345
[2220/3699] Loss: 4.5850
[2960/3699] Loss: 4.8966
	Loss:	20159.918
start predicting:  2022-05-01 18:49:23.135382
Best Result:
	Recall@20:	69.2798	MMR@20:	29.2985	Epoch:	0,	0
-------------------------------------------------------
epoch:  1
start training:  2022-05-01 18:49:57.106424
[0/3699] Loss: 4.1349
[740/3699] Loss: 4.3210
[1480/3699] Loss: 4.3392
[2220/3699] Loss: 4.2475
[2960/3699] Loss: 4.1755
	Loss:	15730.980
start predicting:  2022-05-01 19:08:40.334549
Best Result:
	Recall@20:	70.3621	MMR@20:	29.4718	Epoch:	1,	1
-------------------------------------------------------
epoch:  2
start training:  2022-05-01 19:09:12.521523
[0/3699] Loss: 4.1631
[740/3699] Loss: 3.8239
[1480/3699] Loss: 4.3064
[2220/3699] Loss: 4.2536
[2960/3699] Loss: 4.0381
	Loss:	14961.311
start predicting:  2022-05-01 19:27:40.090421
Best Result:
	Recall@20:	70.4462	MMR@20:	29.9591	Epoch:	2,	2
--------------------------------------------