In [1]:
from src.data import data_split, WrapBatch
data_type = "movielens"
user_num, item_num, Seq_train, Seq_val, Seq_test = data_split(data_type)

sampler = WrapBatch(
    Seq_train,
    user_num,
    item_num,
    batch_size = 16,
    max_len = 200,
    n_workers = 2
)

u, seq, pos, neg = sampler.next_batch()

In [None]:
# 1) Item Embedding (batch_size, max_len, K)
# 2) Position Embedding (0 ~ 199) if max_len == 200 (batchsize, max_len, K)
# 3) Zero-padding 

In [20]:
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim


In [18]:
class PointWiseFeedForward(nn.Module):
    def __init__(self, 
                hidden_units,
                dropout_rate):
        super(PointWiseFeedForward, self).__init__()

        self.conv1 = nn.Conv1d(hidden_units, hidden_units, kernel_size = 1) # 입력(A, B, C) -> 출력(A, B, C)
        self.dropout1 = nn.Dropout(p = dropout_rate)
        self.relu = nn.ReLU()
        self.conv2 = nn.Conv2d(hidden_units, hidden_units, kernel_size = 1)
        self.dropout2 = nn.Dropout(p = dropout_rate)

    def forward(self, inputs): # inputs: (batch_size, max_len, K)
        inputs = inputs.transpose(-1, -2) # 마지막 차원 K와 마지막에서 두 번째 차원 max_len 변경 -> (B, K, max_len)
        inputs = self.conv1(inputs)
        inputs = self.dropout1(inputs)
        inputs = self.relu(inputs)
        inputs = self.conv2(inputs)
        inputs = self.dropout2(inputs)
        outputs = inputs.transpose(-1, -2) # 복원 (batch_size, max_len, K)
        return outputs

In [13]:
class SASRec(nn.Module):
    def __init__(self,
                user_num, 
                item_num, 
                K, 
                max_len, 
                dropout_rate,
                num_blocks, 
                num_heads,
                hidden_units,
                first_norm, 
                device):
        super().__init__()

        assert K % num_heads == 0

        self.user_num = user_num
        self.item_num = item_num
        self.K = K
        self.max_len = max_len
        self.dropout_rate = dropout_rate
        self.num_blocks = num_blocks # MHA 개수
        self.num_heads = num_heads # Head width
        self.hidden_units = hidden_units
        self.first_norm = first_norm
        self.device = device

        self.item_emb = nn.Embedding(self.item_num + 1, self.K, padding_idx = 0)
        self.pos_emb = nn.Embedding(self.max_len + 1, self.K, padding_idx = 0)
        self.emb_dropout = nn.Dropout(p = self.dropout_rate)

        self.attention_layernorms = nn.ModuleList()
        self.attention_layers = nn.ModuleList()
        self.forward_layernorms = nn.ModuleList()
        self.forward_layers = nn.ModuleList()

        self.last_layernorm = nn.LayerNorm(self.hidden_units, eps = 1e-8)

        for _ in range(self.num_blocks):
            new_attn_laternorm = nn.LayerNorm(self.hidden_units, eps = 1e-8)
            self.attention_layernorms.append(new_attn_laternorm)

            new_attn_layer = nn.MultiheadAttention(
                self.hidden_units,
                self.num_heads,
                self.dropout_rate
            )
            self.attention_layers.append(new_attn_layer)

            new_fwd_layer_norm = nn.LayerNorm(self.hidden_units, eps = 1e-8)
            self.forward_layernorms.append(new_fwd_layer_norm)

            new_fwd_layer = PointWiseFeedForward(self.hidden_units, self.dropout_rate)
            self.forward_layers.append(new_fwd_layer)

    def log2feats(self, logs):
        seqs = self.item_emb(torch.LongTensor(logs).to(self.device))
        seqs *= self.item_emb.embedding_dim ** 0.5 # normalization
        
        poss = np.tile(np.arange(1, logs.shape[1] + 1, [logs.shpe[0], 1]))
        poss *= (logs != 0) # 로그 기록에서 padding 반영

        seqs += self.pos_emb(torch.LongTensor(poss).to(self.device)) # 로그 임베딩 + 포지션 임베딩(learnable paramter)
        seqs = self.emb_dropout(seqs)

        tl = seqs.shape[1]
        attention_mask = ~torch.tril(torch.ones(tl, tl), dtype = torch.bool, device = self.device)

        for i in range(self.num_blocks):
            seqs = torch.transpose(seqs, 0, 1) # (max_len, Batch_size, K)
            if self.first_norm:
                x = self.attention_layernorms[i](seqs)
                mha_outputs, _ = self.attention_layers[i](
                    x, x, x,
                    attn_mask = attention_mask
                )
                seqs = seqs + mha_outputs # resudual
                seqs = torch.traanspose(seqs, 0, 1) # (batch_size, max_len, K)
                seqs = seqs + self.forward_layers[i](self.forward_layernorms[i](seqs))

            else:
                mha_outputs, _ = self.attention_layers[i](
                    seqs, seqs, seqs,
                    attn_mask = attention_mask
                )
                seqs = self.attention_layernorms[i](seqs + mha_outputs)
                seqs = torch.transpose(seqs, 0, 1)
                seqs = self.forward_layernorms[i](seqs + self.forward_layers[i](seqs))

        outputs = self.last_layernorm(seqs)
        return outputs
    
    def forward(self,
                user_ids,
                seqs,
                pos_seqs,
                neg_seqs):
        log_feats = self.log2feats(seqs)

        pos_embs = self.item_emb(torch.LongTensor(pos_seqs).to(self.device))
        neg_embs = self.item_emb(torch.LongTensor(neg_seqs).to(self.device))

        pos_logits = (log_feats * pos_embs).sum(dim = -1)
        neg_logits = (log_feats ( neg_embs)).sum(dim = -1)

        return pos_logits, neg_logits

        


        

            

In [47]:
import torch
import torch.nn as nn
import numpy as np


class SASRec(nn.Module):
    def __init__(
        self,
        user_num,
        item_num,
        K,
        max_len,
        dropout_rate,
        num_blocks,
        num_heads,
        hidden_units,
        first_norm,
        device,
    ):
        super().__init__() 

        # K(임베딩 차원)과 head 수 체크
        assert K % num_heads == 0
        assert (
            K == hidden_units
        ), "K(embedding dim)와 hidden_units는 동일하게 두는 게 안전합니다."

        self.user_num = user_num
        self.item_num = item_num
        self.K = K
        self.max_len = max_len
        self.dropout_rate = dropout_rate
        self.num_blocks = num_blocks  # MHA block 개수
        self.num_heads = num_heads    # head 수
        self.hidden_units = hidden_units  # = K
        self.first_norm = first_norm      # Pre-LN 여부
        self.device = device

        # 임베딩: (item_id → K차원), (position → K차원)
        self.item_emb = nn.Embedding(self.item_num + 1, self.hidden_units, padding_idx=0)
        self.pos_emb = nn.Embedding(self.max_len + 1, self.hidden_units, padding_idx=0)
        self.emb_dropout = nn.Dropout(p=self.dropout_rate)

        # Transformer block 구성 요소들
        self.attention_layernorms = nn.ModuleList()
        self.attention_layers = nn.ModuleList()
        self.forward_layernorms = nn.ModuleList()
        self.forward_layers = nn.ModuleList()

        self.last_layernorm = nn.LayerNorm(self.hidden_units, eps=1e-8)

        for _ in range(self.num_blocks):
            # Self-Attention 앞/뒤에 들어갈 LN
            new_attn_layernorm = nn.LayerNorm(self.hidden_units, eps=1e-8)
            self.attention_layernorms.append(new_attn_layernorm)

            # Multi-Head Self-Attention
            new_attn_layer = nn.MultiheadAttention(
                embed_dim=self.hidden_units,
                num_heads=self.num_heads,
                dropout=self.dropout_rate,
                batch_first=False,  # (T, B, E) 형식 사용
            )
            self.attention_layers.append(new_attn_layer)

            # FFN 앞/뒤에 들어갈 LN
            new_fwd_layernorm = nn.LayerNorm(self.hidden_units, eps=1e-8)
            self.forward_layernorms.append(new_fwd_layernorm)

            # Position-wise FFN
            new_fwd_layer = PointWiseFeedForward(self.hidden_units, self.dropout_rate)
            self.forward_layers.append(new_fwd_layer)

    def log2feats(self, logs):
        """
        logs: (batch_size, max_len) 형태의 item id 시퀀스 (numpy array 또는 tensor)
        return: (batch_size, max_len, hidden_units) 시퀀스 표현
        """
        # numpy → LongTensor → device
        if isinstance(logs, np.ndarray):
            logs_t = torch.LongTensor(logs).to(self.device)
        else:
            logs_t = logs.long().to(self.device)

        # (B, T, K)
        seqs = self.item_emb(logs_t)
        # 임베딩 스케일링 (Transformer 스타일)
        seqs *= self.item_emb.embedding_dim ** 0.5

        # position index: (B, T)
        # [[1, 2, 3, ..., T],
        #  [1, 2, 3, ..., T],
        #   ... ]
        poss = np.tile(
            np.arange(1, logs_t.shape[1] + 1),
            (logs_t.shape[0], 1)
        )
        # padding(0) 위치는 0으로 만들어서 pos_emb 영향 제거
        poss *= (logs_t.cpu().numpy() != 0)

        poss_t = torch.LongTensor(poss).to(self.device)
        # item_emb + pos_emb
        seqs = seqs + self.pos_emb(poss_t)  # (B, T, K)
        seqs = self.emb_dropout(seqs)

        # 인과 마스크 생성 (T, T)
        tl = seqs.shape[1]
        attention_mask = ~torch.tril(
            torch.ones((tl, tl), dtype=torch.bool, device=self.device)
        )
        # MultiheadAttention은 (T, B, K)를 받으므로 transposed 상태에서 mask를 그대로 사용

        # Transformer blocks
        for i in range(self.num_blocks):
            # (B, T, K) → (T, B, K)
            seqs = torch.transpose(seqs, 0, 1)

            if self.first_norm:
                # Pre-LN 방식
                x = self.attention_layernorms[i](seqs)  # (T, B, K)
                mha_outputs, _ = self.attention_layers[i](
                    x, x, x,
                    attn_mask=attention_mask
                )
                seqs = seqs + mha_outputs  # residual
                seqs = torch.transpose(seqs, 0, 1)  # (B, T, K)
                seqs = seqs + self.forward_layers[i](self.forward_layernorms[i](seqs))
            else:
                # Post-LN 방식
                mha_outputs, _ = self.attention_layers[i](
                    seqs, seqs, seqs,
                    attn_mask=attention_mask
                )
                seqs = self.attention_layernorms[i](seqs + mha_outputs)
                seqs = torch.transpose(seqs, 0, 1)  # (B, T, K)
                seqs = self.forward_layernorms[i](seqs + self.forward_layers[i](seqs))

        # 마지막 LayerNorm (B, T, K)
        outputs = self.last_layernorm(seqs)
        return outputs

    def forward(self,
                seqs,      # (B, T)
                pos_seqs,  # (B, T) positive target
                neg_seqs   # (B, T) negative target
                ):
        # 시퀀스 → 시점별 hidden 표현
        log_feats = self.log2feats(seqs)  # (B, T, K)

        pos_seqs_t = torch.LongTensor(pos_seqs).to(self.device)
        neg_seqs_t = torch.LongTensor(neg_seqs).to(self.device)

        pos_embs = self.item_emb(pos_seqs_t)  # (B, T, K)
        neg_embs = self.item_emb(neg_seqs_t)  # (B, T, K)

        # 시점별 dot product → 각 타임스텝의 (positive/negative) logit
        pos_logits = (log_feats * pos_embs).sum(dim=-1)  # (B, T)
        neg_logits = (log_feats * neg_embs).sum(dim=-1)  # (B, T)

        return pos_logits, neg_logits#, log_feats


In [5]:
max_len = 200

item_emb = nn.Embedding(item_num + 1, 8, padding_idx = 0)
pos_emb = nn.Embedding(max_len + 1, 8, padding_idx=0)

In [6]:
seq = np.array(seq)

In [7]:
seq.shape[0]

16

In [8]:
torch.LongTensor(seq)

tensor([[     0,      0,      0,  ...,     28,    125,    538],
        [     0,      0,      0,  ...,  79132, 112552, 139385],
        [  2380,   5507,   1721,  ...,   2606,   7142,   6887],
        ...,
        [     0,      0,      0,  ...,   1270,   2407,   3033],
        [     0,      0,      0,  ...,    515,    370,    594],
        [ 60040,  61132,  49651,  ...,   3683,  71464,  81591]])

In [9]:
position = np.tile(np.arange(1, seq.shape[1] + 1), [seq.shape[0], 1]) # 동일한 차원의 포지션 인코딩

In [10]:
position *= (seq != 0)

In [11]:
position

array([[  0,   0,   0, ..., 198, 199, 200],
       [  0,   0,   0, ..., 198, 199, 200],
       [  1,   2,   3, ..., 198, 199, 200],
       ...,
       [  0,   0,   0, ..., 198, 199, 200],
       [  0,   0,   0, ..., 198, 199, 200],
       [  1,   2,   3, ..., 198, 199, 200]], shape=(16, 200))

In [12]:
tl = seq.shape[1]

In [13]:
print(tl)

200


In [None]:
attention_mask = ~torch.tril(torch.ones((tl, tl), dtype=torch.bool, device=self.dev))

In [24]:
item_emb(torch.LongTensor(seq)).shape

torch.Size([16, 200, 8])

In [26]:
pos_emb(torch.LongTensor(pos)).shape

IndexError: index out of range in self

In [5]:
a = np.array([5, 6])

In [6]:
a

array([5, 6])

In [7]:
np.tile(a, 3)

array([5, 6, 5, 6, 5, 6])

In [48]:
batcher = WrapBatch(Seq_train, user_num, item_num, batch_size = 16)
model = SASRec(user_num = user_num,
                item_num = item_num,
                K = 16,
                max_len = 200,
                dropout_rate = 0.1,
                num_blocks = 2,
                num_heads = 1,
                hidden_units = 16,
                first_norm = True,
                device = "cpu")

In [49]:
bce_criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr = 1e-4)
num_epochs = 2
num_batch = 16

In [35]:
u, seq, pos, neg = np.array(u), np.array(seq), np.array(pos), np.array(neg)

In [38]:
a, b, c = model(seq, pos, neg)

In [31]:
a

tensor([[  0.0000,   0.0000,   0.0000,  ...,  -2.2961,   0.6431,   1.8700],
        [ -4.4368,   7.4203,  -0.0522,  ...,  -5.6885,  -0.8233,   8.0495],
        [  0.0000,   0.0000,   0.0000,  ...,  -2.5846,  -4.8858,  -6.8889],
        ...,
        [  1.6703,  -0.1006,  -0.9466,  ...,  -4.2049,   2.3656,   0.5574],
        [  0.0000,   0.0000,   0.0000,  ...,   4.1725,   5.3107, -13.6621],
        [  0.0000,   0.0000,   0.0000,  ...,   0.6092,   5.3331,   2.5382]],
       grad_fn=<SumBackward1>)

In [30]:
a.shape

torch.Size([16, 200])

In [40]:
c.shape

torch.Size([16, 200, 16])

In [None]:
c[0][0]

tensor([-0.4548,  0.0203,  1.7574,  0.2292, -0.4827,  0.2113, -0.5580,  1.7425,
         0.2174,  0.8339, -2.4244,  0.4552, -0.2461, -0.5008,  0.4849, -1.2853],
       grad_fn=<SelectBackward0>)

In [50]:
pos_labels, neg_labels = torch.ones(a.shape, device = "cpu"), torch.ones(b.shape, device = "cpu")

In [51]:
print(pos_labels)

tensor([[1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        ...,
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.]])


In [52]:
import random

def evaluate(model, dataset,
            max_len: int = 200,
            is_test: bool = False):

    train, val, test, user_num, item_num = dataset.copy()

    NDCG, HT = 0.0, 0.0
    valid_user = 0.0
    
    if user_num > 10000:
        users = random.sample(range(1, user_num + 1), 10000)
    else:
        users = range(1, user_num + 1)

    for u in users:
        if len(train[u]) < 1 or len(test[u]) < 1: continue

        seq = np.zeros([max_len], dtype = np.int32) # history 구성
        idx = max_len - 1
        if is_test:
            seq[idx] = val[u][0]
            idx -= 1
        for i in reversed(train[u]):
            seq[idx] = i
            idx -= 1
            if idx == -1: break

        rated = set(train[u]) 
        rated.add(0)

        item_idx = [test[u][0]] if is_test else [val[u][0]]# 정답
        for _ in range(100):
            t = np.random.randint(1, item_num + 1)
            while t in rated: t = np.random.randint(1, item_num + 1)
            item_idx.append(t)

        predictions = - model.predict(*[np.array(l) for l in [[u], [seq], item_idx]]) # negative score for ascending
        predictions = predictions[0] # 정답 아이템에 대한 스코어

        rank = predictions.argsort().argsort()[0].item() # 정답 아이템이 해당 스코어들 상에서의 랭킹, int

        valid_user += 1

        if rank < 10:
            NDCG += 1 / np.log2(rank + 2)
            HT += 1

    return NDCG / valid_user, HT / valid_user



In [None]:
best_val_ndcg, best_val_hr = 0.0, 0.0
best_test_ndcg, best_test_hr = 0.0, 0.0
# T = 0.0 # 학습 경과 누적 시간 (total elapsed time)

for epoch in range(1, num_epochs + 1):
    for step in range(num_batch):
        u, seq, pos, neg = sampler.next_batch()
        u, seq, pos, neg = np.array(u), np.array(seq), np.array(pos), np.array(neg)
        pos_logits, neg_logits = model(seq, pos, neg)
        pos_labels, neg_lables = torch.ones(pos_logits.shape, device = "cpu"), torch.zeros(neg_logits.shape, device = "cpu")
        
        indices = np.where(pos != 0)
        optimizer.zero_grad()
        loss = bce_criterion(pos_logits[indices], pos_labels[indices])
        loss += bce_criterion(neg_logits[indices], neg_labels[indices])

        # for param in model.item_emb.parameters():
        #     loss += args.l2_emb * torch.sum(param ** 2) # L2정규화

        loss.backward()
        optimizer.step()

    if step % 2 == 0: # 2 epoch 마다 평가
        model.eval()
        print("Evaluating...")
        val_NDCG, val_HR = evaluate(model, dataset, max_len, is_test = False)
        test_NDCG, test_HR = evaluate(model, dataset, max_len, is_test = True)

        
        


        

        


In [None]:
# evaluate
# 1) 유저 시퀀스 구성, validation 경우, history: train, test의 경우, history: train + validation
# 2) history를 고정 길이의 seq로 변환
# 3) candidate 아이템 1개 만들기: 정답 1개(validation, test) + negative 100개(train아이테 제외, 랜덤하게)
# 4) 모델이 각 candidate 아이템의 점수를 계산
# 5) 각 아이템의 ranking 계산 및 정렬
# 6) Top-K 안에 들어오는지 확인
