# SASRec (Self-Attentive Sequential Recommendation) Implementation

이 노트북에서는 **SASRec** 모델을 구현합니다.
- **핵심 아이디어**: 유저가 과거에 본 영화들의 **순서(Sequence)**를 보고, 다음에 볼 영화를 예측합니다.
- **구조**: Transformer의 Self-Attention 메커니즘을 사용하여 시퀀스 내의 패턴을 파악합니다.

### 1. 데이터 로드 및 시퀀스 생성

In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

# 1. Load the ratings table (the code below reads the columns
#    userId, movieId and timestamp from it).
ratings = pd.read_csv('../data/ratings.csv')

# 2. Down-sample to 10% of the users for speed.
# Sampling is done per user (not per row) so every kept user retains the
# full interaction history, which a sequence model needs.
# A seeded generator makes the sampled user set -- and therefore item2idx,
# num_items and every downstream result -- reproducible across runs.
rng = np.random.default_rng(42)
user_ids = ratings['userId'].unique()
sample_user_ids = rng.choice(user_ids, size=int(len(user_ids) * 0.1), replace=False)
ratings = ratings[ratings['userId'].isin(sample_user_ids)].copy()

# 3. Re-index items. Index 0 is reserved for padding, so real items start at 1.
item_ids = ratings['movieId'].unique()
item2idx = {m: i+1 for i, m in enumerate(item_ids)}
ratings['item_idx'] = ratings['movieId'].map(item2idx)
num_items = len(item_ids) + 1 # +1 accounts for the padding index 0

# 4. Build one sequence per user: sort by time within each user, then
#    collect the item indices into a list.
ratings = ratings.sort_values(['userId', 'timestamp'])
user_group = ratings.groupby('userId')['item_idx'].apply(list).reset_index()

print(f"Users: {len(user_group)}, Items: {num_items}")
user_group.head()

Users: 20094, Items: 40295


Unnamed: 0,userId,item_idx
0,1,"[135, 138, 132, 140, 131, 24, 31, 49, 62, 70, ..."
1,8,"[144, 23, 147, 65, 26, 148, 128, 151, 153, 162..."
2,14,"[185, 165, 191, 166, 179, 188, 177, 171, 169, ..."
3,31,"[193, 243, 161, 245, 242, 244, 201, 203, 233, ..."
4,32,"[284, 253, 283, 254, 268, 269, 282, 260, 264, ..."


### 2. Dataset & DataLoader
- **Input**: `[영화1, 영화2, 영화3, ..., 영화N-1]`
- **Target**: `[영화2, 영화3, 영화4, ..., 영화N]` (한 칸씩 밀린 것)
- **Padding**: 시퀀스 길이를 맞추기 위해 앞부분을 0으로 채움

In [2]:
class SASRecDataset(Dataset):
    """Turn per-user item sequences into fixed-length (input, target) pairs.

    Each sequence is reduced to a window of ``max_len + 1`` items: longer
    sequences keep only their most recent items, shorter ones are left-padded
    with 0. The input is the window without its last element and the target
    is the window without its first element, i.e. the input shifted by one.
    """

    def __init__(self, user_sequences, max_len=50):
        self.max_len = max_len
        self.user_sequences = user_sequences

    def __len__(self):
        return len(self.user_sequences)

    def __getitem__(self, idx):
        # One extra item is needed so that both the input and the shifted
        # target end up with exactly ``max_len`` entries.
        window_size = self.max_len + 1
        history = self.user_sequences[idx]
        missing = window_size - len(history)

        if missing > 0:
            # Too short: fill the front with the padding index 0.
            window = [0] * missing + history
        else:
            # Long enough: keep only the most recent items.
            window = history[-window_size:]

        # Input drops the last item, target drops the first one.
        return torch.LongTensor(window[:-1]), torch.LongTensor(window[1:])

# Settings shared by the training and validation pipelines.
max_len = 50
batch_size = 128

# Train/validation split over the per-user sequences: 10% of them are held
# out, with a fixed random_state so the split is repeatable.
all_user_seqs = user_group['item_idx'].tolist()
train_seqs, val_seqs = train_test_split(
    all_user_seqs,
    test_size=0.1,
    random_state=42,
)

train_ds = SASRecDataset(train_seqs, max_len=max_len)
val_ds = SASRecDataset(val_seqs, max_len=max_len)

# Only the training batches are shuffled; validation order stays fixed.
train_loader = DataLoader(dataset=train_ds, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(dataset=val_ds, batch_size=batch_size, shuffle=False)

### 3. SASRec Model Architecture
- **Embedding**: Item Embedding + Position Embedding
- **Transformer Block**: LayerNorm -> Self-Attention -> LayerNorm -> FeedForward (Pre-LayerNorm, `norm_first=True`)
- **Prediction**: Output * Item Embedding (Dot Product)

In [3]:
class SASRec(nn.Module):
    """Self-attentive sequential recommender built on ``nn.TransformerEncoder``.

    Item and learned position embeddings are summed, passed through a stack
    of Pre-LayerNorm Transformer encoder layers with a causal mask, and the
    resulting hidden states are scored against the item embedding table.

    Args:
        num_items: Size of the item vocabulary, including index 0 which is
            reserved for padding.
        max_len: Number of rows in the position embedding table; input
            sequences must not be longer than this.
        embed_dim: Embedding / model dimension.
        num_heads: Number of attention heads.
        num_layers: Number of encoder layers.
        dropout: Dropout probability used on the embeddings and inside the
            encoder layers.
    """

    def __init__(self, num_items, max_len, embed_dim=64, num_heads=2, num_layers=2, dropout=0.1):
        super().__init__()
        self.num_items = num_items
        self.max_len = max_len

        # Embeddings (index 0 of the item table is the padding vector).
        self.item_embedding = nn.Embedding(num_items, embed_dim, padding_idx=0)
        self.position_embedding = nn.Embedding(max_len, embed_dim)

        # Transformer encoder
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=embed_dim,
            nhead=num_heads,
            dim_feedforward=embed_dim*4,
            dropout=dropout,
            batch_first=True, # tensors are (Batch, Seq, Feature)
            norm_first=True   # Pre-LayerNorm, chosen for training stability
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        self.layer_norm = nn.LayerNorm(embed_dim)
        self.dropout = nn.Dropout(dropout)

        self._init_weights()

    def _init_weights(self):
        """Xavier-initialise both embedding tables, keeping the padding row at zero."""
        nn.init.xavier_uniform_(self.item_embedding.weight)
        nn.init.xavier_uniform_(self.position_embedding.weight)
        # xavier_uniform_ overwrites the all-zero row that nn.Embedding creates
        # for padding_idx. That row never receives a gradient, so without
        # re-zeroing it the padding vector would stay at a random value forever.
        with torch.no_grad():
            self.item_embedding.weight[self.item_embedding.padding_idx].zero_()

    def forward(self, input_seq):
        """Score every item at every position of ``input_seq``.

        Args:
            input_seq: Integer tensor of item indices, shape (Batch, Seq),
                where 0 marks padding.

        Returns:
            Tensor of shape (Batch, Seq, num_items) holding the dot product
            of each position's hidden state with every item embedding.
        """
        seq_len = input_seq.size(1)
        device = input_seq.device

        # Padding mask: True marks positions to ignore (where the index is 0).
        src_key_padding_mask = (input_seq == 0)

        # Causal mask: True above the diagonal, so position i cannot attend to
        # any j > i. It is boolean to match src_key_padding_mask, because
        # PyTorch deprecates mixing a float mask with a boolean padding mask.
        position_ids = torch.arange(seq_len, device=device)
        src_mask = position_ids.unsqueeze(0) > position_ids.unsqueeze(1)

        # Embedding: item vectors plus position vectors (broadcast over batch).
        items = self.item_embedding(input_seq)
        positions = self.position_embedding(position_ids)
        x = items + positions
        x = self.dropout(x)

        # Transformer
        # mask: blocks attention to future positions
        # src_key_padding_mask: blocks attention to padded positions
        out = self.transformer_encoder(x, mask=src_mask, src_key_padding_mask=src_key_padding_mask)
        out = self.layer_norm(out)

        # Prediction: dot product with every item embedding.
        # (Batch, Seq, Dim) @ (Num_Items, Dim)^T -> (Batch, Seq, Num_Items)
        logits = torch.matmul(out, self.item_embedding.weight.transpose(0, 1))

        return logits

### 4. Training Loop

In [4]:
import os

# Pick the best available accelerator: CUDA first, then Apple MPS, else CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

model = SASRec(num_items, max_len).to(device)
criterion = nn.CrossEntropyLoss(ignore_index=0) # targets equal to 0 (padding) are excluded from the loss
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

epochs = 10
best_val_loss = float('inf')

# Make sure the checkpoint directory exists before the first save; otherwise
# torch.save fails after a full epoch of training has already been spent.
checkpoint_path = '../models/sasrec_model.pth'
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

for epoch in range(epochs):
    # ---- Training ----
    model.train()
    train_loss = 0

    for input_seq, target_seq in train_loader:
        input_seq, target_seq = input_seq.to(device), target_seq.to(device)

        optimizer.zero_grad()
        logits = model(input_seq)

        # Flatten for the loss:
        # logits: (Batch, Seq, Num_Items) -> (Batch*Seq, Num_Items)
        # target: (Batch, Seq) -> (Batch*Seq)
        loss = criterion(logits.reshape(-1, num_items), target_seq.reshape(-1))

        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    # Mean of the per-batch losses.
    avg_train_loss = train_loss / len(train_loader)

    # ---- Validation ----
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for input_seq, target_seq in val_loader:
            input_seq, target_seq = input_seq.to(device), target_seq.to(device)
            logits = model(input_seq)
            loss = criterion(logits.reshape(-1, num_items), target_seq.reshape(-1))
            val_loss += loss.item()

    avg_val_loss = val_loss / len(val_loader)

    print(f"Epoch {epoch+1:2d} | Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f}")

    # Keep only the weights with the lowest validation loss seen so far.
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        torch.save(model.state_dict(), checkpoint_path)
        print("  -> Saved Best Model")

Using device: mps




Epoch  1 | Train Loss: 8.3660 | Val Loss: 7.8820
  -> Saved Best Model
Epoch  2 | Train Loss: 7.7230 | Val Loss: 7.6272
  -> Saved Best Model
Epoch  3 | Train Loss: 7.3684 | Val Loss: 7.2180
  -> Saved Best Model
Epoch  4 | Train Loss: 7.1008 | Val Loss: 7.0200
  -> Saved Best Model
Epoch  5 | Train Loss: 6.9097 | Val Loss: 6.8641
  -> Saved Best Model
Epoch  6 | Train Loss: 6.7813 | Val Loss: 6.7826
  -> Saved Best Model
Epoch  7 | Train Loss: 6.6884 | Val Loss: 6.7126
  -> Saved Best Model
Epoch  8 | Train Loss: 6.6009 | Val Loss: 6.6454
  -> Saved Best Model
Epoch  9 | Train Loss: 6.5180 | Val Loss: 6.5917
  -> Saved Best Model
Epoch 10 | Train Loss: 6.4452 | Val Loss: 6.5429
  -> Saved Best Model


In [5]:
# Inference example
# Feed a user's most recent sequence and predict the top 10 next items (movies).

# Restore the saved weights and switch to evaluation mode.
best_weights = torch.load('../models/sasrec_model.pth')
model.load_state_dict(best_weights)
model.eval()

def recommend(user_seq, k=10):
    """Return the indices of the top-k items predicted to follow ``user_seq``.

    The sequence is left-padded with 0 or truncated to its most recent
    ``max_len`` items (module-level setting), run through the module-level
    ``model`` on ``device``, and the scores at the last time step are ranked.

    Args:
        user_seq: List of item indices in chronological order.
        k: Number of items to return. It is capped at the number of real
            (non-padding) items.

    Returns:
        NumPy array of item indices, highest score first. The padding
        index 0 is never returned.
    """
    # Preprocess: pad on the left or keep the most recent max_len items.
    seq_len = len(user_seq)
    if seq_len < max_len:
        pad_len = max_len - seq_len
        seq = [0] * pad_len + user_seq
    else:
        seq = user_seq[-max_len:]

    input_tensor = torch.LongTensor([seq]).to(device)

    with torch.no_grad():
        logits = model(input_tensor)
        # Only the prediction at the last time step is needed.
        # Clone so the in-place masking below cannot touch the model output.
        last_logits = logits[0, -1, :].clone() # (Num_Items,)

        # Index 0 is the padding slot, not a real item. Exclude it here so
        # callers always get k real items instead of having to drop it later.
        last_logits[0] = float('-inf')

        # torch.topk raises if k exceeds the number of candidates.
        k = min(k, last_logits.numel() - 1)
        _, top_k_indices = torch.topk(last_logits, k)

    return top_k_indices.cpu().numpy()

# Use the first sequence of the validation split as a test user.
test_seq = val_seqs[0]
recommended_indices = recommend(test_seq)

# Reverse mapping: model item index -> original movie ID.
idx2item = dict(zip(item2idx.values(), item2idx.keys()))

# Show the tail of the input history as movie IDs ('Pad' for unknown indices).
last_five = [idx2item.get(i, 'Pad') for i in test_seq[-5:]]
print(f"Input Sequence (Last 5): {last_five}")
print("Recommended Top 10:")
for idx in recommended_indices:
    # Index 0 is the padding slot and has no movie ID, so it is not printed.
    if idx != 0:
        print(f"Movie ID: {idx2item[idx]}")

Input Sequence (Last 5): [np.int64(79132), np.int64(48082), np.int64(778), np.int64(44694), np.int64(1617)]
Recommended Top 10:
Movie ID: 7361
Movie ID: 44555
Movie ID: 4973
Movie ID: 4226
Movie ID: 55820
Movie ID: 1089
Movie ID: 296
Movie ID: 1193
Movie ID: 1213
Movie ID: 6016
