# Transformer 기반 시퀀스 추천 모델

유저의 영화 시청 시퀀스를 기반으로 다음에 볼 영화를 예측하는 모델

## 모델 구조
```
Input: [movie_1, movie_2, ..., movie_t] (시청 순서대로)
       ↓ (KG+BERT embedding lookup)
       [emb_1, emb_2, ..., emb_t] (300-dim each)
       ↓ (+ positional encoding)
       Transformer Encoder
       ↓
       predicted_next_embedding (300-dim)
       ↓ (cosine similarity with all movie embeddings)
       Top-K 추천
```

## 학습 방식
- Input: 유저의 시청 시퀀스 [1:t]
- Target: 다음 영화 임베딩 [t+1]
- Loss: Cosine Embedding Loss 또는 InfoNCE (Contrastive)

In [None]:
import sys
import os

# MPS fallback 설정 (Mac에서 Transformer 호환성 문제 해결)
os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1'

sys.path.insert(0, "/Users/jisoo/projects/thesis/carte_test")

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import platform

from config import PROCESSED

# 한글 폰트
if platform.system() == 'Darwin':
    plt.rcParams['font.family'] = 'AppleGothic'
plt.rcParams['axes.unicode_minus'] = False

# Device
device = torch.device('mps' if torch.backends.mps.is_available() else 'cuda' if torch.cuda.is_available() else 'cpu')
print(f"Device: {device}")

## 1. 데이터 로드

In [40]:
# 평점 데이터 로드
ratings = pd.read_parquet(PROCESSED.RATINGS_PARQUET)
print(f"평점 수: {len(ratings):,}")
print(f"유저 수: {ratings['userId'].nunique():,}")
print(f"영화 수: {ratings['movieId'].nunique():,}")
ratings.head()

평점 수: 13,717,662
유저 수: 200,948
영화 수: 54,520


Unnamed: 0,userId,movieId,rating,timestamp
0,3,3248,4.0,1084486164
1,3,1957,5.0,1084486061
2,3,534,4.0,1084486058
3,3,2150,4.0,1084486055
4,3,26,4.0,1084486051


In [41]:
# KG+BERT 임베딩 로드
EMB_PATH = PROCESSED.DIR / "ablation_embeddings" / "emb_kg_gnn_bert.parquet"

if not EMB_PATH.exists():
    print(f"Warning: {EMB_PATH} not found, trying kg_gnn...")
    EMB_PATH = PROCESSED.DIR / "ablation_embeddings" / "emb_kg_gnn.parquet"

emb_df = pd.read_parquet(EMB_PATH)
print(f"임베딩 영화 수: {len(emb_df):,}")

# 임베딩 행렬 생성
movie_ids = emb_df['movieId'].to_numpy()
embeddings = np.array(emb_df['embedding'].tolist(), dtype=np.float32)
emb_dim = embeddings.shape[1]
print(f"임베딩 차원: {emb_dim}")

# movieId → index 매핑
movie_to_idx = {mid: i for i, mid in enumerate(movie_ids)}
idx_to_movie = {i: mid for mid, i in movie_to_idx.items()}

# 정규화
norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
embeddings_norm = embeddings / (norms + 1e-12)

임베딩 영화 수: 53,630
임베딩 차원: 300


## 2. 시퀀스 데이터 생성

In [None]:
# 임베딩이 있는 영화만 필터링
valid_movies = set(movie_ids)
ratings_filtered = ratings[ratings['movieId'].isin(valid_movies)].copy()
print(f"필터링 후 평점 수: {len(ratings_filtered):,}")

# ========================================
# 학습 데이터 설정
# ========================================
SAMPLE_USERS = 10000  # 유저 수 (전체: None)

if SAMPLE_USERS is not None:
    sampled_user_ids = ratings_filtered['userId'].drop_duplicates().sample(n=SAMPLE_USERS, random_state=42)
    ratings_filtered = ratings_filtered[ratings_filtered['userId'].isin(sampled_user_ids)]
    print(f"샘플링 후 평점 수: {len(ratings_filtered):,} ({SAMPLE_USERS:,} 유저)")

# 유저별로 시청 시퀀스 생성 (timestamp 순서)
print("유저별 시퀀스 생성 중...")
user_sequences = (
    ratings_filtered
    .sort_values(['userId', 'timestamp'])
    .groupby('userId')['movieId']
    .apply(list)
    .to_dict()
)

# 시퀀스 길이 통계
seq_lengths = [len(seq) for seq in user_sequences.values()]
print(f"\n유저 수: {len(user_sequences):,}")
print(f"시퀀스 길이 - min: {min(seq_lengths)}, max: {max(seq_lengths)}, mean: {np.mean(seq_lengths):.1f}")

In [None]:
# Train/Val/Test 분할 (유저 단위)
# 각 유저의 마지막 영화를 test, 그 전 영화를 val로 사용

MIN_SEQ_LEN = 5  # 최소 시퀀스 길이
MAX_SEQ_LEN = 50  # 최대 시퀀스 길이 (Transformer 입력)
MAX_TRAIN_PER_USER = 50  # 유저당 최대 train 샘플

train_data = []
val_data = []
test_data = []

for user_id, seq in user_sequences.items():
    if len(seq) < MIN_SEQ_LEN:
        continue
    
    # Test: 마지막 영화 예측
    test_seq = seq[:-1][-MAX_SEQ_LEN:]
    test_target = seq[-1]
    test_data.append((user_id, test_seq, test_target))
    
    # Val: 마지막-1 영화 예측
    if len(seq) >= MIN_SEQ_LEN + 1:
        val_seq = seq[:-2][-MAX_SEQ_LEN:]
        val_target = seq[-2]
        val_data.append((user_id, val_seq, val_target))
    
    # Train: 유저당 최대 MAX_TRAIN_PER_USER개만 샘플링
    train_positions = list(range(MIN_SEQ_LEN - 1, len(seq) - 2))
    if len(train_positions) > MAX_TRAIN_PER_USER:
        # 균등하게 샘플링
        step = len(train_positions) // MAX_TRAIN_PER_USER
        train_positions = train_positions[::step][:MAX_TRAIN_PER_USER]
    
    for i in train_positions:
        train_seq = seq[max(0, i - MAX_SEQ_LEN + 1):i + 1]
        train_target = seq[i + 1]
        train_data.append((user_id, train_seq, train_target))

print(f"Train samples: {len(train_data):,}")
print(f"Val samples: {len(val_data):,}")
print(f"Test samples: {len(test_data):,}")

## 3. Dataset & DataLoader

In [44]:
class SeqDataset(Dataset):
    def __init__(self, data, movie_to_idx, embeddings, max_len=50):
        """
        data: list of (user_id, sequence, target_movie_id)
        """
        self.data = data
        self.movie_to_idx = movie_to_idx
        self.embeddings = torch.from_numpy(embeddings)
        self.max_len = max_len
        self.emb_dim = embeddings.shape[1]
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        user_id, seq, target = self.data[idx]
        
        # 시퀀스를 인덱스로 변환
        seq_idx = [self.movie_to_idx[mid] for mid in seq]
        target_idx = self.movie_to_idx[target]
        
        # 시퀀스 임베딩
        seq_emb = self.embeddings[seq_idx]  # (seq_len, emb_dim)
        target_emb = self.embeddings[target_idx]  # (emb_dim,)
        
        # 패딩 (앞에서부터)
        seq_len = len(seq_idx)
        if seq_len < self.max_len:
            pad = torch.zeros(self.max_len - seq_len, self.emb_dim)
            seq_emb = torch.cat([pad, seq_emb], dim=0)
            mask = torch.cat([torch.zeros(self.max_len - seq_len), torch.ones(seq_len)])
        else:
            seq_emb = seq_emb[-self.max_len:]
            mask = torch.ones(self.max_len)
        
        return {
            'seq_emb': seq_emb,  # (max_len, emb_dim)
            'mask': mask,  # (max_len,)
            'target_emb': target_emb,  # (emb_dim,)
            'target_idx': target_idx,
        }

In [45]:
# 데이터셋 생성
BATCH_SIZE = 256

train_dataset = SeqDataset(train_data, movie_to_idx, embeddings_norm, MAX_SEQ_LEN)
val_dataset = SeqDataset(val_data, movie_to_idx, embeddings_norm, MAX_SEQ_LEN)
test_dataset = SeqDataset(test_data, movie_to_idx, embeddings_norm, MAX_SEQ_LEN)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=0)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=0)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=0)

print(f"Train batches: {len(train_loader)}")
print(f"Val batches: {len(val_loader)}")
print(f"Test batches: {len(test_loader)}")

Train batches: 79
Val batches: 8
Test batches: 8


## 4. Transformer 모델 정의

In [46]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=100):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe.unsqueeze(0))  # (1, max_len, d_model)
    
    def forward(self, x):
        # x: (batch, seq_len, d_model)
        return x + self.pe[:, :x.size(1), :]

In [47]:
class SeqRecommender(nn.Module):
    def __init__(self, emb_dim=300, n_heads=6, n_layers=2, dim_feedforward=512, dropout=0.1, max_len=50):
        super().__init__()
        
        self.emb_dim = emb_dim
        
        # Input projection (optional, 임베딩 차원 조정)
        self.input_proj = nn.Linear(emb_dim, emb_dim)
        
        # Positional encoding
        self.pos_encoder = PositionalEncoding(emb_dim, max_len)
        
        # Transformer Encoder
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=emb_dim,
            nhead=n_heads,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            activation='gelu',
            batch_first=True,
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=n_layers)
        
        # Output projection
        self.output_proj = nn.Sequential(
            nn.Linear(emb_dim, emb_dim),
            nn.LayerNorm(emb_dim),
        )
        
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, seq_emb, mask=None):
        """
        seq_emb: (batch, seq_len, emb_dim)
        mask: (batch, seq_len) - 1 for valid, 0 for padding
        
        Returns: (batch, emb_dim) - predicted next embedding
        """
        # Input projection
        x = self.input_proj(seq_emb)
        x = self.dropout(x)
        
        # Positional encoding
        x = self.pos_encoder(x)
        
        # Create attention mask (True = ignore)
        if mask is not None:
            attn_mask = (mask == 0)  # (batch, seq_len)
        else:
            attn_mask = None
        
        # Transformer
        x = self.transformer(x, src_key_padding_mask=attn_mask)  # (batch, seq_len, emb_dim)
        
        # 마지막 유효한 위치의 출력 추출
        if mask is not None:
            # 각 배치에서 마지막 유효 위치 찾기
            seq_lens = mask.sum(dim=1).long()  # (batch,)
            batch_idx = torch.arange(x.size(0), device=x.device)
            last_idx = seq_lens - 1
            x = x[batch_idx, last_idx]  # (batch, emb_dim)
        else:
            x = x[:, -1, :]  # (batch, emb_dim)
        
        # Output projection
        x = self.output_proj(x)
        
        # L2 정규화 (cosine similarity를 위해)
        x = F.normalize(x, p=2, dim=-1)
        
        return x

In [48]:
# 모델 생성
model = SeqRecommender(
    emb_dim=emb_dim,
    n_heads=6,
    n_layers=2,
    dim_feedforward=512,
    dropout=0.1,
    max_len=MAX_SEQ_LEN,
).to(device)

print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")
print(model)

Model parameters: 1,522,024
SeqRecommender(
  (input_proj): Linear(in_features=300, out_features=300, bias=True)
  (pos_encoder): PositionalEncoding()
  (transformer): TransformerEncoder(
    (layers): ModuleList(
      (0-1): 2 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=300, out_features=300, bias=True)
        )
        (linear1): Linear(in_features=300, out_features=512, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=512, out_features=300, bias=True)
        (norm1): LayerNorm((300,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((300,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (output_proj): Sequential(
    (0): Linear(in_features=300, out_features=300, bias=True)
    (1): LayerNorm((300,), eps=1e-05, elementwise

## 5. 학습

In [49]:
class InfoNCELoss(nn.Module):
    """Contrastive loss with in-batch negatives"""
    def __init__(self, temperature=0.1):
        super().__init__()
        self.temperature = temperature
    
    def forward(self, pred_emb, target_emb):
        """
        pred_emb: (batch, emb_dim) - normalized
        target_emb: (batch, emb_dim) - normalized
        """
        # Similarity matrix (batch x batch)
        sim = torch.mm(pred_emb, target_emb.t()) / self.temperature  # (batch, batch)
        
        # Labels: diagonal (positive pairs)
        labels = torch.arange(sim.size(0), device=sim.device)
        
        # Cross entropy loss
        loss = F.cross_entropy(sim, labels)
        
        return loss

In [None]:
# Loss & Optimizer
criterion = InfoNCELoss(temperature=0.07)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4, weight_decay=0.01)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=10, eta_min=1e-6)  # T_max = N_EPOCHS

In [51]:
def evaluate(model, loader, embeddings_norm, device, k_list=[10, 20, 50]):
    """Hit@K, MRR 계산"""
    model.eval()
    
    all_emb = torch.from_numpy(embeddings_norm).to(device)  # (n_movies, emb_dim)
    
    hits = {k: 0 for k in k_list}
    mrr_sum = 0
    total = 0
    
    with torch.no_grad():
        for batch in loader:
            seq_emb = batch['seq_emb'].to(device)
            mask = batch['mask'].to(device)
            target_idx = batch['target_idx'].to(device)
            
            # 예측
            pred_emb = model(seq_emb, mask)  # (batch, emb_dim)
            
            # 모든 영화와의 유사도
            scores = torch.mm(pred_emb, all_emb.t())  # (batch, n_movies)
            
            # 랭킹
            _, indices = scores.sort(descending=True, dim=1)
            
            for i in range(len(target_idx)):
                target = target_idx[i].item()
                rank = (indices[i] == target).nonzero(as_tuple=True)[0].item() + 1
                
                for k in k_list:
                    if rank <= k:
                        hits[k] += 1
                
                mrr_sum += 1.0 / rank
                total += 1
    
    results = {f'Hit@{k}': hits[k] / total for k in k_list}
    results['MRR'] = mrr_sum / total
    
    return results

In [None]:
# 학습 루프
N_EPOCHS = 10  # 에폭 수 증가
best_mrr = 0
history = {'train_loss': [], 'val_loss': [], 'val_mrr': [], 'val_hit10': []}

for epoch in range(N_EPOCHS):
    # Train
    model.train()
    train_loss = 0
    
    pbar = tqdm(train_loader, desc=f'Epoch {epoch+1}/{N_EPOCHS}')
    for batch in pbar:
        seq_emb = batch['seq_emb'].to(device)
        mask = batch['mask'].to(device)
        target_emb = batch['target_emb'].to(device)
        
        optimizer.zero_grad()
        
        pred_emb = model(seq_emb, mask)
        loss = criterion(pred_emb, target_emb)
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        
        train_loss += loss.item()
        pbar.set_postfix(loss=f'{loss.item():.4f}')
    
    train_loss /= len(train_loader)
    scheduler.step()
    
    # Validation Loss 계산
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in val_loader:
            seq_emb = batch['seq_emb'].to(device)
            mask = batch['mask'].to(device)
            target_emb = batch['target_emb'].to(device)
            
            pred_emb = model(seq_emb, mask)
            loss = criterion(pred_emb, target_emb)
            val_loss += loss.item()
    val_loss /= len(val_loader)
    
    # Validation Metrics
    val_results = evaluate(model, val_loader, embeddings_norm, device)
    
    history['train_loss'].append(train_loss)
    history['val_loss'].append(val_loss)
    history['val_mrr'].append(val_results['MRR'])
    history['val_hit10'].append(val_results['Hit@10'])
    
    print(f"Epoch {epoch+1}: Train Loss={train_loss:.4f}, Val Loss={val_loss:.4f}, "
          f"Val MRR={val_results['MRR']:.4f}, Hit@10={val_results['Hit@10']:.4f}")
    
    # Best model 저장
    if val_results['MRR'] > best_mrr:
        best_mrr = val_results['MRR']
        torch.save(model.state_dict(), PROCESSED.DIR / 'seq_recommender_best.pt')
        print(f"  -> Best model saved!")

In [None]:
# 학습 곡선 시각화
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

# 1. Loss (Train & Validation)
axes[0].plot(range(1, len(history['train_loss'])+1), history['train_loss'], marker='o', label='Train Loss', color='blue')
axes[0].plot(range(1, len(history['val_loss'])+1), history['val_loss'], marker='s', label='Val Loss', color='red')
axes[0].set_xlabel('Epoch')
axes[0].set_ylabel('Loss')
axes[0].set_title('Training & Validation Loss')
axes[0].legend()
axes[0].grid(True, alpha=0.3)

# 2. MRR
axes[1].plot(range(1, len(history['val_mrr'])+1), history['val_mrr'], marker='o', color='green')
axes[1].set_xlabel('Epoch')
axes[1].set_ylabel('MRR')
axes[1].set_title('Validation MRR')
axes[1].grid(True, alpha=0.3)

# 3. Hit@10
axes[2].plot(range(1, len(history['val_hit10'])+1), history['val_hit10'], marker='s', color='purple')
axes[2].set_xlabel('Epoch')
axes[2].set_ylabel('Hit@10')
axes[2].set_title('Validation Hit@10')
axes[2].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# 최종 결과 출력
print("\n[학습 결과 요약]")
print(f"  최종 Train Loss: {history['train_loss'][-1]:.4f}")
print(f"  최종 Val Loss:   {history['val_loss'][-1]:.4f}")
print(f"  최종 Val MRR:    {history['val_mrr'][-1]:.4f}")
print(f"  최종 Val Hit@10: {history['val_hit10'][-1]:.4f}")

## 6. 테스트 평가

In [None]:
# Best model 로드
model.load_state_dict(torch.load(PROCESSED.DIR / 'seq_recommender_best.pt', weights_only=True))

# Test 평가
test_results = evaluate(model, test_loader, embeddings_norm, device, k_list=[1, 5, 10, 20, 50, 100])

print("\n" + "="*50)
print("Test Results")
print("="*50)
for metric, value in test_results.items():
    print(f"  {metric}: {value:.4f}")

## 7. Baseline 비교

In [None]:
def evaluate_baseline(loader, embeddings_norm, device, method='mean'):
    """Baseline: 시퀀스 임베딩의 단순 평균/마지막"""
    all_emb = torch.from_numpy(embeddings_norm).to(device)
    
    hits = {k: 0 for k in [1, 5, 10, 20, 50, 100]}
    mrr_sum = 0
    total = 0
    
    with torch.no_grad():
        for batch in loader:
            seq_emb = batch['seq_emb'].to(device)  # (batch, seq_len, emb_dim)
            mask = batch['mask'].to(device)
            target_idx = batch['target_idx'].to(device)
            
            if method == 'mean':
                # Masked mean
                mask_expanded = mask.unsqueeze(-1)  # (batch, seq_len, 1)
                pred_emb = (seq_emb * mask_expanded).sum(dim=1) / mask.sum(dim=1, keepdim=True)
            elif method == 'last':
                # 마지막 유효 위치
                seq_lens = mask.sum(dim=1).long()
                batch_idx = torch.arange(seq_emb.size(0), device=device)
                pred_emb = seq_emb[batch_idx, seq_lens - 1]
            
            pred_emb = F.normalize(pred_emb, p=2, dim=-1)
            
            scores = torch.mm(pred_emb, all_emb.t())
            _, indices = scores.sort(descending=True, dim=1)
            
            for i in range(len(target_idx)):
                target = target_idx[i].item()
                rank = (indices[i] == target).nonzero(as_tuple=True)[0].item() + 1
                
                for k in hits:
                    if rank <= k:
                        hits[k] += 1
                mrr_sum += 1.0 / rank
                total += 1
    
    results = {f'Hit@{k}': hits[k] / total for k in hits}
    results['MRR'] = mrr_sum / total
    return results

In [None]:
# Baseline 평가
print("Baseline 평가 중...")
baseline_mean = evaluate_baseline(test_loader, embeddings_norm, device, method='mean')
baseline_last = evaluate_baseline(test_loader, embeddings_norm, device, method='last')

print("\n" + "="*70)
print("Baseline vs Transformer 비교")
print("="*70)
print(f"{'Method':<20} {'MRR':>10} {'Hit@10':>10} {'Hit@20':>10} {'Hit@50':>10}")
print("-"*70)
print(f"{'Mean Pooling':<20} {baseline_mean['MRR']:>10.4f} {baseline_mean['Hit@10']:>10.4f} {baseline_mean['Hit@20']:>10.4f} {baseline_mean['Hit@50']:>10.4f}")
print(f"{'Last Item':<20} {baseline_last['MRR']:>10.4f} {baseline_last['Hit@10']:>10.4f} {baseline_last['Hit@20']:>10.4f} {baseline_last['Hit@50']:>10.4f}")
print(f"{'Transformer':<20} {test_results['MRR']:>10.4f} {test_results['Hit@10']:>10.4f} {test_results['Hit@20']:>10.4f} {test_results['Hit@50']:>10.4f}")
print("="*70)

In [None]:
# 비교 시각화
methods = ['Mean Pooling', 'Last Item', 'Transformer']
metrics = ['MRR', 'Hit@10', 'Hit@20', 'Hit@50']
data = {
    'Mean Pooling': baseline_mean,
    'Last Item': baseline_last,
    'Transformer': test_results,
}

fig, ax = plt.subplots(figsize=(10, 6))
x = np.arange(len(metrics))
width = 0.25

for i, method in enumerate(methods):
    vals = [data[method][m] for m in metrics]
    bars = ax.bar(x + i*width, vals, width, label=method)
    for bar, val in zip(bars, vals):
        ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.005, 
                f'{val:.3f}', ha='center', va='bottom', fontsize=9)

ax.set_ylabel('Score')
ax.set_title('추천 성능 비교: Baseline vs Transformer')
ax.set_xticks(x + width)
ax.set_xticklabels(metrics)
ax.legend()
ax.grid(axis='y', alpha=0.3)
ax.set_ylim(0, max(max(data[m][metric] for m in methods for metric in metrics) * 1.2, 0.5))

plt.tight_layout()
plt.show()

## 8. 추천 예시

In [None]:
# 카탈로그 로드 (영화 제목 및 피처 정보용)
catalog = pd.read_parquet(PROCESSED.MOVIE_CATALOG_PARQUET)
movie_titles = catalog.set_index('movieId')['original_title'].to_dict()

# 번역 라이브러리 로드 시도
try:
    from deep_translator import GoogleTranslator
    translator = GoogleTranslator(source='en', target='ko')
    HAS_TRANSLATOR = True
    print("번역 기능 활성화 (deep_translator)")
except ImportError:
    try:
        from googletrans import Translator
        translator = Translator()
        HAS_TRANSLATOR = True
        print("번역 기능 활성화 (googletrans)")
    except ImportError:
        HAS_TRANSLATOR = False
        print("번역 라이브러리 없음 - pip install deep-translator 또는 pip install googletrans==4.0.0-rc1")

def translate_text(text):
    """텍스트를 한국어로 번역"""
    if not HAS_TRANSLATOR or pd.isna(text) or text == '':
        return None
    try:
        if hasattr(translator, 'translate') and callable(getattr(translator, 'translate')):
            result = translator.translate(text)
            # googletrans는 객체 반환, deep_translator는 문자열 반환
            return result.text if hasattr(result, 'text') else result
    except Exception as e:
        return None
    return None

# 피처 컬럼 정의
FEATURE_COLS = ['original_title', 'release_year', 'genre_1', 'genre_2', 'director_1', 
                'actor_1', 'actor_2', 'produced_by_company_1', 'tagline', 'overview']

def get_movie_features(movie_id):
    """영화의 주요 피처 정보 반환"""
    row = catalog[catalog['movieId'] == movie_id]
    if len(row) == 0:
        return None
    return row[FEATURE_COLS].iloc[0].to_dict()

def print_movie_features(movie_id, prefix="", show_translation=True):
    """영화 피처 정보 출력 (tagline, overview 포함)"""
    features = get_movie_features(movie_id)
    if features is None:
        print(f"{prefix}영화 정보 없음 (ID: {movie_id})")
        return
    
    print(f"{prefix}{features['original_title']} ({features['release_year']})")
    print(f"{prefix}  장르: {features['genre_1']}, {features['genre_2']}")
    print(f"{prefix}  감독: {features['director_1']}")
    print(f"{prefix}  배우: {features['actor_1']}, {features['actor_2']}")
    print(f"{prefix}  제작사: {features['produced_by_company_1']}")
    
    # Tagline
    tagline = features.get('tagline', '')
    if pd.notna(tagline) and tagline:
        print(f"{prefix}  태그라인: {tagline}")
        if show_translation and HAS_TRANSLATOR:
            tagline_ko = translate_text(tagline)
            if tagline_ko:
                print(f"{prefix}  태그라인(한국어): {tagline_ko}")
    
    # Overview (길면 200자로 자름)
    overview = features.get('overview', '')
    if pd.notna(overview) and overview:
        overview_short = overview[:200] + "..." if len(overview) > 200 else overview
        print(f"{prefix}  줄거리: {overview_short}")
        if show_translation and HAS_TRANSLATOR:
            overview_ko = translate_text(overview_short)
            if overview_ko:
                print(f"{prefix}  줄거리(한국어): {overview_ko}")

In [None]:
def recommend_for_user(model, user_seq, embeddings_norm, movie_to_idx, idx_to_movie, movie_titles, top_k=10, device='cpu'):
    """유저 시퀀스 기반 추천"""
    model.eval()
    
    # 시퀀스를 인덱스로 변환
    seq_idx = [movie_to_idx[mid] for mid in user_seq if mid in movie_to_idx]
    if len(seq_idx) == 0:
        return []
    
    # 임베딩
    seq_emb = torch.from_numpy(embeddings_norm[seq_idx]).unsqueeze(0).to(device)  # (1, seq_len, emb_dim)
    mask = torch.ones(1, len(seq_idx)).to(device)
    
    # 예측
    with torch.no_grad():
        pred_emb = model(seq_emb, mask)  # (1, emb_dim)
    
    # 모든 영화와 유사도
    all_emb = torch.from_numpy(embeddings_norm).to(device)
    scores = torch.mm(pred_emb, all_emb.t()).squeeze()  # (n_movies,)
    
    # 이미 본 영화 제외
    seen_idx = set(seq_idx)
    for idx in seen_idx:
        scores[idx] = -float('inf')
    
    # Top-K
    _, top_indices = scores.topk(top_k)
    
    recommendations = []
    for idx in top_indices.cpu().numpy():
        mid = idx_to_movie[idx]
        title = movie_titles.get(mid, f'Unknown ({mid})')
        score = scores[idx].item()
        recommendations.append({'movieId': mid, 'title': title, 'score': score})
    
    return recommendations

In [None]:
# 테스트 유저 예시 - 피처 정보 포함
test_user_id, test_seq, test_target = test_data[0]

print("="*100)
print(f"유저 {test_user_id}의 시청 이력 (최근 5개)")
print("="*100)
for i, mid in enumerate(test_seq[-5:]):
    print(f"\n[{i+1}]", end=" ")
    print_movie_features(mid)

print("\n" + "="*100)
print("실제 다음 영화 (정답)")
print("="*100)
print_movie_features(test_target)

print("\n" + "="*100)
print("모델 추천 Top-10")
print("="*100)
recs = recommend_for_user(model, test_seq, embeddings_norm, movie_to_idx, idx_to_movie, movie_titles, top_k=10, device=device)
for i, rec in enumerate(recs):
    marker = "⭐ HIT!" if rec['movieId'] == test_target else ""
    print(f"\n[{i+1}] (score={rec['score']:.4f}) {marker}")
    print_movie_features(rec['movieId'], prefix="  ")

In [None]:
# 추천 영화와 시청 영화 간 코사인 유사도 분석
print("="*100)
print("추천 Top-10과 시청 이력 간 코사인 유사도")
print("="*100)

# 시청한 영화 임베딩
watched_ids = test_seq[-10:]  # 최근 10개
watched_idx = [movie_to_idx[mid] for mid in watched_ids if mid in movie_to_idx]
watched_emb = embeddings_norm[watched_idx]

# 추천된 영화 임베딩
rec_ids = [rec['movieId'] for rec in recs]
rec_idx = [movie_to_idx[mid] for mid in rec_ids]
rec_emb = embeddings_norm[rec_idx]

# 유사도 행렬 계산 (추천 x 시청)
sim_matrix = np.dot(rec_emb, watched_emb.T)  # (10, 10)

# 추천 영화별 시청 영화와의 평균/최대 유사도
print("\n추천 영화별 시청 이력과의 유사도:")
print("-"*100)
print(f"{'순위':<4} {'추천 영화':<35} {'평균 유사도':>12} {'최대 유사도':>12} {'가장 유사한 시청 영화':<30}")
print("-"*100)

for i, rec in enumerate(recs):
    rec_title = movie_titles.get(rec['movieId'], 'Unknown')[:32]
    avg_sim = sim_matrix[i].mean()
    max_sim = sim_matrix[i].max()
    most_similar_idx = sim_matrix[i].argmax()
    most_similar_movie = movie_titles.get(watched_ids[most_similar_idx], 'Unknown')[:28]
    
    print(f"{i+1:<4} {rec_title:<35} {avg_sim:>12.4f} {max_sim:>12.4f} {most_similar_movie:<30}")

# 전체 요약
print("-"*100)
print(f"{'전체 평균':<40} {sim_matrix.mean():>12.4f}")

# 히트맵 시각화
fig, ax = plt.subplots(figsize=(12, 8))
im = ax.imshow(sim_matrix, cmap='RdYlGn', vmin=0, vmax=1)

# 축 레이블
rec_titles = [movie_titles.get(mid, '?')[:20] for mid in rec_ids]
watched_titles = [movie_titles.get(mid, '?')[:20] for mid in watched_ids]

ax.set_xticks(range(len(watched_titles)))
ax.set_yticks(range(len(rec_titles)))
ax.set_xticklabels(watched_titles, rotation=45, ha='right', fontsize=9)
ax.set_yticklabels(rec_titles, fontsize=9)

ax.set_xlabel('시청한 영화 (최근 10개)', fontweight='bold')
ax.set_ylabel('추천된 영화 (Top-10)', fontweight='bold')
ax.set_title('추천 영화 vs 시청 영화 코사인 유사도', fontweight='bold', fontsize=12)

# 값 표시
for i in range(len(rec_titles)):
    for j in range(len(watched_titles)):
        val = sim_matrix[i, j]
        color = 'white' if val < 0.5 else 'black'
        ax.text(j, i, f'{val:.2f}', ha='center', va='center', color=color, fontsize=8)

plt.colorbar(im, ax=ax, shrink=0.8)
plt.tight_layout()
plt.show()

In [None]:
# 추천 영화와 시청 영화 간 공통 피처 분석
print("="*100)
print("추천 영화와 시청 영화 간 공통 피처 분석")
print("="*100)

# 시청 영화의 피처 수집
watched_features = {
    'genres': set(),
    'directors': set(),
    'actors': set(),
    'companies': set()
}

for mid in test_seq[-10:]:
    f = get_movie_features(mid)
    if f:
        if pd.notna(f['genre_1']): watched_features['genres'].add(f['genre_1'])
        if pd.notna(f.get('genre_2')): watched_features['genres'].add(f['genre_2'])
        if pd.notna(f['director_1']): watched_features['directors'].add(f['director_1'])
        if pd.notna(f['actor_1']): watched_features['actors'].add(f['actor_1'])
        if pd.notna(f.get('actor_2')): watched_features['actors'].add(f['actor_2'])
        if pd.notna(f['produced_by_company_1']): watched_features['companies'].add(f['produced_by_company_1'])

print(f"\n시청 영화에서 추출된 피처:")
print(f"  장르: {watched_features['genres']}")
print(f"  감독: {watched_features['directors']}")
print(f"  배우 (일부): {list(watched_features['actors'])[:5]}...")
print(f"  제작사: {watched_features['companies']}")

# 각 추천 영화의 공통 피처 분석
print(f"\n추천 영화별 공통 피처:")
print("-"*100)
print(f"{'순위':<4} {'영화':<30} {'공통장르':<15} {'공통감독':<15} {'공통배우':<20} {'공통제작사':<15}")
print("-"*100)

for i, rec in enumerate(recs):
    f = get_movie_features(rec['movieId'])
    if f is None:
        continue
    
    title = f['original_title'][:28]
    
    # 공통 피처 찾기
    common_genres = []
    if pd.notna(f['genre_1']) and f['genre_1'] in watched_features['genres']:
        common_genres.append(f['genre_1'])
    if pd.notna(f.get('genre_2')) and f.get('genre_2') in watched_features['genres']:
        common_genres.append(f['genre_2'])
    
    common_director = f['director_1'] if pd.notna(f['director_1']) and f['director_1'] in watched_features['directors'] else '-'
    
    common_actors = []
    if pd.notna(f['actor_1']) and f['actor_1'] in watched_features['actors']:
        common_actors.append(f['actor_1'])
    if pd.notna(f.get('actor_2')) and f.get('actor_2') in watched_features['actors']:
        common_actors.append(f['actor_2'])
    
    common_company = f['produced_by_company_1'] if pd.notna(f['produced_by_company_1']) and f['produced_by_company_1'] in watched_features['companies'] else '-'
    
    genres_str = ','.join(common_genres)[:13] if common_genres else '-'
    actors_str = ','.join(common_actors)[:18] if common_actors else '-'
    director_str = common_director[:13] if common_director != '-' else '-'
    company_str = common_company[:13] if common_company != '-' else '-'
    
    print(f"{i+1:<4} {title:<30} {genres_str:<15} {director_str:<15} {actors_str:<20} {company_str:<15}")

print("-"*100)

In [None]:
# 여러 유저 예시
print("="*80)
print("여러 유저 추천 결과")
print("="*80)

for i in [0, 100, 500, 1000]:
    if i >= len(test_data):
        continue
    user_id, seq, target = test_data[i]
    recs = recommend_for_user(model, seq, embeddings_norm, movie_to_idx, idx_to_movie, movie_titles, top_k=5, device=device)
    
    print(f"\n[유저 {user_id}]")
    print(f"최근 시청: {', '.join([movie_titles.get(m, '?')[:20] for m in seq[-3:]])}")
    print(f"실제 다음: {movie_titles.get(target, 'Unknown')}")
    print(f"추천 Top-5:")
    for j, rec in enumerate(recs):
        hit = "*" if rec['movieId'] == target else " "
        print(f"  {hit} {j+1}. {rec['title'][:40]}")

## 9. 모델 저장

In [None]:
# 최종 모델 저장
save_path = PROCESSED.DIR / 'seq_recommender_final.pt'
torch.save({
    'model_state_dict': model.state_dict(),
    'config': {
        'emb_dim': emb_dim,
        'n_heads': 6,
        'n_layers': 2,
        'dim_feedforward': 512,
        'max_len': MAX_SEQ_LEN,
    },
    'test_results': test_results,
}, save_path)

print(f"모델 저장: {save_path}")
print(f"\n최종 Test 성능:")
for k, v in test_results.items():
    print(f"  {k}: {v:.4f}")