In [None]:
# -*- coding: utf-8 -*-
import os
import json
import random
import numpy as np
import pandas as pd
from pathlib import Path

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

# ==================== 0. Reproducibility & device ====================
SEED = 42
# Seed every RNG the pipeline touches: Python, NumPy, PyTorch (CPU and all CUDA devices).
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
# Request deterministic cuDNN kernels and turn off the cuDNN autotuner.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"[INFO] Device: {device}")

# ==================== 1. Paths / settings ====================
data_path = '/content/drive/MyDrive/review_business_5up_with_text.json'  # <- adjust to your file location
sbert_model_name = 'all-MiniLM-L6-v2'
embed_dim = 64          # width of the user / item embedding tables
mlp_dims = (128, 64)    # hidden widths of the MLP path
dropout_p = 0.2
lr = 1e-3
weight_decay = 1e-5
batch_size = 256
epochs = 50
patience = 5            # early stopping: epochs tolerated without improvement
min_delta = 1e-3        # early stopping: minimum validation-RMSE gain that counts
model_path = 'best_ucam_ncf_context.pt'
cache_dir = Path('./cache')
cache_dir.mkdir(parents=True, exist_ok=True)
# Model names may contain path separators (e.g. "org/model"); flatten them so
# the cache file always lives directly inside cache_dir.
_cache_model_tag = sbert_model_name.replace('/', '_').replace('\\', '_')
sbert_cache_path = cache_dir / f"{Path(data_path).stem}_{_cache_model_tag}_embeddings.npy"

# ==================== 2. Utility functions ====================
def mean_absolute_percentage_error(y_true, y_pred, eps=1e-10):
    """Return the mean absolute percentage error, expressed in percent.

    Both inputs are converted to float32 arrays. The magnitude of each true
    value is floored at ``eps`` before dividing, so zero targets do not
    cause a division by zero.
    """
    actual = np.asarray(y_true, dtype=np.float32)
    predicted = np.asarray(y_pred, dtype=np.float32)
    safe_magnitude = np.clip(np.abs(actual), a_min=eps, a_max=None)
    relative_error = np.abs((actual - predicted) / safe_magnitude)
    return np.mean(relative_error) * 100.0

# ==================== 3. Data loading ====================
# The file is parsed as JSON Lines: one JSON object per line.
with open(data_path, 'r', encoding='utf-8') as f:
    # Skip blank lines (e.g. a trailing newline) instead of failing in json.loads.
    rows = [json.loads(line) for line in f if line.strip()]
df = pd.DataFrame(rows)
df = df[['user_id', 'business_id', 'stars', 'text']].dropna().reset_index(drop=True)

# ID mapping: raw ids -> contiguous integer indices, in order of first appearance
user2idx = {uid: i for i, uid in enumerate(df['user_id'].unique())}
item2idx = {iid: i for i, iid in enumerate(df['business_id'].unique())}
df['user'] = df['user_id'].map(user2idx)
df['item'] = df['business_id'].map(item2idx)

# ==================== 4. SBERT context embeddings (with cache) ====================
context_vectors = None
if sbert_cache_path.exists():
    print(f"[INFO] Load cached SBERT embeddings from {sbert_cache_path}")
    cached_vectors = np.load(sbert_cache_path)
    # A cache written for a different version of the dataset has a different
    # row count; using it would fail (or misalign) when attached to df below.
    if cached_vectors.ndim == 2 and cached_vectors.shape[0] == len(df):
        context_vectors = cached_vectors
    else:
        print(f"[WARN] Cached embeddings shape {cached_vectors.shape} does not match "
              f"{len(df)} rows; re-encoding.")

if context_vectors is None:
    sbert_device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print(f"[INFO] Loading SBERT ({sbert_model_name}) on {sbert_device}")
    sbert = SentenceTransformer(sbert_model_name, device=sbert_device)
    print("[INFO] Encoding texts with SBERT...")
    context_vectors = sbert.encode(
        df['text'].tolist(),
        batch_size=256,
        convert_to_numpy=True,
        show_progress_bar=True
    )
    # Persist the embeddings so later runs can skip the encoding step
    np.save(sbert_cache_path, context_vectors)
    print(f"[INFO] Saved embeddings to {sbert_cache_path}")

# One float32 vector per row, stored as an object column so it travels with the splits
df['context_vector'] = list(context_vectors.astype(np.float32))
context_dim = context_vectors.shape[1]  # usually 384

# ==================== 5. Dataset / DataLoader ====================
class UCAMDataset(Dataset):
    """Map-style dataset of (user index, item index, context vector, rating).

    Args:
        users: sequence of integer user indices.
        items: sequence of integer item indices.
        ratings: sequence of numeric ratings.
        contexts: sequence of equal-length numeric vectors, one per sample.

    Raises:
        ValueError: if the four inputs do not contain the same number of samples.
    """

    def __init__(self, users, items, ratings, contexts):
        self.users = np.asarray(users, dtype=np.int64)
        self.items = np.asarray(items, dtype=np.int64)
        self.ratings = np.asarray(ratings, dtype=np.float32)
        # list(...) lets NumPy stack a sequence of per-sample vectors
        # (e.g. an object array of arrays) into one float32 block.
        self.contexts = np.asarray(list(contexts), dtype=np.float32)

        # Fail fast: __len__ follows ratings, so a shorter or longer sibling array
        # would otherwise surface as an IndexError mid-epoch or as ignored rows.
        sizes = {
            'users': len(self.users),
            'items': len(self.items),
            'ratings': len(self.ratings),
            'contexts': len(self.contexts),
        }
        if len(set(sizes.values())) != 1:
            raise ValueError(f"UCAMDataset inputs must have equal lengths, got {sizes}")

    def __len__(self):
        return len(self.ratings)

    def __getitem__(self, idx):
        """Return (user, item, context, rating) tensors for sample ``idx``."""
        return (
            torch.tensor(self.users[idx], dtype=torch.long),
            torch.tensor(self.items[idx], dtype=torch.long),
            torch.tensor(self.contexts[idx], dtype=torch.float32),
            torch.tensor(self.ratings[idx], dtype=torch.float32),
        )

# 80/10/10 split: hold out 10% for test, then carve validation out of the remainder
train_val_df, test_df = train_test_split(df, test_size=0.1, random_state=SEED)
train_df, val_df = train_test_split(train_val_df, test_size=0.1111, random_state=SEED)  # 0.9*0.1111≈0.1


def _frame_to_dataset(frame):
    """Wrap the model-input columns of one split in a UCAMDataset."""
    return UCAMDataset(
        frame['user'].values,
        frame['item'].values,
        frame['stars'].values,
        frame['context_vector'].values,
    )


train_dataset = _frame_to_dataset(train_df)
val_dataset = _frame_to_dataset(val_df)
test_dataset = _frame_to_dataset(test_df)

# Worker processes and pinned host memory are only enabled when CUDA is available
num_workers = 2 if torch.cuda.is_available() else 0
pin_memory = torch.cuda.is_available()

# Options shared by all three loaders; only the training loader shuffles
_shared_loader_kwargs = dict(
    batch_size=batch_size,
    num_workers=num_workers,
    pin_memory=pin_memory,
    persistent_workers=num_workers > 0,
)
train_loader = DataLoader(train_dataset, shuffle=True, **_shared_loader_kwargs)
val_loader = DataLoader(val_dataset, shuffle=False, **_shared_loader_kwargs)
test_loader = DataLoader(test_dataset, shuffle=False, **_shared_loader_kwargs)

# ==================== 6. Model: NCF + Context ====================
class NCF_WithContext(nn.Module):
    """Neural collaborative filtering model fused with a context vector.

    - GMF path: element-wise product of the user and item embeddings.
    - MLP path: concat(user, item) passed through Linear -> ReLU -> Dropout layers.
    - Fusion: concat(GMF, MLP, context_vec) -> predictor MLP -> scalar rating.

    Args:
        num_users: number of rows in the user embedding table.
        num_items: number of rows in the item embedding table.
        context_dim: width of the context vector given to ``forward``.
        embed_dim: width of the user / item embeddings.
        mlp_dims: hidden widths of the MLP path; may be empty, in which case
            the MLP path passes concat(user, item) through unchanged.
        dropout_p: dropout probability used in the MLP path and the predictor.
    """

    def __init__(self, num_users, num_items, context_dim, embed_dim=64, mlp_dims=(128, 64), dropout_p=0.2):
        super().__init__()
        self.user_embed = nn.Embedding(num_users, embed_dim)
        self.item_embed = nn.Embedding(num_items, embed_dim)

        mlp_layers = []
        in_dim = embed_dim * 2
        for d in mlp_dims:
            mlp_layers += [nn.Linear(in_dim, d), nn.ReLU(), nn.Dropout(dropout_p)]
            in_dim = d
        self.mlp = nn.Sequential(*mlp_layers)

        # After the loop in_dim is the MLP path's output width. Using it instead
        # of mlp_dims[-1] keeps an empty mlp_dims from raising IndexError.
        final_in = embed_dim + in_dim + context_dim
        self.predictor = nn.Sequential(
            nn.Linear(final_in, 64),
            nn.ReLU(),
            nn.Dropout(dropout_p),
            nn.Linear(64, 1)
        )

        # (optional) small-variance initialisation of the embedding tables
        nn.init.normal_(self.user_embed.weight, std=0.01)
        nn.init.normal_(self.item_embed.weight, std=0.01)

    def forward(self, user_ids, item_ids, context_vecs):
        """Predict one rating per (user, item, context) triple; returns shape [B]."""
        u = self.user_embed(user_ids)                 # [B, E]
        i = self.item_embed(item_ids)                 # [B, E]
        gmf = u * i                                   # [B, E]
        mlp = self.mlp(torch.cat([u, i], dim=-1))     # [B, D]
        x = torch.cat([gmf, mlp, context_vecs], dim=-1)
        # Drop the trailing size-1 output dimension: [B, 1] -> [B]
        out = self.predictor(x).squeeze(-1)
        return out

# ==================== 7. Training setup ====================
num_users, num_items = len(user2idx), len(item2idx)
model = NCF_WithContext(
    num_users,
    num_items,
    context_dim=context_dim,
    embed_dim=embed_dim,
    mlp_dims=mlp_dims,
    dropout_p=dropout_p,
).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
criterion = nn.MSELoss()

# Early-stopping state
best_val_rmse = float('inf')
epochs_no_improve = 0

# ==================== 8. Training loop (early stopping on validation) ====================
print("[INFO] Start training...")
for epoch in range(1, epochs + 1):
    # ----- training pass -----
    model.train()
    total_train_loss = 0.0
    for batch in train_loader:
        user_ids, item_ids, contexts, ratings = (t.to(device) for t in batch)

        optimizer.zero_grad()
        preds = model(user_ids, item_ids, contexts)
        loss = criterion(preds, ratings)
        loss.backward()
        optimizer.step()
        total_train_loss += loss.item()

    # ----- validation pass -----
    model.eval()
    val_preds, val_true = [], []
    with torch.no_grad():
        for batch in val_loader:
            user_ids, item_ids, contexts, ratings = (t.to(device) for t in batch)
            out = model(user_ids, item_ids, contexts)
            val_preds.extend(out.detach().cpu().tolist())
            val_true.extend(ratings.detach().cpu().tolist())

    val_preds = np.asarray(val_preds, dtype=np.float32)
    val_true = np.asarray(val_true, dtype=np.float32)
    val_rmse = float(np.sqrt(mean_squared_error(val_true, val_preds)))
    val_mae = float(mean_absolute_error(val_true, val_preds))
    val_mape = float(mean_absolute_percentage_error(val_true, val_preds))

    print(f"Epoch {epoch:02d} | TrainLoss {total_train_loss/len(train_loader):.4f} | "
          f"Val RMSE {val_rmse:.4f} | MAE {val_mae:.4f} | MAPE {val_mape:.2f}%")

    # An epoch only counts as an improvement when RMSE drops by more than min_delta
    if not (val_rmse < best_val_rmse - min_delta):
        epochs_no_improve += 1
        print(f"  --> No improvement ({epochs_no_improve}/{patience})")
        if epochs_no_improve >= patience:
            print("[INFO] Early stopping.")
            break
        continue

    # New best: reset the patience counter and checkpoint the weights
    best_val_rmse = val_rmse
    epochs_no_improve = 0
    torch.save(model.state_dict(), model_path)
    print(f"  --> Improved. Save model (RMSE: {best_val_rmse:.4f})")

# ==================== 9. Test evaluation ====================
def evaluate_model(model, data_loader, device, clamp_to_star=True):
    """Evaluate ``model`` on ``data_loader``, print the metrics and return them.

    Args:
        model: module called as ``model(users, items, contexts)``.
        data_loader: iterable yielding ``(users, items, contexts, ratings)`` batches.
        device: device the input tensors are moved to before the forward pass.
        clamp_to_star: when true, clip predictions to the 1.0-5.0 star range
            before computing the metrics.

    Returns:
        dict with float entries ``'mse'``, ``'rmse'``, ``'mae'`` and ``'mape'``.

    Raises:
        ValueError: if ``data_loader`` yields no batches.
    """
    model.eval()
    pred_batches, target_batches = [], []
    with torch.no_grad():
        for users, items, contexts, ratings in data_loader:
            users = users.to(device)
            items = items.to(device)
            contexts = contexts.to(device)

            out = model(users, items, contexts)
            if clamp_to_star:
                out = torch.clamp(out, 1.0, 5.0)  # keep predictions inside the star range
            # Collect whole batches; one concatenate is cheaper than extending
            # Python lists element by element.
            pred_batches.append(out.detach().cpu().numpy())
            target_batches.append(ratings.detach().cpu().numpy())

    if not pred_batches:
        raise ValueError("data_loader yielded no batches; nothing to evaluate")

    preds = np.concatenate(pred_batches).astype(np.float32, copy=False)
    targets = np.concatenate(target_batches).astype(np.float32, copy=False)

    mse  = float(mean_squared_error(targets, preds))
    rmse = float(np.sqrt(mse))
    mae  = float(mean_absolute_error(targets, preds))
    mape = float(mean_absolute_percentage_error(targets, preds))

    print("\n✅ [UCAM: NCF+Context] 최종 테스트 지표")
    print(f"   - MSE  : {mse:.4f}")
    print(f"   - RMSE : {rmse:.4f}")
    print(f"   - MAE  : {mae:.4f}")
    print(f"   - MAPE : {mape:.2f}%")

    # Returned as well as printed, so callers can log or compare the numbers
    return {'mse': mse, 'rmse': rmse, 'mae': mae, 'mape': mape}

# Restore the best checkpoint, if one exists on disk, before scoring the test split
checkpoint_available = os.path.exists(model_path)
if checkpoint_available:
    best_state = torch.load(model_path, map_location=device)
    model.load_state_dict(best_state)
    print(f"[INFO] Loaded best model: {model_path}")

evaluate_model(model, test_loader, device, clamp_to_star=True)


[INFO] Device: cuda
[INFO] Loading SBERT (all-MiniLM-L6-v2) on cuda
[INFO] Encoding texts with SBERT...


Batches:   0%|          | 0/1750 [00:00<?, ?it/s]

[INFO] Saved embeddings to cache/review_business_5up_with_text_all-MiniLM-L6-v2_embeddings.npy
[INFO] Start training...
Epoch 01 | TrainLoss 1.1472 | Val RMSE 0.8023 | MAE 0.6345 | MAPE 22.62%
  --> Improved. Save model (RMSE: 0.8023)
Epoch 02 | TrainLoss 0.7103 | Val RMSE 0.7830 | MAE 0.6141 | MAPE 22.32%
  --> Improved. Save model (RMSE: 0.7830)
Epoch 03 | TrainLoss 0.6517 | Val RMSE 0.7666 | MAE 0.5975 | MAPE 21.70%
  --> Improved. Save model (RMSE: 0.7666)
Epoch 04 | TrainLoss 0.5923 | Val RMSE 0.7624 | MAE 0.5957 | MAPE 21.05%
  --> Improved. Save model (RMSE: 0.7624)
Epoch 05 | TrainLoss 0.4996 | Val RMSE 0.7782 | MAE 0.6021 | MAPE 21.39%
  --> No improvement (1/5)
Epoch 06 | TrainLoss 0.3559 | Val RMSE 0.8077 | MAE 0.6246 | MAPE 22.28%
  --> No improvement (2/5)
Epoch 07 | TrainLoss 0.2464 | Val RMSE 0.8272 | MAE 0.6374 | MAPE 22.49%
  --> No improvement (3/5)
Epoch 08 | TrainLoss 0.1862 | Val RMSE 0.8410 | MAE 0.6452 | MAPE 22.84%
  --> No improvement (4/5)
Epoch 09 | TrainLoss