In [None]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [7]:
import json
import pandas as pd
import numpy as np
from tqdm import tqdm
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Step 1: Load and preprocess data
data = []
with open('/content/drive/MyDrive/review_business_5up_with_text.json', 'r') as f:
    for line in f:
        data.append(json.loads(line))

df = pd.DataFrame(data)
df = df[['user_id', 'business_id', 'stars', 'text']]


user2idx = {uid: i for i, uid in enumerate(df['user_id'].unique())}
item2idx = {iid: i for i, iid in enumerate(df['business_id'].unique())}
df['user'] = df['user_id'].map(user2idx)
df['item'] = df['business_id'].map(item2idx)

# Step 2: Generate context vectors using SBERT on GPU
sbert = SentenceTransformer('all-MiniLM-L6-v2', device='cuda')  # GPU 사용
context_vectors = sbert.encode(df['text'].tolist(), show_progress_bar=True)
df['context_vector'] = list(context_vectors)

# Step 3: Dataset and Dataloader
class UCAMDataset(Dataset):
    def __init__(self, users, items, ratings, contexts):
        self.users = users
        self.items = items
        self.ratings = ratings
        self.contexts = contexts

    def __len__(self):
        return len(self.ratings)

    def __getitem__(self, idx):
        return (
            torch.tensor(self.users[idx], dtype=torch.long),
            torch.tensor(self.items[idx], dtype=torch.long),
            torch.tensor(self.contexts[idx], dtype=torch.float32),
            torch.tensor(self.ratings[idx], dtype=torch.float32)
        )

# Train/Test Split
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_dataset = UCAMDataset(train_df['user'].values, train_df['item'].values,
                            train_df['stars'].values, np.stack(train_df['context_vector']))
test_dataset = UCAMDataset(test_df['user'].values, test_df['item'].values,
                           test_df['stars'].values, np.stack(test_df['context_vector']))
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

# Step 4: UCAM Model
class UCAM(nn.Module):
    def __init__(self, num_users, num_items, context_dim=384, embed_dim=64):
        super().__init__()
        self.user_embed = nn.Embedding(num_users, embed_dim)
        self.item_embed = nn.Embedding(num_items, embed_dim)
        self.fc1 = nn.Linear(embed_dim * 2 + context_dim, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 1)

    def forward(self, user_ids, item_ids, context_vecs):
        u = self.user_embed(user_ids)
        i = self.item_embed(item_ids)
        x = torch.cat([u, i, context_vecs], dim=-1)
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        return self.fc3(x).squeeze()

# Step 5: Train
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = UCAM(num_users=len(user2idx), num_items=len(item2idx)).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.MSELoss()

print("모델 학습 시작...")

best_val_rmse = float('inf')      # 초기 최적 RMSE
epochs_no_improve = 0             # 개선되지 않은 에폭 수
patience = 5
epochs = 50                      # 최대 기다릴 에폭 수
min_delta = 0.001                 # 개선으로 인정할 최소 감소 폭
model_path = 'best_aatrec_model.pt'  # 최적 모델 저장 경로

for epoch in range(epochs):
    model.train()
    total_train_loss = 0

    for user_ids, business_ids, sentiment_vectors, stars in train_loader:
        user_ids = user_ids.to(device)
        business_ids = business_ids.to(device)
        sentiment_vectors = sentiment_vectors.to(device)
        stars = stars.to(device)

        optimizer.zero_grad()
        predictions = model(user_ids, business_ids, sentiment_vectors)
        loss = criterion(predictions, stars)
        loss.backward()
        optimizer.step()
        total_train_loss += loss.item()

    print(f"Epoch {epoch+1}/{epochs}, Train Loss: {total_train_loss / len(train_loader):.4f}")

print("모델 학습 완료.")

# --- 최종 테스트 평가 ---
def evaluate_model(model, data_loader, device):
    model.eval()
    preds, targets = [], []

    with torch.no_grad():
        for users, items, contexts, ratings in data_loader:
            users = users.to(device)
            items = items.to(device)
            contexts = contexts.to(device)
            ratings = ratings.to(device)

            output = model(users, items, contexts)
            preds.extend(output.cpu().numpy())
            targets.extend(ratings.cpu().numpy())

    preds = np.array(preds)
    targets = np.array(targets)

    mae = mean_absolute_error(targets, preds)
    mse = mean_squared_error(targets, preds)
    rmse = np.sqrt(mse)
    mape = np.mean(np.abs((targets - preds) / targets)) * 100

    print(f"\n📊 Evaluation Metrics:")
    print(f"MAE  : {mae:.4f}")
    print(f"MSE  : {mse:.4f}")
    print(f"RMSE : {rmse:.4f}")
    print(f"MAPE : {mape:.2f}%")

    return mae, mse, rmse, mape

evaluate_model(model, test_loader, device)


Batches:   0%|          | 0/13994 [00:00<?, ?it/s]

모델 학습 시작...
Epoch 1/50, Train Loss: 0.7787
Epoch 2/50, Train Loss: 0.6098
Epoch 3/50, Train Loss: 0.5421
Epoch 4/50, Train Loss: 0.4905
Epoch 5/50, Train Loss: 0.4471
Epoch 6/50, Train Loss: 0.4064
Epoch 7/50, Train Loss: 0.3692
Epoch 8/50, Train Loss: 0.3351
Epoch 9/50, Train Loss: 0.3033
Epoch 10/50, Train Loss: 0.2746
Epoch 11/50, Train Loss: 0.2488
Epoch 12/50, Train Loss: 0.2261
Epoch 13/50, Train Loss: 0.2058
Epoch 14/50, Train Loss: 0.1878
Epoch 15/50, Train Loss: 0.1718
Epoch 16/50, Train Loss: 0.1579
Epoch 17/50, Train Loss: 0.1456
Epoch 18/50, Train Loss: 0.1343
Epoch 19/50, Train Loss: 0.1246
Epoch 20/50, Train Loss: 0.1156
Epoch 21/50, Train Loss: 0.1081
Epoch 22/50, Train Loss: 0.1011
Epoch 23/50, Train Loss: 0.0947
Epoch 24/50, Train Loss: 0.0891
Epoch 25/50, Train Loss: 0.0839
Epoch 26/50, Train Loss: 0.0793
Epoch 27/50, Train Loss: 0.0752
Epoch 28/50, Train Loss: 0.0715
Epoch 29/50, Train Loss: 0.0677
Epoch 30/50, Train Loss: 0.0649
Epoch 31/50, Train Loss: 0.0619
Epoch

(0.6929627060890198,
 0.8327279686927795,
 np.float64(0.9125392970676822),
 np.float32(23.713192))