In [1]:
import pandas as pd

# (경로를 실제 ratings.csv 위치로 바꿔주세요)
ratings = pd.read_csv('../data/ratings.csv')

# userId와 movieId를 0부터 시작하는 정수 인덱스(user_idx, item_idx)로 변환
user_ids = ratings['userId'].unique()
item_ids = ratings['movieId'].unique()
user2idx = {u: i for i, u in enumerate(user_ids)}
item2idx = {m: i for i, m in enumerate(item_ids)}

ratings['user_idx'] = ratings['userId'].map(user2idx)
ratings['item_idx'] = ratings['movieId'].map(item2idx)

In [2]:
from sklearn.model_selection import train_test_split

# 먼저 train+val과 test로 분리 (test: 전체의 10%)
train_val, test = train_test_split(ratings, test_size=0.1, random_state=42)

# train과 validation으로 분리 (전체의 10%가 validation)
train, val = train_test_split(train_val, test_size=0.111, random_state=42)

print(f"Train: {len(train)}, Validation: {len(val)}, Test: {len(test)}")

Train: 25603362, Validation: 3196821, Test: 3200021


In [3]:
import torch
from torch.utils.data import Dataset, DataLoader

class MFDataset(Dataset):
    def __init__(self, df):
        self.users = torch.LongTensor(df['user_idx'].values)
        self.items = torch.LongTensor(df['item_idx'].values)
        self.ratings = torch.FloatTensor(df['rating'].values)
    def __len__(self):
        return len(self.ratings)
    def __getitem__(self, idx):
        return self.users[idx], self.items[idx], self.ratings[idx]

train_ds = MFDataset(train)
val_ds = MFDataset(val)
test_ds = MFDataset(test)

train_loader = DataLoader(train_ds, batch_size=1024, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=1024)
test_loader = DataLoader(test_ds, batch_size=1024)

In [4]:
import torch.nn as nn

class MF(nn.Module):
    def __init__(self, num_users, num_items, embed_dim=32):
        super().__init__()
        self.user_embedding = nn.Embedding(num_users, embed_dim)
        self.item_embedding = nn.Embedding(num_items, embed_dim)
        nn.init.normal_(self.user_embedding.weight, std=0.01)
        nn.init.normal_(self.item_embedding.weight, std=0.01)
    def forward(self, users, items):
        u = self.user_embedding(users)
        i = self.item_embedding(items)
        return (u * i).sum(1)  # user, item 임베딩의 내적값내적값

In [5]:
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
num_users = ratings['user_idx'].nunique()
num_items = ratings['item_idx'].nunique()
epochs = 10            # 실험을 위해 5회, 실제로는 10~30까지 늘릴 수 있음
model = MF(num_users, num_items, embed_dim=32).to(device)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [6]:
# Early stopping 설정
best_val_rmse = float('inf')
patience = 3  # 3 epoch 동안 개선 없으면 중단
counter = 0
best_model_state = None

for epoch in range(epochs):
    model.train()
    for users, items, ratings in train_loader:
        users, items, ratings = users.to(device), items.to(device), ratings.to(device)
        optimizer.zero_grad()
        preds = model(users, items)
        loss = criterion(preds, ratings)
        loss.backward()
        optimizer.step()
    
    # validation 평가
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for users, items, ratings in val_loader:
            users, items, ratings = users.to(device), items.to(device), ratings.to(device)
            preds = model(users, items)
            val_loss += criterion(preds, ratings).item() * users.size(0)
    val_loss = val_loss / len(val_ds)
    val_rmse = val_loss**0.5
    print(f"Epoch {epoch + 1} | Validation RMSE: {val_rmse:.4f}")
    
    # Early stopping 체크
    if val_rmse < best_val_rmse:
        best_val_rmse = val_rmse
        best_model_state = model.state_dict().copy()  # 최고 모델 저장
        counter = 0
        print(f"  → 새로운 최고 성능! 모델 저장")
    else:
        counter += 1
        print(f"  → 개선 없음 ({counter}/{patience})")
        if counter >= patience:
            print(f"Early stopping! 최고 성능: {best_val_rmse:.4f}")
            model.load_state_dict(best_model_state)  # 최고 모델 복원
            break


Epoch 1 | Validation RMSE: 0.9057
  → 새로운 최고 성능! 모델 저장
Epoch 2 | Validation RMSE: 0.8488
  → 새로운 최고 성능! 모델 저장
Epoch 3 | Validation RMSE: 0.8163
  → 새로운 최고 성능! 모델 저장
Epoch 4 | Validation RMSE: 0.8003
  → 새로운 최고 성능! 모델 저장
Epoch 5 | Validation RMSE: 0.7961
  → 새로운 최고 성능! 모델 저장
Epoch 6 | Validation RMSE: 0.7969
  → 개선 없음 (1/3)
Epoch 7 | Validation RMSE: 0.7987
  → 개선 없음 (2/3)
Epoch 8 | Validation RMSE: 0.8011
  → 개선 없음 (3/3)
Early stopping! 최고 성능: 0.7961


In [7]:
model.eval()
test_loss = 0
with torch.no_grad():
    for users, items, ratings in test_loader:
        users, items, ratings = users.to(device), items.to(device), ratings.to(device)
        preds = model(users, items)
        test_loss += criterion(preds, ratings).item() * users.size(0)
test_loss = test_loss / len(test_ds)
print(f"Test RMSE: {test_loss**0.5:.4f}")

Test RMSE: 0.8016


In [8]:
# 모델 저장
torch.save(model.state_dict(), '../models/mf_model.pth')
print("MF Model saved to ../models/mf_model.pth")

MF Model saved to ../models/mf_model.pth
