NeuMF 모델

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import os

# -------------------- Data loading and preprocessing --------------------
# The input is read as line-delimited JSON; only the three columns used
# for rating prediction are kept.
df = pd.read_json(
    'review_business_5up_5aspect_3sentiment_vectorized_clean.json',
    lines=True,
)
df_processed = df.loc[:, ['user_id', 'business_id', 'stars']].copy()

# Map raw user / business identifiers to contiguous integer indices so
# they can be used as rows of embedding tables.
user_encoder = LabelEncoder()
df_processed['user_encoded'] = user_encoder.fit_transform(df_processed['user_id'])
num_users = len(user_encoder.classes_)

business_encoder = LabelEncoder()
df_processed['business_encoded'] = business_encoder.fit_transform(df_processed['business_id'])
num_businesses = len(business_encoder.classes_)

# Hold out 20% for testing, then take 1/8 of the remaining 80% for
# validation (10% of all rows), leaving 70% for training.
train_val_df, test_df = train_test_split(
    df_processed, test_size=0.2, random_state=42
)
val_size_ratio = 1 / 8
train_df, val_df = train_test_split(
    train_val_df, test_size=val_size_ratio, random_state=42
)

# -------------------- Dataset definition --------------------
class NeuMFDataset(Dataset):
    """In-memory dataset of (user index, item index, rating) triples.

    The ``user_encoded``, ``business_encoded`` and ``stars`` columns of the
    given DataFrame are converted to tensors once, at construction time.
    """

    def __init__(self, df):
        # (attribute name, source column, tensor dtype)
        column_specs = (
            ('user_ids', 'user_encoded', torch.long),
            ('item_ids', 'business_encoded', torch.long),
            ('ratings', 'stars', torch.float),
        )
        for attr_name, column, dtype in column_specs:
            setattr(self, attr_name, torch.tensor(df[column].values, dtype=dtype))

    def __len__(self):
        """Return the number of rating rows."""
        return self.ratings.shape[0]

    def __getitem__(self, idx):
        """Return the ``(user_id, item_id, rating)`` tensors at ``idx``."""
        sample = (self.user_ids[idx], self.item_ids[idx], self.ratings[idx])
        return sample

# Wrap each split in a dataset (same order as the splits: train, val, test).
train_dataset, val_dataset, test_dataset = (
    NeuMFDataset(split_df) for split_df in (train_df, val_df, test_df)
)

# Only the training loader shuffles; validation and test keep row order.
batch_size = 128
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=batch_size)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=batch_size)

# -------------------- Model definition --------------------
class NeuMF(nn.Module):
    """Two-branch (GMF + MLP) rating predictor over user/item embeddings.

    The GMF branch multiplies a user embedding and an item embedding
    element-wise. The MLP branch concatenates a second pair of user/item
    embeddings and passes them through Linear + ReLU layers. The outputs of
    both branches are concatenated and mapped to one score per pair by a
    final linear layer (no output activation is applied).

    Args:
        num_users: Number of rows in the user embedding tables.
        num_items: Number of rows in the item embedding tables.
        mf_dim: Embedding width of the GMF branch.
        mlp_dims: Layer widths of the MLP branch. ``mlp_dims[0]`` is the
            width of the concatenated MLP input, so each MLP embedding has
            width ``mlp_dims[0] // 2``; every following entry adds one
            Linear + ReLU layer.

    Raises:
        ValueError: If ``mlp_dims`` is empty or ``mlp_dims[0]`` is odd
            (the two half-width embeddings would not add up to the input
            width of the first linear layer).
    """

    def __init__(self, num_users, num_items, mf_dim=16, mlp_dims=(64, 32)):
        super().__init__()

        # Work on an immutable copy: the default is a tuple (no shared
        # mutable default) and a caller's list is never aliased.
        mlp_dims = tuple(mlp_dims)
        if not mlp_dims:
            raise ValueError("mlp_dims must contain at least one dimension")
        if mlp_dims[0] % 2 != 0:
            raise ValueError(
                "mlp_dims[0] must be even so it can be split between the "
                f"user and item embeddings, got {mlp_dims[0]}"
            )

        # Modules are created in the same order as before so that seeded
        # parameter initialisation is unchanged.
        self.user_embedding_gmf = nn.Embedding(num_users, mf_dim)
        self.item_embedding_gmf = nn.Embedding(num_items, mf_dim)

        half_width = mlp_dims[0] // 2
        self.user_embedding_mlp = nn.Embedding(num_users, half_width)
        self.item_embedding_mlp = nn.Embedding(num_items, half_width)

        mlp_layers = []
        input_dim = mlp_dims[0]
        for dim in mlp_dims[1:]:
            mlp_layers.append(nn.Linear(input_dim, dim))
            mlp_layers.append(nn.ReLU())
            input_dim = dim
        self.mlp = nn.Sequential(*mlp_layers)

        self.final_layer = nn.Linear(mf_dim + mlp_dims[-1], 1)

    def forward(self, user_ids, item_ids):
        """Return one predicted score per (user, item) pair as a 1-D tensor.

        ``user_ids`` and ``item_ids`` are index tensors fed to the embedding
        tables; the concatenations along ``dim=1`` expect them to be 1-D
        batches of equal length.
        """
        # GMF branch: element-wise product of the two embeddings.
        gmf_user = self.user_embedding_gmf(user_ids)
        gmf_item = self.item_embedding_gmf(item_ids)
        gmf_output = gmf_user * gmf_item

        # MLP branch: concatenated embeddings through the Linear/ReLU stack.
        mlp_user = self.user_embedding_mlp(user_ids)
        mlp_item = self.item_embedding_mlp(item_ids)
        mlp_input = torch.cat((mlp_user, mlp_item), dim=1)
        mlp_output = self.mlp(mlp_input)

        # Fuse both branches and flatten (batch, 1) -> (batch,).
        concat = torch.cat((gmf_output, mlp_output), dim=1)
        prediction = self.final_layer(concat)
        return prediction.view(-1)

# -------------------- Evaluation metric --------------------
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error, in percent.

    Computes ``mean(|y_true - y_pred| / max(|y_true|, epsilon)) * 100``.

    Args:
        y_true: Array-like of ground-truth values.
        y_pred: Array-like of predicted values, broadcastable to ``y_true``.

    Returns:
        The MAPE as a NumPy float (e.g. ``37.5`` means 37.5%).
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    epsilon = 1e-10
    # Clamp the magnitude of the denominator instead of adding epsilon to
    # the signed value: `y_true + epsilon` is exactly zero at
    # y_true == -epsilon and shrinks the denominator for negative targets.
    denominator = np.maximum(np.abs(y_true), epsilon)
    return np.mean(np.abs(y_true - y_pred) / denominator) * 100

# -------------------- Training --------------------
# Model / optimizer hyperparameters.
embedding_dim = 16
mlp_dims = [64, 32]
learning_rate = 0.001
epochs = 50
# Early stopping: stop after `patience` consecutive epochs in which the
# validation RMSE did not drop by more than `min_delta`.
patience = 5
min_delta = 0.0001

model_path = 'best_neumf_model.pt'
model = NeuMF(num_users, num_businesses, embedding_dim, mlp_dims)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

epochs_no_improve = 0
best_val_rmse = float('inf')

for epoch in range(epochs):
    # ---- one pass over the training data ----
    model.train()
    total_train_loss = 0
    train_bar = tqdm(train_loader, desc=f"[Epoch {epoch+1}] Training", leave=False)
    for user_ids, item_ids, ratings in train_bar:
        optimizer.zero_grad()
        predictions = model(user_ids, item_ids)
        loss = criterion(predictions, ratings)
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        total_train_loss += batch_loss
        train_bar.set_postfix(loss=batch_loss)
    avg_train_loss = total_train_loss / len(train_loader)

    # ---- validation pass (no gradient tracking) ----
    model.eval()
    val_predictions, val_true = [], []
    val_bar = tqdm(val_loader, desc=f"[Epoch {epoch+1}] Validating", leave=False)
    with torch.no_grad():
        for user_ids, item_ids, ratings in val_bar:
            preds = model(user_ids, item_ids)
            val_predictions += preds.tolist()
            val_true += ratings.tolist()

    val_mse = mean_squared_error(val_true, val_predictions)
    val_rmse = np.sqrt(val_mse)
    val_mae = mean_absolute_error(val_true, val_predictions)
    val_mape = mean_absolute_percentage_error(val_true, val_predictions)

    print(f"\nEpoch {epoch+1} | Train Loss: {avg_train_loss:.4f} | "
          f"Val MSE: {val_mse:.4f}, RMSE: {val_rmse:.4f}, MAE: {val_mae:.4f}, MAPE: {val_mape:.2f}%")

    # ---- checkpointing / early stopping ----
    improved = val_rmse < best_val_rmse - min_delta
    if improved:
        # New best: reset the counter and checkpoint the weights.
        best_val_rmse = val_rmse
        epochs_no_improve = 0
        torch.save(model.state_dict(), model_path)
        print(f"  --> 개선됨. 모델 저장됨 (RMSE: {best_val_rmse:.4f})")
        continue

    epochs_no_improve += 1
    print(f"  --> 개선 없음. ({epochs_no_improve}/{patience})")
    if epochs_no_improve == patience:
        print("조기 종료 발생.")
        break

# -------------------- Test --------------------
# Restore the checkpoint written during training, if one exists on disk.
if os.path.exists(model_path):
    best_state = torch.load(model_path)
    model.load_state_dict(best_state)
    print(f"최적 모델 로드 완료: {model_path}")

# Collect predictions and ground truth over the whole test split.
model.eval()
test_preds, test_true = [], []
test_bar = tqdm(test_loader, desc="Testing", leave=False)
with torch.no_grad():
    for user_ids, item_ids, ratings in test_bar:
        preds = model(user_ids, item_ids)
        test_preds += preds.tolist()
        test_true += ratings.tolist()

# Same metrics as reported per epoch on the validation split.
test_mse = mean_squared_error(test_true, test_preds)
test_rmse = np.sqrt(test_mse)
test_mae = mean_absolute_error(test_true, test_preds)
test_mape = mean_absolute_percentage_error(test_true, test_preds)

print(f"\n✅ [NeuMF] 최종 테스트 평가 지표:")
print(f"   - MSE  : {test_mse:.4f}")
print(f"   - RMSE : {test_rmse:.4f}")
print(f"   - MAE  : {test_mae:.4f}")
print(f"   - MAPE : {test_mape:.2f}%")





Epoch 1 | Train Loss: 1.8998 | Val MSE: 1.3007, RMSE: 1.1405, MAE: 0.9114, MAPE: 37.19%
  --> 개선됨. 모델 저장됨 (RMSE: 1.1405)





Epoch 2 | Train Loss: 1.2091 | Val MSE: 1.2162, RMSE: 1.1028, MAE: 0.8775, MAPE: 35.75%
  --> 개선됨. 모델 저장됨 (RMSE: 1.1028)





Epoch 3 | Train Loss: 1.1096 | Val MSE: 1.1880, RMSE: 1.0900, MAE: 0.8565, MAPE: 35.22%
  --> 개선됨. 모델 저장됨 (RMSE: 1.0900)





Epoch 4 | Train Loss: 1.0279 | Val MSE: 1.1773, RMSE: 1.0850, MAE: 0.8574, MAPE: 34.45%
  --> 개선됨. 모델 저장됨 (RMSE: 1.0850)





Epoch 5 | Train Loss: 0.9538 | Val MSE: 1.1808, RMSE: 1.0866, MAE: 0.8522, MAPE: 34.39%
  --> 개선 없음. (1/5)





Epoch 6 | Train Loss: 0.8856 | Val MSE: 1.1987, RMSE: 1.0949, MAE: 0.8580, MAPE: 34.32%
  --> 개선 없음. (2/5)





Epoch 7 | Train Loss: 0.8218 | Val MSE: 1.2195, RMSE: 1.1043, MAE: 0.8686, MAPE: 34.16%
  --> 개선 없음. (3/5)





Epoch 8 | Train Loss: 0.7617 | Val MSE: 1.2514, RMSE: 1.1187, MAE: 0.8779, MAPE: 34.32%
  --> 개선 없음. (4/5)





Epoch 9 | Train Loss: 0.7052 | Val MSE: 1.2811, RMSE: 1.1319, MAE: 0.8882, MAPE: 34.46%
  --> 개선 없음. (5/5)
조기 종료 발생.
최적 모델 로드 완료: best_neumf_model.pt


                                                           


✅ [NeuMF] 최종 테스트 평가 지표:
   - MSE  : 1.1629
   - RMSE : 1.0784
   - MAE  : 0.8518
   - MAPE : 34.09%


