In [1]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


라이브러리

In [4]:
import pandas as pd
import json
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
import os
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, models, callbacks
import shutil

파라미터

In [5]:
# ====== 파일 경로 및 모델 설정 변경 ======
JSONL_PATH = '/content/drive/MyDrive/dataset_la_emd.jsonl'
MODEL_SAVE_PATH = '/content/drive/MyDrive/final_best_hybrid_bert_model.keras'

# BERT 임베딩 차원에 맞게 파라미터 수정
best_params = {
    'user_embedding_dim': 128,
    'business_embedding_dim': 32,
    'bert_mlp_dims': [768, 384, 192], # BERT 임베딩 차원(768)에 맞춰 시작
    'user_biz_mlp_dims': [128, 64],
    'final_mlp_dims': [64, 32],
    'learning_rate': 0.0001,
    'batch_size': 256
}

# ====== 2. 데이터 로드 및 전처리 ======
# JSONL 파일에서 데이터 로드
print("데이터 로드 중...")
records = []
with open(JSONL_PATH, 'r', encoding='utf-8') as f:
    for line in f:
        records.append(json.loads(line))

df = pd.DataFrame(records)
print(" 데이터 로드 완료.")

# 필요한 필드만 선택하고 임베딩 열의 이름을 'embedding'으로 변경
df_processed = df[['user_id', 'business_id', 'stars', 'bert_embedding']].copy()
df_processed.rename(columns={'bert_embedding': 'embedding'}, inplace=True)

# 임베딩 데이터 타입을 NumPy 배열로 변환 (추후 모델 학습에 필요)
df_processed['embedding'] = df_processed['embedding'].apply(np.array)

print(" 데이터 전처리 완료.")

데이터 로드 중...
 데이터 로드 완료.
 데이터 전처리 완료.


모델

In [6]:
def build_hybrid_bert_model(num_users, num_businesses, user_embedding_dim, business_embedding_dim,
                             bert_embedding_dim, user_biz_mlp_dims, bert_mlp_dims, final_mlp_dims):

    # 사용자-비즈니스 상호작용 모듈
    user_input = keras.Input(shape=(1,), name='user_id')
    business_input = keras.Input(shape=(1,), name='business_id')

    user_embedding = layers.Embedding(num_users, user_embedding_dim, name='user_embedding')(user_input)
    user_vec = layers.Flatten()(user_embedding)

    business_embedding = layers.Embedding(num_businesses, business_embedding_dim, name='business_embedding')(business_input)
    business_vec = layers.Flatten()(business_embedding)

    combined_vec = layers.concatenate([user_vec, business_vec], axis=1)
    interaction_features = combined_vec
    for dim in user_biz_mlp_dims:
        interaction_features = layers.Dense(dim, activation='relu')(interaction_features)

    # BERT 임베딩 모듈
    bert_input = keras.Input(shape=(bert_embedding_dim,), name='bert_embedding')
    bert_features = bert_input
    for dim in bert_mlp_dims:
        bert_features = layers.Dense(dim, activation='relu')(bert_features)

    # 최종 예측 모듈
    final_combined_features = layers.concatenate([interaction_features, bert_features], axis=1)

    predicted_rating = final_combined_features
    for dim in final_mlp_dims:
        predicted_rating = layers.Dense(dim, activation='relu')(predicted_rating)
    predicted_rating = layers.Dense(1, activation='linear', name='output_rating')(predicted_rating)

    model = models.Model(inputs=[user_input, business_input, bert_input],
                         outputs=predicted_rating)
    return model

데이터 분할 / 5회 반복

In [7]:
all_rmse = []
all_mae = []
all_mape = []
all_mse = []

for i in range(5):
    keras.backend.clear_session()
    print(f"{i+1}번째\n")

    # 1. 데이터를 'random_state=i'로 분할 (매번 다른 분할)
    train_val_df, test_df = train_test_split(df_processed, test_size=0.2, random_state=42 + i)
    val_size_ratio = 1 / 8
    train_df, val_df = train_test_split(train_val_df, test_size=val_size_ratio, random_state=42 + i)

    user_encoder = LabelEncoder()
    business_encoder = LabelEncoder()
    train_df.loc[:, 'user_encoded'] = user_encoder.fit_transform(train_df['user_id'])
    train_df.loc[:, 'business_encoded'] = business_encoder.fit_transform(train_df['business_id'])

    user_mapping = {label: i for i, label in enumerate(user_encoder.classes_)}
    business_mapping = {label: i for i, label in enumerate(business_encoder.classes_)}
    val_df.loc[:, 'user_encoded'] = val_df['user_id'].map(user_mapping).fillna(-1).astype(int)
    test_df.loc[:, 'user_encoded'] = test_df['user_id'].map(user_mapping).fillna(-1).astype(int)
    val_df.loc[:, 'business_encoded'] = val_df['business_id'].map(business_mapping).fillna(-1).astype(int)
    test_df.loc[:, 'business_encoded'] = test_df['business_id'].map(business_mapping).fillna(-1).astype(int)

    num_users = len(user_encoder.classes_)
    num_businesses = len(business_encoder.classes_)

    train_embeddings = np.vstack(train_df['embedding'].values)
    val_embeddings = np.vstack(val_df['embedding'].values)
    test_embeddings = np.vstack(test_df['embedding'].values)

    # BERT 임베딩 차원은 768로 고정
    bert_embedding_dim = 768

    # 모델 빌드 함수와 파라미터 업데이트
    final_model = build_hybrid_bert_model(
        num_users, num_businesses,
        best_params['user_embedding_dim'], best_params['business_embedding_dim'],
        bert_embedding_dim,
        best_params['user_biz_mlp_dims'], best_params['bert_mlp_dims'],
        best_params['final_mlp_dims'])

    final_model.compile(optimizer=keras.optimizers.Adam(learning_rate=best_params['learning_rate']),
                        loss='mse',
                        metrics=[tf.keras.metrics.RootMeanSquaredError(name='rmse'), 'mae'])

    early_stopping_callback = callbacks.EarlyStopping(
        monitor='val_rmse',
        patience=5,
        min_delta=0.0005,
        mode='min',
        restore_best_weights=True)

    # 모델 저장 경로 업데이트
    model_save_path = f'final_best_hybrid_bert_model_fold_{i+1}.keras'
    model_checkpoint_callback = callbacks.ModelCheckpoint(
        filepath=model_save_path,
        monitor='val_rmse',
        save_best_only=True,
        mode='min',
        verbose=0)

    # 모델 학습 데이터셋 입력 이름 업데이트
    history = final_model.fit(
        {'user_id': train_df['user_encoded'],
         'business_id': train_df['business_encoded'],
         'bert_embedding': train_embeddings},
        train_df['stars'],
        batch_size=best_params['batch_size'],
        epochs=50,
        validation_data=(
            {'user_id': val_df['user_encoded'],
             'business_id': val_df['business_encoded'],
             'bert_embedding': val_embeddings},
            val_df['stars']
        ),
        callbacks=[early_stopping_callback, model_checkpoint_callback],
        verbose=1)

    # 테스트셋 예측 입력 이름 업데이트
    test_predictions = final_model.predict(
        {'user_id': test_df['user_encoded'],
         'business_id': test_df['business_encoded'],
         'bert_embedding': test_embeddings}
    ).flatten()

    true_ratings = test_df['stars'].values

    # 평가 지표 계산
    mse = mean_squared_error(true_ratings, test_predictions)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(true_ratings, test_predictions)

    # MAPE 계산은 NaN 또는 inf가 나올 수 있으므로, 0으로 나누기 오류를 방지합니다.
    try:
        mape = mean_absolute_percentage_error(true_ratings, test_predictions)
    except (ValueError, ZeroDivisionError):
        mape = np.nan

    all_mse.append(mse)
    all_rmse.append(rmse)
    all_mae.append(mae)
    all_mape.append(mape)

    print(f" {i+1}번째 - MSE: {mse:.4f}, RMSE: {rmse:.4f}, MAE: {mae:.4f}, MAPE: {mape:.4f}")

1번째

Epoch 1/50
[1m760/760[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 8ms/step - loss: 1.9941 - mae: 1.0797 - rmse: 1.3791 - val_loss: 1.2712 - val_mae: 0.8853 - val_rmse: 1.1275
Epoch 2/50
[1m760/760[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - loss: 1.1531 - mae: 0.8454 - rmse: 1.0738 - val_loss: 1.1868 - val_mae: 0.8648 - val_rmse: 1.0894
Epoch 3/50
[1m760/760[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - loss: 0.9991 - mae: 0.7729 - rmse: 0.9995 - val_loss: 1.1794 - val_mae: 0.8460 - val_rmse: 1.0860
Epoch 4/50
[1m760/760[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - loss: 0.9327 - mae: 0.7421 - rmse: 0.9657 - val_loss: 1.1820 - val_mae: 0.8410 - val_rmse: 1.0872
Epoch 5/50
[1m760/760[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - loss: 0.8973 - mae: 0.7245 - rmse: 0.9472 - val_loss: 1.2068 - val_mae: 0.8428 - val_rmse: 1.0985
Epoch 6/50
[1m760/760[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0

최종 결과

In [8]:
print("5회 실험 최종 결과 요약\n")

mean_rmse = np.mean(all_rmse)
std_rmse = np.std(all_rmse)
mean_mae = np.mean(all_mae)
std_mae = np.std(all_mae)
mean_mape = np.mean(all_mape)
std_mape = np.std(all_mape)
all_mse = [x**2 for x in all_rmse]
mean_mse = np.mean(all_mse)
std_mse = np.std(all_mse)

print(f"평균 MSE: {mean_mse:.4f}")
print(f"MSE 표준편차: {std_mse:.5f}")

print(f"평균 RMSE: {mean_rmse:.4f}")
print(f"RMSE 표준편차: {std_rmse:.5f}")

print(f"평균 MAE: {mean_mae:.4f}")
print(f"MAE 표준편차: {std_mae:.5f}")

print(f"평균 MAPE: {mean_mape:.4f}")
print(f"MAPE 표준편차: {std_mape:.5f}")

5회 실험 최종 결과 요약

평균 MSE: 1.1799
MSE 표준편차: 0.00679
평균 RMSE: 1.0862
RMSE 표준편차: 0.00312
평균 MAE: 0.8451
MAE 표준편차: 0.00762
평균 MAPE: 0.3466
MAPE 표준편차: 0.00484
