라이브러리

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
import os
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, models, callbacks
import shutil

파라미터

In [3]:
# Source dataset (parquet) and the file the trained Keras model is written to.
PARQUET_PATH = 'review_data_optimized_la.parquet'
MODEL_SAVE_PATH = 'final_best_hybrid_gemini_model_la.keras'

# Hyperparameters shared by every training run in this notebook.
best_params = dict(
    user_embedding_dim=128,
    business_embedding_dim=32,
    gemini_mlp_dims=[1536, 768, 384, 192],
    user_biz_mlp_dims=[128, 64],
    final_mlp_dims=[64, 32],
    learning_rate=0.0001,
    batch_size=256,
)

# Load the full table once, then keep an independent copy of just the columns
# the model consumes so later column additions never touch `df`.
df = pd.read_parquet(PARQUET_PATH)
print(" 데이터 로드")
df_processed = df.loc[:, ['user_id', 'business_id', 'review_stars', 'embedding']].copy()

 데이터 로드


모델

In [4]:
def build_hybrid_gemini_model(num_users, num_businesses, user_embedding_dim, business_embedding_dim,
                              gemini_embedding_dim, user_biz_mlp_dims, gemini_mlp_dims, final_mlp_dims):
    """Build the (uncompiled) three-input rating-prediction model.

    The network has two branches that are concatenated and fed to a final MLP:
      * an interaction branch that embeds the integer user and business ids,
        concatenates the two vectors and passes them through ReLU Dense layers;
      * a branch that passes the precomputed embedding vector through its own
        ReLU Dense layers.

    Args:
        num_users: vocabulary size (``input_dim``) of the user embedding table.
        num_businesses: vocabulary size (``input_dim``) of the business embedding table.
        user_embedding_dim: width of each user embedding vector.
        business_embedding_dim: width of each business embedding vector.
        gemini_embedding_dim: length of the ``gemini_embedding`` input vector.
        user_biz_mlp_dims: Dense widths applied to the concatenated id embeddings.
        gemini_mlp_dims: Dense widths applied to the ``gemini_embedding`` input.
        final_mlp_dims: Dense widths applied after both branches are merged.

    Returns:
        A Keras ``Model`` with inputs named ``user_id``, ``business_id`` and
        ``gemini_embedding`` (in that order) and a single linear output unit
        named ``output_rating``.
    """

    def dense_stack(tensor, widths):
        # One ReLU Dense layer per entry in `widths`, applied in order.
        for width in widths:
            tensor = layers.Dense(width, activation='relu')(tensor)
        return tensor

    # --- User/business interaction branch ---
    user_input = keras.Input(shape=(1,), name='user_id')
    business_input = keras.Input(shape=(1,), name='business_id')

    user_table = layers.Embedding(num_users, user_embedding_dim, name='user_embedding')
    user_vec = layers.Flatten()(user_table(user_input))

    business_table = layers.Embedding(num_businesses, business_embedding_dim, name='business_embedding')
    business_vec = layers.Flatten()(business_table(business_input))

    id_pair_vec = layers.concatenate([user_vec, business_vec], axis=1)
    interaction_features = dense_stack(id_pair_vec, user_biz_mlp_dims)

    # --- Precomputed (Gemini) embedding branch ---
    gemini_input = keras.Input(shape=(gemini_embedding_dim,), name='gemini_embedding')
    gemini_features = dense_stack(gemini_input, gemini_mlp_dims)

    # --- Final prediction head ---
    merged_features = layers.concatenate([interaction_features, gemini_features], axis=1)
    head = dense_stack(merged_features, final_mlp_dims)
    predicted_rating = layers.Dense(1, activation='linear', name='output_rating')(head)

    return models.Model(inputs=[user_input, business_input, gemini_input],
                        outputs=predicted_rating)

데이터 분할 / 5회 반복

In [5]:
# Test-set metrics collected across runs (one entry appended per run).
all_rmse = []
all_mae = []
all_mape = []
all_mse = []

# Every run checkpoints to its own file so a later run cannot overwrite an
# earlier, better one. After the loop, the checkpoint of the run with the
# lowest validation RMSE is copied to MODEL_SAVE_PATH. Selection uses the
# validation score (not the test score) so the test set stays untouched.
checkpoint_root, checkpoint_ext = os.path.splitext(MODEL_SAVE_PATH)
best_val_rmse = float('inf')
best_checkpoint_path = None


def _model_inputs(split_df, embeddings):
    """Return the named-input dict the hybrid model expects for one data split."""
    return {'user_id': split_df['user_encoded'],
            'business_id': split_df['business_encoded'],
            'gemini_embedding': embeddings}


for i in range(5):
    keras.backend.clear_session()

    # One seed per run drives both the data split and (via set_random_seed)
    # Python/NumPy/TensorFlow randomness, so each run is repeatable.
    run_seed = 42 + i
    keras.utils.set_random_seed(run_seed)

    print(f"{i+1}번째\n")

    # 1. Split with random_state=42+i (a different split on every run).
    #    20% test, then 1/8 of the remaining 80% (= 10% overall) for validation.
    train_val_df, test_df = train_test_split(df_processed, test_size=0.2, random_state=run_seed)
    val_size_ratio = 1 / 8
    train_df, val_df = train_test_split(train_val_df, test_size=val_size_ratio, random_state=run_seed)

    # Work on explicit copies so adding columns never writes into a slice of
    # df_processed (avoids SettingWithCopyWarning and accidental aliasing).
    train_df = train_df.copy()
    val_df = val_df.copy()
    test_df = test_df.copy()

    # 2. Encode ids using the training split only. Index 0 is reserved for ids
    #    that never appear in training; known ids are numbered from 1. Feeding
    #    -1 to an Embedding layer is an out-of-range lookup, so unseen ids must
    #    map to a valid row instead.
    user_encoder = LabelEncoder()
    business_encoder = LabelEncoder()
    train_df['user_encoded'] = user_encoder.fit_transform(train_df['user_id']) + 1
    train_df['business_encoded'] = business_encoder.fit_transform(train_df['business_id']) + 1

    user_mapping = {label: idx for idx, label in enumerate(user_encoder.classes_, start=1)}
    business_mapping = {label: idx for idx, label in enumerate(business_encoder.classes_, start=1)}
    for split_df in (val_df, test_df):
        split_df['user_encoded'] = split_df['user_id'].map(user_mapping).fillna(0).astype(int)
        split_df['business_encoded'] = split_df['business_id'].map(business_mapping).fillna(0).astype(int)

    # +1 accounts for the reserved "unseen id" row at index 0.
    num_users = len(user_encoder.classes_) + 1
    num_businesses = len(business_encoder.classes_) + 1

    # 3. Stack the per-row embedding vectors into 2-D arrays.
    train_embeddings = np.vstack(train_df['embedding'].values)
    val_embeddings = np.vstack(val_df['embedding'].values)
    test_embeddings = np.vstack(test_df['embedding'].values)
    gemini_embedding_dim = train_embeddings.shape[1]

    # 4. Build and compile a fresh model for this run.
    final_model = build_hybrid_gemini_model(
        num_users, num_businesses,
        best_params['user_embedding_dim'], best_params['business_embedding_dim'],
        gemini_embedding_dim,
        best_params['user_biz_mlp_dims'], best_params['gemini_mlp_dims'],
        best_params['final_mlp_dims'])
    final_model.compile(optimizer=keras.optimizers.Adam(learning_rate=best_params['learning_rate']),
                        loss='mse',
                        metrics=[tf.keras.metrics.RootMeanSquaredError(name='rmse'), 'mae'])

    run_checkpoint_path = f"{checkpoint_root}_run{i+1}{checkpoint_ext}"
    early_stopping_callback = callbacks.EarlyStopping(
        monitor='val_rmse',
        patience=5,
        min_delta=0.0005,
        mode='min',
        restore_best_weights=True)
    model_checkpoint_callback = callbacks.ModelCheckpoint(
        filepath=run_checkpoint_path,
        monitor='val_rmse',
        save_best_only=True,
        mode='min',
        verbose=0)

    # 5. Train with early stopping on validation RMSE.
    history = final_model.fit(
        _model_inputs(train_df, train_embeddings),
        train_df['review_stars'],
        batch_size=best_params['batch_size'],
        epochs=50,
        validation_data=(
            _model_inputs(val_df, val_embeddings),
            val_df['review_stars']
        ),
        callbacks=[early_stopping_callback, model_checkpoint_callback],
        verbose=1)

    # Remember which run reached the lowest validation RMSE. A NaN score never
    # compares as smaller, so a diverged run is skipped automatically.
    run_val_rmse = float(np.min(history.history['val_rmse']))
    if run_val_rmse < best_val_rmse:
        best_val_rmse = run_val_rmse
        best_checkpoint_path = run_checkpoint_path

    # 6. Evaluate on the held-out test split.
    test_predictions = final_model.predict(
        _model_inputs(test_df, test_embeddings)
    ).flatten()
    true_ratings = test_df['review_stars'].values

    mse = mean_squared_error(true_ratings, test_predictions)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(true_ratings, test_predictions)
    mape = mean_absolute_percentage_error(true_ratings, test_predictions)

    all_mse.append(mse)
    all_rmse.append(rmse)
    all_mae.append(mae)
    all_mape.append(mape)

    print(f" {i+1}번째 - MSE: {mse:.4f}, RMSE: {rmse:.4f}, MAE: {mae:.4f}, MAPE: {mape:.4f}")

# Publish the best run's checkpoint under the canonical model path.
if best_checkpoint_path is not None and os.path.exists(best_checkpoint_path):
    shutil.copyfile(best_checkpoint_path, MODEL_SAVE_PATH)


1번째

Epoch 1/50
[1m760/760[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 23ms/step - loss: 2.1193 - mae: 0.8830 - rmse: 1.2984 - val_loss: 0.2666 - val_mae: 0.4104 - val_rmse: 0.5163
Epoch 2/50
[1m760/760[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 23ms/step - loss: 0.2428 - mae: 0.3796 - rmse: 0.4927 - val_loss: 0.2421 - val_mae: 0.3720 - val_rmse: 0.4920
Epoch 3/50
[1m760/760[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 23ms/step - loss: 0.2014 - mae: 0.3427 - rmse: 0.4488 - val_loss: 0.2312 - val_mae: 0.3697 - val_rmse: 0.4808
Epoch 4/50
[1m760/760[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 24ms/step - loss: 0.1848 - mae: 0.3265 - rmse: 0.4299 - val_loss: 0.2422 - val_mae: 0.3846 - val_rmse: 0.4921
Epoch 5/50
[1m760/760[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 24ms/step - loss: 0.1710 - mae: 0.3132 - rmse: 0.4135 - val_loss: 0.2340 - val_mae: 0.3663 - val_rmse: 0.4837
Epoch 6/50
[1m760/760[0m [32m━━━━━━━━━━━━━━━━━━━━[

최종 결과

In [6]:
print("5회 실험 최종 결과 요약\n")

# Aggregate the per-run test metrics recorded by the training loop.
# np.std uses its default ddof=0 (population standard deviation).
# The MSE statistics use the per-run MSE values measured in the loop directly;
# `all_mse` is not re-derived from `all_rmse`, so re-running this cell never
# modifies the recorded results.
mean_rmse = np.mean(all_rmse)
std_rmse = np.std(all_rmse)
mean_mae = np.mean(all_mae)
std_mae = np.std(all_mae)
mean_mape = np.mean(all_mape)
std_mape = np.std(all_mape)
mean_mse = np.mean(all_mse)
std_mse = np.std(all_mse)

print(f"평균 MSE: {mean_mse:.4f}")
print(f"MSE 표준편차: {std_mse:.5f}")

print(f"평균 RMSE: {mean_rmse:.4f}")
print(f"RMSE 표준편차: {std_rmse:.5f}")

print(f"평균 MAE: {mean_mae:.4f}")
print(f"MAE 표준편차: {std_mae:.5f}")

print(f"평균 MAPE: {mean_mape:.4f}")
print(f"MAPE 표준편차: {std_mape:.5f}")

5회 실험 최종 결과 요약

평균 MSE: 0.2279
MSE 표준편차: 0.00261
평균 RMSE: 0.4774
RMSE 표준편차: 0.00273
평균 MAE: 0.3640
MAE 표준편차: 0.00511
평균 MAPE: 0.1192
MAPE 표준편차: 0.00061
